@astrofoundry/grimoire 3.31.0 → 3.32.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{admin-WUNBDKBC.js → admin-4C3QNVQE.js} +86 -64
- package/dist/admin-4C3QNVQE.js.map +7 -0
- package/dist/{chunk-R46N6C3C.js → chunk-VTTJCQRQ.js} +37 -2
- package/dist/chunk-VTTJCQRQ.js.map +7 -0
- package/dist/cli.js +121 -14
- package/dist/cli.js.map +2 -2
- package/package.json +1 -1
- package/dist/admin-WUNBDKBC.js.map +0 -7
- package/dist/chunk-R46N6C3C.js.map +0 -7
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
import {
|
|
2
|
+
RERANK_POOL_SIZE,
|
|
2
3
|
__commonJS,
|
|
3
4
|
__toESM,
|
|
4
5
|
bold,
|
|
5
6
|
cyan,
|
|
7
|
+
rerank,
|
|
8
|
+
rerankDocText,
|
|
6
9
|
yellow
|
|
7
|
-
} from "./chunk-R46N6C3C.js";
|
|
10
|
+
} from "./chunk-VTTJCQRQ.js";
|
|
8
11
|
|
|
9
12
|
// node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
|
|
10
13
|
var require_turndown_plugin_gfm_cjs = __commonJS({
|
|
@@ -476,9 +479,7 @@ function slugifyUrl(url) {
|
|
|
476
479
|
|
|
477
480
|
// src/scraper.ts
|
|
478
481
|
function filterUrls(urls, includePatterns, excludePatterns) {
|
|
479
|
-
let filtered = urls.filter(
|
|
480
|
-
(url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#")
|
|
481
|
-
);
|
|
482
|
+
let filtered = urls.map((url) => url.split("#")[0]).filter((url) => url.startsWith("http") && !url.includes("?hl="));
|
|
482
483
|
if (includePatterns && includePatterns.length > 0) {
|
|
483
484
|
filtered = filtered.filter(
|
|
484
485
|
(url) => includePatterns.some((pattern) => url.includes(pattern))
|
|
@@ -688,7 +689,14 @@ function cleanTableRowBreaks(md) {
|
|
|
688
689
|
(line) => line.startsWith("|") ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line
|
|
689
690
|
).join("\n");
|
|
690
691
|
}
|
|
692
|
+
var SPAN_STRIP_THRESHOLD_BYTES = 2e6;
|
|
693
|
+
function stripInlineSpans(html) {
|
|
694
|
+
return html.replace(/<\/?span\b[^>]*>/g, "");
|
|
695
|
+
}
|
|
691
696
|
function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
|
|
697
|
+
if (html.length > SPAN_STRIP_THRESHOLD_BYTES) {
|
|
698
|
+
html = stripInlineSpans(html);
|
|
699
|
+
}
|
|
692
700
|
const dom = new JSDOM(html);
|
|
693
701
|
const doc = dom.window.document;
|
|
694
702
|
const contentEl = doc.querySelector(contentSelector) ?? doc.body;
|
|
@@ -744,7 +752,14 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
|
|
|
744
752
|
const url = urls[i];
|
|
745
753
|
const slug = slugifyUrl(url);
|
|
746
754
|
const htmlPath = join2(rawDir, `${slug}.html`);
|
|
747
|
-
|
|
755
|
+
let html;
|
|
756
|
+
try {
|
|
757
|
+
html = await readFile2(htmlPath, "utf-8");
|
|
758
|
+
} catch {
|
|
759
|
+
console.warn(` WARNING: no cached HTML for ${url}, skipping page. Run 'grimoire scrape-urls' to fetch it.`);
|
|
760
|
+
completed++;
|
|
761
|
+
continue;
|
|
762
|
+
}
|
|
748
763
|
const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
|
|
749
764
|
await writeFile2(join2(mdDir, `${slug}.md`), page.markdown, "utf-8");
|
|
750
765
|
pages[i] = page;
|
|
@@ -757,7 +772,7 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
|
|
|
757
772
|
() => worker()
|
|
758
773
|
);
|
|
759
774
|
await Promise.all(workers);
|
|
760
|
-
return pages;
|
|
775
|
+
return pages.filter((page) => page !== void 0);
|
|
761
776
|
}
|
|
762
777
|
|
|
763
778
|
// src/chunker.ts
|
|
@@ -777,7 +792,10 @@ function buildChunkId(source, url, headingSlug, index) {
|
|
|
777
792
|
return index !== void 0 ? `${base}-${index}` : base;
|
|
778
793
|
}
|
|
779
794
|
function buildEmbedText(chunk) {
|
|
780
|
-
|
|
795
|
+
let path = chunk.heading_path[0] === chunk.title ? chunk.heading_path.slice(1) : chunk.heading_path;
|
|
796
|
+
if (chunk.content.startsWith("#") && path.length > 0) {
|
|
797
|
+
path = path.slice(0, -1);
|
|
798
|
+
}
|
|
781
799
|
const context = [chunk.title, ...path].filter(Boolean).join(" > ");
|
|
782
800
|
return context ? `${context}
|
|
783
801
|
|
|
@@ -896,11 +914,25 @@ function parseBlocks(lines) {
|
|
|
896
914
|
flush();
|
|
897
915
|
return blocks;
|
|
898
916
|
}
|
|
917
|
+
function splitLongLine(line, budget) {
|
|
918
|
+
const maxChars = budget * 4;
|
|
919
|
+
if (line.length <= maxChars) return [line];
|
|
920
|
+
const pieces = [];
|
|
921
|
+
let rest = line;
|
|
922
|
+
while (rest.length > maxChars) {
|
|
923
|
+
let cut = rest.lastIndexOf(" ", maxChars);
|
|
924
|
+
if (cut < maxChars / 2) cut = maxChars;
|
|
925
|
+
pieces.push(rest.slice(0, cut));
|
|
926
|
+
rest = rest.slice(cut).trimStart();
|
|
927
|
+
}
|
|
928
|
+
if (rest) pieces.push(rest);
|
|
929
|
+
return pieces;
|
|
930
|
+
}
|
|
899
931
|
function groupLines(lines, budget) {
|
|
900
932
|
const groups = [];
|
|
901
933
|
let current = [];
|
|
902
934
|
let tokens = 0;
|
|
903
|
-
for (const line of lines) {
|
|
935
|
+
for (const line of lines.flatMap((l) => splitLongLine(l, budget))) {
|
|
904
936
|
const lineTokens = estimateTokens(line) + 1;
|
|
905
937
|
if (tokens + lineTokens > budget && current.length > 0) {
|
|
906
938
|
groups.push(current);
|
|
@@ -1295,7 +1327,8 @@ async function getSourceChunkHashes(sourceName) {
|
|
|
1295
1327
|
snapshot.docs.map((doc) => [doc.id, doc.data().content_hash ?? ""])
|
|
1296
1328
|
);
|
|
1297
1329
|
}
|
|
1298
|
-
var TOKEN_QUERY_LIMIT =
|
|
1330
|
+
var TOKEN_QUERY_LIMIT = 40;
|
|
1331
|
+
var TOKEN_RESULT_FIELDS = ["source", "url", "title", "heading_path", "content", "token_count"];
|
|
1299
1332
|
async function tokenSearch(tokens, source) {
|
|
1300
1333
|
if (tokens.length === 0) return [];
|
|
1301
1334
|
const col = chunksCol();
|
|
@@ -1306,7 +1339,7 @@ async function tokenSearch(tokens, source) {
|
|
|
1306
1339
|
if (source) {
|
|
1307
1340
|
query = query.where("source", "==", source);
|
|
1308
1341
|
}
|
|
1309
|
-
return query.limit(TOKEN_QUERY_LIMIT).get();
|
|
1342
|
+
return query.select(...TOKEN_RESULT_FIELDS).limit(TOKEN_QUERY_LIMIT).get();
|
|
1310
1343
|
})
|
|
1311
1344
|
);
|
|
1312
1345
|
for (const snapshot of snapshots) {
|
|
@@ -1338,58 +1371,23 @@ async function vectorSearch(queryEmbedding, limit, source) {
|
|
|
1338
1371
|
const data = doc.data();
|
|
1339
1372
|
const distance = data._distance ?? 0;
|
|
1340
1373
|
delete data._distance;
|
|
1374
|
+
delete data.embedding;
|
|
1341
1375
|
return { id: doc.id, data, distance };
|
|
1342
1376
|
});
|
|
1343
1377
|
}
|
|
1344
1378
|
|
|
1345
|
-
// src/reranker.ts
|
|
1346
|
-
function getRerankerUrl() {
|
|
1347
|
-
const url = process.env.RERANKER_URL;
|
|
1348
|
-
if (!url) {
|
|
1349
|
-
throw new Error("RERANKER_URL environment variable is not set");
|
|
1350
|
-
}
|
|
1351
|
-
return url;
|
|
1352
|
-
}
|
|
1353
|
-
async function rerank(query, documents, topN = 5) {
|
|
1354
|
-
const baseUrl = getRerankerUrl();
|
|
1355
|
-
const response = await fetch(`${baseUrl}/v1/rerank`, {
|
|
1356
|
-
method: "POST",
|
|
1357
|
-
headers: { "Content-Type": "application/json" },
|
|
1358
|
-
body: JSON.stringify({ query, documents, top_n: topN })
|
|
1359
|
-
});
|
|
1360
|
-
if (!response.ok) {
|
|
1361
|
-
throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
|
|
1362
|
-
}
|
|
1363
|
-
const data = await response.json();
|
|
1364
|
-
return data.results;
|
|
1365
|
-
}
|
|
1366
|
-
|
|
1367
1379
|
// src/search.ts
|
|
1368
1380
|
var DEFAULT_CANDIDATES = 50;
|
|
1369
1381
|
var RRF_K = 60;
|
|
1370
1382
|
function hasReranker() {
|
|
1371
1383
|
return !!process.env.RERANKER_URL;
|
|
1372
1384
|
}
|
|
1373
|
-
function isVectorLike(value) {
|
|
1374
|
-
return typeof value === "object" && value !== null && typeof value.toArray === "function";
|
|
1375
|
-
}
|
|
1376
|
-
function cosineSimilarity(a, b) {
|
|
1377
|
-
let dot = 0;
|
|
1378
|
-
let normA = 0;
|
|
1379
|
-
let normB = 0;
|
|
1380
|
-
for (let i = 0; i < a.length; i++) {
|
|
1381
|
-
dot += a[i] * b[i];
|
|
1382
|
-
normA += a[i] * a[i];
|
|
1383
|
-
normB += b[i] * b[i];
|
|
1384
|
-
}
|
|
1385
|
-
return dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
1386
|
-
}
|
|
1387
1385
|
function contextualText(data) {
|
|
1388
|
-
return
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1386
|
+
return rerankDocText(
|
|
1387
|
+
data.title,
|
|
1388
|
+
data.heading_path,
|
|
1389
|
+
data.content
|
|
1390
|
+
);
|
|
1393
1391
|
}
|
|
1394
1392
|
function toSearchResult(candidate, relevance) {
|
|
1395
1393
|
const data = candidate.data;
|
|
@@ -1426,19 +1424,17 @@ async function search(query, options = {}) {
|
|
|
1426
1424
|
existing.fusedScore += lexicalScore;
|
|
1427
1425
|
return;
|
|
1428
1426
|
}
|
|
1429
|
-
const embedding = hit.data.embedding;
|
|
1430
|
-
delete hit.data.embedding;
|
|
1431
1427
|
pool.set(hit.id, {
|
|
1432
1428
|
id: hit.id,
|
|
1433
1429
|
data: hit.data,
|
|
1434
1430
|
fusedScore: lexicalScore,
|
|
1435
|
-
similarity:
|
|
1431
|
+
similarity: null
|
|
1436
1432
|
});
|
|
1437
1433
|
});
|
|
1438
1434
|
const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
|
|
1439
1435
|
if (fused.length === 0) return [];
|
|
1440
1436
|
if (hasReranker()) {
|
|
1441
|
-
const rerankPool = fused.slice(0,
|
|
1437
|
+
const rerankPool = fused.slice(0, RERANK_POOL_SIZE);
|
|
1442
1438
|
const documents = rerankPool.map((c) => contextualText(c.data));
|
|
1443
1439
|
const reranked = await rerank(query, documents, topN);
|
|
1444
1440
|
return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
|
|
@@ -1761,7 +1757,10 @@ Source "${name}" added to config/sources.yaml`);
|
|
|
1761
1757
|
}
|
|
1762
1758
|
}
|
|
1763
1759
|
var EMBED_WINDOW = 1e3;
|
|
1764
|
-
|
|
1760
|
+
var SHRINK_GUARD_MIN_EXISTING = 200;
|
|
1761
|
+
var SHRINK_GUARD_MAX_DELETE_RATIO = 0.5;
|
|
1762
|
+
var GROWTH_GUARD_MAX_RATIO = 5;
|
|
1763
|
+
async function syncChunks(sourceName, allChunks, urlCount, version, allowShrink = false) {
|
|
1765
1764
|
console.log(" Comparing with Firestore...");
|
|
1766
1765
|
const existing = await getSourceChunkHashes(sourceName);
|
|
1767
1766
|
const currentIds = new Set(allChunks.map((c) => c.id));
|
|
@@ -1772,10 +1771,15 @@ async function syncChunks(sourceName, allChunks, urlCount, version) {
|
|
|
1772
1771
|
console.log(
|
|
1773
1772
|
` Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
|
|
1774
1773
|
);
|
|
1775
|
-
if (toDelete.length >
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1774
|
+
if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && toDelete.length > existing.size * SHRINK_GUARD_MAX_DELETE_RATIO) {
|
|
1775
|
+
throw new Error(
|
|
1776
|
+
`Refusing to delete ${toDelete.length} of ${existing.size} stored chunks for "${sourceName}": URL discovery likely found only part of the site. Verify the sitemap/nav/urls.json, then re-run with --allow-shrink if the change is expected, or --full to rebuild from scratch.`
|
|
1777
|
+
);
|
|
1778
|
+
}
|
|
1779
|
+
if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && allChunks.length > existing.size * GROWTH_GUARD_MAX_RATIO) {
|
|
1780
|
+
throw new Error(
|
|
1781
|
+
`Refusing to grow "${sourceName}" from ${existing.size} to ${allChunks.length} chunks (>${GROWTH_GUARD_MAX_RATIO}x): URL discovery may have escaped the doc tree. Verify include_patterns/nav_selector, then re-run with --allow-shrink if the growth is real.`
|
|
1782
|
+
);
|
|
1779
1783
|
}
|
|
1780
1784
|
for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
|
|
1781
1785
|
const window = toEmbed.slice(i, i + EMBED_WINDOW);
|
|
@@ -1788,9 +1792,24 @@ async function syncChunks(sourceName, allChunks, urlCount, version) {
|
|
|
1788
1792
|
console.log(` [${i + cur}/${toEmbed.length}] stored`);
|
|
1789
1793
|
});
|
|
1790
1794
|
}
|
|
1795
|
+
if (toDelete.length > 0) {
|
|
1796
|
+
await deleteChunksByIds(toDelete, (cur, total) => {
|
|
1797
|
+
console.log(` [${cur}/${total}] deleted`);
|
|
1798
|
+
});
|
|
1799
|
+
}
|
|
1791
1800
|
await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
|
|
1792
1801
|
console.log(` Done. ${allChunks.length} chunks live for "${sourceName}".`);
|
|
1793
1802
|
}
|
|
1803
|
+
function applyExcludePatterns(items, getUrl, excludePatterns, label) {
|
|
1804
|
+
if (!excludePatterns || excludePatterns.length === 0) return items;
|
|
1805
|
+
const kept = items.filter(
|
|
1806
|
+
(item) => !excludePatterns.some((pattern) => getUrl(item).includes(pattern))
|
|
1807
|
+
);
|
|
1808
|
+
if (kept.length < items.length) {
|
|
1809
|
+
console.log(` Excluded ${items.length - kept.length} ${label} via exclude_patterns.`);
|
|
1810
|
+
}
|
|
1811
|
+
return kept;
|
|
1812
|
+
}
|
|
1794
1813
|
async function readCachedMarkdownPages(mdDir) {
|
|
1795
1814
|
const mdFiles = await readdir(mdDir).catch(() => []);
|
|
1796
1815
|
const pages = [];
|
|
@@ -1850,14 +1869,15 @@ async function cmdRefresh() {
|
|
|
1850
1869
|
limit: { type: "string" },
|
|
1851
1870
|
"from-html": { type: "boolean", default: false },
|
|
1852
1871
|
"from-markdown": { type: "boolean", default: false },
|
|
1853
|
-
"skip-store": { type: "boolean", default: false }
|
|
1872
|
+
"skip-store": { type: "boolean", default: false },
|
|
1873
|
+
"allow-shrink": { type: "boolean", default: false }
|
|
1854
1874
|
},
|
|
1855
1875
|
allowPositionals: true
|
|
1856
1876
|
});
|
|
1857
1877
|
const config = await loadConfig(CONFIG_PATH);
|
|
1858
1878
|
const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
|
|
1859
1879
|
if (!args.values.all && !sourcesToRefresh[0]) {
|
|
1860
|
-
console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
|
|
1880
|
+
console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--allow-shrink] [--limit <n>] [--concurrency <n>]");
|
|
1861
1881
|
process.exit(1);
|
|
1862
1882
|
}
|
|
1863
1883
|
const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
|
|
@@ -1890,6 +1910,7 @@ Refreshing "${sourceName}"...`);
|
|
|
1890
1910
|
console.error(" No cached markdown found. Run with --from-html first.");
|
|
1891
1911
|
process.exit(1);
|
|
1892
1912
|
}
|
|
1913
|
+
pages = applyExcludePatterns(pages, (p) => p.url, source.exclude_patterns, "cached pages");
|
|
1893
1914
|
console.log(` Found ${pages.length} cached pages.`);
|
|
1894
1915
|
} else if (source.llms_full_url && !args.values["from-html"]) {
|
|
1895
1916
|
console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
|
|
@@ -1908,6 +1929,7 @@ Refreshing "${sourceName}"...`);
|
|
|
1908
1929
|
if (args.values["from-html"]) {
|
|
1909
1930
|
console.log(" Reading URLs from cached HTML...");
|
|
1910
1931
|
urls = await recoverUrlsFromHtml(rawDir);
|
|
1932
|
+
urls = applyExcludePatterns(urls, (u) => u, source.exclude_patterns, "cached URLs");
|
|
1911
1933
|
console.log(` Found ${urls.length} cached pages.`);
|
|
1912
1934
|
} else {
|
|
1913
1935
|
console.log(" Scraping URLs...");
|
|
@@ -1941,7 +1963,7 @@ Refreshing "${sourceName}"...`);
|
|
|
1941
1963
|
console.log(` Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
|
|
1942
1964
|
continue;
|
|
1943
1965
|
}
|
|
1944
|
-
await syncChunks(sourceName, allChunks, pages.length, source.version);
|
|
1966
|
+
await syncChunks(sourceName, allChunks, pages.length, source.version, args.values["allow-shrink"]);
|
|
1945
1967
|
}
|
|
1946
1968
|
}
|
|
1947
1969
|
async function cmdSearch() {
|
|
@@ -2149,4 +2171,4 @@ var ADMIN_COMMANDS = {
|
|
|
2149
2171
|
export {
|
|
2150
2172
|
ADMIN_COMMANDS
|
|
2151
2173
|
};
|
|
2152
|
-
//# sourceMappingURL=admin-WUNBDKBC.js.map
|
|
2174
|
+
//# sourceMappingURL=admin-4C3QNVQE.js.map
|