@astrofoundry/grimoire 3.31.0 → 3.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
1
1
  import {
2
+ RERANK_POOL_SIZE,
2
3
  __commonJS,
3
4
  __toESM,
4
5
  bold,
5
6
  cyan,
7
+ rerank,
8
+ rerankDocText,
6
9
  yellow
7
- } from "./chunk-R46N6C3C.js";
10
+ } from "./chunk-VTTJCQRQ.js";
8
11
 
9
12
  // node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
10
13
  var require_turndown_plugin_gfm_cjs = __commonJS({
@@ -476,9 +479,7 @@ function slugifyUrl(url) {
476
479
 
477
480
  // src/scraper.ts
478
481
  function filterUrls(urls, includePatterns, excludePatterns) {
479
- let filtered = urls.filter(
480
- (url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#")
481
- );
482
+ let filtered = urls.map((url) => url.split("#")[0]).filter((url) => url.startsWith("http") && !url.includes("?hl="));
482
483
  if (includePatterns && includePatterns.length > 0) {
483
484
  filtered = filtered.filter(
484
485
  (url) => includePatterns.some((pattern) => url.includes(pattern))
@@ -688,7 +689,14 @@ function cleanTableRowBreaks(md) {
688
689
  (line) => line.startsWith("|") ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line
689
690
  ).join("\n");
690
691
  }
692
+ var SPAN_STRIP_THRESHOLD_BYTES = 2e6;
693
/**
 * Remove every opening, closing and self-closing `<span>` tag from an HTML
 * string while keeping the text and markup between the tags.
 *
 * Matching is case-insensitive (HTML tag names are), tolerates a `>` that
 * appears inside a quoted attribute value, and requires the tag name to be
 * exactly `span` so elements such as `<span-x>` or `<spanner>` are untouched.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The markup with all `<span ...>` / `</span>` tags removed.
 */
function stripInlineSpans(html) {
  // `(?=[\s/>])` pins the tag name to exactly "span".
  // The attribute section consumes whole quoted values first so that a ">"
  // inside title="a>b" does not terminate the tag early.
  return html.replace(/<\/?span(?=[\s/>])(?:"[^"]*"|'[^']*'|[^>"'])*>/gi, "");
}
691
696
  function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
697
+ if (html.length > SPAN_STRIP_THRESHOLD_BYTES) {
698
+ html = stripInlineSpans(html);
699
+ }
692
700
  const dom = new JSDOM(html);
693
701
  const doc = dom.window.document;
694
702
  const contentEl = doc.querySelector(contentSelector) ?? doc.body;
@@ -744,7 +752,14 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
744
752
  const url = urls[i];
745
753
  const slug = slugifyUrl(url);
746
754
  const htmlPath = join2(rawDir, `${slug}.html`);
747
- const html = await readFile2(htmlPath, "utf-8");
755
+ let html;
756
+ try {
757
+ html = await readFile2(htmlPath, "utf-8");
758
+ } catch {
759
+ console.warn(` WARNING: no cached HTML for ${url}, skipping page. Run 'grimoire scrape-urls' to fetch it.`);
760
+ completed++;
761
+ continue;
762
+ }
748
763
  const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
749
764
  await writeFile2(join2(mdDir, `${slug}.md`), page.markdown, "utf-8");
750
765
  pages[i] = page;
@@ -757,7 +772,7 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
757
772
  () => worker()
758
773
  );
759
774
  await Promise.all(workers);
760
- return pages;
775
+ return pages.filter((page) => page !== void 0);
761
776
  }
762
777
 
763
778
  // src/chunker.ts
@@ -777,7 +792,10 @@ function buildChunkId(source, url, headingSlug, index) {
777
792
  return index !== void 0 ? `${base}-${index}` : base;
778
793
  }
779
794
  function buildEmbedText(chunk) {
780
- const path = chunk.heading_path[0] === chunk.title ? chunk.heading_path.slice(1) : chunk.heading_path;
795
+ let path = chunk.heading_path[0] === chunk.title ? chunk.heading_path.slice(1) : chunk.heading_path;
796
+ if (chunk.content.startsWith("#") && path.length > 0) {
797
+ path = path.slice(0, -1);
798
+ }
781
799
  const context = [chunk.title, ...path].filter(Boolean).join(" > ");
782
800
  return context ? `${context}
783
801
 
@@ -896,11 +914,25 @@ function parseBlocks(lines) {
896
914
  flush();
897
915
  return blocks;
898
916
  }
917
/**
 * Break a single over-long line into pieces of at most `budget * 4`
 * characters (the 4-characters-per-unit factor is the one the original
 * code uses).
 *
 * Each piece is cut at the last space at or before the limit when that space
 * lies in the second half of the window; otherwise the line is hard-cut at
 * the limit. Leading whitespace of the remainder is dropped after each cut.
 *
 * @param {string} line - The line to split.
 * @param {number} budget - Size budget; the character limit is `budget * 4`.
 * @returns {string[]} The pieces in order. A line that already fits, or a
 *   budget too small to allow any progress, yields `[line]` unchanged.
 */
function splitLongLine(line, budget) {
  const maxChars = budget * 4;
  // A limit below one character (zero, negative, NaN) can never consume any
  // input: the loop below would push empty pieces forever. Return unsplit.
  if (!(maxChars >= 1) || line.length <= maxChars) return [line];
  const pieces = [];
  let rest = line;
  while (rest.length > maxChars) {
    let cut = rest.lastIndexOf(" ", maxChars);
    if (cut < maxChars / 2) {
      // No usable space in the second half of the window: hard cut.
      cut = Math.floor(maxChars);
      // Do not cut between the two halves of a UTF-16 surrogate pair; step
      // back one unit so the whole code point moves to the next piece.
      const before = rest.charCodeAt(cut - 1);
      const after = rest.charCodeAt(cut);
      const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (splitsPair && cut > 1) cut -= 1;
    }
    pieces.push(rest.slice(0, cut));
    rest = rest.slice(cut).trimStart();
  }
  if (rest) pieces.push(rest);
  return pieces;
}
899
931
  function groupLines(lines, budget) {
900
932
  const groups = [];
901
933
  let current = [];
902
934
  let tokens = 0;
903
- for (const line of lines) {
935
+ for (const line of lines.flatMap((l) => splitLongLine(l, budget))) {
904
936
  const lineTokens = estimateTokens(line) + 1;
905
937
  if (tokens + lineTokens > budget && current.length > 0) {
906
938
  groups.push(current);
@@ -1295,7 +1327,8 @@ async function getSourceChunkHashes(sourceName) {
1295
1327
  snapshot.docs.map((doc) => [doc.id, doc.data().content_hash ?? ""])
1296
1328
  );
1297
1329
  }
1298
- var TOKEN_QUERY_LIMIT = 100;
1330
+ var TOKEN_QUERY_LIMIT = 40;
1331
+ var TOKEN_RESULT_FIELDS = ["source", "url", "title", "heading_path", "content", "token_count"];
1299
1332
  async function tokenSearch(tokens, source) {
1300
1333
  if (tokens.length === 0) return [];
1301
1334
  const col = chunksCol();
@@ -1306,7 +1339,7 @@ async function tokenSearch(tokens, source) {
1306
1339
  if (source) {
1307
1340
  query = query.where("source", "==", source);
1308
1341
  }
1309
- return query.limit(TOKEN_QUERY_LIMIT).get();
1342
+ return query.select(...TOKEN_RESULT_FIELDS).limit(TOKEN_QUERY_LIMIT).get();
1310
1343
  })
1311
1344
  );
1312
1345
  for (const snapshot of snapshots) {
@@ -1338,58 +1371,23 @@ async function vectorSearch(queryEmbedding, limit, source) {
1338
1371
  const data = doc.data();
1339
1372
  const distance = data._distance ?? 0;
1340
1373
  delete data._distance;
1374
+ delete data.embedding;
1341
1375
  return { id: doc.id, data, distance };
1342
1376
  });
1343
1377
  }
1344
1378
 
1345
- // src/reranker.ts
1346
- function getRerankerUrl() {
1347
- const url = process.env.RERANKER_URL;
1348
- if (!url) {
1349
- throw new Error("RERANKER_URL environment variable is not set");
1350
- }
1351
- return url;
1352
- }
1353
- async function rerank(query, documents, topN = 5) {
1354
- const baseUrl = getRerankerUrl();
1355
- const response = await fetch(`${baseUrl}/v1/rerank`, {
1356
- method: "POST",
1357
- headers: { "Content-Type": "application/json" },
1358
- body: JSON.stringify({ query, documents, top_n: topN })
1359
- });
1360
- if (!response.ok) {
1361
- throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
1362
- }
1363
- const data = await response.json();
1364
- return data.results;
1365
- }
1366
-
1367
1379
  // src/search.ts
1368
1380
  var DEFAULT_CANDIDATES = 50;
1369
1381
  var RRF_K = 60;
1370
1382
  function hasReranker() {
1371
1383
  return !!process.env.RERANKER_URL;
1372
1384
  }
1373
- function isVectorLike(value) {
1374
- return typeof value === "object" && value !== null && typeof value.toArray === "function";
1375
- }
1376
- function cosineSimilarity(a, b) {
1377
- let dot = 0;
1378
- let normA = 0;
1379
- let normB = 0;
1380
- for (let i = 0; i < a.length; i++) {
1381
- dot += a[i] * b[i];
1382
- normA += a[i] * a[i];
1383
- normB += b[i] * b[i];
1384
- }
1385
- return dot / (Math.sqrt(normA) * Math.sqrt(normB));
1386
- }
1387
1385
  function contextualText(data) {
1388
- return buildEmbedText({
1389
- title: data.title,
1390
- heading_path: data.heading_path,
1391
- content: data.content
1392
- });
1386
+ return rerankDocText(
1387
+ data.title,
1388
+ data.heading_path,
1389
+ data.content
1390
+ );
1393
1391
  }
1394
1392
  function toSearchResult(candidate, relevance) {
1395
1393
  const data = candidate.data;
@@ -1426,19 +1424,17 @@ async function search(query, options = {}) {
1426
1424
  existing.fusedScore += lexicalScore;
1427
1425
  return;
1428
1426
  }
1429
- const embedding = hit.data.embedding;
1430
- delete hit.data.embedding;
1431
1427
  pool.set(hit.id, {
1432
1428
  id: hit.id,
1433
1429
  data: hit.data,
1434
1430
  fusedScore: lexicalScore,
1435
- similarity: isVectorLike(embedding) ? cosineSimilarity(queryEmbedding, embedding.toArray()) : null
1431
+ similarity: null
1436
1432
  });
1437
1433
  });
1438
1434
  const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
1439
1435
  if (fused.length === 0) return [];
1440
1436
  if (hasReranker()) {
1441
- const rerankPool = fused.slice(0, candidates);
1437
+ const rerankPool = fused.slice(0, RERANK_POOL_SIZE);
1442
1438
  const documents = rerankPool.map((c) => contextualText(c.data));
1443
1439
  const reranked = await rerank(query, documents, topN);
1444
1440
  return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
@@ -1761,7 +1757,10 @@ Source "${name}" added to config/sources.yaml`);
1761
1757
  }
1762
1758
  }
1763
1759
  var EMBED_WINDOW = 1e3;
1764
- async function syncChunks(sourceName, allChunks, urlCount, version) {
1760
+ var SHRINK_GUARD_MIN_EXISTING = 200;
1761
+ var SHRINK_GUARD_MAX_DELETE_RATIO = 0.5;
1762
+ var GROWTH_GUARD_MAX_RATIO = 5;
1763
+ async function syncChunks(sourceName, allChunks, urlCount, version, allowShrink = false) {
1765
1764
  console.log(" Comparing with Firestore...");
1766
1765
  const existing = await getSourceChunkHashes(sourceName);
1767
1766
  const currentIds = new Set(allChunks.map((c) => c.id));
@@ -1772,10 +1771,15 @@ async function syncChunks(sourceName, allChunks, urlCount, version) {
1772
1771
  console.log(
1773
1772
  ` Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
1774
1773
  );
1775
- if (toDelete.length > 0) {
1776
- await deleteChunksByIds(toDelete, (cur, total) => {
1777
- console.log(` [${cur}/${total}] deleted`);
1778
- });
1774
+ if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && toDelete.length > existing.size * SHRINK_GUARD_MAX_DELETE_RATIO) {
1775
+ throw new Error(
1776
+ `Refusing to delete ${toDelete.length} of ${existing.size} stored chunks for "${sourceName}": URL discovery likely found only part of the site. Verify the sitemap/nav/urls.json, then re-run with --allow-shrink if the change is expected, or --full to rebuild from scratch.`
1777
+ );
1778
+ }
1779
+ if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && allChunks.length > existing.size * GROWTH_GUARD_MAX_RATIO) {
1780
+ throw new Error(
1781
+ `Refusing to grow "${sourceName}" from ${existing.size} to ${allChunks.length} chunks (>${GROWTH_GUARD_MAX_RATIO}x): URL discovery may have escaped the doc tree. Verify include_patterns/nav_selector, then re-run with --allow-shrink if the growth is real.`
1782
+ );
1779
1783
  }
1780
1784
  for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
1781
1785
  const window = toEmbed.slice(i, i + EMBED_WINDOW);
@@ -1788,9 +1792,24 @@ async function syncChunks(sourceName, allChunks, urlCount, version) {
1788
1792
  console.log(` [${i + cur}/${toEmbed.length}] stored`);
1789
1793
  });
1790
1794
  }
1795
+ if (toDelete.length > 0) {
1796
+ await deleteChunksByIds(toDelete, (cur, total) => {
1797
+ console.log(` [${cur}/${total}] deleted`);
1798
+ });
1799
+ }
1791
1800
  await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
1792
1801
  console.log(` Done. ${allChunks.length} chunks live for "${sourceName}".`);
1793
1802
  }
1803
/**
 * Drop every item whose URL contains one of the exclude patterns
 * (plain substring match) and log how many were removed.
 *
 * @param {Array} items - Items to filter (URLs, pages, ...).
 * @param {(item: any) => string} getUrl - Returns the URL of an item.
 * @param {Array<string>|undefined|null} excludePatterns - Substrings that
 *   mark a URL for exclusion. Empty or nullish entries are ignored, because
 *   an empty string is a substring of every URL and would exclude everything.
 * @param {string} label - Noun used in the log line (e.g. "cached pages").
 * @returns {Array} `items` itself when there is nothing to apply, otherwise
 *   a new array containing only the items that were kept.
 */
function applyExcludePatterns(items, getUrl, excludePatterns, label) {
  if (!excludePatterns || excludePatterns.length === 0) return items;
  // Normalise once: coerce to string (String.prototype.includes would do the
  // same coercion) and discard blank entries that would match everything.
  const patterns = excludePatterns
    .map((pattern) => (pattern == null ? "" : String(pattern)))
    .filter((pattern) => pattern !== "");
  if (patterns.length === 0) return items;
  const kept = items.filter((item) => {
    // Resolve the URL once per item rather than once per pattern.
    const url = getUrl(item);
    // An item without a string URL cannot match any pattern; keep it.
    if (typeof url !== "string") return true;
    return !patterns.some((pattern) => url.includes(pattern));
  });
  if (kept.length < items.length) {
    console.log(`  Excluded ${items.length - kept.length} ${label} via exclude_patterns.`);
  }
  return kept;
}
1794
1813
  async function readCachedMarkdownPages(mdDir) {
1795
1814
  const mdFiles = await readdir(mdDir).catch(() => []);
1796
1815
  const pages = [];
@@ -1850,14 +1869,15 @@ async function cmdRefresh() {
1850
1869
  limit: { type: "string" },
1851
1870
  "from-html": { type: "boolean", default: false },
1852
1871
  "from-markdown": { type: "boolean", default: false },
1853
- "skip-store": { type: "boolean", default: false }
1872
+ "skip-store": { type: "boolean", default: false },
1873
+ "allow-shrink": { type: "boolean", default: false }
1854
1874
  },
1855
1875
  allowPositionals: true
1856
1876
  });
1857
1877
  const config = await loadConfig(CONFIG_PATH);
1858
1878
  const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
1859
1879
  if (!args.values.all && !sourcesToRefresh[0]) {
1860
- console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
1880
+ console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--allow-shrink] [--limit <n>] [--concurrency <n>]");
1861
1881
  process.exit(1);
1862
1882
  }
1863
1883
  const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
@@ -1890,6 +1910,7 @@ Refreshing "${sourceName}"...`);
1890
1910
  console.error(" No cached markdown found. Run with --from-html first.");
1891
1911
  process.exit(1);
1892
1912
  }
1913
+ pages = applyExcludePatterns(pages, (p) => p.url, source.exclude_patterns, "cached pages");
1893
1914
  console.log(` Found ${pages.length} cached pages.`);
1894
1915
  } else if (source.llms_full_url && !args.values["from-html"]) {
1895
1916
  console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
@@ -1908,6 +1929,7 @@ Refreshing "${sourceName}"...`);
1908
1929
  if (args.values["from-html"]) {
1909
1930
  console.log(" Reading URLs from cached HTML...");
1910
1931
  urls = await recoverUrlsFromHtml(rawDir);
1932
+ urls = applyExcludePatterns(urls, (u) => u, source.exclude_patterns, "cached URLs");
1911
1933
  console.log(` Found ${urls.length} cached pages.`);
1912
1934
  } else {
1913
1935
  console.log(" Scraping URLs...");
@@ -1941,7 +1963,7 @@ Refreshing "${sourceName}"...`);
1941
1963
  console.log(` Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
1942
1964
  continue;
1943
1965
  }
1944
- await syncChunks(sourceName, allChunks, pages.length, source.version);
1966
+ await syncChunks(sourceName, allChunks, pages.length, source.version, args.values["allow-shrink"]);
1945
1967
  }
1946
1968
  }
1947
1969
  async function cmdSearch() {
@@ -2149,4 +2171,4 @@ var ADMIN_COMMANDS = {
2149
2171
  export {
2150
2172
  ADMIN_COMMANDS
2151
2173
  };
2152
- //# sourceMappingURL=admin-WUNBDKBC.js.map
2174
+ //# sourceMappingURL=admin-4C3QNVQE.js.map