@astrofoundry/grimoire 3.31.1 → 3.32.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,13 @@
1
1
  import {
2
+ RERANK_POOL_SIZE,
2
3
  __commonJS,
3
4
  __toESM,
4
5
  bold,
5
6
  cyan,
7
+ rerank,
8
+ rerankDocText,
6
9
  yellow
7
- } from "./chunk-R46N6C3C.js";
10
+ } from "./chunk-VTTJCQRQ.js";
8
11
 
9
12
  // node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
10
13
  var require_turndown_plugin_gfm_cjs = __commonJS({
@@ -476,9 +479,7 @@ function slugifyUrl(url) {
476
479
 
477
480
  // src/scraper.ts
478
481
  function filterUrls(urls, includePatterns, excludePatterns) {
479
- let filtered = urls.filter(
480
- (url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#")
481
- );
482
+ let filtered = urls.map((url) => url.split("#")[0]).filter((url) => url.startsWith("http") && !url.includes("?hl="));
482
483
  if (includePatterns && includePatterns.length > 0) {
483
484
  filtered = filtered.filter(
484
485
  (url) => includePatterns.some((pattern) => url.includes(pattern))
@@ -688,7 +689,14 @@ function cleanTableRowBreaks(md) {
688
689
  (line) => line.startsWith("|") ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line
689
690
  ).join("\n");
690
691
  }
692
// Size cutoff above which extractContent strips <span> tags before handing the
// markup to JSDOM. NOTE: despite the name, it is compared against html.length
// (string length), not an encoded byte count.
var SPAN_STRIP_THRESHOLD_BYTES = 2e6;

/**
 * Removes every opening and closing <span> tag from an HTML string while
 * leaving the text and markup between the tags in place.
 *
 * - Tag names are matched case-insensitively (<SPAN> is the same element).
 * - The name must be followed by whitespace, "/" or ">", so elements such as
 *   <span-label> or <spanner> are left untouched.
 * - Quoted attribute values are consumed as a unit, so a ">" inside
 *   title="a>b" does not end the tag early and leave attribute debris behind.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The markup with all <span ...> and </span> tags removed.
 */
function stripInlineSpans(html) {
  return html.replace(/<\/?span(?=[\s\/>])(?:"[^"]*"|'[^']*'|[^'">])*>/gi, "");
}
691
696
  function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
697
+ if (html.length > SPAN_STRIP_THRESHOLD_BYTES) {
698
+ html = stripInlineSpans(html);
699
+ }
692
700
  const dom = new JSDOM(html);
693
701
  const doc = dom.window.document;
694
702
  const contentEl = doc.querySelector(contentSelector) ?? doc.body;
@@ -744,7 +752,14 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
744
752
  const url = urls[i];
745
753
  const slug = slugifyUrl(url);
746
754
  const htmlPath = join2(rawDir, `${slug}.html`);
747
- const html = await readFile2(htmlPath, "utf-8");
755
+ let html;
756
+ try {
757
+ html = await readFile2(htmlPath, "utf-8");
758
+ } catch {
759
+ console.warn(` WARNING: no cached HTML for ${url}, skipping page. Run 'grimoire scrape-urls' to fetch it.`);
760
+ completed++;
761
+ continue;
762
+ }
748
763
  const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
749
764
  await writeFile2(join2(mdDir, `${slug}.md`), page.markdown, "utf-8");
750
765
  pages[i] = page;
@@ -757,7 +772,33 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
757
772
  () => worker()
758
773
  );
759
774
  await Promise.all(workers);
760
- return pages;
775
+ return pages.filter((page) => page !== void 0);
776
+ }
777
+
778
+ // src/tokens.ts
779
+ import { createHash } from "node:crypto";
780
// Matches code-like identifiers in prose. Three shapes are accepted:
//   1. words joined by "_", "." or "-"        (foo_bar, pkg.module, my-flag)
//   2. camelCase                               (getUserName)
//   3. PascalCase with at least two humps      (HttpClient)
// The lookbehind/lookahead keep a match from starting or ending in the middle
// of a longer alphanumeric run.
var IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
// Tokens shorter or longer than these bounds are discarded.
var MIN_TOKEN_LENGTH = 4;
var MAX_TOKEN_LENGTH = 80;
// Default cap on the number of distinct tokens returned for one text.
var MAX_TOKENS_PER_CHUNK = 100;

/**
 * Prepares markdown text for identifier extraction: drops link targets
 * ("](...)" becomes "]"), blanks out bare http(s) URLs, and removes the
 * backslash from markdown-escaped punctuation (e.g. "\_" becomes "_").
 *
 * @param {string} text - Markdown or plain text.
 * @returns {string} Text with link targets/URLs removed and escapes undone.
 */
function normalizeForTokens(text) {
  return text.replace(/\]\([^)]*\)/g, "]").replace(/https?:\/\/\S+/g, " ").replace(/\\([_*[\]()#`~-])/g, "$1");
}

/**
 * Collects distinct, lower-cased identifier tokens from text in order of first
 * appearance.
 *
 * @param {string} text - Text to scan.
 * @param {number} [limit=MAX_TOKENS_PER_CHUNK] - Maximum number of tokens to
 *   return. A limit that is not greater than zero yields an empty array.
 * @returns {string[]} At most `limit` unique tokens.
 */
function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
  // Guard up front: the in-loop cap check only runs after an insertion, so
  // without this a limit of 0 (or a negative/NaN limit) would still yield one token.
  if (!(limit > 0)) return [];
  const seen = new Set();
  for (const match of normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN)) {
    const token = match[0].toLowerCase();
    if (token.length < MIN_TOKEN_LENGTH || token.length > MAX_TOKEN_LENGTH) continue;
    seen.add(token);
    if (seen.size >= limit) break;
  }
  return [...seen];
}

/**
 * Extracts identifier tokens from a search query, capped at five tokens.
 *
 * @param {string} query - The user's query text.
 * @returns {string[]} Up to five unique lower-cased identifier tokens.
 */
function extractQueryTokens(query) {
  return extractIdentifierTokens(query, 5);
}

/**
 * Returns the SHA-256 digest of the given text as a lowercase hex string.
 *
 * @param {string} text - Content to hash.
 * @returns {string} 64-character hex digest.
 */
function contentHash(text) {
  return createHash("sha256").update(text).digest("hex");
}
762
803
 
763
804
  // src/chunker.ts
@@ -1010,6 +1051,11 @@ function chunkMarkdown(markdown, source, url, title) {
1010
1051
  usedIds.add(id);
1011
1052
  return id;
1012
1053
  }
1054
+ const slugCounts = /* @__PURE__ */ new Map();
1055
+ for (const section of sections) {
1056
+ const slug = section.heading ? slugifyHeading(section.heading) : "intro";
1057
+ slugCounts.set(slug, (slugCounts.get(slug) ?? 0) + 1);
1058
+ }
1013
1059
  for (const section of sections) {
1014
1060
  const headingLine = section.heading ? `${"#".repeat(section.level)} ${section.heading}
1015
1061
 
@@ -1017,7 +1063,8 @@ function chunkMarkdown(markdown, source, url, title) {
1017
1063
  const body = section.lines.join("\n").trim();
1018
1064
  const content = headingLine + body;
1019
1065
  if (!content.trim()) continue;
1020
- const headingSlug = section.heading ? slugifyHeading(section.heading) : "intro";
1066
+ const baseSlug = section.heading ? slugifyHeading(section.heading) : "intro";
1067
+ const headingSlug = (slugCounts.get(baseSlug) ?? 0) > 1 ? `${baseSlug}-${contentHash(content).slice(0, 8)}` : baseSlug;
1021
1068
  if (estimateTokens(content) <= MAX_TOKENS) {
1022
1069
  chunks.push({
1023
1070
  id: uniqueId(headingSlug),
@@ -1050,32 +1097,6 @@ function chunkMarkdown(markdown, source, url, title) {
1050
1097
  return chunks;
1051
1098
  }
1052
1099
 
1053
- // src/tokens.ts
1054
- import { createHash } from "node:crypto";
1055
- var IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
1056
- var MIN_TOKEN_LENGTH = 4;
1057
- var MAX_TOKEN_LENGTH = 80;
1058
- var MAX_TOKENS_PER_CHUNK = 100;
1059
- function normalizeForTokens(text) {
1060
- return text.replace(/\]\([^)]*\)/g, "]").replace(/https?:\/\/\S+/g, " ").replace(/\\([_*[\]()#`~-])/g, "$1");
1061
- }
1062
- function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
1063
- const seen = /* @__PURE__ */ new Set();
1064
- for (const match of normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN)) {
1065
- const token = match[0].toLowerCase();
1066
- if (token.length < MIN_TOKEN_LENGTH || token.length > MAX_TOKEN_LENGTH) continue;
1067
- seen.add(token);
1068
- if (seen.size >= limit) break;
1069
- }
1070
- return [...seen];
1071
- }
1072
- function extractQueryTokens(query) {
1073
- return extractIdentifierTokens(query, 5);
1074
- }
1075
- function contentHash(text) {
1076
- return createHash("sha256").update(text).digest("hex");
1077
- }
1078
-
1079
1100
  // src/embedder.ts
1080
1101
  import { GoogleGenerativeAI } from "@google/generative-ai";
1081
1102
  var BATCH_SIZE = 50;
@@ -1361,26 +1382,50 @@ async function vectorSearch(queryEmbedding, limit, source) {
1361
1382
  });
1362
1383
  }
1363
1384
 
1364
- // src/reranker.ts
1365
- function getRerankerUrl() {
1366
- const url = process.env.RERANKER_URL;
1367
- if (!url) {
1368
- throw new Error("RERANKER_URL environment variable is not set");
1369
- }
1370
- return url;
1371
- }
1372
- async function rerank(query, documents, topN = 5) {
1373
- const baseUrl = getRerankerUrl();
1374
- const response = await fetch(`${baseUrl}/v1/rerank`, {
1375
- method: "POST",
1376
- headers: { "Content-Type": "application/json" },
1377
- body: JSON.stringify({ query, documents, top_n: topN })
1378
- });
1379
- if (!response.ok) {
1380
- throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
1385
+ // src/sync.ts
1386
// Number of chunks embedded and stored per pass of the sync loop.
var EMBED_WINDOW = 1e3;
// The shrink/growth guards only apply once this many chunks are already stored.
var SHRINK_GUARD_MIN_EXISTING = 200;
// Largest fraction of stored chunks a single sync may delete while guarded.
var SHRINK_GUARD_MAX_DELETE_RATIO = 0.5;
// Largest multiple of the stored chunk count a single sync may reach while guarded.
var GROWTH_GUARD_MAX_RATIO = 5;

/**
 * Brings the stored chunks of one source in line with a freshly built chunk
 * list: embeds and stores chunks whose content hash differs from the stored
 * one, deletes stored ids that no longer exist, then updates the source
 * metadata.
 *
 * @param {string} sourceName - Name of the source being synced.
 * @param {Array} allChunks - Every chunk produced for the source in this run.
 * @param {number} urlCount - Page/URL count forwarded to updateSourceMeta.
 * @param {*} version - Source version forwarded to updateSourceMeta.
 * @param {boolean} [allowShrink=false] - When true, both the shrink guard and
 *   the growth guard are skipped.
 * @throws {Error} When a guard is active and the sync would delete more than
 *   half of the stored chunks, or grow the source past five times its stored size.
 */
async function syncChunks(sourceName, allChunks, urlCount, version, allowShrink = false) {
  console.log(" Comparing with Firestore...");
  const storedHashes = await getSourceChunkHashes(sourceName);

  const liveIds = new Set(allChunks.map((chunk) => chunk.id));
  const staleIds = [...storedHashes.keys()].filter((id) => !liveIds.has(id));
  const pending = allChunks.filter((chunk) => {
    const freshHash = contentHash(buildEmbedText(chunk));
    return storedHashes.get(chunk.id) !== freshHash;
  });
  const unchangedCount = allChunks.length - pending.length;
  console.log(
    ` Sync: ${pending.length} to embed, ${unchangedCount} unchanged, ${staleIds.length} to delete.`
  );

  // Both guards share the same precondition: not overridden, and enough stored
  // chunks for the ratios to be meaningful.
  const guardsActive = !allowShrink && storedHashes.size >= SHRINK_GUARD_MIN_EXISTING;
  if (guardsActive) {
    const maxDeletes = storedHashes.size * SHRINK_GUARD_MAX_DELETE_RATIO;
    if (staleIds.length > maxDeletes) {
      throw new Error(
        `Refusing to delete ${staleIds.length} of ${storedHashes.size} stored chunks for "${sourceName}": URL discovery likely found only part of the site. Verify the sitemap/nav/urls.json, then re-run with --allow-shrink if the change is expected, or --full to rebuild from scratch.`
      );
    }
    const maxChunks = storedHashes.size * GROWTH_GUARD_MAX_RATIO;
    if (allChunks.length > maxChunks) {
      throw new Error(
        `Refusing to grow "${sourceName}" from ${storedHashes.size} to ${allChunks.length} chunks (>${GROWTH_GUARD_MAX_RATIO}x): URL discovery may have escaped the doc tree. Verify include_patterns/nav_selector, then re-run with --allow-shrink if the growth is real.`
      );
    }
  }

  // Embed and store in fixed-size windows; each window is stored before the
  // next one is embedded.
  let offset = 0;
  while (offset < pending.length) {
    const base = offset;
    const batch = pending.slice(base, base + EMBED_WINDOW);
    const texts = batch.map((chunk) => buildEmbedText(chunk));
    const embeddings = await embedTexts(texts, {
      onProgress: (done) => {
        console.log(` [${base + done}/${pending.length}] embedded`);
      }
    });
    await storeChunks(batch, embeddings, (cur) => {
      console.log(` [${base + cur}/${pending.length}] stored`);
    });
    offset += EMBED_WINDOW;
  }

  // Deletion runs only after every new/changed chunk has been stored.
  if (staleIds.length > 0) {
    await deleteChunksByIds(staleIds, (cur, total) => {
      console.log(` [${cur}/${total}] deleted`);
    });
  }

  await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
  console.log(` Done. ${allChunks.length} chunks live for "${sourceName}".`);
}
1385
1430
 
1386
1431
  // src/search.ts
@@ -1390,11 +1435,11 @@ function hasReranker() {
1390
1435
  return !!process.env.RERANKER_URL;
1391
1436
  }
1392
1437
  function contextualText(data) {
1393
- return buildEmbedText({
1394
- title: data.title,
1395
- heading_path: data.heading_path,
1396
- content: data.content
1397
- });
1438
+ return rerankDocText(
1439
+ data.title,
1440
+ data.heading_path,
1441
+ data.content
1442
+ );
1398
1443
  }
1399
1444
  function toSearchResult(candidate, relevance) {
1400
1445
  const data = candidate.data;
@@ -1420,8 +1465,7 @@ async function search(query, options = {}) {
1420
1465
  pool.set(result.id, {
1421
1466
  id: result.id,
1422
1467
  data: result.data,
1423
- fusedScore: 1 / (RRF_K + rank + 1),
1424
- similarity: 1 - result.distance
1468
+ fusedScore: 1 / (RRF_K + rank + 1)
1425
1469
  });
1426
1470
  });
1427
1471
  lexicalHits.forEach((hit, rank) => {
@@ -1434,22 +1478,21 @@ async function search(query, options = {}) {
1434
1478
  pool.set(hit.id, {
1435
1479
  id: hit.id,
1436
1480
  data: hit.data,
1437
- fusedScore: lexicalScore,
1438
- similarity: null
1481
+ fusedScore: lexicalScore
1439
1482
  });
1440
1483
  });
1441
1484
  const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
1442
1485
  if (fused.length === 0) return [];
1443
1486
  if (hasReranker()) {
1444
- const rerankPool = fused.slice(0, candidates);
1487
+ const rerankPool = fused.slice(0, RERANK_POOL_SIZE);
1445
1488
  const documents = rerankPool.map((c) => contextualText(c.data));
1446
1489
  const reranked = await rerank(query, documents, topN);
1447
1490
  return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
1448
1491
  }
1449
- return fused.slice(0, topN).map((candidate) => {
1450
- const similarity = candidate.similarity ?? 0;
1451
- return toSearchResult(candidate, Math.max(0, (1 + similarity) / 2));
1452
- });
1492
+ const maxFused = fused[0].fusedScore;
1493
+ return fused.slice(0, topN).map(
1494
+ (candidate) => toSearchResult(candidate, candidate.fusedScore / maxFused)
1495
+ );
1453
1496
  }
1454
1497
 
1455
1498
  // src/apikey.ts
@@ -1763,36 +1806,15 @@ Source "${name}" added to config/sources.yaml`);
1763
1806
  await browser.close();
1764
1807
  }
1765
1808
  }
1766
- var EMBED_WINDOW = 1e3;
1767
- async function syncChunks(sourceName, allChunks, urlCount, version) {
1768
- console.log(" Comparing with Firestore...");
1769
- const existing = await getSourceChunkHashes(sourceName);
1770
- const currentIds = new Set(allChunks.map((c) => c.id));
1771
- const toDelete = [...existing.keys()].filter((id) => !currentIds.has(id));
1772
- const toEmbed = allChunks.filter(
1773
- (chunk) => existing.get(chunk.id) !== contentHash(buildEmbedText(chunk))
1809
/**
 * Filters out every item whose URL contains one of the exclude substrings and
 * logs how many items were dropped.
 *
 * Blank or non-string pattern entries are ignored: an empty string is a
 * substring of every URL and would otherwise exclude the whole source.
 *
 * @param {Array} items - Items to filter (URLs, pages, ...).
 * @param {(item: *) => string} getUrl - Returns the URL of an item; called
 *   once per item.
 * @param {string[] | undefined | null} excludePatterns - Substrings to match.
 * @param {string} label - Noun used in the log line (e.g. "cached pages").
 * @returns {Array} The original array when there is nothing to exclude by,
 *   otherwise a new array holding only the items that were kept.
 */
function applyExcludePatterns(items, getUrl, excludePatterns, label) {
  const patterns = (excludePatterns ?? []).filter(
    (pattern) => typeof pattern === "string" && pattern.length > 0
  );
  if (patterns.length === 0) return items;
  const kept = items.filter((item) => {
    // Resolve the URL once per item rather than once per pattern.
    const url = getUrl(item);
    return !patterns.some((pattern) => url.includes(pattern));
  });
  if (kept.length < items.length) {
    console.log(` Excluded ${items.length - kept.length} ${label} via exclude_patterns.`);
  }
  return kept;
}
1797
1819
  async function readCachedMarkdownPages(mdDir) {
1798
1820
  const mdFiles = await readdir(mdDir).catch(() => []);
@@ -1853,14 +1875,15 @@ async function cmdRefresh() {
1853
1875
  limit: { type: "string" },
1854
1876
  "from-html": { type: "boolean", default: false },
1855
1877
  "from-markdown": { type: "boolean", default: false },
1856
- "skip-store": { type: "boolean", default: false }
1878
+ "skip-store": { type: "boolean", default: false },
1879
+ "allow-shrink": { type: "boolean", default: false }
1857
1880
  },
1858
1881
  allowPositionals: true
1859
1882
  });
1860
1883
  const config = await loadConfig(CONFIG_PATH);
1861
1884
  const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
1862
1885
  if (!args.values.all && !sourcesToRefresh[0]) {
1863
- console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
1886
+ console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--allow-shrink] [--limit <n>] [--concurrency <n>]");
1864
1887
  process.exit(1);
1865
1888
  }
1866
1889
  const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
@@ -1893,6 +1916,7 @@ Refreshing "${sourceName}"...`);
1893
1916
  console.error(" No cached markdown found. Run with --from-html first.");
1894
1917
  process.exit(1);
1895
1918
  }
1919
+ pages = applyExcludePatterns(pages, (p) => p.url, source.exclude_patterns, "cached pages");
1896
1920
  console.log(` Found ${pages.length} cached pages.`);
1897
1921
  } else if (source.llms_full_url && !args.values["from-html"]) {
1898
1922
  console.log(` Fetching llms-full.txt from ${source.llms_full_url}...`);
@@ -1911,6 +1935,7 @@ Refreshing "${sourceName}"...`);
1911
1935
  if (args.values["from-html"]) {
1912
1936
  console.log(" Reading URLs from cached HTML...");
1913
1937
  urls = await recoverUrlsFromHtml(rawDir);
1938
+ urls = applyExcludePatterns(urls, (u) => u, source.exclude_patterns, "cached URLs");
1914
1939
  console.log(` Found ${urls.length} cached pages.`);
1915
1940
  } else {
1916
1941
  console.log(" Scraping URLs...");
@@ -1944,7 +1969,7 @@ Refreshing "${sourceName}"...`);
1944
1969
  console.log(` Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
1945
1970
  continue;
1946
1971
  }
1947
- await syncChunks(sourceName, allChunks, pages.length, source.version);
1972
+ await syncChunks(sourceName, allChunks, pages.length, source.version, args.values["allow-shrink"]);
1948
1973
  }
1949
1974
  }
1950
1975
  async function cmdSearch() {
@@ -2152,4 +2177,4 @@ var ADMIN_COMMANDS = {
2152
2177
  export {
2153
2178
  ADMIN_COMMANDS
2154
2179
  };
2155
- //# sourceMappingURL=admin-ENGUPLQ6.js.map
2180
+ //# sourceMappingURL=admin-MA5SI5CH.js.map