npm - @astrofoundry/grimoire - Versions diffs - 3.31.1 → 3.32.1 - Mend

@astrofoundry/grimoire 3.31.1 → 3.32.1

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (9)

package/dist/{admin-ENGUPLQ6.js → admin-MA5SI5CH.js} +123 -98
package/dist/admin-MA5SI5CH.js.map +7 -0
package/dist/{chunk-R46N6C3C.js → chunk-VTTJCQRQ.js} +37 -2
package/dist/chunk-VTTJCQRQ.js.map +7 -0
package/dist/cli.js +126 -18
package/dist/cli.js.map +2 -2
package/package.json +1 -1
package/dist/admin-ENGUPLQ6.js.map +0 -7
package/dist/chunk-R46N6C3C.js.map +0 -7

package/dist/{admin-ENGUPLQ6.js → admin-MA5SI5CH.js} RENAMED

@@ -1,10 +1,13 @@
 import {
+  RERANK_POOL_SIZE,
   __commonJS,
   __toESM,
   bold,
   cyan,
+  rerank,
+  rerankDocText,
   yellow
-} from "./chunk-R46N6C3C.js";
+} from "./chunk-VTTJCQRQ.js";
 // node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
 var require_turndown_plugin_gfm_cjs = __commonJS({
@@ -476,9 +479,7 @@ function slugifyUrl(url) {
 // src/scraper.ts
 function filterUrls(urls, includePatterns, excludePatterns) {
-  let filtered = urls.filter(
-    (url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#")
-  );
+  let filtered = urls.map((url) => url.split("#")[0]).filter((url) => url.startsWith("http") && !url.includes("?hl="));
   if (includePatterns && includePatterns.length > 0) {
     filtered = filtered.filter(
       (url) => includePatterns.some((pattern) => url.includes(pattern))
@@ -688,7 +689,14 @@ function cleanTableRowBreaks(md) {
     (line) => line.startsWith("|") ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line
   ).join("\n");
 }
+var SPAN_STRIP_THRESHOLD_BYTES = 2e6;
+function stripInlineSpans(html) {
+  return html.replace(/<\/?span\b[^>]*>/g, "");
+}
 function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
+  if (html.length > SPAN_STRIP_THRESHOLD_BYTES) {
+    html = stripInlineSpans(html);
+  }
   const dom = new JSDOM(html);
   const doc = dom.window.document;
   const contentEl = doc.querySelector(contentSelector) ?? doc.body;
@@ -744,7 +752,14 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
       const url = urls[i];
       const slug = slugifyUrl(url);
       const htmlPath = join2(rawDir, `${slug}.html`);
-      const html = await readFile2(htmlPath, "utf-8");
+      let html;
+      try {
+        html = await readFile2(htmlPath, "utf-8");
+      } catch {
+        console.warn(`  WARNING: no cached HTML for ${url}, skipping page. Run 'grimoire scrape-urls' to fetch it.`);
+        completed++;
+        continue;
+      }
       const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
       await writeFile2(join2(mdDir, `${slug}.md`), page.markdown, "utf-8");
       pages[i] = page;
@@ -757,7 +772,33 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
     () => worker()
   );
   await Promise.all(workers);
-  return pages;
+  return pages.filter((page) => page !== void 0);
+}
+// src/tokens.ts
+import { createHash } from "node:crypto";
+var IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
+var MIN_TOKEN_LENGTH = 4;
+var MAX_TOKEN_LENGTH = 80;
+var MAX_TOKENS_PER_CHUNK = 100;
+function normalizeForTokens(text) {
+  return text.replace(/\]\([^)]*\)/g, "]").replace(/https?:\/\/\S+/g, " ").replace(/\\([_*[\]()#`~-])/g, "$1");
+}
+function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
+  const seen = /* @__PURE__ */ new Set();
+  for (const match of normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN)) {
+    const token = match[0].toLowerCase();
+    if (token.length < MIN_TOKEN_LENGTH || token.length > MAX_TOKEN_LENGTH) continue;
+    seen.add(token);
+    if (seen.size >= limit) break;
+  }
+  return [...seen];
+}
+function extractQueryTokens(query) {
+  return extractIdentifierTokens(query, 5);
+}
+function contentHash(text) {
+  return createHash("sha256").update(text).digest("hex");
 }
 // src/chunker.ts
@@ -1010,6 +1051,11 @@ function chunkMarkdown(markdown, source, url, title) {
     usedIds.add(id);
     return id;
   }
+  const slugCounts = /* @__PURE__ */ new Map();
+  for (const section of sections) {
+    const slug = section.heading ? slugifyHeading(section.heading) : "intro";
+    slugCounts.set(slug, (slugCounts.get(slug) ?? 0) + 1);
+  }
   for (const section of sections) {
     const headingLine = section.heading ? `${"#".repeat(section.level)} ${section.heading}
@@ -1017,7 +1063,8 @@ function chunkMarkdown(markdown, source, url, title) {
     const body = section.lines.join("\n").trim();
     const content = headingLine + body;
     if (!content.trim()) continue;
-    const headingSlug = section.heading ? slugifyHeading(section.heading) : "intro";
+    const baseSlug = section.heading ? slugifyHeading(section.heading) : "intro";
+    const headingSlug = (slugCounts.get(baseSlug) ?? 0) > 1 ? `${baseSlug}-${contentHash(content).slice(0, 8)}` : baseSlug;
     if (estimateTokens(content) <= MAX_TOKENS) {
       chunks.push({
         id: uniqueId(headingSlug),
@@ -1050,32 +1097,6 @@ function chunkMarkdown(markdown, source, url, title) {
   return chunks;
 }
-// src/tokens.ts
-import { createHash } from "node:crypto";
-var IDENTIFIER_PATTERN = /(?<![A-Za-z0-9._])(?:[A-Za-z][A-Za-z0-9]*(?:[_.-][A-Za-z0-9]+)+|[a-z][a-z0-9]*(?:[A-Z][a-z0-9]*)+|(?:[A-Z][a-z0-9]+){2,})(?![A-Za-z0-9])/g;
-var MIN_TOKEN_LENGTH = 4;
-var MAX_TOKEN_LENGTH = 80;
-var MAX_TOKENS_PER_CHUNK = 100;
-function normalizeForTokens(text) {
-  return text.replace(/\]\([^)]*\)/g, "]").replace(/https?:\/\/\S+/g, " ").replace(/\\([_*[\]()#`~-])/g, "$1");
-}
-function extractIdentifierTokens(text, limit = MAX_TOKENS_PER_CHUNK) {
-  const seen = /* @__PURE__ */ new Set();
-  for (const match of normalizeForTokens(text).matchAll(IDENTIFIER_PATTERN)) {
-    const token = match[0].toLowerCase();
-    if (token.length < MIN_TOKEN_LENGTH || token.length > MAX_TOKEN_LENGTH) continue;
-    seen.add(token);
-    if (seen.size >= limit) break;
-  }
-  return [...seen];
-}
-function extractQueryTokens(query) {
-  return extractIdentifierTokens(query, 5);
-}
-function contentHash(text) {
-  return createHash("sha256").update(text).digest("hex");
-}
 // src/embedder.ts
 import { GoogleGenerativeAI } from "@google/generative-ai";
 var BATCH_SIZE = 50;
@@ -1361,26 +1382,50 @@ async function vectorSearch(queryEmbedding, limit, source) {
   });
 }
-// src/reranker.ts
-function getRerankerUrl() {
-  const url = process.env.RERANKER_URL;
-  if (!url) {
-    throw new Error("RERANKER_URL environment variable is not set");
-  }
-  return url;
-}
-async function rerank(query, documents, topN = 5) {
-  const baseUrl = getRerankerUrl();
-  const response = await fetch(`${baseUrl}/v1/rerank`, {
-    method: "POST",
-    headers: { "Content-Type": "application/json" },
-    body: JSON.stringify({ query, documents, top_n: topN })
-  });
-  if (!response.ok) {
-    throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
+// src/sync.ts
+var EMBED_WINDOW = 1e3;
+var SHRINK_GUARD_MIN_EXISTING = 200;
+var SHRINK_GUARD_MAX_DELETE_RATIO = 0.5;
+var GROWTH_GUARD_MAX_RATIO = 5;
+async function syncChunks(sourceName, allChunks, urlCount, version, allowShrink = false) {
+  console.log("  Comparing with Firestore...");
+  const existing = await getSourceChunkHashes(sourceName);
+  const currentIds = new Set(allChunks.map((c) => c.id));
+  const toDelete = [...existing.keys()].filter((id) => !currentIds.has(id));
+  const toEmbed = allChunks.filter(
+    (chunk) => existing.get(chunk.id) !== contentHash(buildEmbedText(chunk))
+  );
+  console.log(
+    `  Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
+  );
+  if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && toDelete.length > existing.size * SHRINK_GUARD_MAX_DELETE_RATIO) {
+    throw new Error(
+      `Refusing to delete ${toDelete.length} of ${existing.size} stored chunks for "${sourceName}": URL discovery likely found only part of the site. Verify the sitemap/nav/urls.json, then re-run with --allow-shrink if the change is expected, or --full to rebuild from scratch.`
+    );
+  }
+  if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && allChunks.length > existing.size * GROWTH_GUARD_MAX_RATIO) {
+    throw new Error(
+      `Refusing to grow "${sourceName}" from ${existing.size} to ${allChunks.length} chunks (>${GROWTH_GUARD_MAX_RATIO}x): URL discovery may have escaped the doc tree. Verify include_patterns/nav_selector, then re-run with --allow-shrink if the growth is real.`
+    );
+  }
+  for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
+    const window = toEmbed.slice(i, i + EMBED_WINDOW);
+    const embeddings = await embedTexts(window.map((c) => buildEmbedText(c)), {
+      onProgress: (done) => {
+        console.log(`  [${i + done}/${toEmbed.length}] embedded`);
+      }
+    });
+    await storeChunks(window, embeddings, (cur) => {
+      console.log(`  [${i + cur}/${toEmbed.length}] stored`);
+    });
   }
-  const data = await response.json();
-  return data.results;
+  if (toDelete.length > 0) {
+    await deleteChunksByIds(toDelete, (cur, total) => {
+      console.log(`  [${cur}/${total}] deleted`);
+    });
+  }
+  await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
+  console.log(`  Done. ${allChunks.length} chunks live for "${sourceName}".`);
 }
 // src/search.ts
@@ -1390,11 +1435,11 @@ function hasReranker() {
   return !!process.env.RERANKER_URL;
 }
 function contextualText(data) {
-  return buildEmbedText({
-    title: data.title,
-    heading_path: data.heading_path,
-    content: data.content
-  });
+  return rerankDocText(
+    data.title,
+    data.heading_path,
+    data.content
+  );
 }
 function toSearchResult(candidate, relevance) {
   const data = candidate.data;
@@ -1420,8 +1465,7 @@ async function search(query, options = {}) {
     pool.set(result.id, {
       id: result.id,
       data: result.data,
-      fusedScore: 1 / (RRF_K + rank + 1),
-      similarity: 1 - result.distance
+      fusedScore: 1 / (RRF_K + rank + 1)
     });
   });
   lexicalHits.forEach((hit, rank) => {
@@ -1434,22 +1478,21 @@ async function search(query, options = {}) {
     pool.set(hit.id, {
       id: hit.id,
       data: hit.data,
-      fusedScore: lexicalScore,
-      similarity: null
+      fusedScore: lexicalScore
     });
   });
   const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
   if (fused.length === 0) return [];
   if (hasReranker()) {
-    const rerankPool = fused.slice(0, candidates);
+    const rerankPool = fused.slice(0, RERANK_POOL_SIZE);
     const documents = rerankPool.map((c) => contextualText(c.data));
     const reranked = await rerank(query, documents, topN);
     return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
   }
-  return fused.slice(0, topN).map((candidate) => {
-    const similarity = candidate.similarity ?? 0;
-    return toSearchResult(candidate, Math.max(0, (1 + similarity) / 2));
-  });
+  const maxFused = fused[0].fusedScore;
+  return fused.slice(0, topN).map(
+    (candidate) => toSearchResult(candidate, candidate.fusedScore / maxFused)
+  );
 }
 // src/apikey.ts
@@ -1763,36 +1806,15 @@ Source "${name}" added to config/sources.yaml`);
     await browser.close();
   }
 }
-var EMBED_WINDOW = 1e3;
-async function syncChunks(sourceName, allChunks, urlCount, version) {
-  console.log("  Comparing with Firestore...");
-  const existing = await getSourceChunkHashes(sourceName);
-  const currentIds = new Set(allChunks.map((c) => c.id));
-  const toDelete = [...existing.keys()].filter((id) => !currentIds.has(id));
-  const toEmbed = allChunks.filter(
-    (chunk) => existing.get(chunk.id) !== contentHash(buildEmbedText(chunk))
+function applyExcludePatterns(items, getUrl, excludePatterns, label) {
+  if (!excludePatterns || excludePatterns.length === 0) return items;
+  const kept = items.filter(
+    (item) => !excludePatterns.some((pattern) => getUrl(item).includes(pattern))
   );
-  console.log(
-    `  Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
-  );
-  for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
-    const window = toEmbed.slice(i, i + EMBED_WINDOW);
-    const embeddings = await embedTexts(window.map((c) => buildEmbedText(c)), {
-      onProgress: (done) => {
-        console.log(`  [${i + done}/${toEmbed.length}] embedded`);
-      }
-    });
-    await storeChunks(window, embeddings, (cur) => {
-      console.log(`  [${i + cur}/${toEmbed.length}] stored`);
-    });
-  }
-  if (toDelete.length > 0) {
-    await deleteChunksByIds(toDelete, (cur, total) => {
-      console.log(`  [${cur}/${total}] deleted`);
-    });
+  if (kept.length < items.length) {
+    console.log(`  Excluded ${items.length - kept.length} ${label} via exclude_patterns.`);
   }
-  await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
-  console.log(`  Done. ${allChunks.length} chunks live for "${sourceName}".`);
+  return kept;
 }
 async function readCachedMarkdownPages(mdDir) {
   const mdFiles = await readdir(mdDir).catch(() => []);
@@ -1853,14 +1875,15 @@ async function cmdRefresh() {
       limit: { type: "string" },
       "from-html": { type: "boolean", default: false },
       "from-markdown": { type: "boolean", default: false },
-      "skip-store": { type: "boolean", default: false }
+      "skip-store": { type: "boolean", default: false },
+      "allow-shrink": { type: "boolean", default: false }
     },
     allowPositionals: true
   });
   const config = await loadConfig(CONFIG_PATH);
   const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
   if (!args.values.all && !sourcesToRefresh[0]) {
-    console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
+    console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--allow-shrink] [--limit <n>] [--concurrency <n>]");
     process.exit(1);
   }
   const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
@@ -1893,6 +1916,7 @@ Refreshing "${sourceName}"...`);
         console.error("  No cached markdown found. Run with --from-html first.");
         process.exit(1);
       }
+      pages = applyExcludePatterns(pages, (p) => p.url, source.exclude_patterns, "cached pages");
       console.log(`  Found ${pages.length} cached pages.`);
     } else if (source.llms_full_url && !args.values["from-html"]) {
       console.log(`  Fetching llms-full.txt from ${source.llms_full_url}...`);
@@ -1911,6 +1935,7 @@ Refreshing "${sourceName}"...`);
       if (args.values["from-html"]) {
         console.log("  Reading URLs from cached HTML...");
         urls = await recoverUrlsFromHtml(rawDir);
+        urls = applyExcludePatterns(urls, (u) => u, source.exclude_patterns, "cached URLs");
         console.log(`  Found ${urls.length} cached pages.`);
       } else {
         console.log("  Scraping URLs...");
@@ -1944,7 +1969,7 @@ Refreshing "${sourceName}"...`);
       console.log(`  Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
       continue;
     }
-    await syncChunks(sourceName, allChunks, pages.length, source.version);
+    await syncChunks(sourceName, allChunks, pages.length, source.version, args.values["allow-shrink"]);
   }
 }
 async function cmdSearch() {
@@ -2152,4 +2177,4 @@ var ADMIN_COMMANDS = {
 export {
   ADMIN_COMMANDS
 };
-//# sourceMappingURL=admin-ENGUPLQ6.js.map
+//# sourceMappingURL=admin-MA5SI5CH.js.map