npm - @astrofoundry/grimoire - Versions diffs - 3.31.0 → 3.32.0 - Mend

@astrofoundry/grimoire 3.31.0 → 3.32.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/dist/{admin-WUNBDKBC.js → admin-4C3QNVQE.js} +86 -64
package/dist/admin-4C3QNVQE.js.map +7 -0
package/dist/{chunk-R46N6C3C.js → chunk-VTTJCQRQ.js} +37 -2
package/dist/chunk-VTTJCQRQ.js.map +7 -0
package/dist/cli.js +121 -14
package/dist/cli.js.map +2 -2
package/package.json +1 -1
package/dist/admin-WUNBDKBC.js.map +0 -7
package/dist/chunk-R46N6C3C.js.map +0 -7

package/dist/{admin-WUNBDKBC.js → admin-4C3QNVQE.js} RENAMED Viewed

@@ -1,10 +1,13 @@
 import {
+  RERANK_POOL_SIZE,
   __commonJS,
   __toESM,
   bold,
   cyan,
+  rerank,
+  rerankDocText,
   yellow
-} from "./chunk-R46N6C3C.js";
+} from "./chunk-VTTJCQRQ.js";
 // node_modules/.pnpm/@joplin+turndown-plugin-gfm@1.0.67/node_modules/@joplin/turndown-plugin-gfm/lib/turndown-plugin-gfm.cjs.js
 var require_turndown_plugin_gfm_cjs = __commonJS({
@@ -476,9 +479,7 @@ function slugifyUrl(url) {
 // src/scraper.ts
 function filterUrls(urls, includePatterns, excludePatterns) {
-  let filtered = urls.filter(
-    (url) => url.startsWith("http") && !url.includes("?hl=") && !url.endsWith("#")
-  );
+  let filtered = urls.map((url) => url.split("#")[0]).filter((url) => url.startsWith("http") && !url.includes("?hl="));
   if (includePatterns && includePatterns.length > 0) {
     filtered = filtered.filter(
       (url) => includePatterns.some((pattern) => url.includes(pattern))
@@ -688,7 +689,14 @@ function cleanTableRowBreaks(md) {
     (line) => line.startsWith("|") ? line.replace(/\s*<br\s*\/?>\s*/g, "; ") : line
   ).join("\n");
 }
+var SPAN_STRIP_THRESHOLD_BYTES = 2e6;
+function stripInlineSpans(html) {
+  return html.replace(/<\/?span\b[^>]*>/g, "");
+}
 function extractContent(html, contentSelector, removeSelectors, removeTextPatterns) {
+  if (html.length > SPAN_STRIP_THRESHOLD_BYTES) {
+    html = stripInlineSpans(html);
+  }
   const dom = new JSDOM(html);
   const doc = dom.window.document;
   const contentEl = doc.querySelector(contentSelector) ?? doc.body;
@@ -744,7 +752,14 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
       const url = urls[i];
       const slug = slugifyUrl(url);
       const htmlPath = join2(rawDir, `${slug}.html`);
-      const html = await readFile2(htmlPath, "utf-8");
+      let html;
+      try {
+        html = await readFile2(htmlPath, "utf-8");
+      } catch {
+        console.warn(`  WARNING: no cached HTML for ${url}, skipping page. Run 'grimoire scrape-urls' to fetch it.`);
+        completed++;
+        continue;
+      }
       const page = convertPage(html, sourceName, url, contentSelector, removeSelectors, removeTextPatterns);
       await writeFile2(join2(mdDir, `${slug}.md`), page.markdown, "utf-8");
       pages[i] = page;
@@ -757,7 +772,7 @@ async function convertSource(sourceName, urls, contentSelector, removeSelectors,
     () => worker()
   );
   await Promise.all(workers);
-  return pages;
+  return pages.filter((page) => page !== void 0);
 }
 // src/chunker.ts
@@ -777,7 +792,10 @@ function buildChunkId(source, url, headingSlug, index) {
   return index !== void 0 ? `${base}-${index}` : base;
 }
 function buildEmbedText(chunk) {
-  const path = chunk.heading_path[0] === chunk.title ? chunk.heading_path.slice(1) : chunk.heading_path;
+  let path = chunk.heading_path[0] === chunk.title ? chunk.heading_path.slice(1) : chunk.heading_path;
+  if (chunk.content.startsWith("#") && path.length > 0) {
+    path = path.slice(0, -1);
+  }
   const context = [chunk.title, ...path].filter(Boolean).join(" > ");
   return context ? `${context}
@@ -896,11 +914,25 @@ function parseBlocks(lines) {
   flush();
   return blocks;
 }
+function splitLongLine(line, budget) {
+  const maxChars = budget * 4;
+  if (line.length <= maxChars) return [line];
+  const pieces = [];
+  let rest = line;
+  while (rest.length > maxChars) {
+    let cut = rest.lastIndexOf(" ", maxChars);
+    if (cut < maxChars / 2) cut = maxChars;
+    pieces.push(rest.slice(0, cut));
+    rest = rest.slice(cut).trimStart();
+  }
+  if (rest) pieces.push(rest);
+  return pieces;
+}
 function groupLines(lines, budget) {
   const groups = [];
   let current = [];
   let tokens = 0;
-  for (const line of lines) {
+  for (const line of lines.flatMap((l) => splitLongLine(l, budget))) {
     const lineTokens = estimateTokens(line) + 1;
     if (tokens + lineTokens > budget && current.length > 0) {
       groups.push(current);
@@ -1295,7 +1327,8 @@ async function getSourceChunkHashes(sourceName) {
     snapshot.docs.map((doc) => [doc.id, doc.data().content_hash ?? ""])
   );
 }
-var TOKEN_QUERY_LIMIT = 100;
+var TOKEN_QUERY_LIMIT = 40;
+var TOKEN_RESULT_FIELDS = ["source", "url", "title", "heading_path", "content", "token_count"];
 async function tokenSearch(tokens, source) {
   if (tokens.length === 0) return [];
   const col = chunksCol();
@@ -1306,7 +1339,7 @@ async function tokenSearch(tokens, source) {
       if (source) {
         query = query.where("source", "==", source);
       }
-      return query.limit(TOKEN_QUERY_LIMIT).get();
+      return query.select(...TOKEN_RESULT_FIELDS).limit(TOKEN_QUERY_LIMIT).get();
     })
   );
   for (const snapshot of snapshots) {
@@ -1338,58 +1371,23 @@ async function vectorSearch(queryEmbedding, limit, source) {
     const data = doc.data();
     const distance = data._distance ?? 0;
     delete data._distance;
+    delete data.embedding;
     return { id: doc.id, data, distance };
   });
 }
-// src/reranker.ts
-function getRerankerUrl() {
-  const url = process.env.RERANKER_URL;
-  if (!url) {
-    throw new Error("RERANKER_URL environment variable is not set");
-  }
-  return url;
-}
-async function rerank(query, documents, topN = 5) {
-  const baseUrl = getRerankerUrl();
-  const response = await fetch(`${baseUrl}/v1/rerank`, {
-    method: "POST",
-    headers: { "Content-Type": "application/json" },
-    body: JSON.stringify({ query, documents, top_n: topN })
-  });
-  if (!response.ok) {
-    throw new Error(`Reranker request failed: ${response.status} ${response.statusText}`);
-  }
-  const data = await response.json();
-  return data.results;
-}
 // src/search.ts
 var DEFAULT_CANDIDATES = 50;
 var RRF_K = 60;
 function hasReranker() {
   return !!process.env.RERANKER_URL;
 }
-function isVectorLike(value) {
-  return typeof value === "object" && value !== null && typeof value.toArray === "function";
-}
-function cosineSimilarity(a, b) {
-  let dot = 0;
-  let normA = 0;
-  let normB = 0;
-  for (let i = 0; i < a.length; i++) {
-    dot += a[i] * b[i];
-    normA += a[i] * a[i];
-    normB += b[i] * b[i];
-  }
-  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
-}
 function contextualText(data) {
-  return buildEmbedText({
-    title: data.title,
-    heading_path: data.heading_path,
-    content: data.content
-  });
+  return rerankDocText(
+    data.title,
+    data.heading_path,
+    data.content
+  );
 }
 function toSearchResult(candidate, relevance) {
   const data = candidate.data;
@@ -1426,19 +1424,17 @@ async function search(query, options = {}) {
       existing.fusedScore += lexicalScore;
       return;
     }
-    const embedding = hit.data.embedding;
-    delete hit.data.embedding;
     pool.set(hit.id, {
       id: hit.id,
       data: hit.data,
       fusedScore: lexicalScore,
-      similarity: isVectorLike(embedding) ? cosineSimilarity(queryEmbedding, embedding.toArray()) : null
+      similarity: null
     });
   });
   const fused = [...pool.values()].sort((a, b) => b.fusedScore - a.fusedScore);
   if (fused.length === 0) return [];
   if (hasReranker()) {
-    const rerankPool = fused.slice(0, candidates);
+    const rerankPool = fused.slice(0, RERANK_POOL_SIZE);
     const documents = rerankPool.map((c) => contextualText(c.data));
     const reranked = await rerank(query, documents, topN);
     return reranked.map((r) => toSearchResult(rerankPool[r.index], r.relevance_score));
@@ -1761,7 +1757,10 @@ Source "${name}" added to config/sources.yaml`);
   }
 }
 var EMBED_WINDOW = 1e3;
-async function syncChunks(sourceName, allChunks, urlCount, version) {
+var SHRINK_GUARD_MIN_EXISTING = 200;
+var SHRINK_GUARD_MAX_DELETE_RATIO = 0.5;
+var GROWTH_GUARD_MAX_RATIO = 5;
+async function syncChunks(sourceName, allChunks, urlCount, version, allowShrink = false) {
   console.log("  Comparing with Firestore...");
   const existing = await getSourceChunkHashes(sourceName);
   const currentIds = new Set(allChunks.map((c) => c.id));
@@ -1772,10 +1771,15 @@ async function syncChunks(sourceName, allChunks, urlCount, version) {
   console.log(
     `  Sync: ${toEmbed.length} to embed, ${allChunks.length - toEmbed.length} unchanged, ${toDelete.length} to delete.`
   );
-  if (toDelete.length > 0) {
-    await deleteChunksByIds(toDelete, (cur, total) => {
-      console.log(`  [${cur}/${total}] deleted`);
-    });
+  if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && toDelete.length > existing.size * SHRINK_GUARD_MAX_DELETE_RATIO) {
+    throw new Error(
+      `Refusing to delete ${toDelete.length} of ${existing.size} stored chunks for "${sourceName}": URL discovery likely found only part of the site. Verify the sitemap/nav/urls.json, then re-run with --allow-shrink if the change is expected, or --full to rebuild from scratch.`
+    );
+  }
+  if (!allowShrink && existing.size >= SHRINK_GUARD_MIN_EXISTING && allChunks.length > existing.size * GROWTH_GUARD_MAX_RATIO) {
+    throw new Error(
+      `Refusing to grow "${sourceName}" from ${existing.size} to ${allChunks.length} chunks (>${GROWTH_GUARD_MAX_RATIO}x): URL discovery may have escaped the doc tree. Verify include_patterns/nav_selector, then re-run with --allow-shrink if the growth is real.`
+    );
   }
   for (let i = 0; i < toEmbed.length; i += EMBED_WINDOW) {
     const window = toEmbed.slice(i, i + EMBED_WINDOW);
@@ -1788,9 +1792,24 @@ async function syncChunks(sourceName, allChunks, urlCount, version) {
       console.log(`  [${i + cur}/${toEmbed.length}] stored`);
     });
   }
+  if (toDelete.length > 0) {
+    await deleteChunksByIds(toDelete, (cur, total) => {
+      console.log(`  [${cur}/${total}] deleted`);
+    });
+  }
   await updateSourceMeta(sourceName, allChunks.length, urlCount, version);
   console.log(`  Done. ${allChunks.length} chunks live for "${sourceName}".`);
 }
+function applyExcludePatterns(items, getUrl, excludePatterns, label) {
+  if (!excludePatterns || excludePatterns.length === 0) return items;
+  const kept = items.filter(
+    (item) => !excludePatterns.some((pattern) => getUrl(item).includes(pattern))
+  );
+  if (kept.length < items.length) {
+    console.log(`  Excluded ${items.length - kept.length} ${label} via exclude_patterns.`);
+  }
+  return kept;
+}
 async function readCachedMarkdownPages(mdDir) {
   const mdFiles = await readdir(mdDir).catch(() => []);
   const pages = [];
@@ -1850,14 +1869,15 @@ async function cmdRefresh() {
       limit: { type: "string" },
       "from-html": { type: "boolean", default: false },
       "from-markdown": { type: "boolean", default: false },
-      "skip-store": { type: "boolean", default: false }
+      "skip-store": { type: "boolean", default: false },
+      "allow-shrink": { type: "boolean", default: false }
     },
     allowPositionals: true
   });
   const config = await loadConfig(CONFIG_PATH);
   const sourcesToRefresh = args.values.all ? Object.keys(config.sources) : [args.positionals[0]];
   if (!args.values.all && !sourcesToRefresh[0]) {
-    console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--limit <n>] [--concurrency <n>]");
+    console.error("Usage: grimoire refresh <source> [--full] [--from-html] [--from-markdown] [--skip-store] [--allow-shrink] [--limit <n>] [--concurrency <n>]");
     process.exit(1);
   }
   const concurrencyOverride = args.values.concurrency ? parseInt(args.values.concurrency, 10) : void 0;
@@ -1890,6 +1910,7 @@ Refreshing "${sourceName}"...`);
         console.error("  No cached markdown found. Run with --from-html first.");
         process.exit(1);
       }
+      pages = applyExcludePatterns(pages, (p) => p.url, source.exclude_patterns, "cached pages");
       console.log(`  Found ${pages.length} cached pages.`);
     } else if (source.llms_full_url && !args.values["from-html"]) {
       console.log(`  Fetching llms-full.txt from ${source.llms_full_url}...`);
@@ -1908,6 +1929,7 @@ Refreshing "${sourceName}"...`);
       if (args.values["from-html"]) {
         console.log("  Reading URLs from cached HTML...");
         urls = await recoverUrlsFromHtml(rawDir);
+        urls = applyExcludePatterns(urls, (u) => u, source.exclude_patterns, "cached URLs");
         console.log(`  Found ${urls.length} cached pages.`);
       } else {
         console.log("  Scraping URLs...");
@@ -1941,7 +1963,7 @@ Refreshing "${sourceName}"...`);
       console.log(`  Done. ${allChunks.length} chunks ready (dry run, no embed/store).`);
       continue;
     }
-    await syncChunks(sourceName, allChunks, pages.length, source.version);
+    await syncChunks(sourceName, allChunks, pages.length, source.version, args.values["allow-shrink"]);
   }
 }
 async function cmdSearch() {
@@ -2149,4 +2171,4 @@ var ADMIN_COMMANDS = {
 export {
   ADMIN_COMMANDS
 };
-//# sourceMappingURL=admin-WUNBDKBC.js.map
+//# sourceMappingURL=admin-4C3QNVQE.js.map