npm - @tryformation/querylight-cli - Versions diffs - 0.1.1 → 0.2.0 - Mend

@tryformation/querylight-cli 0.1.1 → 0.2.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (21)

package/README.md +62 -9
package/dist/chunk/chunker.d.ts +3 -1
package/dist/cli/main.js +1031 -237
package/dist/cli/run-cli.d.ts +4 -1
package/dist/core/concurrency.d.ts +1 -0
package/dist/core/constants.d.ts +3 -1
package/dist/core/progress.d.ts +4 -0
package/dist/core/urls.d.ts +1 -0
package/dist/index/querylight-indexer.d.ts +3 -1
package/dist/index.js +441 -114
package/dist/ingest/adapters/website-adapter.d.ts +6 -1
package/dist/ingest/adapters/website-feed-discovery.d.ts +6 -0
package/dist/ingest/extractors/html-extractor.d.ts +1 -0
package/dist/ingest/ingest-service.d.ts +5 -2
package/dist/types/models.d.ts +2 -2
package/dist/vector/dense.d.ts +3 -1
package/dist/vector/runtime.d.ts +2 -0
package/dist/vector/service.d.ts +20 -2
package/dist/vector/sparse.d.ts +3 -1
package/dist/vector/store.d.ts +8 -2
package/package.json +1 -1

package/dist/index.js CHANGED

@@ -20,6 +20,15 @@ var CliError = class extends Error {
 import { readFile, writeFile } from "fs/promises";
 import path from "path";
 import YAML from "yaml";
+// src/core/constants.ts
+var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
+var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
+// src/core/config.ts
+function normalizeModelCacheDir(configuredPath) {
+  return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
+}
 var defaultConfig = () => ({
   workspaceVersion: 1,
   index: {
@@ -47,17 +56,17 @@ var defaultConfig = () => ({
   retrieval: {
     defaultMode: "lexical",
     dense: {
-      enabled: false,
+      enabled: true,
       modelId: "Xenova/all-MiniLM-L6-v2",
-      cacheDir: ".kb/models/huggingface",
+      cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
       indexHashTables: 8,
       indexRandomSeed: 42,
       chunkTextMode: "title-heading-text"
     },
     sparse: {
-      enabled: false,
+      enabled: true,
       modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
-      cacheDir: ".kb/models/huggingface",
+      cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
       documentTopTokens: 128,
       queryEncoding: "tokenizer-token-weights",
       documentEncoding: "masked-lm-max-log1p-relu",
@@ -68,6 +77,7 @@ var defaultConfig = () => ({
     defaultUserAgent: "querylight-cli/0.1",
     obeyRobotsTxt: true,
     rateLimitMs: 1e3,
+    maxConcurrentRequests: 5,
     renderJs: false,
     retentionDays: 365,
     fetchArticles: true
@@ -118,11 +128,13 @@ async function loadConfig(workspacePath, configPath) {
       ...parsed.retrieval ?? {},
       dense: {
         ...defaults.retrieval.dense,
-        ...parsed.retrieval?.dense ?? {}
+        ...parsed.retrieval?.dense ?? {},
+        cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
       },
       sparse: {
         ...defaults.retrieval.sparse,
-        ...parsed.retrieval?.sparse ?? {}
+        ...parsed.retrieval?.sparse ?? {},
+        cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
       }
     },
     crawler: {
@@ -145,8 +157,6 @@ var DIRS = [
   "normalized",
   "indexes",
   "vectors",
-  "models",
-  "models/huggingface",
   "runs",
   "logs"
 ];
@@ -275,6 +285,27 @@ async function saveChunks(workspacePath, chunks) {
   await writeJsonl(chunksFile(workspacePath), chunks.sort((a, b) => a.id.localeCompare(b.id)));
 }
+// src/core/concurrency.ts
+async function mapWithConcurrency(items, limit, worker) {
+  if (items.length === 0) {
+    return;
+  }
+  const concurrency = Math.max(1, Math.floor(limit));
+  let nextIndex = 0;
+  await Promise.all(
+    Array.from({ length: Math.min(concurrency, items.length) }, async () => {
+      while (true) {
+        const index = nextIndex;
+        nextIndex += 1;
+        if (index >= items.length) {
+          return;
+        }
+        await worker(items[index], index);
+      }
+    })
+  );
+}
 // src/core/files.ts
 import { stat as stat2 } from "fs/promises";
 async function fileExists(filePath) {
@@ -286,6 +317,14 @@ async function fileExists(filePath) {
   }
 }
+// src/core/progress.ts
+function reportProgress(progress, message) {
+  progress?.("info", message);
+}
+function reportProgressDetail(progress, message) {
+  progress?.("detail", message);
+}
 // src/core/runs.ts
 import path6 from "path";
 async function writeRun(workspacePath, run) {
@@ -428,9 +467,41 @@ function stripBoilerplate(html) {
 // src/ingest/extractors/html-extractor.ts
 var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
+var LOW_SIGNAL_SECTION_SELECTORS = [
+  "script",
+  "style",
+  "noscript",
+  "template",
+  "[data-blog-service-recommendations]",
+  "[data-blog-related-posts]"
+].join(", ");
 function cleanText(value) {
   return value.replace(/\s+/g, " ").trim();
 }
+function pruneLowSignalContent($) {
+  $(LOW_SIGNAL_SECTION_SELECTORS).remove();
+  $("form").each((_, element) => {
+    const action = cleanText($(element).attr("action") ?? "");
+    if (action.includes("substack.com/subscribe")) {
+      $(element).closest("section").remove();
+    }
+  });
+}
+function stripEscapedJsonPayloads(markdown) {
+  return markdown.split("\n").filter((line) => {
+    const trimmed = line.trim();
+    if (trimmed.length === 0) {
+      return true;
+    }
+    if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
+      return false;
+    }
+    if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
+      return false;
+    }
+    return true;
+  }).join("\n").replace(/\n{3,}/g, "\n\n").trim();
+}
 function chooseMeaningfulTitle($, fallbackTitle) {
   const candidates = [
     cleanText($("meta[property='og:title']").attr("content") ?? ""),
@@ -467,14 +538,27 @@ ${parts.join("\n\n")}
 function extractHtmlToMarkdown(html) {
   const cleaned = stripBoilerplate(html);
   const $ = load(cleaned);
+  pruneLowSignalContent($);
   const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
   const title = chooseMeaningfulTitle($, fallbackTitle);
   const root = $("main").first().html() ?? $.root().html() ?? cleaned;
   return {
-    markdown: turndown.turndown(root),
+    markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
     title
   };
 }
+function extractCanonicalUriFromHtml(html, baseUrl) {
+  const $ = load(html);
+  const href = $("link[rel='canonical']").first().attr("href")?.trim();
+  if (!href) {
+    return null;
+  }
+  try {
+    return new URL(href, baseUrl).href;
+  } catch {
+    return null;
+  }
+}
 function parseDateCandidate(value) {
   const trimmed = value.trim();
   if (!trimmed) {
@@ -879,6 +963,19 @@ async function parseRssFeedDocument(xml, source) {
 // src/ingest/adapters/url-adapter.ts
 import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
 import path9 from "path";
+// src/core/urls.ts
+function normalizeRemoteUrl(uri) {
+  try {
+    const parsed = new URL(uri);
+    parsed.hash = "";
+    return parsed.href;
+  } catch {
+    return uri;
+  }
+}
+// src/ingest/adapters/url-adapter.ts
 function buildHttpCache(response, validatedAt) {
   return {
     etag: response.headers.get("etag") ?? void 0,
@@ -903,12 +1000,13 @@ async function normalizeRemoteDocument({
   responseStatus
 }) {
   const extracted = extractHtmlToMarkdown(body);
+  const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
   const markdown = `# ${extracted.title}
 ${extracted.markdown}`;
-  const documentId = stableId("doc", source.id, url);
+  const documentId = stableId("doc", source.id, canonicalUri);
   const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
-  const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(url).slice(0, 12)}.html`);
+  const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
   const contentHash = sha256(markdown);
   const now = (/* @__PURE__ */ new Date()).toISOString();
   const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
@@ -921,7 +1019,7 @@ ${extracted.markdown}`;
     documentId,
     sourceId: source.id,
     title: extracted.title,
-    uri: url,
+    uri: canonicalUri,
     sourceUri,
     publicationDate: resolvedPublicationDate,
     crawledAt,
@@ -936,8 +1034,9 @@ ${extracted.markdown}`;
     sourceId: source.id,
     sourceType: source.type,
     title: extracted.title,
-    uri: url,
+    uri: canonicalUri,
     sourceUri,
+    canonicalUri,
     mimeType: "text/html",
     rawPath,
     normalizedPath,
@@ -1111,6 +1210,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
   if (url.origin !== baseUrl.origin) {
     return false;
   }
+  if (url.search.length > 0) {
+    return false;
+  }
+  if (url.pathname.endsWith(".xml")) {
+    return false;
+  }
+  if (url.pathname.includes("/cdn-cgi/")) {
+    return false;
+  }
+  if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
+    return false;
+  }
   if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
     return false;
   }
@@ -1123,56 +1234,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
   }
   return true;
 }
-async function crawlWebsite(source) {
+function delay(ms) {
+  return new Promise((resolve2) => setTimeout(resolve2, ms));
+}
+async function crawlWebsite(source, defaults, progress) {
   const baseUrl = new URL(source.uri);
-  const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
+  const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
   const includePatterns = source.crawl?.includePatterns ?? [];
   const excludePatterns = source.crawl?.excludePatterns ?? [];
   const maxDepth = source.crawl?.maxDepth ?? 2;
   const maxPages = source.crawl?.maxPages ?? 100;
-  const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
+  const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
+  const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
   const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
-  const queue = [{ url: source.uri, depth: 0 }];
   const seen = /* @__PURE__ */ new Set();
   const results = [];
+  let currentLevel = [normalizeRemoteUrl(source.uri)];
   if (source.crawl?.useSitemap !== false) {
-    for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
-      queue.push({ url, depth: 1 });
-    }
-  }
-  while (queue.length > 0 && results.length < maxPages) {
-    const next = queue.shift();
-    if (!next || seen.has(next.url)) {
-      continue;
-    }
-    seen.add(next.url);
-    const url = new URL(next.url);
-    if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
-      continue;
+    const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
+    reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
+    currentLevel = [
+      ...currentLevel,
+      ...sitemapUrls
+    ];
+  }
+  for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
+    reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
+    const nextLevelCandidates = [];
+    const allowedUrls = [];
+    for (const candidate of currentLevel) {
+      const normalizedCandidate = normalizeRemoteUrl(candidate);
+      if (seen.has(normalizedCandidate)) {
+        continue;
+      }
+      seen.add(normalizedCandidate);
+      const url = new URL(normalizedCandidate);
+      if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
+        continue;
+      }
+      allowedUrls.push(normalizedCandidate);
+      results.push(normalizedCandidate);
+      reportProgress(progress, `Discovered ${normalizedCandidate}`);
+      if (results.length >= maxPages) {
+        break;
+      }
     }
-    results.push(url.href);
-    if (next.depth >= maxDepth) {
-      continue;
+    reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
+    if (depth >= maxDepth || results.length >= maxPages) {
+      break;
     }
-    const response = await fetch(url, { headers: { "user-agent": userAgent } });
-    const html = await response.text();
-    const $ = load2(html);
-    $("a[href]").each((_, element) => {
-      const href = $(element).attr("href");
-      if (!href) {
-        return;
-      }
-      try {
-        const target = new URL(href, url);
-        if (!seen.has(target.href)) {
-          queue.push({ url: target.href, depth: next.depth + 1 });
+    await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
+      const page = new URL(pageUrl);
+      const response = await fetch(page, { headers: { "user-agent": userAgent } });
+      const html = await response.text();
+      const $ = load2(html);
+      $("a[href]").each((_, element) => {
+        const href = $(element).attr("href");
+        if (!href) {
+          return;
         }
-      } catch {
+        try {
+          nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
+        } catch {
+        }
+      });
+      if (rateLimitMs > 0) {
+        await delay(rateLimitMs);
       }
     });
-    if (rateLimitMs > 0) {
-      await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
-    }
+    currentLevel = nextLevelCandidates;
   }
   return results;
 }
@@ -1247,6 +1377,8 @@ async function ingestRssSource({
   source,
   previous,
   nextDocuments,
+  maxConcurrentRequests,
+  onDocumentProcessed,
   onFailure
 }) {
   if (source.crawl?.fetchArticles === false) {
@@ -1254,11 +1386,12 @@ async function ingestRssSource({
   }
   const xml = await fetchFeedText(source);
   const items = await parseRssFeedDocument(xml, source);
+  const processedDocumentIds = /* @__PURE__ */ new Set();
   let added = 0;
   let changed = 0;
   let unchanged = 0;
   let failed = 0;
-  for (const item of items) {
+  await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
     try {
       const probe = previous.get(stableId("doc", source.id, item.url));
       const document = await fetchUrlDocument({
@@ -1269,28 +1402,40 @@ async function ingestRssSource({
         sourceUri: source.uri,
         publicationDate: item.publicationDate
       });
+      if (processedDocumentIds.has(document.id)) {
+        return;
+      }
+      processedDocumentIds.add(document.id);
+      const existingDocument = probe ?? previous.get(document.id);
       nextDocuments.set(document.id, document);
-      if (!probe) {
+      if (!existingDocument) {
         added += 1;
-      } else if (probe.contentHash !== document.contentHash) {
+        onDocumentProcessed?.(document.uri, "added");
+      } else if (existingDocument.contentHash !== document.contentHash) {
         changed += 1;
+        onDocumentProcessed?.(document.uri, "changed");
       } else {
         unchanged += 1;
+        onDocumentProcessed?.(document.uri, "unchanged");
       }
     } catch (error) {
       failed += 1;
       onFailure(item.url, error);
     }
-  }
+  });
   return { added, changed, unchanged, failed };
 }
 async function ingestSources({
   workspacePath,
   sourceIds,
-  changedOnly = false
+  changedOnly = false,
+  progress
 }) {
   const config = await loadConfig(workspacePath);
   const defaultRetentionDays = config.crawler.retentionDays;
+  const defaultUserAgent = config.crawler.defaultUserAgent;
+  const defaultRateLimitMs = config.crawler.rateLimitMs;
+  const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
   const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
   const existing = await loadDocuments(workspacePath);
   const previous = previousMap(existing);
@@ -1300,20 +1445,38 @@ async function ingestSources({
   let unchanged = 0;
   let failed = 0;
   const failures = [];
+  reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
   for (const source of sources) {
+    const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
+    const sourceBefore = { added, changed, unchanged, failed };
+    const processedDocumentIds = /* @__PURE__ */ new Set();
+    const reportDocumentOutcome = (uri, outcome) => {
+      const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
+      reportProgress(progress, `${label} ${uri}`);
+    };
     const ingestOne = async (uri, producer) => {
       try {
         const probeId = stableId("doc", source.id, uri);
         const earlier = previous.get(probeId);
         const document = await producer();
+        if (processedDocumentIds.has(document.id)) {
+          reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
+          return null;
+        }
+        processedDocumentIds.add(document.id);
+        const existingDocument = earlier ?? previous.get(document.id);
         nextDocuments.set(document.id, document);
-        if (!earlier) {
+        if (!existingDocument) {
           added += 1;
-        } else if (earlier.contentHash !== document.contentHash) {
+          reportDocumentOutcome(document.uri, "added");
+        } else if (existingDocument.contentHash !== document.contentHash) {
           changed += 1;
+          reportDocumentOutcome(document.uri, "changed");
         } else {
           unchanged += 1;
+          reportDocumentOutcome(document.uri, "unchanged");
         }
+        return document;
       } catch (error) {
         failed += 1;
         failures.push({
@@ -1321,50 +1484,69 @@ async function ingestSources({
           uri,
           message: error instanceof Error ? error.message : String(error)
         });
+        reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
+        return null;
       }
     };
     try {
+      reportProgress(progress, `Source ${source.name} (${source.type})`);
       if (source.type === "file") {
+        reportProgress(progress, `Reading file ${source.uri}`);
         await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
-        continue;
-      }
-      if (source.type === "directory") {
-        for (const filePath of await listDirectoryFiles(source)) {
+      } else if (source.type === "directory") {
+        const files = await listDirectoryFiles(source);
+        reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
+        for (const filePath of files) {
+          reportProgress(progress, `Reading file ${filePath}`);
           await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
         }
-        continue;
-      }
-      if (source.type === "url") {
+      } else if (source.type === "url") {
+        reportProgress(progress, `Fetching ${source.uri}`);
         await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
-        continue;
-      }
-      if (source.type === "website") {
-        for (const url of await crawlWebsite(source)) {
-          await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
-        }
-        continue;
-      }
-      if (source.type === "rss") {
+      } else if (source.type === "website") {
+        reportProgress(progress, `Crawling ${source.uri}`);
+        const urls = await crawlWebsite(source, {
+          userAgent: defaultUserAgent,
+          rateLimitMs: defaultRateLimitMs,
+          maxConcurrentRequests
+        }, progress);
+        reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
+        const seenCanonicalUrls = /* @__PURE__ */ new Set();
+        await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
+          if (seenCanonicalUrls.has(url)) {
+            reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
+            return;
+          }
+          reportProgress(progress, `Fetching ${url}`);
+          const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
+          if (document) {
+            seenCanonicalUrls.add(document.uri);
+          }
+        });
+      } else if (source.type === "rss") {
+        reportProgress(progress, `Fetching feed ${source.uri}`);
         const result = await ingestRssSource({
           workspacePath,
           source,
           previous,
           nextDocuments,
+          maxConcurrentRequests,
+          onDocumentProcessed: reportDocumentOutcome,
           onFailure: (uri, error) => {
             failures.push({
               sourceId: source.id,
               uri,
               message: error instanceof Error ? error.message : String(error)
             });
+            reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
           }
         });
         added += result.added;
         changed += result.changed;
         unchanged += result.unchanged;
         failed += result.failed;
-        continue;
-      }
-      if (source.type === "markdown" || source.type === "text") {
+      } else if (source.type === "markdown" || source.type === "text") {
+        reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
         await ingestOne(source.uri, () => ingestInlineContent({
           workspacePath,
           source,
@@ -1381,13 +1563,19 @@ async function ingestSources({
         uri: source.uri,
         message: error instanceof Error ? error.message : String(error)
       });
+      reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
     }
+    reportProgress(
+      progress,
+      `Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
+    );
   }
   const expiringDocuments = [...nextDocuments.values()].filter((document) => {
     const source = sources.find((candidate) => candidate.id === document.sourceId);
     return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
   });
   if (expiringDocuments.length > 0) {
+    reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
     const expiredIds = new Set(expiringDocuments.map((document) => document.id));
     for (const document of expiringDocuments) {
       nextDocuments.delete(document.id);
@@ -1414,6 +1602,7 @@ async function ingestSources({
     documentsSnapshot: documentSnapshot(finalDocuments)
   };
   await writeRun(workspacePath, run);
+  reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
   return {
     runId: id,
     documents: { added, changed, unchanged, failed },
@@ -1423,7 +1612,8 @@ async function ingestSources({
 async function reprocessDocuments({
   workspacePath,
   sourceId,
-  documentId
+  documentId,
+  progress
 }) {
   const documents = await loadDocuments(workspacePath);
   const sources = await listSources(workspacePath);
@@ -1431,15 +1621,20 @@ async function reprocessDocuments({
   const nextDocuments = new Map(documents.map((document) => [document.id, document]));
   let documentsReprocessed = 0;
   let documentsSkipped = 0;
-  for (const document of documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId))) {
+  const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
+  reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
+  for (const document of targets) {
+    reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
     const source = sourceMap.get(document.sourceId);
     if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
       documentsSkipped += 1;
+      reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
       continue;
     }
     const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
     if (!updated) {
       documentsSkipped += 1;
+      reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
       continue;
     }
     nextDocuments.set(updated.id, updated);
@@ -1459,6 +1654,7 @@ async function reprocessDocuments({
     },
     documentsSnapshot: documentSnapshot(finalDocuments)
   });
+  reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
   return { runId: id, documentsReprocessed, documentsSkipped };
 }
@@ -1560,11 +1756,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
 async function chunkDocuments({
   workspacePath,
   sourceId,
-  documentId
+  documentId,
+  progress
 }) {
   const config = await loadConfig(workspacePath);
   const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
   const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
+  reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
   const targetedDocumentIds = new Set(filtered.map((document) => document.id));
   const existingChunks = await loadChunks(workspacePath);
   const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
@@ -1572,12 +1770,14 @@ async function chunkDocuments({
     existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
   );
   for (const document of filtered) {
+    reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
     const raw = await readFile8(document.normalizedPath, "utf8");
     for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
       nextChunks.set(chunk.id, chunk);
     }
   }
   await saveChunks(workspacePath, [...nextChunks.values()]);
+  reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
   return { chunksWritten: nextChunks.size };
 }
@@ -1586,15 +1786,31 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
 import path17 from "path";
 // src/vector/dense.ts
-import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
+import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
 import { mkdir as mkdir7 } from "fs/promises";
 import path14 from "path";
 // src/vector/runtime.ts
+import os from "os";
 import path12 from "path";
 import { fileURLToPath } from "url";
 import { execFile, execFileSync } from "child_process";
+function resolveQliHomeDir() {
+  return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
+}
 function resolveCacheDir(workspacePath, configuredPath) {
+  if (configuredPath === "~/.qli") {
+    return resolveQliHomeDir();
+  }
+  if (configuredPath.startsWith("~/.qli/")) {
+    return path12.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
+  }
+  if (configuredPath === "~") {
+    return os.homedir();
+  }
+  if (configuredPath.startsWith("~/")) {
+    return path12.join(os.homedir(), configuredPath.slice(2));
+  }
   return path12.isAbsolute(configuredPath) ? configuredPath : path12.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
 }
 function packageRootFromImportMeta(importMetaUrl) {
@@ -1618,6 +1834,14 @@ async function ensureUvAvailable() {
     execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
   });
 }
+async function isUvAvailable() {
+  try {
+    await ensureUvAvailable();
+    return true;
+  } catch {
+    return false;
+  }
+}
 async function runSparsePython({
   workspacePath,
   config,
@@ -1666,8 +1890,8 @@ import path13 from "path";
 function vectorsDir(workspacePath) {
   return path13.join(workspacePath, "vectors");
 }
-function modelsDir(workspacePath) {
-  return path13.join(workspacePath, "models");
+function sharedModelStateDir() {
+  return path13.join(resolveQliHomeDir(), "models", "status");
 }
 function denseVectorPath(workspacePath) {
   return path13.join(vectorsDir(workspacePath), "dense.latest.json");
@@ -1681,11 +1905,16 @@ function sparseVectorPath(workspacePath) {
 function sparseMetaPath(workspacePath) {
   return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
 }
-function densePullMarker(workspacePath) {
-  return path13.join(modelsDir(workspacePath), "dense.pulled.json");
+function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
+  const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
+  const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
+  return path13.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
 }
-function sparsePullMarker(workspacePath) {
-  return path13.join(modelsDir(workspacePath), "sparse.pulled.json");
+function densePullMarker(workspacePath, modelId, cacheDir) {
+  return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
+}
+function sparsePullMarker(workspacePath, modelId, cacheDir) {
+  return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
 }
 async function writeDensePayload(workspacePath, payload) {
   await mkdir6(vectorsDir(workspacePath), { recursive: true });
@@ -1711,7 +1940,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
       configured: dense.enabled,
       modelId: dense.modelId,
       cacheDir: denseCacheDir,
-      available: await fileExists(densePullMarker(workspacePath)),
+      available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
       artifactExists: await fileExists(denseVectorPath(workspacePath))
     },
     sparse: {
@@ -1719,22 +1948,64 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
       modelId: sparse.modelId,
       cacheDir: sparseCacheDir,
       uvAvailable,
-      available: await fileExists(sparsePullMarker(workspacePath)),
+      available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
       artifactExists: await fileExists(sparseVectorPath(workspacePath))
     }
   };
 }
 // src/vector/text.ts
+var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
   // Headings that isLowSignalHeading() reports as low-signal. Lookups go
   // through normalizeHeading(), so every entry is written trimmed and
   // lower-case. The list mixes English and German phrases and carries both
   // the "uberblick" and "\xFCberblick" spellings.
+  "choose this instead of",
+  "how xyz runs it",
+  "naechste schritte",
+  "next steps",
+  "overview",
+  "passend wenn",
+  "problem",
+  "right fit",
+  "waehlen sie das stattdessen",
+  "was sie bekommen",
+  "what you get",
+  "wie xyz es umsetzt",
+  "uberblick",
+  "\xFCberblick"
+]);
+function normalizeHeading(value) {
   // Canonical form used for heading comparisons: trimmed and lower-cased.
+  return value.trim().toLowerCase();
+}
+function isLowSignalHeading(value) {
   // True when the normalized heading is one of the LOW_SIGNAL_HEADINGS entries.
+  return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
+}
+function stripLeadingHeading(text, heading) {
   // If the first non-blank line of `text` is a "#".."######" heading whose
   // text equals `heading` (compared via normalizeHeading), returns `text`
   // with that line removed and the result trimmed. In every other case the
   // input is returned unchanged (and untrimmed).
+  const lines = text.split("\n");
+  const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
+  if (firstContentIndex < 0) {
+    return text;
+  }
   // NOTE(review): lines are split on "\n" only. With CRLF input the line
   // keeps a trailing "\r", which "." cannot match, so this pattern would
   // never match. Confirm that chunk text is LF-normalized upstream.
+  const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
+  if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
+    return text;
+  }
   // Drop only the heading line; blank lines before it are kept until trim().
+  const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
+  return next;
+}
+function createVectorText(chunk) {
   // Builds the text for a chunk from chunk.title, chunk.headingPath and
   // chunk.text, joined by blank lines with empty parts dropped.
   // meaningfulHeadings: heading path entries that are neither low-signal nor
   // equal to the title (after normalizeHeading).
+  const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
   // textHeading: the last heading path entry that WAS filtered out above.
   // The reversed copy makes find() search from the end of the path.
+  const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
   // If such a heading exists, remove it from the top of the chunk text when
   // it appears there; otherwise use the trimmed chunk text as-is.
+  const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
+  return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
+}
 // Text passed to the dense embedder for a chunk. The inline
 // title + headingPath + text join is replaced by createVectorText(chunk).
 function createDenseChunkText(chunk) {
-  return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
+  return createVectorText(chunk);
 }
 // Sparse counterpart of createDenseChunkText. It also delegates to
 // createVectorText(chunk), so both helpers return the same text.
 function createSparseChunkText(chunk) {
-  return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
+  return createVectorText(chunk);
 }
 // src/vector/dense.ts
 var denseEmbedderFactory = null;
 // Chunk-count cutoff read by denseQuery: when payload.chunks.length is at
 // or below this value, it returns exactDenseQuery() results instead of
 // querying the vector index.
+var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
 async function createEmbedder(cacheDir, modelId) {
   if (denseEmbedderFactory) {
     return denseEmbedderFactory(cacheDir, modelId);
@@ -1746,9 +2017,13 @@ async function createEmbedder(cacheDir, modelId) {
     return output.tolist()[0];
   };
 }
+function exactDenseQuery(payload, vector, topK) {
   // Brute-force ranking: scores every entry of payload.chunks against
   // `vector` with cosineSimilarity, sorts by score descending, and returns
   // the first topK [chunkId, score] pairs.
+  return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
+}
 async function buildDenseVectors({
   workspacePath,
-  config
+  config,
+  progress
 }) {
   const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
   const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
@@ -1756,6 +2031,7 @@ async function buildDenseVectors({
   const embed = await createEmbedder(cacheDir, config.modelId);
   const records = [];
   let dimensions = 0;
+  reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
   for (const chunk of chunks) {
     const embedding = await embed(createDenseChunkText(chunk));
     dimensions ||= embedding.length;
@@ -1769,7 +2045,11 @@ async function buildDenseVectors({
       text: chunk.text,
       embedding
     });
+    if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
+      reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
+    }
   }
+  reportProgress(progress, "Building dense vector index");
   const index = new VectorFieldIndex({
     numHashTables: config.indexHashTables,
     dimensions,
@@ -1793,6 +2073,7 @@ async function buildDenseVectors({
     chunks: records
   };
   await writeDensePayload(workspacePath, payload);
+  reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
   return payload;
 }
 async function denseQuery({
@@ -1805,12 +2086,19 @@ async function denseQuery({
   const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
   const embed = await createEmbedder(cacheDir, config.modelId);
   const vector = await embed(query);
+  if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
+    return exactDenseQuery(payload, vector, topK);
+  }
   const index = new VectorFieldIndex({
     numHashTables: payload.metadata.hashTables,
     dimensions: payload.metadata.dimensions,
     random: createSeededRandom(payload.metadata.randomSeed)
   }).loadState(payload.indexState);
-  return index.query(vector, topK);
+  const approximateHits = index.query(vector, topK);
+  if (approximateHits.length >= topK) {
+    return approximateHits;
+  }
+  return exactDenseQuery(payload, vector, topK);
 }
 // src/vector/sparse.ts
@@ -1904,10 +2192,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
 }
 async function buildSparseVectors({
   workspacePath,
-  config
+  config,
+  progress
 }) {
   const chunks = await readJsonl(path15.join(workspacePath, "chunks", "chunks.jsonl"));
+  reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
   const built = await buildSparseDocuments(workspacePath, config, chunks);
+  reportProgress(progress, "Building sparse vector index");
   const index = new SparseVectorFieldIndex();
   for (const record of built.chunks) {
     index.insert(record.chunkId, [record.vector]);
@@ -1929,6 +2220,7 @@ async function buildSparseVectors({
     queryTokenWeights: built.queryTokenWeights
   };
   await writeSparsePayload(workspacePath, payload);
+  reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
   return payload;
 }
 async function sparseQuery({
@@ -1951,24 +2243,24 @@ async function buildVectorArtifacts({
   config,
   denseOverride,
   sparseOverride,
-  buildAvailableModels = false
+  buildAvailableModels = false,
+  progress
 }) {
-  const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, await (async () => {
-    try {
-      await ensureUvAvailable();
-      return true;
-    } catch {
-      return false;
-    }
-  })()) : null;
+  const uvAvailable = await isUvAvailable();
+  const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
   const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
-  const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
+  const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
   const result = {};
   if (denseEnabled) {
-    result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
+    reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
+    result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
+  }
+  if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
+    reportProgress(progress, "Skipping sparse vectors because uv is not available");
   }
   if (sparseEnabled) {
-    result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
+    reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
+    result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
   }
   return result;
 }
@@ -2037,14 +2329,17 @@ async function buildIndex({
   workspacePath,
   denseOverride,
   sparseOverride,
-  buildAvailableModels = false
+  buildAvailableModels = false,
+  progress
 }) {
   const config = await loadConfig(workspacePath);
+  reportProgress(progress, "Loading documents, chunks, and sources");
   const chunks = await readJsonl(path17.join(workspacePath, "chunks", "chunks.jsonl"));
   const documents = await readJsonl(path17.join(workspacePath, "documents", "documents.jsonl"));
   const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
   const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
   const index = new DocumentIndex(createIndexMapping(metadataFields));
+  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
   for (const chunk of chunks) {
     index.index({
       id: chunk.id,
@@ -2059,6 +2354,7 @@ async function buildIndex({
       }
     });
   }
+  reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
   const createdAt = (/* @__PURE__ */ new Date()).toISOString();
   const metadata = {
     id: `index_${createdAt.replace(/[:.]/g, "-")}`,
@@ -2071,14 +2367,17 @@ async function buildIndex({
     fields: Object.keys(index.mapping),
     indexHash: sha256(JSON.stringify(index.indexState))
   };
+  reportProgress(progress, "Writing lexical index artifacts");
   const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
   const vectors = await buildVectorArtifacts({
     workspacePath,
     config,
     denseOverride,
     sparseOverride,
-    buildAvailableModels
+    buildAvailableModels,
+    progress
   });
+  reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
   return {
     metadata,
     indexPath: artifacts.indexPath,
@@ -2092,7 +2391,15 @@ import { readFile as readFile11 } from "fs/promises";
 import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
 import path18 from "path";
 // Reads the latest index state for the workspace and returns a DocumentIndex
 // with that state loaded. An error whose code is "ENOENT" is rethrown as a
 // CliError with code "INDEX_MISSING"; any other error propagates unchanged.
 async function loadHydratedIndex(workspacePath) {
-  const state = await readLatestIndexState(workspacePath);
+  let state;
+  try {
+    state = await readLatestIndexState(workspacePath);
+  } catch (error) {
+    if (error.code === "ENOENT") {
+      throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
+    }
+    throw error;
+  }
   // Only the "metadata."-prefixed keys of state.fieldState are passed to
   // createIndexMapping; a missing fieldState is treated as empty.
   const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
   return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
 }
@@ -2328,9 +2635,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
 // Removes a trailing "| Querylight TS Demo" suffix (case-insensitive),
 // collapses each whitespace run to a single space, and trims the result.
 function normalizeDisplayTitle(title) {
   return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
 }
+var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
   // Headings that chooseResultTitle() drops from the heading path. It tests
   // heading.toLowerCase(), so entries are written lower-case.
   // NOTE(review): these entries are identical to LOW_SIGNAL_HEADINGS. The
   // two lists are maintained separately and need to be kept in sync.
+  "choose this instead of",
+  "how xyz runs it",
+  "naechste schritte",
+  "next steps",
+  "overview",
+  "passend wenn",
+  "problem",
+  "right fit",
+  "waehlen sie das stattdessen",
+  "was sie bekommen",
+  "what you get",
+  "wie xyz es umsetzt",
+  "uberblick",
+  "\xFCberblick"
+]);
 function chooseResultTitle(chunk) {
   const documentTitle = normalizeDisplayTitle(chunk.title);
-  const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(Boolean);
+  const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
   const leafHeading = headings.at(-1);
   if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
     return leafHeading;
@@ -2352,6 +2675,9 @@ function normalizeUriPath(uri) {
     return uri.toLowerCase().replace(/\/+$/, "");
   }
 }
+function normalizeUriIdentity(uri) {
   // Comparison key for a URI: the normalizeRemoteUrl() result, lower-cased,
   // with trailing slashes removed.
+  return normalizeRemoteUrl(uri).toLowerCase().replace(/\/+$/, "");
+}
 function uriSpecificity(uri) {
   const normalized = normalizeUriPath(uri);
   if (normalized === "/") {
@@ -2368,6 +2694,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
   if (!candidateTitle || candidateTitle !== existingTitle) {
     return false;
   }
+  const candidateIdentity = normalizeUriIdentity(candidate.uri);
+  const existingIdentity = normalizeUriIdentity(existing.uri);
+  if (candidateIdentity === existingIdentity) {
+    return candidate.uri.length < existing.uri.length;
+  }
   const candidatePath = normalizeUriPath(candidate.uri);
   const existingPath = normalizeUriPath(existing.uri);
   if (candidatePath === existingPath) {
@@ -2480,7 +2811,6 @@ async function searchIndex({
           score: 0,
           title: chooseResultTitle(chunk),
           uri: chunk.uri,
-          headingPath: chunk.headingPath,
           snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
             document,
             config,
@@ -2544,7 +2874,6 @@ async function searchIndex({
       score,
       title: chooseResultTitle(chunk),
       uri: chunk.uri,
-      headingPath: chunk.headingPath,
       snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
         document: documents.get(chunk.documentId),
         config,
@@ -2564,7 +2893,7 @@ async function searchIndex({
 // src/query/related-service.ts
 import path19 from "path";
-function cosineSimilarity(left, right) {
+function cosineSimilarity2(left, right) {
   let dot = 0;
   let leftNorm = 0;
   let rightNorm = 0;
@@ -2650,7 +2979,7 @@ async function findRelatedDocuments({
   const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
     documentId: candidate.document.id,
     sourceId: candidate.document.sourceId,
-    score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
+    score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
     title: candidate.document.title,
     uri: candidate.document.uri,
     metadata: candidate.document.metadata
@@ -2690,7 +3019,6 @@ async function createContext({
       sourceId: result.sourceId,
       title: result.title,
       uri: result.uri,
-      headingPath: result.headingPath,
       text,
       metadata: result.metadata
     });
@@ -2703,7 +3031,6 @@ async function createContext({
       `Title: ${source.title}`,
       `URL: ${source.uri}`,
       `Chunk ID: ${source.chunkId}`,
-      source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
       "",
       source.text,
       ""