@tryformation/querylight-cli 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -20,6 +20,15 @@ var CliError = class extends Error {
20
20
  import { readFile, writeFile } from "fs/promises";
21
21
  import path from "path";
22
22
  import YAML from "yaml";
23
+
24
+ // src/core/constants.ts
25
+ var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
26
+ var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
27
+
28
+ // src/core/config.ts
29
+ function normalizeModelCacheDir(configuredPath) {
30
+ return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
31
+ }
23
32
  var defaultConfig = () => ({
24
33
  workspaceVersion: 1,
25
34
  index: {
@@ -47,17 +56,17 @@ var defaultConfig = () => ({
47
56
  retrieval: {
48
57
  defaultMode: "lexical",
49
58
  dense: {
50
- enabled: false,
59
+ enabled: true,
51
60
  modelId: "Xenova/all-MiniLM-L6-v2",
52
- cacheDir: ".kb/models/huggingface",
61
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
53
62
  indexHashTables: 8,
54
63
  indexRandomSeed: 42,
55
64
  chunkTextMode: "title-heading-text"
56
65
  },
57
66
  sparse: {
58
- enabled: false,
67
+ enabled: true,
59
68
  modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
60
- cacheDir: ".kb/models/huggingface",
69
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
61
70
  documentTopTokens: 128,
62
71
  queryEncoding: "tokenizer-token-weights",
63
72
  documentEncoding: "masked-lm-max-log1p-relu",
@@ -68,6 +77,7 @@ var defaultConfig = () => ({
68
77
  defaultUserAgent: "querylight-cli/0.1",
69
78
  obeyRobotsTxt: true,
70
79
  rateLimitMs: 1e3,
80
+ maxConcurrentRequests: 5,
71
81
  renderJs: false,
72
82
  retentionDays: 365,
73
83
  fetchArticles: true
@@ -118,11 +128,13 @@ async function loadConfig(workspacePath, configPath) {
118
128
  ...parsed.retrieval ?? {},
119
129
  dense: {
120
130
  ...defaults.retrieval.dense,
121
- ...parsed.retrieval?.dense ?? {}
131
+ ...parsed.retrieval?.dense ?? {},
132
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
122
133
  },
123
134
  sparse: {
124
135
  ...defaults.retrieval.sparse,
125
- ...parsed.retrieval?.sparse ?? {}
136
+ ...parsed.retrieval?.sparse ?? {},
137
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
126
138
  }
127
139
  },
128
140
  crawler: {
@@ -145,8 +157,6 @@ var DIRS = [
145
157
  "normalized",
146
158
  "indexes",
147
159
  "vectors",
148
- "models",
149
- "models/huggingface",
150
160
  "runs",
151
161
  "logs"
152
162
  ];
@@ -275,6 +285,27 @@ async function saveChunks(workspacePath, chunks) {
275
285
  await writeJsonl(chunksFile(workspacePath), chunks.sort((a, b) => a.id.localeCompare(b.id)));
276
286
  }
277
287
 
288
+ // src/core/concurrency.ts
289
+ async function mapWithConcurrency(items, limit, worker) {
290
+ if (items.length === 0) {
291
+ return;
292
+ }
293
+ const concurrency = Math.max(1, Math.floor(limit));
294
+ let nextIndex = 0;
295
+ await Promise.all(
296
+ Array.from({ length: Math.min(concurrency, items.length) }, async () => {
297
+ while (true) {
298
+ const index = nextIndex;
299
+ nextIndex += 1;
300
+ if (index >= items.length) {
301
+ return;
302
+ }
303
+ await worker(items[index], index);
304
+ }
305
+ })
306
+ );
307
+ }
308
+
278
309
  // src/core/files.ts
279
310
  import { stat as stat2 } from "fs/promises";
280
311
  async function fileExists(filePath) {
@@ -286,6 +317,14 @@ async function fileExists(filePath) {
286
317
  }
287
318
  }
288
319
 
320
+ // src/core/progress.ts
321
+ function reportProgress(progress, message) {
322
+ progress?.("info", message);
323
+ }
324
+ function reportProgressDetail(progress, message) {
325
+ progress?.("detail", message);
326
+ }
327
+
289
328
  // src/core/runs.ts
290
329
  import path6 from "path";
291
330
  async function writeRun(workspacePath, run) {
@@ -428,9 +467,41 @@ function stripBoilerplate(html) {
428
467
 
429
468
  // src/ingest/extractors/html-extractor.ts
430
469
  var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
470
+ var LOW_SIGNAL_SECTION_SELECTORS = [
471
+ "script",
472
+ "style",
473
+ "noscript",
474
+ "template",
475
+ "[data-blog-service-recommendations]",
476
+ "[data-blog-related-posts]"
477
+ ].join(", ");
431
478
  function cleanText(value) {
432
479
  return value.replace(/\s+/g, " ").trim();
433
480
  }
481
+ function pruneLowSignalContent($) {
482
+ $(LOW_SIGNAL_SECTION_SELECTORS).remove();
483
+ $("form").each((_, element) => {
484
+ const action = cleanText($(element).attr("action") ?? "");
485
+ if (action.includes("substack.com/subscribe")) {
486
+ $(element).closest("section").remove();
487
+ }
488
+ });
489
+ }
490
+ function stripEscapedJsonPayloads(markdown) {
491
+ return markdown.split("\n").filter((line) => {
492
+ const trimmed = line.trim();
493
+ if (trimmed.length === 0) {
494
+ return true;
495
+ }
496
+ if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
497
+ return false;
498
+ }
499
+ if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
500
+ return false;
501
+ }
502
+ return true;
503
+ }).join("\n").replace(/\n{3,}/g, "\n\n").trim();
504
+ }
434
505
  function chooseMeaningfulTitle($, fallbackTitle) {
435
506
  const candidates = [
436
507
  cleanText($("meta[property='og:title']").attr("content") ?? ""),
@@ -467,14 +538,27 @@ ${parts.join("\n\n")}
467
538
  function extractHtmlToMarkdown(html) {
468
539
  const cleaned = stripBoilerplate(html);
469
540
  const $ = load(cleaned);
541
+ pruneLowSignalContent($);
470
542
  const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
471
543
  const title = chooseMeaningfulTitle($, fallbackTitle);
472
544
  const root = $("main").first().html() ?? $.root().html() ?? cleaned;
473
545
  return {
474
- markdown: turndown.turndown(root),
546
+ markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
475
547
  title
476
548
  };
477
549
  }
550
+ function extractCanonicalUriFromHtml(html, baseUrl) {
551
+ const $ = load(html);
552
+ const href = $("link[rel='canonical']").first().attr("href")?.trim();
553
+ if (!href) {
554
+ return null;
555
+ }
556
+ try {
557
+ return new URL(href, baseUrl).href;
558
+ } catch {
559
+ return null;
560
+ }
561
+ }
478
562
  function parseDateCandidate(value) {
479
563
  const trimmed = value.trim();
480
564
  if (!trimmed) {
@@ -879,6 +963,19 @@ async function parseRssFeedDocument(xml, source) {
879
963
  // src/ingest/adapters/url-adapter.ts
880
964
  import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
881
965
  import path9 from "path";
966
+
967
+ // src/core/urls.ts
968
+ function normalizeRemoteUrl(uri) {
969
+ try {
970
+ const parsed = new URL(uri);
971
+ parsed.hash = "";
972
+ return parsed.href;
973
+ } catch {
974
+ return uri;
975
+ }
976
+ }
977
+
978
+ // src/ingest/adapters/url-adapter.ts
882
979
  function buildHttpCache(response, validatedAt) {
883
980
  return {
884
981
  etag: response.headers.get("etag") ?? void 0,
@@ -903,12 +1000,13 @@ async function normalizeRemoteDocument({
903
1000
  responseStatus
904
1001
  }) {
905
1002
  const extracted = extractHtmlToMarkdown(body);
1003
+ const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
906
1004
  const markdown = `# ${extracted.title}
907
1005
 
908
1006
  ${extracted.markdown}`;
909
- const documentId = stableId("doc", source.id, url);
1007
+ const documentId = stableId("doc", source.id, canonicalUri);
910
1008
  const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
911
- const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(url).slice(0, 12)}.html`);
1009
+ const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
912
1010
  const contentHash = sha256(markdown);
913
1011
  const now = (/* @__PURE__ */ new Date()).toISOString();
914
1012
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
@@ -921,7 +1019,7 @@ ${extracted.markdown}`;
921
1019
  documentId,
922
1020
  sourceId: source.id,
923
1021
  title: extracted.title,
924
- uri: url,
1022
+ uri: canonicalUri,
925
1023
  sourceUri,
926
1024
  publicationDate: resolvedPublicationDate,
927
1025
  crawledAt,
@@ -936,8 +1034,9 @@ ${extracted.markdown}`;
936
1034
  sourceId: source.id,
937
1035
  sourceType: source.type,
938
1036
  title: extracted.title,
939
- uri: url,
1037
+ uri: canonicalUri,
940
1038
  sourceUri,
1039
+ canonicalUri,
941
1040
  mimeType: "text/html",
942
1041
  rawPath,
943
1042
  normalizedPath,
@@ -1111,6 +1210,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1111
1210
  if (url.origin !== baseUrl.origin) {
1112
1211
  return false;
1113
1212
  }
1213
+ if (url.search.length > 0) {
1214
+ return false;
1215
+ }
1216
+ if (url.pathname.endsWith(".xml")) {
1217
+ return false;
1218
+ }
1219
+ if (url.pathname.includes("/cdn-cgi/")) {
1220
+ return false;
1221
+ }
1222
+ if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
1223
+ return false;
1224
+ }
1114
1225
  if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
1115
1226
  return false;
1116
1227
  }
@@ -1123,56 +1234,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1123
1234
  }
1124
1235
  return true;
1125
1236
  }
1126
- async function crawlWebsite(source) {
1237
+ function delay(ms) {
1238
+ return new Promise((resolve2) => setTimeout(resolve2, ms));
1239
+ }
1240
+ async function crawlWebsite(source, defaults, progress) {
1127
1241
  const baseUrl = new URL(source.uri);
1128
- const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
1242
+ const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
1129
1243
  const includePatterns = source.crawl?.includePatterns ?? [];
1130
1244
  const excludePatterns = source.crawl?.excludePatterns ?? [];
1131
1245
  const maxDepth = source.crawl?.maxDepth ?? 2;
1132
1246
  const maxPages = source.crawl?.maxPages ?? 100;
1133
- const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
1247
+ const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
1248
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
1134
1249
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
1135
- const queue = [{ url: source.uri, depth: 0 }];
1136
1250
  const seen = /* @__PURE__ */ new Set();
1137
1251
  const results = [];
1252
+ let currentLevel = [normalizeRemoteUrl(source.uri)];
1138
1253
  if (source.crawl?.useSitemap !== false) {
1139
- for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
1140
- queue.push({ url, depth: 1 });
1141
- }
1142
- }
1143
- while (queue.length > 0 && results.length < maxPages) {
1144
- const next = queue.shift();
1145
- if (!next || seen.has(next.url)) {
1146
- continue;
1147
- }
1148
- seen.add(next.url);
1149
- const url = new URL(next.url);
1150
- if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1151
- continue;
1254
+ const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
1255
+ reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
1256
+ currentLevel = [
1257
+ ...currentLevel,
1258
+ ...sitemapUrls
1259
+ ];
1260
+ }
1261
+ for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
1262
+ reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
1263
+ const nextLevelCandidates = [];
1264
+ const allowedUrls = [];
1265
+ for (const candidate of currentLevel) {
1266
+ const normalizedCandidate = normalizeRemoteUrl(candidate);
1267
+ if (seen.has(normalizedCandidate)) {
1268
+ continue;
1269
+ }
1270
+ seen.add(normalizedCandidate);
1271
+ const url = new URL(normalizedCandidate);
1272
+ if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1273
+ continue;
1274
+ }
1275
+ allowedUrls.push(normalizedCandidate);
1276
+ results.push(normalizedCandidate);
1277
+ reportProgress(progress, `Discovered ${normalizedCandidate}`);
1278
+ if (results.length >= maxPages) {
1279
+ break;
1280
+ }
1152
1281
  }
1153
- results.push(url.href);
1154
- if (next.depth >= maxDepth) {
1155
- continue;
1282
+ reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
1283
+ if (depth >= maxDepth || results.length >= maxPages) {
1284
+ break;
1156
1285
  }
1157
- const response = await fetch(url, { headers: { "user-agent": userAgent } });
1158
- const html = await response.text();
1159
- const $ = load2(html);
1160
- $("a[href]").each((_, element) => {
1161
- const href = $(element).attr("href");
1162
- if (!href) {
1163
- return;
1164
- }
1165
- try {
1166
- const target = new URL(href, url);
1167
- if (!seen.has(target.href)) {
1168
- queue.push({ url: target.href, depth: next.depth + 1 });
1286
+ await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
1287
+ const page = new URL(pageUrl);
1288
+ const response = await fetch(page, { headers: { "user-agent": userAgent } });
1289
+ const html = await response.text();
1290
+ const $ = load2(html);
1291
+ $("a[href]").each((_, element) => {
1292
+ const href = $(element).attr("href");
1293
+ if (!href) {
1294
+ return;
1169
1295
  }
1170
- } catch {
1296
+ try {
1297
+ nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
1298
+ } catch {
1299
+ }
1300
+ });
1301
+ if (rateLimitMs > 0) {
1302
+ await delay(rateLimitMs);
1171
1303
  }
1172
1304
  });
1173
- if (rateLimitMs > 0) {
1174
- await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
1175
- }
1305
+ currentLevel = nextLevelCandidates;
1176
1306
  }
1177
1307
  return results;
1178
1308
  }
@@ -1247,6 +1377,8 @@ async function ingestRssSource({
1247
1377
  source,
1248
1378
  previous,
1249
1379
  nextDocuments,
1380
+ maxConcurrentRequests,
1381
+ onDocumentProcessed,
1250
1382
  onFailure
1251
1383
  }) {
1252
1384
  if (source.crawl?.fetchArticles === false) {
@@ -1254,11 +1386,12 @@ async function ingestRssSource({
1254
1386
  }
1255
1387
  const xml = await fetchFeedText(source);
1256
1388
  const items = await parseRssFeedDocument(xml, source);
1389
+ const processedDocumentIds = /* @__PURE__ */ new Set();
1257
1390
  let added = 0;
1258
1391
  let changed = 0;
1259
1392
  let unchanged = 0;
1260
1393
  let failed = 0;
1261
- for (const item of items) {
1394
+ await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
1262
1395
  try {
1263
1396
  const probe = previous.get(stableId("doc", source.id, item.url));
1264
1397
  const document = await fetchUrlDocument({
@@ -1269,28 +1402,40 @@ async function ingestRssSource({
1269
1402
  sourceUri: source.uri,
1270
1403
  publicationDate: item.publicationDate
1271
1404
  });
1405
+ if (processedDocumentIds.has(document.id)) {
1406
+ return;
1407
+ }
1408
+ processedDocumentIds.add(document.id);
1409
+ const existingDocument = probe ?? previous.get(document.id);
1272
1410
  nextDocuments.set(document.id, document);
1273
- if (!probe) {
1411
+ if (!existingDocument) {
1274
1412
  added += 1;
1275
- } else if (probe.contentHash !== document.contentHash) {
1413
+ onDocumentProcessed?.(document.uri, "added");
1414
+ } else if (existingDocument.contentHash !== document.contentHash) {
1276
1415
  changed += 1;
1416
+ onDocumentProcessed?.(document.uri, "changed");
1277
1417
  } else {
1278
1418
  unchanged += 1;
1419
+ onDocumentProcessed?.(document.uri, "unchanged");
1279
1420
  }
1280
1421
  } catch (error) {
1281
1422
  failed += 1;
1282
1423
  onFailure(item.url, error);
1283
1424
  }
1284
- }
1425
+ });
1285
1426
  return { added, changed, unchanged, failed };
1286
1427
  }
1287
1428
  async function ingestSources({
1288
1429
  workspacePath,
1289
1430
  sourceIds,
1290
- changedOnly = false
1431
+ changedOnly = false,
1432
+ progress
1291
1433
  }) {
1292
1434
  const config = await loadConfig(workspacePath);
1293
1435
  const defaultRetentionDays = config.crawler.retentionDays;
1436
+ const defaultUserAgent = config.crawler.defaultUserAgent;
1437
+ const defaultRateLimitMs = config.crawler.rateLimitMs;
1438
+ const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
1294
1439
  const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
1295
1440
  const existing = await loadDocuments(workspacePath);
1296
1441
  const previous = previousMap(existing);
@@ -1300,20 +1445,38 @@ async function ingestSources({
1300
1445
  let unchanged = 0;
1301
1446
  let failed = 0;
1302
1447
  const failures = [];
1448
+ reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
1303
1449
  for (const source of sources) {
1450
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
1451
+ const sourceBefore = { added, changed, unchanged, failed };
1452
+ const processedDocumentIds = /* @__PURE__ */ new Set();
1453
+ const reportDocumentOutcome = (uri, outcome) => {
1454
+ const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
1455
+ reportProgress(progress, `${label} ${uri}`);
1456
+ };
1304
1457
  const ingestOne = async (uri, producer) => {
1305
1458
  try {
1306
1459
  const probeId = stableId("doc", source.id, uri);
1307
1460
  const earlier = previous.get(probeId);
1308
1461
  const document = await producer();
1462
+ if (processedDocumentIds.has(document.id)) {
1463
+ reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
1464
+ return null;
1465
+ }
1466
+ processedDocumentIds.add(document.id);
1467
+ const existingDocument = earlier ?? previous.get(document.id);
1309
1468
  nextDocuments.set(document.id, document);
1310
- if (!earlier) {
1469
+ if (!existingDocument) {
1311
1470
  added += 1;
1312
- } else if (earlier.contentHash !== document.contentHash) {
1471
+ reportDocumentOutcome(document.uri, "added");
1472
+ } else if (existingDocument.contentHash !== document.contentHash) {
1313
1473
  changed += 1;
1474
+ reportDocumentOutcome(document.uri, "changed");
1314
1475
  } else {
1315
1476
  unchanged += 1;
1477
+ reportDocumentOutcome(document.uri, "unchanged");
1316
1478
  }
1479
+ return document;
1317
1480
  } catch (error) {
1318
1481
  failed += 1;
1319
1482
  failures.push({
@@ -1321,50 +1484,69 @@ async function ingestSources({
1321
1484
  uri,
1322
1485
  message: error instanceof Error ? error.message : String(error)
1323
1486
  });
1487
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
1488
+ return null;
1324
1489
  }
1325
1490
  };
1326
1491
  try {
1492
+ reportProgress(progress, `Source ${source.name} (${source.type})`);
1327
1493
  if (source.type === "file") {
1494
+ reportProgress(progress, `Reading file ${source.uri}`);
1328
1495
  await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
1329
- continue;
1330
- }
1331
- if (source.type === "directory") {
1332
- for (const filePath of await listDirectoryFiles(source)) {
1496
+ } else if (source.type === "directory") {
1497
+ const files = await listDirectoryFiles(source);
1498
+ reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
1499
+ for (const filePath of files) {
1500
+ reportProgress(progress, `Reading file ${filePath}`);
1333
1501
  await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
1334
1502
  }
1335
- continue;
1336
- }
1337
- if (source.type === "url") {
1503
+ } else if (source.type === "url") {
1504
+ reportProgress(progress, `Fetching ${source.uri}`);
1338
1505
  await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
1339
- continue;
1340
- }
1341
- if (source.type === "website") {
1342
- for (const url of await crawlWebsite(source)) {
1343
- await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
1344
- }
1345
- continue;
1346
- }
1347
- if (source.type === "rss") {
1506
+ } else if (source.type === "website") {
1507
+ reportProgress(progress, `Crawling ${source.uri}`);
1508
+ const urls = await crawlWebsite(source, {
1509
+ userAgent: defaultUserAgent,
1510
+ rateLimitMs: defaultRateLimitMs,
1511
+ maxConcurrentRequests
1512
+ }, progress);
1513
+ reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
1514
+ const seenCanonicalUrls = /* @__PURE__ */ new Set();
1515
+ await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
1516
+ if (seenCanonicalUrls.has(url)) {
1517
+ reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
1518
+ return;
1519
+ }
1520
+ reportProgress(progress, `Fetching ${url}`);
1521
+ const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
1522
+ if (document) {
1523
+ seenCanonicalUrls.add(document.uri);
1524
+ }
1525
+ });
1526
+ } else if (source.type === "rss") {
1527
+ reportProgress(progress, `Fetching feed ${source.uri}`);
1348
1528
  const result = await ingestRssSource({
1349
1529
  workspacePath,
1350
1530
  source,
1351
1531
  previous,
1352
1532
  nextDocuments,
1533
+ maxConcurrentRequests,
1534
+ onDocumentProcessed: reportDocumentOutcome,
1353
1535
  onFailure: (uri, error) => {
1354
1536
  failures.push({
1355
1537
  sourceId: source.id,
1356
1538
  uri,
1357
1539
  message: error instanceof Error ? error.message : String(error)
1358
1540
  });
1541
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
1359
1542
  }
1360
1543
  });
1361
1544
  added += result.added;
1362
1545
  changed += result.changed;
1363
1546
  unchanged += result.unchanged;
1364
1547
  failed += result.failed;
1365
- continue;
1366
- }
1367
- if (source.type === "markdown" || source.type === "text") {
1548
+ } else if (source.type === "markdown" || source.type === "text") {
1549
+ reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
1368
1550
  await ingestOne(source.uri, () => ingestInlineContent({
1369
1551
  workspacePath,
1370
1552
  source,
@@ -1381,13 +1563,19 @@ async function ingestSources({
1381
1563
  uri: source.uri,
1382
1564
  message: error instanceof Error ? error.message : String(error)
1383
1565
  });
1566
+ reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
1384
1567
  }
1568
+ reportProgress(
1569
+ progress,
1570
+ `Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
1571
+ );
1385
1572
  }
1386
1573
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
1387
1574
  const source = sources.find((candidate) => candidate.id === document.sourceId);
1388
1575
  return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
1389
1576
  });
1390
1577
  if (expiringDocuments.length > 0) {
1578
+ reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
1391
1579
  const expiredIds = new Set(expiringDocuments.map((document) => document.id));
1392
1580
  for (const document of expiringDocuments) {
1393
1581
  nextDocuments.delete(document.id);
@@ -1414,6 +1602,7 @@ async function ingestSources({
1414
1602
  documentsSnapshot: documentSnapshot(finalDocuments)
1415
1603
  };
1416
1604
  await writeRun(workspacePath, run);
1605
+ reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
1417
1606
  return {
1418
1607
  runId: id,
1419
1608
  documents: { added, changed, unchanged, failed },
@@ -1423,7 +1612,8 @@ async function ingestSources({
1423
1612
  async function reprocessDocuments({
1424
1613
  workspacePath,
1425
1614
  sourceId,
1426
- documentId
1615
+ documentId,
1616
+ progress
1427
1617
  }) {
1428
1618
  const documents = await loadDocuments(workspacePath);
1429
1619
  const sources = await listSources(workspacePath);
@@ -1431,15 +1621,20 @@ async function reprocessDocuments({
1431
1621
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
1432
1622
  let documentsReprocessed = 0;
1433
1623
  let documentsSkipped = 0;
1434
- for (const document of documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId))) {
1624
+ const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
1625
+ reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
1626
+ for (const document of targets) {
1627
+ reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
1435
1628
  const source = sourceMap.get(document.sourceId);
1436
1629
  if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
1437
1630
  documentsSkipped += 1;
1631
+ reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
1438
1632
  continue;
1439
1633
  }
1440
1634
  const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
1441
1635
  if (!updated) {
1442
1636
  documentsSkipped += 1;
1637
+ reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
1443
1638
  continue;
1444
1639
  }
1445
1640
  nextDocuments.set(updated.id, updated);
@@ -1459,6 +1654,7 @@ async function reprocessDocuments({
1459
1654
  },
1460
1655
  documentsSnapshot: documentSnapshot(finalDocuments)
1461
1656
  });
1657
+ reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
1462
1658
  return { runId: id, documentsReprocessed, documentsSkipped };
1463
1659
  }
1464
1660
 
@@ -1560,11 +1756,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
1560
1756
  async function chunkDocuments({
1561
1757
  workspacePath,
1562
1758
  sourceId,
1563
- documentId
1759
+ documentId,
1760
+ progress
1564
1761
  }) {
1565
1762
  const config = await loadConfig(workspacePath);
1566
1763
  const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
1567
1764
  const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
1765
+ reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
1568
1766
  const targetedDocumentIds = new Set(filtered.map((document) => document.id));
1569
1767
  const existingChunks = await loadChunks(workspacePath);
1570
1768
  const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
@@ -1572,12 +1770,14 @@ async function chunkDocuments({
1572
1770
  existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
1573
1771
  );
1574
1772
  for (const document of filtered) {
1773
+ reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
1575
1774
  const raw = await readFile8(document.normalizedPath, "utf8");
1576
1775
  for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
1577
1776
  nextChunks.set(chunk.id, chunk);
1578
1777
  }
1579
1778
  }
1580
1779
  await saveChunks(workspacePath, [...nextChunks.values()]);
1780
+ reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
1581
1781
  return { chunksWritten: nextChunks.size };
1582
1782
  }
1583
1783
 
@@ -1586,15 +1786,31 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
1586
1786
  import path17 from "path";
1587
1787
 
1588
1788
  // src/vector/dense.ts
1589
- import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
1789
+ import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
1590
1790
  import { mkdir as mkdir7 } from "fs/promises";
1591
1791
  import path14 from "path";
1592
1792
 
1593
1793
  // src/vector/runtime.ts
1794
+ import os from "os";
1594
1795
  import path12 from "path";
1595
1796
  import { fileURLToPath } from "url";
1596
1797
  import { execFile, execFileSync } from "child_process";
1798
+ function resolveQliHomeDir() {
1799
+ return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
1800
+ }
1597
1801
  function resolveCacheDir(workspacePath, configuredPath) {
1802
+ if (configuredPath === "~/.qli") {
1803
+ return resolveQliHomeDir();
1804
+ }
1805
+ if (configuredPath.startsWith("~/.qli/")) {
1806
+ return path12.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
1807
+ }
1808
+ if (configuredPath === "~") {
1809
+ return os.homedir();
1810
+ }
1811
+ if (configuredPath.startsWith("~/")) {
1812
+ return path12.join(os.homedir(), configuredPath.slice(2));
1813
+ }
1598
1814
  return path12.isAbsolute(configuredPath) ? configuredPath : path12.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
1599
1815
  }
1600
1816
  function packageRootFromImportMeta(importMetaUrl) {
@@ -1618,6 +1834,14 @@ async function ensureUvAvailable() {
1618
1834
  execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
1619
1835
  });
1620
1836
  }
1837
+ async function isUvAvailable() {
1838
+ try {
1839
+ await ensureUvAvailable();
1840
+ return true;
1841
+ } catch {
1842
+ return false;
1843
+ }
1844
+ }
1621
1845
  async function runSparsePython({
1622
1846
  workspacePath,
1623
1847
  config,
@@ -1666,8 +1890,8 @@ import path13 from "path";
1666
1890
  function vectorsDir(workspacePath) {
1667
1891
  return path13.join(workspacePath, "vectors");
1668
1892
  }
1669
- function modelsDir(workspacePath) {
1670
- return path13.join(workspacePath, "models");
1893
+ function sharedModelStateDir() {
1894
+ return path13.join(resolveQliHomeDir(), "models", "status");
1671
1895
  }
1672
1896
  function denseVectorPath(workspacePath) {
1673
1897
  return path13.join(vectorsDir(workspacePath), "dense.latest.json");
@@ -1681,11 +1905,16 @@ function sparseVectorPath(workspacePath) {
1681
1905
  function sparseMetaPath(workspacePath) {
1682
1906
  return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
1683
1907
  }
1684
/**
 * Builds the path of the marker file recording that a model has been pulled.
 *
 * The file lives in `<shared model state dir>/<type>/` and is named after the
 * URI-encoded model id plus the first 16 characters of the SHA-256 of the
 * resolved cache directory, so the same model in different cache directories
 * gets distinct markers.
 *
 * @param {string} type Marker sub-directory (callers in this file pass "dense" or "sparse").
 * @param {string} workspacePath Workspace root used to resolve relative cache paths.
 * @param {string} modelId Model identifier; URI-encoded for use in the file name.
 * @param {string} cacheDir Configured cache directory.
 * @returns {string} Path of the marker file.
 */
function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
  const cacheLocation = resolveCacheDir(workspacePath, cacheDir);
  const cacheKey = sha256(cacheLocation).slice(0, 16);
  const fileName = `${encodeURIComponent(modelId)}.${cacheKey}.json`;
  return path13.join(sharedModelStateDir(), type, fileName);
}
1687
/**
 * Returns the pull-marker path for a dense model.
 *
 * @param {string} workspacePath Workspace root used to resolve relative cache paths.
 * @param {string} modelId Dense model identifier.
 * @param {string} cacheDir Configured cache directory of the dense model.
 * @returns {string} Path of the dense model's marker file.
 */
function densePullMarker(workspacePath, modelId, cacheDir) {
  const markerType = "dense";
  return pullMarkerPath(markerType, workspacePath, modelId, cacheDir);
}
1916
/**
 * Returns the pull-marker path for a sparse model.
 *
 * @param {string} workspacePath Workspace root used to resolve relative cache paths.
 * @param {string} modelId Sparse model identifier.
 * @param {string} cacheDir Configured cache directory of the sparse model.
 * @returns {string} Path of the sparse model's marker file.
 */
function sparsePullMarker(workspacePath, modelId, cacheDir) {
  const markerType = "sparse";
  return pullMarkerPath(markerType, workspacePath, modelId, cacheDir);
}
1690
1919
  async function writeDensePayload(workspacePath, payload) {
1691
1920
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
@@ -1711,7 +1940,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1711
1940
  configured: dense.enabled,
1712
1941
  modelId: dense.modelId,
1713
1942
  cacheDir: denseCacheDir,
1714
- available: await fileExists(densePullMarker(workspacePath)),
1943
+ available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
1715
1944
  artifactExists: await fileExists(denseVectorPath(workspacePath))
1716
1945
  },
1717
1946
  sparse: {
@@ -1719,22 +1948,64 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1719
1948
  modelId: sparse.modelId,
1720
1949
  cacheDir: sparseCacheDir,
1721
1950
  uvAvailable,
1722
- available: await fileExists(sparsePullMarker(workspacePath)),
1951
+ available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
1723
1952
  artifactExists: await fileExists(sparseVectorPath(workspacePath))
1724
1953
  }
1725
1954
  };
1726
1955
  }
1727
1956
 
1728
1957
  // src/vector/text.ts
1958
/**
 * Section headings (English and German, stored trimmed and lower-cased) that
 * are treated as low-signal when vector text is assembled. Lookups are expected
 * to pass a heading normalized the same way.
 */
var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
  "choose this instead of",
  "how xyz runs it",
  "naechste schritte",
  "next steps",
  "overview",
  "passend wenn",
  "problem",
  "right fit",
  "waehlen sie das stattdessen",
  "was sie bekommen",
  "what you get",
  "wie xyz es umsetzt",
  "uberblick",
  "\xFCberblick"
]);
1974
/**
 * Produces the comparison form of a heading: surrounding whitespace removed,
 * then lower-cased.
 *
 * @param {string} value Raw heading text.
 * @returns {string} Trimmed, lower-cased heading.
 */
function normalizeHeading(value) {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
1977
/**
 * Tells whether a heading, once normalized, is listed in `LOW_SIGNAL_HEADINGS`.
 *
 * @param {string} value Raw heading text.
 * @returns {boolean} `true` when the normalized heading is a known low-signal heading.
 */
function isLowSignalHeading(value) {
  const key = normalizeHeading(value);
  return LOW_SIGNAL_HEADINGS.has(key);
}
1980
/**
 * Removes the first non-blank line of `text` when that line is a Markdown ATX
 * heading (`#` through `######`) whose title equals `heading` after
 * normalization.
 *
 * When the line is removed, the remaining text is returned trimmed. In every
 * other case (`text` is blank, the first non-blank line is not a heading, or the
 * heading differs) `text` is returned unchanged.
 *
 * @param {string} text Chunk text that may start with a heading line.
 * @param {string} heading Heading expected at the start of `text`.
 * @returns {string} The text without the matching leading heading line.
 */
function stripLeadingHeading(text, heading) {
  const lines = text.split("\n");
  const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
  if (firstContentIndex < 0) {
    return text;
  }
  // Text with CRLF line endings leaves a trailing "\r" on every split line.
  // "." never matches "\r", so without removing it the pattern below could not
  // match and such headings would never be stripped.
  const candidate = lines[firstContentIndex].replace(/\r$/, "");
  const match = /^(#{1,6})\s+(.+)$/.exec(candidate);
  if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
    return text;
  }
  lines.splice(firstContentIndex, 1);
  return lines.join("\n").trim();
}
1993
/**
 * Assembles the text that is embedded for a chunk.
 *
 * Headings from `chunk.headingPath` are split into two groups: redundant ones
 * (low-signal, or equal to the chunk title after normalization) and the rest.
 * The non-redundant headings are kept, in order, between the title and the body.
 * If at least one redundant heading exists, the last one in the path is handed
 * to `stripLeadingHeading` to remove it from the start of the chunk text;
 * otherwise the chunk text is only trimmed.
 *
 * @param {{title: string, headingPath: string[], text: string}} chunk Chunk to render.
 * @returns {string} Title, kept headings and body joined by blank lines; empty parts are omitted.
 */
function createVectorText(chunk) {
  const titleKey = normalizeHeading(chunk.title);
  const keptHeadings = [];
  let innermostRedundant;
  for (const heading of chunk.headingPath) {
    const redundant = isLowSignalHeading(heading) || normalizeHeading(heading) === titleKey;
    if (redundant) {
      // Later entries overwrite earlier ones, leaving the last redundant heading.
      innermostRedundant = heading;
    } else {
      keptHeadings.push(heading);
    }
  }
  let body;
  if (innermostRedundant) {
    body = stripLeadingHeading(chunk.text, innermostRedundant);
  } else {
    body = chunk.text.trim();
  }
  const parts = [chunk.title, ...keptHeadings, body];
  return parts.filter(Boolean).join("\n\n");
}
1729
1999
  function createDenseChunkText(chunk) {
1730
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
2000
+ return createVectorText(chunk);
1731
2001
  }
1732
2002
  function createSparseChunkText(chunk) {
1733
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
2003
+ return createVectorText(chunk);
1734
2004
  }
1735
2005
 
1736
2006
  // src/vector/dense.ts
1737
2007
  var denseEmbedderFactory = null;
2008
/**
 * Largest number of stored chunks for which a dense query scores every chunk
 * exactly instead of consulting the approximate vector index.
 */
var EXACT_DENSE_RERANK_THRESHOLD = 5000;
1738
2009
  async function createEmbedder(cacheDir, modelId) {
1739
2010
  if (denseEmbedderFactory) {
1740
2011
  return denseEmbedderFactory(cacheDir, modelId);
@@ -1746,9 +2017,13 @@ async function createEmbedder(cacheDir, modelId) {
1746
2017
  return output.tolist()[0];
1747
2018
  };
1748
2019
  }
2020
/**
 * Scores every chunk in the payload against the query vector with
 * `cosineSimilarity` and returns the best matches.
 *
 * @param {{chunks: Array<{chunkId: string, embedding: number[]}>}} payload Dense payload holding chunk embeddings.
 * @param {number[]} vector Query embedding.
 * @param {number} topK Maximum number of hits to return.
 * @returns {Array<[string, number]>} `[chunkId, score]` pairs ordered by descending score.
 */
function exactDenseQuery(payload, vector, topK) {
  const scored = [];
  for (const chunk of payload.chunks) {
    scored.push([chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]);
  }
  scored.sort(([, leftScore], [, rightScore]) => rightScore - leftScore);
  return scored.slice(0, topK);
}
1749
2023
  async function buildDenseVectors({
1750
2024
  workspacePath,
1751
- config
2025
+ config,
2026
+ progress
1752
2027
  }) {
1753
2028
  const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
1754
2029
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
@@ -1756,6 +2031,7 @@ async function buildDenseVectors({
1756
2031
  const embed = await createEmbedder(cacheDir, config.modelId);
1757
2032
  const records = [];
1758
2033
  let dimensions = 0;
2034
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
1759
2035
  for (const chunk of chunks) {
1760
2036
  const embedding = await embed(createDenseChunkText(chunk));
1761
2037
  dimensions ||= embedding.length;
@@ -1769,7 +2045,11 @@ async function buildDenseVectors({
1769
2045
  text: chunk.text,
1770
2046
  embedding
1771
2047
  });
2048
+ if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
2049
+ reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
2050
+ }
1772
2051
  }
2052
+ reportProgress(progress, "Building dense vector index");
1773
2053
  const index = new VectorFieldIndex({
1774
2054
  numHashTables: config.indexHashTables,
1775
2055
  dimensions,
@@ -1793,6 +2073,7 @@ async function buildDenseVectors({
1793
2073
  chunks: records
1794
2074
  };
1795
2075
  await writeDensePayload(workspacePath, payload);
2076
+ reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
1796
2077
  return payload;
1797
2078
  }
1798
2079
  async function denseQuery({
@@ -1805,12 +2086,19 @@ async function denseQuery({
1805
2086
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
1806
2087
  const embed = await createEmbedder(cacheDir, config.modelId);
1807
2088
  const vector = await embed(query);
2089
+ if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
2090
+ return exactDenseQuery(payload, vector, topK);
2091
+ }
1808
2092
  const index = new VectorFieldIndex({
1809
2093
  numHashTables: payload.metadata.hashTables,
1810
2094
  dimensions: payload.metadata.dimensions,
1811
2095
  random: createSeededRandom(payload.metadata.randomSeed)
1812
2096
  }).loadState(payload.indexState);
1813
- return index.query(vector, topK);
2097
+ const approximateHits = index.query(vector, topK);
2098
+ if (approximateHits.length >= topK) {
2099
+ return approximateHits;
2100
+ }
2101
+ return exactDenseQuery(payload, vector, topK);
1814
2102
  }
1815
2103
 
1816
2104
  // src/vector/sparse.ts
@@ -1904,10 +2192,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
1904
2192
  }
1905
2193
  async function buildSparseVectors({
1906
2194
  workspacePath,
1907
- config
2195
+ config,
2196
+ progress
1908
2197
  }) {
1909
2198
  const chunks = await readJsonl(path15.join(workspacePath, "chunks", "chunks.jsonl"));
2199
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
1910
2200
  const built = await buildSparseDocuments(workspacePath, config, chunks);
2201
+ reportProgress(progress, "Building sparse vector index");
1911
2202
  const index = new SparseVectorFieldIndex();
1912
2203
  for (const record of built.chunks) {
1913
2204
  index.insert(record.chunkId, [record.vector]);
@@ -1929,6 +2220,7 @@ async function buildSparseVectors({
1929
2220
  queryTokenWeights: built.queryTokenWeights
1930
2221
  };
1931
2222
  await writeSparsePayload(workspacePath, payload);
2223
+ reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
1932
2224
  return payload;
1933
2225
  }
1934
2226
  async function sparseQuery({
@@ -1951,24 +2243,24 @@ async function buildVectorArtifacts({
1951
2243
  config,
1952
2244
  denseOverride,
1953
2245
  sparseOverride,
1954
- buildAvailableModels = false
2246
+ buildAvailableModels = false,
2247
+ progress
1955
2248
  }) {
1956
- const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, await (async () => {
1957
- try {
1958
- await ensureUvAvailable();
1959
- return true;
1960
- } catch {
1961
- return false;
1962
- }
1963
- })()) : null;
2249
+ const uvAvailable = await isUvAvailable();
2250
+ const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
1964
2251
  const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
1965
- const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
2252
+ const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
1966
2253
  const result = {};
1967
2254
  if (denseEnabled) {
1968
- result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
2255
+ reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
2256
+ result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
2257
+ }
2258
+ if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
2259
+ reportProgress(progress, "Skipping sparse vectors because uv is not available");
1969
2260
  }
1970
2261
  if (sparseEnabled) {
1971
- result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
2262
+ reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
2263
+ result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
1972
2264
  }
1973
2265
  return result;
1974
2266
  }
@@ -2037,14 +2329,17 @@ async function buildIndex({
2037
2329
  workspacePath,
2038
2330
  denseOverride,
2039
2331
  sparseOverride,
2040
- buildAvailableModels = false
2332
+ buildAvailableModels = false,
2333
+ progress
2041
2334
  }) {
2042
2335
  const config = await loadConfig(workspacePath);
2336
+ reportProgress(progress, "Loading documents, chunks, and sources");
2043
2337
  const chunks = await readJsonl(path17.join(workspacePath, "chunks", "chunks.jsonl"));
2044
2338
  const documents = await readJsonl(path17.join(workspacePath, "documents", "documents.jsonl"));
2045
2339
  const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
2046
2340
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
2047
2341
  const index = new DocumentIndex(createIndexMapping(metadataFields));
2342
+ reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
2048
2343
  for (const chunk of chunks) {
2049
2344
  index.index({
2050
2345
  id: chunk.id,
@@ -2059,6 +2354,7 @@ async function buildIndex({
2059
2354
  }
2060
2355
  });
2061
2356
  }
2357
+ reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
2062
2358
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
2063
2359
  const metadata = {
2064
2360
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
@@ -2071,14 +2367,17 @@ async function buildIndex({
2071
2367
  fields: Object.keys(index.mapping),
2072
2368
  indexHash: sha256(JSON.stringify(index.indexState))
2073
2369
  };
2370
+ reportProgress(progress, "Writing lexical index artifacts");
2074
2371
  const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
2075
2372
  const vectors = await buildVectorArtifacts({
2076
2373
  workspacePath,
2077
2374
  config,
2078
2375
  denseOverride,
2079
2376
  sparseOverride,
2080
- buildAvailableModels
2377
+ buildAvailableModels,
2378
+ progress
2081
2379
  });
2380
+ reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
2082
2381
  return {
2083
2382
  metadata,
2084
2383
  indexPath: artifacts.indexPath,
@@ -2092,7 +2391,15 @@ import { readFile as readFile11 } from "fs/promises";
2092
2391
  import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2093
2392
  import path18 from "path";
2094
2393
  async function loadHydratedIndex(workspacePath) {
2095
- const state = await readLatestIndexState(workspacePath);
2394
+ let state;
2395
+ try {
2396
+ state = await readLatestIndexState(workspacePath);
2397
+ } catch (error) {
2398
+ if (error.code === "ENOENT") {
2399
+ throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
2400
+ }
2401
+ throw error;
2402
+ }
2096
2403
  const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
2097
2404
  return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
2098
2405
  }
@@ -2328,9 +2635,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2328
2635
  function normalizeDisplayTitle(title) {
2329
2636
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2330
2637
  }
2638
/**
 * Lower-cased headings (English and German) that are excluded from the heading
 * candidates when a display title is chosen for a search result.
 */
var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
  "choose this instead of",
  "how xyz runs it",
  "naechste schritte",
  "next steps",
  "overview",
  "passend wenn",
  "problem",
  "right fit",
  "waehlen sie das stattdessen",
  "was sie bekommen",
  "what you get",
  "wie xyz es umsetzt",
  "uberblick",
  "\xFCberblick"
]);
2331
2654
  function chooseResultTitle(chunk) {
2332
2655
  const documentTitle = normalizeDisplayTitle(chunk.title);
2333
- const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(Boolean);
2656
+ const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
2334
2657
  const leafHeading = headings.at(-1);
2335
2658
  if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
2336
2659
  return leafHeading;
@@ -2352,6 +2675,9 @@ function normalizeUriPath(uri) {
2352
2675
  return uri.toLowerCase().replace(/\/+$/, "");
2353
2676
  }
2354
2677
  }
2678
/**
 * Derives a comparison key for a URI: the result of `normalizeRemoteUrl`,
 * lower-cased, with all trailing slashes removed.
 *
 * @param {string} uri URI to normalize.
 * @returns {string} Key used to decide whether two URIs are the same.
 */
function normalizeUriIdentity(uri) {
  const canonical = normalizeRemoteUrl(uri);
  const lowered = canonical.toLowerCase();
  return lowered.replace(/\/+$/, "");
}
2355
2681
  function uriSpecificity(uri) {
2356
2682
  const normalized = normalizeUriPath(uri);
2357
2683
  if (normalized === "/") {
@@ -2368,6 +2694,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
2368
2694
  if (!candidateTitle || candidateTitle !== existingTitle) {
2369
2695
  return false;
2370
2696
  }
2697
+ const candidateIdentity = normalizeUriIdentity(candidate.uri);
2698
+ const existingIdentity = normalizeUriIdentity(existing.uri);
2699
+ if (candidateIdentity === existingIdentity) {
2700
+ return candidate.uri.length < existing.uri.length;
2701
+ }
2371
2702
  const candidatePath = normalizeUriPath(candidate.uri);
2372
2703
  const existingPath = normalizeUriPath(existing.uri);
2373
2704
  if (candidatePath === existingPath) {
@@ -2480,7 +2811,6 @@ async function searchIndex({
2480
2811
  score: 0,
2481
2812
  title: chooseResultTitle(chunk),
2482
2813
  uri: chunk.uri,
2483
- headingPath: chunk.headingPath,
2484
2814
  snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
2485
2815
  document,
2486
2816
  config,
@@ -2544,7 +2874,6 @@ async function searchIndex({
2544
2874
  score,
2545
2875
  title: chooseResultTitle(chunk),
2546
2876
  uri: chunk.uri,
2547
- headingPath: chunk.headingPath,
2548
2877
  snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2549
2878
  document: documents.get(chunk.documentId),
2550
2879
  config,
@@ -2564,7 +2893,7 @@ async function searchIndex({
2564
2893
 
2565
2894
  // src/query/related-service.ts
2566
2895
  import path19 from "path";
2567
- function cosineSimilarity(left, right) {
2896
+ function cosineSimilarity2(left, right) {
2568
2897
  let dot = 0;
2569
2898
  let leftNorm = 0;
2570
2899
  let rightNorm = 0;
@@ -2650,7 +2979,7 @@ async function findRelatedDocuments({
2650
2979
  const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
2651
2980
  documentId: candidate.document.id,
2652
2981
  sourceId: candidate.document.sourceId,
2653
- score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
2982
+ score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
2654
2983
  title: candidate.document.title,
2655
2984
  uri: candidate.document.uri,
2656
2985
  metadata: candidate.document.metadata
@@ -2690,7 +3019,6 @@ async function createContext({
2690
3019
  sourceId: result.sourceId,
2691
3020
  title: result.title,
2692
3021
  uri: result.uri,
2693
- headingPath: result.headingPath,
2694
3022
  text,
2695
3023
  metadata: result.metadata
2696
3024
  });
@@ -2703,7 +3031,6 @@ async function createContext({
2703
3031
  `Title: ${source.title}`,
2704
3032
  `URL: ${source.uri}`,
2705
3033
  `Chunk ID: ${source.chunkId}`,
2706
- source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
2707
3034
  "",
2708
3035
  source.text,
2709
3036
  ""