@tryformation/querylight-cli 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -20,6 +20,15 @@ var CliError = class extends Error {
20
20
  import { readFile, writeFile } from "fs/promises";
21
21
  import path from "path";
22
22
  import YAML from "yaml";
23
+
24
+ // src/core/constants.ts
25
+ var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
26
+ var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
27
+
28
+ // src/core/config.ts
29
+ function normalizeModelCacheDir(configuredPath) {
30
+ return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
31
+ }
23
32
  var defaultConfig = () => ({
24
33
  workspaceVersion: 1,
25
34
  index: {
@@ -47,17 +56,17 @@ var defaultConfig = () => ({
47
56
  retrieval: {
48
57
  defaultMode: "lexical",
49
58
  dense: {
50
- enabled: false,
59
+ enabled: true,
51
60
  modelId: "Xenova/all-MiniLM-L6-v2",
52
- cacheDir: ".kb/models/huggingface",
61
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
53
62
  indexHashTables: 8,
54
63
  indexRandomSeed: 42,
55
64
  chunkTextMode: "title-heading-text"
56
65
  },
57
66
  sparse: {
58
- enabled: false,
67
+ enabled: true,
59
68
  modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
60
- cacheDir: ".kb/models/huggingface",
69
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
61
70
  documentTopTokens: 128,
62
71
  queryEncoding: "tokenizer-token-weights",
63
72
  documentEncoding: "masked-lm-max-log1p-relu",
@@ -68,6 +77,7 @@ var defaultConfig = () => ({
68
77
  defaultUserAgent: "querylight-cli/0.1",
69
78
  obeyRobotsTxt: true,
70
79
  rateLimitMs: 1e3,
80
+ maxConcurrentRequests: 5,
71
81
  renderJs: false,
72
82
  retentionDays: 365,
73
83
  fetchArticles: true
@@ -118,11 +128,13 @@ async function loadConfig(workspacePath, configPath) {
118
128
  ...parsed.retrieval ?? {},
119
129
  dense: {
120
130
  ...defaults.retrieval.dense,
121
- ...parsed.retrieval?.dense ?? {}
131
+ ...parsed.retrieval?.dense ?? {},
132
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
122
133
  },
123
134
  sparse: {
124
135
  ...defaults.retrieval.sparse,
125
- ...parsed.retrieval?.sparse ?? {}
136
+ ...parsed.retrieval?.sparse ?? {},
137
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
126
138
  }
127
139
  },
128
140
  crawler: {
@@ -145,8 +157,6 @@ var DIRS = [
145
157
  "normalized",
146
158
  "indexes",
147
159
  "vectors",
148
- "models",
149
- "models/huggingface",
150
160
  "runs",
151
161
  "logs"
152
162
  ];
@@ -275,6 +285,27 @@ async function saveChunks(workspacePath, chunks) {
275
285
  await writeJsonl(chunksFile(workspacePath), chunks.sort((a, b) => a.id.localeCompare(b.id)));
276
286
  }
277
287
 
288
+ // src/core/concurrency.ts
289
+ async function mapWithConcurrency(items, limit, worker) {
290
+ if (items.length === 0) {
291
+ return;
292
+ }
293
+ const concurrency = Math.max(1, Math.floor(limit));
294
+ let nextIndex = 0;
295
+ await Promise.all(
296
+ Array.from({ length: Math.min(concurrency, items.length) }, async () => {
297
+ while (true) {
298
+ const index = nextIndex;
299
+ nextIndex += 1;
300
+ if (index >= items.length) {
301
+ return;
302
+ }
303
+ await worker(items[index], index);
304
+ }
305
+ })
306
+ );
307
+ }
308
+
278
309
  // src/core/files.ts
279
310
  import { stat as stat2 } from "fs/promises";
280
311
  async function fileExists(filePath) {
@@ -286,6 +317,14 @@ async function fileExists(filePath) {
286
317
  }
287
318
  }
288
319
 
320
+ // src/core/progress.ts
321
+ function reportProgress(progress, message) {
322
+ progress?.("info", message);
323
+ }
324
+ function reportProgressDetail(progress, message) {
325
+ progress?.("detail", message);
326
+ }
327
+
289
328
  // src/core/runs.ts
290
329
  import path6 from "path";
291
330
  async function writeRun(workspacePath, run) {
@@ -428,9 +467,41 @@ function stripBoilerplate(html) {
428
467
 
429
468
  // src/ingest/extractors/html-extractor.ts
430
469
  var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
470
+ var LOW_SIGNAL_SECTION_SELECTORS = [
471
+ "script",
472
+ "style",
473
+ "noscript",
474
+ "template",
475
+ "[data-blog-service-recommendations]",
476
+ "[data-blog-related-posts]"
477
+ ].join(", ");
431
478
  function cleanText(value) {
432
479
  return value.replace(/\s+/g, " ").trim();
433
480
  }
481
+ function pruneLowSignalContent($) {
482
+ $(LOW_SIGNAL_SECTION_SELECTORS).remove();
483
+ $("form").each((_, element) => {
484
+ const action = cleanText($(element).attr("action") ?? "");
485
+ if (action.includes("substack.com/subscribe")) {
486
+ $(element).closest("section").remove();
487
+ }
488
+ });
489
+ }
490
+ function stripEscapedJsonPayloads(markdown) {
491
+ return markdown.split("\n").filter((line) => {
492
+ const trimmed = line.trim();
493
+ if (trimmed.length === 0) {
494
+ return true;
495
+ }
496
+ if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
497
+ return false;
498
+ }
499
+ if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
500
+ return false;
501
+ }
502
+ return true;
503
+ }).join("\n").replace(/\n{3,}/g, "\n\n").trim();
504
+ }
434
505
  function chooseMeaningfulTitle($, fallbackTitle) {
435
506
  const candidates = [
436
507
  cleanText($("meta[property='og:title']").attr("content") ?? ""),
@@ -467,14 +538,27 @@ ${parts.join("\n\n")}
467
538
  function extractHtmlToMarkdown(html) {
468
539
  const cleaned = stripBoilerplate(html);
469
540
  const $ = load(cleaned);
541
+ pruneLowSignalContent($);
470
542
  const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
471
543
  const title = chooseMeaningfulTitle($, fallbackTitle);
472
544
  const root = $("main").first().html() ?? $.root().html() ?? cleaned;
473
545
  return {
474
- markdown: turndown.turndown(root),
546
+ markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
475
547
  title
476
548
  };
477
549
  }
550
+ function extractCanonicalUriFromHtml(html, baseUrl) {
551
+ const $ = load(html);
552
+ const href = $("link[rel='canonical']").first().attr("href")?.trim();
553
+ if (!href) {
554
+ return null;
555
+ }
556
+ try {
557
+ return new URL(href, baseUrl).href;
558
+ } catch {
559
+ return null;
560
+ }
561
+ }
478
562
  function parseDateCandidate(value) {
479
563
  const trimmed = value.trim();
480
564
  if (!trimmed) {
@@ -879,6 +963,19 @@ async function parseRssFeedDocument(xml, source) {
879
963
  // src/ingest/adapters/url-adapter.ts
880
964
  import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
881
965
  import path9 from "path";
966
+
967
+ // src/core/urls.ts
968
+ function normalizeRemoteUrl(uri) {
969
+ try {
970
+ const parsed = new URL(uri);
971
+ parsed.hash = "";
972
+ return parsed.href;
973
+ } catch {
974
+ return uri;
975
+ }
976
+ }
977
+
978
+ // src/ingest/adapters/url-adapter.ts
882
979
  function buildHttpCache(response, validatedAt) {
883
980
  return {
884
981
  etag: response.headers.get("etag") ?? void 0,
@@ -903,12 +1000,13 @@ async function normalizeRemoteDocument({
903
1000
  responseStatus
904
1001
  }) {
905
1002
  const extracted = extractHtmlToMarkdown(body);
1003
+ const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
906
1004
  const markdown = `# ${extracted.title}
907
1005
 
908
1006
  ${extracted.markdown}`;
909
- const documentId = stableId("doc", source.id, url);
1007
+ const documentId = stableId("doc", source.id, canonicalUri);
910
1008
  const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
911
- const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(url).slice(0, 12)}.html`);
1009
+ const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
912
1010
  const contentHash = sha256(markdown);
913
1011
  const now = (/* @__PURE__ */ new Date()).toISOString();
914
1012
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
@@ -921,7 +1019,7 @@ ${extracted.markdown}`;
921
1019
  documentId,
922
1020
  sourceId: source.id,
923
1021
  title: extracted.title,
924
- uri: url,
1022
+ uri: canonicalUri,
925
1023
  sourceUri,
926
1024
  publicationDate: resolvedPublicationDate,
927
1025
  crawledAt,
@@ -936,8 +1034,9 @@ ${extracted.markdown}`;
936
1034
  sourceId: source.id,
937
1035
  sourceType: source.type,
938
1036
  title: extracted.title,
939
- uri: url,
1037
+ uri: canonicalUri,
940
1038
  sourceUri,
1039
+ canonicalUri,
941
1040
  mimeType: "text/html",
942
1041
  rawPath,
943
1042
  normalizedPath,
@@ -1111,6 +1210,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1111
1210
  if (url.origin !== baseUrl.origin) {
1112
1211
  return false;
1113
1212
  }
1213
+ if (url.search.length > 0) {
1214
+ return false;
1215
+ }
1216
+ if (url.pathname.endsWith(".xml")) {
1217
+ return false;
1218
+ }
1219
+ if (url.pathname.includes("/cdn-cgi/")) {
1220
+ return false;
1221
+ }
1222
+ if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
1223
+ return false;
1224
+ }
1114
1225
  if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
1115
1226
  return false;
1116
1227
  }
@@ -1123,56 +1234,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1123
1234
  }
1124
1235
  return true;
1125
1236
  }
1126
- async function crawlWebsite(source) {
1237
+ function delay(ms) {
1238
+ return new Promise((resolve2) => setTimeout(resolve2, ms));
1239
+ }
1240
+ async function crawlWebsite(source, defaults, progress) {
1127
1241
  const baseUrl = new URL(source.uri);
1128
- const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
1242
+ const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
1129
1243
  const includePatterns = source.crawl?.includePatterns ?? [];
1130
1244
  const excludePatterns = source.crawl?.excludePatterns ?? [];
1131
1245
  const maxDepth = source.crawl?.maxDepth ?? 2;
1132
1246
  const maxPages = source.crawl?.maxPages ?? 100;
1133
- const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
1247
+ const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
1248
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
1134
1249
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
1135
- const queue = [{ url: source.uri, depth: 0 }];
1136
1250
  const seen = /* @__PURE__ */ new Set();
1137
1251
  const results = [];
1252
+ let currentLevel = [normalizeRemoteUrl(source.uri)];
1138
1253
  if (source.crawl?.useSitemap !== false) {
1139
- for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
1140
- queue.push({ url, depth: 1 });
1141
- }
1142
- }
1143
- while (queue.length > 0 && results.length < maxPages) {
1144
- const next = queue.shift();
1145
- if (!next || seen.has(next.url)) {
1146
- continue;
1147
- }
1148
- seen.add(next.url);
1149
- const url = new URL(next.url);
1150
- if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1151
- continue;
1254
+ const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
1255
+ reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
1256
+ currentLevel = [
1257
+ ...currentLevel,
1258
+ ...sitemapUrls
1259
+ ];
1260
+ }
1261
+ for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
1262
+ reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
1263
+ const nextLevelCandidates = [];
1264
+ const allowedUrls = [];
1265
+ for (const candidate of currentLevel) {
1266
+ const normalizedCandidate = normalizeRemoteUrl(candidate);
1267
+ if (seen.has(normalizedCandidate)) {
1268
+ continue;
1269
+ }
1270
+ seen.add(normalizedCandidate);
1271
+ const url = new URL(normalizedCandidate);
1272
+ if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1273
+ continue;
1274
+ }
1275
+ allowedUrls.push(normalizedCandidate);
1276
+ results.push(normalizedCandidate);
1277
+ reportProgress(progress, `Discovered ${normalizedCandidate}`);
1278
+ if (results.length >= maxPages) {
1279
+ break;
1280
+ }
1152
1281
  }
1153
- results.push(url.href);
1154
- if (next.depth >= maxDepth) {
1155
- continue;
1282
+ reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
1283
+ if (depth >= maxDepth || results.length >= maxPages) {
1284
+ break;
1156
1285
  }
1157
- const response = await fetch(url, { headers: { "user-agent": userAgent } });
1158
- const html = await response.text();
1159
- const $ = load2(html);
1160
- $("a[href]").each((_, element) => {
1161
- const href = $(element).attr("href");
1162
- if (!href) {
1163
- return;
1164
- }
1165
- try {
1166
- const target = new URL(href, url);
1167
- if (!seen.has(target.href)) {
1168
- queue.push({ url: target.href, depth: next.depth + 1 });
1286
+ await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
1287
+ const page = new URL(pageUrl);
1288
+ const response = await fetch(page, { headers: { "user-agent": userAgent } });
1289
+ const html = await response.text();
1290
+ const $ = load2(html);
1291
+ $("a[href]").each((_, element) => {
1292
+ const href = $(element).attr("href");
1293
+ if (!href) {
1294
+ return;
1295
+ }
1296
+ try {
1297
+ nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
1298
+ } catch {
1169
1299
  }
1170
- } catch {
1300
+ });
1301
+ if (rateLimitMs > 0) {
1302
+ await delay(rateLimitMs);
1171
1303
  }
1172
1304
  });
1173
- if (rateLimitMs > 0) {
1174
- await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
1175
- }
1305
+ currentLevel = nextLevelCandidates;
1176
1306
  }
1177
1307
  return results;
1178
1308
  }
@@ -1247,6 +1377,8 @@ async function ingestRssSource({
1247
1377
  source,
1248
1378
  previous,
1249
1379
  nextDocuments,
1380
+ maxConcurrentRequests,
1381
+ onDocumentProcessed,
1250
1382
  onFailure
1251
1383
  }) {
1252
1384
  if (source.crawl?.fetchArticles === false) {
@@ -1254,11 +1386,12 @@ async function ingestRssSource({
1254
1386
  }
1255
1387
  const xml = await fetchFeedText(source);
1256
1388
  const items = await parseRssFeedDocument(xml, source);
1389
+ const processedDocumentIds = /* @__PURE__ */ new Set();
1257
1390
  let added = 0;
1258
1391
  let changed = 0;
1259
1392
  let unchanged = 0;
1260
1393
  let failed = 0;
1261
- for (const item of items) {
1394
+ await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
1262
1395
  try {
1263
1396
  const probe = previous.get(stableId("doc", source.id, item.url));
1264
1397
  const document = await fetchUrlDocument({
@@ -1269,28 +1402,40 @@ async function ingestRssSource({
1269
1402
  sourceUri: source.uri,
1270
1403
  publicationDate: item.publicationDate
1271
1404
  });
1405
+ if (processedDocumentIds.has(document.id)) {
1406
+ return;
1407
+ }
1408
+ processedDocumentIds.add(document.id);
1409
+ const existingDocument = probe ?? previous.get(document.id);
1272
1410
  nextDocuments.set(document.id, document);
1273
- if (!probe) {
1411
+ if (!existingDocument) {
1274
1412
  added += 1;
1275
- } else if (probe.contentHash !== document.contentHash) {
1413
+ onDocumentProcessed?.(document.uri, "added");
1414
+ } else if (existingDocument.contentHash !== document.contentHash) {
1276
1415
  changed += 1;
1416
+ onDocumentProcessed?.(document.uri, "changed");
1277
1417
  } else {
1278
1418
  unchanged += 1;
1419
+ onDocumentProcessed?.(document.uri, "unchanged");
1279
1420
  }
1280
1421
  } catch (error) {
1281
1422
  failed += 1;
1282
1423
  onFailure(item.url, error);
1283
1424
  }
1284
- }
1425
+ });
1285
1426
  return { added, changed, unchanged, failed };
1286
1427
  }
1287
1428
  async function ingestSources({
1288
1429
  workspacePath,
1289
1430
  sourceIds,
1290
- changedOnly = false
1431
+ changedOnly = false,
1432
+ progress
1291
1433
  }) {
1292
1434
  const config = await loadConfig(workspacePath);
1293
1435
  const defaultRetentionDays = config.crawler.retentionDays;
1436
+ const defaultUserAgent = config.crawler.defaultUserAgent;
1437
+ const defaultRateLimitMs = config.crawler.rateLimitMs;
1438
+ const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
1294
1439
  const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
1295
1440
  const existing = await loadDocuments(workspacePath);
1296
1441
  const previous = previousMap(existing);
@@ -1300,20 +1445,38 @@ async function ingestSources({
1300
1445
  let unchanged = 0;
1301
1446
  let failed = 0;
1302
1447
  const failures = [];
1448
+ reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
1303
1449
  for (const source of sources) {
1450
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
1451
+ const sourceBefore = { added, changed, unchanged, failed };
1452
+ const processedDocumentIds = /* @__PURE__ */ new Set();
1453
+ const reportDocumentOutcome = (uri, outcome) => {
1454
+ const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
1455
+ reportProgress(progress, `${label} ${uri}`);
1456
+ };
1304
1457
  const ingestOne = async (uri, producer) => {
1305
1458
  try {
1306
1459
  const probeId = stableId("doc", source.id, uri);
1307
1460
  const earlier = previous.get(probeId);
1308
1461
  const document = await producer();
1462
+ if (processedDocumentIds.has(document.id)) {
1463
+ reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
1464
+ return null;
1465
+ }
1466
+ processedDocumentIds.add(document.id);
1467
+ const existingDocument = earlier ?? previous.get(document.id);
1309
1468
  nextDocuments.set(document.id, document);
1310
- if (!earlier) {
1469
+ if (!existingDocument) {
1311
1470
  added += 1;
1312
- } else if (earlier.contentHash !== document.contentHash) {
1471
+ reportDocumentOutcome(document.uri, "added");
1472
+ } else if (existingDocument.contentHash !== document.contentHash) {
1313
1473
  changed += 1;
1474
+ reportDocumentOutcome(document.uri, "changed");
1314
1475
  } else {
1315
1476
  unchanged += 1;
1477
+ reportDocumentOutcome(document.uri, "unchanged");
1316
1478
  }
1479
+ return document;
1317
1480
  } catch (error) {
1318
1481
  failed += 1;
1319
1482
  failures.push({
@@ -1321,50 +1484,69 @@ async function ingestSources({
1321
1484
  uri,
1322
1485
  message: error instanceof Error ? error.message : String(error)
1323
1486
  });
1487
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
1488
+ return null;
1324
1489
  }
1325
1490
  };
1326
1491
  try {
1492
+ reportProgress(progress, `Source ${source.name} (${source.type})`);
1327
1493
  if (source.type === "file") {
1494
+ reportProgress(progress, `Reading file ${source.uri}`);
1328
1495
  await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
1329
- continue;
1330
- }
1331
- if (source.type === "directory") {
1332
- for (const filePath of await listDirectoryFiles(source)) {
1496
+ } else if (source.type === "directory") {
1497
+ const files = await listDirectoryFiles(source);
1498
+ reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
1499
+ for (const filePath of files) {
1500
+ reportProgress(progress, `Reading file ${filePath}`);
1333
1501
  await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
1334
1502
  }
1335
- continue;
1336
- }
1337
- if (source.type === "url") {
1503
+ } else if (source.type === "url") {
1504
+ reportProgress(progress, `Fetching ${source.uri}`);
1338
1505
  await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
1339
- continue;
1340
- }
1341
- if (source.type === "website") {
1342
- for (const url of await crawlWebsite(source)) {
1343
- await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
1344
- }
1345
- continue;
1346
- }
1347
- if (source.type === "rss") {
1506
+ } else if (source.type === "website") {
1507
+ reportProgress(progress, `Crawling ${source.uri}`);
1508
+ const urls = await crawlWebsite(source, {
1509
+ userAgent: defaultUserAgent,
1510
+ rateLimitMs: defaultRateLimitMs,
1511
+ maxConcurrentRequests
1512
+ }, progress);
1513
+ reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
1514
+ const seenCanonicalUrls = /* @__PURE__ */ new Set();
1515
+ await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
1516
+ if (seenCanonicalUrls.has(url)) {
1517
+ reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
1518
+ return;
1519
+ }
1520
+ reportProgress(progress, `Fetching ${url}`);
1521
+ const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
1522
+ if (document) {
1523
+ seenCanonicalUrls.add(document.uri);
1524
+ }
1525
+ });
1526
+ } else if (source.type === "rss") {
1527
+ reportProgress(progress, `Fetching feed ${source.uri}`);
1348
1528
  const result = await ingestRssSource({
1349
1529
  workspacePath,
1350
1530
  source,
1351
1531
  previous,
1352
1532
  nextDocuments,
1533
+ maxConcurrentRequests,
1534
+ onDocumentProcessed: reportDocumentOutcome,
1353
1535
  onFailure: (uri, error) => {
1354
1536
  failures.push({
1355
1537
  sourceId: source.id,
1356
1538
  uri,
1357
1539
  message: error instanceof Error ? error.message : String(error)
1358
1540
  });
1541
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
1359
1542
  }
1360
1543
  });
1361
1544
  added += result.added;
1362
1545
  changed += result.changed;
1363
1546
  unchanged += result.unchanged;
1364
1547
  failed += result.failed;
1365
- continue;
1366
- }
1367
- if (source.type === "markdown" || source.type === "text") {
1548
+ } else if (source.type === "markdown" || source.type === "text") {
1549
+ reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
1368
1550
  await ingestOne(source.uri, () => ingestInlineContent({
1369
1551
  workspacePath,
1370
1552
  source,
@@ -1381,13 +1563,19 @@ async function ingestSources({
1381
1563
  uri: source.uri,
1382
1564
  message: error instanceof Error ? error.message : String(error)
1383
1565
  });
1566
+ reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
1384
1567
  }
1568
+ reportProgress(
1569
+ progress,
1570
+ `Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
1571
+ );
1385
1572
  }
1386
1573
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
1387
1574
  const source = sources.find((candidate) => candidate.id === document.sourceId);
1388
1575
  return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
1389
1576
  });
1390
1577
  if (expiringDocuments.length > 0) {
1578
+ reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
1391
1579
  const expiredIds = new Set(expiringDocuments.map((document) => document.id));
1392
1580
  for (const document of expiringDocuments) {
1393
1581
  nextDocuments.delete(document.id);
@@ -1414,6 +1602,7 @@ async function ingestSources({
1414
1602
  documentsSnapshot: documentSnapshot(finalDocuments)
1415
1603
  };
1416
1604
  await writeRun(workspacePath, run);
1605
+ reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
1417
1606
  return {
1418
1607
  runId: id,
1419
1608
  documents: { added, changed, unchanged, failed },
@@ -1423,7 +1612,8 @@ async function ingestSources({
1423
1612
  async function reprocessDocuments({
1424
1613
  workspacePath,
1425
1614
  sourceId,
1426
- documentId
1615
+ documentId,
1616
+ progress
1427
1617
  }) {
1428
1618
  const documents = await loadDocuments(workspacePath);
1429
1619
  const sources = await listSources(workspacePath);
@@ -1431,15 +1621,20 @@ async function reprocessDocuments({
1431
1621
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
1432
1622
  let documentsReprocessed = 0;
1433
1623
  let documentsSkipped = 0;
1434
- for (const document of documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId))) {
1624
+ const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
1625
+ reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
1626
+ for (const document of targets) {
1627
+ reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
1435
1628
  const source = sourceMap.get(document.sourceId);
1436
1629
  if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
1437
1630
  documentsSkipped += 1;
1631
+ reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
1438
1632
  continue;
1439
1633
  }
1440
1634
  const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
1441
1635
  if (!updated) {
1442
1636
  documentsSkipped += 1;
1637
+ reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
1443
1638
  continue;
1444
1639
  }
1445
1640
  nextDocuments.set(updated.id, updated);
@@ -1459,6 +1654,7 @@ async function reprocessDocuments({
1459
1654
  },
1460
1655
  documentsSnapshot: documentSnapshot(finalDocuments)
1461
1656
  });
1657
+ reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
1462
1658
  return { runId: id, documentsReprocessed, documentsSkipped };
1463
1659
  }
1464
1660
 
@@ -1560,11 +1756,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
1560
1756
  async function chunkDocuments({
1561
1757
  workspacePath,
1562
1758
  sourceId,
1563
- documentId
1759
+ documentId,
1760
+ progress
1564
1761
  }) {
1565
1762
  const config = await loadConfig(workspacePath);
1566
1763
  const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
1567
1764
  const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
1765
+ reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
1568
1766
  const targetedDocumentIds = new Set(filtered.map((document) => document.id));
1569
1767
  const existingChunks = await loadChunks(workspacePath);
1570
1768
  const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
@@ -1572,12 +1770,14 @@ async function chunkDocuments({
1572
1770
  existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
1573
1771
  );
1574
1772
  for (const document of filtered) {
1773
+ reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
1575
1774
  const raw = await readFile8(document.normalizedPath, "utf8");
1576
1775
  for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
1577
1776
  nextChunks.set(chunk.id, chunk);
1578
1777
  }
1579
1778
  }
1580
1779
  await saveChunks(workspacePath, [...nextChunks.values()]);
1780
+ reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
1581
1781
  return { chunksWritten: nextChunks.size };
1582
1782
  }
1583
1783
 
@@ -1586,15 +1786,31 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
1586
1786
  import path17 from "path";
1587
1787
 
1588
1788
  // src/vector/dense.ts
1589
- import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
1789
+ import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
1590
1790
  import { mkdir as mkdir7 } from "fs/promises";
1591
1791
  import path14 from "path";
1592
1792
 
1593
1793
  // src/vector/runtime.ts
1794
+ import os from "os";
1594
1795
  import path12 from "path";
1595
1796
  import { fileURLToPath } from "url";
1596
1797
  import { execFile, execFileSync } from "child_process";
1798
+ function resolveQliHomeDir() {
1799
+ return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
1800
+ }
1597
1801
  function resolveCacheDir(workspacePath, configuredPath) {
1802
+ if (configuredPath === "~/.qli") {
1803
+ return resolveQliHomeDir();
1804
+ }
1805
+ if (configuredPath.startsWith("~/.qli/")) {
1806
+ return path12.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
1807
+ }
1808
+ if (configuredPath === "~") {
1809
+ return os.homedir();
1810
+ }
1811
+ if (configuredPath.startsWith("~/")) {
1812
+ return path12.join(os.homedir(), configuredPath.slice(2));
1813
+ }
1598
1814
  return path12.isAbsolute(configuredPath) ? configuredPath : path12.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
1599
1815
  }
1600
1816
  function packageRootFromImportMeta(importMetaUrl) {
@@ -1618,6 +1834,14 @@ async function ensureUvAvailable() {
1618
1834
  execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
1619
1835
  });
1620
1836
  }
1837
+ async function isUvAvailable() {
1838
+ try {
1839
+ await ensureUvAvailable();
1840
+ return true;
1841
+ } catch {
1842
+ return false;
1843
+ }
1844
+ }
1621
1845
  async function runSparsePython({
1622
1846
  workspacePath,
1623
1847
  config,
@@ -1661,47 +1885,95 @@ async function getDenseTransformersRuntime(cacheDir) {
1661
1885
  }
1662
1886
 
1663
1887
  // src/vector/store.ts
1664
- import { mkdir as mkdir6, readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
1888
+ import { mkdir as mkdir6, rm as rm2, writeFile as writeFile7 } from "fs/promises";
1665
1889
  import path13 from "path";
1890
+
1891
+ // src/core/gzip-json.ts
1892
+ import { readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
1893
+ import { promisify } from "util";
1894
+ import { gunzip, gzip } from "zlib";
1895
+ var gzipAsync = promisify(gzip);
1896
+ var gunzipAsync = promisify(gunzip);
1897
+ async function writeGzipJson(filePath, value) {
1898
+ const payload = JSON.stringify(value, null, 2);
1899
+ await writeFile6(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
1900
+ }
1901
+ async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
1902
+ if (await fileExists(gzipPath)) {
1903
+ const payload = await readFile9(gzipPath);
1904
+ return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
1905
+ }
1906
+ if (legacyPath && await fileExists(legacyPath)) {
1907
+ return JSON.parse(await readFile9(legacyPath, "utf8"));
1908
+ }
1909
+ return JSON.parse(await readFile9(gzipPath, "utf8"));
1910
+ }
1911
+
1912
+ // src/vector/store.ts
1666
1913
  function vectorsDir(workspacePath) {
1667
1914
  return path13.join(workspacePath, "vectors");
1668
1915
  }
1669
- function modelsDir(workspacePath) {
1670
- return path13.join(workspacePath, "models");
1916
+ function sharedModelStateDir() {
1917
+ return path13.join(resolveQliHomeDir(), "models", "status");
1671
1918
  }
1672
1919
  function denseVectorPath(workspacePath) {
1673
- return path13.join(vectorsDir(workspacePath), "dense.latest.json");
1920
+ return path13.join(vectorsDir(workspacePath), "dense.latest.json.gz");
1674
1921
  }
1675
1922
  function denseMetaPath(workspacePath) {
1676
- return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json");
1923
+ return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
1677
1924
  }
1678
1925
  function sparseVectorPath(workspacePath) {
1679
- return path13.join(vectorsDir(workspacePath), "sparse.latest.json");
1926
+ return path13.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
1680
1927
  }
1681
1928
  function sparseMetaPath(workspacePath) {
1929
+ return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
1930
+ }
1931
+ function legacyDenseVectorPath(workspacePath) {
1932
+ return path13.join(vectorsDir(workspacePath), "dense.latest.json");
1933
+ }
1934
+ function legacyDenseMetaPath(workspacePath) {
1935
+ return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json");
1936
+ }
1937
+ function legacySparseVectorPath(workspacePath) {
1938
+ return path13.join(vectorsDir(workspacePath), "sparse.latest.json");
1939
+ }
1940
+ function legacySparseMetaPath(workspacePath) {
1682
1941
  return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
1683
1942
  }
1684
- function densePullMarker(workspacePath) {
1685
- return path13.join(modelsDir(workspacePath), "dense.pulled.json");
1943
+ function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
1944
+ const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
1945
+ const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
1946
+ return path13.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
1947
+ }
1948
+ function densePullMarker(workspacePath, modelId, cacheDir) {
1949
+ return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
1686
1950
  }
1687
- function sparsePullMarker(workspacePath) {
1688
- return path13.join(modelsDir(workspacePath), "sparse.pulled.json");
1951
+ function sparsePullMarker(workspacePath, modelId, cacheDir) {
1952
+ return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
1689
1953
  }
1690
1954
  async function writeDensePayload(workspacePath, payload) {
1691
1955
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
1692
- await writeFile6(denseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
1693
- await writeFile6(denseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
1956
+ await writeGzipJson(denseVectorPath(workspacePath), payload);
1957
+ await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
1958
+ await Promise.all([
1959
+ rm2(legacyDenseVectorPath(workspacePath), { force: true }),
1960
+ rm2(legacyDenseMetaPath(workspacePath), { force: true })
1961
+ ]);
1694
1962
  }
1695
1963
  async function readDensePayload(workspacePath) {
1696
- return JSON.parse(await readFile9(denseVectorPath(workspacePath), "utf8"));
1964
+ return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
1697
1965
  }
1698
1966
  async function writeSparsePayload(workspacePath, payload) {
1699
1967
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
1700
- await writeFile6(sparseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
1701
- await writeFile6(sparseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
1968
+ await writeGzipJson(sparseVectorPath(workspacePath), payload);
1969
+ await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
1970
+ await Promise.all([
1971
+ rm2(legacySparseVectorPath(workspacePath), { force: true }),
1972
+ rm2(legacySparseMetaPath(workspacePath), { force: true })
1973
+ ]);
1702
1974
  }
1703
1975
  async function readSparsePayload(workspacePath) {
1704
- return JSON.parse(await readFile9(sparseVectorPath(workspacePath), "utf8"));
1976
+ return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
1705
1977
  }
1706
1978
  async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1707
1979
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -1711,30 +1983,72 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
1711
1983
  configured: dense.enabled,
1712
1984
  modelId: dense.modelId,
1713
1985
  cacheDir: denseCacheDir,
1714
- available: await fileExists(densePullMarker(workspacePath)),
1715
- artifactExists: await fileExists(denseVectorPath(workspacePath))
1986
+ available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
1987
+ artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
1716
1988
  },
1717
1989
  sparse: {
1718
1990
  configured: sparse.enabled,
1719
1991
  modelId: sparse.modelId,
1720
1992
  cacheDir: sparseCacheDir,
1721
1993
  uvAvailable,
1722
- available: await fileExists(sparsePullMarker(workspacePath)),
1723
- artifactExists: await fileExists(sparseVectorPath(workspacePath))
1994
+ available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
1995
+ artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
1724
1996
  }
1725
1997
  };
1726
1998
  }
1727
1999
 
1728
2000
  // src/vector/text.ts
2001
+ var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
2002
+ "choose this instead of",
2003
+ "how xyz runs it",
2004
+ "naechste schritte",
2005
+ "next steps",
2006
+ "overview",
2007
+ "passend wenn",
2008
+ "problem",
2009
+ "right fit",
2010
+ "waehlen sie das stattdessen",
2011
+ "was sie bekommen",
2012
+ "what you get",
2013
+ "wie xyz es umsetzt",
2014
+ "uberblick",
2015
+ "\xFCberblick"
2016
+ ]);
2017
+ function normalizeHeading(value) {
2018
+ return value.trim().toLowerCase();
2019
+ }
2020
+ function isLowSignalHeading(value) {
2021
+ return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
2022
+ }
2023
+ function stripLeadingHeading(text, heading) {
2024
+ const lines = text.split("\n");
2025
+ const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
2026
+ if (firstContentIndex < 0) {
2027
+ return text;
2028
+ }
2029
+ const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
2030
+ if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
2031
+ return text;
2032
+ }
2033
+ const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
2034
+ return next;
2035
+ }
2036
+ function createVectorText(chunk) {
2037
+ const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
2038
+ const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
2039
+ const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
2040
+ return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
2041
+ }
1729
2042
  function createDenseChunkText(chunk) {
1730
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
2043
+ return createVectorText(chunk);
1731
2044
  }
1732
2045
  function createSparseChunkText(chunk) {
1733
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
2046
+ return createVectorText(chunk);
1734
2047
  }
1735
2048
 
1736
2049
  // src/vector/dense.ts
1737
2050
  var denseEmbedderFactory = null;
2051
+ var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
1738
2052
  async function createEmbedder(cacheDir, modelId) {
1739
2053
  if (denseEmbedderFactory) {
1740
2054
  return denseEmbedderFactory(cacheDir, modelId);
@@ -1746,9 +2060,13 @@ async function createEmbedder(cacheDir, modelId) {
1746
2060
  return output.tolist()[0];
1747
2061
  };
1748
2062
  }
2063
+ function exactDenseQuery(payload, vector, topK) {
2064
+ return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
2065
+ }
1749
2066
  async function buildDenseVectors({
1750
2067
  workspacePath,
1751
- config
2068
+ config,
2069
+ progress
1752
2070
  }) {
1753
2071
  const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
1754
2072
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
@@ -1756,6 +2074,7 @@ async function buildDenseVectors({
1756
2074
  const embed = await createEmbedder(cacheDir, config.modelId);
1757
2075
  const records = [];
1758
2076
  let dimensions = 0;
2077
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
1759
2078
  for (const chunk of chunks) {
1760
2079
  const embedding = await embed(createDenseChunkText(chunk));
1761
2080
  dimensions ||= embedding.length;
@@ -1769,7 +2088,11 @@ async function buildDenseVectors({
1769
2088
  text: chunk.text,
1770
2089
  embedding
1771
2090
  });
2091
+ if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
2092
+ reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
2093
+ }
1772
2094
  }
2095
+ reportProgress(progress, "Building dense vector index");
1773
2096
  const index = new VectorFieldIndex({
1774
2097
  numHashTables: config.indexHashTables,
1775
2098
  dimensions,
@@ -1793,6 +2116,7 @@ async function buildDenseVectors({
1793
2116
  chunks: records
1794
2117
  };
1795
2118
  await writeDensePayload(workspacePath, payload);
2119
+ reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
1796
2120
  return payload;
1797
2121
  }
1798
2122
  async function denseQuery({
@@ -1805,12 +2129,19 @@ async function denseQuery({
1805
2129
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
1806
2130
  const embed = await createEmbedder(cacheDir, config.modelId);
1807
2131
  const vector = await embed(query);
2132
+ if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
2133
+ return exactDenseQuery(payload, vector, topK);
2134
+ }
1808
2135
  const index = new VectorFieldIndex({
1809
2136
  numHashTables: payload.metadata.hashTables,
1810
2137
  dimensions: payload.metadata.dimensions,
1811
2138
  random: createSeededRandom(payload.metadata.randomSeed)
1812
2139
  }).loadState(payload.indexState);
1813
- return index.query(vector, topK);
2140
+ const approximateHits = index.query(vector, topK);
2141
+ if (approximateHits.length >= topK) {
2142
+ return approximateHits;
2143
+ }
2144
+ return exactDenseQuery(payload, vector, topK);
1814
2145
  }
1815
2146
 
1816
2147
  // src/vector/sparse.ts
@@ -1904,10 +2235,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
1904
2235
  }
1905
2236
  async function buildSparseVectors({
1906
2237
  workspacePath,
1907
- config
2238
+ config,
2239
+ progress
1908
2240
  }) {
1909
2241
  const chunks = await readJsonl(path15.join(workspacePath, "chunks", "chunks.jsonl"));
2242
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
1910
2243
  const built = await buildSparseDocuments(workspacePath, config, chunks);
2244
+ reportProgress(progress, "Building sparse vector index");
1911
2245
  const index = new SparseVectorFieldIndex();
1912
2246
  for (const record of built.chunks) {
1913
2247
  index.insert(record.chunkId, [record.vector]);
@@ -1929,6 +2263,7 @@ async function buildSparseVectors({
1929
2263
  queryTokenWeights: built.queryTokenWeights
1930
2264
  };
1931
2265
  await writeSparsePayload(workspacePath, payload);
2266
+ reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
1932
2267
  return payload;
1933
2268
  }
1934
2269
  async function sparseQuery({
@@ -1951,51 +2286,80 @@ async function buildVectorArtifacts({
1951
2286
  config,
1952
2287
  denseOverride,
1953
2288
  sparseOverride,
1954
- buildAvailableModels = false
2289
+ buildAvailableModels = false,
2290
+ progress
1955
2291
  }) {
1956
- const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, await (async () => {
1957
- try {
1958
- await ensureUvAvailable();
1959
- return true;
1960
- } catch {
1961
- return false;
1962
- }
1963
- })()) : null;
2292
+ const uvAvailable = await isUvAvailable();
2293
+ const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
1964
2294
  const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
1965
- const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
2295
+ const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
1966
2296
  const result = {};
1967
2297
  if (denseEnabled) {
1968
- result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
2298
+ reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
2299
+ result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
2300
+ }
2301
+ if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
2302
+ reportProgress(progress, "Skipping sparse vectors because uv is not available");
1969
2303
  }
1970
2304
  if (sparseEnabled) {
1971
- result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
2305
+ reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
2306
+ result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
1972
2307
  }
1973
2308
  return result;
1974
2309
  }
1975
2310
 
1976
2311
  // src/index/index-store.ts
1977
- import { readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
2312
+ import { mkdir as mkdir9, rm as rm3 } from "fs/promises";
1978
2313
  import path16 from "path";
2314
+ function versionedIndexPath(workspacePath, stamp) {
2315
+ return path16.join(workspacePath, "indexes", `${stamp}.json.gz`);
2316
+ }
2317
+ function versionedLegacyIndexPath(workspacePath, stamp) {
2318
+ return path16.join(workspacePath, "indexes", `${stamp}.json`);
2319
+ }
2320
+ function versionedMetaPath(workspacePath, stamp) {
2321
+ return path16.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
2322
+ }
2323
+ function versionedLegacyMetaPath(workspacePath, stamp) {
2324
+ return path16.join(workspacePath, "indexes", `${stamp}.meta.json`);
2325
+ }
2326
+ function latestIndexPath(workspacePath) {
2327
+ return path16.join(workspacePath, "indexes", "latest.json.gz");
2328
+ }
2329
+ function legacyLatestIndexPath(workspacePath) {
2330
+ return path16.join(workspacePath, "indexes", "latest.json");
2331
+ }
2332
+ function latestMetaPath(workspacePath) {
2333
+ return path16.join(workspacePath, "indexes", "latest.meta.json.gz");
2334
+ }
2335
+ function legacyLatestMetaPath(workspacePath) {
2336
+ return path16.join(workspacePath, "indexes", "latest.meta.json");
2337
+ }
1979
2338
  async function writeIndexArtifacts({
1980
2339
  workspacePath,
1981
2340
  indexState,
1982
2341
  metadata
1983
2342
  }) {
1984
2343
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");
1985
- const indexPath = path16.join(workspacePath, "indexes", `${stamp}.json`);
1986
- const metaPath = path16.join(workspacePath, "indexes", `${stamp}.meta.json`);
1987
- const latestIndexPath = path16.join(workspacePath, "indexes", "latest.json");
1988
- const latestMetaPath = path16.join(workspacePath, "indexes", "latest.meta.json");
1989
- const indexPayload = JSON.stringify(indexState, null, 2);
1990
- const metaPayload = JSON.stringify(metadata, null, 2);
1991
- await writeFile7(indexPath, indexPayload, "utf8");
1992
- await writeFile7(metaPath, metaPayload, "utf8");
1993
- await writeFile7(latestIndexPath, indexPayload, "utf8");
1994
- await writeFile7(latestMetaPath, metaPayload, "utf8");
1995
- return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
2344
+ const indexPath = versionedIndexPath(workspacePath, stamp);
2345
+ const metaPath = versionedMetaPath(workspacePath, stamp);
2346
+ const latestIndexArtifactPath = latestIndexPath(workspacePath);
2347
+ const latestMetadataArtifactPath = latestMetaPath(workspacePath);
2348
+ await mkdir9(path16.join(workspacePath, "indexes"), { recursive: true });
2349
+ await writeGzipJson(indexPath, indexState);
2350
+ await writeGzipJson(metaPath, metadata);
2351
+ await writeGzipJson(latestIndexArtifactPath, indexState);
2352
+ await writeGzipJson(latestMetadataArtifactPath, metadata);
2353
+ await Promise.all([
2354
+ rm3(legacyLatestIndexPath(workspacePath), { force: true }),
2355
+ rm3(legacyLatestMetaPath(workspacePath), { force: true }),
2356
+ rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
2357
+ rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
2358
+ ]);
2359
+ return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
1996
2360
  }
1997
2361
  async function readLatestIndexState(workspacePath) {
1998
- return JSON.parse(await readFile10(path16.join(workspacePath, "indexes", "latest.json"), "utf8"));
2362
+ return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
1999
2363
  }
2000
2364
 
2001
2365
  // src/index/querylight-indexer.ts
@@ -2037,14 +2401,17 @@ async function buildIndex({
2037
2401
  workspacePath,
2038
2402
  denseOverride,
2039
2403
  sparseOverride,
2040
- buildAvailableModels = false
2404
+ buildAvailableModels = false,
2405
+ progress
2041
2406
  }) {
2042
2407
  const config = await loadConfig(workspacePath);
2408
+ reportProgress(progress, "Loading documents, chunks, and sources");
2043
2409
  const chunks = await readJsonl(path17.join(workspacePath, "chunks", "chunks.jsonl"));
2044
2410
  const documents = await readJsonl(path17.join(workspacePath, "documents", "documents.jsonl"));
2045
2411
  const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
2046
2412
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
2047
2413
  const index = new DocumentIndex(createIndexMapping(metadataFields));
2414
+ reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
2048
2415
  for (const chunk of chunks) {
2049
2416
  index.index({
2050
2417
  id: chunk.id,
@@ -2059,6 +2426,7 @@ async function buildIndex({
2059
2426
  }
2060
2427
  });
2061
2428
  }
2429
+ reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
2062
2430
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
2063
2431
  const metadata = {
2064
2432
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
@@ -2071,14 +2439,17 @@ async function buildIndex({
2071
2439
  fields: Object.keys(index.mapping),
2072
2440
  indexHash: sha256(JSON.stringify(index.indexState))
2073
2441
  };
2442
+ reportProgress(progress, "Writing lexical index artifacts");
2074
2443
  const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
2075
2444
  const vectors = await buildVectorArtifacts({
2076
2445
  workspacePath,
2077
2446
  config,
2078
2447
  denseOverride,
2079
2448
  sparseOverride,
2080
- buildAvailableModels
2449
+ buildAvailableModels,
2450
+ progress
2081
2451
  });
2452
+ reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
2082
2453
  return {
2083
2454
  metadata,
2084
2455
  indexPath: artifacts.indexPath,
@@ -2088,11 +2459,19 @@ async function buildIndex({
2088
2459
  }
2089
2460
 
2090
2461
  // src/query/search-service.ts
2091
- import { readFile as readFile11 } from "fs/promises";
2462
+ import { readFile as readFile10 } from "fs/promises";
2092
2463
  import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2093
2464
  import path18 from "path";
2094
2465
  async function loadHydratedIndex(workspacePath) {
2095
- const state = await readLatestIndexState(workspacePath);
2466
+ let state;
2467
+ try {
2468
+ state = await readLatestIndexState(workspacePath);
2469
+ } catch (error) {
2470
+ if (error.code === "ENOENT") {
2471
+ throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
2472
+ }
2473
+ throw error;
2474
+ }
2096
2475
  const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
2097
2476
  return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
2098
2477
  }
@@ -2310,7 +2689,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2310
2689
  if (!await fileExists(document.normalizedPath)) {
2311
2690
  return buildSnippet(chunk.text, query);
2312
2691
  }
2313
- const raw = await readFile11(document.normalizedPath, "utf8");
2692
+ const raw = await readFile10(document.normalizedPath, "utf8");
2314
2693
  orderedChunks = buildChunksForDocument(document, raw, config);
2315
2694
  orderedChunkCache.set(document.id, orderedChunks);
2316
2695
  }
@@ -2328,9 +2707,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2328
2707
  function normalizeDisplayTitle(title) {
2329
2708
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2330
2709
  }
2710
+ var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
2711
+ "choose this instead of",
2712
+ "how xyz runs it",
2713
+ "naechste schritte",
2714
+ "next steps",
2715
+ "overview",
2716
+ "passend wenn",
2717
+ "problem",
2718
+ "right fit",
2719
+ "waehlen sie das stattdessen",
2720
+ "was sie bekommen",
2721
+ "what you get",
2722
+ "wie xyz es umsetzt",
2723
+ "uberblick",
2724
+ "\xFCberblick"
2725
+ ]);
2331
2726
  function chooseResultTitle(chunk) {
2332
2727
  const documentTitle = normalizeDisplayTitle(chunk.title);
2333
- const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(Boolean);
2728
+ const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
2334
2729
  const leafHeading = headings.at(-1);
2335
2730
  if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
2336
2731
  return leafHeading;
@@ -2352,6 +2747,9 @@ function normalizeUriPath(uri) {
2352
2747
  return uri.toLowerCase().replace(/\/+$/, "");
2353
2748
  }
2354
2749
  }
2750
+ function normalizeUriIdentity(uri) {
2751
+ return normalizeRemoteUrl(uri).toLowerCase().replace(/\/+$/, "");
2752
+ }
2355
2753
  function uriSpecificity(uri) {
2356
2754
  const normalized = normalizeUriPath(uri);
2357
2755
  if (normalized === "/") {
@@ -2368,6 +2766,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
2368
2766
  if (!candidateTitle || candidateTitle !== existingTitle) {
2369
2767
  return false;
2370
2768
  }
2769
+ const candidateIdentity = normalizeUriIdentity(candidate.uri);
2770
+ const existingIdentity = normalizeUriIdentity(existing.uri);
2771
+ if (candidateIdentity === existingIdentity) {
2772
+ return candidate.uri.length < existing.uri.length;
2773
+ }
2371
2774
  const candidatePath = normalizeUriPath(candidate.uri);
2372
2775
  const existingPath = normalizeUriPath(existing.uri);
2373
2776
  if (candidatePath === existingPath) {
@@ -2480,7 +2883,6 @@ async function searchIndex({
2480
2883
  score: 0,
2481
2884
  title: chooseResultTitle(chunk),
2482
2885
  uri: chunk.uri,
2483
- headingPath: chunk.headingPath,
2484
2886
  snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
2485
2887
  document,
2486
2888
  config,
@@ -2544,7 +2946,6 @@ async function searchIndex({
2544
2946
  score,
2545
2947
  title: chooseResultTitle(chunk),
2546
2948
  uri: chunk.uri,
2547
- headingPath: chunk.headingPath,
2548
2949
  snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2549
2950
  document: documents.get(chunk.documentId),
2550
2951
  config,
@@ -2564,7 +2965,7 @@ async function searchIndex({
2564
2965
 
2565
2966
  // src/query/related-service.ts
2566
2967
  import path19 from "path";
2567
- function cosineSimilarity(left, right) {
2968
+ function cosineSimilarity2(left, right) {
2568
2969
  let dot = 0;
2569
2970
  let leftNorm = 0;
2570
2971
  let rightNorm = 0;
@@ -2650,7 +3051,7 @@ async function findRelatedDocuments({
2650
3051
  const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
2651
3052
  documentId: candidate.document.id,
2652
3053
  sourceId: candidate.document.sourceId,
2653
- score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
3054
+ score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
2654
3055
  title: candidate.document.title,
2655
3056
  uri: candidate.document.uri,
2656
3057
  metadata: candidate.document.metadata
@@ -2690,7 +3091,6 @@ async function createContext({
2690
3091
  sourceId: result.sourceId,
2691
3092
  title: result.title,
2692
3093
  uri: result.uri,
2693
- headingPath: result.headingPath,
2694
3094
  text,
2695
3095
  metadata: result.metadata
2696
3096
  });
@@ -2703,7 +3103,6 @@ async function createContext({
2703
3103
  `Title: ${source.title}`,
2704
3104
  `URL: ${source.uri}`,
2705
3105
  `Chunk ID: ${source.chunkId}`,
2706
- source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
2707
3106
  "",
2708
3107
  source.text,
2709
3108
  ""