@tryformation/querylight-cli 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/main.js CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/cli/run-cli.ts
4
- import { Command } from "commander";
4
+ import { Command, Option } from "commander";
5
5
  import { stat as stat4 } from "fs/promises";
6
6
  import path21 from "path";
7
7
 
@@ -14,6 +14,17 @@ import path4 from "path";
14
14
  import { readFile, writeFile } from "fs/promises";
15
15
  import path from "path";
16
16
  import YAML from "yaml";
17
+
18
+ // src/core/constants.ts
19
+ var PACKAGE_VERSION = "0.2.1";
20
+ var DEFAULT_WORKSPACE = ".kb";
21
+ var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
22
+ var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
23
+
24
+ // src/core/config.ts
25
+ function normalizeModelCacheDir(configuredPath) {
26
+ return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
27
+ }
17
28
  var defaultConfig = () => ({
18
29
  workspaceVersion: 1,
19
30
  index: {
@@ -41,17 +52,17 @@ var defaultConfig = () => ({
41
52
  retrieval: {
42
53
  defaultMode: "lexical",
43
54
  dense: {
44
- enabled: false,
55
+ enabled: true,
45
56
  modelId: "Xenova/all-MiniLM-L6-v2",
46
- cacheDir: ".kb/models/huggingface",
57
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
47
58
  indexHashTables: 8,
48
59
  indexRandomSeed: 42,
49
60
  chunkTextMode: "title-heading-text"
50
61
  },
51
62
  sparse: {
52
- enabled: false,
63
+ enabled: true,
53
64
  modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
54
- cacheDir: ".kb/models/huggingface",
65
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
55
66
  documentTopTokens: 128,
56
67
  queryEncoding: "tokenizer-token-weights",
57
68
  documentEncoding: "masked-lm-max-log1p-relu",
@@ -62,6 +73,7 @@ var defaultConfig = () => ({
62
73
  defaultUserAgent: "querylight-cli/0.1",
63
74
  obeyRobotsTxt: true,
64
75
  rateLimitMs: 1e3,
76
+ maxConcurrentRequests: 5,
65
77
  renderJs: false,
66
78
  retentionDays: 365,
67
79
  fetchArticles: true
@@ -112,11 +124,13 @@ async function loadConfig(workspacePath, configPath) {
112
124
  ...parsed.retrieval ?? {},
113
125
  dense: {
114
126
  ...defaults.retrieval.dense,
115
- ...parsed.retrieval?.dense ?? {}
127
+ ...parsed.retrieval?.dense ?? {},
128
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
116
129
  },
117
130
  sparse: {
118
131
  ...defaults.retrieval.sparse,
119
- ...parsed.retrieval?.sparse ?? {}
132
+ ...parsed.retrieval?.sparse ?? {},
133
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
120
134
  }
121
135
  },
122
136
  crawler: {
@@ -162,6 +176,14 @@ async function writeJsonl(filePath, records) {
162
176
  ` : "", "utf8");
163
177
  }
164
178
 
179
+ // src/core/progress.ts
180
+ function reportProgress(progress, message) {
181
+ progress?.("info", message);
182
+ }
183
+ function reportProgressDetail(progress, message) {
184
+ progress?.("detail", message);
185
+ }
186
+
165
187
  // src/chunk/chunk-store.ts
166
188
  import path3 from "path";
167
189
  function chunksFile(workspacePath) {
@@ -269,11 +291,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
269
291
  async function chunkDocuments({
270
292
  workspacePath,
271
293
  sourceId,
272
- documentId
294
+ documentId,
295
+ progress
273
296
  }) {
274
297
  const config = await loadConfig(workspacePath);
275
298
  const documents = await readJsonl(path4.join(workspacePath, "documents", "documents.jsonl"));
276
299
  const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
300
+ reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
277
301
  const targetedDocumentIds = new Set(filtered.map((document) => document.id));
278
302
  const existingChunks = await loadChunks(workspacePath);
279
303
  const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
@@ -281,19 +305,17 @@ async function chunkDocuments({
281
305
  existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
282
306
  );
283
307
  for (const document of filtered) {
308
+ reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
284
309
  const raw = await readFile3(document.normalizedPath, "utf8");
285
310
  for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
286
311
  nextChunks.set(chunk.id, chunk);
287
312
  }
288
313
  }
289
314
  await saveChunks(workspacePath, [...nextChunks.values()]);
315
+ reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
290
316
  return { chunksWritten: nextChunks.size };
291
317
  }
292
318
 
293
- // src/core/constants.ts
294
- var PACKAGE_VERSION = "0.1.0";
295
- var DEFAULT_WORKSPACE = ".kb";
296
-
297
319
  // src/core/errors.ts
298
320
  var CliError = class extends Error {
299
321
  constructor(message, code, exitCode, details) {
@@ -319,8 +341,6 @@ var DIRS = [
319
341
  "normalized",
320
342
  "indexes",
321
343
  "vectors",
322
- "models",
323
- "models/huggingface",
324
344
  "runs",
325
345
  "logs"
326
346
  ];
@@ -358,11 +378,12 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
358
378
  import path11 from "path";
359
379
 
360
380
  // src/vector/dense.ts
361
- import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
381
+ import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
362
382
  import { mkdir as mkdir4 } from "fs/promises";
363
383
  import path8 from "path";
364
384
 
365
385
  // src/vector/runtime.ts
386
+ import os from "os";
366
387
  import path6 from "path";
367
388
  import { fileURLToPath } from "url";
368
389
  import { execFile, execFileSync } from "child_process";
@@ -379,7 +400,22 @@ async function fileExists(filePath) {
379
400
  }
380
401
 
381
402
  // src/vector/runtime.ts
403
+ function resolveQliHomeDir() {
404
+ return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
405
+ }
382
406
  function resolveCacheDir(workspacePath, configuredPath) {
407
+ if (configuredPath === "~/.qli") {
408
+ return resolveQliHomeDir();
409
+ }
410
+ if (configuredPath.startsWith("~/.qli/")) {
411
+ return path6.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
412
+ }
413
+ if (configuredPath === "~") {
414
+ return os.homedir();
415
+ }
416
+ if (configuredPath.startsWith("~/")) {
417
+ return path6.join(os.homedir(), configuredPath.slice(2));
418
+ }
383
419
  return path6.isAbsolute(configuredPath) ? configuredPath : path6.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
384
420
  }
385
421
  function packageRootFromImportMeta(importMetaUrl) {
@@ -403,6 +439,14 @@ async function ensureUvAvailable() {
403
439
  execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
404
440
  });
405
441
  }
442
+ async function isUvAvailable() {
443
+ try {
444
+ await ensureUvAvailable();
445
+ return true;
446
+ } catch {
447
+ return false;
448
+ }
449
+ }
406
450
  async function runSparsePython({
407
451
  workspacePath,
408
452
  config,
@@ -446,55 +490,114 @@ async function getDenseTransformersRuntime(cacheDir) {
446
490
  }
447
491
 
448
492
  // src/vector/store.ts
449
- import { mkdir as mkdir3, readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
493
+ import { mkdir as mkdir3, rm, writeFile as writeFile4 } from "fs/promises";
450
494
  import path7 from "path";
495
+
496
+ // src/core/gzip-json.ts
497
+ import { readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
498
+ import { promisify } from "util";
499
+ import { gunzip, gzip } from "zlib";
500
+ var gzipAsync = promisify(gzip);
501
+ var gunzipAsync = promisify(gunzip);
502
+ async function writeGzipJson(filePath, value) {
503
+ const payload = JSON.stringify(value, null, 2);
504
+ await writeFile3(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
505
+ }
506
+ async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
507
+ if (await fileExists(gzipPath)) {
508
+ const payload = await readFile4(gzipPath);
509
+ return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
510
+ }
511
+ if (legacyPath && await fileExists(legacyPath)) {
512
+ return JSON.parse(await readFile4(legacyPath, "utf8"));
513
+ }
514
+ return JSON.parse(await readFile4(gzipPath, "utf8"));
515
+ }
516
+ async function resolveExistingGzipOrFilePath(gzipPath, legacyPath) {
517
+ if (await fileExists(gzipPath)) {
518
+ return gzipPath;
519
+ }
520
+ if (legacyPath && await fileExists(legacyPath)) {
521
+ return legacyPath;
522
+ }
523
+ return gzipPath;
524
+ }
525
+
526
+ // src/vector/store.ts
451
527
  function vectorsDir(workspacePath) {
452
528
  return path7.join(workspacePath, "vectors");
453
529
  }
454
- function modelsDir(workspacePath) {
455
- return path7.join(workspacePath, "models");
530
+ function sharedModelStateDir() {
531
+ return path7.join(resolveQliHomeDir(), "models", "status");
456
532
  }
457
533
  function denseVectorPath(workspacePath) {
458
- return path7.join(vectorsDir(workspacePath), "dense.latest.json");
534
+ return path7.join(vectorsDir(workspacePath), "dense.latest.json.gz");
459
535
  }
460
536
  function denseMetaPath(workspacePath) {
461
- return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
537
+ return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
462
538
  }
463
539
  function sparseVectorPath(workspacePath) {
464
- return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
540
+ return path7.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
465
541
  }
466
542
  function sparseMetaPath(workspacePath) {
543
+ return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
544
+ }
545
+ function legacyDenseVectorPath(workspacePath) {
546
+ return path7.join(vectorsDir(workspacePath), "dense.latest.json");
547
+ }
548
+ function legacyDenseMetaPath(workspacePath) {
549
+ return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
550
+ }
551
+ function legacySparseVectorPath(workspacePath) {
552
+ return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
553
+ }
554
+ function legacySparseMetaPath(workspacePath) {
467
555
  return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
468
556
  }
469
- function densePullMarker(workspacePath) {
470
- return path7.join(modelsDir(workspacePath), "dense.pulled.json");
557
+ function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
558
+ const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
559
+ const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
560
+ return path7.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
561
+ }
562
+ function densePullMarker(workspacePath, modelId, cacheDir) {
563
+ return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
471
564
  }
472
- function sparsePullMarker(workspacePath) {
473
- return path7.join(modelsDir(workspacePath), "sparse.pulled.json");
565
+ function sparsePullMarker(workspacePath, modelId, cacheDir) {
566
+ return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
474
567
  }
475
568
  async function writeDensePayload(workspacePath, payload) {
476
569
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
477
- await writeFile3(denseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
478
- await writeFile3(denseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
570
+ await writeGzipJson(denseVectorPath(workspacePath), payload);
571
+ await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
572
+ await Promise.all([
573
+ rm(legacyDenseVectorPath(workspacePath), { force: true }),
574
+ rm(legacyDenseMetaPath(workspacePath), { force: true })
575
+ ]);
479
576
  }
480
577
  async function readDensePayload(workspacePath) {
481
- return JSON.parse(await readFile4(denseVectorPath(workspacePath), "utf8"));
578
+ return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
482
579
  }
483
580
  async function writeSparsePayload(workspacePath, payload) {
484
581
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
485
- await writeFile3(sparseVectorPath(workspacePath), JSON.stringify(payload, null, 2), "utf8");
486
- await writeFile3(sparseMetaPath(workspacePath), JSON.stringify(payload.metadata, null, 2), "utf8");
582
+ await writeGzipJson(sparseVectorPath(workspacePath), payload);
583
+ await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
584
+ await Promise.all([
585
+ rm(legacySparseVectorPath(workspacePath), { force: true }),
586
+ rm(legacySparseMetaPath(workspacePath), { force: true })
587
+ ]);
487
588
  }
488
589
  async function readSparsePayload(workspacePath) {
489
- return JSON.parse(await readFile4(sparseVectorPath(workspacePath), "utf8"));
590
+ return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
490
591
  }
491
- async function writeDensePullMarker(workspacePath, value) {
492
- await mkdir3(modelsDir(workspacePath), { recursive: true });
493
- await writeFile3(densePullMarker(workspacePath), JSON.stringify(value, null, 2), "utf8");
592
+ async function writeDensePullMarker(workspacePath, model, value) {
593
+ const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
594
+ await mkdir3(path7.dirname(markerPath), { recursive: true });
595
+ await writeFile4(markerPath, JSON.stringify(value, null, 2), "utf8");
494
596
  }
495
- async function writeSparsePullMarker(workspacePath, value) {
496
- await mkdir3(modelsDir(workspacePath), { recursive: true });
497
- await writeFile3(sparsePullMarker(workspacePath), JSON.stringify(value, null, 2), "utf8");
597
+ async function writeSparsePullMarker(workspacePath, model, value) {
598
+ const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
599
+ await mkdir3(path7.dirname(markerPath), { recursive: true });
600
+ await writeFile4(markerPath, JSON.stringify(value, null, 2), "utf8");
498
601
  }
499
602
  async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
500
603
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -504,30 +607,72 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
504
607
  configured: dense.enabled,
505
608
  modelId: dense.modelId,
506
609
  cacheDir: denseCacheDir,
507
- available: await fileExists(densePullMarker(workspacePath)),
508
- artifactExists: await fileExists(denseVectorPath(workspacePath))
610
+ available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
611
+ artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
509
612
  },
510
613
  sparse: {
511
614
  configured: sparse.enabled,
512
615
  modelId: sparse.modelId,
513
616
  cacheDir: sparseCacheDir,
514
617
  uvAvailable,
515
- available: await fileExists(sparsePullMarker(workspacePath)),
516
- artifactExists: await fileExists(sparseVectorPath(workspacePath))
618
+ available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
619
+ artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
517
620
  }
518
621
  };
519
622
  }
520
623
 
521
624
  // src/vector/text.ts
625
+ var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
626
+ "choose this instead of",
627
+ "how xyz runs it",
628
+ "naechste schritte",
629
+ "next steps",
630
+ "overview",
631
+ "passend wenn",
632
+ "problem",
633
+ "right fit",
634
+ "waehlen sie das stattdessen",
635
+ "was sie bekommen",
636
+ "what you get",
637
+ "wie xyz es umsetzt",
638
+ "uberblick",
639
+ "\xFCberblick"
640
+ ]);
641
+ function normalizeHeading(value) {
642
+ return value.trim().toLowerCase();
643
+ }
644
+ function isLowSignalHeading(value) {
645
+ return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
646
+ }
647
+ function stripLeadingHeading(text, heading) {
648
+ const lines = text.split("\n");
649
+ const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
650
+ if (firstContentIndex < 0) {
651
+ return text;
652
+ }
653
+ const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
654
+ if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
655
+ return text;
656
+ }
657
+ const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
658
+ return next;
659
+ }
660
+ function createVectorText(chunk) {
661
+ const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
662
+ const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
663
+ const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
664
+ return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
665
+ }
522
666
  function createDenseChunkText(chunk) {
523
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
667
+ return createVectorText(chunk);
524
668
  }
525
669
  function createSparseChunkText(chunk) {
526
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
670
+ return createVectorText(chunk);
527
671
  }
528
672
 
529
673
  // src/vector/dense.ts
530
674
  var denseEmbedderFactory = null;
675
+ var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
531
676
  async function createEmbedder(cacheDir, modelId) {
532
677
  if (denseEmbedderFactory) {
533
678
  return denseEmbedderFactory(cacheDir, modelId);
@@ -539,6 +684,9 @@ async function createEmbedder(cacheDir, modelId) {
539
684
  return output.tolist()[0];
540
685
  };
541
686
  }
687
+ function exactDenseQuery(payload, vector, topK) {
688
+ return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
689
+ }
542
690
  async function pullDenseModel(workspacePath, config) {
543
691
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
544
692
  await mkdir4(cacheDir, { recursive: true });
@@ -547,7 +695,8 @@ async function pullDenseModel(workspacePath, config) {
547
695
  }
548
696
  async function buildDenseVectors({
549
697
  workspacePath,
550
- config
698
+ config,
699
+ progress
551
700
  }) {
552
701
  const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
553
702
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
@@ -555,6 +704,7 @@ async function buildDenseVectors({
555
704
  const embed = await createEmbedder(cacheDir, config.modelId);
556
705
  const records = [];
557
706
  let dimensions = 0;
707
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
558
708
  for (const chunk of chunks) {
559
709
  const embedding = await embed(createDenseChunkText(chunk));
560
710
  dimensions ||= embedding.length;
@@ -568,7 +718,11 @@ async function buildDenseVectors({
568
718
  text: chunk.text,
569
719
  embedding
570
720
  });
721
+ if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
722
+ reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
723
+ }
571
724
  }
725
+ reportProgress(progress, "Building dense vector index");
572
726
  const index = new VectorFieldIndex({
573
727
  numHashTables: config.indexHashTables,
574
728
  dimensions,
@@ -592,6 +746,7 @@ async function buildDenseVectors({
592
746
  chunks: records
593
747
  };
594
748
  await writeDensePayload(workspacePath, payload);
749
+ reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
595
750
  return payload;
596
751
  }
597
752
  async function denseQuery({
@@ -604,12 +759,19 @@ async function denseQuery({
604
759
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
605
760
  const embed = await createEmbedder(cacheDir, config.modelId);
606
761
  const vector = await embed(query);
762
+ if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
763
+ return exactDenseQuery(payload, vector, topK);
764
+ }
607
765
  const index = new VectorFieldIndex({
608
766
  numHashTables: payload.metadata.hashTables,
609
767
  dimensions: payload.metadata.dimensions,
610
768
  random: createSeededRandom(payload.metadata.randomSeed)
611
769
  }).loadState(payload.indexState);
612
- return index.query(vector, topK);
770
+ const approximateHits = index.query(vector, topK);
771
+ if (approximateHits.length >= topK) {
772
+ return approximateHits;
773
+ }
774
+ return exactDenseQuery(payload, vector, topK);
613
775
  }
614
776
 
615
777
  // src/vector/sparse.ts
@@ -717,10 +879,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
717
879
  }
718
880
  async function buildSparseVectors({
719
881
  workspacePath,
720
- config
882
+ config,
883
+ progress
721
884
  }) {
722
885
  const chunks = await readJsonl(path9.join(workspacePath, "chunks", "chunks.jsonl"));
886
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
723
887
  const built = await buildSparseDocuments(workspacePath, config, chunks);
888
+ reportProgress(progress, "Building sparse vector index");
724
889
  const index = new SparseVectorFieldIndex();
725
890
  for (const record of built.chunks) {
726
891
  index.insert(record.chunkId, [record.vector]);
@@ -742,6 +907,7 @@ async function buildSparseVectors({
742
907
  queryTokenWeights: built.queryTokenWeights
743
908
  };
744
909
  await writeSparsePayload(workspacePath, payload);
910
+ reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
745
911
  return payload;
746
912
  }
747
913
  async function sparseQuery({
@@ -759,6 +925,7 @@ async function sparseQuery({
759
925
  }
760
926
 
761
927
  // src/vector/service.ts
928
+ var pullModelsOverrideForTests = null;
762
929
  function resolveModelPullPlan({
763
930
  pullDenseFlag,
764
931
  pullSparseFlag,
@@ -775,90 +942,136 @@ function resolveModelPullPlan({
775
942
  pullSparse: uvAvailable
776
943
  };
777
944
  }
945
+ function resolveMissingConfiguredModelPullPlan({
946
+ config,
947
+ status
948
+ }) {
949
+ return {
950
+ pullDense: config.retrieval.dense.enabled && !status.dense.available,
951
+ pullSparse: config.retrieval.sparse.enabled && status.sparse.uvAvailable && !status.sparse.available
952
+ };
953
+ }
778
954
  async function buildVectorArtifacts({
779
955
  workspacePath,
780
956
  config,
781
957
  denseOverride,
782
958
  sparseOverride,
783
- buildAvailableModels = false
959
+ buildAvailableModels = false,
960
+ progress
784
961
  }) {
785
- const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, await (async () => {
786
- try {
787
- await ensureUvAvailable();
788
- return true;
789
- } catch {
790
- return false;
791
- }
792
- })()) : null;
962
+ const uvAvailable = await isUvAvailable();
963
+ const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
793
964
  const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
794
- const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
795
- const result2 = {};
965
+ const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
966
+ const result = {};
796
967
  if (denseEnabled) {
797
- result2.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
968
+ reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
969
+ result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
970
+ }
971
+ if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
972
+ reportProgress(progress, "Skipping sparse vectors because uv is not available");
798
973
  }
799
974
  if (sparseEnabled) {
800
- result2.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
975
+ reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
976
+ result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
801
977
  }
802
- return result2;
978
+ return result;
803
979
  }
804
980
  async function pullModels({
805
981
  workspacePath,
806
982
  config,
807
983
  pullDense,
808
- pullSparse
984
+ pullSparse,
985
+ progress
809
986
  }) {
987
+ if (pullModelsOverrideForTests) {
988
+ await pullModelsOverrideForTests({ workspacePath, config, pullDense, pullSparse, progress });
989
+ return;
990
+ }
810
991
  if (pullDense) {
992
+ reportProgress(progress, `Pulling dense model ${config.retrieval.dense.modelId}`);
811
993
  await pullDenseModel(workspacePath, config.retrieval.dense);
812
- await writeDensePullMarker(workspacePath, {
994
+ await writeDensePullMarker(workspacePath, config.retrieval.dense, {
813
995
  pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
814
- modelId: config.retrieval.dense.modelId
996
+ modelId: config.retrieval.dense.modelId,
997
+ cacheDir: config.retrieval.dense.cacheDir
815
998
  });
999
+ reportProgress(progress, `Dense model ready: ${config.retrieval.dense.modelId}`);
816
1000
  }
817
1001
  if (pullSparse) {
1002
+ reportProgress(progress, `Pulling sparse model ${config.retrieval.sparse.modelId}`);
818
1003
  await pullSparseModel(workspacePath, config.retrieval.sparse);
819
- await writeSparsePullMarker(workspacePath, {
1004
+ await writeSparsePullMarker(workspacePath, config.retrieval.sparse, {
820
1005
  pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
821
- modelId: config.retrieval.sparse.modelId
1006
+ modelId: config.retrieval.sparse.modelId,
1007
+ cacheDir: config.retrieval.sparse.cacheDir
822
1008
  });
1009
+ reportProgress(progress, `Sparse model ready: ${config.retrieval.sparse.modelId}`);
823
1010
  }
824
1011
  }
825
1012
  async function getModelStatus(workspacePath, config) {
826
- let uvAvailable = false;
827
- try {
828
- await ensureUvAvailable();
829
- uvAvailable = true;
830
- } catch {
831
- uvAvailable = false;
832
- }
1013
+ const uvAvailable = await isUvAvailable();
833
1014
  return buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
834
1015
  }
835
1016
 
836
1017
  // src/index/index-store.ts
837
- import { readFile as readFile5, writeFile as writeFile4 } from "fs/promises";
1018
+ import { mkdir as mkdir6, rm as rm2 } from "fs/promises";
838
1019
  import path10 from "path";
1020
+ function versionedIndexPath(workspacePath, stamp) {
1021
+ return path10.join(workspacePath, "indexes", `${stamp}.json.gz`);
1022
+ }
1023
+ function versionedLegacyIndexPath(workspacePath, stamp) {
1024
+ return path10.join(workspacePath, "indexes", `${stamp}.json`);
1025
+ }
1026
+ function versionedMetaPath(workspacePath, stamp) {
1027
+ return path10.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
1028
+ }
1029
+ function versionedLegacyMetaPath(workspacePath, stamp) {
1030
+ return path10.join(workspacePath, "indexes", `${stamp}.meta.json`);
1031
+ }
1032
+ function latestIndexPath(workspacePath) {
1033
+ return path10.join(workspacePath, "indexes", "latest.json.gz");
1034
+ }
1035
+ function legacyLatestIndexPath(workspacePath) {
1036
+ return path10.join(workspacePath, "indexes", "latest.json");
1037
+ }
1038
+ function latestMetaPath(workspacePath) {
1039
+ return path10.join(workspacePath, "indexes", "latest.meta.json.gz");
1040
+ }
1041
+ function legacyLatestMetaPath(workspacePath) {
1042
+ return path10.join(workspacePath, "indexes", "latest.meta.json");
1043
+ }
839
1044
  async function writeIndexArtifacts({
840
1045
  workspacePath,
841
1046
  indexState,
842
1047
  metadata
843
1048
  }) {
844
1049
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");
845
- const indexPath = path10.join(workspacePath, "indexes", `${stamp}.json`);
846
- const metaPath = path10.join(workspacePath, "indexes", `${stamp}.meta.json`);
847
- const latestIndexPath = path10.join(workspacePath, "indexes", "latest.json");
848
- const latestMetaPath = path10.join(workspacePath, "indexes", "latest.meta.json");
849
- const indexPayload = JSON.stringify(indexState, null, 2);
850
- const metaPayload = JSON.stringify(metadata, null, 2);
851
- await writeFile4(indexPath, indexPayload, "utf8");
852
- await writeFile4(metaPath, metaPayload, "utf8");
853
- await writeFile4(latestIndexPath, indexPayload, "utf8");
854
- await writeFile4(latestMetaPath, metaPayload, "utf8");
855
- return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
1050
+ const indexPath = versionedIndexPath(workspacePath, stamp);
1051
+ const metaPath = versionedMetaPath(workspacePath, stamp);
1052
+ const latestIndexArtifactPath = latestIndexPath(workspacePath);
1053
+ const latestMetadataArtifactPath = latestMetaPath(workspacePath);
1054
+ await mkdir6(path10.join(workspacePath, "indexes"), { recursive: true });
1055
+ await writeGzipJson(indexPath, indexState);
1056
+ await writeGzipJson(metaPath, metadata);
1057
+ await writeGzipJson(latestIndexArtifactPath, indexState);
1058
+ await writeGzipJson(latestMetadataArtifactPath, metadata);
1059
+ await Promise.all([
1060
+ rm2(legacyLatestIndexPath(workspacePath), { force: true }),
1061
+ rm2(legacyLatestMetaPath(workspacePath), { force: true }),
1062
+ rm2(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
1063
+ rm2(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
1064
+ ]);
1065
+ return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
856
1066
  }
857
1067
  async function readLatestIndexState(workspacePath) {
858
- return JSON.parse(await readFile5(path10.join(workspacePath, "indexes", "latest.json"), "utf8"));
1068
+ return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
859
1069
  }
860
1070
  async function readLatestIndexMetadata(workspacePath) {
861
- return JSON.parse(await readFile5(path10.join(workspacePath, "indexes", "latest.meta.json"), "utf8"));
1071
+ return readJsonFromGzipOrFile(latestMetaPath(workspacePath), legacyLatestMetaPath(workspacePath));
1072
+ }
1073
+ async function resolveLatestIndexArtifactPath(workspacePath) {
1074
+ return resolveExistingGzipOrFilePath(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
862
1075
  }
863
1076
 
864
1077
  // src/index/querylight-indexer.ts
@@ -900,14 +1113,17 @@ async function buildIndex({
900
1113
  workspacePath,
901
1114
  denseOverride,
902
1115
  sparseOverride,
903
- buildAvailableModels = false
1116
+ buildAvailableModels = false,
1117
+ progress
904
1118
  }) {
905
1119
  const config = await loadConfig(workspacePath);
1120
+ reportProgress(progress, "Loading documents, chunks, and sources");
906
1121
  const chunks = await readJsonl(path11.join(workspacePath, "chunks", "chunks.jsonl"));
907
1122
  const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
908
1123
  const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
909
1124
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
910
1125
  const index = new DocumentIndex(createIndexMapping(metadataFields));
1126
+ reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
911
1127
  for (const chunk of chunks) {
912
1128
  index.index({
913
1129
  id: chunk.id,
@@ -922,6 +1138,7 @@ async function buildIndex({
922
1138
  }
923
1139
  });
924
1140
  }
1141
+ reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
925
1142
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
926
1143
  const metadata = {
927
1144
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
@@ -934,14 +1151,17 @@ async function buildIndex({
934
1151
  fields: Object.keys(index.mapping),
935
1152
  indexHash: sha256(JSON.stringify(index.indexState))
936
1153
  };
1154
+ reportProgress(progress, "Writing lexical index artifacts");
937
1155
  const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
938
1156
  const vectors = await buildVectorArtifacts({
939
1157
  workspacePath,
940
1158
  config,
941
1159
  denseOverride,
942
1160
  sparseOverride,
943
- buildAvailableModels
1161
+ buildAvailableModels,
1162
+ progress
944
1163
  });
1164
+ reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
945
1165
  return {
946
1166
  metadata,
947
1167
  indexPath: artifacts.indexPath,
@@ -953,6 +1173,27 @@ async function buildIndex({
953
1173
  // src/ingest/ingest-service.ts
954
1174
  import path17 from "path";
955
1175
 
1176
+ // src/core/concurrency.ts
1177
+ async function mapWithConcurrency(items, limit, worker) {
1178
+ if (items.length === 0) {
1179
+ return;
1180
+ }
1181
+ const concurrency = Math.max(1, Math.floor(limit));
1182
+ let nextIndex = 0;
1183
+ await Promise.all(
1184
+ Array.from({ length: Math.min(concurrency, items.length) }, async () => {
1185
+ while (true) {
1186
+ const index = nextIndex;
1187
+ nextIndex += 1;
1188
+ if (index >= items.length) {
1189
+ return;
1190
+ }
1191
+ await worker(items[index], index);
1192
+ }
1193
+ })
1194
+ );
1195
+ }
1196
+
956
1197
  // src/core/runs.ts
957
1198
  import path12 from "path";
958
1199
  async function writeRun(workspacePath, run) {
@@ -1021,7 +1262,7 @@ async function removeSource(workspacePath, sourceId) {
1021
1262
  }
1022
1263
 
1023
1264
  // src/ingest/document-utils.ts
1024
- import { mkdir as mkdir6, rm, writeFile as writeFile5 } from "fs/promises";
1265
+ import { mkdir as mkdir7, rm as rm3, writeFile as writeFile5 } from "fs/promises";
1025
1266
  import path14 from "path";
1026
1267
 
1027
1268
  // src/normalize/normalize-markdown.ts
@@ -1074,7 +1315,7 @@ async function writeNormalizedDocument({
1074
1315
  normalizedPath,
1075
1316
  markdown
1076
1317
  }) {
1077
- await mkdir6(path14.dirname(normalizedPath), { recursive: true });
1318
+ await mkdir7(path14.dirname(normalizedPath), { recursive: true });
1078
1319
  await writeFile5(
1079
1320
  normalizedPath,
1080
1321
  withFrontmatter(
@@ -1097,8 +1338,8 @@ async function writeNormalizedDocument({
1097
1338
  }
1098
1339
  async function deleteDocumentArtifacts(document) {
1099
1340
  await Promise.all([
1100
- document.rawPath ? rm(document.rawPath, { force: true }) : Promise.resolve(),
1101
- rm(document.normalizedPath, { force: true })
1341
+ document.rawPath ? rm3(document.rawPath, { force: true }) : Promise.resolve(),
1342
+ rm3(document.normalizedPath, { force: true })
1102
1343
  ]);
1103
1344
  }
1104
1345
 
@@ -1122,13 +1363,13 @@ async function listDirectoryFiles(source) {
1122
1363
 
1123
1364
  // src/ingest/adapters/file-adapter.ts
1124
1365
  import { basename, extname, resolve } from "path";
1125
- import { mkdir as mkdir7, readFile as readFile9, stat as stat3, writeFile as writeFile6 } from "fs/promises";
1366
+ import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile6 } from "fs/promises";
1126
1367
 
1127
1368
  // src/ingest/extractors/docx-extractor.ts
1128
1369
  import mammoth from "mammoth";
1129
1370
  async function extractDocx(filePath) {
1130
- const result2 = await mammoth.extractRawText({ path: filePath });
1131
- return result2.value;
1371
+ const result = await mammoth.extractRawText({ path: filePath });
1372
+ return result.value;
1132
1373
  }
1133
1374
 
1134
1375
  // src/ingest/extractors/html-extractor.ts
@@ -1142,9 +1383,41 @@ function stripBoilerplate(html) {
1142
1383
 
1143
1384
  // src/ingest/extractors/html-extractor.ts
1144
1385
  var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
1386
+ var LOW_SIGNAL_SECTION_SELECTORS = [
1387
+ "script",
1388
+ "style",
1389
+ "noscript",
1390
+ "template",
1391
+ "[data-blog-service-recommendations]",
1392
+ "[data-blog-related-posts]"
1393
+ ].join(", ");
1145
1394
  function cleanText(value) {
1146
1395
  return value.replace(/\s+/g, " ").trim();
1147
1396
  }
1397
+ function pruneLowSignalContent($) {
1398
+ $(LOW_SIGNAL_SECTION_SELECTORS).remove();
1399
+ $("form").each((_, element) => {
1400
+ const action = cleanText($(element).attr("action") ?? "");
1401
+ if (action.includes("substack.com/subscribe")) {
1402
+ $(element).closest("section").remove();
1403
+ }
1404
+ });
1405
+ }
1406
+ function stripEscapedJsonPayloads(markdown) {
1407
+ return markdown.split("\n").filter((line) => {
1408
+ const trimmed = line.trim();
1409
+ if (trimmed.length === 0) {
1410
+ return true;
1411
+ }
1412
+ if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
1413
+ return false;
1414
+ }
1415
+ if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
1416
+ return false;
1417
+ }
1418
+ return true;
1419
+ }).join("\n").replace(/\n{3,}/g, "\n\n").trim();
1420
+ }
1148
1421
  function chooseMeaningfulTitle($, fallbackTitle) {
1149
1422
  const candidates = [
1150
1423
  cleanText($("meta[property='og:title']").attr("content") ?? ""),
@@ -1181,14 +1454,27 @@ ${parts.join("\n\n")}
1181
1454
  function extractHtmlToMarkdown(html) {
1182
1455
  const cleaned = stripBoilerplate(html);
1183
1456
  const $ = load(cleaned);
1457
+ pruneLowSignalContent($);
1184
1458
  const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
1185
1459
  const title = chooseMeaningfulTitle($, fallbackTitle);
1186
1460
  const root = $("main").first().html() ?? $.root().html() ?? cleaned;
1187
1461
  return {
1188
- markdown: turndown.turndown(root),
1462
+ markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
1189
1463
  title
1190
1464
  };
1191
1465
  }
1466
+ function extractCanonicalUriFromHtml(html, baseUrl) {
1467
+ const $ = load(html);
1468
+ const href = $("link[rel='canonical']").first().attr("href")?.trim();
1469
+ if (!href) {
1470
+ return null;
1471
+ }
1472
+ try {
1473
+ return new URL(href, baseUrl).href;
1474
+ } catch {
1475
+ return null;
1476
+ }
1477
+ }
1192
1478
  function parseDateCandidate(value) {
1193
1479
  const trimmed = value.trim();
1194
1480
  if (!trimmed) {
@@ -1251,16 +1537,16 @@ function extractPublicationDateFromHtml(html) {
1251
1537
  }
1252
1538
 
1253
1539
  // src/ingest/extractors/markdown-extractor.ts
1254
- import { readFile as readFile6 } from "fs/promises";
1540
+ import { readFile as readFile5 } from "fs/promises";
1255
1541
  async function extractMarkdown(filePath) {
1256
- return readFile6(filePath, "utf8");
1542
+ return readFile5(filePath, "utf8");
1257
1543
  }
1258
1544
 
1259
1545
  // src/ingest/extractors/pdf-extractor.ts
1260
- import { readFile as readFile7 } from "fs/promises";
1546
+ import { readFile as readFile6 } from "fs/promises";
1261
1547
  import { PDFParse } from "pdf-parse";
1262
1548
  async function extractPdf(filePath) {
1263
- const buffer = await readFile7(filePath);
1549
+ const buffer = await readFile6(filePath);
1264
1550
  const parser = new PDFParse({ data: buffer });
1265
1551
  try {
1266
1552
  const parsed = await parser.getText();
@@ -1271,9 +1557,9 @@ async function extractPdf(filePath) {
1271
1557
  }
1272
1558
 
1273
1559
  // src/ingest/extractors/text-extractor.ts
1274
- import { readFile as readFile8 } from "fs/promises";
1560
+ import { readFile as readFile7 } from "fs/promises";
1275
1561
  async function extractText(filePath) {
1276
- return readFile8(filePath, "utf8");
1562
+ return readFile7(filePath, "utf8");
1277
1563
  }
1278
1564
 
1279
1565
  // src/ingest/adapters/file-adapter.ts
@@ -1308,7 +1594,7 @@ async function extractFileContent(filePath, mimeType) {
1308
1594
  ${text}`, raw: text };
1309
1595
  }
1310
1596
  if (mimeType === "text/html") {
1311
- const raw = await readFile9(filePath, "utf8");
1597
+ const raw = await readFile8(filePath, "utf8");
1312
1598
  const extracted = extractHtmlToMarkdown(raw);
1313
1599
  return { title: extracted.title, markdown: `# ${extracted.title}
1314
1600
 
@@ -1364,8 +1650,8 @@ async function ingestFile({
1364
1650
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
1365
1651
  const indexedAt = now;
1366
1652
  const crawledAt = now;
1367
- await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
1368
- await mkdir7(resolve(workspacePath, "raw", source.id), { recursive: true });
1653
+ await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
1654
+ await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
1369
1655
  if (extracted.raw) {
1370
1656
  await writeFile6(rawPath, extracted.raw, "utf8");
1371
1657
  }
@@ -1430,7 +1716,7 @@ ${content}`;
1430
1716
  const now = (/* @__PURE__ */ new Date()).toISOString();
1431
1717
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
1432
1718
  const indexedAt = now;
1433
- await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
1719
+ await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
1434
1720
  await writeNormalizedDocument({
1435
1721
  documentId,
1436
1722
  sourceId: source.id,
@@ -1474,7 +1760,7 @@ async function reprocessStoredDocument(document, source) {
1474
1760
  if (!document.rawPath) {
1475
1761
  return null;
1476
1762
  }
1477
- const raw = await readFile9(document.rawPath, "utf8");
1763
+ const raw = await readFile8(document.rawPath, "utf8");
1478
1764
  const fallbackTitle = document.title || basename(document.uri);
1479
1765
  const extracted = await extractRawContent(raw, document.mimeType, fallbackTitle);
1480
1766
  const contentHash = sha256(extracted.markdown);
@@ -1591,8 +1877,21 @@ async function parseRssFeedDocument(xml, source) {
1591
1877
  }
1592
1878
 
1593
1879
  // src/ingest/adapters/url-adapter.ts
1594
- import { mkdir as mkdir8, readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
1880
+ import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
1595
1881
  import path16 from "path";
1882
+
1883
+ // src/core/urls.ts
1884
+ function normalizeRemoteUrl(uri) {
1885
+ try {
1886
+ const parsed = new URL(uri);
1887
+ parsed.hash = "";
1888
+ return parsed.href;
1889
+ } catch {
1890
+ return uri;
1891
+ }
1892
+ }
1893
+
1894
+ // src/ingest/adapters/url-adapter.ts
1596
1895
  function buildHttpCache(response2, validatedAt) {
1597
1896
  return {
1598
1897
  etag: response2.headers.get("etag") ?? void 0,
@@ -1617,25 +1916,26 @@ async function normalizeRemoteDocument({
1617
1916
  responseStatus
1618
1917
  }) {
1619
1918
  const extracted = extractHtmlToMarkdown(body);
1919
+ const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
1620
1920
  const markdown = `# ${extracted.title}
1621
1921
 
1622
1922
  ${extracted.markdown}`;
1623
- const documentId = stableId("doc", source.id, url);
1923
+ const documentId = stableId("doc", source.id, canonicalUri);
1624
1924
  const normalizedPath = path16.resolve(workspacePath, "normalized", `${documentId}.md`);
1625
- const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(url).slice(0, 12)}.html`);
1925
+ const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
1626
1926
  const contentHash = sha256(markdown);
1627
1927
  const now = (/* @__PURE__ */ new Date()).toISOString();
1628
1928
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
1629
1929
  const indexedAt = now;
1630
1930
  const crawledAt = now;
1631
1931
  const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
1632
- await mkdir8(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
1932
+ await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
1633
1933
  await writeFile7(rawPath, body, "utf8");
1634
1934
  await writeNormalizedDocument({
1635
1935
  documentId,
1636
1936
  sourceId: source.id,
1637
1937
  title: extracted.title,
1638
- uri: url,
1938
+ uri: canonicalUri,
1639
1939
  sourceUri,
1640
1940
  publicationDate: resolvedPublicationDate,
1641
1941
  crawledAt,
@@ -1650,8 +1950,9 @@ ${extracted.markdown}`;
1650
1950
  sourceId: source.id,
1651
1951
  sourceType: source.type,
1652
1952
  title: extracted.title,
1653
- uri: url,
1953
+ uri: canonicalUri,
1654
1954
  sourceUri,
1955
+ canonicalUri,
1655
1956
  mimeType: "text/html",
1656
1957
  rawPath,
1657
1958
  normalizedPath,
@@ -1749,7 +2050,7 @@ async function reprocessRemoteDocument(document, source) {
1749
2050
  if (!document.rawPath || !await fileExists(document.rawPath)) {
1750
2051
  return null;
1751
2052
  }
1752
- const raw = await readFile10(document.rawPath, "utf8");
2053
+ const raw = await readFile9(document.rawPath, "utf8");
1753
2054
  const extracted = extractHtmlToMarkdown(raw);
1754
2055
  const markdown = `# ${extracted.title}
1755
2056
 
@@ -1825,6 +2126,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1825
2126
  if (url.origin !== baseUrl.origin) {
1826
2127
  return false;
1827
2128
  }
2129
+ if (url.search.length > 0) {
2130
+ return false;
2131
+ }
2132
+ if (url.pathname.endsWith(".xml")) {
2133
+ return false;
2134
+ }
2135
+ if (url.pathname.includes("/cdn-cgi/")) {
2136
+ return false;
2137
+ }
2138
+ if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
2139
+ return false;
2140
+ }
1828
2141
  if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
1829
2142
  return false;
1830
2143
  }
@@ -1837,56 +2150,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1837
2150
  }
1838
2151
  return true;
1839
2152
  }
1840
- async function crawlWebsite(source) {
2153
+ function delay(ms) {
2154
+ return new Promise((resolve2) => setTimeout(resolve2, ms));
2155
+ }
2156
+ async function crawlWebsite(source, defaults, progress) {
1841
2157
  const baseUrl = new URL(source.uri);
1842
- const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
2158
+ const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
1843
2159
  const includePatterns = source.crawl?.includePatterns ?? [];
1844
2160
  const excludePatterns = source.crawl?.excludePatterns ?? [];
1845
2161
  const maxDepth = source.crawl?.maxDepth ?? 2;
1846
2162
  const maxPages = source.crawl?.maxPages ?? 100;
1847
- const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
2163
+ const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
2164
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
1848
2165
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
1849
- const queue = [{ url: source.uri, depth: 0 }];
1850
2166
  const seen = /* @__PURE__ */ new Set();
1851
2167
  const results = [];
2168
+ let currentLevel = [normalizeRemoteUrl(source.uri)];
1852
2169
  if (source.crawl?.useSitemap !== false) {
1853
- for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
1854
- queue.push({ url, depth: 1 });
1855
- }
1856
- }
1857
- while (queue.length > 0 && results.length < maxPages) {
1858
- const next = queue.shift();
1859
- if (!next || seen.has(next.url)) {
1860
- continue;
1861
- }
1862
- seen.add(next.url);
1863
- const url = new URL(next.url);
1864
- if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1865
- continue;
2170
+ const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
2171
+ reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
2172
+ currentLevel = [
2173
+ ...currentLevel,
2174
+ ...sitemapUrls
2175
+ ];
2176
+ }
2177
+ for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
2178
+ reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
2179
+ const nextLevelCandidates = [];
2180
+ const allowedUrls = [];
2181
+ for (const candidate of currentLevel) {
2182
+ const normalizedCandidate = normalizeRemoteUrl(candidate);
2183
+ if (seen.has(normalizedCandidate)) {
2184
+ continue;
2185
+ }
2186
+ seen.add(normalizedCandidate);
2187
+ const url = new URL(normalizedCandidate);
2188
+ if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
2189
+ continue;
2190
+ }
2191
+ allowedUrls.push(normalizedCandidate);
2192
+ results.push(normalizedCandidate);
2193
+ reportProgress(progress, `Discovered ${normalizedCandidate}`);
2194
+ if (results.length >= maxPages) {
2195
+ break;
2196
+ }
1866
2197
  }
1867
- results.push(url.href);
1868
- if (next.depth >= maxDepth) {
1869
- continue;
2198
+ reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
2199
+ if (depth >= maxDepth || results.length >= maxPages) {
2200
+ break;
1870
2201
  }
1871
- const response2 = await fetch(url, { headers: { "user-agent": userAgent } });
1872
- const html = await response2.text();
1873
- const $ = load2(html);
1874
- $("a[href]").each((_, element) => {
1875
- const href = $(element).attr("href");
1876
- if (!href) {
1877
- return;
1878
- }
1879
- try {
1880
- const target = new URL(href, url);
1881
- if (!seen.has(target.href)) {
1882
- queue.push({ url: target.href, depth: next.depth + 1 });
2202
+ await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
2203
+ const page = new URL(pageUrl);
2204
+ const response2 = await fetch(page, { headers: { "user-agent": userAgent } });
2205
+ const html = await response2.text();
2206
+ const $ = load2(html);
2207
+ $("a[href]").each((_, element) => {
2208
+ const href = $(element).attr("href");
2209
+ if (!href) {
2210
+ return;
1883
2211
  }
1884
- } catch {
2212
+ try {
2213
+ nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
2214
+ } catch {
2215
+ }
2216
+ });
2217
+ if (rateLimitMs > 0) {
2218
+ await delay(rateLimitMs);
1885
2219
  }
1886
2220
  });
1887
- if (rateLimitMs > 0) {
1888
- await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
1889
- }
2221
+ currentLevel = nextLevelCandidates;
1890
2222
  }
1891
2223
  return results;
1892
2224
  }
@@ -1961,6 +2293,8 @@ async function ingestRssSource({
1961
2293
  source,
1962
2294
  previous,
1963
2295
  nextDocuments,
2296
+ maxConcurrentRequests,
2297
+ onDocumentProcessed,
1964
2298
  onFailure
1965
2299
  }) {
1966
2300
  if (source.crawl?.fetchArticles === false) {
@@ -1968,11 +2302,12 @@ async function ingestRssSource({
1968
2302
  }
1969
2303
  const xml = await fetchFeedText(source);
1970
2304
  const items = await parseRssFeedDocument(xml, source);
2305
+ const processedDocumentIds = /* @__PURE__ */ new Set();
1971
2306
  let added = 0;
1972
2307
  let changed = 0;
1973
2308
  let unchanged = 0;
1974
2309
  let failed = 0;
1975
- for (const item of items) {
2310
+ await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
1976
2311
  try {
1977
2312
  const probe = previous.get(stableId("doc", source.id, item.url));
1978
2313
  const document = await fetchUrlDocument({
@@ -1983,28 +2318,40 @@ async function ingestRssSource({
1983
2318
  sourceUri: source.uri,
1984
2319
  publicationDate: item.publicationDate
1985
2320
  });
2321
+ if (processedDocumentIds.has(document.id)) {
2322
+ return;
2323
+ }
2324
+ processedDocumentIds.add(document.id);
2325
+ const existingDocument = probe ?? previous.get(document.id);
1986
2326
  nextDocuments.set(document.id, document);
1987
- if (!probe) {
2327
+ if (!existingDocument) {
1988
2328
  added += 1;
1989
- } else if (probe.contentHash !== document.contentHash) {
2329
+ onDocumentProcessed?.(document.uri, "added");
2330
+ } else if (existingDocument.contentHash !== document.contentHash) {
1990
2331
  changed += 1;
2332
+ onDocumentProcessed?.(document.uri, "changed");
1991
2333
  } else {
1992
2334
  unchanged += 1;
2335
+ onDocumentProcessed?.(document.uri, "unchanged");
1993
2336
  }
1994
2337
  } catch (error) {
1995
2338
  failed += 1;
1996
2339
  onFailure(item.url, error);
1997
2340
  }
1998
- }
2341
+ });
1999
2342
  return { added, changed, unchanged, failed };
2000
2343
  }
2001
2344
  async function ingestSources({
2002
2345
  workspacePath,
2003
2346
  sourceIds,
2004
- changedOnly = false
2347
+ changedOnly = false,
2348
+ progress
2005
2349
  }) {
2006
2350
  const config = await loadConfig(workspacePath);
2007
2351
  const defaultRetentionDays = config.crawler.retentionDays;
2352
+ const defaultUserAgent = config.crawler.defaultUserAgent;
2353
+ const defaultRateLimitMs = config.crawler.rateLimitMs;
2354
+ const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
2008
2355
  const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
2009
2356
  const existing = await loadDocuments(workspacePath);
2010
2357
  const previous = previousMap(existing);
@@ -2014,20 +2361,38 @@ async function ingestSources({
2014
2361
  let unchanged = 0;
2015
2362
  let failed = 0;
2016
2363
  const failures = [];
2364
+ reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
2017
2365
  for (const source of sources) {
2366
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
2367
+ const sourceBefore = { added, changed, unchanged, failed };
2368
+ const processedDocumentIds = /* @__PURE__ */ new Set();
2369
+ const reportDocumentOutcome = (uri, outcome) => {
2370
+ const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
2371
+ reportProgress(progress, `${label} ${uri}`);
2372
+ };
2018
2373
  const ingestOne = async (uri, producer) => {
2019
2374
  try {
2020
2375
  const probeId = stableId("doc", source.id, uri);
2021
2376
  const earlier = previous.get(probeId);
2022
2377
  const document = await producer();
2378
+ if (processedDocumentIds.has(document.id)) {
2379
+ reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
2380
+ return null;
2381
+ }
2382
+ processedDocumentIds.add(document.id);
2383
+ const existingDocument = earlier ?? previous.get(document.id);
2023
2384
  nextDocuments.set(document.id, document);
2024
- if (!earlier) {
2385
+ if (!existingDocument) {
2025
2386
  added += 1;
2026
- } else if (earlier.contentHash !== document.contentHash) {
2387
+ reportDocumentOutcome(document.uri, "added");
2388
+ } else if (existingDocument.contentHash !== document.contentHash) {
2027
2389
  changed += 1;
2390
+ reportDocumentOutcome(document.uri, "changed");
2028
2391
  } else {
2029
2392
  unchanged += 1;
2393
+ reportDocumentOutcome(document.uri, "unchanged");
2030
2394
  }
2395
+ return document;
2031
2396
  } catch (error) {
2032
2397
  failed += 1;
2033
2398
  failures.push({
@@ -2035,50 +2400,69 @@ async function ingestSources({
2035
2400
  uri,
2036
2401
  message: error instanceof Error ? error.message : String(error)
2037
2402
  });
2403
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
2404
+ return null;
2038
2405
  }
2039
2406
  };
2040
2407
  try {
2408
+ reportProgress(progress, `Source ${source.name} (${source.type})`);
2041
2409
  if (source.type === "file") {
2410
+ reportProgress(progress, `Reading file ${source.uri}`);
2042
2411
  await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
2043
- continue;
2044
- }
2045
- if (source.type === "directory") {
2046
- for (const filePath of await listDirectoryFiles(source)) {
2412
+ } else if (source.type === "directory") {
2413
+ const files = await listDirectoryFiles(source);
2414
+ reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
2415
+ for (const filePath of files) {
2416
+ reportProgress(progress, `Reading file ${filePath}`);
2047
2417
  await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
2048
2418
  }
2049
- continue;
2050
- }
2051
- if (source.type === "url") {
2419
+ } else if (source.type === "url") {
2420
+ reportProgress(progress, `Fetching ${source.uri}`);
2052
2421
  await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
2053
- continue;
2054
- }
2055
- if (source.type === "website") {
2056
- for (const url of await crawlWebsite(source)) {
2057
- await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
2058
- }
2059
- continue;
2060
- }
2061
- if (source.type === "rss") {
2062
- const result2 = await ingestRssSource({
2422
+ } else if (source.type === "website") {
2423
+ reportProgress(progress, `Crawling ${source.uri}`);
2424
+ const urls = await crawlWebsite(source, {
2425
+ userAgent: defaultUserAgent,
2426
+ rateLimitMs: defaultRateLimitMs,
2427
+ maxConcurrentRequests
2428
+ }, progress);
2429
+ reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
2430
+ const seenCanonicalUrls = /* @__PURE__ */ new Set();
2431
+ await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
2432
+ if (seenCanonicalUrls.has(url)) {
2433
+ reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
2434
+ return;
2435
+ }
2436
+ reportProgress(progress, `Fetching ${url}`);
2437
+ const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
2438
+ if (document) {
2439
+ seenCanonicalUrls.add(document.uri);
2440
+ }
2441
+ });
2442
+ } else if (source.type === "rss") {
2443
+ reportProgress(progress, `Fetching feed ${source.uri}`);
2444
+ const result = await ingestRssSource({
2063
2445
  workspacePath,
2064
2446
  source,
2065
2447
  previous,
2066
2448
  nextDocuments,
2449
+ maxConcurrentRequests,
2450
+ onDocumentProcessed: reportDocumentOutcome,
2067
2451
  onFailure: (uri, error) => {
2068
2452
  failures.push({
2069
2453
  sourceId: source.id,
2070
2454
  uri,
2071
2455
  message: error instanceof Error ? error.message : String(error)
2072
2456
  });
2457
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
2073
2458
  }
2074
2459
  });
2075
- added += result2.added;
2076
- changed += result2.changed;
2077
- unchanged += result2.unchanged;
2078
- failed += result2.failed;
2079
- continue;
2080
- }
2081
- if (source.type === "markdown" || source.type === "text") {
2460
+ added += result.added;
2461
+ changed += result.changed;
2462
+ unchanged += result.unchanged;
2463
+ failed += result.failed;
2464
+ } else if (source.type === "markdown" || source.type === "text") {
2465
+ reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
2082
2466
  await ingestOne(source.uri, () => ingestInlineContent({
2083
2467
  workspacePath,
2084
2468
  source,
@@ -2095,13 +2479,19 @@ async function ingestSources({
2095
2479
  uri: source.uri,
2096
2480
  message: error instanceof Error ? error.message : String(error)
2097
2481
  });
2482
+ reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
2098
2483
  }
2484
+ reportProgress(
2485
+ progress,
2486
+ `Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
2487
+ );
2099
2488
  }
2100
2489
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
2101
2490
  const source = sources.find((candidate) => candidate.id === document.sourceId);
2102
2491
  return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
2103
2492
  });
2104
2493
  if (expiringDocuments.length > 0) {
2494
+ reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
2105
2495
  const expiredIds = new Set(expiringDocuments.map((document) => document.id));
2106
2496
  for (const document of expiringDocuments) {
2107
2497
  nextDocuments.delete(document.id);
@@ -2128,6 +2518,7 @@ async function ingestSources({
2128
2518
  documentsSnapshot: documentSnapshot(finalDocuments)
2129
2519
  };
2130
2520
  await writeRun(workspacePath, run);
2521
+ reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
2131
2522
  return {
2132
2523
  runId: id,
2133
2524
  documents: { added, changed, unchanged, failed },
@@ -2137,7 +2528,8 @@ async function ingestSources({
2137
2528
  async function reprocessDocuments({
2138
2529
  workspacePath,
2139
2530
  sourceId,
2140
- documentId
2531
+ documentId,
2532
+ progress
2141
2533
  }) {
2142
2534
  const documents = await loadDocuments(workspacePath);
2143
2535
  const sources = await listSources(workspacePath);
@@ -2145,15 +2537,20 @@ async function reprocessDocuments({
2145
2537
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
2146
2538
  let documentsReprocessed = 0;
2147
2539
  let documentsSkipped = 0;
2148
- for (const document of documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId))) {
2540
+ const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
2541
+ reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
2542
+ for (const document of targets) {
2543
+ reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
2149
2544
  const source = sourceMap.get(document.sourceId);
2150
2545
  if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
2151
2546
  documentsSkipped += 1;
2547
+ reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
2152
2548
  continue;
2153
2549
  }
2154
2550
  const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
2155
2551
  if (!updated) {
2156
2552
  documentsSkipped += 1;
2553
+ reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
2157
2554
  continue;
2158
2555
  }
2159
2556
  nextDocuments.set(updated.id, updated);
@@ -2173,15 +2570,217 @@ async function reprocessDocuments({
2173
2570
  },
2174
2571
  documentsSnapshot: documentSnapshot(finalDocuments)
2175
2572
  });
2573
+ reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
2176
2574
  return { runId: id, documentsReprocessed, documentsSkipped };
2177
2575
  }
2178
2576
 
2577
+ // src/ingest/adapters/website-feed-discovery.ts
2578
+ import { load as load3 } from "cheerio";
2579
+ var COMMON_FEED_PATHS = [
2580
+ "/feed",
2581
+ "/feed.xml",
2582
+ "/rss",
2583
+ "/rss.xml",
2584
+ "/atom.xml",
2585
+ "/index.xml",
2586
+ "/blog/feed",
2587
+ "/blog/feed.xml",
2588
+ "/blog/rss.xml",
2589
+ "/blog/atom.xml",
2590
+ "/blog/index.xml",
2591
+ "/news/feed",
2592
+ "/news/feed.xml",
2593
+ "/news/rss.xml",
2594
+ "/news/atom.xml",
2595
+ "/news/index.xml"
2596
+ ];
2597
/**
 * Resolve a (possibly relative) link against a base URL and keep it only
 * when the result uses the http: or https: scheme.
 *
 * @param {string} href - Link target as written in the document.
 * @param {string | URL} baseUrl - Base used to resolve relative links.
 * @returns {string | null} The absolute URL string, or null when the link
 *   cannot be parsed or uses another scheme.
 */
function normalizeCandidateUrl(href, baseUrl) {
  let absolute;
  try {
    absolute = new URL(href, baseUrl);
  } catch {
    // Unparseable href/base combinations are simply not candidates.
    return null;
  }
  const isWebScheme = absolute.protocol === "http:" || absolute.protocol === "https:";
  return isWebScheme ? absolute.href : null;
}
2608
/**
 * Heuristic check for whether a link points at a syndication feed.
 *
 * The link qualifies when its (case-insensitive) type hint mentions rss,
 * atom or xml, or when its href contains "/feed", "/rss" or "/atom", or
 * ends with ".xml".
 *
 * @param {string | undefined} typeHint - Value of the link's type attribute, if any.
 * @param {string} href - Link target.
 * @returns {boolean}
 */
function looksLikeFeedLink(typeHint, href) {
  const mime = typeHint?.toLowerCase() ?? "";
  const target = href.toLowerCase();
  const mimeMatches = ["rss", "atom", "xml"].some((token) => mime.includes(token));
  if (mimeMatches) {
    return true;
  }
  const pathMatches = ["/feed", "/rss", "/atom"].some((marker) => target.includes(marker));
  return pathMatches || target.endsWith(".xml");
}
2613
/**
 * Collect feed candidates declared in a page through `<link href>` elements
 * whose rel list contains "alternate" and whose type/href looks like a feed.
 *
 * @param {string} html - Markup of the page to inspect.
 * @param {string | URL} baseUrl - Base used to resolve relative hrefs.
 * @returns {Array<{url: string, discoveredBy: "declared", order: number, typeHint: (string | undefined)}>}
 *   Candidates in document order; `order` is the element's position among
 *   all matched `link[href]` elements.
 */
function extractDeclaredFeedCandidates(html, baseUrl) {
  const $ = load3(html);
  const found = [];
  $("link[href]").each((position, node) => {
    const link = $(node);
    // rel is a whitespace-separated token list; compare case-insensitively.
    const relTokens = (link.attr("rel") ?? "").split(/\s+/).map((token) => token.trim().toLowerCase()).filter(Boolean);
    const href = link.attr("href");
    if (href && relTokens.includes("alternate")) {
      const typeHint = link.attr("type") ?? void 0;
      if (looksLikeFeedLink(typeHint, href)) {
        const url = normalizeCandidateUrl(href, baseUrl);
        if (url) {
          found.push({
            url,
            discoveredBy: "declared",
            order: position,
            typeHint
          });
        }
      }
    }
  });
  return found;
}
2639
/**
 * Build one candidate per entry of COMMON_FEED_PATHS, resolved against the
 * given base URL.
 *
 * @param {string | URL} baseUrl - Base the conventional paths are resolved against.
 * @returns {Array<{url: string, discoveredBy: "common", order: number}>}
 *   Candidates in list order; `order` is the path's index in COMMON_FEED_PATHS.
 */
function buildCommonFeedCandidates(baseUrl) {
  const candidates = [];
  for (const [order, pathname] of COMMON_FEED_PATHS.entries()) {
    const url = new URL(pathname, baseUrl).href;
    candidates.push({ url, discoveredBy: "common", order });
  }
  return candidates;
}
2646
/**
 * Drop candidates whose `url` was already seen, keeping the first occurrence
 * and preserving the input order.
 *
 * @param {Iterable<{url: string}>} candidates
 * @returns {Array<{url: string}>} A new array; the input is not modified.
 */
function dedupeCandidates(candidates) {
  // A Map iterates in insertion order, so the first candidate per URL wins
  // and relative ordering is retained.
  const firstByUrl = new Map();
  for (const candidate of candidates) {
    if (!firstByUrl.has(candidate.url)) {
      firstByUrl.set(candidate.url, candidate);
    }
  }
  return [...firstByUrl.values()];
}
2658
/**
 * Heuristic check for whether a fetched response is a syndication feed.
 *
 * True when the content type mentions rss or atom, or when the body contains
 * one of the markers "<rss", "<feed" or "<rdf:rdf" (all case-insensitive).
 * A content type that merely mentions "xml" is not sufficient on its own:
 * the outcome then depends solely on the body markers.
 *
 * @param {string | null | undefined} contentType - Response content-type header.
 * @param {string} body - Response body text.
 * @returns {boolean}
 */
function looksLikeFeedDocument(contentType, body) {
  const mime = contentType?.toLowerCase() ?? "";
  const text = body.toLowerCase();
  const hasFeedMarker = ["<rss", "<feed", "<rdf:rdf"].some((marker) => text.includes(marker));
  return mime.includes("rss") || mime.includes("atom") || hasFeedMarker;
}
2663
/**
 * Tell whether a path segment is a non-empty string containing at least one
 * ASCII letter (so purely numeric or punctuation-only segments are rejected).
 *
 * @param {unknown} segment
 * @returns {boolean}
 */
function hasStablePrefixSegment(segment) {
  if (typeof segment !== "string" || segment.length === 0) {
    return false;
  }
  return /[a-z]/i.test(segment);
}
2666
/**
 * Derive the leading path prefix shared by all feed items that live on the
 * website's own origin.
 *
 * Item URLs that cannot be parsed or that belong to another origin are
 * ignored. At least two same-origin items are required. The prefix grows one
 * segment at a time while the segment passes hasStablePrefixSegment and is
 * identical at that position in every item path.
 *
 * @param {string[]} itemUrls - URLs of the feed's items.
 * @param {string} websiteOrigin - Origin the items must match.
 * @returns {string | undefined} A prefix of the form "/a/b/", or undefined
 *   when no usable common prefix exists.
 */
function deriveExcludePrefix(itemUrls, websiteOrigin) {
  const segmentLists = [];
  for (const itemUrl of itemUrls) {
    let parsed;
    try {
      parsed = new URL(itemUrl);
    } catch {
      continue;
    }
    if (parsed.origin === websiteOrigin) {
      segmentLists.push(parsed.pathname.split("/").filter(Boolean));
    }
  }
  if (segmentLists.length < 2) {
    return void 0;
  }
  const [reference] = segmentLists;
  const shared = [];
  for (const segment of reference) {
    const position = shared.length;
    const agreed = hasStablePrefixSegment(segment) && segmentLists.every((segments) => segments[position] === segment);
    if (!agreed) {
      break;
    }
    shared.push(segment);
  }
  return shared.length === 0 ? void 0 : `/${shared.join("/")}/`;
}
2698
/**
 * Rank a feed candidate; higher scores are tried first.
 *
 * Scoring: 1000 for declared candidates, 100 otherwise; minus the candidate's
 * `order`; minus 10 per path segment; plus 25 when the type hint mentions rss
 * or atom; plus 50 for a well-known root feed path; minus 200 when the path
 * contains "comments".
 *
 * @param {{url: string, discoveredBy: string, order: number, typeHint?: string}} candidate
 * @returns {number}
 */
function scoreCandidate(candidate) {
  const { pathname } = new URL(candidate.url);
  const depth = pathname.split("/").filter(Boolean).length;
  const hint = candidate.typeHint?.toLowerCase();
  const base = candidate.discoveredBy === "declared" ? 1e3 : 100;
  const typeBonus = hint?.includes("rss") || hint?.includes("atom") ? 25 : 0;
  const rootBonus = ["/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml"].includes(pathname) ? 50 : 0;
  const commentsPenalty = pathname.includes("comments") ? 200 : 0;
  return base - candidate.order - depth * 10 + typeBonus + rootBonus - commentsPenalty;
}
2715
/**
 * Fetch a feed candidate and confirm that it really is a parseable feed.
 *
 * Any failure (network error, non-OK status, body that does not look like a
 * feed, parser exception) yields null so the caller can try the next candidate.
 *
 * @param {{url: string, discoveredBy: string}} candidate - Candidate to probe.
 * @param {URL} websiteUrl - Website being registered; its origin is used to
 *   derive the exclude prefix.
 * @param {string} userAgent - Value sent as the user-agent request header.
 * @returns {Promise<{feedUrl: string, discoveredBy: string, excludePrefix: (string | undefined)} | null>}
 */
async function validateCandidate(candidate, websiteUrl, userAgent) {
  try {
    const reply = await fetch(candidate.url, { headers: { "user-agent": userAgent } });
    const body = reply.ok ? await reply.text() : null;
    if (body === null || !looksLikeFeedDocument(reply.headers.get("content-type"), body)) {
      return null;
    }
    // Placeholder source record handed to the feed parser; it is not persisted here.
    const placeholderSource = {
      id: "src_detected_feed",
      type: "rss",
      uri: candidate.url,
      name: "Detected Feed",
      enabled: true,
      tags: [],
      metadata: {},
      createdAt: "1970-01-01T00:00:00.000Z",
      updatedAt: "1970-01-01T00:00:00.000Z"
    };
    const items = await parseRssFeedDocument(body, placeholderSource);
    const itemUrls = items.map((item) => item.url);
    return {
      feedUrl: candidate.url,
      discoveredBy: candidate.discoveredBy,
      excludePrefix: deriveExcludePrefix(itemUrls, websiteUrl.origin)
    };
  } catch {
    return null;
  }
}
2746
/**
 * Try to find a working feed for a website.
 *
 * Fetches the website URL, gathers feed links declared in its markup plus a
 * list of conventional feed paths, de-duplicates and ranks them with
 * scoreCandidate, then validates them one at a time in ranked order and
 * returns the first that validates.
 *
 * @param {string | URL} websiteUrl - Address of the website to inspect.
 * @param {string} userAgent - Value sent as the user-agent request header.
 * @returns {Promise<object | null>} The validated feed description, or null
 *   when the page cannot be fetched, no candidate validates, or anything throws.
 */
async function discoverWebsiteFeed(websiteUrl, userAgent) {
  try {
    const baseUrl = new URL(websiteUrl);
    const homepage = await fetch(baseUrl, { headers: { "user-agent": userAgent } });
    if (homepage.ok) {
      const html = await homepage.text();
      const declared = extractDeclaredFeedCandidates(html, baseUrl);
      const conventional = buildCommonFeedCandidates(baseUrl);
      const ranked = dedupeCandidates([...declared, ...conventional]);
      ranked.sort((left, right) => scoreCandidate(right) - scoreCandidate(left));
      // Probe sequentially so the best-ranked working candidate wins.
      for (const candidate of ranked) {
        const feed = await validateCandidate(candidate, baseUrl, userAgent);
        if (feed) {
          return feed;
        }
      }
    }
  } catch {
    // Discovery is best-effort: every failure falls through to the null result.
  }
  return null;
}
2769
+
2179
2770
  // src/query/search-service.ts
2180
- import { readFile as readFile11 } from "fs/promises";
2771
+ import { readFile as readFile10 } from "fs/promises";
2181
2772
  import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2182
2773
  import path18 from "path";
2183
2774
  async function loadHydratedIndex(workspacePath) {
2184
- const state = await readLatestIndexState(workspacePath);
2775
+ let state;
2776
+ try {
2777
+ state = await readLatestIndexState(workspacePath);
2778
+ } catch (error) {
2779
+ if (error.code === "ENOENT") {
2780
+ throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
2781
+ }
2782
+ throw error;
2783
+ }
2185
2784
  const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
2186
2785
  return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
2187
2786
  }
@@ -2399,7 +2998,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2399
2998
  if (!await fileExists(document.normalizedPath)) {
2400
2999
  return buildSnippet(chunk.text, query);
2401
3000
  }
2402
- const raw = await readFile11(document.normalizedPath, "utf8");
3001
+ const raw = await readFile10(document.normalizedPath, "utf8");
2403
3002
  orderedChunks = buildChunksForDocument(document, raw, config);
2404
3003
  orderedChunkCache.set(document.id, orderedChunks);
2405
3004
  }
@@ -2417,9 +3016,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2417
3016
  function normalizeDisplayTitle(title) {
2418
3017
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2419
3018
  }
3019
+ var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
3020
+ "choose this instead of",
3021
+ "how xyz runs it",
3022
+ "naechste schritte",
3023
+ "next steps",
3024
+ "overview",
3025
+ "passend wenn",
3026
+ "problem",
3027
+ "right fit",
3028
+ "waehlen sie das stattdessen",
3029
+ "was sie bekommen",
3030
+ "what you get",
3031
+ "wie xyz es umsetzt",
3032
+ "uberblick",
3033
+ "\xFCberblick"
3034
+ ]);
2420
3035
  function chooseResultTitle(chunk) {
2421
3036
  const documentTitle = normalizeDisplayTitle(chunk.title);
2422
- const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(Boolean);
3037
+ const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
2423
3038
  const leafHeading = headings.at(-1);
2424
3039
  if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
2425
3040
  return leafHeading;
@@ -2441,6 +3056,9 @@ function normalizeUriPath(uri) {
2441
3056
  return uri.toLowerCase().replace(/\/+$/, "");
2442
3057
  }
2443
3058
  }
3059
/**
 * Produce a comparison key for a URI: the result of normalizeRemoteUrl,
 * lower-cased, with any trailing slashes removed.
 *
 * @param {string} uri
 * @returns {string}
 */
function normalizeUriIdentity(uri) {
  const canonical = normalizeRemoteUrl(uri);
  return canonical.toLowerCase().replace(/\/+$/, "");
}
2444
3062
  function uriSpecificity(uri) {
2445
3063
  const normalized = normalizeUriPath(uri);
2446
3064
  if (normalized === "/") {
@@ -2457,6 +3075,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
2457
3075
  if (!candidateTitle || candidateTitle !== existingTitle) {
2458
3076
  return false;
2459
3077
  }
3078
+ const candidateIdentity = normalizeUriIdentity(candidate.uri);
3079
+ const existingIdentity = normalizeUriIdentity(existing.uri);
3080
+ if (candidateIdentity === existingIdentity) {
3081
+ return candidate.uri.length < existing.uri.length;
3082
+ }
2460
3083
  const candidatePath = normalizeUriPath(candidate.uri);
2461
3084
  const existingPath = normalizeUriPath(existing.uri);
2462
3085
  if (candidatePath === existingPath) {
@@ -2471,28 +3094,28 @@ function isMoreSpecificDuplicate(candidate, existing) {
2471
3094
  }
2472
3095
  function collapseAggregateDuplicates(results, topK) {
2473
3096
  const deduped = [];
2474
- for (const result2 of results) {
3097
+ for (const result of results) {
2475
3098
  const duplicateIndex = deduped.findIndex(
2476
- (existing) => isMoreSpecificDuplicate(result2, existing) || isMoreSpecificDuplicate(existing, result2)
3099
+ (existing) => isMoreSpecificDuplicate(result, existing) || isMoreSpecificDuplicate(existing, result)
2477
3100
  );
2478
3101
  if (duplicateIndex < 0) {
2479
- deduped.push(result2);
3102
+ deduped.push(result);
2480
3103
  continue;
2481
3104
  }
2482
- if (isMoreSpecificDuplicate(result2, deduped[duplicateIndex])) {
2483
- deduped[duplicateIndex] = result2;
3105
+ if (isMoreSpecificDuplicate(result, deduped[duplicateIndex])) {
3106
+ deduped[duplicateIndex] = result;
2484
3107
  }
2485
3108
  }
2486
3109
  return deduped.slice(0, topK);
2487
3110
  }
2488
3111
  function rerankResultsByDocument(results, topK) {
2489
3112
  const byDocument = /* @__PURE__ */ new Map();
2490
- for (const result2 of results) {
2491
- const existing = byDocument.get(result2.documentId);
3113
+ for (const result of results) {
3114
+ const existing = byDocument.get(result.documentId);
2492
3115
  if (existing) {
2493
- existing.push(result2);
3116
+ existing.push(result);
2494
3117
  } else {
2495
- byDocument.set(result2.documentId, [result2]);
3118
+ byDocument.set(result.documentId, [result]);
2496
3119
  }
2497
3120
  }
2498
3121
  const reranked = [...byDocument.values()].flatMap((group) => {
@@ -2501,7 +3124,7 @@ function rerankResultsByDocument(results, topK) {
2501
3124
  if (!best) {
2502
3125
  return [];
2503
3126
  }
2504
- const tailScore = rest.reduce((sum, result2) => sum + result2.score, 0);
3127
+ const tailScore = rest.reduce((sum, result) => sum + result.score, 0);
2505
3128
  const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
2506
3129
  return [{ ...best, score: aggregateScore }];
2507
3130
  }).sort((left, right) => right.score - left.score);
@@ -2569,7 +3192,6 @@ async function searchIndex({
2569
3192
  score: 0,
2570
3193
  title: chooseResultTitle(chunk),
2571
3194
  uri: chunk.uri,
2572
- headingPath: chunk.headingPath,
2573
3195
  snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
2574
3196
  document,
2575
3197
  config,
@@ -2584,7 +3206,7 @@ async function searchIndex({
2584
3206
  };
2585
3207
  })
2586
3208
  );
2587
- return { retrievalMode: "lexical", results: latestResults.filter((result2) => result2 != null) };
3209
+ return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
2588
3210
  }
2589
3211
  const lexicalHits = async () => {
2590
3212
  const index = await loadHydratedIndex(workspacePath);
@@ -2633,7 +3255,6 @@ async function searchIndex({
2633
3255
  score,
2634
3256
  title: chooseResultTitle(chunk),
2635
3257
  uri: chunk.uri,
2636
- headingPath: chunk.headingPath,
2637
3258
  snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2638
3259
  document: documents.get(chunk.documentId),
2639
3260
  config,
@@ -2647,13 +3268,13 @@ async function searchIndex({
2647
3268
  metadata: chunk.metadata
2648
3269
  };
2649
3270
  }));
2650
- const results = rawResults.filter((result2) => result2 != null);
3271
+ const results = rawResults.filter((result) => result != null);
2651
3272
  return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
2652
3273
  }
2653
3274
 
2654
3275
  // src/query/related-service.ts
2655
3276
  import path19 from "path";
2656
- function cosineSimilarity(left, right) {
3277
+ function cosineSimilarity2(left, right) {
2657
3278
  let dot = 0;
2658
3279
  let leftNorm = 0;
2659
3280
  let rightNorm = 0;
@@ -2739,7 +3360,7 @@ async function findRelatedDocuments({
2739
3360
  const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
2740
3361
  documentId: candidate.document.id,
2741
3362
  sourceId: candidate.document.sourceId,
2742
- score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
3363
+ score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
2743
3364
  title: candidate.document.title,
2744
3365
  uri: candidate.document.uri,
2745
3366
  metadata: candidate.document.metadata
@@ -2767,21 +3388,20 @@ async function createContext({
2767
3388
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
2768
3389
  const sources = [];
2769
3390
  let total = 0;
2770
- for (const result2 of search.results) {
2771
- const text = result2.text ?? "";
3391
+ for (const result of search.results) {
3392
+ const text = result.text ?? "";
2772
3393
  if (total + text.length > maxChars && sources.length > 0) {
2773
3394
  break;
2774
3395
  }
2775
3396
  total += text.length;
2776
3397
  sources.push({
2777
- chunkId: result2.chunkId,
2778
- documentId: result2.documentId,
2779
- sourceId: result2.sourceId,
2780
- title: result2.title,
2781
- uri: result2.uri,
2782
- headingPath: result2.headingPath,
3398
+ chunkId: result.chunkId,
3399
+ documentId: result.documentId,
3400
+ sourceId: result.sourceId,
3401
+ title: result.title,
3402
+ uri: result.uri,
2783
3403
  text,
2784
- metadata: result2.metadata
3404
+ metadata: result.metadata
2785
3405
  });
2786
3406
  }
2787
3407
  const markdown = [
@@ -2792,7 +3412,6 @@ async function createContext({
2792
3412
  `Title: ${source.title}`,
2793
3413
  `URL: ${source.uri}`,
2794
3414
  `Chunk ID: ${source.chunkId}`,
2795
- source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
2796
3415
  "",
2797
3416
  source.text,
2798
3417
  ""
@@ -2871,27 +3490,30 @@ function formatSourcesTable(sources) {
2871
3490
  return table.toString();
2872
3491
  }
2873
3492
  function formatSearchResults(results) {
2874
- return results.map((result2, index) => [
2875
- `${index + 1}. ${colors.bold(result2.title)}`,
2876
- ` ${result2.uri}`,
2877
- ` Source type: ${result2.sourceType}`,
2878
- ` Published: ${result2.publicationDate ?? "n/a"}`,
2879
- ` Score: ${result2.score.toFixed(3)}`,
2880
- ` ${result2.snippet}`
2881
- ].join("\n")).join("\n\n");
3493
+ return results.map((result, index) => [
3494
+ `${index + 1}. ${colors.bold(result.title)}`,
3495
+ ` URL: ${result.uri}`,
3496
+ ` Source: ${result.sourceType} | Published: ${result.publicationDate ?? "n/a"} | Score: ${result.score.toFixed(3)}`,
3497
+ "",
3498
+ ...result.snippet.split("\n").map((line) => line.length > 0 ? ` ${line}` : "")
3499
+ ].join("\n")).join(`
3500
+
3501
+ ${colors.dim("---")}
3502
+
3503
+ `);
2882
3504
  }
2883
3505
  function formatRelatedDocuments(results) {
2884
- return results.map((result2, index) => [
2885
- `${index + 1}. ${colors.bold(result2.title)}`,
2886
- ` ${result2.uri}`,
2887
- ` Similarity: ${result2.score.toFixed(3)}`
3506
+ return results.map((result, index) => [
3507
+ `${index + 1}. ${colors.bold(result.title)}`,
3508
+ ` ${result.uri}`,
3509
+ ` Similarity: ${result.score.toFixed(3)}`
2888
3510
  ].join("\n")).join("\n\n");
2889
3511
  }
2890
3512
 
2891
3513
  // src/cli/run-cli.ts
2892
3514
  var SOURCE_TYPES = /* @__PURE__ */ new Set(["url", "website", "rss", "file", "directory", "markdown", "text"]);
2893
3515
  var RETRIEVAL_MODES = /* @__PURE__ */ new Set(["lexical", "dense", "sparse", "hybrid"]);
2894
- var SOURCE_TYPE_LIST = ["url", "website", "rss", "file", "directory", "markdown", "text"];
3516
+ var SOURCE_TYPE_LIST = ["page", "website", "rss", "file", "directory", "markdown", "text"];
2895
3517
  var RETRIEVAL_MODE_LIST = ["lexical", "dense", "sparse", "hybrid"];
2896
3518
  var SEARCH_DATE_FIELDS = ["publicationDate", "firstSeenAt", "lastSeenAt", "lastChangedAt", "crawledAt"];
2897
3519
  function parseKeyValue(input) {
@@ -2914,11 +3536,46 @@ function parseOptionalNumber(input, optionName) {
2914
3536
  }
2915
3537
  return value;
2916
3538
  }
3539
/**
 * Parse an optional CLI value that must be an integer of at least 1.
 *
 * Parsing itself is delegated to parseOptionalNumber; an undefined result is
 * passed through unchanged.
 *
 * @param {string | undefined} input - Raw option value.
 * @param {string} optionName - Option name used in the error message.
 * @returns {number | undefined}
 * @throws {CliError} INVALID_ARGUMENT when the parsed value is not an integer >= 1.
 */
function parseOptionalPositiveInteger(input, optionName) {
  const parsed = parseOptionalNumber(input, optionName);
  const acceptable = parsed === void 0 || (Number.isInteger(parsed) && parsed >= 1);
  if (!acceptable) {
    throw new CliError(`invalid positive integer for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  return parsed;
}
2917
3549
  function setWhenDefined(target, key, value) {
2918
3550
  if (value !== void 0) {
2919
3551
  target[key] = value;
2920
3552
  }
2921
3553
  }
3554
/**
 * Return a copy of `existing` with `extra` appended when it is truthy and
 * not already present.
 *
 * @param {string[] | null | undefined} existing - Current patterns, if any.
 * @param {string | undefined} extra - Pattern to add.
 * @returns {string[] | undefined} The merged list, or undefined when it would be empty.
 */
function mergePatterns(existing, extra) {
  const combined = [...(existing ?? [])];
  const shouldAppend = Boolean(extra) && !combined.includes(extra);
  if (shouldAppend) {
    combined.push(extra);
  }
  if (combined.length === 0) {
    return void 0;
  }
  return combined;
}
3561
/**
 * Render the human-readable summary printed after registering a website source.
 *
 * @param {object} result
 * @param {{id: string}} result.primarySource - The website source that was added.
 * @param {{url: string, source?: {id: string}, wasAdded?: boolean, excludePrefix?: string}} [result.detectedFeed]
 *   Outcome of feed detection, if a feed was found.
 * @returns {string} Newline-separated report lines.
 */
function formatWebsiteSourceAdd(result) {
  const header = `Added source ${result.primarySource.id}`;
  const feed = result.detectedFeed;
  if (!feed) {
    return [header, "No feed detected during website registration."].join("\n");
  }
  let feedLine;
  if (!feed.source) {
    feedLine = `Detected feed ${feed.url}.`;
  } else if (feed.wasAdded) {
    feedLine = `Detected feed ${feed.url} and added source ${feed.source.id}.`;
  } else {
    feedLine = `Detected feed ${feed.url}. Source ${feed.source.id} already exists.`;
  }
  const report = [header, feedLine];
  if (feed.excludePrefix) {
    report.push(`Excluded ${feed.excludePrefix} from the website crawl.`);
  }
  return report.join("\n");
}
2922
3579
  function createSourceCrawlConfig(type, options, defaults) {
2923
3580
  if (!["url", "website", "directory", "rss"].includes(type)) {
2924
3581
  return void 0;
@@ -2926,6 +3583,7 @@ function createSourceCrawlConfig(type, options, defaults) {
2926
3583
  const crawl = {};
2927
3584
  setWhenDefined(crawl, "maxDepth", parseOptionalNumber(options.maxDepth, "--max-depth"));
2928
3585
  setWhenDefined(crawl, "maxPages", parseOptionalNumber(options.maxPages, "--max-pages"));
3586
+ setWhenDefined(crawl, "maxConcurrentRequests", parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests"));
2929
3587
  setWhenDefined(crawl, "includePatterns", options.include);
2930
3588
  setWhenDefined(crawl, "excludePatterns", options.exclude);
2931
3589
  setWhenDefined(crawl, "obeyRobotsTxt", options.robots);
@@ -2944,14 +3602,48 @@ function createSourceCrawlConfig(type, options, defaults) {
2944
3602
  }
2945
3603
  return Object.keys(crawl).length > 0 ? crawl : void 0;
2946
3604
  }
3605
/**
 * Reject `source add` options that do not apply to the chosen source type.
 *
 * Rules are checked in the order listed below and the first violation is
 * reported.
 *
 * @param {string} type - Source type being added.
 * @param {object} options - Parsed command-line options.
 * @returns {void}
 * @throws {CliError} INVALID_ARGUMENT naming the first unsupported option.
 */
function validateSourceAddOptions(type, options) {
  const isWebsite = type === "website";
  const supportsConcurrency = ["website", "rss"].includes(type);
  const supportsPatterns = ["website", "directory"].includes(type);
  // [flag shown to the user, whether the flag was supplied for an unsupported type]
  const rules = [
    ["--max-depth", options.maxDepth !== void 0 && !isWebsite],
    ["--max-pages", options.maxPages !== void 0 && !isWebsite],
    ["--max-concurrent-requests", options.maxConcurrentRequests !== void 0 && !supportsConcurrency],
    ["--render-js", Boolean(options.renderJs) && !isWebsite],
    ["--no-robots", options.robots === false && !isWebsite],
    ["--rate-limit-ms", options.rateLimitMs !== void 0 && !isWebsite],
    ["--include", options.include !== void 0 && !supportsPatterns],
    ["--exclude", options.exclude !== void 0 && !supportsPatterns],
    ["--retention-days", options.retentionDays !== void 0 && type !== "rss"]
  ];
  const offending = rules.find(([, violated]) => violated);
  if (offending) {
    throw new CliError(`${offending[0]} is not supported for source type ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
}
2947
3637
  function allowedSourceConfigFields(source) {
2948
3638
  const fields = /* @__PURE__ */ new Set(["name", "tag", "metadata"]);
2949
3639
  if (source.type === "rss") {
2950
3640
  fields.add("retentionDays");
3641
+ fields.add("maxConcurrentRequests");
2951
3642
  }
2952
3643
  if (source.type === "website") {
2953
3644
  fields.add("maxDepth");
2954
3645
  fields.add("maxPages");
3646
+ fields.add("maxConcurrentRequests");
2955
3647
  fields.add("include");
2956
3648
  fields.add("exclude");
2957
3649
  }
@@ -2987,6 +3679,10 @@ function buildSourceConfigPatch(source, options) {
2987
3679
  checkAllowed("maxPages", "--max-pages");
2988
3680
  crawlPatch.maxPages = parseOptionalNumber(options.maxPages, "--max-pages");
2989
3681
  }
3682
+ if (options.maxConcurrentRequests !== void 0) {
3683
+ checkAllowed("maxConcurrentRequests", "--max-concurrent-requests");
3684
+ crawlPatch.maxConcurrentRequests = parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests");
3685
+ }
2990
3686
  if (options.include !== void 0) {
2991
3687
  checkAllowed("include", "--include");
2992
3688
  crawlPatch.includePatterns = options.include;
@@ -3016,6 +3712,50 @@ function response(command, workspace, data, error) {
3016
3712
  }
3017
3713
  function writeOutput(capture, value, stderr = false) {
3018
3714
  (stderr ? capture.stderr : capture.stdout).push(value);
3715
+ if (stderr) {
3716
+ capture.onStderr?.(value);
3717
+ return;
3718
+ }
3719
+ capture.onStdout?.(value);
3720
+ }
3721
/**
 * Create the progress callback for a long-running command.
 *
 * @param {object} capture - Output capture passed through to writeOutput.
 * @param {{json?: boolean, silent?: boolean, quiet?: boolean, verbose?: boolean}} options
 *   Global CLI flags.
 * @returns {((level: string, message: string) => void) | undefined}
 *   undefined when --json, --silent or --quiet is set; otherwise a handler
 *   that writes messages to the stderr channel, dropping "detail" messages
 *   unless --verbose is set.
 */
function createProgressHandler(capture, options) {
  const muted = options.json || options.silent || options.quiet;
  if (muted) {
    return void 0;
  }
  return (level, message) => {
    const suppressed = level === "detail" && !options.verbose;
    if (!suppressed) {
      writeOutput(capture, message, true);
    }
  };
}
3732
/**
 * Run the three ingest stages in sequence: ingestSources, chunkDocuments,
 * then buildIndex, announcing each stage through the optional progress callback.
 *
 * @param {object} params
 * @param {string} params.workspace - Workspace path handed to every stage.
 * @param {string} [params.sourceId] - Restrict ingest and chunking to one source.
 * @param {boolean} [params.changedOnly] - Forwarded to ingestSources.
 * @param {boolean} [params.dense] - When truthy, passes denseOverride: true to buildIndex.
 * @param {boolean} [params.sparse] - When truthy, passes sparseOverride: true to buildIndex.
 * @param {Function} [params.progress] - Progress callback `(level, message)`.
 * @returns {Promise<{ingest: object, chunk: object, indexPath: string, metadata: object}>}
 */
async function runIngestCommand({
  workspace,
  sourceId,
  changedOnly,
  dense,
  sparse,
  progress
}) {
  const announce = (message) => progress?.("info", message);

  announce("Ingest step 1/3: fetch and normalize");
  const ingest = await ingestSources({
    workspacePath: workspace,
    sourceIds: sourceId ? [sourceId] : void 0,
    changedOnly,
    progress
  });

  announce("Ingest step 2/3: chunk affected documents");
  const chunk = await chunkDocuments({ workspacePath: workspace, sourceId, progress });

  announce("Ingest step 3/3: refresh index");
  const indexBuild = await buildIndex({
    workspacePath: workspace,
    // Flags can only force a model on; when absent the override stays undefined.
    denseOverride: dense ? true : void 0,
    sparseOverride: sparse ? true : void 0,
    buildAvailableModels: true,
    progress
  });

  announce("Ingest complete");
  const { indexPath, metadata } = indexBuild;
  return { ingest, chunk, indexPath, metadata };
}
3020
3760
  function parseRetrievalMode(input) {
3021
3761
  if (!input) {
@@ -3030,10 +3770,11 @@ function parseSourceType(input) {
3030
3770
  if (!input) {
3031
3771
  return void 0;
3032
3772
  }
3033
- if (!SOURCE_TYPES.has(input)) {
3773
+ const normalized = input === "page" ? "url" : input;
3774
+ if (!SOURCE_TYPES.has(normalized)) {
3034
3775
  throw new CliError(`unsupported source type: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3035
3776
  }
3036
- return input;
3777
+ return normalized;
3037
3778
  }
3038
3779
  function parseCommaSeparatedList(input) {
3039
3780
  const values = (input ?? "").split(",").map((value) => value.trim()).filter(Boolean);
@@ -3094,56 +3835,96 @@ function workspaceFromArgv(argv) {
3094
3835
  }
3095
3836
  return path21.resolve(DEFAULT_WORKSPACE);
3096
3837
  }
3097
- async function runCli(argv) {
3098
- const capture = { stdout: [], stderr: [] };
3838
+ async function runCli(argv, io = {}) {
3839
+ const capture = { stdout: [], stderr: [], ...io };
3099
3840
  const program = new Command();
3100
- program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--verbose", "Print more operational detail when a command supports it.").option("--quiet", "Suppress non-essential human-readable output.");
3841
+ program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
3101
3842
  program.addHelpText("after", `
3102
3843
  Workflow:
3103
3844
  1. Initialize a workspace with qli init
3104
3845
  2. Register one or more sources with qli source add
3105
- 3. Build or refresh the workspace with qli rebuild
3846
+ 3. Refresh the workspace with qli ingest
3106
3847
  4. Query it with qli search, qli related, or qli context
3107
3848
 
3108
3849
  Examples:
3109
3850
  qli init
3110
3851
  qli source add directory ./docs --name "Product Docs" --tag docs
3111
- qli rebuild
3852
+ qli ingest
3853
+ qli rebuild --silent
3112
3854
  qli search "api authentication" --top-k 8
3113
3855
  qli context "How do API keys work?" --top-k 8 --max-chars 8000
3114
3856
 
3857
+ Long-running commands print progress to stderr by default. Use --silent to suppress it.
3858
+ Use --json when another tool needs stable structured output.
3859
+
3115
3860
  Use qli <command> --help for command-specific options and examples.`);
3116
- program.command("init").description("Create a new workspace with the default directory layout and config.").option("--force").addHelpText("after", `
3861
+ program.command("init").description("Create a new workspace with the default directory layout and config, then pull missing retrieval models.").option("--force").addHelpText("after", `
3117
3862
  Examples:
3118
3863
  qli init
3119
3864
  qli init --workspace ./kb
3120
- qli init --workspace /tmp/querylight --force`).action(async function command(options) {
3865
+ qli init --workspace /tmp/querylight --force
3866
+
3867
+ Notes:
3868
+ init enables dense and sparse retrieval in new workspaces.
3869
+ init pulls missing model assets for enabled retrieval modes.
3870
+ Sparse model downloads require uv. If uv is not available, init skips the sparse pull.`).action(async function command(options) {
3871
+ const global = this.optsWithGlobals();
3121
3872
  const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
3122
- const result2 = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
3123
- emit(this.optsWithGlobals().json, capture, response("init", workspace, result2), `Initialized workspace at ${workspace}`);
3873
+ const result = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
3874
+ const config = await loadConfig(workspace, global.config);
3875
+ const status = await getModelStatus(workspace, config);
3876
+ const { pullDense, pullSparse } = resolveMissingConfiguredModelPullPlan({ config, status });
3877
+ if (pullDense || pullSparse) {
3878
+ await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
3879
+ }
3880
+ emit(this.optsWithGlobals().json, capture, response("init", workspace, result), `Initialized workspace at ${workspace}`);
3124
3881
  });
3125
3882
  const source = program.command("source");
3126
3883
  source.description("Register, inspect, and manage workspace sources.");
3127
- source.command("add").description("Add a source definition. The source is enabled immediately.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
3884
+ source.command("add").description("Add a source definition. The source is enabled immediately. Use `page` for one page and `website` for multi-page crawling and feed detection.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--max-concurrent-requests <n>", "Maximum remote requests in flight for a website or feed source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
3128
3885
  Examples:
3129
3886
  qli source add directory ./docs --name "Local Docs" --tag docs
3130
3887
  qli source add file ./docs/auth.md --name "Auth Guide"
3131
- qli source add url https://example.com/docs/auth --name "Auth Page"
3888
+ qli source add page https://example.com/docs/auth --name "Auth Page"
3132
3889
  qli source add website https://example.com --name "Docs Site" --max-depth 2 --max-pages 50 --include /docs/
3890
+ qli source add website https://example.com --name "Docs Site" --max-concurrent-requests 8
3891
+ qli source add website https://example.com --name "Example Site" --json
3133
3892
  qli source add rss https://example.com/feed.xml --name "Release Feed"
3893
+ qli source add rss https://example.com/feed.xml --name "Release Feed" --max-concurrent-requests 3
3134
3894
  qli source add rss https://example.com/feed.xml --name "Release Feed" --retention-days 30
3135
3895
 
3136
3896
  Notes:
3897
+ page stores one page. It does not crawl links or detect feeds.
3898
+ Website sources may detect one blog or news feed during registration.
3899
+ When a feed is added, qli also excludes the feed item prefix from the website crawl when it can infer one.
3900
+ Website and RSS sources default to 5 remote requests in flight per source unless config.yaml or source settings override it.
3901
+ Use --json when automation needs the full list of created sources.
3137
3902
  RSS sources store retention per feed.
3138
- When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(type, uri, options) {
3903
+ When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(typeInput, uri, options) {
3904
+ const type = parseSourceType(typeInput);
3905
+ if (!type) {
3906
+ throw new CliError(`unsupported source type: ${typeInput}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3907
+ }
3139
3908
  if (!SOURCE_TYPES.has(type)) {
3140
3909
  throw new CliError(`unsupported source type: ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3141
3910
  }
3911
+ validateSourceAddOptions(type, options);
3142
3912
  const global = this.optsWithGlobals();
3143
3913
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3144
3914
  const config = await loadConfig(workspace, global.config);
3145
3915
  const now = (/* @__PURE__ */ new Date()).toISOString();
3146
- const crawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
3916
+ const initialCrawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
3917
+ let crawl = initialCrawl;
3918
+ let detectedFeed = null;
3919
+ if (type === "website") {
3920
+ detectedFeed = await discoverWebsiteFeed(uri, config.crawler.defaultUserAgent);
3921
+ if (detectedFeed?.excludePrefix) {
3922
+ crawl = {
3923
+ ...crawl ?? {},
3924
+ excludePatterns: mergePatterns(crawl?.excludePatterns, detectedFeed.excludePrefix)
3925
+ };
3926
+ }
3927
+ }
3147
3928
  const stored = await addSource(workspace, {
3148
3929
  type,
3149
3930
  uri: ["file", "directory"].includes(type) ? path21.resolve(uri) : uri,
@@ -3155,11 +3936,50 @@ Notes:
3155
3936
  createdAt: now,
3156
3937
  updatedAt: now
3157
3938
  });
3158
- emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
3939
+ if (type !== "website") {
3940
+ emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
3941
+ return;
3942
+ }
3943
+ let feedSource;
3944
+ let feedWasAdded = false;
3945
+ if (detectedFeed) {
3946
+ const existingSources = await listSources(workspace);
3947
+ feedSource = existingSources.find((source2) => source2.uri === detectedFeed?.feedUrl);
3948
+ if (!feedSource) {
3949
+ feedSource = await addSource(workspace, {
3950
+ type: "rss",
3951
+ uri: detectedFeed.feedUrl,
3952
+ name: `${options.name} Feed`,
3953
+ enabled: true,
3954
+ tags: options.tag ?? [],
3955
+ metadata: normalizeMetadata(options.metadata),
3956
+ crawl: {
3957
+ retentionDays: config.crawler.retentionDays,
3958
+ fetchArticles: true
3959
+ },
3960
+ createdAt: now,
3961
+ updatedAt: now
3962
+ });
3963
+ feedWasAdded = true;
3964
+ }
3965
+ }
3966
+ const result = {
3967
+ primarySource: stored,
3968
+ addedSources: [stored, ...feedWasAdded && feedSource ? [feedSource] : []],
3969
+ detectedFeed: detectedFeed ? {
3970
+ url: detectedFeed.feedUrl,
3971
+ discoveredBy: detectedFeed.discoveredBy,
3972
+ excludePrefix: detectedFeed.excludePrefix,
3973
+ source: feedSource,
3974
+ wasAdded: feedWasAdded
3975
+ } : null
3976
+ };
3977
+ emit(global.json, capture, response("source add", workspace, result), formatWebsiteSourceAdd(result));
3159
3978
  });
3160
- source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
3979
+ source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--max-concurrent-requests <n>", "Set the remote request concurrency limit for website or feed sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
3161
3980
  Examples:
3162
3981
  qli source config src_123 --retention-days 30
3982
+ qli source config src_123 --max-concurrent-requests 2
3163
3983
  qli source config src_123 --name "Docs Feed" --tag rss docs
3164
3984
  qli source config src_123 --include /docs/ --exclude /docs/archive/
3165
3985
  qli source config src_123 --metadata team=docs owner=platform --json
@@ -3218,35 +4038,56 @@ Examples:
3218
4038
  const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
3219
4039
  emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
3220
4040
  });
3221
- program.command("ingest").description("Fetch and normalize source content into workspace documents.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").addHelpText("after", `
4041
+ program.command("ingest").description("Fetch source content, update affected chunks, and refresh retrieval indexes.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
3222
4042
  Examples:
3223
4043
  qli ingest
3224
4044
  qli ingest --source src_123
3225
- qli ingest --changed-only`).action(async function command(options) {
4045
+ qli ingest --changed-only
4046
+ qli ingest --dense --sparse
4047
+ qli ingest --silent`).action(async function command(options) {
3226
4048
  const global = this.optsWithGlobals();
3227
4049
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3228
- const result2 = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
3229
- emit(global.json, capture, response("ingest", workspace, result2), `Ingested ${result2.processedSources} sources`);
4050
+ const result = await runIngestCommand({
4051
+ workspace,
4052
+ sourceId: options.source,
4053
+ changedOnly: Boolean(options.changedOnly),
4054
+ dense: Boolean(options.dense),
4055
+ sparse: Boolean(options.sparse),
4056
+ progress: createProgressHandler(capture, global)
4057
+ });
4058
+ emit(global.json, capture, response("ingest", workspace, result), `Processed ${result.ingest.processedSources} sources, wrote ${result.chunk.chunksWritten} chunks`);
3230
4059
  });
3231
4060
  program.command("chunk").description("Split normalized documents into retrieval chunks.").option("--source <sourceId>", "Only chunk documents from one source.").option("--document <documentId>", "Only chunk one document.").addHelpText("after", `
3232
4061
  Examples:
3233
4062
  qli chunk
3234
4063
  qli chunk --source src_123
3235
- qli chunk --document doc_123`).action(async function command(options) {
4064
+ qli chunk --document doc_123
4065
+ qli chunk --silent`).action(async function command(options) {
3236
4066
  const global = this.optsWithGlobals();
3237
4067
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3238
- const result2 = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
3239
- emit(global.json, capture, response("chunk", workspace, result2), `Wrote ${result2.chunksWritten} chunks`);
4068
+ const result = await chunkDocuments({
4069
+ workspacePath: workspace,
4070
+ sourceId: options.source,
4071
+ documentId: options.document,
4072
+ progress: createProgressHandler(capture, global)
4073
+ });
4074
+ emit(global.json, capture, response("chunk", workspace, result), `Wrote ${result.chunksWritten} chunks`);
3240
4075
  });
3241
4076
  program.command("reprocess").description("Re-run normalization for existing documents without fetching sources again.").option("--source <sourceId>", "Only reprocess documents from one source.").option("--document <documentId>", "Only reprocess one document.").addHelpText("after", `
3242
4077
  Examples:
3243
4078
  qli reprocess
3244
4079
  qli reprocess --source src_123
3245
- qli reprocess --document doc_123`).action(async function command(options) {
4080
+ qli reprocess --document doc_123
4081
+ qli reprocess --silent`).action(async function command(options) {
3246
4082
  const global = this.optsWithGlobals();
3247
4083
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3248
- const result2 = await reprocessDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
3249
- emit(global.json, capture, response("reprocess", workspace, result2), `Reprocessed ${result2.documentsReprocessed} documents`);
4084
+ const result = await reprocessDocuments({
4085
+ workspacePath: workspace,
4086
+ sourceId: options.source,
4087
+ documentId: options.document,
4088
+ progress: createProgressHandler(capture, global)
4089
+ });
4090
+ emit(global.json, capture, response("reprocess", workspace, result), `Reprocessed ${result.documentsReprocessed} documents`);
3250
4091
  });
3251
4092
  const index = program.command("index");
3252
4093
  index.description("Build and inspect retrieval indexes.");
@@ -3254,33 +4095,47 @@ Examples:
3254
4095
  Examples:
3255
4096
  qli index build
3256
4097
  qli index build --dense
3257
- qli index build --dense --sparse`).action(async function command(options) {
4098
+ qli index build --dense --sparse
4099
+ qli index build --silent`).action(async function command(options) {
3258
4100
  const global = this.optsWithGlobals();
3259
4101
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3260
- const result2 = await buildIndex({
4102
+ const result = await buildIndex({
3261
4103
  workspacePath: workspace,
3262
4104
  denseOverride: options.dense ? true : void 0,
3263
- sparseOverride: options.sparse ? true : void 0
4105
+ sparseOverride: options.sparse ? true : void 0,
4106
+ progress: createProgressHandler(capture, global)
3264
4107
  });
3265
- emit(global.json, capture, response("index build", workspace, result2), `Built index at ${result2.indexPath}`);
4108
+ emit(global.json, capture, response("index build", workspace, result), `Built index at ${result.indexPath}`);
3266
4109
  });
3267
4110
  program.command("rebuild").description("Run ingest, chunk, and index build in one command.").option("--source <sourceId>", "Only rebuild data for one source.").option("--changed-only", "Only ingest changed content before chunking and indexing.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
3268
4111
  Examples:
3269
4112
  qli rebuild
3270
4113
  qli rebuild --changed-only
3271
4114
  qli rebuild --source src_123
3272
- qli rebuild --dense --sparse`).action(async function command(options) {
4115
+ qli rebuild --dense --sparse
4116
+ qli rebuild --silent`).action(async function command(options) {
3273
4117
  const global = this.optsWithGlobals();
3274
4118
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3275
- const ingest = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
3276
- const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source });
4119
+ const progress = createProgressHandler(capture, global);
4120
+ progress?.("info", "Rebuild step 1/3: ingest");
4121
+ const ingest = await ingestSources({
4122
+ workspacePath: workspace,
4123
+ sourceIds: options.source ? [options.source] : void 0,
4124
+ changedOnly: Boolean(options.changedOnly),
4125
+ progress
4126
+ });
4127
+ progress?.("info", "Rebuild step 2/3: chunk");
4128
+ const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, progress });
4129
+ progress?.("info", "Rebuild step 3/3: index");
3277
4130
  const indexBuild = await buildIndex({
3278
4131
  workspacePath: workspace,
3279
4132
  denseOverride: options.dense ? true : void 0,
3280
4133
  sparseOverride: options.sparse ? true : void 0,
3281
- buildAvailableModels: true
4134
+ buildAvailableModels: true,
4135
+ progress
3282
4136
  });
3283
4137
  const data = { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
4138
+ progress?.("info", "Rebuild complete");
3284
4139
  emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
3285
4140
  });
3286
4141
  program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
@@ -3291,7 +4146,7 @@ Examples:
3291
4146
  qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
3292
4147
  qli search "billing" --metadata team=support
3293
4148
  qli search "embedding model" --retrieval hybrid --show-chunks
3294
- qli search --source-type rss,url --top-k 25 --json
4149
+ qli search --source-type rss,page --top-k 25 --json
3295
4150
 
3296
4151
  Notes:
3297
4152
  lexical works without vector models.
@@ -3299,7 +4154,7 @@ Notes:
3299
4154
  When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
3300
4155
  const global = this.optsWithGlobals();
3301
4156
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3302
- const result2 = await searchIndex({
4157
+ const result = await searchIndex({
3303
4158
  workspacePath: workspace,
3304
4159
  query: query ?? "",
3305
4160
  topK: Number(options.topK),
@@ -3314,7 +4169,7 @@ Notes:
3314
4169
  retrievalMode: parseRetrievalMode(options.retrieval),
3315
4170
  showChunks: Boolean(options.showChunks)
3316
4171
  });
3317
- emit(global.json, capture, response("search", workspace, result2), formatSearchResults(result2.results));
4172
+ emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
3318
4173
  });
3319
4174
  program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
3320
4175
  Examples:
@@ -3326,12 +4181,12 @@ Dense vectors usually produce better related-document results. Pull models and r
3326
4181
  qli rebuild --dense`).action(async function command(document, options) {
3327
4182
  const global = this.optsWithGlobals();
3328
4183
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3329
- const result2 = await findRelatedDocuments({
4184
+ const result = await findRelatedDocuments({
3330
4185
  workspacePath: workspace,
3331
4186
  document,
3332
4187
  topK: Number(options.topK)
3333
4188
  });
3334
- emit(global.json, capture, response("related", workspace, result2), formatRelatedDocuments(result2.results));
4189
+ emit(global.json, capture, response("related", workspace, result), formatRelatedDocuments(result.results));
3335
4190
  });
3336
4191
  program.command("context").description("Assemble retrieval context for an external LLM, agent, or prompt pipeline.").argument("<query>").option("--top-k <n>", "Maximum number of source passages to consider.", "12").option("--max-chars <n>", "Maximum output length for the rendered context block.", "12000").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).addHelpText("after", `
3337
4192
  Examples:
@@ -3342,14 +4197,14 @@ Examples:
3342
4197
  Use --json when another tool needs structured access to the raw passages and metadata.`).action(async function command(query, options) {
3343
4198
  const global = this.optsWithGlobals();
3344
4199
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3345
- const result2 = await createContext({
4200
+ const result = await createContext({
3346
4201
  workspacePath: workspace,
3347
4202
  query,
3348
4203
  topK: Number(options.topK),
3349
4204
  maxChars: Number(options.maxChars),
3350
4205
  retrievalMode: parseRetrievalMode(options.retrieval)
3351
4206
  });
3352
- emit(global.json, capture, response("context", workspace, result2), result2.markdown);
4207
+ emit(global.json, capture, response("context", workspace, result), result.markdown);
3353
4208
  });
3354
4209
  const models = program.command("models");
3355
4210
  models.description("Inspect and download retrieval model assets.");
@@ -3358,7 +4213,9 @@ Examples:
3358
4213
  qli models pull
3359
4214
  qli models pull --dense
3360
4215
  qli models pull --sparse
4216
+ qli models pull --silent
3361
4217
 
4218
+ Pulled model assets are shared under ~/.qli by default.
3362
4219
  If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
3363
4220
  const global = this.optsWithGlobals();
3364
4221
  const workspace = await resolveWorkspace({ workspace: global.workspace });
@@ -3369,17 +4226,27 @@ If you plan to use related, dense search, or hybrid retrieval, pull the models a
3369
4226
  pullSparseFlag: Boolean(options.sparse),
3370
4227
  uvAvailable: status.sparse.uvAvailable
3371
4228
  });
3372
- await pullModels({ workspacePath: workspace, config, pullDense, pullSparse });
4229
+ await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
3373
4230
  const data = {
3374
- dense: pullDense ? { pulled: true, modelId: config.retrieval.dense.modelId, cacheDir: config.retrieval.dense.cacheDir } : void 0,
3375
- sparse: pullSparse ? { pulled: true, modelId: config.retrieval.sparse.modelId, cacheDir: config.retrieval.sparse.cacheDir } : void 0
4231
+ dense: pullDense ? {
4232
+ pulled: true,
4233
+ modelId: config.retrieval.dense.modelId,
4234
+ cacheDir: resolveCacheDir(workspace, config.retrieval.dense.cacheDir)
4235
+ } : void 0,
4236
+ sparse: pullSparse ? {
4237
+ pulled: true,
4238
+ modelId: config.retrieval.sparse.modelId,
4239
+ cacheDir: resolveCacheDir(workspace, config.retrieval.sparse.cacheDir)
4240
+ } : void 0
3376
4241
  };
3377
4242
  emit(global.json, capture, response("models pull", workspace, data), "Pulled available models");
3378
4243
  });
3379
- models.command("status").description("Show whether model runtimes and artifacts are available in the workspace.").addHelpText("after", `
4244
+ models.command("status").description("Show whether shared model assets, runtimes, and workspace vector artifacts are available.").addHelpText("after", `
3380
4245
  Examples:
3381
4246
  qli models status
3382
- qli models status --json`).action(async function command() {
4247
+ qli models status --json
4248
+
4249
+ The cacheDir fields show the resolved model cache path for the current workspace config.`).action(async function command() {
3383
4250
  const global = this.optsWithGlobals();
3384
4251
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3385
4252
  const config = await loadConfig(workspace, global.config);
@@ -3394,8 +4261,8 @@ Examples:
3394
4261
  qli diff --since 2026-05-01`).action(async function command(options) {
3395
4262
  const global = this.optsWithGlobals();
3396
4263
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3397
- const result2 = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
3398
- emit(global.json, capture, response("diff", workspace, result2), JSON.stringify(result2, null, 2));
4264
+ const result = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
4265
+ emit(global.json, capture, response("diff", workspace, result), JSON.stringify(result, null, 2));
3399
4266
  });
3400
4267
  const report = program.command("report");
3401
4268
  report.description("Render higher-level reports from workspace data.");
@@ -3427,7 +4294,7 @@ Examples:
3427
4294
  try {
3428
4295
  const meta = await readLatestIndexMetadata(workspace);
3429
4296
  latestIndex = meta.createdAt;
3430
- indexSize = (await stat4(`${workspace}/indexes/latest.json`)).size;
4297
+ indexSize = (await stat4(await resolveLatestIndexArtifactPath(workspace))).size;
3431
4298
  } catch {
3432
4299
  latestIndex = void 0;
3433
4300
  }
@@ -3476,8 +4343,11 @@ Examples:
3476
4343
  checks.push("dense runtime importable");
3477
4344
  }
3478
4345
  if (config.retrieval.sparse.enabled) {
3479
- await ensureUvAvailable();
3480
- checks.push("uv available for sparse runtime");
4346
+ if (await isUvAvailable()) {
4347
+ checks.push("uv available for sparse runtime");
4348
+ } else {
4349
+ checks.push("uv missing for sparse runtime");
4350
+ }
3481
4351
  }
3482
4352
  try {
3483
4353
  await readLatestIndexMetadata(workspace);
@@ -3511,13 +4381,21 @@ function emit(asJson, capture, body, human) {
3511
4381
  }
3512
4382
 
3513
4383
  // src/cli/main.ts
3514
- var result = await runCli(process.argv.slice(2));
3515
- if (result.stdout) {
3516
- process.stdout.write(`${result.stdout}
4384
+ try {
4385
+ const result = await runCli(process.argv.slice(2), {
4386
+ onStdout(value) {
4387
+ process.stdout.write(`${value}
3517
4388
  `);
3518
- }
3519
- if (result.stderr) {
3520
- process.stderr.write(`${result.stderr}
4389
+ },
4390
+ onStderr(value) {
4391
+ process.stderr.write(`${value}
4392
+ `);
4393
+ }
4394
+ });
4395
+ process.exitCode = result.exitCode;
4396
+ } catch (error) {
4397
+ const message = error instanceof Error ? error.stack ?? error.message : String(error);
4398
+ process.stderr.write(`${message}
3521
4399
  `);
4400
+ process.exitCode = 1;
3522
4401
  }
3523
- process.exit(result.exitCode);