@tryformation/querylight-cli 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/main.js CHANGED
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/cli/run-cli.ts
4
- import { Command } from "commander";
4
+ import { Command, Option } from "commander";
5
5
  import { stat as stat4 } from "fs/promises";
6
6
  import path21 from "path";
7
7
 
@@ -14,6 +14,17 @@ import path4 from "path";
14
14
  import { readFile, writeFile } from "fs/promises";
15
15
  import path from "path";
16
16
  import YAML from "yaml";
17
+
18
+ // src/core/constants.ts
19
+ var PACKAGE_VERSION = "0.2.0";
20
+ var DEFAULT_WORKSPACE = ".kb";
21
+ var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
22
+ var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
23
+
24
+ // src/core/config.ts
25
+ function normalizeModelCacheDir(configuredPath) {
26
+ return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
27
+ }
17
28
  var defaultConfig = () => ({
18
29
  workspaceVersion: 1,
19
30
  index: {
@@ -41,17 +52,17 @@ var defaultConfig = () => ({
41
52
  retrieval: {
42
53
  defaultMode: "lexical",
43
54
  dense: {
44
- enabled: false,
55
+ enabled: true,
45
56
  modelId: "Xenova/all-MiniLM-L6-v2",
46
- cacheDir: ".kb/models/huggingface",
57
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
47
58
  indexHashTables: 8,
48
59
  indexRandomSeed: 42,
49
60
  chunkTextMode: "title-heading-text"
50
61
  },
51
62
  sparse: {
52
- enabled: false,
63
+ enabled: true,
53
64
  modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
54
- cacheDir: ".kb/models/huggingface",
65
+ cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
55
66
  documentTopTokens: 128,
56
67
  queryEncoding: "tokenizer-token-weights",
57
68
  documentEncoding: "masked-lm-max-log1p-relu",
@@ -62,6 +73,7 @@ var defaultConfig = () => ({
62
73
  defaultUserAgent: "querylight-cli/0.1",
63
74
  obeyRobotsTxt: true,
64
75
  rateLimitMs: 1e3,
76
+ maxConcurrentRequests: 5,
65
77
  renderJs: false,
66
78
  retentionDays: 365,
67
79
  fetchArticles: true
@@ -112,11 +124,13 @@ async function loadConfig(workspacePath, configPath) {
112
124
  ...parsed.retrieval ?? {},
113
125
  dense: {
114
126
  ...defaults.retrieval.dense,
115
- ...parsed.retrieval?.dense ?? {}
127
+ ...parsed.retrieval?.dense ?? {},
128
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
116
129
  },
117
130
  sparse: {
118
131
  ...defaults.retrieval.sparse,
119
- ...parsed.retrieval?.sparse ?? {}
132
+ ...parsed.retrieval?.sparse ?? {},
133
+ cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
120
134
  }
121
135
  },
122
136
  crawler: {
@@ -162,6 +176,14 @@ async function writeJsonl(filePath, records) {
162
176
  ` : "", "utf8");
163
177
  }
164
178
 
179
+ // src/core/progress.ts
180
+ function reportProgress(progress, message) {
181
+ progress?.("info", message);
182
+ }
183
+ function reportProgressDetail(progress, message) {
184
+ progress?.("detail", message);
185
+ }
186
+
165
187
  // src/chunk/chunk-store.ts
166
188
  import path3 from "path";
167
189
  function chunksFile(workspacePath) {
@@ -269,11 +291,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
269
291
  async function chunkDocuments({
270
292
  workspacePath,
271
293
  sourceId,
272
- documentId
294
+ documentId,
295
+ progress
273
296
  }) {
274
297
  const config = await loadConfig(workspacePath);
275
298
  const documents = await readJsonl(path4.join(workspacePath, "documents", "documents.jsonl"));
276
299
  const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
300
+ reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
277
301
  const targetedDocumentIds = new Set(filtered.map((document) => document.id));
278
302
  const existingChunks = await loadChunks(workspacePath);
279
303
  const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
@@ -281,19 +305,17 @@ async function chunkDocuments({
281
305
  existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
282
306
  );
283
307
  for (const document of filtered) {
308
+ reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
284
309
  const raw = await readFile3(document.normalizedPath, "utf8");
285
310
  for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
286
311
  nextChunks.set(chunk.id, chunk);
287
312
  }
288
313
  }
289
314
  await saveChunks(workspacePath, [...nextChunks.values()]);
315
+ reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
290
316
  return { chunksWritten: nextChunks.size };
291
317
  }
292
318
 
293
- // src/core/constants.ts
294
- var PACKAGE_VERSION = "0.1.0";
295
- var DEFAULT_WORKSPACE = ".kb";
296
-
297
319
  // src/core/errors.ts
298
320
  var CliError = class extends Error {
299
321
  constructor(message, code, exitCode, details) {
@@ -319,8 +341,6 @@ var DIRS = [
319
341
  "normalized",
320
342
  "indexes",
321
343
  "vectors",
322
- "models",
323
- "models/huggingface",
324
344
  "runs",
325
345
  "logs"
326
346
  ];
@@ -358,11 +378,12 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
358
378
  import path11 from "path";
359
379
 
360
380
  // src/vector/dense.ts
361
- import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
381
+ import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
362
382
  import { mkdir as mkdir4 } from "fs/promises";
363
383
  import path8 from "path";
364
384
 
365
385
  // src/vector/runtime.ts
386
+ import os from "os";
366
387
  import path6 from "path";
367
388
  import { fileURLToPath } from "url";
368
389
  import { execFile, execFileSync } from "child_process";
@@ -379,7 +400,22 @@ async function fileExists(filePath) {
379
400
  }
380
401
 
381
402
  // src/vector/runtime.ts
403
+ function resolveQliHomeDir() {
404
+ return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
405
+ }
382
406
  function resolveCacheDir(workspacePath, configuredPath) {
407
+ if (configuredPath === "~/.qli") {
408
+ return resolveQliHomeDir();
409
+ }
410
+ if (configuredPath.startsWith("~/.qli/")) {
411
+ return path6.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
412
+ }
413
+ if (configuredPath === "~") {
414
+ return os.homedir();
415
+ }
416
+ if (configuredPath.startsWith("~/")) {
417
+ return path6.join(os.homedir(), configuredPath.slice(2));
418
+ }
383
419
  return path6.isAbsolute(configuredPath) ? configuredPath : path6.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
384
420
  }
385
421
  function packageRootFromImportMeta(importMetaUrl) {
@@ -403,6 +439,14 @@ async function ensureUvAvailable() {
403
439
  execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
404
440
  });
405
441
  }
442
+ async function isUvAvailable() {
443
+ try {
444
+ await ensureUvAvailable();
445
+ return true;
446
+ } catch {
447
+ return false;
448
+ }
449
+ }
406
450
  async function runSparsePython({
407
451
  workspacePath,
408
452
  config,
@@ -451,8 +495,8 @@ import path7 from "path";
451
495
  function vectorsDir(workspacePath) {
452
496
  return path7.join(workspacePath, "vectors");
453
497
  }
454
- function modelsDir(workspacePath) {
455
- return path7.join(workspacePath, "models");
498
+ function sharedModelStateDir() {
499
+ return path7.join(resolveQliHomeDir(), "models", "status");
456
500
  }
457
501
  function denseVectorPath(workspacePath) {
458
502
  return path7.join(vectorsDir(workspacePath), "dense.latest.json");
@@ -466,11 +510,16 @@ function sparseVectorPath(workspacePath) {
466
510
  function sparseMetaPath(workspacePath) {
467
511
  return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
468
512
  }
469
- function densePullMarker(workspacePath) {
470
- return path7.join(modelsDir(workspacePath), "dense.pulled.json");
513
+ function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
514
+ const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
515
+ const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
516
+ return path7.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
471
517
  }
472
- function sparsePullMarker(workspacePath) {
473
- return path7.join(modelsDir(workspacePath), "sparse.pulled.json");
518
+ function densePullMarker(workspacePath, modelId, cacheDir) {
519
+ return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
520
+ }
521
+ function sparsePullMarker(workspacePath, modelId, cacheDir) {
522
+ return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
474
523
  }
475
524
  async function writeDensePayload(workspacePath, payload) {
476
525
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
@@ -488,13 +537,15 @@ async function writeSparsePayload(workspacePath, payload) {
488
537
  async function readSparsePayload(workspacePath) {
489
538
  return JSON.parse(await readFile4(sparseVectorPath(workspacePath), "utf8"));
490
539
  }
491
- async function writeDensePullMarker(workspacePath, value) {
492
- await mkdir3(modelsDir(workspacePath), { recursive: true });
493
- await writeFile3(densePullMarker(workspacePath), JSON.stringify(value, null, 2), "utf8");
540
+ async function writeDensePullMarker(workspacePath, model, value) {
541
+ const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
542
+ await mkdir3(path7.dirname(markerPath), { recursive: true });
543
+ await writeFile3(markerPath, JSON.stringify(value, null, 2), "utf8");
494
544
  }
495
- async function writeSparsePullMarker(workspacePath, value) {
496
- await mkdir3(modelsDir(workspacePath), { recursive: true });
497
- await writeFile3(sparsePullMarker(workspacePath), JSON.stringify(value, null, 2), "utf8");
545
+ async function writeSparsePullMarker(workspacePath, model, value) {
546
+ const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
547
+ await mkdir3(path7.dirname(markerPath), { recursive: true });
548
+ await writeFile3(markerPath, JSON.stringify(value, null, 2), "utf8");
498
549
  }
499
550
  async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
500
551
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -504,7 +555,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
504
555
  configured: dense.enabled,
505
556
  modelId: dense.modelId,
506
557
  cacheDir: denseCacheDir,
507
- available: await fileExists(densePullMarker(workspacePath)),
558
+ available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
508
559
  artifactExists: await fileExists(denseVectorPath(workspacePath))
509
560
  },
510
561
  sparse: {
@@ -512,22 +563,64 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
512
563
  modelId: sparse.modelId,
513
564
  cacheDir: sparseCacheDir,
514
565
  uvAvailable,
515
- available: await fileExists(sparsePullMarker(workspacePath)),
566
+ available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
516
567
  artifactExists: await fileExists(sparseVectorPath(workspacePath))
517
568
  }
518
569
  };
519
570
  }
520
571
 
521
572
  // src/vector/text.ts
573
+ var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
574
+ "choose this instead of",
575
+ "how xyz runs it",
576
+ "naechste schritte",
577
+ "next steps",
578
+ "overview",
579
+ "passend wenn",
580
+ "problem",
581
+ "right fit",
582
+ "waehlen sie das stattdessen",
583
+ "was sie bekommen",
584
+ "what you get",
585
+ "wie xyz es umsetzt",
586
+ "uberblick",
587
+ "\xFCberblick"
588
+ ]);
589
+ function normalizeHeading(value) {
590
+ return value.trim().toLowerCase();
591
+ }
592
+ function isLowSignalHeading(value) {
593
+ return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
594
+ }
595
+ function stripLeadingHeading(text, heading) {
596
+ const lines = text.split("\n");
597
+ const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
598
+ if (firstContentIndex < 0) {
599
+ return text;
600
+ }
601
+ const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
602
+ if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
603
+ return text;
604
+ }
605
+ const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
606
+ return next;
607
+ }
608
+ function createVectorText(chunk) {
609
+ const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
610
+ const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
611
+ const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
612
+ return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
613
+ }
522
614
  function createDenseChunkText(chunk) {
523
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
615
+ return createVectorText(chunk);
524
616
  }
525
617
  function createSparseChunkText(chunk) {
526
- return [chunk.title, ...chunk.headingPath, chunk.text].filter(Boolean).join("\n\n");
618
+ return createVectorText(chunk);
527
619
  }
528
620
 
529
621
  // src/vector/dense.ts
530
622
  var denseEmbedderFactory = null;
623
+ var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
531
624
  async function createEmbedder(cacheDir, modelId) {
532
625
  if (denseEmbedderFactory) {
533
626
  return denseEmbedderFactory(cacheDir, modelId);
@@ -539,6 +632,9 @@ async function createEmbedder(cacheDir, modelId) {
539
632
  return output.tolist()[0];
540
633
  };
541
634
  }
635
+ function exactDenseQuery(payload, vector, topK) {
636
+ return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
637
+ }
542
638
  async function pullDenseModel(workspacePath, config) {
543
639
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
544
640
  await mkdir4(cacheDir, { recursive: true });
@@ -547,7 +643,8 @@ async function pullDenseModel(workspacePath, config) {
547
643
  }
548
644
  async function buildDenseVectors({
549
645
  workspacePath,
550
- config
646
+ config,
647
+ progress
551
648
  }) {
552
649
  const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
553
650
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
@@ -555,6 +652,7 @@ async function buildDenseVectors({
555
652
  const embed = await createEmbedder(cacheDir, config.modelId);
556
653
  const records = [];
557
654
  let dimensions = 0;
655
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
558
656
  for (const chunk of chunks) {
559
657
  const embedding = await embed(createDenseChunkText(chunk));
560
658
  dimensions ||= embedding.length;
@@ -568,7 +666,11 @@ async function buildDenseVectors({
568
666
  text: chunk.text,
569
667
  embedding
570
668
  });
669
+ if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
670
+ reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
671
+ }
571
672
  }
673
+ reportProgress(progress, "Building dense vector index");
572
674
  const index = new VectorFieldIndex({
573
675
  numHashTables: config.indexHashTables,
574
676
  dimensions,
@@ -592,6 +694,7 @@ async function buildDenseVectors({
592
694
  chunks: records
593
695
  };
594
696
  await writeDensePayload(workspacePath, payload);
697
+ reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
595
698
  return payload;
596
699
  }
597
700
  async function denseQuery({
@@ -604,12 +707,19 @@ async function denseQuery({
604
707
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
605
708
  const embed = await createEmbedder(cacheDir, config.modelId);
606
709
  const vector = await embed(query);
710
+ if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
711
+ return exactDenseQuery(payload, vector, topK);
712
+ }
607
713
  const index = new VectorFieldIndex({
608
714
  numHashTables: payload.metadata.hashTables,
609
715
  dimensions: payload.metadata.dimensions,
610
716
  random: createSeededRandom(payload.metadata.randomSeed)
611
717
  }).loadState(payload.indexState);
612
- return index.query(vector, topK);
718
+ const approximateHits = index.query(vector, topK);
719
+ if (approximateHits.length >= topK) {
720
+ return approximateHits;
721
+ }
722
+ return exactDenseQuery(payload, vector, topK);
613
723
  }
614
724
 
615
725
  // src/vector/sparse.ts
@@ -717,10 +827,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
717
827
  }
718
828
  async function buildSparseVectors({
719
829
  workspacePath,
720
- config
830
+ config,
831
+ progress
721
832
  }) {
722
833
  const chunks = await readJsonl(path9.join(workspacePath, "chunks", "chunks.jsonl"));
834
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
723
835
  const built = await buildSparseDocuments(workspacePath, config, chunks);
836
+ reportProgress(progress, "Building sparse vector index");
724
837
  const index = new SparseVectorFieldIndex();
725
838
  for (const record of built.chunks) {
726
839
  index.insert(record.chunkId, [record.vector]);
@@ -742,6 +855,7 @@ async function buildSparseVectors({
742
855
  queryTokenWeights: built.queryTokenWeights
743
856
  };
744
857
  await writeSparsePayload(workspacePath, payload);
858
+ reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
745
859
  return payload;
746
860
  }
747
861
  async function sparseQuery({
@@ -759,6 +873,7 @@ async function sparseQuery({
759
873
  }
760
874
 
761
875
  // src/vector/service.ts
876
+ var pullModelsOverrideForTests = null;
762
877
  function resolveModelPullPlan({
763
878
  pullDenseFlag,
764
879
  pullSparseFlag,
@@ -775,61 +890,75 @@ function resolveModelPullPlan({
775
890
  pullSparse: uvAvailable
776
891
  };
777
892
  }
893
+ function resolveMissingConfiguredModelPullPlan({
894
+ config,
895
+ status
896
+ }) {
897
+ return {
898
+ pullDense: config.retrieval.dense.enabled && !status.dense.available,
899
+ pullSparse: config.retrieval.sparse.enabled && status.sparse.uvAvailable && !status.sparse.available
900
+ };
901
+ }
778
902
  async function buildVectorArtifacts({
779
903
  workspacePath,
780
904
  config,
781
905
  denseOverride,
782
906
  sparseOverride,
783
- buildAvailableModels = false
907
+ buildAvailableModels = false,
908
+ progress
784
909
  }) {
785
- const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, await (async () => {
786
- try {
787
- await ensureUvAvailable();
788
- return true;
789
- } catch {
790
- return false;
791
- }
792
- })()) : null;
910
+ const uvAvailable = await isUvAvailable();
911
+ const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
793
912
  const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
794
- const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
795
- const result2 = {};
913
+ const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
914
+ const result = {};
796
915
  if (denseEnabled) {
797
- result2.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
916
+ reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
917
+ result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
918
+ }
919
+ if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
920
+ reportProgress(progress, "Skipping sparse vectors because uv is not available");
798
921
  }
799
922
  if (sparseEnabled) {
800
- result2.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
923
+ reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
924
+ result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
801
925
  }
802
- return result2;
926
+ return result;
803
927
  }
804
928
  async function pullModels({
805
929
  workspacePath,
806
930
  config,
807
931
  pullDense,
808
- pullSparse
932
+ pullSparse,
933
+ progress
809
934
  }) {
935
+ if (pullModelsOverrideForTests) {
936
+ await pullModelsOverrideForTests({ workspacePath, config, pullDense, pullSparse, progress });
937
+ return;
938
+ }
810
939
  if (pullDense) {
940
+ reportProgress(progress, `Pulling dense model ${config.retrieval.dense.modelId}`);
811
941
  await pullDenseModel(workspacePath, config.retrieval.dense);
812
- await writeDensePullMarker(workspacePath, {
942
+ await writeDensePullMarker(workspacePath, config.retrieval.dense, {
813
943
  pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
814
- modelId: config.retrieval.dense.modelId
944
+ modelId: config.retrieval.dense.modelId,
945
+ cacheDir: config.retrieval.dense.cacheDir
815
946
  });
947
+ reportProgress(progress, `Dense model ready: ${config.retrieval.dense.modelId}`);
816
948
  }
817
949
  if (pullSparse) {
950
+ reportProgress(progress, `Pulling sparse model ${config.retrieval.sparse.modelId}`);
818
951
  await pullSparseModel(workspacePath, config.retrieval.sparse);
819
- await writeSparsePullMarker(workspacePath, {
952
+ await writeSparsePullMarker(workspacePath, config.retrieval.sparse, {
820
953
  pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
821
- modelId: config.retrieval.sparse.modelId
954
+ modelId: config.retrieval.sparse.modelId,
955
+ cacheDir: config.retrieval.sparse.cacheDir
822
956
  });
957
+ reportProgress(progress, `Sparse model ready: ${config.retrieval.sparse.modelId}`);
823
958
  }
824
959
  }
825
960
  async function getModelStatus(workspacePath, config) {
826
- let uvAvailable = false;
827
- try {
828
- await ensureUvAvailable();
829
- uvAvailable = true;
830
- } catch {
831
- uvAvailable = false;
832
- }
961
+ const uvAvailable = await isUvAvailable();
833
962
  return buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
834
963
  }
835
964
 
@@ -900,14 +1029,17 @@ async function buildIndex({
900
1029
  workspacePath,
901
1030
  denseOverride,
902
1031
  sparseOverride,
903
- buildAvailableModels = false
1032
+ buildAvailableModels = false,
1033
+ progress
904
1034
  }) {
905
1035
  const config = await loadConfig(workspacePath);
1036
+ reportProgress(progress, "Loading documents, chunks, and sources");
906
1037
  const chunks = await readJsonl(path11.join(workspacePath, "chunks", "chunks.jsonl"));
907
1038
  const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
908
1039
  const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
909
1040
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
910
1041
  const index = new DocumentIndex(createIndexMapping(metadataFields));
1042
+ reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
911
1043
  for (const chunk of chunks) {
912
1044
  index.index({
913
1045
  id: chunk.id,
@@ -922,6 +1054,7 @@ async function buildIndex({
922
1054
  }
923
1055
  });
924
1056
  }
1057
+ reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
925
1058
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
926
1059
  const metadata = {
927
1060
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
@@ -934,14 +1067,17 @@ async function buildIndex({
934
1067
  fields: Object.keys(index.mapping),
935
1068
  indexHash: sha256(JSON.stringify(index.indexState))
936
1069
  };
1070
+ reportProgress(progress, "Writing lexical index artifacts");
937
1071
  const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
938
1072
  const vectors = await buildVectorArtifacts({
939
1073
  workspacePath,
940
1074
  config,
941
1075
  denseOverride,
942
1076
  sparseOverride,
943
- buildAvailableModels
1077
+ buildAvailableModels,
1078
+ progress
944
1079
  });
1080
+ reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
945
1081
  return {
946
1082
  metadata,
947
1083
  indexPath: artifacts.indexPath,
@@ -953,6 +1089,27 @@ async function buildIndex({
953
1089
  // src/ingest/ingest-service.ts
954
1090
  import path17 from "path";
955
1091
 
1092
+ // src/core/concurrency.ts
1093
+ async function mapWithConcurrency(items, limit, worker) {
1094
+ if (items.length === 0) {
1095
+ return;
1096
+ }
1097
+ const concurrency = Math.max(1, Math.floor(limit));
1098
+ let nextIndex = 0;
1099
+ await Promise.all(
1100
+ Array.from({ length: Math.min(concurrency, items.length) }, async () => {
1101
+ while (true) {
1102
+ const index = nextIndex;
1103
+ nextIndex += 1;
1104
+ if (index >= items.length) {
1105
+ return;
1106
+ }
1107
+ await worker(items[index], index);
1108
+ }
1109
+ })
1110
+ );
1111
+ }
1112
+
956
1113
  // src/core/runs.ts
957
1114
  import path12 from "path";
958
1115
  async function writeRun(workspacePath, run) {
@@ -1127,8 +1284,8 @@ import { mkdir as mkdir7, readFile as readFile9, stat as stat3, writeFile as wri
1127
1284
  // src/ingest/extractors/docx-extractor.ts
1128
1285
  import mammoth from "mammoth";
1129
1286
  async function extractDocx(filePath) {
1130
- const result2 = await mammoth.extractRawText({ path: filePath });
1131
- return result2.value;
1287
+ const result = await mammoth.extractRawText({ path: filePath });
1288
+ return result.value;
1132
1289
  }
1133
1290
 
1134
1291
  // src/ingest/extractors/html-extractor.ts
@@ -1142,9 +1299,41 @@ function stripBoilerplate(html) {
1142
1299
 
1143
1300
  // src/ingest/extractors/html-extractor.ts
1144
1301
  var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
1302
+ var LOW_SIGNAL_SECTION_SELECTORS = [
1303
+ "script",
1304
+ "style",
1305
+ "noscript",
1306
+ "template",
1307
+ "[data-blog-service-recommendations]",
1308
+ "[data-blog-related-posts]"
1309
+ ].join(", ");
1145
1310
  function cleanText(value) {
1146
1311
  return value.replace(/\s+/g, " ").trim();
1147
1312
  }
1313
+ function pruneLowSignalContent($) {
1314
+ $(LOW_SIGNAL_SECTION_SELECTORS).remove();
1315
+ $("form").each((_, element) => {
1316
+ const action = cleanText($(element).attr("action") ?? "");
1317
+ if (action.includes("substack.com/subscribe")) {
1318
+ $(element).closest("section").remove();
1319
+ }
1320
+ });
1321
+ }
1322
+ function stripEscapedJsonPayloads(markdown) {
1323
+ return markdown.split("\n").filter((line) => {
1324
+ const trimmed = line.trim();
1325
+ if (trimmed.length === 0) {
1326
+ return true;
1327
+ }
1328
+ if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
1329
+ return false;
1330
+ }
1331
+ if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
1332
+ return false;
1333
+ }
1334
+ return true;
1335
+ }).join("\n").replace(/\n{3,}/g, "\n\n").trim();
1336
+ }
1148
1337
  function chooseMeaningfulTitle($, fallbackTitle) {
1149
1338
  const candidates = [
1150
1339
  cleanText($("meta[property='og:title']").attr("content") ?? ""),
@@ -1181,14 +1370,27 @@ ${parts.join("\n\n")}
1181
1370
  function extractHtmlToMarkdown(html) {
1182
1371
  const cleaned = stripBoilerplate(html);
1183
1372
  const $ = load(cleaned);
1373
+ pruneLowSignalContent($);
1184
1374
  const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
1185
1375
  const title = chooseMeaningfulTitle($, fallbackTitle);
1186
1376
  const root = $("main").first().html() ?? $.root().html() ?? cleaned;
1187
1377
  return {
1188
- markdown: turndown.turndown(root),
1378
+ markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
1189
1379
  title
1190
1380
  };
1191
1381
  }
1382
+ function extractCanonicalUriFromHtml(html, baseUrl) {
1383
+ const $ = load(html);
1384
+ const href = $("link[rel='canonical']").first().attr("href")?.trim();
1385
+ if (!href) {
1386
+ return null;
1387
+ }
1388
+ try {
1389
+ return new URL(href, baseUrl).href;
1390
+ } catch {
1391
+ return null;
1392
+ }
1393
+ }
1192
1394
  function parseDateCandidate(value) {
1193
1395
  const trimmed = value.trim();
1194
1396
  if (!trimmed) {
@@ -1593,6 +1795,19 @@ async function parseRssFeedDocument(xml, source) {
1593
1795
  // src/ingest/adapters/url-adapter.ts
1594
1796
  import { mkdir as mkdir8, readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
1595
1797
  import path16 from "path";
1798
+
1799
+ // src/core/urls.ts
1800
+ function normalizeRemoteUrl(uri) {
1801
+ try {
1802
+ const parsed = new URL(uri);
1803
+ parsed.hash = "";
1804
+ return parsed.href;
1805
+ } catch {
1806
+ return uri;
1807
+ }
1808
+ }
1809
+
1810
+ // src/ingest/adapters/url-adapter.ts
1596
1811
  function buildHttpCache(response2, validatedAt) {
1597
1812
  return {
1598
1813
  etag: response2.headers.get("etag") ?? void 0,
@@ -1617,12 +1832,13 @@ async function normalizeRemoteDocument({
1617
1832
  responseStatus
1618
1833
  }) {
1619
1834
  const extracted = extractHtmlToMarkdown(body);
1835
+ const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
1620
1836
  const markdown = `# ${extracted.title}
1621
1837
 
1622
1838
  ${extracted.markdown}`;
1623
- const documentId = stableId("doc", source.id, url);
1839
+ const documentId = stableId("doc", source.id, canonicalUri);
1624
1840
  const normalizedPath = path16.resolve(workspacePath, "normalized", `${documentId}.md`);
1625
- const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(url).slice(0, 12)}.html`);
1841
+ const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
1626
1842
  const contentHash = sha256(markdown);
1627
1843
  const now = (/* @__PURE__ */ new Date()).toISOString();
1628
1844
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
@@ -1635,7 +1851,7 @@ ${extracted.markdown}`;
1635
1851
  documentId,
1636
1852
  sourceId: source.id,
1637
1853
  title: extracted.title,
1638
- uri: url,
1854
+ uri: canonicalUri,
1639
1855
  sourceUri,
1640
1856
  publicationDate: resolvedPublicationDate,
1641
1857
  crawledAt,
@@ -1650,8 +1866,9 @@ ${extracted.markdown}`;
1650
1866
  sourceId: source.id,
1651
1867
  sourceType: source.type,
1652
1868
  title: extracted.title,
1653
- uri: url,
1869
+ uri: canonicalUri,
1654
1870
  sourceUri,
1871
+ canonicalUri,
1655
1872
  mimeType: "text/html",
1656
1873
  rawPath,
1657
1874
  normalizedPath,
@@ -1825,6 +2042,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1825
2042
  if (url.origin !== baseUrl.origin) {
1826
2043
  return false;
1827
2044
  }
2045
+ if (url.search.length > 0) {
2046
+ return false;
2047
+ }
2048
+ if (url.pathname.endsWith(".xml")) {
2049
+ return false;
2050
+ }
2051
+ if (url.pathname.includes("/cdn-cgi/")) {
2052
+ return false;
2053
+ }
2054
+ if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
2055
+ return false;
2056
+ }
1828
2057
  if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
1829
2058
  return false;
1830
2059
  }
@@ -1837,56 +2066,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1837
2066
  }
1838
2067
  return true;
1839
2068
  }
1840
- async function crawlWebsite(source) {
2069
/**
 * Returns a promise that settles (with no value) once `ms` milliseconds have
 * elapsed on the timer queue.
 *
 * @param ms Number of milliseconds to wait.
 */
function delay(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
2072
+ async function crawlWebsite(source, defaults, progress) {
1841
2073
  const baseUrl = new URL(source.uri);
1842
- const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
2074
+ const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
1843
2075
  const includePatterns = source.crawl?.includePatterns ?? [];
1844
2076
  const excludePatterns = source.crawl?.excludePatterns ?? [];
1845
2077
  const maxDepth = source.crawl?.maxDepth ?? 2;
1846
2078
  const maxPages = source.crawl?.maxPages ?? 100;
1847
- const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
2079
+ const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
2080
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
1848
2081
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
1849
- const queue = [{ url: source.uri, depth: 0 }];
1850
2082
  const seen = /* @__PURE__ */ new Set();
1851
2083
  const results = [];
2084
+ let currentLevel = [normalizeRemoteUrl(source.uri)];
1852
2085
  if (source.crawl?.useSitemap !== false) {
1853
- for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
1854
- queue.push({ url, depth: 1 });
1855
- }
1856
- }
1857
- while (queue.length > 0 && results.length < maxPages) {
1858
- const next = queue.shift();
1859
- if (!next || seen.has(next.url)) {
1860
- continue;
1861
- }
1862
- seen.add(next.url);
1863
- const url = new URL(next.url);
1864
- if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1865
- continue;
2086
+ const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
2087
+ reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
2088
+ currentLevel = [
2089
+ ...currentLevel,
2090
+ ...sitemapUrls
2091
+ ];
2092
+ }
2093
+ for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
2094
+ reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
2095
+ const nextLevelCandidates = [];
2096
+ const allowedUrls = [];
2097
+ for (const candidate of currentLevel) {
2098
+ const normalizedCandidate = normalizeRemoteUrl(candidate);
2099
+ if (seen.has(normalizedCandidate)) {
2100
+ continue;
2101
+ }
2102
+ seen.add(normalizedCandidate);
2103
+ const url = new URL(normalizedCandidate);
2104
+ if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
2105
+ continue;
2106
+ }
2107
+ allowedUrls.push(normalizedCandidate);
2108
+ results.push(normalizedCandidate);
2109
+ reportProgress(progress, `Discovered ${normalizedCandidate}`);
2110
+ if (results.length >= maxPages) {
2111
+ break;
2112
+ }
1866
2113
  }
1867
- results.push(url.href);
1868
- if (next.depth >= maxDepth) {
1869
- continue;
2114
+ reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
2115
+ if (depth >= maxDepth || results.length >= maxPages) {
2116
+ break;
1870
2117
  }
1871
- const response2 = await fetch(url, { headers: { "user-agent": userAgent } });
1872
- const html = await response2.text();
1873
- const $ = load2(html);
1874
- $("a[href]").each((_, element) => {
1875
- const href = $(element).attr("href");
1876
- if (!href) {
1877
- return;
1878
- }
1879
- try {
1880
- const target = new URL(href, url);
1881
- if (!seen.has(target.href)) {
1882
- queue.push({ url: target.href, depth: next.depth + 1 });
2118
+ await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
2119
+ const page = new URL(pageUrl);
2120
+ const response2 = await fetch(page, { headers: { "user-agent": userAgent } });
2121
+ const html = await response2.text();
2122
+ const $ = load2(html);
2123
+ $("a[href]").each((_, element) => {
2124
+ const href = $(element).attr("href");
2125
+ if (!href) {
2126
+ return;
2127
+ }
2128
+ try {
2129
+ nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
2130
+ } catch {
1883
2131
  }
1884
- } catch {
2132
+ });
2133
+ if (rateLimitMs > 0) {
2134
+ await delay(rateLimitMs);
1885
2135
  }
1886
2136
  });
1887
- if (rateLimitMs > 0) {
1888
- await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
1889
- }
2137
+ currentLevel = nextLevelCandidates;
1890
2138
  }
1891
2139
  return results;
1892
2140
  }
@@ -1961,6 +2209,8 @@ async function ingestRssSource({
1961
2209
  source,
1962
2210
  previous,
1963
2211
  nextDocuments,
2212
+ maxConcurrentRequests,
2213
+ onDocumentProcessed,
1964
2214
  onFailure
1965
2215
  }) {
1966
2216
  if (source.crawl?.fetchArticles === false) {
@@ -1968,11 +2218,12 @@ async function ingestRssSource({
1968
2218
  }
1969
2219
  const xml = await fetchFeedText(source);
1970
2220
  const items = await parseRssFeedDocument(xml, source);
2221
+ const processedDocumentIds = /* @__PURE__ */ new Set();
1971
2222
  let added = 0;
1972
2223
  let changed = 0;
1973
2224
  let unchanged = 0;
1974
2225
  let failed = 0;
1975
- for (const item of items) {
2226
+ await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
1976
2227
  try {
1977
2228
  const probe = previous.get(stableId("doc", source.id, item.url));
1978
2229
  const document = await fetchUrlDocument({
@@ -1983,28 +2234,40 @@ async function ingestRssSource({
1983
2234
  sourceUri: source.uri,
1984
2235
  publicationDate: item.publicationDate
1985
2236
  });
2237
+ if (processedDocumentIds.has(document.id)) {
2238
+ return;
2239
+ }
2240
+ processedDocumentIds.add(document.id);
2241
+ const existingDocument = probe ?? previous.get(document.id);
1986
2242
  nextDocuments.set(document.id, document);
1987
- if (!probe) {
2243
+ if (!existingDocument) {
1988
2244
  added += 1;
1989
- } else if (probe.contentHash !== document.contentHash) {
2245
+ onDocumentProcessed?.(document.uri, "added");
2246
+ } else if (existingDocument.contentHash !== document.contentHash) {
1990
2247
  changed += 1;
2248
+ onDocumentProcessed?.(document.uri, "changed");
1991
2249
  } else {
1992
2250
  unchanged += 1;
2251
+ onDocumentProcessed?.(document.uri, "unchanged");
1993
2252
  }
1994
2253
  } catch (error) {
1995
2254
  failed += 1;
1996
2255
  onFailure(item.url, error);
1997
2256
  }
1998
- }
2257
+ });
1999
2258
  return { added, changed, unchanged, failed };
2000
2259
  }
2001
2260
  async function ingestSources({
2002
2261
  workspacePath,
2003
2262
  sourceIds,
2004
- changedOnly = false
2263
+ changedOnly = false,
2264
+ progress
2005
2265
  }) {
2006
2266
  const config = await loadConfig(workspacePath);
2007
2267
  const defaultRetentionDays = config.crawler.retentionDays;
2268
+ const defaultUserAgent = config.crawler.defaultUserAgent;
2269
+ const defaultRateLimitMs = config.crawler.rateLimitMs;
2270
+ const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
2008
2271
  const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
2009
2272
  const existing = await loadDocuments(workspacePath);
2010
2273
  const previous = previousMap(existing);
@@ -2014,20 +2277,38 @@ async function ingestSources({
2014
2277
  let unchanged = 0;
2015
2278
  let failed = 0;
2016
2279
  const failures = [];
2280
+ reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
2017
2281
  for (const source of sources) {
2282
+ const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
2283
+ const sourceBefore = { added, changed, unchanged, failed };
2284
+ const processedDocumentIds = /* @__PURE__ */ new Set();
2285
+ const reportDocumentOutcome = (uri, outcome) => {
2286
+ const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
2287
+ reportProgress(progress, `${label} ${uri}`);
2288
+ };
2018
2289
  const ingestOne = async (uri, producer) => {
2019
2290
  try {
2020
2291
  const probeId = stableId("doc", source.id, uri);
2021
2292
  const earlier = previous.get(probeId);
2022
2293
  const document = await producer();
2294
+ if (processedDocumentIds.has(document.id)) {
2295
+ reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
2296
+ return null;
2297
+ }
2298
+ processedDocumentIds.add(document.id);
2299
+ const existingDocument = earlier ?? previous.get(document.id);
2023
2300
  nextDocuments.set(document.id, document);
2024
- if (!earlier) {
2301
+ if (!existingDocument) {
2025
2302
  added += 1;
2026
- } else if (earlier.contentHash !== document.contentHash) {
2303
+ reportDocumentOutcome(document.uri, "added");
2304
+ } else if (existingDocument.contentHash !== document.contentHash) {
2027
2305
  changed += 1;
2306
+ reportDocumentOutcome(document.uri, "changed");
2028
2307
  } else {
2029
2308
  unchanged += 1;
2309
+ reportDocumentOutcome(document.uri, "unchanged");
2030
2310
  }
2311
+ return document;
2031
2312
  } catch (error) {
2032
2313
  failed += 1;
2033
2314
  failures.push({
@@ -2035,50 +2316,69 @@ async function ingestSources({
2035
2316
  uri,
2036
2317
  message: error instanceof Error ? error.message : String(error)
2037
2318
  });
2319
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
2320
+ return null;
2038
2321
  }
2039
2322
  };
2040
2323
  try {
2324
+ reportProgress(progress, `Source ${source.name} (${source.type})`);
2041
2325
  if (source.type === "file") {
2326
+ reportProgress(progress, `Reading file ${source.uri}`);
2042
2327
  await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
2043
- continue;
2044
- }
2045
- if (source.type === "directory") {
2046
- for (const filePath of await listDirectoryFiles(source)) {
2328
+ } else if (source.type === "directory") {
2329
+ const files = await listDirectoryFiles(source);
2330
+ reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
2331
+ for (const filePath of files) {
2332
+ reportProgress(progress, `Reading file ${filePath}`);
2047
2333
  await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
2048
2334
  }
2049
- continue;
2050
- }
2051
- if (source.type === "url") {
2335
+ } else if (source.type === "url") {
2336
+ reportProgress(progress, `Fetching ${source.uri}`);
2052
2337
  await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
2053
- continue;
2054
- }
2055
- if (source.type === "website") {
2056
- for (const url of await crawlWebsite(source)) {
2057
- await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
2058
- }
2059
- continue;
2060
- }
2061
- if (source.type === "rss") {
2062
- const result2 = await ingestRssSource({
2338
+ } else if (source.type === "website") {
2339
+ reportProgress(progress, `Crawling ${source.uri}`);
2340
+ const urls = await crawlWebsite(source, {
2341
+ userAgent: defaultUserAgent,
2342
+ rateLimitMs: defaultRateLimitMs,
2343
+ maxConcurrentRequests
2344
+ }, progress);
2345
+ reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
2346
+ const seenCanonicalUrls = /* @__PURE__ */ new Set();
2347
+ await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
2348
+ if (seenCanonicalUrls.has(url)) {
2349
+ reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
2350
+ return;
2351
+ }
2352
+ reportProgress(progress, `Fetching ${url}`);
2353
+ const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
2354
+ if (document) {
2355
+ seenCanonicalUrls.add(document.uri);
2356
+ }
2357
+ });
2358
+ } else if (source.type === "rss") {
2359
+ reportProgress(progress, `Fetching feed ${source.uri}`);
2360
+ const result = await ingestRssSource({
2063
2361
  workspacePath,
2064
2362
  source,
2065
2363
  previous,
2066
2364
  nextDocuments,
2365
+ maxConcurrentRequests,
2366
+ onDocumentProcessed: reportDocumentOutcome,
2067
2367
  onFailure: (uri, error) => {
2068
2368
  failures.push({
2069
2369
  sourceId: source.id,
2070
2370
  uri,
2071
2371
  message: error instanceof Error ? error.message : String(error)
2072
2372
  });
2373
+ reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
2073
2374
  }
2074
2375
  });
2075
- added += result2.added;
2076
- changed += result2.changed;
2077
- unchanged += result2.unchanged;
2078
- failed += result2.failed;
2079
- continue;
2080
- }
2081
- if (source.type === "markdown" || source.type === "text") {
2376
+ added += result.added;
2377
+ changed += result.changed;
2378
+ unchanged += result.unchanged;
2379
+ failed += result.failed;
2380
+ } else if (source.type === "markdown" || source.type === "text") {
2381
+ reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
2082
2382
  await ingestOne(source.uri, () => ingestInlineContent({
2083
2383
  workspacePath,
2084
2384
  source,
@@ -2095,13 +2395,19 @@ async function ingestSources({
2095
2395
  uri: source.uri,
2096
2396
  message: error instanceof Error ? error.message : String(error)
2097
2397
  });
2398
+ reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
2098
2399
  }
2400
+ reportProgress(
2401
+ progress,
2402
+ `Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
2403
+ );
2099
2404
  }
2100
2405
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
2101
2406
  const source = sources.find((candidate) => candidate.id === document.sourceId);
2102
2407
  return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
2103
2408
  });
2104
2409
  if (expiringDocuments.length > 0) {
2410
+ reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
2105
2411
  const expiredIds = new Set(expiringDocuments.map((document) => document.id));
2106
2412
  for (const document of expiringDocuments) {
2107
2413
  nextDocuments.delete(document.id);
@@ -2128,6 +2434,7 @@ async function ingestSources({
2128
2434
  documentsSnapshot: documentSnapshot(finalDocuments)
2129
2435
  };
2130
2436
  await writeRun(workspacePath, run);
2437
+ reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
2131
2438
  return {
2132
2439
  runId: id,
2133
2440
  documents: { added, changed, unchanged, failed },
@@ -2137,7 +2444,8 @@ async function ingestSources({
2137
2444
  async function reprocessDocuments({
2138
2445
  workspacePath,
2139
2446
  sourceId,
2140
- documentId
2447
+ documentId,
2448
+ progress
2141
2449
  }) {
2142
2450
  const documents = await loadDocuments(workspacePath);
2143
2451
  const sources = await listSources(workspacePath);
@@ -2145,15 +2453,20 @@ async function reprocessDocuments({
2145
2453
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
2146
2454
  let documentsReprocessed = 0;
2147
2455
  let documentsSkipped = 0;
2148
- for (const document of documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId))) {
2456
+ const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
2457
+ reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
2458
+ for (const document of targets) {
2459
+ reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
2149
2460
  const source = sourceMap.get(document.sourceId);
2150
2461
  if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
2151
2462
  documentsSkipped += 1;
2463
+ reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
2152
2464
  continue;
2153
2465
  }
2154
2466
  const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
2155
2467
  if (!updated) {
2156
2468
  documentsSkipped += 1;
2469
+ reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
2157
2470
  continue;
2158
2471
  }
2159
2472
  nextDocuments.set(updated.id, updated);
@@ -2173,15 +2486,217 @@ async function reprocessDocuments({
2173
2486
  },
2174
2487
  documentsSnapshot: documentSnapshot(finalDocuments)
2175
2488
  });
2489
+ reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
2176
2490
  return { runId: id, documentsReprocessed, documentsSkipped };
2177
2491
  }
2178
2492
 
2493
+ // src/ingest/adapters/website-feed-discovery.ts
2494
+ import { load as load3 } from "cheerio";
2495
// Conventional feed locations probed when a site does not declare a feed.
// Array position is significant: it becomes the candidate's `order`.
var COMMON_FEED_PATHS = [
  // site root
  "/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml",
  // /blog section
  "/blog/feed", "/blog/feed.xml", "/blog/rss.xml", "/blog/atom.xml", "/blog/index.xml",
  // /news section
  "/news/feed", "/news/feed.xml", "/news/rss.xml", "/news/atom.xml", "/news/index.xml"
];
2513
/**
 * Resolves a possibly-relative `href` against `baseUrl` and keeps it only if
 * the result uses the http or https scheme.
 *
 * @param href Link target as written in the page.
 * @param baseUrl Base used to resolve relative targets.
 * @returns The absolute URL string, or `null` when resolution fails or the
 *          scheme is not http(s).
 */
function normalizeCandidateUrl(href, baseUrl) {
  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  const isHttp = resolved.protocol === "http:" || resolved.protocol === "https:";
  return isHttp ? resolved.href : null;
}
2524
/**
 * Heuristic: does a `<link>` with this `type` attribute and target look like
 * a syndication feed? Matching is case-insensitive.
 *
 * @param typeHint Value of the `type` attribute, if any.
 * @param href Link target as written in the page.
 * @returns `true` when the type mentions rss/atom/xml, or the target contains
 *          `/feed`, `/rss`, `/atom`, or ends in `.xml`.
 */
function looksLikeFeedLink(typeHint, href) {
  const type = (typeHint ?? "").toLowerCase();
  const lowerHref = href.toLowerCase();
  const typeMarkers = ["rss", "atom", "xml"];
  if (typeMarkers.some((marker) => type.includes(marker))) {
    return true;
  }
  const pathMarkers = ["/feed", "/rss", "/atom"];
  if (pathMarkers.some((marker) => lowerHref.includes(marker))) {
    return true;
  }
  return lowerHref.endsWith(".xml");
}
2529
/**
 * Collects feed candidates that the page itself advertises through
 * `<link rel="alternate" ...>` elements.
 *
 * @param html Markup of the page to inspect.
 * @param baseUrl Base used to resolve relative link targets.
 * @returns Candidate records (`url`, `discoveredBy: "declared"`, `order`,
 *          `typeHint`) in document order; `order` is the element's position
 *          among all `link[href]` elements.
 */
function extractDeclaredFeedCandidates(html, baseUrl) {
  const $ = load3(html);
  const declared = [];
  $("link[href]").each((position, node) => {
    const link = $(node);
    const relTokens = (link.attr("rel") ?? "")
      .split(/\s+/)
      .map((token) => token.trim().toLowerCase())
      .filter(Boolean);
    const href = link.attr("href");
    if (!href || !relTokens.includes("alternate")) {
      return;
    }
    const typeHint = link.attr("type") ?? void 0;
    if (!looksLikeFeedLink(typeHint, href)) {
      return;
    }
    const url = normalizeCandidateUrl(href, baseUrl);
    if (url) {
      declared.push({ url, discoveredBy: "declared", order: position, typeHint });
    }
  });
  return declared;
}
2555
/**
 * Builds one candidate per entry of `COMMON_FEED_PATHS`, resolved against
 * `baseUrl`.
 *
 * @param baseUrl Site URL the conventional paths are resolved against.
 * @returns Candidate records tagged `discoveredBy: "common"`, with `order`
 *          equal to the path's index in `COMMON_FEED_PATHS`.
 */
function buildCommonFeedCandidates(baseUrl) {
  const candidates = [];
  COMMON_FEED_PATHS.forEach((pathname, order) => {
    candidates.push({
      url: new URL(pathname, baseUrl).href,
      discoveredBy: "common",
      order
    });
  });
  return candidates;
}
2562
/**
 * Drops candidates whose `url` has already been seen, keeping the first
 * occurrence of each and preserving input order.
 *
 * @param candidates Candidate records to filter.
 * @returns A new array holding the first candidate for every distinct `url`.
 */
function dedupeCandidates(candidates) {
  // A Map iterates in insertion order, so first-seen order is retained.
  const firstByUrl = /* @__PURE__ */ new Map();
  for (const candidate of candidates) {
    if (!firstByUrl.has(candidate.url)) {
      firstByUrl.set(candidate.url, candidate);
    }
  }
  return [...firstByUrl.values()];
}
2574
/**
 * Heuristic: does a fetched response look like an RSS/Atom/RDF feed?
 * Matching is case-insensitive.
 *
 * @param contentType Value of the `content-type` header, or null/undefined.
 * @param body Response body text.
 * @returns `true` when the content type mentions rss or atom, or the body
 *          contains `<rss`, `<feed`, or `<rdf:rdf`.
 */
function looksLikeFeedDocument(contentType, body) {
  const type = (contentType ?? "").toLowerCase();
  const lowerBody = body.toLowerCase();
  if (type.includes("rss") || type.includes("atom")) {
    return true;
  }
  // A generic XML content type adds nothing on its own: the body markers
  // decide the outcome whether or not the type mentions "xml".
  const rootMarkers = ["<rss", "<feed", "<rdf:rdf"];
  return rootMarkers.some((marker) => lowerBody.includes(marker));
}
2579
/**
 * Tells whether a path segment may take part in a shared URL prefix: it must
 * be a non-empty string containing at least one ASCII letter.
 *
 * @param segment Path segment to test (may be undefined).
 */
function hasStablePrefixSegment(segment) {
  if (typeof segment !== "string" || segment.length === 0) {
    return false;
  }
  return /[a-z]/i.test(segment);
}
2582
/**
 * Derives the leading path prefix shared by a feed's items that live on the
 * website's own origin.
 *
 * @param itemUrls URLs of the feed items.
 * @param websiteOrigin Origin (scheme + host + port) the items must match.
 * @returns A prefix of the form `/a/b/`, or `undefined` when fewer than two
 *          same-origin items exist or they share no leading segment accepted
 *          by `hasStablePrefixSegment`.
 */
function deriveExcludePrefix(itemUrls, websiteOrigin) {
  const sameOriginPaths = [];
  for (const itemUrl of itemUrls) {
    let parsed;
    try {
      parsed = new URL(itemUrl);
    } catch {
      // Items with unparseable URLs are ignored.
      continue;
    }
    if (parsed.origin === websiteOrigin) {
      sameOriginPaths.push(parsed.pathname.split("/").filter(Boolean));
    }
  }
  if (sameOriginPaths.length < 2) {
    return void 0;
  }
  const [reference] = sameOriginPaths;
  // Walk the first item's segments for as long as every item agrees on them.
  let sharedDepth = 0;
  for (const segment of reference) {
    const agreed = hasStablePrefixSegment(segment) && sameOriginPaths.every((segments) => segments[sharedDepth] === segment);
    if (!agreed) {
      break;
    }
    sharedDepth += 1;
  }
  if (sharedDepth === 0) {
    return void 0;
  }
  return "/" + reference.slice(0, sharedDepth).join("/") + "/";
}
2614
/**
 * Ranks a feed candidate; higher scores are tried first.
 *
 * Scoring terms:
 *  - base: 1000 for declared candidates, 100 otherwise
 *  - minus the candidate's `order`
 *  - minus 10 per path segment
 *  - plus 25 when the type hint mentions rss or atom
 *  - plus 50 when the path is one of the root-level feed paths
 *  - minus 200 when the path contains "comments"
 *
 * @param candidate Record with `url`, `discoveredBy`, `order`, optional `typeHint`.
 * @returns The numeric score.
 */
function scoreCandidate(candidate) {
  const { pathname } = new URL(candidate.url);
  const depth = pathname.split("/").filter(Boolean).length;
  const hint = candidate.typeHint?.toLowerCase();
  const rootFeedPaths = ["/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml"];

  const base = candidate.discoveredBy === "declared" ? 1e3 : 100;
  const typeBonus = hint?.includes("rss") || hint?.includes("atom") ? 25 : 0;
  const rootBonus = rootFeedPaths.includes(pathname) ? 50 : 0;
  const commentsPenalty = pathname.includes("comments") ? 200 : 0;

  return base - candidate.order - depth * 10 + typeBonus + rootBonus - commentsPenalty;
}
2631
/**
 * Fetches a feed candidate and confirms that it really is a feed.
 *
 * @param candidate Record with `url` and `discoveredBy`.
 * @param websiteUrl URL object of the website; its `origin` scopes the
 *                   exclude-prefix derivation.
 * @param userAgent Value sent in the `user-agent` request header.
 * @returns `{ feedUrl, discoveredBy, excludePrefix }` on success, or `null`
 *          when the response is not OK, does not look like a feed, or any
 *          step throws (best-effort probe: errors are swallowed).
 */
async function validateCandidate(candidate, websiteUrl, userAgent) {
  try {
    const response2 = await fetch(candidate.url, { headers: { "user-agent": userAgent } });
    if (!response2.ok) {
      return null;
    }
    const body = await response2.text();
    const contentType = response2.headers.get("content-type");
    if (!looksLikeFeedDocument(contentType, body)) {
      return null;
    }
    // Placeholder source record, built only to satisfy the feed parser.
    const epoch = "1970-01-01T00:00:00.000Z";
    const placeholderSource = {
      id: "src_detected_feed",
      type: "rss",
      uri: candidate.url,
      name: "Detected Feed",
      enabled: true,
      tags: [],
      metadata: {},
      createdAt: epoch,
      updatedAt: epoch
    };
    const items = await parseRssFeedDocument(body, placeholderSource);
    const itemUrls = items.map((item) => item.url);
    return {
      feedUrl: candidate.url,
      discoveredBy: candidate.discoveredBy,
      excludePrefix: deriveExcludePrefix(itemUrls, websiteUrl.origin)
    };
  } catch {
    return null;
  }
}
2662
/**
 * Tries to locate a working feed for a website.
 *
 * Fetches the page at `websiteUrl`, gathers feeds it declares plus the
 * conventional feed paths, removes duplicate URLs, ranks them with
 * `scoreCandidate` (highest first) and validates them one at a time.
 *
 * @param websiteUrl URL of the website to inspect.
 * @param userAgent Value sent in the `user-agent` request header.
 * @returns The first candidate that validates, or `null` when the page fetch
 *          is not OK, no candidate validates, or any step throws.
 */
async function discoverWebsiteFeed(websiteUrl, userAgent) {
  try {
    const baseUrl = new URL(websiteUrl);
    const homepage = await fetch(baseUrl, { headers: { "user-agent": userAgent } });
    if (!homepage.ok) {
      return null;
    }
    const html = await homepage.text();
    const declared = extractDeclaredFeedCandidates(html, baseUrl);
    const conventional = buildCommonFeedCandidates(baseUrl);
    const ranked = dedupeCandidates([...declared, ...conventional]);
    ranked.sort((left, right) => scoreCandidate(right) - scoreCandidate(left));
    // Probe sequentially so the best-ranked working feed wins.
    for (const candidate of ranked) {
      const validated = await validateCandidate(candidate, baseUrl, userAgent);
      if (validated) {
        return validated;
      }
    }
    return null;
  } catch {
    return null;
  }
}
2685
+
2179
2686
  // src/query/search-service.ts
2180
2687
  import { readFile as readFile11 } from "fs/promises";
2181
2688
  import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2182
2689
  import path18 from "path";
2183
2690
  async function loadHydratedIndex(workspacePath) {
2184
- const state = await readLatestIndexState(workspacePath);
2691
+ let state;
2692
+ try {
2693
+ state = await readLatestIndexState(workspacePath);
2694
+ } catch (error) {
2695
+ if (error.code === "ENOENT") {
2696
+ throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
2697
+ }
2698
+ throw error;
2699
+ }
2185
2700
  const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
2186
2701
  return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
2187
2702
  }
@@ -2417,9 +2932,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2417
2932
  function normalizeDisplayTitle(title) {
2418
2933
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2419
2934
  }
2935
// Lower-cased heading texts that are never used as a result title; headings
// are compared against this set after lower-casing.
var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
  "choose this instead of", "how xyz runs it", "naechste schritte",
  "next steps", "overview", "passend wenn",
  "problem", "right fit", "waehlen sie das stattdessen",
  "was sie bekommen", "what you get", "wie xyz es umsetzt",
  "uberblick", "\xFCberblick"
]);
2420
2951
  function chooseResultTitle(chunk) {
2421
2952
  const documentTitle = normalizeDisplayTitle(chunk.title);
2422
- const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(Boolean);
2953
+ const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
2423
2954
  const leafHeading = headings.at(-1);
2424
2955
  if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
2425
2956
  return leafHeading;
@@ -2441,6 +2972,9 @@ function normalizeUriPath(uri) {
2441
2972
  return uri.toLowerCase().replace(/\/+$/, "");
2442
2973
  }
2443
2974
  }
2975
/**
 * Builds a comparison key for a URI: the result of `normalizeRemoteUrl`,
 * lower-cased, with any run of trailing slashes removed.
 *
 * @param uri URI to reduce to its identity key.
 */
function normalizeUriIdentity(uri) {
  const lowered = normalizeRemoteUrl(uri).toLowerCase();
  return lowered.replace(/\/+$/, "");
}
2444
2978
  function uriSpecificity(uri) {
2445
2979
  const normalized = normalizeUriPath(uri);
2446
2980
  if (normalized === "/") {
@@ -2457,6 +2991,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
2457
2991
  if (!candidateTitle || candidateTitle !== existingTitle) {
2458
2992
  return false;
2459
2993
  }
2994
+ const candidateIdentity = normalizeUriIdentity(candidate.uri);
2995
+ const existingIdentity = normalizeUriIdentity(existing.uri);
2996
+ if (candidateIdentity === existingIdentity) {
2997
+ return candidate.uri.length < existing.uri.length;
2998
+ }
2460
2999
  const candidatePath = normalizeUriPath(candidate.uri);
2461
3000
  const existingPath = normalizeUriPath(existing.uri);
2462
3001
  if (candidatePath === existingPath) {
@@ -2471,28 +3010,28 @@ function isMoreSpecificDuplicate(candidate, existing) {
2471
3010
  }
2472
3011
  function collapseAggregateDuplicates(results, topK) {
2473
3012
  const deduped = [];
2474
- for (const result2 of results) {
3013
+ for (const result of results) {
2475
3014
  const duplicateIndex = deduped.findIndex(
2476
- (existing) => isMoreSpecificDuplicate(result2, existing) || isMoreSpecificDuplicate(existing, result2)
3015
+ (existing) => isMoreSpecificDuplicate(result, existing) || isMoreSpecificDuplicate(existing, result)
2477
3016
  );
2478
3017
  if (duplicateIndex < 0) {
2479
- deduped.push(result2);
3018
+ deduped.push(result);
2480
3019
  continue;
2481
3020
  }
2482
- if (isMoreSpecificDuplicate(result2, deduped[duplicateIndex])) {
2483
- deduped[duplicateIndex] = result2;
3021
+ if (isMoreSpecificDuplicate(result, deduped[duplicateIndex])) {
3022
+ deduped[duplicateIndex] = result;
2484
3023
  }
2485
3024
  }
2486
3025
  return deduped.slice(0, topK);
2487
3026
  }
2488
3027
  function rerankResultsByDocument(results, topK) {
2489
3028
  const byDocument = /* @__PURE__ */ new Map();
2490
- for (const result2 of results) {
2491
- const existing = byDocument.get(result2.documentId);
3029
+ for (const result of results) {
3030
+ const existing = byDocument.get(result.documentId);
2492
3031
  if (existing) {
2493
- existing.push(result2);
3032
+ existing.push(result);
2494
3033
  } else {
2495
- byDocument.set(result2.documentId, [result2]);
3034
+ byDocument.set(result.documentId, [result]);
2496
3035
  }
2497
3036
  }
2498
3037
  const reranked = [...byDocument.values()].flatMap((group) => {
@@ -2501,7 +3040,7 @@ function rerankResultsByDocument(results, topK) {
2501
3040
  if (!best) {
2502
3041
  return [];
2503
3042
  }
2504
- const tailScore = rest.reduce((sum, result2) => sum + result2.score, 0);
3043
+ const tailScore = rest.reduce((sum, result) => sum + result.score, 0);
2505
3044
  const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
2506
3045
  return [{ ...best, score: aggregateScore }];
2507
3046
  }).sort((left, right) => right.score - left.score);
@@ -2569,7 +3108,6 @@ async function searchIndex({
2569
3108
  score: 0,
2570
3109
  title: chooseResultTitle(chunk),
2571
3110
  uri: chunk.uri,
2572
- headingPath: chunk.headingPath,
2573
3111
  snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
2574
3112
  document,
2575
3113
  config,
@@ -2584,7 +3122,7 @@ async function searchIndex({
2584
3122
  };
2585
3123
  })
2586
3124
  );
2587
- return { retrievalMode: "lexical", results: latestResults.filter((result2) => result2 != null) };
3125
+ return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
2588
3126
  }
2589
3127
  const lexicalHits = async () => {
2590
3128
  const index = await loadHydratedIndex(workspacePath);
@@ -2633,7 +3171,6 @@ async function searchIndex({
2633
3171
  score,
2634
3172
  title: chooseResultTitle(chunk),
2635
3173
  uri: chunk.uri,
2636
- headingPath: chunk.headingPath,
2637
3174
  snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2638
3175
  document: documents.get(chunk.documentId),
2639
3176
  config,
@@ -2647,13 +3184,13 @@ async function searchIndex({
2647
3184
  metadata: chunk.metadata
2648
3185
  };
2649
3186
  }));
2650
- const results = rawResults.filter((result2) => result2 != null);
3187
+ const results = rawResults.filter((result) => result != null);
2651
3188
  return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
2652
3189
  }
2653
3190
 
2654
3191
  // src/query/related-service.ts
2655
3192
  import path19 from "path";
2656
- function cosineSimilarity(left, right) {
3193
+ function cosineSimilarity2(left, right) {
2657
3194
  let dot = 0;
2658
3195
  let leftNorm = 0;
2659
3196
  let rightNorm = 0;
@@ -2739,7 +3276,7 @@ async function findRelatedDocuments({
2739
3276
  const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
2740
3277
  documentId: candidate.document.id,
2741
3278
  sourceId: candidate.document.sourceId,
2742
- score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
3279
+ score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
2743
3280
  title: candidate.document.title,
2744
3281
  uri: candidate.document.uri,
2745
3282
  metadata: candidate.document.metadata
@@ -2767,21 +3304,20 @@ async function createContext({
2767
3304
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
2768
3305
  const sources = [];
2769
3306
  let total = 0;
2770
- for (const result2 of search.results) {
2771
- const text = result2.text ?? "";
3307
+ for (const result of search.results) {
3308
+ const text = result.text ?? "";
2772
3309
  if (total + text.length > maxChars && sources.length > 0) {
2773
3310
  break;
2774
3311
  }
2775
3312
  total += text.length;
2776
3313
  sources.push({
2777
- chunkId: result2.chunkId,
2778
- documentId: result2.documentId,
2779
- sourceId: result2.sourceId,
2780
- title: result2.title,
2781
- uri: result2.uri,
2782
- headingPath: result2.headingPath,
3314
+ chunkId: result.chunkId,
3315
+ documentId: result.documentId,
3316
+ sourceId: result.sourceId,
3317
+ title: result.title,
3318
+ uri: result.uri,
2783
3319
  text,
2784
- metadata: result2.metadata
3320
+ metadata: result.metadata
2785
3321
  });
2786
3322
  }
2787
3323
  const markdown = [
@@ -2792,7 +3328,6 @@ async function createContext({
2792
3328
  `Title: ${source.title}`,
2793
3329
  `URL: ${source.uri}`,
2794
3330
  `Chunk ID: ${source.chunkId}`,
2795
- source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
2796
3331
  "",
2797
3332
  source.text,
2798
3333
  ""
@@ -2871,27 +3406,30 @@ function formatSourcesTable(sources) {
2871
3406
  return table.toString();
2872
3407
  }
2873
3408
  function formatSearchResults(results) {
2874
- return results.map((result2, index) => [
2875
- `${index + 1}. ${colors.bold(result2.title)}`,
2876
- ` ${result2.uri}`,
2877
- ` Source type: ${result2.sourceType}`,
2878
- ` Published: ${result2.publicationDate ?? "n/a"}`,
2879
- ` Score: ${result2.score.toFixed(3)}`,
2880
- ` ${result2.snippet}`
2881
- ].join("\n")).join("\n\n");
3409
+ return results.map((result, index) => [
3410
+ `${index + 1}. ${colors.bold(result.title)}`,
3411
+ ` URL: ${result.uri}`,
3412
+ ` Source: ${result.sourceType} | Published: ${result.publicationDate ?? "n/a"} | Score: ${result.score.toFixed(3)}`,
3413
+ "",
3414
+ ...result.snippet.split("\n").map((line) => line.length > 0 ? ` ${line}` : "")
3415
+ ].join("\n")).join(`
3416
+
3417
+ ${colors.dim("---")}
3418
+
3419
+ `);
2882
3420
  }
2883
3421
  function formatRelatedDocuments(results) {
2884
- return results.map((result2, index) => [
2885
- `${index + 1}. ${colors.bold(result2.title)}`,
2886
- ` ${result2.uri}`,
2887
- ` Similarity: ${result2.score.toFixed(3)}`
3422
+ return results.map((result, index) => [
3423
+ `${index + 1}. ${colors.bold(result.title)}`,
3424
+ ` ${result.uri}`,
3425
+ ` Similarity: ${result.score.toFixed(3)}`
2888
3426
  ].join("\n")).join("\n\n");
2889
3427
  }
2890
3428
 
2891
3429
  // src/cli/run-cli.ts
2892
3430
  var SOURCE_TYPES = /* @__PURE__ */ new Set(["url", "website", "rss", "file", "directory", "markdown", "text"]);
2893
3431
  var RETRIEVAL_MODES = /* @__PURE__ */ new Set(["lexical", "dense", "sparse", "hybrid"]);
2894
- var SOURCE_TYPE_LIST = ["url", "website", "rss", "file", "directory", "markdown", "text"];
3432
+ var SOURCE_TYPE_LIST = ["page", "website", "rss", "file", "directory", "markdown", "text"];
2895
3433
  var RETRIEVAL_MODE_LIST = ["lexical", "dense", "sparse", "hybrid"];
2896
3434
  var SEARCH_DATE_FIELDS = ["publicationDate", "firstSeenAt", "lastSeenAt", "lastChangedAt", "crawledAt"];
2897
3435
  function parseKeyValue(input) {
@@ -2914,11 +3452,46 @@ function parseOptionalNumber(input, optionName) {
2914
3452
  }
2915
3453
  return value;
2916
3454
  }
3455
+ function parseOptionalPositiveInteger(input, optionName) {
3456
+ const value = parseOptionalNumber(input, optionName);
3457
+ if (value === void 0) {
3458
+ return void 0;
3459
+ }
3460
+ if (!Number.isInteger(value) || value < 1) {
3461
+ throw new CliError(`invalid positive integer for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3462
+ }
3463
+ return value;
3464
+ }
2917
3465
  function setWhenDefined(target, key, value) {
2918
3466
  if (value !== void 0) {
2919
3467
  target[key] = value;
2920
3468
  }
2921
3469
  }
3470
+ function mergePatterns(existing, extra) {
3471
+ const merged = [...existing ?? []];
3472
+ if (extra && !merged.includes(extra)) {
3473
+ merged.push(extra);
3474
+ }
3475
+ return merged.length > 0 ? merged : void 0;
3476
+ }
3477
+ function formatWebsiteSourceAdd(result) {
3478
+ const lines = [`Added source ${result.primarySource.id}`];
3479
+ if (!result.detectedFeed) {
3480
+ lines.push("No feed detected during website registration.");
3481
+ return lines.join("\n");
3482
+ }
3483
+ if (result.detectedFeed.source && result.detectedFeed.wasAdded) {
3484
+ lines.push(`Detected feed ${result.detectedFeed.url} and added source ${result.detectedFeed.source.id}.`);
3485
+ } else if (result.detectedFeed.source) {
3486
+ lines.push(`Detected feed ${result.detectedFeed.url}. Source ${result.detectedFeed.source.id} already exists.`);
3487
+ } else {
3488
+ lines.push(`Detected feed ${result.detectedFeed.url}.`);
3489
+ }
3490
+ if (result.detectedFeed.excludePrefix) {
3491
+ lines.push(`Excluded ${result.detectedFeed.excludePrefix} from the website crawl.`);
3492
+ }
3493
+ return lines.join("\n");
3494
+ }
2922
3495
  function createSourceCrawlConfig(type, options, defaults) {
2923
3496
  if (!["url", "website", "directory", "rss"].includes(type)) {
2924
3497
  return void 0;
@@ -2926,6 +3499,7 @@ function createSourceCrawlConfig(type, options, defaults) {
2926
3499
  const crawl = {};
2927
3500
  setWhenDefined(crawl, "maxDepth", parseOptionalNumber(options.maxDepth, "--max-depth"));
2928
3501
  setWhenDefined(crawl, "maxPages", parseOptionalNumber(options.maxPages, "--max-pages"));
3502
+ setWhenDefined(crawl, "maxConcurrentRequests", parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests"));
2929
3503
  setWhenDefined(crawl, "includePatterns", options.include);
2930
3504
  setWhenDefined(crawl, "excludePatterns", options.exclude);
2931
3505
  setWhenDefined(crawl, "obeyRobotsTxt", options.robots);
@@ -2944,14 +3518,48 @@ function createSourceCrawlConfig(type, options, defaults) {
2944
3518
  }
2945
3519
  return Object.keys(crawl).length > 0 ? crawl : void 0;
2946
3520
  }
3521
+ function validateSourceAddOptions(type, options) {
3522
+ const reject = (optionName) => {
3523
+ throw new CliError(`${optionName} is not supported for source type ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3524
+ };
3525
+ if (options.maxDepth !== void 0 && type !== "website") {
3526
+ reject("--max-depth");
3527
+ }
3528
+ if (options.maxPages !== void 0 && type !== "website") {
3529
+ reject("--max-pages");
3530
+ }
3531
+ if (options.maxConcurrentRequests !== void 0 && !["website", "rss"].includes(type)) {
3532
+ reject("--max-concurrent-requests");
3533
+ }
3534
+ if (options.renderJs && type !== "website") {
3535
+ reject("--render-js");
3536
+ }
3537
+ if (options.robots === false && type !== "website") {
3538
+ reject("--no-robots");
3539
+ }
3540
+ if (options.rateLimitMs !== void 0 && type !== "website") {
3541
+ reject("--rate-limit-ms");
3542
+ }
3543
+ if (options.include !== void 0 && !["website", "directory"].includes(type)) {
3544
+ reject("--include");
3545
+ }
3546
+ if (options.exclude !== void 0 && !["website", "directory"].includes(type)) {
3547
+ reject("--exclude");
3548
+ }
3549
+ if (options.retentionDays !== void 0 && type !== "rss") {
3550
+ reject("--retention-days");
3551
+ }
3552
+ }
2947
3553
  function allowedSourceConfigFields(source) {
2948
3554
  const fields = /* @__PURE__ */ new Set(["name", "tag", "metadata"]);
2949
3555
  if (source.type === "rss") {
2950
3556
  fields.add("retentionDays");
3557
+ fields.add("maxConcurrentRequests");
2951
3558
  }
2952
3559
  if (source.type === "website") {
2953
3560
  fields.add("maxDepth");
2954
3561
  fields.add("maxPages");
3562
+ fields.add("maxConcurrentRequests");
2955
3563
  fields.add("include");
2956
3564
  fields.add("exclude");
2957
3565
  }
@@ -2987,6 +3595,10 @@ function buildSourceConfigPatch(source, options) {
2987
3595
  checkAllowed("maxPages", "--max-pages");
2988
3596
  crawlPatch.maxPages = parseOptionalNumber(options.maxPages, "--max-pages");
2989
3597
  }
3598
+ if (options.maxConcurrentRequests !== void 0) {
3599
+ checkAllowed("maxConcurrentRequests", "--max-concurrent-requests");
3600
+ crawlPatch.maxConcurrentRequests = parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests");
3601
+ }
2990
3602
  if (options.include !== void 0) {
2991
3603
  checkAllowed("include", "--include");
2992
3604
  crawlPatch.includePatterns = options.include;
@@ -3016,6 +3628,50 @@ function response(command, workspace, data, error) {
3016
3628
  }
3017
3629
  function writeOutput(capture, value, stderr = false) {
3018
3630
  (stderr ? capture.stderr : capture.stdout).push(value);
3631
+ if (stderr) {
3632
+ capture.onStderr?.(value);
3633
+ return;
3634
+ }
3635
+ capture.onStdout?.(value);
3636
+ }
3637
+ function createProgressHandler(capture, options) {
3638
+ if (options.json || options.silent || options.quiet) {
3639
+ return void 0;
3640
+ }
3641
+ return (level, message) => {
3642
+ if (level === "detail" && !options.verbose) {
3643
+ return;
3644
+ }
3645
+ writeOutput(capture, message, true);
3646
+ };
3647
+ }
3648
+ async function runIngestCommand({
3649
+ workspace,
3650
+ sourceId,
3651
+ changedOnly,
3652
+ dense,
3653
+ sparse,
3654
+ progress
3655
+ }) {
3656
+ progress?.("info", "Ingest step 1/3: fetch and normalize");
3657
+ const ingest = await ingestSources({
3658
+ workspacePath: workspace,
3659
+ sourceIds: sourceId ? [sourceId] : void 0,
3660
+ changedOnly,
3661
+ progress
3662
+ });
3663
+ progress?.("info", "Ingest step 2/3: chunk affected documents");
3664
+ const chunk = await chunkDocuments({ workspacePath: workspace, sourceId, progress });
3665
+ progress?.("info", "Ingest step 3/3: refresh index");
3666
+ const indexBuild = await buildIndex({
3667
+ workspacePath: workspace,
3668
+ denseOverride: dense ? true : void 0,
3669
+ sparseOverride: sparse ? true : void 0,
3670
+ buildAvailableModels: true,
3671
+ progress
3672
+ });
3673
+ progress?.("info", "Ingest complete");
3674
+ return { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
3019
3675
  }
3020
3676
  function parseRetrievalMode(input) {
3021
3677
  if (!input) {
@@ -3030,10 +3686,11 @@ function parseSourceType(input) {
3030
3686
  if (!input) {
3031
3687
  return void 0;
3032
3688
  }
3033
- if (!SOURCE_TYPES.has(input)) {
3689
+ const normalized = input === "page" ? "url" : input;
3690
+ if (!SOURCE_TYPES.has(normalized)) {
3034
3691
  throw new CliError(`unsupported source type: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3035
3692
  }
3036
- return input;
3693
+ return normalized;
3037
3694
  }
3038
3695
  function parseCommaSeparatedList(input) {
3039
3696
  const values = (input ?? "").split(",").map((value) => value.trim()).filter(Boolean);
@@ -3094,56 +3751,96 @@ function workspaceFromArgv(argv) {
3094
3751
  }
3095
3752
  return path21.resolve(DEFAULT_WORKSPACE);
3096
3753
  }
3097
- async function runCli(argv) {
3098
- const capture = { stdout: [], stderr: [] };
3754
+ async function runCli(argv, io = {}) {
3755
+ const capture = { stdout: [], stderr: [], ...io };
3099
3756
  const program = new Command();
3100
- program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--verbose", "Print more operational detail when a command supports it.").option("--quiet", "Suppress non-essential human-readable output.");
3757
+ program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
3101
3758
  program.addHelpText("after", `
3102
3759
  Workflow:
3103
3760
  1. Initialize a workspace with qli init
3104
3761
  2. Register one or more sources with qli source add
3105
- 3. Build or refresh the workspace with qli rebuild
3762
+ 3. Refresh the workspace with qli ingest
3106
3763
  4. Query it with qli search, qli related, or qli context
3107
3764
 
3108
3765
  Examples:
3109
3766
  qli init
3110
3767
  qli source add directory ./docs --name "Product Docs" --tag docs
3111
- qli rebuild
3768
+ qli ingest
3769
+ qli rebuild --silent
3112
3770
  qli search "api authentication" --top-k 8
3113
3771
  qli context "How do API keys work?" --top-k 8 --max-chars 8000
3114
3772
 
3773
+ Long-running commands print progress to stderr by default. Use --silent to suppress it.
3774
+ Use --json when another tool needs stable structured output.
3775
+
3115
3776
  Use qli <command> --help for command-specific options and examples.`);
3116
- program.command("init").description("Create a new workspace with the default directory layout and config.").option("--force").addHelpText("after", `
3777
+ program.command("init").description("Create a new workspace with the default directory layout and config, then pull missing retrieval models.").option("--force").addHelpText("after", `
3117
3778
  Examples:
3118
3779
  qli init
3119
3780
  qli init --workspace ./kb
3120
- qli init --workspace /tmp/querylight --force`).action(async function command(options) {
3781
+ qli init --workspace /tmp/querylight --force
3782
+
3783
+ Notes:
3784
+ init enables dense and sparse retrieval in new workspaces.
3785
+ init pulls missing model assets for enabled retrieval modes.
3786
+ Sparse model downloads require uv. If uv is not available, init skips the sparse pull.`).action(async function command(options) {
3787
+ const global = this.optsWithGlobals();
3121
3788
  const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
3122
- const result2 = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
3123
- emit(this.optsWithGlobals().json, capture, response("init", workspace, result2), `Initialized workspace at ${workspace}`);
3789
+ const result = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
3790
+ const config = await loadConfig(workspace, global.config);
3791
+ const status = await getModelStatus(workspace, config);
3792
+ const { pullDense, pullSparse } = resolveMissingConfiguredModelPullPlan({ config, status });
3793
+ if (pullDense || pullSparse) {
3794
+ await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
3795
+ }
3796
+ emit(this.optsWithGlobals().json, capture, response("init", workspace, result), `Initialized workspace at ${workspace}`);
3124
3797
  });
3125
3798
  const source = program.command("source");
3126
3799
  source.description("Register, inspect, and manage workspace sources.");
3127
- source.command("add").description("Add a source definition. The source is enabled immediately.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
3800
+ source.command("add").description("Add a source definition. The source is enabled immediately. Use `page` for one page and `website` for multi-page crawling and feed detection.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--max-concurrent-requests <n>", "Maximum remote requests in flight for a website or feed source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
3128
3801
  Examples:
3129
3802
  qli source add directory ./docs --name "Local Docs" --tag docs
3130
3803
  qli source add file ./docs/auth.md --name "Auth Guide"
3131
- qli source add url https://example.com/docs/auth --name "Auth Page"
3804
+ qli source add page https://example.com/docs/auth --name "Auth Page"
3132
3805
  qli source add website https://example.com --name "Docs Site" --max-depth 2 --max-pages 50 --include /docs/
3806
+ qli source add website https://example.com --name "Docs Site" --max-concurrent-requests 8
3807
+ qli source add website https://example.com --name "Example Site" --json
3133
3808
  qli source add rss https://example.com/feed.xml --name "Release Feed"
3809
+ qli source add rss https://example.com/feed.xml --name "Release Feed" --max-concurrent-requests 3
3134
3810
  qli source add rss https://example.com/feed.xml --name "Release Feed" --retention-days 30
3135
3811
 
3136
3812
  Notes:
3813
+ page stores one page. It does not crawl links or detect feeds.
3814
+ Website sources may detect one blog or news feed during registration.
3815
+ When a feed is added, qli also excludes the feed item prefix from the website crawl when it can infer one.
3816
+ Website and RSS sources default to 5 remote requests in flight per source unless config.yaml or source settings override it.
3817
+ Use --json when automation needs the full list of created sources.
3137
3818
  RSS sources store retention per feed.
3138
- When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(type, uri, options) {
3819
+ When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(typeInput, uri, options) {
3820
+ const type = parseSourceType(typeInput);
3821
+ if (!type) {
3822
+ throw new CliError(`unsupported source type: ${typeInput}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3823
+ }
3139
3824
  if (!SOURCE_TYPES.has(type)) {
3140
3825
  throw new CliError(`unsupported source type: ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3141
3826
  }
3827
+ validateSourceAddOptions(type, options);
3142
3828
  const global = this.optsWithGlobals();
3143
3829
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3144
3830
  const config = await loadConfig(workspace, global.config);
3145
3831
  const now = (/* @__PURE__ */ new Date()).toISOString();
3146
- const crawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
3832
+ const initialCrawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
3833
+ let crawl = initialCrawl;
3834
+ let detectedFeed = null;
3835
+ if (type === "website") {
3836
+ detectedFeed = await discoverWebsiteFeed(uri, config.crawler.defaultUserAgent);
3837
+ if (detectedFeed?.excludePrefix) {
3838
+ crawl = {
3839
+ ...crawl ?? {},
3840
+ excludePatterns: mergePatterns(crawl?.excludePatterns, detectedFeed.excludePrefix)
3841
+ };
3842
+ }
3843
+ }
3147
3844
  const stored = await addSource(workspace, {
3148
3845
  type,
3149
3846
  uri: ["file", "directory"].includes(type) ? path21.resolve(uri) : uri,
@@ -3155,11 +3852,50 @@ Notes:
3155
3852
  createdAt: now,
3156
3853
  updatedAt: now
3157
3854
  });
3158
- emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
3855
+ if (type !== "website") {
3856
+ emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
3857
+ return;
3858
+ }
3859
+ let feedSource;
3860
+ let feedWasAdded = false;
3861
+ if (detectedFeed) {
3862
+ const existingSources = await listSources(workspace);
3863
+ feedSource = existingSources.find((source2) => source2.uri === detectedFeed?.feedUrl);
3864
+ if (!feedSource) {
3865
+ feedSource = await addSource(workspace, {
3866
+ type: "rss",
3867
+ uri: detectedFeed.feedUrl,
3868
+ name: `${options.name} Feed`,
3869
+ enabled: true,
3870
+ tags: options.tag ?? [],
3871
+ metadata: normalizeMetadata(options.metadata),
3872
+ crawl: {
3873
+ retentionDays: config.crawler.retentionDays,
3874
+ fetchArticles: true
3875
+ },
3876
+ createdAt: now,
3877
+ updatedAt: now
3878
+ });
3879
+ feedWasAdded = true;
3880
+ }
3881
+ }
3882
+ const result = {
3883
+ primarySource: stored,
3884
+ addedSources: [stored, ...feedWasAdded && feedSource ? [feedSource] : []],
3885
+ detectedFeed: detectedFeed ? {
3886
+ url: detectedFeed.feedUrl,
3887
+ discoveredBy: detectedFeed.discoveredBy,
3888
+ excludePrefix: detectedFeed.excludePrefix,
3889
+ source: feedSource,
3890
+ wasAdded: feedWasAdded
3891
+ } : null
3892
+ };
3893
+ emit(global.json, capture, response("source add", workspace, result), formatWebsiteSourceAdd(result));
3159
3894
  });
3160
- source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
3895
+ source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--max-concurrent-requests <n>", "Set the remote request concurrency limit for website or feed sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
3161
3896
  Examples:
3162
3897
  qli source config src_123 --retention-days 30
3898
+ qli source config src_123 --max-concurrent-requests 2
3163
3899
  qli source config src_123 --name "Docs Feed" --tag rss docs
3164
3900
  qli source config src_123 --include /docs/ --exclude /docs/archive/
3165
3901
  qli source config src_123 --metadata team=docs owner=platform --json
@@ -3218,35 +3954,56 @@ Examples:
3218
3954
  const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
3219
3955
  emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
3220
3956
  });
3221
- program.command("ingest").description("Fetch and normalize source content into workspace documents.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").addHelpText("after", `
3957
+ program.command("ingest").description("Fetch source content, update affected chunks, and refresh retrieval indexes.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
3222
3958
  Examples:
3223
3959
  qli ingest
3224
3960
  qli ingest --source src_123
3225
- qli ingest --changed-only`).action(async function command(options) {
3961
+ qli ingest --changed-only
3962
+ qli ingest --dense --sparse
3963
+ qli ingest --silent`).action(async function command(options) {
3226
3964
  const global = this.optsWithGlobals();
3227
3965
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3228
- const result2 = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
3229
- emit(global.json, capture, response("ingest", workspace, result2), `Ingested ${result2.processedSources} sources`);
3966
+ const result = await runIngestCommand({
3967
+ workspace,
3968
+ sourceId: options.source,
3969
+ changedOnly: Boolean(options.changedOnly),
3970
+ dense: Boolean(options.dense),
3971
+ sparse: Boolean(options.sparse),
3972
+ progress: createProgressHandler(capture, global)
3973
+ });
3974
+ emit(global.json, capture, response("ingest", workspace, result), `Processed ${result.ingest.processedSources} sources, wrote ${result.chunk.chunksWritten} chunks`);
3230
3975
  });
3231
3976
  program.command("chunk").description("Split normalized documents into retrieval chunks.").option("--source <sourceId>", "Only chunk documents from one source.").option("--document <documentId>", "Only chunk one document.").addHelpText("after", `
3232
3977
  Examples:
3233
3978
  qli chunk
3234
3979
  qli chunk --source src_123
3235
- qli chunk --document doc_123`).action(async function command(options) {
3980
+ qli chunk --document doc_123
3981
+ qli chunk --silent`).action(async function command(options) {
3236
3982
  const global = this.optsWithGlobals();
3237
3983
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3238
- const result2 = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
3239
- emit(global.json, capture, response("chunk", workspace, result2), `Wrote ${result2.chunksWritten} chunks`);
3984
+ const result = await chunkDocuments({
3985
+ workspacePath: workspace,
3986
+ sourceId: options.source,
3987
+ documentId: options.document,
3988
+ progress: createProgressHandler(capture, global)
3989
+ });
3990
+ emit(global.json, capture, response("chunk", workspace, result), `Wrote ${result.chunksWritten} chunks`);
3240
3991
  });
3241
3992
  program.command("reprocess").description("Re-run normalization for existing documents without fetching sources again.").option("--source <sourceId>", "Only reprocess documents from one source.").option("--document <documentId>", "Only reprocess one document.").addHelpText("after", `
3242
3993
  Examples:
3243
3994
  qli reprocess
3244
3995
  qli reprocess --source src_123
3245
- qli reprocess --document doc_123`).action(async function command(options) {
3996
+ qli reprocess --document doc_123
3997
+ qli reprocess --silent`).action(async function command(options) {
3246
3998
  const global = this.optsWithGlobals();
3247
3999
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3248
- const result2 = await reprocessDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
3249
- emit(global.json, capture, response("reprocess", workspace, result2), `Reprocessed ${result2.documentsReprocessed} documents`);
4000
+ const result = await reprocessDocuments({
4001
+ workspacePath: workspace,
4002
+ sourceId: options.source,
4003
+ documentId: options.document,
4004
+ progress: createProgressHandler(capture, global)
4005
+ });
4006
+ emit(global.json, capture, response("reprocess", workspace, result), `Reprocessed ${result.documentsReprocessed} documents`);
3250
4007
  });
3251
4008
  const index = program.command("index");
3252
4009
  index.description("Build and inspect retrieval indexes.");
@@ -3254,33 +4011,47 @@ Examples:
3254
4011
  Examples:
3255
4012
  qli index build
3256
4013
  qli index build --dense
3257
- qli index build --dense --sparse`).action(async function command(options) {
4014
+ qli index build --dense --sparse
4015
+ qli index build --silent`).action(async function command(options) {
3258
4016
  const global = this.optsWithGlobals();
3259
4017
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3260
- const result2 = await buildIndex({
4018
+ const result = await buildIndex({
3261
4019
  workspacePath: workspace,
3262
4020
  denseOverride: options.dense ? true : void 0,
3263
- sparseOverride: options.sparse ? true : void 0
4021
+ sparseOverride: options.sparse ? true : void 0,
4022
+ progress: createProgressHandler(capture, global)
3264
4023
  });
3265
- emit(global.json, capture, response("index build", workspace, result2), `Built index at ${result2.indexPath}`);
4024
+ emit(global.json, capture, response("index build", workspace, result), `Built index at ${result.indexPath}`);
3266
4025
  });
3267
4026
  program.command("rebuild").description("Run ingest, chunk, and index build in one command.").option("--source <sourceId>", "Only rebuild data for one source.").option("--changed-only", "Only ingest changed content before chunking and indexing.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
3268
4027
  Examples:
3269
4028
  qli rebuild
3270
4029
  qli rebuild --changed-only
3271
4030
  qli rebuild --source src_123
3272
- qli rebuild --dense --sparse`).action(async function command(options) {
4031
+ qli rebuild --dense --sparse
4032
+ qli rebuild --silent`).action(async function command(options) {
3273
4033
  const global = this.optsWithGlobals();
3274
4034
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3275
- const ingest = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
3276
- const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source });
4035
+ const progress = createProgressHandler(capture, global);
4036
+ progress?.("info", "Rebuild step 1/3: ingest");
4037
+ const ingest = await ingestSources({
4038
+ workspacePath: workspace,
4039
+ sourceIds: options.source ? [options.source] : void 0,
4040
+ changedOnly: Boolean(options.changedOnly),
4041
+ progress
4042
+ });
4043
+ progress?.("info", "Rebuild step 2/3: chunk");
4044
+ const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, progress });
4045
+ progress?.("info", "Rebuild step 3/3: index");
3277
4046
  const indexBuild = await buildIndex({
3278
4047
  workspacePath: workspace,
3279
4048
  denseOverride: options.dense ? true : void 0,
3280
4049
  sparseOverride: options.sparse ? true : void 0,
3281
- buildAvailableModels: true
4050
+ buildAvailableModels: true,
4051
+ progress
3282
4052
  });
3283
4053
  const data = { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
4054
+ progress?.("info", "Rebuild complete");
3284
4055
  emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
3285
4056
  });
3286
4057
  program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
@@ -3291,7 +4062,7 @@ Examples:
3291
4062
  qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
3292
4063
  qli search "billing" --metadata team=support
3293
4064
  qli search "embedding model" --retrieval hybrid --show-chunks
3294
- qli search --source-type rss,url --top-k 25 --json
4065
+ qli search --source-type rss,page --top-k 25 --json
3295
4066
 
3296
4067
  Notes:
3297
4068
  lexical works without vector models.
@@ -3299,7 +4070,7 @@ Notes:
3299
4070
  When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
3300
4071
  const global = this.optsWithGlobals();
3301
4072
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3302
- const result2 = await searchIndex({
4073
+ const result = await searchIndex({
3303
4074
  workspacePath: workspace,
3304
4075
  query: query ?? "",
3305
4076
  topK: Number(options.topK),
@@ -3314,7 +4085,7 @@ Notes:
3314
4085
  retrievalMode: parseRetrievalMode(options.retrieval),
3315
4086
  showChunks: Boolean(options.showChunks)
3316
4087
  });
3317
- emit(global.json, capture, response("search", workspace, result2), formatSearchResults(result2.results));
4088
+ emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
3318
4089
  });
3319
4090
  program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
3320
4091
  Examples:
@@ -3326,12 +4097,12 @@ Dense vectors usually produce better related-document results. Pull models and r
3326
4097
  qli rebuild --dense`).action(async function command(document, options) {
3327
4098
  const global = this.optsWithGlobals();
3328
4099
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3329
- const result2 = await findRelatedDocuments({
4100
+ const result = await findRelatedDocuments({
3330
4101
  workspacePath: workspace,
3331
4102
  document,
3332
4103
  topK: Number(options.topK)
3333
4104
  });
3334
- emit(global.json, capture, response("related", workspace, result2), formatRelatedDocuments(result2.results));
4105
+ emit(global.json, capture, response("related", workspace, result), formatRelatedDocuments(result.results));
3335
4106
  });
3336
4107
  program.command("context").description("Assemble retrieval context for an external LLM, agent, or prompt pipeline.").argument("<query>").option("--top-k <n>", "Maximum number of source passages to consider.", "12").option("--max-chars <n>", "Maximum output length for the rendered context block.", "12000").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).addHelpText("after", `
3337
4108
  Examples:
@@ -3342,14 +4113,14 @@ Examples:
3342
4113
  Use --json when another tool needs structured access to the raw passages and metadata.`).action(async function command(query, options) {
3343
4114
  const global = this.optsWithGlobals();
3344
4115
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3345
- const result2 = await createContext({
4116
+ const result = await createContext({
3346
4117
  workspacePath: workspace,
3347
4118
  query,
3348
4119
  topK: Number(options.topK),
3349
4120
  maxChars: Number(options.maxChars),
3350
4121
  retrievalMode: parseRetrievalMode(options.retrieval)
3351
4122
  });
3352
- emit(global.json, capture, response("context", workspace, result2), result2.markdown);
4123
+ emit(global.json, capture, response("context", workspace, result), result.markdown);
3353
4124
  });
3354
4125
  const models = program.command("models");
3355
4126
  models.description("Inspect and download retrieval model assets.");
@@ -3358,7 +4129,9 @@ Examples:
3358
4129
  qli models pull
3359
4130
  qli models pull --dense
3360
4131
  qli models pull --sparse
4132
+ qli models pull --silent
3361
4133
 
4134
+ Pulled model assets are shared under ~/.qli by default.
3362
4135
  If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
3363
4136
  const global = this.optsWithGlobals();
3364
4137
  const workspace = await resolveWorkspace({ workspace: global.workspace });
@@ -3369,17 +4142,27 @@ If you plan to use related, dense search, or hybrid retrieval, pull the models a
3369
4142
  pullSparseFlag: Boolean(options.sparse),
3370
4143
  uvAvailable: status.sparse.uvAvailable
3371
4144
  });
3372
- await pullModels({ workspacePath: workspace, config, pullDense, pullSparse });
4145
+ await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
3373
4146
  const data = {
3374
- dense: pullDense ? { pulled: true, modelId: config.retrieval.dense.modelId, cacheDir: config.retrieval.dense.cacheDir } : void 0,
3375
- sparse: pullSparse ? { pulled: true, modelId: config.retrieval.sparse.modelId, cacheDir: config.retrieval.sparse.cacheDir } : void 0
4147
+ dense: pullDense ? {
4148
+ pulled: true,
4149
+ modelId: config.retrieval.dense.modelId,
4150
+ cacheDir: resolveCacheDir(workspace, config.retrieval.dense.cacheDir)
4151
+ } : void 0,
4152
+ sparse: pullSparse ? {
4153
+ pulled: true,
4154
+ modelId: config.retrieval.sparse.modelId,
4155
+ cacheDir: resolveCacheDir(workspace, config.retrieval.sparse.cacheDir)
4156
+ } : void 0
3376
4157
  };
3377
4158
  emit(global.json, capture, response("models pull", workspace, data), "Pulled available models");
3378
4159
  });
3379
- models.command("status").description("Show whether model runtimes and artifacts are available in the workspace.").addHelpText("after", `
4160
+ models.command("status").description("Show whether shared model assets, runtimes, and workspace vector artifacts are available.").addHelpText("after", `
3380
4161
  Examples:
3381
4162
  qli models status
3382
- qli models status --json`).action(async function command() {
4163
+ qli models status --json
4164
+
4165
+ The cacheDir fields show the resolved model cache path for the current workspace config.`).action(async function command() {
3383
4166
  const global = this.optsWithGlobals();
3384
4167
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3385
4168
  const config = await loadConfig(workspace, global.config);
@@ -3394,8 +4177,8 @@ Examples:
3394
4177
  qli diff --since 2026-05-01`).action(async function command(options) {
3395
4178
  const global = this.optsWithGlobals();
3396
4179
  const workspace = await resolveWorkspace({ workspace: global.workspace });
3397
- const result2 = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
3398
- emit(global.json, capture, response("diff", workspace, result2), JSON.stringify(result2, null, 2));
4180
+ const result = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
4181
+ emit(global.json, capture, response("diff", workspace, result), JSON.stringify(result, null, 2));
3399
4182
  });
3400
4183
  const report = program.command("report");
3401
4184
  report.description("Render higher-level reports from workspace data.");
@@ -3476,8 +4259,11 @@ Examples:
3476
4259
  checks.push("dense runtime importable");
3477
4260
  }
3478
4261
  if (config.retrieval.sparse.enabled) {
3479
- await ensureUvAvailable();
3480
- checks.push("uv available for sparse runtime");
4262
+ if (await isUvAvailable()) {
4263
+ checks.push("uv available for sparse runtime");
4264
+ } else {
4265
+ checks.push("uv missing for sparse runtime");
4266
+ }
3481
4267
  }
3482
4268
  try {
3483
4269
  await readLatestIndexMetadata(workspace);
@@ -3511,13 +4297,21 @@ function emit(asJson, capture, body, human) {
3511
4297
  }
3512
4298
 
3513
4299
  // src/cli/main.ts
3514
- var result = await runCli(process.argv.slice(2));
3515
- if (result.stdout) {
3516
- process.stdout.write(`${result.stdout}
4300
+ try {
4301
+ const result = await runCli(process.argv.slice(2), {
4302
+ onStdout(value) {
4303
+ process.stdout.write(`${value}
3517
4304
  `);
3518
- }
3519
- if (result.stderr) {
3520
- process.stderr.write(`${result.stderr}
4305
+ },
4306
+ onStderr(value) {
4307
+ process.stderr.write(`${value}
4308
+ `);
4309
+ }
4310
+ });
4311
+ process.exitCode = result.exitCode;
4312
+ } catch (error) {
4313
+ const message = error instanceof Error ? error.stack ?? error.message : String(error);
4314
+ process.stderr.write(`${message}
3521
4315
  `);
4316
+ process.exitCode = 1;
3522
4317
  }
3523
- process.exit(result.exitCode);