@tryformation/querylight-cli 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/Dockerfile +7 -0
  2. package/LICENSE +21 -0
  3. package/README.md +391 -0
  4. package/dist/chunk/chunk-store.d.ts +4 -0
  5. package/dist/chunk/chunker.d.ts +9 -0
  6. package/dist/cli/format.d.ts +4 -0
  7. package/dist/cli/main.d.ts +2 -0
  8. package/dist/cli/main.js +3523 -0
  9. package/dist/cli/run-cli.d.ts +5 -0
  10. package/dist/core/config.d.ts +4 -0
  11. package/dist/core/constants.d.ts +3 -0
  12. package/dist/core/errors.d.ts +17 -0
  13. package/dist/core/files.d.ts +1 -0
  14. package/dist/core/hashing.d.ts +1 -0
  15. package/dist/core/ids.d.ts +1 -0
  16. package/dist/core/jsonl.d.ts +2 -0
  17. package/dist/core/runs.d.ts +3 -0
  18. package/dist/core/workspace.d.ts +7 -0
  19. package/dist/index/index-store.d.ts +11 -0
  20. package/dist/index/querylight-indexer.d.ts +14 -0
  21. package/dist/index.d.ts +11 -0
  22. package/dist/index.js +2794 -0
  23. package/dist/ingest/adapters/crawl4ai-adapter.d.ts +1 -0
  24. package/dist/ingest/adapters/directory-adapter.d.ts +2 -0
  25. package/dist/ingest/adapters/file-adapter.d.ts +16 -0
  26. package/dist/ingest/adapters/rss-adapter.d.ts +7 -0
  27. package/dist/ingest/adapters/url-adapter.d.ts +11 -0
  28. package/dist/ingest/adapters/website-adapter.d.ts +2 -0
  29. package/dist/ingest/document-utils.d.ts +24 -0
  30. package/dist/ingest/extractors/docx-extractor.d.ts +1 -0
  31. package/dist/ingest/extractors/html-extractor.d.ts +5 -0
  32. package/dist/ingest/extractors/markdown-extractor.d.ts +1 -0
  33. package/dist/ingest/extractors/pdf-extractor.d.ts +1 -0
  34. package/dist/ingest/extractors/text-extractor.d.ts +1 -0
  35. package/dist/ingest/ingest-service.d.ts +23 -0
  36. package/dist/normalize/boilerplate.d.ts +1 -0
  37. package/dist/normalize/normalize-markdown.d.ts +2 -0
  38. package/dist/query/context-builder.d.ts +8 -0
  39. package/dist/query/related-service.d.ts +6 -0
  40. package/dist/query/search-service.d.ts +31 -0
  41. package/dist/report/diff-service.d.ts +23 -0
  42. package/dist/sources/source-model.d.ts +1 -0
  43. package/dist/sources/source-store.d.ts +7 -0
  44. package/dist/types/models.d.ts +309 -0
  45. package/dist/vector/dense.d.ts +13 -0
  46. package/dist/vector/runtime.d.ts +18 -0
  47. package/dist/vector/service.d.ts +26 -0
  48. package/dist/vector/sparse.d.ts +19 -0
  49. package/dist/vector/store.d.ts +20 -0
  50. package/dist/vector/text.d.ts +3 -0
  51. package/package.json +66 -0
  52. package/scripts/sparse-encode.py +104 -0
@@ -0,0 +1,3523 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/cli/run-cli.ts
4
+ import { Command } from "commander";
5
+ import { stat as stat4 } from "fs/promises";
6
+ import path21 from "path";
7
+
8
+ // src/chunk/chunker.ts
9
+ import { readFile as readFile3 } from "fs/promises";
10
+ import matter from "gray-matter";
11
+ import path4 from "path";
12
+
13
+ // src/core/config.ts
14
+ import { readFile, writeFile } from "fs/promises";
15
+ import path from "path";
16
+ import YAML from "yaml";
17
/**
 * Builds the built-in workspace configuration.
 *
 * Every call returns a brand-new object graph, so callers may freely mutate
 * or merge into the result without affecting later calls.
 */
var defaultConfig = () => {
  const index = {
    name: "default",
    fields: {
      text: { type: "text", weight: 1 },
      title: { type: "text", weight: 2 },
      uri: { type: "keyword" },
      sourceId: { type: "keyword" },
      tags: { type: "keyword" },
      contentType: { type: "keyword" }
    },
    chunking: {
      maxChars: 1800,
      overlapChars: 200,
      minChars: 120,
      splitOnHeadings: true
    }
  };
  const rag = {
    defaultTopK: 12,
    maxContextChars: 12000,
    citationStyle: "markdown"
  };
  const dense = {
    enabled: false,
    modelId: "Xenova/all-MiniLM-L6-v2",
    cacheDir: ".kb/models/huggingface",
    indexHashTables: 8,
    indexRandomSeed: 42,
    chunkTextMode: "title-heading-text"
  };
  const sparse = {
    enabled: false,
    modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
    cacheDir: ".kb/models/huggingface",
    documentTopTokens: 128,
    queryEncoding: "tokenizer-token-weights",
    documentEncoding: "masked-lm-max-log1p-relu",
    chunkTextMode: "title-heading-text"
  };
  const retrieval = { defaultMode: "lexical", dense, sparse };
  const crawler = {
    defaultUserAgent: "querylight-cli/0.1",
    obeyRobotsTxt: true,
    rateLimitMs: 1000,
    renderJs: false,
    retentionDays: 365,
    fetchArticles: true
  };
  const limits = {
    maxFileSizeMb: 50,
    maxPagesPerSource: 100,
    maxTotalChunks: 100000
  };
  // Key order is kept stable because the object is serialized to config.yaml.
  return { workspaceVersion: 1, index, rag, retrieval, crawler, limits };
};
75
/**
 * Writes the default `config.yaml` into a workspace directory.
 *
 * @param {string} workspacePath - Directory that receives `config.yaml`.
 * @param {boolean} [force=false] - When true, an existing file is overwritten;
 *   when false, an existing file is left untouched.
 * @returns {Promise<void>}
 * @throws Any file-system error other than "file already exists".
 */
async function writeDefaultConfig(workspacePath, force = false) {
  const configPath = path.join(workspacePath, "config.yaml");
  const contents = YAML.stringify(defaultConfig());
  try {
    // "wx" creates the file only if it does not exist yet. This replaces the
    // earlier read-then-write probe, which swallowed every read error and could
    // therefore overwrite an existing but momentarily unreadable config.
    await writeFile(configPath, contents, { encoding: "utf8", flag: force ? "w" : "wx" });
  } catch (error) {
    if (!force && error?.code === "EEXIST") {
      return;
    }
    throw error;
  }
}
86
/**
 * Loads a workspace configuration and merges it over the built-in defaults.
 *
 * Top-level keys are merged shallowly; `index.fields`, `index.chunking`, `rag`,
 * `retrieval.dense`, `retrieval.sparse`, `crawler` and `limits` are each merged
 * one level deep so a partial user config keeps the remaining defaults.
 *
 * @param {string} workspacePath - Workspace directory containing `config.yaml`.
 * @param {string} [configPath] - Explicit config file; defaults to
 *   `<workspacePath>/config.yaml`.
 * @returns {Promise<object>} The merged configuration.
 */
async function loadConfig(workspacePath, configPath) {
  const resolved = configPath ?? path.join(workspacePath, "config.yaml");
  const raw = await readFile(resolved, "utf8");
  // An empty or comment-only YAML document parses to null; treat that as
  // "no overrides" instead of crashing on `parsed.index`.
  const parsed = YAML.parse(raw) ?? {};
  const defaults = defaultConfig();
  return {
    ...defaults,
    ...parsed,
    index: {
      ...defaults.index,
      ...parsed.index,
      fields: {
        ...defaults.index.fields,
        ...parsed.index?.fields ?? {}
      },
      chunking: {
        ...defaults.index.chunking,
        ...parsed.index?.chunking ?? {}
      }
    },
    rag: {
      ...defaults.rag,
      ...parsed.rag ?? {}
    },
    retrieval: {
      ...defaults.retrieval,
      ...parsed.retrieval ?? {},
      dense: {
        ...defaults.retrieval.dense,
        ...parsed.retrieval?.dense ?? {}
      },
      sparse: {
        ...defaults.retrieval.sparse,
        ...parsed.retrieval?.sparse ?? {}
      }
    },
    crawler: {
      ...defaults.crawler,
      ...parsed.crawler ?? {}
    },
    limits: {
      ...defaults.limits,
      ...parsed.limits ?? {}
    }
  };
}
132
+
133
+ // src/core/hashing.ts
134
+ import { createHash } from "crypto";
135
/**
 * Returns the lowercase hexadecimal SHA-256 digest of `input`.
 */
function sha256(input) {
  const hasher = createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
138
+
139
+ // src/core/ids.ts
140
/**
 * Derives a deterministic identifier of the form `<prefix>_<16 hex chars>`.
 * The hex part is the first 16 characters of the SHA-256 of the parts joined
 * with "::".
 */
function stableId(prefix, ...parts) {
  const digest = sha256(parts.join("::"));
  const shortDigest = digest.slice(0, 16);
  return `${prefix}_${shortDigest}`;
}
143
+
144
+ // src/core/jsonl.ts
145
+ import { mkdir, readFile as readFile2, writeFile as writeFile2 } from "fs/promises";
146
+ import path2 from "path";
147
/**
 * Reads a JSON Lines file into an array of parsed records.
 * Blank lines are skipped; a missing file yields an empty array.
 *
 * @param {string} filePath
 * @returns {Promise<unknown[]>}
 * @throws Read errors other than ENOENT, and JSON syntax errors.
 */
async function readJsonl(filePath) {
  let raw;
  try {
    raw = await readFile2(filePath, "utf8");
  } catch (error) {
    if (error.code === "ENOENT") {
      return [];
    }
    throw error;
  }
  const records = [];
  for (const line of raw.split("\n")) {
    const trimmed = line.trim();
    if (trimmed.length > 0) {
      records.push(JSON.parse(trimmed));
    }
  }
  return records;
}
158
/**
 * Serializes `records` as JSON Lines into `filePath`, creating parent
 * directories as needed. A non-empty payload ends with a newline; an empty
 * payload produces an empty file.
 */
async function writeJsonl(filePath, records) {
  await mkdir(path2.dirname(filePath), { recursive: true });
  const lines = records.map((record) => JSON.stringify(record));
  const payload = lines.join("\n");
  const contents = payload.length > 0 ? payload + "\n" : "";
  await writeFile2(filePath, contents, "utf8");
}
164
+
165
+ // src/chunk/chunk-store.ts
166
+ import path3 from "path";
167
/** Location of the chunk store inside a workspace: `<workspace>/chunks/chunks.jsonl`. */
function chunksFile(workspacePath) {
  const segments = [workspacePath, "chunks", "chunks.jsonl"];
  return path3.join(...segments);
}
170
/** Reads every chunk record stored in the workspace's chunk file. */
async function loadChunks(workspacePath) {
  const file = chunksFile(workspacePath);
  return readJsonl(file);
}
173
/**
 * Persists chunks to the workspace's chunk file, ordered by `id`.
 *
 * @param {string} workspacePath
 * @param {Array<{id: string}>} chunks - Left unmodified; sorting happens on a copy.
 * @returns {Promise<void>}
 */
async function saveChunks(workspacePath, chunks) {
  // Array.prototype.sort mutates in place; copy first so the caller's array
  // keeps its original order.
  const ordered = [...chunks].sort((a, b) => a.id.localeCompare(b.id));
  await writeJsonl(chunksFile(workspacePath), ordered);
}
176
+
177
+ // src/chunk/chunker.ts
178
/**
 * Splits markdown into sections, one per ATX heading (`#` .. `######`).
 *
 * Each section carries the heading trail leading to it (`headingPath`) and its
 * trimmed text, heading line included. Text before the first heading becomes a
 * section with an empty `headingPath`. Whitespace-only sections are dropped.
 *
 * Lines inside fenced code blocks (``` or ~~~) are never treated as headings,
 * so shell or Python comments in code samples do not split a section. An
 * unclosed fence extends to the end of the input.
 *
 * @param {string} markdown
 * @returns {Array<{headingPath: string[], text: string}>}
 */
function splitSections(markdown) {
  const sections = [];
  let headingPath = [];
  let current = [];
  // Marker (e.g. "```") of the fence we are currently inside, or null.
  let openFence = null;
  const flush = () => {
    const text = current.join("\n").trim();
    if (text.length > 0) {
      sections.push({ headingPath: [...headingPath], text });
    }
    current = [];
  };
  for (const rawLine of markdown.split("\n")) {
    // Match against the line without a trailing "\r": `.` does not match "\r",
    // so CRLF input would otherwise never be recognized as a heading. The raw
    // line is still what gets stored, keeping section text unchanged.
    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
    const fence = /^ {0,3}(`{3,}|~{3,})(.*)$/.exec(line);
    if (fence) {
      const marker = fence[1];
      const rest = fence[2];
      if (openFence === null) {
        openFence = marker;
      } else if (marker[0] === openFence[0] && marker.length >= openFence.length && rest.trim() === "") {
        // A closing fence uses the same character, is at least as long as the
        // opener, and has nothing after it.
        openFence = null;
      }
      current.push(rawLine);
      continue;
    }
    const match = openFence === null ? /^(#{1,6})\s+(.+)$/.exec(line) : null;
    if (match) {
      flush();
      const level = match[1].length;
      // Keep ancestors above this level and replace everything at or below it.
      headingPath = [...headingPath.slice(0, level - 1), match[2].trim()];
    }
    current.push(rawLine);
  }
  flush();
  return sections;
}
204
/**
 * Splits `text` into overlapping pieces of at most `maxChars` characters.
 *
 * A piece preferably ends at the last blank line ("\n\n") in its window when
 * that break lies past the window's midpoint and past `overlapChars`. Each
 * following piece starts `overlapChars` before the previous piece's end. Pieces
 * are trimmed and empty pieces are skipped.
 *
 * @param {string} text
 * @param {number} maxChars - Must be positive when `text` is longer than it.
 * @param {number} overlapChars - Non-positive or non-numeric values mean no overlap.
 * @returns {string[]}
 * @throws {RangeError} If `text` needs splitting but `maxChars` is not positive.
 */
function splitLongSection(text, maxChars, overlapChars) {
  if (text.length <= maxChars) {
    return [text];
  }
  if (!(maxChars > 0)) {
    // A non-positive window can never advance and would loop forever.
    throw new RangeError(`maxChars must be a positive number, got ${maxChars}`);
  }
  const overlap = overlapChars > 0 ? overlapChars : 0;
  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const hardEnd = Math.min(text.length, start + maxChars);
    let sliceEnd = hardEnd;
    if (hardEnd < text.length) {
      const paragraphBreak = text.slice(start, hardEnd).lastIndexOf("\n\n");
      if (paragraphBreak > maxChars / 2 && paragraphBreak > overlap) {
        sliceEnd = start + paragraphBreak;
      }
    }
    const slice = text.slice(start, sliceEnd).trim();
    if (slice.length === 0) {
      start = hardEnd;
      continue;
    }
    chunks.push(slice);
    if (sliceEnd >= text.length) {
      // The text is fully covered. Stepping back by the overlap here would
      // only emit a final piece that duplicates the tail of this one.
      break;
    }
    const nextStart = sliceEnd - overlap;
    // Guarantee forward progress even when the overlap is as large as the piece.
    start = nextStart > start ? nextStart : hardEnd;
  }
  return chunks;
}
232
/** Rough token count for `text`: its length divided by four, rounded up. */
function estimateTokens(text) {
  const charsPerToken = 4;
  return Math.ceil(text.length / charsPerToken);
}
235
/**
 * Turns one normalized markdown document into chunk records.
 *
 * Front matter is stripped with gray-matter, the body is split into heading
 * sections, and each section is split into size-bounded pieces. A section that
 * yields a single piece shorter than `min(40, minChars)` is skipped.
 *
 * @param {object} document - Document record (id, sourceId, title, uri, metadata, timestamps).
 * @param {string} markdown - Normalized markdown, possibly with front matter.
 * @param {object} config - Workspace config; reads `config.index.chunking`.
 * @param {Map<string, object>} [prior] - Previously stored chunks keyed by id.
 * @param {string} [seenAt] - ISO timestamp recorded as `lastSeenAt`.
 * @returns {object[]} Chunk records in document order.
 */
function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__ */ new Map(), seenAt = (/* @__PURE__ */ new Date()).toISOString()) {
  const { content } = matter(markdown);
  const detected = splitSections(content);
  // Fall back to one section titled after the document when nothing was found.
  const sections = detected.length > 0 ? detected : [{ headingPath: [document.title], text: content }];
  const chunks = [];
  for (const { headingPath, text: sectionText } of sections) {
    const { maxChars, overlapChars, minChars } = config.index.chunking;
    const pieces = splitLongSection(sectionText, maxChars, overlapChars);
    const isOnlyPiece = pieces.length === 1;
    for (const piece of pieces) {
      const text = piece.trim();
      if (isOnlyPiece && text.length < Math.min(40, minChars)) {
        continue;
      }
      const id = stableId("chunk", document.id, headingPath.join(" > "), text);
      const previous = prior.get(id);
      const contentHash = sha256(text);
      const unchanged = previous?.contentHash === contentHash;
      chunks.push({
        id,
        documentId: document.id,
        sourceId: document.sourceId,
        title: document.title,
        uri: document.uri,
        headingPath,
        text,
        tokenEstimate: estimateTokens(text),
        contentHash,
        metadata: document.metadata,
        firstSeenAt: previous?.firstSeenAt ?? document.firstSeenAt,
        lastSeenAt: seenAt,
        lastChangedAt: unchanged ? previous.lastChangedAt : document.lastChangedAt
      });
    }
  }
  return chunks;
}
269
/**
 * Re-chunks the documents of a workspace, optionally restricted to one source
 * and/or one document, and rewrites the chunk store.
 *
 * Chunks of documents that are not targeted are carried over untouched; chunks
 * of targeted documents are replaced by freshly built ones.
 *
 * @returns {Promise<{chunksWritten: number}>} Total chunks in the store afterwards.
 */
async function chunkDocuments({
  workspacePath,
  sourceId,
  documentId
}) {
  const config = await loadConfig(workspacePath);
  const documentsPath = path4.join(workspacePath, "documents", "documents.jsonl");
  const allDocuments = await readJsonl(documentsPath);
  const targets = allDocuments.filter((document) => {
    if (sourceId && document.sourceId !== sourceId) {
      return false;
    }
    if (documentId && document.id !== documentId) {
      return false;
    }
    return true;
  });
  const targetIds = new Set(targets.map((document) => document.id));
  const stored = await loadChunks(workspacePath);
  const prior = /* @__PURE__ */ new Map();
  const nextChunks = /* @__PURE__ */ new Map();
  for (const chunk of stored) {
    prior.set(chunk.id, chunk);
    if (!targetIds.has(chunk.documentId)) {
      nextChunks.set(chunk.id, chunk);
    }
  }
  for (const document of targets) {
    const markdown = await readFile3(document.normalizedPath, "utf8");
    const rebuilt = buildChunksForDocument(document, markdown, config, prior);
    for (const chunk of rebuilt) {
      nextChunks.set(chunk.id, chunk);
    }
  }
  await saveChunks(workspacePath, [...nextChunks.values()]);
  return { chunksWritten: nextChunks.size };
}
292
+
293
+ // src/core/constants.ts
294
// Version string for this package. The published release this bundle ships in
// is 0.1.1, so the constant is aligned with it.
// NOTE(review): confirm against package.json before merging.
var PACKAGE_VERSION = "0.1.1";
// Workspace directory name used when none is given explicitly.
var DEFAULT_WORKSPACE = ".kb";
296
+
297
+ // src/core/errors.ts
298
/**
 * Error type carrying a machine-readable `code`, the process `exitCode` to
 * use, and optional structured `details`.
 */
var CliError = class extends Error {
  code;
  exitCode;
  details;
  constructor(message, code, exitCode, details) {
    super(message);
    Object.assign(this, { code, exitCode, details });
    this.name = "CliError";
  }
};
310
+
311
+ // src/core/workspace.ts
312
+ import { mkdir as mkdir2, stat } from "fs/promises";
313
+ import path5 from "path";
314
// Sub-directories created inside every workspace, in creation order.
var DIRS = [
  "sources", "documents", "chunks",
  "raw", "normalized",
  "indexes", "vectors",
  "models", "models/huggingface",
  "runs", "logs"
];
327
/**
 * Creates the workspace directory tree and its default config file.
 *
 * @param {{workspacePath: string, force?: boolean}} options - `force` is
 *   forwarded to the config writer.
 * @returns {Promise<{workspacePath: string}>} The resolved absolute workspace path.
 */
async function ensureWorkspace({
  workspacePath,
  force = false
}) {
  const root = path5.resolve(workspacePath);
  // Root first, then each sub-directory in declaration order.
  const targets = [root, ...DIRS.map((dir) => path5.join(root, dir))];
  for (const target of targets) {
    await mkdir2(target, { recursive: true });
  }
  await writeDefaultConfig(root, force);
  return { workspacePath: root };
}
339
/**
 * Verifies that `workspacePath` is a directory containing `config.yaml`.
 *
 * @param {string} workspacePath
 * @returns {Promise<string>} The resolved absolute workspace path.
 * @throws {CliError} With code "WORKSPACE_ERROR" and exit code 3 when the path
 *   is missing, is not a directory, or has no config file.
 */
async function assertWorkspaceExists(workspacePath) {
  const resolved = path5.resolve(workspacePath);
  const invalid = () => new CliError(`workspace does not exist or is invalid: ${resolved}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  const info = await stat(resolved).catch(() => null);
  if (info === null) {
    throw invalid();
  }
  if (!info.isDirectory()) {
    throw new CliError(`workspace is not a directory: ${resolved}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  }
  const hasConfig = await stat(path5.join(resolved, "config.yaml")).then(() => true, () => false);
  if (!hasConfig) {
    throw invalid();
  }
  return resolved;
}
355
+
356
+ // src/index/querylight-indexer.ts
357
+ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
358
+ import path11 from "path";
359
+
360
+ // src/vector/dense.ts
361
+ import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
362
+ import { mkdir as mkdir4 } from "fs/promises";
363
+ import path8 from "path";
364
+
365
+ // src/vector/runtime.ts
366
+ import path6 from "path";
367
+ import { fileURLToPath } from "url";
368
+ import { execFile, execFileSync } from "child_process";
369
+
370
+ // src/core/files.ts
371
+ import { stat as stat2 } from "fs/promises";
372
/**
 * Reports whether `filePath` can be stat'ed.
 * Any stat failure, of whatever kind, is reported as `false`.
 */
async function fileExists(filePath) {
  return stat2(filePath).then(
    () => true,
    () => false
  );
}
380
+
381
+ // src/vector/runtime.ts
382
/**
 * Resolves a configured cache directory to an absolute path.
 * Absolute paths are returned unchanged; relative ones are resolved against
 * the workspace after dropping a leading ".kb/" segment.
 */
function resolveCacheDir(workspacePath, configuredPath) {
  if (path6.isAbsolute(configuredPath)) {
    return configuredPath;
  }
  const relative = configuredPath.replace(/^\.kb\//, "");
  return path6.resolve(workspacePath, relative);
}
385
/** Returns the parent of the directory that contains the module at `importMetaUrl`. */
function packageRootFromImportMeta(importMetaUrl) {
  const modulePath = fileURLToPath(importMetaUrl);
  const moduleDir = path6.dirname(modulePath);
  return path6.resolve(moduleDir, "..");
}
388
/**
 * Locates `scripts/sparse-encode.py` relative to the calling module.
 * Looks in `<root>/scripts` first and then `<root>/../scripts`.
 *
 * @returns {Promise<string>} Absolute path of the first candidate that exists.
 * @throws {Error} When neither candidate exists.
 */
async function sparseScriptPath(importMetaUrl) {
  const root = packageRootFromImportMeta(importMetaUrl);
  const candidates = [[root], [root, ".."]].map((prefix) => path6.join(...prefix, "scripts", "sparse-encode.py"));
  for (const candidate of candidates) {
    const found = await fileExists(candidate);
    if (found) {
      return path6.resolve(candidate);
    }
  }
  throw new Error(`sparse helper script not found; checked ${candidates.join(", ")}`);
}
401
/**
 * Resolves when `uv --version` runs successfully and rejects with the spawn or
 * exit error otherwise.
 */
async function ensureUvAvailable() {
  await new Promise((resolve2, reject) => {
    execFile("uv", ["--version"], (error) => {
      if (error) {
        reject(error);
      } else {
        resolve2();
      }
    });
  });
}
406
/**
 * Runs the sparse-encoding helper script through `uv run`, feeding `payload`
 * as JSON on stdin and returning the script's stdout as a string.
 *
 * The child process runs synchronously with `HF_HOME` pointed at the resolved
 * cache directory and a 1 GiB output buffer limit.
 */
async function runSparsePython({
  workspacePath,
  config,
  payload,
  importMetaUrl
}) {
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const scriptPath = await sparseScriptPath(importMetaUrl);
  const packages = ["torch", "transformers", "huggingface_hub"];
  const args = ["run", ...packages.flatMap((name) => ["--with", name]), "python", scriptPath];
  const options = {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 1024,
    input: JSON.stringify(payload),
    env: {
      ...process.env,
      HF_HOME: cacheDir
    }
  };
  return execFileSync("uv", args, options);
}
438
/**
 * Lazily imports `@huggingface/transformers`, points its cache at `cacheDir`,
 * enables local models, and returns its `env` and `pipeline` exports.
 */
async function getDenseTransformersRuntime(cacheDir) {
  const { env, pipeline } = await import("@huggingface/transformers");
  env.cacheDir = cacheDir;
  env.allowLocalModels = true;
  return { env, pipeline };
}
447
+
448
+ // src/vector/store.ts
449
+ import { mkdir as mkdir3, readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
450
+ import path7 from "path";
451
/** Directory holding vector artifacts: `<workspace>/vectors`. */
function vectorsDir(workspacePath) {
  const dirName = "vectors";
  return path7.join(workspacePath, dirName);
}
454
/** Directory holding model caches and pull markers: `<workspace>/models`. */
function modelsDir(workspacePath) {
  const dirName = "models";
  return path7.join(workspacePath, dirName);
}
457
/** Path of the latest dense vector payload: `<workspace>/vectors/dense.latest.json`. */
function denseVectorPath(workspacePath) {
  return path7.join(workspacePath, "vectors", "dense.latest.json");
}
460
/** Path of the latest dense vector metadata: `<workspace>/vectors/dense.latest.meta.json`. */
function denseMetaPath(workspacePath) {
  return path7.join(workspacePath, "vectors", "dense.latest.meta.json");
}
463
/** Path of the latest sparse vector payload: `<workspace>/vectors/sparse.latest.json`. */
function sparseVectorPath(workspacePath) {
  return path7.join(workspacePath, "vectors", "sparse.latest.json");
}
466
/** Path of the latest sparse vector metadata: `<workspace>/vectors/sparse.latest.meta.json`. */
function sparseMetaPath(workspacePath) {
  return path7.join(workspacePath, "vectors", "sparse.latest.meta.json");
}
469
/** Path of the marker file written after a dense model pull: `<workspace>/models/dense.pulled.json`. */
function densePullMarker(workspacePath) {
  return path7.join(workspacePath, "models", "dense.pulled.json");
}
472
/** Path of the marker file written after a sparse model pull: `<workspace>/models/sparse.pulled.json`. */
function sparsePullMarker(workspacePath) {
  return path7.join(workspacePath, "models", "sparse.pulled.json");
}
475
/**
 * Writes the dense payload and, separately, its `metadata` as pretty-printed
 * JSON into the workspace's vectors directory (created if missing).
 */
async function writeDensePayload(workspacePath, payload) {
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
  const payloadJson = JSON.stringify(payload, null, 2);
  await writeFile3(denseVectorPath(workspacePath), payloadJson, "utf8");
  const metadataJson = JSON.stringify(payload.metadata, null, 2);
  await writeFile3(denseMetaPath(workspacePath), metadataJson, "utf8");
}
480
/** Reads and parses the latest dense vector payload of the workspace. */
async function readDensePayload(workspacePath) {
  const raw = await readFile4(denseVectorPath(workspacePath), "utf8");
  return JSON.parse(raw);
}
483
/**
 * Writes the sparse payload and, separately, its `metadata` as pretty-printed
 * JSON into the workspace's vectors directory (created if missing).
 */
async function writeSparsePayload(workspacePath, payload) {
  await mkdir3(vectorsDir(workspacePath), { recursive: true });
  const payloadJson = JSON.stringify(payload, null, 2);
  await writeFile3(sparseVectorPath(workspacePath), payloadJson, "utf8");
  const metadataJson = JSON.stringify(payload.metadata, null, 2);
  await writeFile3(sparseMetaPath(workspacePath), metadataJson, "utf8");
}
488
/** Reads and parses the latest sparse vector payload of the workspace. */
async function readSparsePayload(workspacePath) {
  const raw = await readFile4(sparseVectorPath(workspacePath), "utf8");
  return JSON.parse(raw);
}
491
/** Records `value` as pretty-printed JSON in the dense pull marker file. */
async function writeDensePullMarker(workspacePath, value) {
  await mkdir3(modelsDir(workspacePath), { recursive: true });
  const markerJson = JSON.stringify(value, null, 2);
  await writeFile3(densePullMarker(workspacePath), markerJson, "utf8");
}
495
/** Records `value` as pretty-printed JSON in the sparse pull marker file. */
async function writeSparsePullMarker(workspacePath, value) {
  await mkdir3(modelsDir(workspacePath), { recursive: true });
  const markerJson = JSON.stringify(value, null, 2);
  await writeFile3(sparsePullMarker(workspacePath), markerJson, "utf8");
}
499
/**
 * Summarizes the state of the dense and sparse models for a workspace.
 *
 * For each model: whether it is enabled in config, its model id, the resolved
 * cache directory, whether its pull marker exists (`available`) and whether
 * its vector artifact exists (`artifactExists`). The sparse entry also echoes
 * `uvAvailable`.
 */
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
  const sparseCacheDir = resolveCacheDir(workspacePath, sparse.cacheDir);
  const densePulled = await fileExists(densePullMarker(workspacePath));
  const denseBuilt = await fileExists(denseVectorPath(workspacePath));
  const sparsePulled = await fileExists(sparsePullMarker(workspacePath));
  const sparseBuilt = await fileExists(sparseVectorPath(workspacePath));
  const denseStatus = {
    configured: dense.enabled,
    modelId: dense.modelId,
    cacheDir: denseCacheDir,
    available: densePulled,
    artifactExists: denseBuilt
  };
  const sparseStatus = {
    configured: sparse.enabled,
    modelId: sparse.modelId,
    cacheDir: sparseCacheDir,
    uvAvailable,
    available: sparsePulled,
    artifactExists: sparseBuilt
  };
  return { dense: denseStatus, sparse: sparseStatus };
}
520
+
521
+ // src/vector/text.ts
522
/**
 * Builds the text embedded for a chunk by the dense model: title, each heading
 * of the heading path, then the chunk text, separated by blank lines. Empty
 * parts are left out.
 */
function createDenseChunkText(chunk) {
  const parts = [chunk.title, ...chunk.headingPath, chunk.text];
  const present = parts.filter((part) => Boolean(part));
  return present.join("\n\n");
}
525
/**
 * Builds the text encoded for a chunk by the sparse model: title, each heading
 * of the heading path, then the chunk text, separated by blank lines. Empty
 * parts are left out.
 */
function createSparseChunkText(chunk) {
  const parts = [chunk.title, ...chunk.headingPath, chunk.text];
  const present = parts.filter((part) => Boolean(part));
  return present.join("\n\n");
}
528
+
529
+ // src/vector/dense.ts
530
// Optional replacement for the embedder factory; when set, createEmbedder
// delegates to it instead of loading the transformers runtime.
var denseEmbedderFactory = null;
/**
 * Creates an async `text -> embedding` function for `modelId`.
 *
 * Uses the feature-extraction pipeline with mean pooling and normalization and
 * returns the first row of the resulting tensor as a plain array.
 */
async function createEmbedder(cacheDir, modelId) {
  if (denseEmbedderFactory) {
    return denseEmbedderFactory(cacheDir, modelId);
  }
  const { pipeline } = await getDenseTransformersRuntime(cacheDir);
  const extractor = await pipeline("feature-extraction", modelId);
  const embed = async (text) => {
    const output = await extractor(text, { pooling: "mean", normalize: true });
    const [firstRow] = output.tolist();
    return firstRow;
  };
  return embed;
}
542
/**
 * Ensures the dense model is present in the cache directory by creating an
 * embedder and running one throw-away embedding through it.
 */
async function pullDenseModel(workspacePath, config) {
  const { cacheDir: configuredDir, modelId } = config;
  const cacheDir = resolveCacheDir(workspacePath, configuredDir);
  await mkdir4(cacheDir, { recursive: true });
  const embed = await createEmbedder(cacheDir, modelId);
  await embed("warm dense model cache");
}
548
/**
 * Embeds every stored chunk with the dense model, builds a vector index over
 * the embeddings, and writes the resulting payload to the workspace.
 *
 * @param {{workspacePath: string, config: object}} options - `config` is the
 *   dense retrieval section (modelId, cacheDir, indexHashTables, indexRandomSeed).
 * @returns {Promise<object>} The written payload: `{ metadata, indexState, chunks }`.
 */
async function buildDenseVectors({
  workspacePath,
  config
}) {
  const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  await mkdir4(cacheDir, { recursive: true });
  const embed = await createEmbedder(cacheDir, config.modelId);
  const records = [];
  // Taken from the first non-empty embedding; stays 0 when there are no chunks.
  let dimensions = 0;
  for (const chunk of chunks) {
    const embedding = await embed(createDenseChunkText(chunk));
    if (!dimensions) {
      dimensions = embedding.length;
    }
    const { id: chunkId, documentId, sourceId, title, uri, headingPath, text } = chunk;
    records.push({ chunkId, documentId, sourceId, title, uri, headingPath, text, embedding });
  }
  const index = new VectorFieldIndex({
    numHashTables: config.indexHashTables,
    dimensions,
    random: createSeededRandom(config.indexRandomSeed)
  });
  for (const { chunkId, embedding } of records) {
    index.insert(chunkId, [embedding]);
  }
  const metadata = {
    createdAt: (/* @__PURE__ */ new Date()).toISOString(),
    modelId: config.modelId,
    dimensions,
    hashTables: config.indexHashTables,
    randomSeed: config.indexRandomSeed,
    chunkCount: records.length,
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const payload = { metadata, indexState: index.indexState, chunks: records };
  await writeDensePayload(workspacePath, payload);
  return payload;
}
597
/**
 * Embeds `query` and searches the stored dense index for the `topK` nearest
 * chunks. The index is rebuilt from the stored payload using the hash-table
 * count, dimensions and random seed recorded in its metadata.
 */
async function denseQuery({
  workspacePath,
  config,
  query,
  topK
}) {
  const payload = await readDensePayload(workspacePath);
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const embed = await createEmbedder(cacheDir, config.modelId);
  const vector = await embed(query);
  const { hashTables, dimensions, randomSeed } = payload.metadata;
  const emptyIndex = new VectorFieldIndex({
    numHashTables: hashTables,
    dimensions,
    random: createSeededRandom(randomSeed)
  });
  const index = emptyIndex.loadState(payload.indexState);
  return index.query(vector, topK);
}
614
+
615
+ // src/vector/sparse.ts
616
+ import { SparseVectorFieldIndex } from "@tryformation/querylight-ts";
617
+ import { mkdir as mkdir5 } from "fs/promises";
618
+ import path9 from "path";
619
// Optional replacements for the sparse query encoder and the sparse document
// builder; when set, the corresponding functions delegate to them.
var sparseQueryEncoderFactory = null;
var sparseDocumentBuilderFactory = null;
621
/**
 * Builds a sparse query vector from token ids.
 *
 * Each distinct token id is looked up in `tokenWeights`; ids with a weight
 * greater than zero become `"<id>": weight` entries, all others are dropped.
 */
function buildSparseQueryVector(tokenIds, tokenWeights) {
  const sparseVector = {};
  const distinctIds = new Set(tokenIds);
  for (const tokenId of distinctIds) {
    const weight = tokenWeights[tokenId] ?? 0;
    if (!(weight > 0)) {
      continue;
    }
    sparseVector[String(tokenId)] = weight;
  }
  return sparseVector;
}
631
/**
 * Coerces tokenizer output into a flat array of finite numbers.
 *
 * Accepted shapes: an object whose `data` is an array or typed array, a nested
 * array (its first row is used), or a flat array. Anything else yields `[]`.
 */
function normalizeTokenIds(value) {
  const toFiniteNumbers = (items) => Array.from(items, (item) => Number(item)).filter((item) => Number.isFinite(item));
  const hasData = Boolean(value) && typeof value === "object" && "data" in value;
  if (hasData) {
    const { data } = value;
    if (Array.isArray(data) || ArrayBuffer.isView(data)) {
      return toFiniteNumbers(data);
    }
  }
  if (!Array.isArray(value) || value.length === 0) {
    return [];
  }
  const [first] = value;
  return toFiniteNumbers(Array.isArray(first) ? first : value);
}
652
/**
 * Creates an async `text -> sparse vector` encoder for queries.
 *
 * The query is tokenized with the model's tokenizer (truncation on, no
 * attention mask or token type ids) and the token ids are weighted with
 * `queryTokenWeights`.
 */
async function createSparseQueryEncoder(cacheDir, modelId, queryTokenWeights) {
  if (sparseQueryEncoderFactory) {
    return sparseQueryEncoderFactory(cacheDir, modelId, queryTokenWeights);
  }
  const runtime = await getDenseTransformersRuntime(cacheDir);
  const { AutoTokenizer } = await import("@huggingface/transformers");
  runtime.env.cacheDir = cacheDir;
  const tokenizer = await AutoTokenizer.from_pretrained(modelId);
  const tokenizerOptions = {
    truncation: true,
    return_attention_mask: false,
    return_token_type_ids: false
  };
  const encode = async (text) => {
    const features = await tokenizer([text], tokenizerOptions);
    const tokenIds = normalizeTokenIds(features.input_ids);
    return buildSparseQueryVector(tokenIds, queryTokenWeights);
  };
  return encode;
}
669
/**
 * Downloads the sparse model into the cache directory by running the Python
 * helper with the "download_only" action. Requires `uv` to be available.
 */
async function pullSparseModel(workspacePath, config) {
  await ensureUvAvailable();
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  await mkdir5(cacheDir, { recursive: true });
  const payload = {
    action: "download_only",
    model_id: config.modelId
  };
  await runSparsePython({ workspacePath, config, importMetaUrl: import.meta.url, payload });
}
683
/**
 * Encodes chunks into sparse vectors through the Python helper.
 *
 * @returns {Promise<{queryTokenWeights: unknown, vocabularySize: unknown, chunks: object[]}>}
 *   One record per input chunk; chunks missing from the helper output get an
 *   empty vector.
 */
async function buildSparseDocuments(workspacePath, config, chunks) {
  if (sparseDocumentBuilderFactory) {
    return sparseDocumentBuilderFactory(workspacePath, config, chunks);
  }
  await ensureUvAvailable();
  const documents = chunks.map((chunk) => ({
    chunkId: chunk.id,
    text: createSparseChunkText(chunk)
  }));
  const stdout = await runSparsePython({
    workspacePath,
    config,
    importMetaUrl: import.meta.url,
    payload: {
      action: "encode_documents",
      model_id: config.modelId,
      top_tokens: config.documentTopTokens,
      documents
    }
  });
  const output = JSON.parse(stdout);
  const vectorsByChunkId = /* @__PURE__ */ new Map();
  for (const document of output.documents) {
    vectorsByChunkId.set(document.chunkId, document.vector);
  }
  // NOTE(review): the helper output is read with mixed key styles
  // (`query_token_weights` vs `vocabularySize`) — confirm against the script.
  return {
    queryTokenWeights: output.query_token_weights,
    vocabularySize: output.vocabularySize,
    chunks: chunks.map(({ id, documentId, sourceId, title, uri, headingPath, text }) => ({
      chunkId: id,
      documentId,
      sourceId,
      title,
      uri,
      headingPath,
      text,
      vector: vectorsByChunkId.get(id) ?? {}
    }))
  };
}
718
/**
 * Encodes every stored chunk with the sparse model, builds a sparse vector
 * index, and writes the resulting payload to the workspace.
 *
 * @returns {Promise<object>} The written payload:
 *   `{ metadata, indexState, chunks, queryTokenWeights }`.
 */
async function buildSparseVectors({
  workspacePath,
  config
}) {
  const chunksPath = path9.join(workspacePath, "chunks", "chunks.jsonl");
  const chunks = await readJsonl(chunksPath);
  const { chunks: records, vocabularySize, queryTokenWeights } = await buildSparseDocuments(workspacePath, config, chunks);
  const index = new SparseVectorFieldIndex();
  for (const { chunkId, vector } of records) {
    index.insert(chunkId, [vector]);
  }
  const metadata = {
    createdAt: (/* @__PURE__ */ new Date()).toISOString(),
    modelId: config.modelId,
    vocabularySize,
    documentTopTokens: config.documentTopTokens,
    queryEncoding: config.queryEncoding,
    documentEncoding: config.documentEncoding,
    chunkCount: records.length,
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const payload = {
    metadata,
    indexState: index.indexState,
    chunks: records,
    queryTokenWeights
  };
  await writeSparsePayload(workspacePath, payload);
  return payload;
}
747
/**
 * Encodes `query` with the stored query token weights and searches the stored
 * sparse index for the `topK` best matches.
 */
async function sparseQuery({
  workspacePath,
  config,
  query,
  topK
}) {
  const { queryTokenWeights, indexState } = await readSparsePayload(workspacePath);
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const encode = await createSparseQueryEncoder(cacheDir, config.modelId, queryTokenWeights);
  const vector = await encode(query);
  const emptyIndex = new SparseVectorFieldIndex();
  const index = emptyIndex.loadState(indexState);
  return index.query(vector, topK);
}
760
+
761
+ // src/vector/service.ts
762
/**
 * Decides which models to pull.
 *
 * If either flag is set, the flags are returned as given. With no flag set,
 * the dense model is always pulled and the sparse model only when `uv` is
 * available.
 */
function resolveModelPullPlan({
  pullDenseFlag,
  pullSparseFlag,
  uvAvailable
}) {
  const nothingRequested = !pullDenseFlag && !pullSparseFlag;
  if (nothingRequested) {
    return { pullDense: true, pullSparse: uvAvailable };
  }
  return { pullDense: pullDenseFlag, pullSparse: pullSparseFlag };
}
778
/**
 * Builds dense and/or sparse vector artifacts for a workspace.
 *
 * An explicit override decides for its model. Without one, the model's
 * `enabled` config flag is used; with `buildAvailableModels`, a model that has
 * already been pulled also counts, and the sparse model additionally requires
 * `uv` to be available.
 *
 * @returns {Promise<{dense?: object, sparse?: object}>} Payloads of what was built.
 */
async function buildVectorArtifacts({
  workspacePath,
  config,
  denseOverride,
  sparseOverride,
  buildAvailableModels = false
}) {
  let modelStatus = null;
  if (buildAvailableModels) {
    let uvAvailable;
    try {
      await ensureUvAvailable();
      uvAvailable = true;
    } catch {
      uvAvailable = false;
    }
    modelStatus = await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
  }
  const denseByConfig = () => {
    if (!buildAvailableModels) {
      return config.retrieval.dense.enabled;
    }
    return config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available);
  };
  const sparseByConfig = () => {
    if (!buildAvailableModels) {
      return config.retrieval.sparse.enabled;
    }
    return (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable);
  };
  // Config is consulted only when no override is given.
  const denseEnabled = denseOverride ?? denseByConfig();
  const sparseEnabled = sparseOverride ?? sparseByConfig();
  const result2 = {};
  if (denseEnabled) {
    result2.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
  }
  if (sparseEnabled) {
    result2.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
  }
  return result2;
}
804
/**
 * Pulls the requested models and writes a marker file (timestamp and model id)
 * after each successful pull. Dense is handled before sparse.
 */
async function pullModels({
  workspacePath,
  config,
  pullDense,
  pullSparse
}) {
  const { retrieval } = config;
  if (pullDense) {
    await pullDenseModel(workspacePath, retrieval.dense);
    const marker = {
      pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
      modelId: retrieval.dense.modelId
    };
    await writeDensePullMarker(workspacePath, marker);
  }
  if (pullSparse) {
    await pullSparseModel(workspacePath, retrieval.sparse);
    const marker = {
      pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
      modelId: retrieval.sparse.modelId
    };
    await writeSparsePullMarker(workspacePath, marker);
  }
}
825
/**
 * Probes for `uv` and returns the model status report for the workspace.
 * A failed probe is reported as `uvAvailable: false` rather than thrown.
 */
async function getModelStatus(workspacePath, config) {
  const uvAvailable = await ensureUvAvailable().then(
    () => true,
    () => false
  );
  const { dense, sparse } = config.retrieval;
  return buildModelStatus(workspacePath, dense, sparse, uvAvailable);
}
835
+
836
+ // src/index/index-store.ts
837
+ import { readFile as readFile5, writeFile as writeFile4 } from "fs/promises";
838
+ import path10 from "path";
839
/**
 * Writes an index snapshot and its metadata under `<workspace>/indexes`.
 *
 * Four files are written: a timestamped pair named after `metadata.createdAt`
 * (with ":" and "." replaced by "-") and a `latest.json` / `latest.meta.json`
 * pair holding the same content.
 *
 * @param {{workspacePath: string, indexState: unknown, metadata: {createdAt: string}}} options
 * @returns {Promise<{indexPath: string, metadataPath: string}>} Paths of the
 *   `latest` pair.
 */
async function writeIndexArtifacts({
  workspacePath,
  indexState,
  metadata
}) {
  const indexesDir = path10.join(workspacePath, "indexes");
  // Create the directory like the vector-store writers do, so a missing
  // `indexes/` folder does not fail the write with ENOENT.
  await mkdir(indexesDir, { recursive: true });
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");
  const indexPath = path10.join(indexesDir, `${stamp}.json`);
  const metaPath = path10.join(indexesDir, `${stamp}.meta.json`);
  const latestIndexPath = path10.join(indexesDir, "latest.json");
  const latestMetaPath = path10.join(indexesDir, "latest.meta.json");
  const indexPayload = JSON.stringify(indexState, null, 2);
  const metaPayload = JSON.stringify(metadata, null, 2);
  await writeFile4(indexPath, indexPayload, "utf8");
  await writeFile4(metaPath, metaPayload, "utf8");
  await writeFile4(latestIndexPath, indexPayload, "utf8");
  await writeFile4(latestMetaPath, metaPayload, "utf8");
  return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
}
857
/**
 * Loads and parses `<workspace>/indexes/latest.json`.
 * Rejects if the file cannot be read or is not valid JSON.
 *
 * @param {string} workspacePath
 * @returns {Promise<*>} The parsed index state.
 */
async function readLatestIndexState(workspacePath) {
  const latestPath = path10.join(workspacePath, "indexes", "latest.json");
  const serialized = await readFile5(latestPath, "utf8");
  return JSON.parse(serialized);
}
860
/**
 * Loads and parses `<workspace>/indexes/latest.meta.json`.
 * Rejects if the file cannot be read or is not valid JSON.
 *
 * @param {string} workspacePath
 * @returns {Promise<*>} The parsed index metadata.
 */
async function readLatestIndexMetadata(workspacePath) {
  const latestMetaPath = path10.join(workspacePath, "indexes", "latest.meta.json");
  const serialized = await readFile5(latestMetaPath, "utf8");
  return JSON.parse(serialized);
}
863
+
864
+ // src/index/querylight-indexer.ts
865
/**
 * Creates a BM25 text field index whose analyzer combines a lower-case filter
 * with a keyword tokenizer. The same analyzer instance is passed for both
 * analyzer arguments of `TextFieldIndex`.
 *
 * @returns {TextFieldIndex}
 */
function keywordFieldIndex() {
  const filters = [new LowerCaseTextFilter()];
  const tokenizer = new KeywordTokenizer();
  const keywordAnalyzer = new Analyzer(filters, tokenizer);
  return new TextFieldIndex(keywordAnalyzer, keywordAnalyzer, RankingAlgorithm.BM25);
}
869
/**
 * Builds the field-name → field-index mapping for the document index.
 * `text` and `title` use default-analyzer BM25 indexes; `uri`, `sourceId`,
 * `tags`, `sourceType` and every name in `extraFields` use keyword indexes.
 * An extra field with the same name as a built-in one replaces it.
 *
 * @param {Iterable<string>} [extraFields=[]] - Additional keyword field names.
 * @returns {Record<string, TextFieldIndex>}
 */
function createIndexMapping(extraFields = []) {
  const bm25TextField = () => new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
  const mapping = {
    text: bm25TextField(),
    title: bm25TextField()
  };
  const keywordFields = ["uri", "sourceId", "tags", "sourceType", ...extraFields];
  for (const name of keywordFields) {
    mapping[name] = keywordFieldIndex();
  }
  return mapping;
}
884
/**
 * Turns a metadata record into index fields: every non-nullish entry becomes
 * `metadata.<key>` mapped to an array of lower-cased strings (arrays are
 * converted element by element, scalars become a one-element array).
 *
 * @param {Record<string, unknown>} metadata
 * @returns {Record<string, string[]>}
 */
function flattenMetadata(metadata) {
  const lower = (item) => String(item).toLowerCase();
  const pairs = Object.entries(metadata)
    .filter(([, value]) => value != null)
    .map(([key, value]) => [
      `metadata.${key}`,
      Array.isArray(value) ? value.map((item) => lower(item)) : [lower(value)]
    ]);
  return Object.fromEntries(pairs);
}
899
/**
 * Rebuilds the lexical index from the workspace's chunk store, writes the
 * index artifacts, then builds vector artifacts.
 *
 * @param {object} args
 * @param {string} args.workspacePath
 * @param {*} [args.denseOverride] - Passed through to `buildVectorArtifacts`.
 * @param {*} [args.sparseOverride] - Passed through to `buildVectorArtifacts`.
 * @param {boolean} [args.buildAvailableModels=false] - Passed through to `buildVectorArtifacts`.
 * @returns {Promise<{metadata: object, indexPath: string, denseBuilt: boolean, sparseBuilt: boolean}>}
 */
async function buildIndex({
  workspacePath,
  denseOverride,
  sparseOverride,
  buildAvailableModels = false
}) {
  const config = await loadConfig(workspacePath);
  const readStore = (folder, fileName) => readJsonl(path11.join(workspacePath, folder, fileName));
  const chunks = await readStore("chunks", "chunks.jsonl");
  const documents = await readStore("documents", "documents.jsonl");
  const sources = await readStore("sources", "sources.jsonl");

  // Every metadata key seen on any chunk gets its own keyword field.
  const metadataFieldNames = new Set();
  for (const chunk of chunks) {
    for (const key of Object.keys(chunk.metadata)) {
      metadataFieldNames.add(`metadata.${key}`);
    }
  }
  const index = new DocumentIndex(createIndexMapping([...metadataFieldNames]));

  for (const chunk of chunks) {
    const { metadata: chunkMetadata } = chunk;
    const tags = Array.isArray(chunkMetadata.tags)
      ? chunkMetadata.tags.map((tag) => String(tag).toLowerCase())
      : [];
    index.index({
      id: chunk.id,
      fields: {
        text: [chunk.text],
        title: [chunk.title],
        uri: [chunk.uri.toLowerCase()],
        sourceId: [chunk.sourceId.toLowerCase()],
        tags,
        sourceType: [String(chunkMetadata.sourceType ?? "").toLowerCase()],
        ...flattenMetadata(chunkMetadata)
      }
    });
  }

  const createdAt = new Date().toISOString();
  const metadata = {
    id: `index_${createdAt.replace(/[:.]/g, "-")}`,
    createdAt,
    querylightVersion: "0.10.0",
    kbVersion: "0.1.0",
    documentCount: documents.length,
    chunkCount: chunks.length,
    sourceCount: sources.length,
    fields: Object.keys(index.mapping),
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const { indexPath } = await writeIndexArtifacts({
    workspacePath,
    indexState: index.indexState,
    metadata
  });
  const vectors = await buildVectorArtifacts({
    workspacePath,
    config,
    denseOverride,
    sparseOverride,
    buildAvailableModels
  });
  return {
    metadata,
    indexPath,
    denseBuilt: Boolean(vectors.dense),
    sparseBuilt: Boolean(vectors.sparse)
  };
}
952
+
953
+ // src/ingest/ingest-service.ts
954
+ import path17 from "path";
955
+
956
+ // src/core/runs.ts
957
+ import path12 from "path";
958
/**
 * Stores a run record at `<workspace>/runs/<run.id>.json` as a one-entry
 * JSONL file.
 *
 * @param {string} workspacePath
 * @param {{id: string}} run
 * @returns {Promise<void>}
 */
async function writeRun(workspacePath, run) {
  const runFile = path12.join(workspacePath, "runs", `${run.id}.json`);
  await writeJsonl(runFile, [run]);
}
961
/**
 * Lists stored run records ordered by `createdAt` (ascending, via
 * `localeCompare`). Only the first record of each `*.json` file under
 * `<workspace>/runs` is used. Any failure while reading or sorting yields an
 * empty list.
 *
 * @param {string} workspacePath
 * @returns {Promise<object[]>}
 */
async function listRuns(workspacePath) {
  const { readdir } = await import("fs/promises");
  const runsDir = path12.join(workspacePath, "runs");
  try {
    const names = await readdir(runsDir);
    const runFiles = names.filter((name) => name.endsWith(".json"));
    const firstRecords = await Promise.all(
      runFiles.map(async (name) => {
        const [first] = await readJsonl(path12.join(runsDir, name));
        return first;
      })
    );
    const present = firstRecords.filter((record) => record != null);
    present.sort((left, right) => left.createdAt.localeCompare(right.createdAt));
    return present;
  } catch {
    return [];
  }
}
975
+
976
+ // src/sources/source-store.ts
977
+ import path13 from "path";
978
// Location of the source registry inside a workspace.
var sourcesFile = (workspacePath) => {
  return path13.join(workspacePath, "sources", "sources.jsonl");
};
979
/**
 * Reads every source record from the workspace's source registry.
 *
 * @param {string} workspacePath
 * @returns {Promise<object[]>}
 */
async function listSources(workspacePath) {
  const registryPath = sourcesFile(workspacePath);
  return readJsonl(registryPath);
}
982
/**
 * Appends a source to the registry.
 * The id defaults to `stableId("src", type, uri)` when the source has none.
 *
 * @param {string} workspacePath
 * @param {object} source - Must carry `uri` and `type`; `id` is optional.
 * @returns {Promise<object>} The stored record (including its id).
 * @throws {CliError} `DUPLICATE_SOURCE` when a source with the same URI exists.
 */
async function addSource(workspacePath, source) {
  const existing = await listSources(workspacePath);
  const isDuplicate = existing.some((candidate) => candidate.uri === source.uri);
  if (isDuplicate) {
    throw new CliError(`duplicate source URI: ${source.uri}`, "DUPLICATE_SOURCE", 4 /* SourceError */);
  }
  const stored = {
    ...source,
    id: source.id ?? stableId("src", source.type, source.uri)
  };
  await writeJsonl(sourcesFile(workspacePath), [...existing, stored]);
  return stored;
}
993
/**
 * Applies a patch to a stored source and rewrites the registry.
 * Top-level fields are overwritten by the patch; `metadata` and `crawl` are
 * shallow-merged with the existing values; the id is never changed.
 *
 * @param {string} workspacePath
 * @param {string} sourceId
 * @param {object} patch
 * @returns {Promise<object>} The updated record.
 * @throws {CliError} `SOURCE_NOT_FOUND` when no source has `sourceId`.
 */
async function updateSource(workspacePath, sourceId, patch) {
  const sources = await listSources(workspacePath);
  const position = sources.findIndex((source) => source.id === sourceId);
  if (position === -1) {
    throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
  }
  const current = sources[position];
  const metadata = patch.metadata
    ? { ...current.metadata, ...patch.metadata }
    : current.metadata;
  const crawl = patch.crawl
    ? { ...(current.crawl ?? {}), ...patch.crawl }
    : current.crawl;
  const updated = { ...current, ...patch, id: sourceId, metadata, crawl };
  sources.splice(position, 1, updated);
  await writeJsonl(sourcesFile(workspacePath), sources);
  return updated;
}
1014
/**
 * Deletes a source from the registry.
 *
 * @param {string} workspacePath
 * @param {string} sourceId
 * @returns {Promise<void>}
 * @throws {CliError} `SOURCE_NOT_FOUND` when no source has `sourceId`.
 */
async function removeSource(workspacePath, sourceId) {
  const sources = await listSources(workspacePath);
  const remaining = sources.filter(({ id }) => id !== sourceId);
  const nothingRemoved = remaining.length === sources.length;
  if (nothingRemoved) {
    throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
  }
  await writeJsonl(sourcesFile(workspacePath), remaining);
}
1022
+
1023
+ // src/ingest/document-utils.ts
1024
+ import { mkdir as mkdir6, rm, writeFile as writeFile5 } from "fs/promises";
1025
+ import path14 from "path";
1026
+
1027
+ // src/normalize/normalize-markdown.ts
1028
+ import matter2 from "gray-matter";
1029
/**
 * Normalizes whitespace in a text block: CRLF becomes LF, trailing spaces and
 * tabs are removed from lines, runs of three or more newlines collapse to two,
 * and the result is trimmed.
 *
 * @param {string} text
 * @returns {string}
 */
function normalizeWhitespace(text) {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  const noTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, "\n");
  const collapsed = noTrailingBlanks.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
1032
/**
 * Serializes a markdown body with front matter via gray-matter's `stringify`.
 * The body is whitespace-normalized first.
 *
 * @param {object} metadata - Front-matter data.
 * @param {string} body - Markdown body.
 * @returns {string}
 */
function withFrontmatter(metadata, body) {
  const normalizedBody = normalizeWhitespace(body);
  return matter2.stringify(normalizedBody, metadata);
}
1035
+
1036
+ // src/ingest/document-utils.ts
1037
/**
 * Returns the value unchanged (`undefined` stays `undefined`). In this bundled
 * output the function is an identity; callers compare the result against
 * `undefined` to drop unset entries.
 *
 * @param {*} value
 * @returns {*}
 */
function asMetadataValue(value) {
  return value;
}
1040
/**
 * Assembles the metadata record for a document.
 * Precedence, lowest to highest: `source.metadata`, `extra`, then the fixed
 * fields (`tags`, `sourceType`, `sourceUri`, `publicationDate`, `crawledAt`,
 * `indexedAt`). Entries whose value is `undefined` are dropped; `null` is kept.
 *
 * @param {object} args
 * @param {object} args.source - `metadata`, `tags` and `type` are read.
 * @param {string} args.sourceUri
 * @param {string|null} [args.publicationDate] - Stored as `null` when nullish.
 * @param {string} args.crawledAt
 * @param {string} args.indexedAt
 * @param {object} [args.extra={}]
 * @returns {object}
 */
function buildDocumentMetadata({
  source,
  sourceUri,
  publicationDate,
  crawledAt,
  indexedAt,
  extra = {}
}) {
  const fixedFields = {
    tags: source.tags,
    sourceType: source.type,
    sourceUri,
    publicationDate: publicationDate ?? null,
    crawledAt,
    indexedAt
  };
  const merged = { ...source.metadata, ...extra, ...fixedFields };
  const isSet = ([, value]) => asMetadataValue(value) !== void 0;
  return Object.fromEntries(Object.entries(merged).filter(isSet));
}
1063
/**
 * Writes a normalized markdown document (front matter + body) to
 * `normalizedPath`, creating the parent directory first.
 *
 * @param {object} args - Front-matter fields (`documentId`, `sourceId`, `title`,
 *   `uri`, `sourceUri`, `publicationDate`, `crawledAt`, `indexedAt`,
 *   `contentHash`, `lastChangedAt`) plus `normalizedPath` and `markdown`.
 *   A nullish `publicationDate` is written as `null`.
 * @returns {Promise<void>}
 */
async function writeNormalizedDocument({
  documentId,
  sourceId,
  title,
  uri,
  sourceUri,
  publicationDate,
  crawledAt,
  indexedAt,
  contentHash,
  lastChangedAt,
  normalizedPath,
  markdown
}) {
  const targetDir = path14.dirname(normalizedPath);
  await mkdir6(targetDir, { recursive: true });
  const frontmatter = {
    documentId,
    sourceId,
    title,
    uri,
    sourceUri,
    publicationDate: publicationDate ?? null,
    crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt
  };
  const fileContents = withFrontmatter(frontmatter, markdown);
  await writeFile5(normalizedPath, fileContents, "utf8");
}
1098
/**
 * Removes a document's files from disk: the raw copy when `rawPath` is set,
 * and the normalized markdown. Missing files are ignored (`force: true`).
 *
 * @param {{rawPath?: string, normalizedPath: string}} document
 * @returns {Promise<void>}
 */
async function deleteDocumentArtifacts(document) {
  const removals = [];
  if (document.rawPath) {
    removals.push(rm(document.rawPath, { force: true }));
  }
  removals.push(rm(document.normalizedPath, { force: true }));
  await Promise.all(removals);
}
1104
+
1105
+ // src/ingest/adapters/directory-adapter.ts
1106
+ import fg from "fast-glob";
1107
+ import path15 from "path";
1108
/**
 * Lists the files of a directory source as sorted absolute paths.
 * Uses the source's include patterns when non-empty, otherwise a default set
 * (markdown, text, HTML, PDF, DOCX). Dotfiles are skipped and symbolic links
 * are not followed.
 *
 * @param {object} source - `uri` is the directory; `crawl.includePatterns` /
 *   `crawl.excludePatterns` are optional.
 * @returns {Promise<string[]>}
 */
async function listDirectoryFiles(source) {
  const defaultPatterns = ["**/*.md", "**/*.txt", "**/*.html", "**/*.htm", "**/*.pdf", "**/*.docx"];
  const configured = source.crawl?.includePatterns;
  const include = configured?.length ? configured : defaultPatterns;
  const ignore = source.crawl?.excludePatterns ?? [];
  const matches = await fg(include, {
    cwd: source.uri,
    absolute: true,
    onlyFiles: true,
    dot: false,
    unique: true,
    ignore,
    followSymbolicLinks: false
  });
  const absolutePaths = matches.map((match) => path15.resolve(match));
  absolutePaths.sort();
  return absolutePaths;
}
1122
+
1123
+ // src/ingest/adapters/file-adapter.ts
1124
+ import { basename, extname, resolve } from "path";
1125
+ import { mkdir as mkdir7, readFile as readFile9, stat as stat3, writeFile as writeFile6 } from "fs/promises";
1126
+
1127
+ // src/ingest/extractors/docx-extractor.ts
1128
+ import mammoth from "mammoth";
1129
/**
 * Extracts the raw text of a DOCX file using mammoth.
 *
 * @param {string} filePath
 * @returns {Promise<string>} The `value` of mammoth's raw-text result.
 */
async function extractDocx(filePath) {
  const { value } = await mammoth.extractRawText({ path: filePath });
  return value;
}
1133
+
1134
+ // src/ingest/extractors/html-extractor.ts
1135
+ import { load } from "cheerio";
1136
+ import TurndownService from "turndown";
1137
+
1138
+ // src/normalize/boilerplate.ts
1139
/**
 * Removes boilerplate from an HTML string: `<nav>…</nav>` and
 * `<footer>…</footer>` blocks and the literal phrase "cookie notice"
 * (all case-insensitive).
 *
 * @param {string} html
 * @returns {string}
 */
function stripBoilerplate(html) {
  const removable = [
    /<nav[\s\S]*?<\/nav>/gi,
    /<footer[\s\S]*?<\/footer>/gi,
    /cookie notice/gi
  ];
  return removable.reduce((remaining, pattern) => remaining.replace(pattern, ""), html);
}
1142
+
1143
+ // src/ingest/extractors/html-extractor.ts
1144
// Shared HTML-to-Markdown converter: ATX (`#`) headings, fenced code blocks.
var turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced"
});
1145
/**
 * Collapses every run of whitespace to a single space and trims the ends.
 *
 * @param {string} value
 * @returns {string}
 */
function cleanText(value) {
  const singleSpaced = value.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
1148
/**
 * Picks a page title: the first non-empty value among the `og:title` meta,
 * the `twitter:title` meta, the first `<h1>`, the first `<title>`, and finally
 * `fallbackTitle` (used as given, without whitespace cleanup).
 *
 * @param {Function} $ - Loaded cheerio document.
 * @param {string} fallbackTitle
 * @returns {string}
 */
function chooseMeaningfulTitle($, fallbackTitle) {
  const fromMeta = (selector) => cleanText($(selector).attr("content") ?? "");
  const fromFirstText = (selector) => cleanText($(selector).first().text());
  const ranked = [
    fromMeta("meta[property='og:title']"),
    fromMeta("meta[name='twitter:title']"),
    fromFirstText("h1"),
    fromFirstText("title"),
    fallbackTitle
  ];
  const best = ranked.find(Boolean);
  return best ?? fallbackTitle;
}
1158
// Custom rule for `<a class="doc-card">` links: render the card as a small
// markdown section (heading from <h3>, then the first <p>, the first <span>,
// and the href), skipping whichever parts are empty.
turndown.addRule("docCard", {
  filter(node) {
    if (node.nodeName !== "A" || typeof node.getAttribute !== "function") {
      return false;
    }
    const classNames = (node.getAttribute("class") ?? "").split(/\s+/);
    return classNames.includes("doc-card");
  },
  replacement(_content, node) {
    const card = node;
    const textOf = (selector) => cleanText(card.querySelector(selector)?.textContent ?? "");
    const href = cleanText(card.getAttribute("href") ?? "");
    const title = textOf("h3");
    const summary = textOf("p");
    const section = textOf("span");
    const heading = title ? `### ${title}` : "";
    const parts = [heading, summary, section, href].filter(Boolean);
    return `\n\n${parts.join("\n\n")}\n\n`;
  }
});
1181
/**
 * Converts an HTML page to markdown and picks a title for it.
 * Boilerplate is stripped first. The content root is the first `<main>`
 * element when one exists, otherwise the whole document (falling back to the
 * stripped HTML string). The title falls back to "Untitled".
 *
 * @param {string} html
 * @returns {{markdown: string, title: string}}
 */
function extractHtmlToMarkdown(html) {
  const cleaned = stripBoilerplate(html);
  const $ = load(cleaned);
  const documentTitle = cleanText($("title").first().text());
  const title = chooseMeaningfulTitle($, documentTitle || "Untitled");
  const contentHtml = $("main").first().html() ?? $.root().html() ?? cleaned;
  const markdown = turndown.turndown(contentHtml);
  return { markdown, title };
}
1192
/**
 * Parses a date string with the `Date` constructor.
 *
 * @param {string} value
 * @returns {string|null} ISO-8601 timestamp, or `null` when the input is blank
 *   or not a parseable date.
 */
function parseDateCandidate(value) {
  const text = value.trim();
  if (text.length === 0) {
    return null;
  }
  const timestamp = new Date(text);
  if (Number.isNaN(timestamp.getTime())) {
    return null;
  }
  return timestamp.toISOString();
}
1200
/**
 * Finds a publication date in an HTML page.
 * Meta tags and the first `<time datetime>` are tried first, in a fixed order.
 * Failing that, JSON-LD scripts are scanned for `datePublished`,
 * `dateCreated` or `dateModified` on top-level objects and on `@graph`
 * members. Invalid JSON-LD is ignored.
 *
 * @param {string} html
 * @returns {string|null} ISO-8601 timestamp of the first parseable candidate, else `null`.
 */
function extractPublicationDateFromHtml(html) {
  const $ = load(html);
  const metaSelectors = [
    "meta[property='article:published_time']",
    "meta[property='og:published_time']",
    "meta[name='pubdate']",
    "meta[name='publish-date']",
    "meta[name='article:published_time']",
    "meta[name='date']"
  ];
  const rawCandidates = metaSelectors.map((selector) => $(selector).attr("content"));
  rawCandidates.push($("time[datetime]").first().attr("datetime"));
  for (const raw of rawCandidates) {
    if (!raw?.trim()) {
      continue;
    }
    const iso = parseDateCandidate(raw);
    if (iso) {
      return iso;
    }
  }

  const dateKeys = ["datePublished", "dateCreated", "dateModified"];
  // Breadth-first scan over the parsed JSON-LD payload and any `@graph` arrays.
  const firstDateIn = (payload) => {
    const pending = Array.isArray(payload) ? [...payload] : [payload];
    while (pending.length > 0) {
      const node = pending.shift();
      if (!node || typeof node !== "object") {
        continue;
      }
      for (const key of dateKeys) {
        const iso = typeof node[key] === "string" ? parseDateCandidate(node[key]) : null;
        if (iso) {
          return iso;
        }
      }
      if (Array.isArray(node["@graph"])) {
        pending.push(...node["@graph"]);
      }
    }
    return null;
  };

  let jsonLdDate = null;
  $('script[type="application/ld+json"]').each((_, element) => {
    if (jsonLdDate) {
      return false;
    }
    try {
      jsonLdDate = firstDateIn(JSON.parse($(element).text()));
    } catch {
      // Malformed JSON-LD: move on to the next script element.
    }
    // Returning false stops the cheerio iteration once a date is found.
    return jsonLdDate ? false : void 0;
  });
  return jsonLdDate;
}
1252
+
1253
+ // src/ingest/extractors/markdown-extractor.ts
1254
+ import { readFile as readFile6 } from "fs/promises";
1255
/**
 * Reads a markdown file as UTF-8 text; the content is returned unchanged.
 *
 * @param {string} filePath
 * @returns {Promise<string>}
 */
async function extractMarkdown(filePath) {
  const contents = await readFile6(filePath, "utf8");
  return contents;
}
1258
+
1259
+ // src/ingest/extractors/pdf-extractor.ts
1260
+ import { readFile as readFile7 } from "fs/promises";
1261
+ import { PDFParse } from "pdf-parse";
1262
/**
 * Extracts the text of a PDF file with pdf-parse.
 * The parser is destroyed whether or not extraction succeeds.
 *
 * @param {string} filePath
 * @returns {Promise<string>}
 */
async function extractPdf(filePath) {
  const data = await readFile7(filePath);
  const pdf = new PDFParse({ data });
  try {
    const { text } = await pdf.getText();
    return text;
  } finally {
    await pdf.destroy();
  }
}
1272
+
1273
+ // src/ingest/extractors/text-extractor.ts
1274
+ import { readFile as readFile8 } from "fs/promises";
1275
/**
 * Reads a plain-text file as UTF-8; the content is returned unchanged.
 *
 * @param {string} filePath
 * @returns {Promise<string>}
 */
async function extractText(filePath) {
  const contents = await readFile8(filePath, "utf8");
  return contents;
}
1278
+
1279
+ // src/ingest/adapters/file-adapter.ts
1280
/**
 * Maps a file path to a MIME type by its (case-insensitive) extension.
 * Unknown extensions yield `application/octet-stream`.
 *
 * @param {string} filePath
 * @returns {string}
 */
function mimeTypeFor(filePath) {
  const byExtension = new Map([
    [".md", "text/markdown"],
    [".txt", "text/plain"],
    [".html", "text/html"],
    [".htm", "text/html"],
    [".pdf", "application/pdf"],
    [".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
  ]);
  const extension = extname(filePath).toLowerCase();
  return byExtension.get(extension) ?? "application/octet-stream";
}
1298
/**
 * Extracts a title and markdown from a local file according to its MIME type.
 * Markdown is passed through (title from its first `# ` heading, else the
 * file name). Text, PDF and DOCX are wrapped under a `# <file name>` heading.
 * HTML is converted and wrapped under its extracted title. `raw` is returned
 * only for markdown, text and HTML.
 *
 * @param {string} filePath
 * @param {string} mimeType
 * @returns {Promise<{title: string, markdown: string, raw?: string}>}
 * @throws {Error} For any other MIME type.
 */
async function extractFileContent(filePath, mimeType) {
  const underHeading = (heading, body) => `# ${heading}\n\n${body}`;
  switch (mimeType) {
    case "text/markdown": {
      const markdown = await extractMarkdown(filePath);
      const headingMatch = markdown.match(/^#\s+(.+)$/m);
      const title = headingMatch?.[1]?.trim() ?? basename(filePath);
      return { title, markdown, raw: markdown };
    }
    case "text/plain": {
      const text = await extractText(filePath);
      const fileName = basename(filePath);
      return { title: fileName, markdown: underHeading(fileName, text), raw: text };
    }
    case "text/html": {
      const raw = await readFile9(filePath, "utf8");
      const { title, markdown: converted } = extractHtmlToMarkdown(raw);
      return { title, markdown: underHeading(title, converted), raw };
    }
    case "application/pdf": {
      const text = await extractPdf(filePath);
      const fileName = basename(filePath);
      return { title: fileName, markdown: underHeading(fileName, text) };
    }
    default:
      break;
  }
  if (mimeType.includes("wordprocessingml")) {
    const text = await extractDocx(filePath);
    const fileName = basename(filePath);
    return { title: fileName, markdown: underHeading(fileName, text) };
  }
  throw new Error(`unsupported file type: ${mimeType}`);
}
1331
/**
 * Re-derives title and markdown from stored raw content.
 * Supports markdown (passed through, title from its first `# ` heading),
 * plain text (wrapped under `# <fallbackTitle>`) and HTML (converted and
 * wrapped under its extracted title).
 *
 * @param {string} raw
 * @param {string} mimeType
 * @param {string} fallbackTitle - Used when the content does not supply a title.
 * @returns {Promise<{title: string, markdown: string}>}
 * @throws {Error} For any other MIME type.
 */
async function extractRawContent(raw, mimeType, fallbackTitle) {
  switch (mimeType) {
    case "text/markdown": {
      const headingMatch = raw.match(/^#\s+(.+)$/m);
      return { title: headingMatch?.[1]?.trim() ?? fallbackTitle, markdown: raw };
    }
    case "text/plain":
      return { title: fallbackTitle, markdown: `# ${fallbackTitle}\n\n${raw}` };
    case "text/html": {
      const { title, markdown: converted } = extractHtmlToMarkdown(raw);
      return { title, markdown: `# ${title}\n\n${converted}` };
    }
    default:
      throw new Error(`raw reprocessing is not supported for ${mimeType}`);
  }
}
1349
/**
 * Ingests one local file: extracts its markdown, stores the raw copy (when the
 * extractor returns one) and the normalized document, and returns the
 * document record.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root holding `raw/` and `normalized/`.
 * @param {object} args.source - Source record; `id`, `type` and `uri` are read here.
 * @param {string} args.filePath - File to ingest; resolved to an absolute path.
 * @param {object} [args.previous] - Earlier record for this file. Supplies
 *   `publicationDate` and `firstSeenAt`, and `lastChangedAt` when the content
 *   hash is unchanged.
 * @returns {Promise<object>} The document record.
 */
async function ingestFile({
  workspacePath,
  source,
  filePath,
  previous
}) {
  const resolved = resolve(filePath);
  const fileStat = await stat3(resolved);
  const mimeType = mimeTypeFor(resolved);
  const extracted = await extractFileContent(resolved, mimeType);
  const documentId = stableId("doc", source.id, resolved);
  const normalizedPath = resolve(workspacePath, "normalized", `${documentId}.md`);
  // The raw file name is prefixed with the document id. Using the basename
  // alone made two files with the same name in different folders of one source
  // overwrite each other's raw copy, so a later reprocess read the wrong
  // content. Raw copies written under the old naming are left untouched.
  const rawPath = resolve(workspacePath, "raw", source.id, `${documentId}-${basename(resolved)}`);
  const contentHash = sha256(extracted.markdown);
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
  const indexedAt = now;
  const crawledAt = now;
  const publicationDate = previous?.publicationDate ?? null;
  await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
  await mkdir7(resolve(workspacePath, "raw", source.id), { recursive: true });
  // PDF and DOCX extraction return no raw text, so no raw copy is written.
  if (extracted.raw) {
    await writeFile6(rawPath, extracted.raw, "utf8");
  }
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title: extracted.title,
    uri: resolved,
    sourceUri: source.uri,
    publicationDate,
    crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown: extracted.markdown
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title: extracted.title,
    uri: resolved,
    sourceUri: source.uri,
    mimeType,
    rawPath: extracted.raw ? rawPath : void 0,
    normalizedPath,
    contentHash,
    metadata: buildDocumentMetadata({
      source,
      sourceUri: source.uri,
      publicationDate,
      crawledAt,
      indexedAt,
      extra: {
        contentType: mimeType,
        fileSizeBytes: fileStat.size
      }
    }),
    publicationDate,
    crawledAt,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt
  };
}
1416
/**
 * Ingests content supplied directly (not read from a file or URL): writes the
 * normalized document and returns its record. Content of a `markdown` source
 * is used as-is; anything else is wrapped under a `# <title>` heading and
 * recorded as `text/plain`.
 *
 * @param {object} args
 * @param {string} args.workspacePath
 * @param {object} args.source - `id`, `type` and `uri` are read here.
 * @param {string} args.content
 * @param {string} args.title
 * @param {string} args.uri - Document URI; also feeds the document id.
 * @param {object} [args.previous] - Earlier record for this document, if any.
 * @returns {Promise<object>} The document record.
 */
async function ingestInlineContent({
  workspacePath,
  source,
  content,
  title,
  uri,
  previous
}) {
  const isMarkdownSource = source.type === "markdown";
  const markdown = isMarkdownSource ? content : `# ${title}\n\n${content}`;
  const documentId = stableId("doc", source.id, uri);
  const normalizedPath = resolve(workspacePath, "normalized", `${documentId}.md`);
  const contentHash = sha256(markdown);
  const now = new Date().toISOString();
  const isUnchanged = previous?.contentHash === contentHash;
  const lastChangedAt = isUnchanged ? previous.lastChangedAt : now;
  const publicationDate = previous?.publicationDate ?? null;

  await mkdir7(resolve(workspacePath, "normalized"), { recursive: true });
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri,
    sourceUri: source.uri,
    publicationDate,
    crawledAt: now,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: source.uri,
    publicationDate,
    crawledAt: now,
    indexedAt: now
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri,
    sourceUri: source.uri,
    mimeType: isMarkdownSource ? "text/markdown" : "text/plain",
    normalizedPath,
    contentHash,
    metadata,
    publicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
1473
/**
 * Re-runs extraction for a stored document from its raw copy, without
 * re-reading the original file, and rewrites the normalized document.
 *
 * @param {object} document - Stored document record.
 * @param {object} source - Source record used to rebuild the metadata.
 * @returns {Promise<object|null>} The updated record, or `null` when the
 *   document has no raw copy or the raw file no longer exists.
 * @throws {Error} When the document's MIME type cannot be reprocessed from raw content.
 */
async function reprocessStoredDocument(document, source) {
  // A missing raw file is treated like an absent rawPath, matching
  // reprocessRemoteDocument, instead of failing with a read error.
  if (!document.rawPath || !await fileExists(document.rawPath)) {
    return null;
  }
  const raw = await readFile9(document.rawPath, "utf8");
  const fallbackTitle = document.title || basename(document.uri);
  const extracted = await extractRawContent(raw, document.mimeType, fallbackTitle);
  const contentHash = sha256(extracted.markdown);
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const indexedAt = now;
  // lastChangedAt only advances when the regenerated markdown differs.
  const lastChangedAt = document.contentHash === contentHash ? document.lastChangedAt : now;
  await writeNormalizedDocument({
    documentId: document.id,
    sourceId: document.sourceId,
    title: extracted.title,
    uri: document.uri,
    sourceUri: document.sourceUri,
    publicationDate: document.publicationDate ?? null,
    crawledAt: document.crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt,
    normalizedPath: document.normalizedPath,
    markdown: extracted.markdown
  });
  return {
    ...document,
    title: extracted.title,
    contentHash,
    metadata: buildDocumentMetadata({
      source,
      sourceUri: document.sourceUri,
      publicationDate: document.publicationDate ?? null,
      crawledAt: document.crawledAt,
      indexedAt,
      extra: {
        ...document.metadata,
        contentType: document.mimeType
      }
    }),
    lastChangedAt,
    indexedAt
  };
}
1517
+
1518
+ // src/ingest/adapters/rss-adapter.ts
1519
+ import { Readable } from "stream";
1520
+ import FeedParser from "feedparser";
1521
+ import { parseFeed } from "feedsmith";
1522
/**
 * Converts a feed date value to an ISO-8601 string.
 * Accepts `Date` instances and non-blank strings.
 *
 * @param {*} value
 * @returns {string|null} `null` for anything else or for an invalid date.
 */
function toIsoDate(value) {
  let candidate;
  if (value instanceof Date) {
    candidate = value;
  } else if (typeof value === "string" && value.trim().length > 0) {
    candidate = new Date(value);
  } else {
    return null;
  }
  return Number.isNaN(candidate.getTime()) ? null : candidate.toISOString();
}
1532
/**
 * Resolves a feed item link against the feed URL.
 *
 * @param {string|null|undefined} link
 * @param {string} baseUrl
 * @returns {string|null} The absolute URL, or `null` when the link is missing,
 *   blank, or cannot be parsed as a URL.
 */
function normalizeFeedLink(link, baseUrl) {
  const hasText = Boolean(link?.trim());
  if (!hasText) {
    return null;
  }
  let absolute = null;
  try {
    absolute = new URL(link, baseUrl).href;
  } catch {
    absolute = null;
  }
  return absolute;
}
1542
/**
 * Converts a feedsmith feed into `{url, title, publicationDate}` items.
 * Items come from `feed.items`, else `feed.entries`. Items without a
 * resolvable link are dropped.
 *
 * @param {object} feed - Parsed feed object.
 * @param {string} baseUrl - Base for resolving relative item links.
 * @returns {Array<{url: string, title: string, publicationDate: string|null}>}
 */
function normalizeFeedsmithItems(feed, baseUrl) {
  let entries = [];
  if (Array.isArray(feed?.items)) {
    entries = feed.items;
  } else if (Array.isArray(feed?.entries)) {
    entries = feed.entries;
  }
  const normalized = [];
  for (const item of entries) {
    // First non-nullish link-like field wins.
    const rawLink = item?.link ?? item?.url ?? item?.id ?? item?.guid ?? item?.links?.[0]?.href;
    const url = normalizeFeedLink(rawLink, baseUrl);
    if (!url) {
      continue;
    }
    const rawDate = item?.pubDate ?? item?.published ?? item?.updated ?? item?.published_at ?? item?.date_published ?? item?.dc?.date;
    normalized.push({
      url,
      title: String(item?.title ?? item?.summary ?? url).trim(),
      publicationDate: toIsoDate(rawDate)
    });
  }
  return normalized;
}
1561
/**
 * Parses feed XML with the streaming `feedparser` library.
 * Items without a resolvable link are skipped.
 *
 * @param {string} xml
 * @param {string} feedUrl - Passed to the parser and used to resolve item links.
 * @returns {Promise<Array<{url: string, title: string, publicationDate: string|null}>>}
 *   Resolves on the parser's `end` event and rejects on its `error` event.
 */
async function parseWithFeedparser(xml, feedUrl) {
  const parser = new FeedParser({ feedurl: feedUrl });
  const collected = [];
  return await new Promise((resolve2, reject) => {
    parser.on("error", reject);
    parser.on("readable", function onReadable() {
      // Drain every item currently buffered by the parser stream.
      for (let entry = this.read(); entry; entry = this.read()) {
        const url = normalizeFeedLink(entry.link || entry.origlink, feedUrl);
        if (!url) {
          continue;
        }
        collected.push({
          url,
          title: String(entry.title ?? url).trim(),
          publicationDate: toIsoDate(entry.pubdate ?? entry.date)
        });
      }
    });
    parser.on("end", () => resolve2(collected));
    Readable.from([xml]).pipe(parser);
  });
}
1584
/**
 * Parses a feed document into normalized items. feedsmith is tried first; if
 * parsing or normalization throws, the feedparser-based parser is used.
 *
 * @param {string} xml
 * @param {{uri: string}} source - `uri` is the base for relative links.
 * @returns {Promise<Array<{url: string, title: string, publicationDate: string|null}>>}
 */
async function parseRssFeedDocument(xml, source) {
  let items;
  try {
    const { feed } = parseFeed(xml);
    items = normalizeFeedsmithItems(feed, source.uri);
  } catch {
    return parseWithFeedparser(xml, source.uri);
  }
  return items;
}
1592
+
1593
+ // src/ingest/adapters/url-adapter.ts
1594
+ import { mkdir as mkdir8, readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
1595
+ import path16 from "path";
1596
/**
 * Captures the cache-related headers and status of an HTTP response.
 * A missing `etag`, `last-modified` or `cache-control` becomes `undefined`;
 * `expires` is stored exactly as `headers.get` returns it.
 *
 * @param {{headers: {get(name: string): string|null}, status: number}} response2
 * @param {string} validatedAt - Timestamp stored as `lastValidatedAt`.
 * @returns {object}
 */
function buildHttpCache(response2, validatedAt) {
  const { headers } = response2;
  const header = (name) => headers.get(name);
  return {
    etag: header("etag") ?? void 0,
    lastModified: header("last-modified") ?? void 0,
    cacheControl: header("cache-control") ?? void 0,
    expires: header("expires"),
    lastValidatedAt: validatedAt,
    lastStatus: response2.status
  };
}
1606
/**
 * Returns the first non-nullish of `preferred`, `fallback`, `previous`,
 * or `null` when all three are nullish.
 *
 * @param {string|null|undefined} preferred
 * @param {string|null|undefined} fallback
 * @param {string|null|undefined} previous
 * @returns {string|null}
 */
function choosePublicationDate(preferred, fallback, previous) {
  for (const candidate of [preferred, fallback, previous]) {
    if (candidate != null) {
      return candidate;
    }
  }
  return null;
}
1609
/**
 * Turns a fetched HTML body into a stored document: converts it to markdown,
 * writes the raw HTML and the normalized markdown, and returns the record.
 * The publication date is the explicit `publicationDate`, else one found in
 * the HTML, else the previous record's date.
 *
 * @param {object} args
 * @param {string} args.workspacePath
 * @param {object} args.source - `id` and `type` are read here.
 * @param {string} args.url - Document URL; also feeds the document id and raw file name.
 * @param {string} args.body - HTML body.
 * @param {object} [args.previous] - Earlier record for this URL, if any.
 * @param {string} args.sourceUri
 * @param {string|null} [args.publicationDate]
 * @param {number} args.responseStatus - Stored as `status` in the metadata.
 * @returns {Promise<object>} The document record (MIME type fixed to `text/html`).
 */
async function normalizeRemoteDocument({
  workspacePath,
  source,
  url,
  body,
  previous,
  sourceUri,
  publicationDate,
  responseStatus
}) {
  const { title, markdown: convertedBody } = extractHtmlToMarkdown(body);
  const markdown = `# ${title}\n\n${convertedBody}`;
  const documentId = stableId("doc", source.id, url);
  const normalizedPath = path16.resolve(workspacePath, "normalized", `${documentId}.md`);
  const rawFileName = `${sha256(url).slice(0, 12)}.html`;
  const rawPath = path16.resolve(workspacePath, "raw", source.id, rawFileName);
  const contentHash = sha256(markdown);
  const now = new Date().toISOString();
  const isUnchanged = previous?.contentHash === contentHash;
  const lastChangedAt = isUnchanged ? previous.lastChangedAt : now;
  const resolvedPublicationDate = choosePublicationDate(
    publicationDate,
    extractPublicationDateFromHtml(body),
    previous?.publicationDate
  );

  await mkdir8(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
  await writeFile7(rawPath, body, "utf8");
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri: url,
    sourceUri,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    indexedAt: now,
    extra: {
      status: responseStatus,
      contentType: "text/html"
    }
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri: url,
    sourceUri,
    mimeType: "text/html",
    rawPath,
    normalizedPath,
    contentHash,
    metadata,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
1678
/**
 * Fetches a URL and stores it as a document, revalidating against the
 * previous copy when that copy is still on disk.
 *
 * @param {object} args
 * @param {string} args.workspacePath
 * @param {object} args.source - `uri` and optional `crawl.userAgent` are read here.
 * @param {string} args.url
 * @param {object} [args.previous] - Earlier record for this URL, if any.
 * @param {string} [args.sourceUri] - Defaults to `source.uri`.
 * @param {string|null} [args.publicationDate] - Takes precedence over detected dates.
 * @returns {Promise<object>} The document record, including `httpCache`.
 */
async function fetchUrlDocument({
  workspacePath,
  source,
  url,
  previous,
  sourceUri,
  publicationDate
}) {
  const headers = {
    "user-agent": source.crawl?.userAgent ?? "querylight-cli/0.1"
  };
  // Only revalidate when the cached files are still present. Otherwise a 304
  // would arrive with an empty body and nothing on disk to serve it from, and
  // that empty body would be normalized as if it were the page.
  const hasCachedArtifacts = Boolean(previous?.rawPath) && await fileExists(previous.rawPath) && await fileExists(previous.normalizedPath);
  if (hasCachedArtifacts) {
    if (previous.httpCache?.etag) {
      headers["if-none-match"] = previous.httpCache.etag;
    }
    if (previous.httpCache?.lastModified) {
      headers["if-modified-since"] = previous.httpCache.lastModified;
    }
  }
  const response2 = await fetch(url, { headers });
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const nextHttpCache = buildHttpCache(response2, now);
  const effectiveSourceUri = sourceUri ?? source.uri;
  if (response2.status === 304 && hasCachedArtifacts) {
    // Not modified: keep the stored document and refresh the bookkeeping only.
    return {
      ...previous,
      sourceUri: effectiveSourceUri,
      publicationDate: publicationDate ?? previous.publicationDate ?? null,
      metadata: buildDocumentMetadata({
        source,
        sourceUri: effectiveSourceUri,
        publicationDate: publicationDate ?? previous.publicationDate ?? null,
        crawledAt: previous.crawledAt,
        indexedAt: previous.indexedAt,
        extra: {
          ...previous.metadata,
          status: previous.metadata.status ?? 200,
          contentType: previous.mimeType
        }
      }),
      lastSeenAt: now,
      httpCache: {
        ...nextHttpCache,
        // A 304 may omit validators; keep the stored ones so the next fetch
        // can still be conditional.
        etag: nextHttpCache.etag ?? previous.httpCache?.etag,
        lastModified: nextHttpCache.lastModified ?? previous.httpCache?.lastModified
      }
    };
  }
  const body = await response2.text();
  const document = await normalizeRemoteDocument({
    workspacePath,
    source,
    url,
    body,
    previous,
    sourceUri: effectiveSourceUri,
    publicationDate,
    responseStatus: response2.status
  });
  // Prefer the server-reported content type over the adapter's fixed default.
  const contentType = response2.headers.get("content-type") ?? document.mimeType;
  return {
    ...document,
    mimeType: contentType,
    metadata: buildDocumentMetadata({
      source,
      sourceUri: effectiveSourceUri,
      publicationDate: document.publicationDate ?? null,
      crawledAt: document.crawledAt,
      indexedAt: document.indexedAt,
      extra: {
        status: response2.status,
        contentType
      }
    }),
    httpCache: nextHttpCache
  };
}
1748
/**
 * Re-runs HTML extraction for a stored remote document from its raw copy and
 * rewrites the normalized document. The stored publication date is kept when
 * present; otherwise one is looked up in the raw HTML.
 *
 * @param {object} document - Stored document record.
 * @param {object} source - Source record used to rebuild the metadata.
 * @returns {Promise<object|null>} The updated record, or `null` when there is
 *   no raw copy on disk.
 */
async function reprocessRemoteDocument(document, source) {
  const hasRawCopy = Boolean(document.rawPath) && await fileExists(document.rawPath);
  if (!hasRawCopy) {
    return null;
  }
  const raw = await readFile10(document.rawPath, "utf8");
  const { title, markdown: convertedBody } = extractHtmlToMarkdown(raw);
  const markdown = `# ${title}\n\n${convertedBody}`;
  const contentHash = sha256(markdown);
  const now = new Date().toISOString();
  const isUnchanged = document.contentHash === contentHash;
  const lastChangedAt = isUnchanged ? document.lastChangedAt : now;
  const publicationDate = document.publicationDate ?? extractPublicationDateFromHtml(raw);

  await writeNormalizedDocument({
    documentId: document.id,
    sourceId: document.sourceId,
    title,
    uri: document.uri,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath: document.normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt: now,
    extra: {
      ...document.metadata,
      status: document.httpCache?.lastStatus ?? document.metadata.status ?? 200,
      contentType: document.mimeType
    }
  });
  return {
    ...document,
    title,
    contentHash,
    publicationDate,
    metadata,
    lastChangedAt,
    indexedAt: now
  };
}
1797
+
1798
+ // src/ingest/adapters/website-adapter.ts
1799
+ import { load as load2 } from "cheerio";
1800
/**
 * Fetches `/robots.txt` for the given site and returns every non-empty
 * `Disallow` path it lists.
 *
 * User-agent groups are not interpreted: rules from all groups are returned.
 * Any fetch or parse problem yields an empty list (best-effort).
 *
 * @param {string|URL} url - Any URL on the target site.
 * @param {string} userAgent - Value sent as the `user-agent` header.
 * @returns {Promise<string[]>} Disallowed path prefixes.
 */
async function fetchRobotsDisallow(url, userAgent) {
  try {
    const response2 = await fetch(new URL("/robots.txt", url), { headers: { "user-agent": userAgent } });
    if (!response2.ok) {
      return [];
    }
    const text = await response2.text();
    const rules = [];
    for (const rawLine of text.split("\n")) {
      // Drop trailing "# comment" text before looking at the directive.
      const line = rawLine.replace(/#.*$/, "").trim();
      const match = /^disallow\s*:(.*)$/i.exec(line);
      if (!match) {
        continue;
      }
      // Keep everything after the FIRST colon so paths containing ":" survive.
      const rule = match[1].trim();
      if (rule.length > 0) {
        rules.push(rule);
      }
    }
    return rules;
  } catch {
    // robots.txt is optional; an unreachable file means "no rules".
    return [];
  }
}
1812
/**
 * Fetches `/sitemap.xml` for the given site and returns the contents of every
 * `<loc>` element.
 *
 * Values are trimmed, unwrapped from CDATA and have the predefined XML
 * entities decoded (URLs in sitemaps must be entity-escaped, so `&amp;` is
 * common in query strings). Any fetch problem yields an empty list.
 *
 * @param {string|URL} baseUrl - Any URL on the target site.
 * @param {string} userAgent - Value sent as the `user-agent` header.
 * @returns {Promise<string[]>} The listed locations, in document order.
 */
async function fetchSitemapUrls(baseUrl, userAgent) {
  const decodeEntities = (value) => value
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&quot;/g, '"')
    .replace(/&apos;/g, "'")
    // "&amp;" last so "&amp;lt;" decodes to "&lt;" and not "<".
    .replace(/&amp;/g, "&");
  try {
    const response2 = await fetch(new URL("/sitemap.xml", baseUrl), { headers: { "user-agent": userAgent } });
    if (!response2.ok) {
      return [];
    }
    const xml = await response2.text();
    const urls = [];
    // [\s\S] lets a <loc> value span line breaks (pretty-printed sitemaps).
    for (const match of xml.matchAll(/<loc>([\s\S]*?)<\/loc>/g)) {
      const inner = match[1].trim().replace(/^<!\[CDATA\[([\s\S]*?)\]\]>$/, "$1").trim();
      const location = decodeEntities(inner);
      if (location) {
        urls.push(location);
      }
    }
    return urls;
  } catch {
    // A sitemap is optional; treat any failure as "no sitemap".
    return [];
  }
}
1824
/**
 * Decides whether a URL may be crawled.
 *
 * A URL is accepted when it shares the base URL's origin, its path does not
 * start with any disallow rule (a bare "/" rule is ignored), it contains at
 * least one include pattern (when any are given) and contains no exclude
 * pattern. Patterns are plain substrings of the full href.
 *
 * @param {URL} url - Candidate URL.
 * @param {URL} baseUrl - Root URL of the crawl.
 * @param {string[]} includePatterns - Substrings; empty means "include all".
 * @param {string[]} excludePatterns - Substrings that reject a URL.
 * @param {string[]} disallowRules - Path prefixes to reject.
 * @returns {boolean} `true` when the URL passes every check.
 */
function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules) {
  const sameOrigin = url.origin === baseUrl.origin;
  if (!sameOrigin) {
    return false;
  }

  const blockedByRobots = disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule));
  if (blockedByRobots) {
    return false;
  }

  const { href } = url;
  const containsPattern = (pattern) => href.includes(pattern);
  const included = includePatterns.length === 0 || includePatterns.some(containsPattern);
  const excluded = excludePatterns.some(containsPattern);
  return included && !excluded;
}
1840
/**
 * Breadth-first crawl of a website source, returning the URLs to ingest.
 *
 * Starts at `source.uri` (depth 0), optionally seeds the queue with sitemap
 * entries (depth 1), and follows `<a href>` links until `maxDepth` or
 * `maxPages` is reached. Each URL is filtered through `isAllowed`.
 *
 * Compared with a naive crawl this is tolerant of bad input:
 * - queue entries that are not valid absolute URLs (e.g. garbage in a
 *   sitemap) are skipped instead of aborting the crawl;
 * - a page whose fetch fails is still returned (the ingest step fetches it
 *   again and reports the failure per URL) but contributes no links;
 * - discovered links have their `#fragment` removed, because fragments are
 *   not sent to the server and would only produce duplicate pages.
 *
 * @param {object} source - Source record; reads `uri` and optional `crawl`
 *   settings (`userAgent`, `includePatterns`, `excludePatterns`, `maxDepth`,
 *   `maxPages`, `rateLimitMs`, `obeyRobotsTxt`, `useSitemap`).
 * @returns {Promise<string[]>} Accepted page URLs in visit order.
 */
async function crawlWebsite(source) {
  const baseUrl = new URL(source.uri);
  const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
  const includePatterns = source.crawl?.includePatterns ?? [];
  const excludePatterns = source.crawl?.excludePatterns ?? [];
  const maxDepth = source.crawl?.maxDepth ?? 2;
  const maxPages = source.crawl?.maxPages ?? 100;
  const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
  const queue = [{ url: source.uri, depth: 0 }];
  const seen = /* @__PURE__ */ new Set();
  const results = [];
  if (source.crawl?.useSitemap !== false) {
    for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
      queue.push({ url, depth: 1 });
    }
  }
  while (queue.length > 0 && results.length < maxPages) {
    const next = queue.shift();
    if (!next || seen.has(next.url)) {
      continue;
    }
    seen.add(next.url);
    let url;
    try {
      url = new URL(next.url);
    } catch {
      // Sitemap entries are not validated upstream; skip anything unparseable.
      continue;
    }
    if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
      continue;
    }
    results.push(url.href);
    if (next.depth >= maxDepth) {
      continue;
    }
    let html = null;
    try {
      const response2 = await fetch(url, { headers: { "user-agent": userAgent } });
      html = await response2.text();
    } catch {
      // Link discovery is best-effort: keep crawling the rest of the queue.
      html = null;
    }
    if (html !== null) {
      const $ = load2(html);
      $("a[href]").each((_, element) => {
        const href = $(element).attr("href");
        if (!href) {
          return;
        }
        try {
          const target = new URL(href, url);
          // "/page#a" and "/page#b" are the same resource.
          target.hash = "";
          if (!seen.has(target.href)) {
            queue.push({ url: target.href, depth: next.depth + 1 });
          }
        } catch {
          // Ignore hrefs that cannot be resolved to a URL.
        }
      });
    }
    if (rateLimitMs > 0) {
      await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
    }
  }
  return results;
}
1893
+
1894
+ // src/ingest/ingest-service.ts
1895
/**
 * Builds the path of the documents JSONL file inside a workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {string} `<workspace>/documents/documents.jsonl`.
 */
function documentsFile(workspacePath) {
  const segments = ["documents", "documents.jsonl"];
  return path17.join(workspacePath, ...segments);
}
1898
/**
 * Reads all document records stored in the workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {Promise<object[]>} Whatever `readJsonl` yields for the documents file.
 */
async function loadDocuments(workspacePath) {
  const file = documentsFile(workspacePath);
  return readJsonl(file);
}
1901
/**
 * Persists document records to the workspace, ordered by id.
 *
 * NOTE: the `documents` array is sorted IN PLACE, so the caller's array is
 * reordered as a side effect.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {object[]} documents - Records to write; each needs a string `id`.
 * @returns {Promise<void>}
 */
async function saveDocuments(workspacePath, documents) {
  const target = documentsFile(workspacePath);
  const byId = (left, right) => left.id.localeCompare(right.id);
  documents.sort(byId);
  await writeJsonl(target, documents);
}
1904
/**
 * Indexes document records by their id.
 *
 * @param {object[]} documents - Records with an `id` property.
 * @returns {Map<string, object>} Lookup from id to record (last one wins).
 */
function previousMap(documents) {
  const entries = documents.map((document) => [document.id, document]);
  const lookup = new Map(entries);
  return lookup;
}
1907
/**
 * Returns the current time as an ISO-8601 string.
 *
 * @returns {string} e.g. `2024-01-31T12:34:56.789Z`.
 */
function nowStamp() {
  const current = /* @__PURE__ */ new Date();
  return current.toISOString();
}
1910
/**
 * Derives a run identifier from the current timestamp by replacing every
 * ":" and "." with "-".
 *
 * @returns {string} e.g. `2024-01-31T12-34-56-789Z`.
 */
function runId() {
  const stamp = nowStamp();
  return stamp.replace(/[:.]/g, "-");
}
1913
/**
 * Reduces document records to the fields stored with a run.
 *
 * @param {object[]} documents - Full document records.
 * @returns {object[]} One `{ id, title, uri, contentHash, lastChangedAt,
 *   sourceId }` object per input record, in the same order.
 */
function documentSnapshot(documents) {
  const toSnapshot = ({ id, title, uri, contentHash, lastChangedAt, sourceId }) => ({
    id,
    title,
    uri,
    contentHash,
    lastChangedAt,
    sourceId
  });
  return documents.map((document) => toSnapshot(document));
}
1923
/**
 * Tells whether an RSS document is older than its retention window.
 *
 * Only documents of `rss` sources with a parseable `publicationDate` can
 * expire. The window is `source.crawl.retentionDays` when set, otherwise
 * `defaultRetentionDays`.
 *
 * @param {object} document - Document record (reads `publicationDate`).
 * @param {object} source - Source record (reads `type`, `crawl.retentionDays`).
 * @param {number} defaultRetentionDays - Fallback retention in days.
 * @returns {boolean} `true` when the document was published before the cutoff.
 */
function shouldExpireRssDocument(document, source, defaultRetentionDays) {
  const isRss = source.type === "rss";
  if (!isRss || !document.publicationDate) {
    return false;
  }
  const retentionDays = source.crawl?.retentionDays ?? defaultRetentionDays;
  const publishedMs = new Date(document.publicationDate).getTime();
  if (Number.isNaN(publishedMs)) {
    return false;
  }
  const retentionMs = retentionDays * 24 * 60 * 60 * 1e3;
  return publishedMs < Date.now() - retentionMs;
}
1935
/**
 * Removes everything stored for the given document ids: their chunks are
 * dropped from the chunk store and their artifacts are deleted.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {Set<string>} documentIds - Ids of the documents to purge.
 * @param {object[]} documents - Candidate records; only those whose id is in
 *   `documentIds` have their artifacts deleted.
 * @returns {Promise<void>}
 */
async function purgeDocuments(workspacePath, documentIds, documents) {
  if (documentIds.size === 0) {
    return;
  }
  const allChunks = await loadChunks(workspacePath);
  const survivingChunks = allChunks.filter((chunk) => !documentIds.has(chunk.documentId));
  // Skip the rewrite when no chunk belonged to a purged document.
  const somethingRemoved = survivingChunks.length !== allChunks.length;
  if (somethingRemoved) {
    await saveChunks(workspacePath, survivingChunks);
  }
  const doomed = documents.filter((document) => documentIds.has(document.id));
  const deletions = doomed.map((document) => deleteDocumentArtifacts(document));
  await Promise.all(deletions);
}
1948
/**
 * Downloads the raw feed body for an RSS source.
 *
 * @param {object} source - Source record (reads `uri`, `crawl.userAgent`).
 * @returns {Promise<string>} The response body as text.
 * @throws {Error} When the response status is not OK.
 */
async function fetchFeedText(source) {
  const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
  const headers = { "user-agent": userAgent };
  const response2 = await fetch(source.uri, { headers });
  if (response2.ok) {
    return response2.text();
  }
  throw new Error(`failed to fetch feed: ${response2.status}`);
}
1959
/**
 * Ingests every item of an RSS feed by fetching the linked article.
 *
 * Each fetched document is stored in `nextDocuments`; a failing item is
 * reported through `onFailure` and does not stop the remaining items.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {object} options.source - The RSS source record.
 * @param {Map<string, object>} options.previous - Earlier documents by id.
 * @param {Map<string, object>} options.nextDocuments - Receives the results.
 * @param {(uri: string, error: unknown) => void} options.onFailure - Called
 *   once per failed item.
 * @returns {Promise<{added: number, changed: number, unchanged: number, failed: number}>}
 * @throws {Error} When `source.crawl.fetchArticles` is `false`, or when the
 *   feed itself cannot be fetched or parsed.
 */
async function ingestRssSource({
  workspacePath,
  source,
  previous,
  nextDocuments,
  onFailure
}) {
  if (source.crawl?.fetchArticles === false) {
    throw new Error("rss sources require article fetching");
  }
  const feedXml = await fetchFeedText(source);
  const items = await parseRssFeedDocument(feedXml, source);
  const tally = { added: 0, changed: 0, unchanged: 0, failed: 0 };
  for (const item of items) {
    const { url } = item;
    try {
      const probe = previous.get(stableId("doc", source.id, url));
      const document = await fetchUrlDocument({
        workspacePath,
        source,
        url,
        previous: probe,
        sourceUri: source.uri,
        publicationDate: item.publicationDate
      });
      nextDocuments.set(document.id, document);
      if (!probe) {
        tally.added += 1;
      } else if (probe.contentHash === document.contentHash) {
        tally.unchanged += 1;
      } else {
        tally.changed += 1;
      }
    } catch (error) {
      tally.failed += 1;
      onFailure(url, error);
    }
  }
  return tally;
}
2001
/**
 * Ingests all enabled sources (optionally restricted to `sourceIds`), expires
 * outdated RSS documents, saves the resulting document set and records a run.
 *
 * Failures are collected per URI and never abort the remaining sources.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string[]} [options.sourceIds] - Restrict ingestion to these sources.
 * @param {boolean} [options.changedOnly=false] - Only echoed into the run
 *   summary here; it does not alter what this function processes.
 * @returns {Promise<{runId: string, documents: {added: number, changed: number,
 *   unchanged: number, failed: number}, processedSources: number}>}
 */
async function ingestSources({
  workspacePath,
  sourceIds,
  changedOnly = false
}) {
  const config = await loadConfig(workspacePath);
  const defaultRetentionDays = config.crawler.retentionDays;
  const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
  const existing = await loadDocuments(workspacePath);
  const previous = previousMap(existing);
  const nextDocuments = new Map(existing.map((document) => [document.id, document]));
  let added = 0;
  let changed = 0;
  let unchanged = 0;
  let failed = 0;
  const failures = [];
  for (const source of sources) {
    // Appends one failure record for this source.
    const recordFailure = (uri, error) => {
      failures.push({
        sourceId: source.id,
        uri,
        message: error instanceof Error ? error.message : String(error)
      });
    };
    // Runs one producer and classifies its result as added/changed/unchanged.
    const ingestOne = async (uri, producer) => {
      try {
        const document = await producer();
        // Look the earlier version up by the produced document's own id.
        // Deriving the id from `uri` is wrong for inline sources, whose
        // documents are keyed by "inline:<sourceId>" rather than by their
        // content, which made every re-ingest count as "added".
        const earlier = previous.get(document.id);
        nextDocuments.set(document.id, document);
        if (!earlier) {
          added += 1;
        } else if (earlier.contentHash !== document.contentHash) {
          changed += 1;
        } else {
          unchanged += 1;
        }
      } catch (error) {
        failed += 1;
        recordFailure(uri, error);
      }
    };
    try {
      if (source.type === "file") {
        await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
        continue;
      }
      if (source.type === "directory") {
        for (const filePath of await listDirectoryFiles(source)) {
          await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
        }
        continue;
      }
      if (source.type === "url") {
        await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
        continue;
      }
      if (source.type === "website") {
        for (const url of await crawlWebsite(source)) {
          await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
        }
        continue;
      }
      if (source.type === "rss") {
        // The RSS helper counts its own failures; it only reports them here.
        const result2 = await ingestRssSource({
          workspacePath,
          source,
          previous,
          nextDocuments,
          onFailure: recordFailure
        });
        added += result2.added;
        changed += result2.changed;
        unchanged += result2.unchanged;
        failed += result2.failed;
        continue;
      }
      if (source.type === "markdown" || source.type === "text") {
        // For inline sources `source.uri` holds the content itself.
        await ingestOne(source.uri, () => ingestInlineContent({
          workspacePath,
          source,
          title: source.name,
          content: source.uri,
          uri: `inline:${source.id}`,
          previous: previous.get(stableId("doc", source.id, `inline:${source.id}`))
        }));
      }
    } catch (error) {
      // Source-level failure (e.g. directory listing, crawl or feed fetch).
      failed += 1;
      recordFailure(source.uri, error);
    }
  }
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
    const source = sources.find((candidate) => candidate.id === document.sourceId);
    return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
  });
  if (expiringDocuments.length > 0) {
    const expiredIds = new Set(expiringDocuments.map((document) => document.id));
    for (const document of expiringDocuments) {
      nextDocuments.delete(document.id);
    }
    await purgeDocuments(workspacePath, expiredIds, [...existing, ...expiringDocuments]);
  }
  const finalDocuments = [...nextDocuments.values()];
  await saveDocuments(workspacePath, finalDocuments);
  const id = runId();
  const run = {
    id,
    kind: "ingest",
    createdAt: nowStamp(),
    success: failed === 0,
    summary: {
      processedSources: sources.length,
      added,
      changed,
      unchanged,
      failed,
      changedOnly
    },
    failures,
    documentsSnapshot: documentSnapshot(finalDocuments)
  };
  await writeRun(workspacePath, run);
  return {
    runId: id,
    documents: { added, changed, unchanged, failed },
    processedSources: sources.length
  };
}
2137
/**
 * Re-extracts stored documents from their raw files and records a
 * "reprocess" run.
 *
 * Documents are skipped when their source is unknown, they have no raw file
 * on disk, or the reprocessing helper returns nothing.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} [options.sourceId] - Only reprocess documents of this source.
 * @param {string} [options.documentId] - Only reprocess this document.
 * @returns {Promise<{runId: string, documentsReprocessed: number, documentsSkipped: number}>}
 */
async function reprocessDocuments({
  workspacePath,
  sourceId,
  documentId
}) {
  const documents = await loadDocuments(workspacePath);
  const sources = await listSources(workspacePath);
  const sourceMap = new Map(sources.map((source) => [source.id, source]));
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
  const remoteTypes = ["url", "website", "rss"];
  const isSelected = (candidate) => {
    const sourceMatches = !sourceId || candidate.sourceId === sourceId;
    const documentMatches = !documentId || candidate.id === documentId;
    return sourceMatches && documentMatches;
  };

  let documentsReprocessed = 0;
  let documentsSkipped = 0;
  for (const document of documents.filter(isSelected)) {
    const source = sourceMap.get(document.sourceId);
    const reprocessable = Boolean(source) && Boolean(document.rawPath) && await fileExists(document.rawPath);
    if (!reprocessable) {
      documentsSkipped += 1;
      continue;
    }
    const updated = remoteTypes.includes(source.type)
      ? await reprocessRemoteDocument(document, source)
      : await reprocessStoredDocument(document, source);
    if (updated) {
      nextDocuments.set(updated.id, updated);
      documentsReprocessed += 1;
    } else {
      documentsSkipped += 1;
    }
  }

  const finalDocuments = [...nextDocuments.values()];
  await saveDocuments(workspacePath, finalDocuments);
  const id = runId();
  await writeRun(workspacePath, {
    id,
    kind: "reprocess",
    createdAt: nowStamp(),
    success: true,
    summary: {
      documentsReprocessed,
      documentsSkipped
    },
    documentsSnapshot: documentSnapshot(finalDocuments)
  });
  return { runId: id, documentsReprocessed, documentsSkipped };
}
2178
+
2179
+ // src/query/search-service.ts
2180
+ import { readFile as readFile11 } from "fs/promises";
2181
+ import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2182
+ import path18 from "path";
2183
/**
 * Loads the latest persisted index state into a fresh `DocumentIndex`.
 *
 * The mapping is created from the `metadata.*` fields present in the state.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {Promise<object>} The result of `DocumentIndex#loadState`.
 */
async function loadHydratedIndex(workspacePath) {
  const state = await readLatestIndexState(workspacePath);
  const fieldNames = Object.keys(state.fieldState ?? {});
  const metadataFields = fieldNames.filter((field) => field.startsWith("metadata."));
  const mapping = createIndexMapping(metadataFields);
  const { DocumentIndex } = await import("@tryformation/querylight-ts");
  return new DocumentIndex(mapping).loadState(state);
}
2188
/**
 * Lower-cases filter values and drops empty ones.
 *
 * @param {string[]|null|undefined} values - Raw filter values.
 * @returns {string[]} Lower-cased, non-empty values (empty list for nullish input).
 */
function normalizeFilterValues(values) {
  const input = values ?? [];
  return input.reduce((kept, value) => {
    const lowered = value.toLowerCase();
    if (lowered) {
      kept.push(lowered);
    }
    return kept;
  }, []);
}
2191
/**
 * Case-insensitive membership test against already lower-cased candidates.
 *
 * @param {string} value - Value to look for.
 * @param {string[]} candidates - Lower-cased allowed values; empty means "no filter".
 * @returns {boolean} `true` when there is no filter or the value is listed.
 */
function matchesAny(value, candidates) {
  if (candidates.length === 0) {
    return true;
  }
  const lowered = value.toLowerCase();
  return candidates.includes(lowered);
}
2194
/**
 * Case-insensitive prefix test against already lower-cased prefixes.
 *
 * @param {string} value - Value to test.
 * @param {string[]} prefixes - Lower-cased prefixes; empty means "no filter".
 * @returns {boolean} `true` when there is no filter or any prefix matches.
 */
function matchesPrefix(value, prefixes) {
  if (prefixes.length > 0) {
    const lowered = value.toLowerCase();
    for (const prefix of prefixes) {
      if (lowered.startsWith(prefix)) {
        return true;
      }
    }
    return false;
  }
  return true;
}
2201
/**
 * Builds the lexical query for a search request.
 *
 * Scoring clauses match the query against `title` and `text`. A term filter
 * for source id, source type or tag is added only when exactly one value was
 * requested for it; every metadata filter becomes a term filter on
 * `metadata.<key>`.
 *
 * @param {string} query - User query text.
 * @param {object} filters - Reads `sourceId(s)`, `sourceType(s)`, `tag(s)`
 *   and `metadata` (`{ key, value }` pairs).
 * @returns {BoolQuery} The assembled query.
 */
function buildSearchQuery(query, filters) {
  // Merges the singular and plural form of a filter into one normalized list.
  const collect = (single, many) => normalizeFilterValues([single, ...(many ?? [])].filter((value) => Boolean(value)));
  const sourceIds = collect(filters.sourceId, filters.sourceIds);
  const sourceTypes = collect(filters.sourceType, filters.sourceTypes);
  const tags = collect(filters.tag, filters.tags);

  const should = [
    new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
    new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
    new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
  ];

  const filter = [];
  if (sourceIds.length === 1) {
    filter.push(new TermQuery({ field: "sourceId", text: sourceIds[0] }));
  }
  if (sourceTypes.length === 1) {
    filter.push(new TermQuery({ field: "sourceType", text: sourceTypes[0] }));
  }
  if (tags.length === 1) {
    filter.push(new TermQuery({ field: "tags", text: tags[0] }));
  }
  for (const { key, value } of filters.metadata ?? []) {
    filter.push(new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }));
  }

  return new BoolQuery({ should, filter });
}
2219
/**
 * Checks that a value is a string `Date` can parse.
 *
 * @param {unknown} value - Candidate date.
 * @returns {boolean} `true` for strings that parse to a valid timestamp.
 */
function isValidDate(value) {
  if (typeof value !== "string") {
    return false;
  }
  const timestamp = new Date(value).getTime();
  return !Number.isNaN(timestamp);
}
2222
/**
 * Reads a date field from a document, accepting only valid date strings.
 *
 * @param {object} document - Document record.
 * @param {string} field - Name of the date field to read.
 * @returns {string|null} The stored string when it is a valid date, else `null`.
 */
function documentDateValue(document, field) {
  const candidate = document[field];
  if (typeof candidate !== "string") {
    return null;
  }
  return isValidDate(candidate) ? candidate : null;
}
2226
/**
 * Checks a document against every requested date range.
 *
 * A range fails when the document has no valid date in `field`, or when the
 * date lies outside the inclusive `[from, to]` bounds (either may be omitted).
 *
 * @param {object} document - Document record.
 * @param {{field: string, from?: string, to?: string}[]} dateRanges - Ranges to satisfy.
 * @returns {boolean} `true` when all ranges match (vacuously for an empty list).
 */
function matchesDateRanges(document, dateRanges) {
  for (const { field, from, to } of dateRanges) {
    const value = documentDateValue(document, field);
    if (!value) {
      return false;
    }
    const timestamp = new Date(value).getTime();
    const lowerBound = from ? new Date(from).getTime() : null;
    const upperBound = to ? new Date(to).getTime() : null;
    const afterStart = lowerBound == null || timestamp >= lowerBound;
    const beforeEnd = upperBound == null || timestamp <= upperBound;
    if (!(afterStart && beforeEnd)) {
      return false;
    }
  }
  return true;
}
2238
/**
 * Resolves the source type to use for a chunk.
 *
 * Preference order: the document's `sourceType`, the source's `type`, a
 * string `sourceType` in the chunk metadata, and finally `"text"`.
 *
 * @param {object} chunk - Chunk record (reads `metadata.sourceType`).
 * @param {object} [document] - Owning document, if known.
 * @param {object} [source] - Owning source, if known.
 * @returns {string} The resolved source type.
 */
function fallbackSourceType(chunk, document, source) {
  const fromMetadata = chunk.metadata.sourceType;
  const metadataSourceType = typeof fromMetadata === "string" ? fromMetadata : void 0;
  const candidates = [document?.sourceType, source?.type, metadataSourceType];
  for (const candidate of candidates) {
    if (candidate != null) {
      return candidate;
    }
  }
  return "text";
}
2242
/**
 * Applies all search filters to one chunk.
 *
 * Singular and plural filter forms are merged and compared
 * case-insensitively. A chunk without a known document only passes when no
 * date ranges were requested.
 *
 * @param {object} chunk - Chunk record (reads `sourceId`, `uri`, `metadata`).
 * @param {object} [document] - Owning document, if known.
 * @param {object} [source] - Owning source, if known.
 * @param {object} filters - Source id/name/type, URI prefix, tag, metadata,
 *   `hasPublicationDate` and `dateRanges` filters.
 * @returns {boolean} `true` when the chunk satisfies every filter.
 */
function filterChunk(chunk, document, source, {
  sourceId,
  sourceIds,
  sourceName,
  sourceNames,
  sourceType,
  sourceTypes,
  uriPrefix,
  uriPrefixes,
  hasPublicationDate,
  tag,
  tags,
  metadata,
  dateRanges
}) {
  // Merges the singular and plural form of a filter into one normalized list.
  const collect = (single, many) => normalizeFilterValues([single, ...(many ?? [])].filter((value) => Boolean(value)));
  const wantedSourceIds = collect(sourceId, sourceIds);
  const wantedSourceNames = collect(sourceName, sourceNames);
  const wantedSourceTypes = collect(sourceType, sourceTypes);
  const wantedUriPrefixes = collect(uriPrefix, uriPrefixes);
  const wantedTags = collect(tag, tags);

  if (!matchesAny(chunk.sourceId, wantedSourceIds)) {
    return false;
  }
  if (!matchesAny(fallbackSourceType(chunk, document, source), wantedSourceTypes)) {
    return false;
  }
  if (wantedSourceNames.length > 0 && !matchesAny(source?.name ?? "", wantedSourceNames)) {
    return false;
  }
  if (!matchesPrefix(document?.uri ?? chunk.uri, wantedUriPrefixes)) {
    return false;
  }
  if (hasPublicationDate && (!document || !documentDateValue(document, "publicationDate"))) {
    return false;
  }

  if (wantedTags.length > 0) {
    const chunkTags = Array.isArray(chunk.metadata.tags)
      ? chunk.metadata.tags.map((value) => String(value).toLowerCase())
      : [];
    const sharesTag = wantedTags.some((wanted) => chunkTags.includes(wanted));
    if (!sharesTag) {
      return false;
    }
  }

  // Every metadata pair must match; array values match when they contain it.
  for (const { key, value } of metadata ?? []) {
    const wanted = value.toLowerCase();
    const candidate = chunk.metadata[key];
    const matched = Array.isArray(candidate)
      ? candidate.map((item) => String(item).toLowerCase()).includes(wanted)
      : String(candidate ?? "").toLowerCase() === wanted;
    if (!matched) {
      return false;
    }
  }

  // Date ranges need document dates; without a document they cannot match.
  return document ? matchesDateRanges(document, dateRanges) : dateRanges.length === 0;
}
2297
/**
 * Comparator that orders date strings newest first.
 *
 * Missing or unparseable dates sort last. Two such values compare as equal
 * (`0`); the plain subtraction would yield `NaN` for them, because
 * `-Infinity - -Infinity` is `NaN`.
 *
 * @param {string|null|undefined} left - First date string.
 * @param {string|null|undefined} right - Second date string.
 * @returns {number} Negative when `left` is newer, positive when `right` is
 *   newer, `0` when they are equal.
 */
function sortDateDescending(left, right) {
  const toTime = (value) => {
    const time = value ? new Date(value).getTime() : Number.NaN;
    return Number.isNaN(time) ? Number.NEGATIVE_INFINITY : time;
  };
  const leftTime = toTime(left);
  const rightTime = toTime(right);
  if (leftTime === rightTime) {
    return 0;
  }
  return rightTime - leftTime;
}
2302
/**
 * Picks the date used to order documents by recency.
 *
 * Returns the first valid date among `publicationDate`, `lastChangedAt`,
 * `lastSeenAt`, `firstSeenAt` and `crawledAt`, in that order.
 *
 * @param {object} document - Document record.
 * @returns {string|null} The chosen date string, or `null` when none is valid.
 */
function latestSortDate(document) {
  const fieldsByPriority = ["publicationDate", "lastChangedAt", "lastSeenAt", "firstSeenAt", "crawledAt"];
  for (const field of fieldsByPriority) {
    const value = documentDateValue(document, field);
    if (value != null) {
      return value;
    }
  }
  return null;
}
2305
/**
 * Chooses the chunk that best stands for a whole document.
 *
 * Prefers the shallowest heading path, then the smallest `uri`, then the
 * smallest `id` (both by `localeCompare`). The input array is not modified.
 *
 * @param {object[]} chunks - Chunks of one document.
 * @returns {object|undefined} The chosen chunk, or `undefined` for an empty list.
 */
function representativeChunk(chunks) {
  const compare = (left, right) => {
    const depthDelta = left.headingPath.length - right.headingPath.length;
    if (depthDelta !== 0) {
      return depthDelta;
    }
    return left.uri !== right.uri
      ? left.uri.localeCompare(right.uri)
      : left.id.localeCompare(right.id);
  };
  const ordered = [...chunks];
  ordered.sort(compare);
  return ordered[0] ?? chunks[0] ?? void 0;
}
2316
/**
 * Removes markdown syntax that should not appear in a plain-text snippet.
 *
 * Fenced code blocks become a space, images and links are reduced to their
 * alt/link text, inline code loses its backticks, and heading and
 * bullet-list markers are dropped.
 *
 * @param {string} text - Markdown text.
 * @returns {string} The text with the markup removed.
 */
function stripSnippetMarkdown(text) {
  return text
    .replace(/```[\s\S]*?```/g, " ")
    // Images must be handled before links: the link pattern also matches the
    // "[alt](src)" part of "![alt](src)" and would leave a stray "!" behind.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .replace(/`([^`]+)`/g, "$1")
    .replace(/^#{1,6}\s+/gm, "")
    .replace(/^\s*[-*+]\s+/gm, "");
}
2319
/**
 * Splits markdown text into cleaned, single-line paragraphs.
 *
 * Markup is stripped first; paragraphs are separated by blank lines, have
 * their whitespace collapsed, and empty ones are discarded.
 *
 * @param {string} text - Markdown text.
 * @returns {string[]} Non-empty paragraphs in document order.
 */
function extractSnippetParagraphs(text) {
  const plain = stripSnippetMarkdown(text);
  const blocks = plain.split(/\n\s*\n+/);
  const collapsed = blocks.map((paragraph) => paragraph.replace(/\s+/g, " ").trim());
  return collapsed.filter(Boolean);
}
2322
/**
 * Builds a snippet from consecutive paragraphs around the first paragraph
 * that contains any query term.
 *
 * Starting from that paragraph (or the first one when nothing matches), the
 * window grows one neighbour at a time, preferring the longer neighbour and
 * the following one on ties, until `targetLength` characters are reached or
 * no paragraphs remain.
 *
 * @param {string[]} paragraphs - Candidate paragraphs.
 * @param {string} query - Query text; split on whitespace into terms.
 * @param {number} [targetLength=900] - Desired minimum snippet length.
 * @returns {string} Selected paragraphs joined by blank lines, or `""`.
 */
function buildParagraphSnippet(paragraphs, query, targetLength = 900) {
  if (paragraphs.length === 0) {
    return "";
  }
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
  const hit = paragraphs.findIndex((paragraph) => {
    const haystack = paragraph.toLowerCase();
    return terms.some((term) => haystack.includes(term));
  });

  let first = hit >= 0 ? hit : 0;
  let afterLast = first + 1;
  let size = paragraphs[first]?.length ?? 0;
  while (size < targetLength) {
    const canGrowBackward = first > 0;
    const canGrowForward = afterLast < paragraphs.length;
    if (!canGrowBackward && !canGrowForward) {
      break;
    }
    // -1 marks a direction that is exhausted so it never wins the comparison.
    const backwardLength = canGrowBackward ? paragraphs[first - 1]?.length ?? 0 : -1;
    const forwardLength = canGrowForward ? paragraphs[afterLast]?.length ?? 0 : -1;
    if (canGrowForward && forwardLength >= backwardLength) {
      size += forwardLength + 2;
      afterLast += 1;
    } else if (canGrowBackward) {
      size += backwardLength + 2;
      first -= 1;
    } else {
      break;
    }
  }
  return paragraphs.slice(first, afterLast).join("\n\n").trim();
}
2351
/**
 * Builds a snippet for a single piece of markdown text.
 *
 * @param {string} text - Markdown text of a chunk.
 * @param {string} query - Query text used to anchor the snippet.
 * @returns {string} The paragraph-based snippet.
 */
function buildSnippet(text, query) {
  const paragraphs = extractSnippetParagraphs(text);
  return buildParagraphSnippet(paragraphs, query);
}
2354
/**
 * Flattens the chunks of a document into paragraphs tagged with the index
 * of the chunk they came from.
 *
 * @param {object[]} chunks - Ordered chunks (reads `text`).
 * @returns {{chunkIndex: number, text: string}[]} Paragraphs in document order.
 */
function buildDocumentParagraphs(chunks) {
  return chunks.flatMap((candidate, chunkIndex) => {
    const texts = extractSnippetParagraphs(candidate.text);
    return texts.map((text) => ({ chunkIndex, text }));
  });
}
2359
/**
 * Builds a snippet anchored in one chunk that may extend into neighbouring
 * chunks of the same document.
 *
 * The anchor is the first paragraph of chunk `chunkIndex` containing a query
 * term, else that chunk's first paragraph, else paragraph 0. The window then
 * grows one neighbour at a time, preferring the longer neighbour and the
 * following one on ties, until `targetLength` characters are reached.
 *
 * @param {{chunkIndex: number, text: string}[]} paragraphs - All paragraphs of the document.
 * @param {number} chunkIndex - Index of the chunk that matched.
 * @param {string} query - Query text; split on whitespace into terms.
 * @param {number} [targetLength=1200] - Desired minimum snippet length.
 * @returns {string} Selected paragraph texts joined by blank lines, or `""`.
 */
function buildExpandedParagraphSnippet(paragraphs, chunkIndex, query, targetLength = 1200) {
  if (paragraphs.length === 0) {
    return "";
  }
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);

  // Positions of the paragraphs that belong to the matched chunk.
  const ownPositions = [];
  paragraphs.forEach((paragraph, position) => {
    if (paragraph.chunkIndex === chunkIndex) {
      ownPositions.push(position);
    }
  });
  const mentionsTerm = (position) => {
    const haystack = paragraphs[position]?.text.toLowerCase() ?? "";
    return terms.some((term) => haystack.includes(term));
  };
  const anchor = ownPositions.find(mentionsTerm) ?? ownPositions[0] ?? 0;

  let first = anchor;
  let afterLast = anchor + 1;
  let size = paragraphs[anchor]?.text.length ?? 0;
  while (size < targetLength) {
    const canGrowBackward = first > 0;
    const canGrowForward = afterLast < paragraphs.length;
    if (!canGrowBackward && !canGrowForward) {
      break;
    }
    // -1 marks a direction that is exhausted so it never wins the comparison.
    const backwardLength = canGrowBackward ? paragraphs[first - 1]?.text.length ?? 0 : -1;
    const forwardLength = canGrowForward ? paragraphs[afterLast]?.text.length ?? 0 : -1;
    if (canGrowForward && forwardLength >= backwardLength) {
      size += forwardLength + 2;
      afterLast += 1;
    } else if (canGrowBackward) {
      size += backwardLength + 2;
      first -= 1;
    } else {
      break;
    }
  }
  return paragraphs.slice(first, afterLast).map((paragraph) => paragraph.text).join("\n\n").trim();
}
2389
/**
 * Builds a snippet for a chunk, extending into neighbouring chunks of the
 * same document when the normalized document is available.
 *
 * The document's chunks are rebuilt from its normalized file and cached per
 * document id in `orderedChunkCache`. Falls back to a snippet from the chunk
 * text alone when there is no document, no normalized file, or the chunk
 * cannot be found among the rebuilt chunks.
 *
 * @param {object} chunk - The matched chunk (reads `id`, `text`).
 * @param {string} query - Query text used to anchor the snippet.
 * @param {object} context
 * @param {object} [context.document] - Owning document, if known.
 * @param {object} context.config - Configuration passed to the chunk builder.
 * @param {Map<string, object[]>} context.orderedChunkCache - Per-document chunk cache.
 * @returns {Promise<string>} The snippet text.
 */
async function buildSnippetWithAdjacentChunks(chunk, query, {
  document,
  config,
  orderedChunkCache
}) {
  const chunkOnlySnippet = () => buildSnippet(chunk.text, query);
  if (!document) {
    return chunkOnlySnippet();
  }

  let ordered = orderedChunkCache.get(document.id);
  if (!ordered) {
    const normalizedExists = await fileExists(document.normalizedPath);
    if (!normalizedExists) {
      return chunkOnlySnippet();
    }
    const normalized = await readFile11(document.normalizedPath, "utf8");
    ordered = buildChunksForDocument(document, normalized, config);
    orderedChunkCache.set(document.id, ordered);
  }

  const position = ordered.findIndex((candidate) => candidate.id === chunk.id);
  if (position < 0) {
    return chunkOnlySnippet();
  }
  const paragraphs = buildDocumentParagraphs(ordered);
  return paragraphs.length === 0
    ? buildSnippet(ordered[position].text, query)
    : buildExpandedParagraphSnippet(paragraphs, position, query);
}
2417
/**
 * Cleans a title for display: removes a trailing "| Querylight TS Demo"
 * suffix (case-insensitive) and collapses whitespace.
 *
 * @param {string} title - Raw title.
 * @returns {string} The cleaned title.
 */
function normalizeDisplayTitle(title) {
  const withoutSiteSuffix = title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "");
  const singleSpaced = withoutSiteSuffix.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
2420
/**
 * Picks the title shown for a search result.
 *
 * Uses the deepest non-empty heading when it differs (case-insensitively)
 * from the document title; otherwise the document title; otherwise the
 * heading, or "Untitled" when nothing is available.
 *
 * @param {object} chunk - Chunk record (reads `title`, `headingPath`).
 * @returns {string} The display title.
 */
function chooseResultTitle(chunk) {
  const documentTitle = normalizeDisplayTitle(chunk.title);
  const headings = chunk.headingPath
    .map((heading) => normalizeDisplayTitle(heading))
    .filter((heading) => Boolean(heading));
  const leafHeading = headings.at(-1);
  const leafIsDistinct = Boolean(leafHeading) && leafHeading.toLowerCase() !== documentTitle.toLowerCase();
  if (leafIsDistinct) {
    return leafHeading;
  }
  return documentTitle || (leafHeading ?? "Untitled");
}
2432
/**
 * Normalizes text for equality comparison: lower-cases it and replaces every
 * run of characters other than `a-z` and `0-9` with a single space.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} The normalized, trimmed text.
 */
function normalizeComparisonText(value) {
  const lowered = value.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9]+/g, " ");
  return alphanumericOnly.replace(/\s+/g, " ").trim();
}
2435
/**
 * Reduces a URI to a comparable lower-case path without trailing slashes.
 *
 * Absolute URLs yield their pathname ("/" when empty); anything `URL` cannot
 * parse is lower-cased and stripped of trailing slashes as-is.
 *
 * @param {string} uri - URL or path-like string.
 * @returns {string} The normalized path.
 */
function normalizeUriPath(uri) {
  let parsed;
  try {
    parsed = new URL(uri);
  } catch {
    // Not an absolute URL (e.g. a file path): normalize the raw string.
    return uri.toLowerCase().replace(/\/+$/, "");
  }
  const trimmed = parsed.pathname.replace(/\/+$/, "");
  return (trimmed || "/").toLowerCase();
}
2444
/**
 * Measures how deep a URI's path is.
 *
 * @param {string} uri - URL or path-like string.
 * @returns {number} Number of non-empty path segments; `0` for the root.
 */
function uriSpecificity(uri) {
  const normalizedPath = normalizeUriPath(uri);
  const isRoot = normalizedPath === "/";
  return isRoot ? 0 : normalizedPath.split("/").filter(Boolean).length;
}
2451
/**
 * Tells whether `candidate` duplicates `existing` at a deeper URL.
 *
 * Both results must come from the same source, have the same non-empty
 * normalized title, and have different paths where one is nested under the
 * other. The candidate wins when its path has more segments.
 *
 * @param {object} candidate - Result being considered (reads `sourceId`, `title`, `uri`).
 * @param {object} existing - Result already kept.
 * @returns {boolean} `true` when `candidate` is the more specific duplicate.
 */
function isMoreSpecificDuplicate(candidate, existing) {
  if (candidate.sourceId !== existing.sourceId) {
    return false;
  }

  const candidateTitle = normalizeComparisonText(candidate.title);
  const existingTitle = normalizeComparisonText(existing.title);
  const sameTitle = Boolean(candidateTitle) && candidateTitle === existingTitle;
  if (!sameTitle) {
    return false;
  }

  const candidatePath = normalizeUriPath(candidate.uri);
  const existingPath = normalizeUriPath(existing.uri);
  if (candidatePath === existingPath) {
    return false;
  }

  // Prefix that every path nested below `parent` must start with.
  const childPrefix = (parent) => (parent === "/" ? "/" : `${parent}/`);
  const nested = candidatePath.startsWith(childPrefix(existingPath)) || existingPath.startsWith(childPrefix(candidatePath));
  if (!nested) {
    return false;
  }
  return uriSpecificity(candidate.uri) > uriSpecificity(existing.uri);
}
2472
/**
 * Collapses results that duplicate each other at different URL depths,
 * keeping the more specific one in the position of the first occurrence.
 *
 * @param {object[]} results - Ranked results.
 * @param {number} topK - Maximum number of results to return.
 * @returns {object[]} The first `topK` de-duplicated results.
 */
function collapseAggregateDuplicates(results, topK) {
  const kept = [];
  for (const result2 of results) {
    const position = kept.findIndex(
      (existing) => isMoreSpecificDuplicate(result2, existing) || isMoreSpecificDuplicate(existing, result2)
    );
    if (position < 0) {
      kept.push(result2);
    } else if (isMoreSpecificDuplicate(result2, kept[position])) {
      // The newcomer is the deeper page: it takes over the kept slot.
      kept[position] = result2;
    }
  }
  return kept.slice(0, topK);
}
2488
/**
 * Collapses chunk-level results into one result per document and re-ranks them.
 *
 * Each document is represented by its best-scoring chunk; its score becomes
 * the best score plus 35% of the other chunk scores plus 0.2 for each
 * additional chunk. URL-depth duplicates are collapsed afterwards.
 *
 * @param {object[]} results - Chunk-level results (reads `documentId`, `score`).
 * @param {number} topK - Maximum number of results to return.
 * @returns {object[]} Document-level results, best first.
 */
function rerankResultsByDocument(results, topK) {
  const groups = /* @__PURE__ */ new Map();
  for (const result2 of results) {
    const bucket = groups.get(result2.documentId);
    if (bucket) {
      bucket.push(result2);
    } else {
      groups.set(result2.documentId, [result2]);
    }
  }

  const byScoreDescending = (left, right) => right.score - left.score;
  const aggregated = [];
  for (const group of groups.values()) {
    const [best, ...rest] = [...group].sort(byScoreDescending);
    if (!best) {
      continue;
    }
    const tailScore = rest.reduce((sum, result2) => sum + result2.score, 0);
    const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
    aggregated.push({ ...best, score: aggregateScore });
  }
  aggregated.sort(byScoreDescending);
  return collapseAggregateDuplicates(aggregated, topK);
}
2510
/**
 * Search a workspace and return ranked results.
 *
 * Reads the chunk, document and source JSONL files, keeps the chunks accepted by
 * `filterChunk`, and then either:
 *  - for a blank query, lists one representative chunk per document ordered by
 *    `sortDateDescending(latestSortDate(...))`, or
 *  - runs the requested retrieval mode (`lexical`, `dense`, `sparse`, anything else is
 *    treated as hybrid) and hands the hits to `rerankResultsByDocument`.
 *
 * @param {object} params
 * @param {string} params.workspacePath Workspace root directory.
 * @param {string} params.query Raw query text; surrounding whitespace is trimmed.
 * @param {number} params.topK Maximum number of results to return.
 * @param {string} [params.retrievalMode] Overrides `config.retrieval.defaultMode`.
 * @param {Array} [params.dateRanges] Date range filters forwarded to `filterChunk`.
 * @param {boolean} [params.showChunks] When true, each result carries the chunk text.
 * @returns {Promise<{retrievalMode: string, results: object[]}>}
 * @throws {CliError} `DENSE_INDEX_MISSING` / `SPARSE_INDEX_MISSING` when that mode is
 *   requested explicitly and the corresponding vector file does not exist.
 */
async function searchIndex({
  workspacePath,
  query,
  topK,
  sourceId,
  sourceIds,
  sourceName,
  sourceNames,
  sourceType,
  sourceTypes,
  uriPrefix,
  uriPrefixes,
  hasPublicationDate,
  tag,
  tags,
  metadata,
  dateRanges = [],
  retrievalMode,
  showChunks = false
}) {
  const config = await loadConfig(workspacePath);
  const mode = retrievalMode ?? config.retrieval.defaultMode;
  const candidateLimit = Math.max(topK * 5, 50);
  // The three record files are independent, so read them concurrently.
  const [chunkRecords, documentRecords, sourceRecords] = await Promise.all([
    readJsonl(path18.join(workspacePath, "chunks", "chunks.jsonl")),
    readJsonl(path18.join(workspacePath, "documents", "documents.jsonl")),
    readJsonl(path18.join(workspacePath, "sources", "sources.jsonl"))
  ]);
  const chunks = new Map(chunkRecords.map((chunk) => [chunk.id, chunk]));
  const documents = new Map(documentRecords.map((document) => [document.id, document]));
  const sources = new Map(sourceRecords.map((source) => [source.id, source]));
  const orderedChunkCache = /* @__PURE__ */ new Map();
  const normalizedQuery = query.trim();
  const filters = { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges };
  const matchingChunks = [...chunks.values()].filter(
    (chunk) => filterChunk(chunk, documents.get(chunk.documentId), sources.get(chunk.sourceId), filters)
  );

  if (normalizedQuery.length === 0) {
    // Blank query: list documents instead of ranking hits.
    const chunksByDocument = /* @__PURE__ */ new Map();
    for (const chunk of matchingChunks) {
      const bucket = chunksByDocument.get(chunk.documentId);
      if (bucket) {
        bucket.push(chunk);
      } else {
        chunksByDocument.set(chunk.documentId, [chunk]);
      }
    }
    const newestFirst = [...chunksByDocument.entries()].sort(([leftDocumentId], [rightDocumentId]) => {
      const leftDocument = documents.get(leftDocumentId);
      const rightDocument = documents.get(rightDocumentId);
      return sortDateDescending(leftDocument ? latestSortDate(leftDocument) : null, rightDocument ? latestSortDate(rightDocument) : null);
    }).slice(0, topK);
    const latestResults = await Promise.all(
      newestFirst.map(async ([documentId, documentChunks]) => {
        const document = documents.get(documentId);
        const chunk = representativeChunk(documentChunks);
        if (!chunk || !document) {
          return null;
        }
        return {
          chunkId: chunk.id,
          documentId: chunk.documentId,
          sourceId: chunk.sourceId,
          sourceType: document.sourceType,
          score: 0,
          title: chooseResultTitle(chunk),
          uri: chunk.uri,
          headingPath: chunk.headingPath,
          // No query text is available here, so the document title is passed in its place.
          snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
            document,
            config,
            orderedChunkCache
          }),
          text: showChunks ? chunk.text : void 0,
          publicationDate: document.publicationDate ?? null,
          firstSeenAt: document.firstSeenAt,
          lastSeenAt: document.lastSeenAt,
          lastChangedAt: document.lastChangedAt,
          metadata: chunk.metadata
        };
      })
    );
    return { retrievalMode: "lexical", results: latestResults.filter((result2) => result2 != null) };
  }

  // A Set keeps the per-hit membership test O(1); the previous array scan was
  // O(chunks) for every hit of every ranking.
  const allowedChunkIds = new Set(matchingChunks.map((chunk) => chunk.id));
  const keepAllowed = (hits2) => hits2.filter(([chunkId]) => allowedChunkIds.has(chunkId)).slice(0, candidateLimit);

  const lexicalHits = async () => {
    const index = await loadHydratedIndex(workspacePath);
    const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
    return keepAllowed(all);
  };
  // Vector queries without the existence check; hybrid mode performs its own check
  // so that a missing index is skipped rather than reported as an error.
  const queryDense = async () => keepAllowed(
    await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit })
  );
  const querySparse = async () => keepAllowed(
    await sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit })
  );
  const denseHits = async () => {
    if (!await fileExists(denseVectorPath(workspacePath))) {
      throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
    }
    return queryDense();
  };
  const sparseHits = async () => {
    if (!await fileExists(sparseVectorPath(workspacePath))) {
      throw new CliError("sparse vector index is not built; run `qli models pull --sparse` and `qli rebuild`", "SPARSE_INDEX_MISSING", 7 /* QueryError */);
    }
    return querySparse();
  };

  let hits;
  if (mode === "lexical") {
    hits = await lexicalHits();
  } else if (mode === "dense") {
    hits = await denseHits();
  } else if (mode === "sparse") {
    hits = await sparseHits();
  } else {
    // Hybrid: lexical always participates; dense and sparse join only when built.
    const rankings = [await lexicalHits()];
    if (await fileExists(denseVectorPath(workspacePath))) {
      rankings.push(await queryDense());
    }
    if (await fileExists(sparseVectorPath(workspacePath))) {
      rankings.push(await querySparse());
    }
    // The first (lexical) ranking is weighted 3, every other ranking 1.
    hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
  }

  const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
    const chunk = chunks.get(chunkId);
    if (!chunk) {
      return null;
    }
    const document = documents.get(chunk.documentId);
    return {
      chunkId,
      documentId: chunk.documentId,
      sourceId: chunk.sourceId,
      sourceType: document?.sourceType ?? "text",
      score,
      title: chooseResultTitle(chunk),
      uri: chunk.uri,
      headingPath: chunk.headingPath,
      snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
        document,
        config,
        orderedChunkCache
      }),
      text: showChunks ? chunk.text : void 0,
      publicationDate: document?.publicationDate ?? null,
      firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
      lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
      lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
      metadata: chunk.metadata
    };
  }));
  const results = rawResults.filter((result2) => result2 != null);
  return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
}
2653
+
2654
+ // src/query/related-service.ts
2655
+ import path19 from "path";
2656
/**
 * Cosine similarity of two numeric vectors.
 *
 * Only the first `left.length` positions are visited; missing entries count as 0.
 * Returns 0 when either vector has zero magnitude over those positions.
 *
 * @param {ArrayLike<number>} left
 * @param {ArrayLike<number>} right
 * @returns {number}
 */
function cosineSimilarity(left, right) {
  let dotProduct = 0;
  let leftSquares = 0;
  let rightSquares = 0;
  let position = 0;
  while (position < left.length) {
    const a = left[position] ?? 0;
    const b = right[position] ?? 0;
    dotProduct += a * b;
    leftSquares += a * a;
    rightSquares += b * b;
    position += 1;
  }
  const hasZeroVector = leftSquares === 0 || rightSquares === 0;
  return hasZeroVector ? 0 : dotProduct / (Math.sqrt(leftSquares) * Math.sqrt(rightSquares));
}
2672
/**
 * Scale a vector to unit length.
 *
 * A vector with zero magnitude maps to an all-zero vector of the same length.
 *
 * @param {number[]} values
 * @returns {number[]}
 */
function normalizeVector(values) {
  const sumOfSquares = values.reduce((running, component) => running + component * component, 0);
  const magnitude = Math.sqrt(sumOfSquares);
  const scale = (component) => (magnitude === 0 ? 0 : component / magnitude);
  return values.map(scale);
}
2679
/**
 * Average the `embedding` arrays of `records` component-wise over `dimensions`
 * positions and pass the mean through `normalizeVector`.
 *
 * Missing embedding entries count as 0; an empty `records` list divides by 1.
 *
 * @param {Array<{embedding: ArrayLike<number>}>} records
 * @param {number} dimensions
 * @returns {number[]}
 */
function averageEmbeddings(records, dimensions) {
  const divisor = Math.max(records.length, 1);
  const sums = new Array(dimensions).fill(0);
  for (const record of records) {
    for (const slot of sums.keys()) {
      sums[slot] += record.embedding[slot] ?? 0;
    }
  }
  const mean = sums.map((sum) => sum / divisor);
  return normalizeVector(mean);
}
2688
/**
 * Find the single document whose id, uri, or canonicalUri equals `selector`
 * (compared after trimming the selector and lower-casing both sides).
 *
 * @param {object[]} documents
 * @param {string} selector
 * @returns {object} The matching document.
 * @throws {CliError} `DOCUMENT_NOT_FOUND` when nothing matches,
 *   `DOCUMENT_SELECTOR_AMBIGUOUS` when more than one document matches.
 */
function resolveDocumentSelector(documents, selector) {
  const needle = selector.trim().toLowerCase();
  const matchesSelector = (candidate) => {
    if (candidate.id.toLowerCase() === needle) {
      return true;
    }
    if (candidate.uri.toLowerCase() === needle) {
      return true;
    }
    return candidate.canonicalUri?.toLowerCase() === needle;
  };
  const found = documents.filter(matchesSelector);
  if (found.length > 1) {
    throw new CliError(`document selector is ambiguous: ${selector}`, "DOCUMENT_SELECTOR_AMBIGUOUS", 2 /* InvalidArguments */);
  }
  if (found.length === 0) {
    throw new CliError(`document not found: ${selector}`, "DOCUMENT_NOT_FOUND", 2 /* InvalidArguments */);
  }
  return found[0];
}
2701
/**
 * Build one averaged embedding per document from its dense chunk records.
 *
 * Chunks are grouped by `documentId`; documents without any chunk record are left
 * out of the result. Entries follow the order of `documents`.
 *
 * @param {object[]} documents
 * @param {Array<{documentId: string}>} denseChunks
 * @param {number} dimensions
 * @returns {Map<string, {document: object, embedding: number[]}>}
 */
function buildDocumentVectors(documents, denseChunks, dimensions) {
  const grouped = /* @__PURE__ */ new Map();
  for (const chunk of denseChunks) {
    if (!grouped.has(chunk.documentId)) {
      grouped.set(chunk.documentId, []);
    }
    grouped.get(chunk.documentId).push(chunk);
  }
  const vectors = /* @__PURE__ */ new Map();
  documents.forEach((document) => {
    const records = grouped.get(document.id);
    if (records?.length) {
      vectors.set(document.id, { document, embedding: averageEmbeddings(records, dimensions) });
    }
  });
  return vectors;
}
2719
/**
 * Rank the other documents in the workspace by cosine similarity of their
 * averaged dense embeddings to the selected document.
 *
 * @param {object} params
 * @param {string} params.workspacePath Workspace root directory.
 * @param {string} params.document Selector passed to `resolveDocumentSelector`.
 * @param {number} params.topK Maximum number of related documents to return.
 * @returns {Promise<{sourceDocument: object, retrievalMode: "dense", results: object[]}>}
 * @throws {CliError} `DENSE_RETRIEVAL_DISABLED`, `DENSE_INDEX_MISSING`, or
 *   `DOCUMENT_VECTOR_MISSING`, plus whatever `resolveDocumentSelector` throws.
 */
async function findRelatedDocuments({
  workspacePath,
  document,
  topK
}) {
  const { retrieval } = await loadConfig(workspacePath);
  if (!retrieval.dense.enabled) {
    throw new CliError("dense retrieval is disabled in config; enable retrieval.dense.enabled and rebuild", "DENSE_RETRIEVAL_DISABLED", 7 /* QueryError */);
  }
  const hasDenseIndex = await fileExists(denseVectorPath(workspacePath));
  if (!hasDenseIndex) {
    throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
  }
  const documentsFile = path19.join(workspacePath, "documents", "documents.jsonl");
  const documents = await readJsonl(documentsFile);
  const selected = resolveDocumentSelector(documents, document);
  const densePayload = await readDensePayload(workspacePath);
  const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
  const sourceVector = vectors.get(selected.id);
  if (!sourceVector) {
    throw new CliError(`dense vectors are missing for document: ${document}`, "DOCUMENT_VECTOR_MISSING", 7 /* QueryError */);
  }
  const scored = [];
  for (const candidate of vectors.values()) {
    // The selected document is never related to itself.
    if (candidate.document.id === selected.id) {
      continue;
    }
    scored.push({
      documentId: candidate.document.id,
      sourceId: candidate.document.sourceId,
      score: cosineSimilarity(sourceVector.embedding, candidate.embedding),
      title: candidate.document.title,
      uri: candidate.document.uri,
      metadata: candidate.document.metadata
    });
  }
  scored.sort((left, right) => right.score - left.score);
  const sourceDocument = {
    documentId: selected.id,
    sourceId: selected.sourceId,
    title: selected.title,
    uri: selected.uri
  };
  return {
    sourceDocument,
    retrievalMode: "dense",
    results: scored.slice(0, topK)
  };
}
2758
+
2759
+ // src/query/context-builder.ts
2760
/**
 * Build a markdown context document from the top search results.
 *
 * Results are taken in ranking order until adding the next chunk text would push
 * the running character total past `maxChars`; the first result is always kept.
 *
 * @param {object} params
 * @param {string} params.workspacePath Workspace root directory.
 * @param {string} params.query Query text forwarded to `searchIndex`.
 * @param {number} params.topK Maximum number of results requested from `searchIndex`.
 * @param {number} params.maxChars Budget for the summed chunk text lengths.
 * @param {string} [params.retrievalMode] Retrieval mode forwarded to `searchIndex`.
 * @returns {Promise<{markdown: string, sources: object[], retrievalMode: string}>}
 */
async function createContext({
  workspacePath,
  query,
  topK,
  maxChars,
  retrievalMode
}) {
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
  const sources = [];
  let total = 0;
  for (const result2 of search.results) {
    const text = result2.text ?? "";
    if (total + text.length > maxChars && sources.length > 0) {
      break;
    }
    total += text.length;
    sources.push({
      chunkId: result2.chunkId,
      documentId: result2.documentId,
      sourceId: result2.sourceId,
      title: result2.title,
      uri: result2.uri,
      headingPath: result2.headingPath,
      text,
      metadata: result2.metadata
    });
  }
  const markdown = [
    "# Context",
    "",
    ...sources.flatMap((source, index) => [
      `## Source ${index + 1}`,
      `Title: ${source.title}`,
      `URL: ${source.uri}`,
      `Chunk ID: ${source.chunkId}`,
      // Only the heading-path line is optional. Filtering every empty string would
      // also strip the blank separator lines below and run the sources together.
      ...(source.headingPath.length > 0 ? [`Heading Path: ${source.headingPath.join(" > ")}`] : []),
      "",
      source.text,
      ""
    ])
  ].join("\n");
  return { markdown, sources, retrievalMode: search.retrievalMode };
}
2803
+
2804
+ // src/report/diff-service.ts
2805
+ import path20 from "path";
2806
/**
 * Pick the run to diff against.
 *
 * With no `since`, or with the literal "last-run", the last run is used. Otherwise
 * the last run whose `createdAt` sorts before `since` is used, falling back to the
 * last run when none qualifies.
 *
 * @param {Array<{createdAt: string}>} runs
 * @param {string} [since]
 * @returns {object|undefined} `undefined` when `runs` is empty.
 */
function chooseBaselineRun(runs, since) {
  const latest = runs.at(-1);
  const useLatest = !since || since === "last-run";
  if (useLatest) {
    return latest;
  }
  const earlier = runs.filter((run) => run.createdAt < since);
  return earlier.at(-1) ?? latest;
}
2815
/**
 * List documents that differ from the baseline run's snapshot.
 *
 * A document counts as changed when it is absent from the baseline snapshot, its
 * `contentHash` differs, or `since` is set and `lastChangedAt >= since`.
 *
 * @param {object} params
 * @param {string} params.workspacePath Workspace root directory.
 * @param {string} [params.sourceId] Restrict to documents of this source.
 * @param {string} [params.documentId] Restrict to this document.
 * @param {string} [params.since] Forwarded to `chooseBaselineRun` and used in the
 *   `lastChangedAt` comparison.
 * @returns {Promise<{changedDocuments: object[]}>}
 */
async function diffWorkspace({
  workspacePath,
  sourceId,
  documentId,
  since
}) {
  const documentsFile = path20.join(workspacePath, "documents", "documents.jsonl");
  const current = await readJsonl(documentsFile);
  const runs = await listRuns(workspacePath);
  const baseline = chooseBaselineRun(runs, since);
  const previous = /* @__PURE__ */ new Map();
  (baseline?.documentsSnapshot ?? []).forEach((snapshot) => {
    previous.set(snapshot.id, snapshot);
  });
  const inScope = (document) => {
    if (sourceId && document.sourceId !== sourceId) {
      return false;
    }
    if (documentId && document.id !== documentId) {
      return false;
    }
    return true;
  };
  const hasChanged = (document) => {
    const prior = previous.get(document.id);
    if (!prior) {
      return true;
    }
    if (prior.contentHash !== document.contentHash) {
      return true;
    }
    return Boolean(since) && document.lastChangedAt >= since;
  };
  const changedDocuments = [];
  current.forEach((document) => {
    if (!inScope(document) || !hasChanged(document)) {
      return;
    }
    changedDocuments.push({
      id: document.id,
      title: document.title,
      uri: document.uri,
      sourceId: document.sourceId,
      previousHash: previous.get(document.id)?.contentHash,
      currentHash: document.contentHash
    });
  });
  return { changedDocuments };
}
2837
+ function renderChangeReport(diff) {
2838
+ return [
2839
+ "# Knowledge Base Change Report",
2840
+ "",
2841
+ "## Summary",
2842
+ "",
2843
+ `Changed documents: ${diff.changedDocuments.length}`,
2844
+ "",
2845
+ "## Added Documents",
2846
+ "",
2847
+ "_No added documents in this simple report._",
2848
+ "",
2849
+ "## Changed Documents",
2850
+ "",
2851
+ ...diff.changedDocuments.map((document) => `- ${document.title} (${document.uri}) [${document.id}]`),
2852
+ "",
2853
+ "## Removed or Missing Documents",
2854
+ "",
2855
+ "_Removal tracking is not available for this report._",
2856
+ "",
2857
+ "## Notable Changed Sections",
2858
+ "",
2859
+ ...diff.changedDocuments.map((document) => `- ${document.sourceId}: ${document.title}`)
2860
+ ].join("\n");
2861
+ }
2862
+
2863
+ // src/cli/format.ts
2864
+ import Table from "cli-table3";
2865
+ import colors from "picocolors";
2866
/**
 * Render the configured sources as a text table, one row per source.
 *
 * @param {Array<{id: string, type: string, name: string, uri: string, enabled: boolean, tags: string[]}>} sources
 * @returns {string}
 */
function formatSourcesTable(sources) {
  const columns = ["ID", "TYPE", "NAME", "URI", "ENABLED", "TAGS"];
  const table = new Table({ head: columns });
  for (const source of sources) {
    const enabledLabel = String(source.enabled);
    const tagList = source.tags.join(",");
    table.push([source.id, source.type, source.name, source.uri, enabledLabel, tagList]);
  }
  return table.toString();
}
2873
/**
 * Render search results as numbered, human-readable text blocks separated by a
 * blank line.
 *
 * @param {Array<{title: string, uri: string, sourceType: string, publicationDate: (string|null|undefined), score: number, snippet: string}>} results
 * @returns {string}
 */
function formatSearchResults(results) {
  const blocks = results.map((hit, position) => {
    const published = hit.publicationDate ?? "n/a";
    const lines = [
      `${position + 1}. ${colors.bold(hit.title)}`,
      ` ${hit.uri}`,
      ` Source type: ${hit.sourceType}`,
      ` Published: ${published}`,
      ` Score: ${hit.score.toFixed(3)}`,
      ` ${hit.snippet}`
    ];
    return lines.join("\n");
  });
  return blocks.join("\n\n");
}
2883
/**
 * Render related-document results as numbered text blocks separated by a blank line.
 *
 * @param {Array<{title: string, uri: string, score: number}>} results
 * @returns {string}
 */
function formatRelatedDocuments(results) {
  const blocks = results.map((related, position) => {
    const heading = `${position + 1}. ${colors.bold(related.title)}`;
    const location = ` ${related.uri}`;
    const similarity = ` Similarity: ${related.score.toFixed(3)}`;
    return [heading, location, similarity].join("\n");
  });
  return blocks.join("\n\n");
}
2890
+
2891
+ // src/cli/run-cli.ts
2892
// Ordered lists are used for help text; the matching sets are used for validation.
var SOURCE_TYPE_LIST = ["url", "website", "rss", "file", "directory", "markdown", "text"];
var RETRIEVAL_MODE_LIST = ["lexical", "dense", "sparse", "hybrid"];
var SOURCE_TYPES = /* @__PURE__ */ new Set(SOURCE_TYPE_LIST);
var RETRIEVAL_MODES = /* @__PURE__ */ new Set(RETRIEVAL_MODE_LIST);
// Document date fields that searchDateRanges reads `<field>From` / `<field>To` options for.
var SEARCH_DATE_FIELDS = ["publicationDate", "firstSeenAt", "lastSeenAt", "lastChangedAt", "crawledAt"];
2897
/**
 * Split a `key=value` argument at its first "=".
 *
 * @param {string} input
 * @returns {[string, string]} The key and the (possibly empty) value.
 * @throws {CliError} `INVALID_ARGUMENT` when there is no "=" or the key is empty.
 */
function parseKeyValue(input) {
  const separatorAt = input.indexOf("=");
  const hasKey = separatorAt > 0;
  if (!hasKey) {
    throw new CliError(`invalid key=value pair: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  const key = input.substring(0, separatorAt);
  const value = input.substring(separatorAt + 1);
  return [key, value];
}
2904
/**
 * Turn a list of `key=value` strings into a plain object.
 *
 * @param {string[]} [values] Defaults to an empty list.
 * @returns {Record<string, string>}
 */
function normalizeMetadata(values = []) {
  const pairs = values.map((entry) => parseKeyValue(entry));
  return Object.fromEntries(pairs);
}
2907
/**
 * Parse an optional numeric CLI option.
 *
 * @param {string|undefined} input Raw option value; `undefined` means "not given".
 * @param {string} optionName Flag name used in the error message.
 * @returns {number|undefined} `undefined` when the option was not given.
 * @throws {CliError} `INVALID_ARGUMENT` when the value is blank or not a finite number.
 */
function parseOptionalNumber(input, optionName) {
  if (input === void 0) {
    return void 0;
  }
  // Number("") and Number("   ") are 0, so a blank value must be rejected
  // explicitly instead of being accepted as zero.
  const isBlank = typeof input === "string" && input.trim() === "";
  const value = isBlank ? Number.NaN : Number(input);
  if (!Number.isFinite(value)) {
    throw new CliError(`invalid number for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  return value;
}
2917
/**
 * Assign `target[key] = value` unless `value` is `undefined`.
 *
 * @param {object} target Object mutated in place.
 * @param {string} key
 * @param {*} value
 * @returns {void}
 */
function setWhenDefined(target, key, value) {
  if (value === void 0) {
    return;
  }
  target[key] = value;
}
2922
/**
 * Build the `crawl` settings for a new source from CLI options.
 *
 * Only url, website, directory and rss sources get crawl settings. Website sources
 * always set `useSitemap`; rss sources always set `fetchArticles` and take their
 * retention from the option or, failing that, from `defaults.retentionDays`.
 *
 * @param {string} type Source type.
 * @param {object} options Parsed CLI options.
 * @param {{retentionDays: number}} defaults Workspace-level fallbacks.
 * @returns {object|undefined} `undefined` when no crawl setting applies.
 * @throws {CliError} Propagated from `parseOptionalNumber` for invalid numbers.
 */
function createSourceCrawlConfig(type, options, defaults) {
  const crawlableTypes = new Set(["url", "website", "directory", "rss"]);
  if (!crawlableTypes.has(type)) {
    return void 0;
  }
  const crawl = {};
  // Listed in the order the keys must appear in the stored config.
  const candidates = [
    ["maxDepth", parseOptionalNumber(options.maxDepth, "--max-depth")],
    ["maxPages", parseOptionalNumber(options.maxPages, "--max-pages")],
    ["includePatterns", options.include],
    ["excludePatterns", options.exclude],
    ["obeyRobotsTxt", options.robots],
    ["rateLimitMs", parseOptionalNumber(options.rateLimitMs, "--rate-limit-ms")]
  ];
  for (const [key, value] of candidates) {
    setWhenDefined(crawl, key, value);
  }
  if (options.renderJs) {
    crawl.renderJs = true;
  }
  if (type === "website") {
    crawl.useSitemap = true;
  }
  const retentionDays = parseOptionalNumber(options.retentionDays, "--retention-days");
  if (type === "rss") {
    crawl.retentionDays = retentionDays ?? defaults.retentionDays;
    crawl.fetchArticles = true;
  } else {
    setWhenDefined(crawl, "retentionDays", retentionDays);
  }
  const hasSettings = Object.keys(crawl).length > 0;
  return hasSettings ? crawl : void 0;
}
2947
/**
 * Names of the option fields that `qli source config` may change for a source.
 *
 * Every source allows name, tag and metadata; rss, website and directory sources
 * add their own crawl-related fields.
 *
 * @param {{type: string}} source
 * @returns {Set<string>}
 */
function allowedSourceConfigFields(source) {
  const extrasByType = new Map([
    ["rss", ["retentionDays"]],
    ["website", ["maxDepth", "maxPages", "include", "exclude"]],
    ["directory", ["include", "exclude"]]
  ]);
  const extras = extrasByType.get(source.type) ?? [];
  return /* @__PURE__ */ new Set(["name", "tag", "metadata", ...extras]);
}
2964
/**
 * Translate `qli source config` options into a patch for an existing source.
 *
 * name, tag (stored as `tags`) and metadata are copied when given. Crawl options
 * are checked against `allowedSourceConfigFields(source)` before being added under
 * `patch.crawl`.
 *
 * @param {{type: string}} source The source being edited.
 * @param {object} options Parsed CLI options.
 * @returns {object} Patch object; empty when no option was given.
 * @throws {CliError} `INVALID_ARGUMENT` when an option is not supported for the
 *   source type, plus errors from `normalizeMetadata` / `parseOptionalNumber`.
 */
function buildSourceConfigPatch(source, options) {
  const allowed = allowedSourceConfigFields(source);
  const patch = {};
  if (options.name !== void 0) {
    patch.name = options.name;
  }
  if (options.tag !== void 0) {
    patch.tags = options.tag;
  }
  if (options.metadata !== void 0) {
    patch.metadata = normalizeMetadata(options.metadata);
  }
  // Processed in this order so the first offending flag is the one reported.
  const crawlOptions = [
    { option: "maxDepth", flag: "--max-depth", target: "maxDepth", numeric: true },
    { option: "maxPages", flag: "--max-pages", target: "maxPages", numeric: true },
    { option: "include", flag: "--include", target: "includePatterns", numeric: false },
    { option: "exclude", flag: "--exclude", target: "excludePatterns", numeric: false },
    { option: "retentionDays", flag: "--retention-days", target: "retentionDays", numeric: true }
  ];
  const crawlPatch = {};
  for (const { option, flag, target, numeric } of crawlOptions) {
    const raw = options[option];
    if (raw === void 0) {
      continue;
    }
    if (!allowed.has(option)) {
      throw new CliError(`${flag} is not supported for source type ${source.type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
    }
    crawlPatch[target] = numeric ? parseOptionalNumber(raw, flag) : raw;
  }
  if (Object.keys(crawlPatch).length > 0) {
    patch.crawl = crawlPatch;
  }
  return patch;
}
3007
/**
 * Build the JSON envelope emitted for a command.
 *
 * @param {string} command Command name.
 * @param {string} workspace Workspace path.
 * @param {*} data Command payload.
 * @param {*} [error] Error payload; `ok` is true only when this is falsy.
 * @returns {{ok: boolean, command: string, workspace: string, version: string, data: *, error: *}}
 */
function response(command, workspace, data, error) {
  const ok = !error;
  const version = PACKAGE_VERSION;
  return { ok, command, workspace, version, data, error };
}
3017
/**
 * Append a value to the captured stdout buffer, or to stderr when `stderr` is true.
 *
 * @param {{stdout: Array, stderr: Array}} capture
 * @param {*} value
 * @param {boolean} [stderr=false]
 * @returns {void}
 */
function writeOutput(capture, value, stderr = false) {
  const channel = stderr ? capture.stderr : capture.stdout;
  channel.push(value);
}
3020
/**
 * Validate an optional retrieval mode option.
 *
 * @param {string|undefined} input
 * @returns {string|undefined} The mode, or `undefined` when `input` is falsy.
 * @throws {CliError} `INVALID_ARGUMENT` when the mode is not in `RETRIEVAL_MODES`.
 */
function parseRetrievalMode(input) {
  if (!input) {
    return void 0;
  }
  if (RETRIEVAL_MODES.has(input)) {
    return input;
  }
  throw new CliError(`unsupported retrieval mode: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
}
3029
/**
 * Validate an optional source type option.
 *
 * @param {string|undefined} input
 * @returns {string|undefined} The type, or `undefined` when `input` is falsy.
 * @throws {CliError} `INVALID_ARGUMENT` when the type is not in `SOURCE_TYPES`.
 */
function parseSourceType(input) {
  if (!input) {
    return void 0;
  }
  if (SOURCE_TYPES.has(input)) {
    return input;
  }
  throw new CliError(`unsupported source type: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
}
3038
/**
 * Split a comma-separated option into trimmed, non-empty items.
 *
 * @param {string|null|undefined} input
 * @returns {string[]|undefined} `undefined` when no item remains.
 */
function parseCommaSeparatedList(input) {
  const items = [];
  for (const piece of (input ?? "").split(",")) {
    const cleaned = piece.trim();
    if (cleaned) {
      items.push(cleaned);
    }
  }
  return items.length === 0 ? void 0 : items;
}
3042
/**
 * Parse a comma-separated list of source types, validating each entry.
 *
 * @param {string|undefined} input
 * @returns {string[]|undefined} `undefined` when the list is empty.
 * @throws {CliError} Propagated from `parseSourceType` for an unknown type.
 */
function parseSourceTypes(input) {
  const names = parseCommaSeparatedList(input);
  return names ? names.map((name) => parseSourceType(name)) : void 0;
}
3049
/**
 * Parse a date option with the `Date` constructor and return it as an ISO string.
 *
 * @param {string} input
 * @param {string} optionName Flag name used in the error message.
 * @returns {string} `Date#toISOString()` of the parsed value.
 * @throws {CliError} `INVALID_ARGUMENT` when the value is not a valid date.
 */
function parseDateValue(input, optionName) {
  const moment = new Date(input);
  const isValid = !Number.isNaN(moment.getTime());
  if (isValid) {
    return moment.toISOString();
  }
  throw new CliError(`invalid date for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
}
3056
/**
 * Collect date-range filters from search options.
 *
 * `since`/`until` become a publicationDate range, `changedSince` a lastChangedAt
 * lower bound, and each field in `SEARCH_DATE_FIELDS` contributes a range when its
 * `<field>From` or `<field>To` option is set.
 *
 * @param {object} options Parsed CLI options.
 * @returns {Array<{field: string, from?: string, to?: string}>}
 * @throws {CliError} Propagated from `parseDateValue` for an invalid date.
 */
function searchDateRanges(options) {
  const optionalDate = (raw, flag) => (raw ? parseDateValue(raw, flag) : void 0);
  const ranges = [];
  const { since, until, changedSince } = options;
  if (since || until) {
    ranges.push({
      field: "publicationDate",
      from: optionalDate(since, "--since"),
      to: optionalDate(until, "--until")
    });
  }
  if (changedSince) {
    ranges.push({
      field: "lastChangedAt",
      from: parseDateValue(changedSince, "--changed-since")
    });
  }
  for (const field of SEARCH_DATE_FIELDS) {
    const lower = options[`${field}From`];
    const upper = options[`${field}To`];
    if (lower || upper) {
      ranges.push({
        field,
        from: optionalDate(lower, `--${field}-from`),
        to: optionalDate(upper, `--${field}-to`)
      });
    }
  }
  return ranges;
}
3087
/**
 * Resolve the workspace directory to an absolute path, using `DEFAULT_WORKSPACE`
 * when `options.workspace` is null or undefined.
 *
 * @param {{workspace?: string}} options
 * @returns {Promise<string>}
 */
async function resolveWorkspace(options) {
  const target = options.workspace ?? DEFAULT_WORKSPACE;
  return path21.resolve(target);
}
3090
/**
 * Read the workspace directory straight from raw argv, without the option parser.
 *
 * Recognises both `--workspace <path>` and `--workspace=<path>`; the first
 * occurrence wins. Falls back to `DEFAULT_WORKSPACE` when the flag is absent or
 * carries no value.
 *
 * @param {string[]} argv Raw command-line arguments.
 * @returns {string} Absolute workspace path.
 */
function workspaceFromArgv(argv) {
  const flag = "--workspace";
  const inlinePrefix = `${flag}=`;
  const position = argv.findIndex(
    (arg) => arg === flag || (typeof arg === "string" && arg.startsWith(inlinePrefix))
  );
  if (position < 0) {
    return path21.resolve(DEFAULT_WORKSPACE);
  }
  const arg = argv[position];
  // Separate-token form takes the next argument; inline form takes the text after "=".
  const value = arg === flag ? argv[position + 1] : arg.slice(inlinePrefix.length);
  return path21.resolve(value || DEFAULT_WORKSPACE);
}
3097
+ async function runCli(argv) {
3098
+ const capture = { stdout: [], stderr: [] };
3099
+ const program = new Command();
3100
+ program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--verbose", "Print more operational detail when a command supports it.").option("--quiet", "Suppress non-essential human-readable output.");
3101
+ program.addHelpText("after", `
3102
+ Workflow:
3103
+ 1. Initialize a workspace with qli init
3104
+ 2. Register one or more sources with qli source add
3105
+ 3. Build or refresh the workspace with qli rebuild
3106
+ 4. Query it with qli search, qli related, or qli context
3107
+
3108
+ Examples:
3109
+ qli init
3110
+ qli source add directory ./docs --name "Product Docs" --tag docs
3111
+ qli rebuild
3112
+ qli search "api authentication" --top-k 8
3113
+ qli context "How do API keys work?" --top-k 8 --max-chars 8000
3114
+
3115
+ Use qli <command> --help for command-specific options and examples.`);
3116
+ program.command("init").description("Create a new workspace with the default directory layout and config.").option("--force").addHelpText("after", `
3117
+ Examples:
3118
+ qli init
3119
+ qli init --workspace ./kb
3120
+ qli init --workspace /tmp/querylight --force`).action(async function command(options) {
3121
+ const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
3122
+ const result2 = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
3123
+ emit(this.optsWithGlobals().json, capture, response("init", workspace, result2), `Initialized workspace at ${workspace}`);
3124
+ });
3125
+ const source = program.command("source");
3126
+ source.description("Register, inspect, and manage workspace sources.");
3127
+ source.command("add").description("Add a source definition. The source is enabled immediately.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
3128
+ Examples:
3129
+ qli source add directory ./docs --name "Local Docs" --tag docs
3130
+ qli source add file ./docs/auth.md --name "Auth Guide"
3131
+ qli source add url https://example.com/docs/auth --name "Auth Page"
3132
+ qli source add website https://example.com --name "Docs Site" --max-depth 2 --max-pages 50 --include /docs/
3133
+ qli source add rss https://example.com/feed.xml --name "Release Feed"
3134
+ qli source add rss https://example.com/feed.xml --name "Release Feed" --retention-days 30
3135
+
3136
+ Notes:
3137
+ RSS sources store retention per feed.
3138
+ When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(type, uri, options) {
3139
+ if (!SOURCE_TYPES.has(type)) {
3140
+ throw new CliError(`unsupported source type: ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3141
+ }
3142
+ const global = this.optsWithGlobals();
3143
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3144
+ const config = await loadConfig(workspace, global.config);
3145
+ const now = (/* @__PURE__ */ new Date()).toISOString();
3146
+ const crawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
3147
+ const stored = await addSource(workspace, {
3148
+ type,
3149
+ uri: ["file", "directory"].includes(type) ? path21.resolve(uri) : uri,
3150
+ name: options.name,
3151
+ enabled: true,
3152
+ tags: options.tag ?? [],
3153
+ metadata: normalizeMetadata(options.metadata),
3154
+ crawl,
3155
+ createdAt: now,
3156
+ updatedAt: now
3157
+ });
3158
+ emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
3159
+ });
3160
+ source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
3161
+ Examples:
3162
+ qli source config src_123 --retention-days 30
3163
+ qli source config src_123 --name "Docs Feed" --tag rss docs
3164
+ qli source config src_123 --include /docs/ --exclude /docs/archive/
3165
+ qli source config src_123 --metadata team=docs owner=platform --json
3166
+
3167
+ Notes:
3168
+ qli only exposes settings that the current source type uses at runtime.
3169
+ URI, source type, and source id do not change here.`).action(async function command(sourceId, options) {
3170
+ const global = this.optsWithGlobals();
3171
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3172
+ const sources = await listSources(workspace);
3173
+ const current = sources.find((source2) => source2.id === sourceId);
3174
+ if (!current) {
3175
+ throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
3176
+ }
3177
+ const patch = buildSourceConfigPatch(current, options);
3178
+ if (Object.keys(patch).length === 0) {
3179
+ throw new CliError("no changes requested", "INVALID_ARGUMENT", 2 /* InvalidArguments */);
3180
+ }
3181
+ patch.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
3182
+ const updated = await updateSource(workspace, sourceId, patch);
3183
+ emit(global.json, capture, response("source config", workspace, updated), `Updated source ${sourceId}`);
3184
+ });
3185
+ source.command("list").description("List all configured sources in the workspace.").addHelpText("after", `
3186
+ Examples:
3187
+ qli source list
3188
+ qli source list --json`).action(async function command() {
3189
+ const global = this.optsWithGlobals();
3190
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3191
+ const sources = await listSources(workspace);
3192
+ emit(global.json, capture, response("source list", workspace, sources), formatSourcesTable(sources));
3193
+ });
3194
+ source.command("remove").description("Delete a source definition from the workspace.").argument("<sourceId>", "Source id from qli source list.").addHelpText("after", `
3195
+ Examples:
3196
+ qli source remove src_123
3197
+ qli source list --json`).action(async function command(sourceId) {
3198
+ const global = this.optsWithGlobals();
3199
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3200
+ await removeSource(workspace, sourceId);
3201
+ emit(global.json, capture, response("source remove", workspace, { sourceId }), `Removed source ${sourceId}`);
3202
+ });
3203
+ source.command("disable").description("Disable a source without removing its configuration.").argument("<sourceId>", "Source id from qli source list.").addHelpText("after", `
3204
+ Examples:
3205
+ qli source disable src_123
3206
+ qli source enable src_123`).action(async function command(sourceId) {
3207
+ const global = this.optsWithGlobals();
3208
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3209
+ const updated = await updateSource(workspace, sourceId, { enabled: false, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
3210
+ emit(global.json, capture, response("source disable", workspace, updated), `Disabled source ${sourceId}`);
3211
+ });
3212
+ source.command("enable").description("Re-enable a disabled source.").argument("<sourceId>", "Source id from qli source list.").addHelpText("after", `
3213
+ Examples:
3214
+ qli source enable src_123
3215
+ qli source list`).action(async function command(sourceId) {
3216
+ const global = this.optsWithGlobals();
3217
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3218
+ const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
3219
+ emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
3220
+ });
3221
+ program.command("ingest").description("Fetch and normalize source content into workspace documents.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").addHelpText("after", `
3222
+ Examples:
3223
+ qli ingest
3224
+ qli ingest --source src_123
3225
+ qli ingest --changed-only`).action(async function command(options) {
3226
+ const global = this.optsWithGlobals();
3227
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3228
+ const result2 = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
3229
+ emit(global.json, capture, response("ingest", workspace, result2), `Ingested ${result2.processedSources} sources`);
3230
+ });
3231
+ program.command("chunk").description("Split normalized documents into retrieval chunks.").option("--source <sourceId>", "Only chunk documents from one source.").option("--document <documentId>", "Only chunk one document.").addHelpText("after", `
3232
+ Examples:
3233
+ qli chunk
3234
+ qli chunk --source src_123
3235
+ qli chunk --document doc_123`).action(async function command(options) {
3236
+ const global = this.optsWithGlobals();
3237
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3238
+ const result2 = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
3239
+ emit(global.json, capture, response("chunk", workspace, result2), `Wrote ${result2.chunksWritten} chunks`);
3240
+ });
3241
+ program.command("reprocess").description("Re-run normalization for existing documents without fetching sources again.").option("--source <sourceId>", "Only reprocess documents from one source.").option("--document <documentId>", "Only reprocess one document.").addHelpText("after", `
3242
+ Examples:
3243
+ qli reprocess
3244
+ qli reprocess --source src_123
3245
+ qli reprocess --document doc_123`).action(async function command(options) {
3246
+ const global = this.optsWithGlobals();
3247
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3248
+ const result2 = await reprocessDocuments({ workspacePath: workspace, sourceId: options.source, documentId: options.document });
3249
+ emit(global.json, capture, response("reprocess", workspace, result2), `Reprocessed ${result2.documentsReprocessed} documents`);
3250
+ });
3251
+ const index = program.command("index");
3252
+ index.description("Build and inspect retrieval indexes.");
3253
+ index.command("build").description("Build lexical search artifacts and optional dense or sparse vector indexes.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
3254
+ Examples:
3255
+ qli index build
3256
+ qli index build --dense
3257
+ qli index build --dense --sparse`).action(async function command(options) {
3258
+ const global = this.optsWithGlobals();
3259
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3260
+ const result2 = await buildIndex({
3261
+ workspacePath: workspace,
3262
+ denseOverride: options.dense ? true : void 0,
3263
+ sparseOverride: options.sparse ? true : void 0
3264
+ });
3265
+ emit(global.json, capture, response("index build", workspace, result2), `Built index at ${result2.indexPath}`);
3266
+ });
3267
+ program.command("rebuild").description("Run ingest, chunk, and index build in one command.").option("--source <sourceId>", "Only rebuild data for one source.").option("--changed-only", "Only ingest changed content before chunking and indexing.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
3268
+ Examples:
3269
+ qli rebuild
3270
+ qli rebuild --changed-only
3271
+ qli rebuild --source src_123
3272
+ qli rebuild --dense --sparse`).action(async function command(options) {
3273
+ const global = this.optsWithGlobals();
3274
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3275
+ const ingest = await ingestSources({ workspacePath: workspace, sourceIds: options.source ? [options.source] : void 0, changedOnly: Boolean(options.changedOnly) });
3276
+ const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source });
3277
+ const indexBuild = await buildIndex({
3278
+ workspacePath: workspace,
3279
+ denseOverride: options.dense ? true : void 0,
3280
+ sparseOverride: options.sparse ? true : void 0,
3281
+ buildAvailableModels: true
3282
+ });
3283
+ const data = { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
3284
+ emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
3285
+ });
3286
+ program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
3287
+ Examples:
3288
+ qli search "pricing api limits"
3289
+ qli search "authentication" --top-k 20 --tag docs
3290
+ qli search --source-type rss --since 2026-05-01 --has-publication-date
3291
+ qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
3292
+ qli search "billing" --metadata team=support
3293
+ qli search "embedding model" --retrieval hybrid --show-chunks
3294
+ qli search --source-type rss,url --top-k 25 --json
3295
+
3296
+ Notes:
3297
+ lexical works without vector models.
3298
+ dense, sparse, and hybrid require the relevant index artifacts to exist.
3299
+ When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
3300
+ const global = this.optsWithGlobals();
3301
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3302
+ const result2 = await searchIndex({
3303
+ workspacePath: workspace,
3304
+ query: query ?? "",
3305
+ topK: Number(options.topK),
3306
+ sourceIds: parseCommaSeparatedList(options.source),
3307
+ sourceNames: parseCommaSeparatedList(options.sourceName),
3308
+ sourceTypes: parseSourceTypes(options.sourceType),
3309
+ uriPrefixes: parseCommaSeparatedList(options.uriPrefix),
3310
+ hasPublicationDate: Boolean(options.hasPublicationDate),
3311
+ tags: parseCommaSeparatedList(options.tag),
3312
+ metadata: (options.metadata ?? []).map(parseKeyValue).map(([key, value]) => ({ key, value })),
3313
+ dateRanges: searchDateRanges(options),
3314
+ retrievalMode: parseRetrievalMode(options.retrieval),
3315
+ showChunks: Boolean(options.showChunks)
3316
+ });
3317
+ emit(global.json, capture, response("search", workspace, result2), formatSearchResults(result2.results));
3318
+ });
3319
+ program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
3320
+ Examples:
3321
+ qli related doc_123
3322
+ qli related https://example.com/docs/auth
3323
+
3324
+ Dense vectors usually produce better related-document results. Pull models and rebuild first when needed:
3325
+ qli models pull --dense
3326
+ qli rebuild --dense`).action(async function command(document, options) {
3327
+ const global = this.optsWithGlobals();
3328
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3329
+ const result2 = await findRelatedDocuments({
3330
+ workspacePath: workspace,
3331
+ document,
3332
+ topK: Number(options.topK)
3333
+ });
3334
+ emit(global.json, capture, response("related", workspace, result2), formatRelatedDocuments(result2.results));
3335
+ });
3336
+ program.command("context").description("Assemble retrieval context for an external LLM, agent, or prompt pipeline.").argument("<query>").option("--top-k <n>", "Maximum number of source passages to consider.", "12").option("--max-chars <n>", "Maximum output length for the rendered context block.", "12000").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).addHelpText("after", `
3337
+ Examples:
3338
+ qli context "How do I configure the API?"
3339
+ qli context "What changed in pricing?" --top-k 10 --max-chars 9000
3340
+ qli context "How does auth work?" --retrieval hybrid
3341
+
3342
+ Use --json when another tool needs structured access to the raw passages and metadata.`).action(async function command(query, options) {
3343
+ const global = this.optsWithGlobals();
3344
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3345
+ const result2 = await createContext({
3346
+ workspacePath: workspace,
3347
+ query,
3348
+ topK: Number(options.topK),
3349
+ maxChars: Number(options.maxChars),
3350
+ retrievalMode: parseRetrievalMode(options.retrieval)
3351
+ });
3352
+ emit(global.json, capture, response("context", workspace, result2), result2.markdown);
3353
+ });
3354
+ const models = program.command("models");
3355
+ models.description("Inspect and download retrieval model assets.");
3356
+ models.command("pull").description("Download dense and or sparse retrieval assets required by vector search.").option("--dense", "Only pull dense retrieval assets.").option("--sparse", "Only pull sparse retrieval assets.").addHelpText("after", `
3357
+ Examples:
3358
+ qli models pull
3359
+ qli models pull --dense
3360
+ qli models pull --sparse
3361
+
3362
+ If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
3363
+ const global = this.optsWithGlobals();
3364
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3365
+ const config = await loadConfig(workspace, global.config);
3366
+ const status = await getModelStatus(workspace, config);
3367
+ const { pullDense, pullSparse } = resolveModelPullPlan({
3368
+ pullDenseFlag: Boolean(options.dense),
3369
+ pullSparseFlag: Boolean(options.sparse),
3370
+ uvAvailable: status.sparse.uvAvailable
3371
+ });
3372
+ await pullModels({ workspacePath: workspace, config, pullDense, pullSparse });
3373
+ const data = {
3374
+ dense: pullDense ? { pulled: true, modelId: config.retrieval.dense.modelId, cacheDir: config.retrieval.dense.cacheDir } : void 0,
3375
+ sparse: pullSparse ? { pulled: true, modelId: config.retrieval.sparse.modelId, cacheDir: config.retrieval.sparse.cacheDir } : void 0
3376
+ };
3377
+ emit(global.json, capture, response("models pull", workspace, data), "Pulled available models");
3378
+ });
3379
+ models.command("status").description("Show whether model runtimes and artifacts are available in the workspace.").addHelpText("after", `
3380
+ Examples:
3381
+ qli models status
3382
+ qli models status --json`).action(async function command() {
3383
+ const global = this.optsWithGlobals();
3384
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3385
+ const config = await loadConfig(workspace, global.config);
3386
+ const data = await getModelStatus(workspace, config);
3387
+ emit(global.json, capture, response("models status", workspace, data), JSON.stringify(data, null, 2));
3388
+ });
3389
+ program.command("diff").description("Inspect document-level changes between stored workspace versions.").option("--source <sourceId>", "Only inspect changes for one source.").option("--document <documentId>", "Only inspect one document.").option("--since <timestamp>", "Only include changes since an ISO timestamp.").addHelpText("after", `
3390
+ Examples:
3391
+ qli diff
3392
+ qli diff --source src_123
3393
+ qli diff --document doc_123
3394
+ qli diff --since 2026-05-01`).action(async function command(options) {
3395
+ const global = this.optsWithGlobals();
3396
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3397
+ const result2 = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
3398
+ emit(global.json, capture, response("diff", workspace, result2), JSON.stringify(result2, null, 2));
3399
+ });
3400
+ const report = program.command("report");
3401
+ report.description("Render higher-level reports from workspace data.");
3402
+ report.command("changes").description("Render a markdown change report from workspace diffs.").option("--source <sourceId>", "Only include one source.").option("--since <timestamp>", "Only include changes since an ISO timestamp.").addHelpText("after", `
3403
+ Examples:
3404
+ qli report changes
3405
+ qli report changes --since 2026-05-01
3406
+ qli report changes --source src_123 --json`).action(async function command(options) {
3407
+ const global = this.optsWithGlobals();
3408
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3409
+ const diff = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, since: options.since });
3410
+ const markdown = renderChangeReport(diff);
3411
+ emit(global.json, capture, response("report changes", workspace, { markdown, diff }), markdown);
3412
+ });
3413
+ program.command("status").description("Summarize workspace size, index state, and model artifact availability.").addHelpText("after", `
3414
+ Examples:
3415
+ qli status
3416
+ qli status --json`).action(async function command() {
3417
+ const global = this.optsWithGlobals();
3418
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
3419
+ const sources = await listSources(workspace);
3420
+ const documents = await readJsonl(`${workspace}/documents/documents.jsonl`);
3421
+ const chunks = await readJsonl(`${workspace}/chunks/chunks.jsonl`);
3422
+ const runs = await listRuns(workspace);
3423
+ const config = await loadConfig(workspace, global.config);
3424
+ const modelStatus = await getModelStatus(workspace, config);
3425
+ let latestIndex;
3426
+ let indexSize = 0;
3427
+ try {
3428
+ const meta = await readLatestIndexMetadata(workspace);
3429
+ latestIndex = meta.createdAt;
3430
+ indexSize = (await stat4(`${workspace}/indexes/latest.json`)).size;
3431
+ } catch {
3432
+ latestIndex = void 0;
3433
+ }
3434
+ const data = {
3435
+ workspace,
3436
+ sources: sources.length,
3437
+ documents: documents.length,
3438
+ chunks: chunks.length,
3439
+ latestIndex,
3440
+ indexSizeBytes: indexSize,
3441
+ lastRun: runs.at(-1)?.success ? "success" : runs.at(-1) ? "failed" : "none",
3442
+ denseVectorIndex: modelStatus.dense.artifactExists,
3443
+ sparseVectorIndex: modelStatus.sparse.artifactExists
3444
+ };
3445
+ emit(global.json, capture, response("status", workspace, data), [
3446
+ `Workspace: ${workspace}`,
3447
+ `Sources: ${data.sources}`,
3448
+ `Documents: ${data.documents}`,
3449
+ `Chunks: ${data.chunks}`,
3450
+ `Latest index: ${data.latestIndex ?? "none"}`,
3451
+ `Index size: ${Math.round(indexSize / 1024)} KB`,
3452
+ `Last run: ${data.lastRun}`,
3453
+ `Dense vector index: ${data.denseVectorIndex}`,
3454
+ `Sparse vector index: ${data.sparseVectorIndex}`
3455
+ ].join("\n"));
3456
+ });
3457
+ program.command("doctor").description("Run basic workspace and runtime checks.").addHelpText("after", `
3458
+ Examples:
3459
+ qli doctor
3460
+ qli doctor --json`).action(async function command() {
3461
+ const global = this.optsWithGlobals();
3462
+ const workspace = await assertWorkspaceExists(await resolveWorkspace({ workspace: global.workspace }));
3463
+ const checks = [];
3464
+ await loadConfig(workspace, global.config);
3465
+ checks.push("workspace exists");
3466
+ checks.push("config parses");
3467
+ await listSources(workspace);
3468
+ checks.push("sources parse");
3469
+ await readJsonl(`${workspace}/documents/documents.jsonl`);
3470
+ checks.push("documents parse");
3471
+ await readJsonl(`${workspace}/chunks/chunks.jsonl`);
3472
+ checks.push("chunks parse");
3473
+ const config = await loadConfig(workspace, global.config);
3474
+ if (config.retrieval.dense.enabled) {
3475
+ await import("@huggingface/transformers");
3476
+ checks.push("dense runtime importable");
3477
+ }
3478
+ if (config.retrieval.sparse.enabled) {
3479
+ await ensureUvAvailable();
3480
+ checks.push("uv available for sparse runtime");
3481
+ }
3482
+ try {
3483
+ await readLatestIndexMetadata(workspace);
3484
+ checks.push("latest index exists");
3485
+ } catch {
3486
+ checks.push("latest index missing");
3487
+ }
3488
+ emit(global.json, capture, response("doctor", workspace, { checks }), checks.join("\n"));
3489
+ });
3490
+ let exitCode = 0;
3491
+ try {
3492
+ await program.parseAsync(["node", "qli", ...argv], { from: "node" });
3493
+ } catch (error) {
3494
+ const workspace = workspaceFromArgv(argv);
3495
+ const cliError = error instanceof CliError ? error : new CliError(error.message, "GENERAL_ERROR", 1 /* GeneralError */);
3496
+ writeOutput(capture, JSON.stringify(response("error", workspace, void 0, {
3497
+ code: cliError.code,
3498
+ message: cliError.message,
3499
+ details: cliError.details
3500
+ })), true);
3501
+ exitCode = cliError.exitCode;
3502
+ }
3503
+ return {
3504
+ exitCode,
3505
+ stdout: capture.stdout.join("\n"),
3506
+ stderr: capture.stderr.join("\n")
3507
+ };
3508
+ }
3509
/**
 * Write a command result to the output capture in the requested format.
 *
 * @param {*} asJson - When truthy, `body` is serialized with `JSON.stringify` and written;
 *   otherwise the `human` text is written as-is.
 * @param {*} capture - Output sink, forwarded unchanged to `writeOutput`.
 * @param {*} body - Structured response used only in JSON mode.
 * @param {*} human - Pre-rendered text used only when JSON mode is off.
 * @returns {void}
 */
function emit(asJson, capture, body, human) {
  // Human-readable mode: no serialization happens at all.
  if (!asJson) {
    writeOutput(capture, human);
    return;
  }
  const serialized = JSON.stringify(body);
  writeOutput(capture, serialized);
}
3512
+
3513
+ // src/cli/main.ts
3514
// CLI entry point: run the command line, replay the captured output on the real
// stdio streams, then exit with the code reported by runCli.
var result = await runCli(process.argv.slice(2));
// process.exit() does not wait for pending asynchronous stream writes, and
// process.stdout / process.stderr can be asynchronous depending on what they are
// attached to (for example pipes on Windows). Waiting for each write's completion
// callback keeps large output (such as --json piped into another tool) from being
// truncated, while still forcing the exit afterwards.
if (result.stdout) {
  await new Promise((resolve) => {
    process.stdout.write(`${result.stdout}\n`, () => resolve());
  });
}
if (result.stderr) {
  await new Promise((resolve) => {
    process.stderr.write(`${result.stderr}\n`, () => resolve());
  });
}
process.exit(result.exitCode);