@tryformation/querylight-cli 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/Dockerfile +7 -0
  2. package/LICENSE +21 -0
  3. package/README.md +391 -0
  4. package/dist/chunk/chunk-store.d.ts +4 -0
  5. package/dist/chunk/chunker.d.ts +9 -0
  6. package/dist/cli/format.d.ts +4 -0
  7. package/dist/cli/main.d.ts +2 -0
  8. package/dist/cli/main.js +3523 -0
  9. package/dist/cli/run-cli.d.ts +5 -0
  10. package/dist/core/config.d.ts +4 -0
  11. package/dist/core/constants.d.ts +3 -0
  12. package/dist/core/errors.d.ts +17 -0
  13. package/dist/core/files.d.ts +1 -0
  14. package/dist/core/hashing.d.ts +1 -0
  15. package/dist/core/ids.d.ts +1 -0
  16. package/dist/core/jsonl.d.ts +2 -0
  17. package/dist/core/runs.d.ts +3 -0
  18. package/dist/core/workspace.d.ts +7 -0
  19. package/dist/index/index-store.d.ts +11 -0
  20. package/dist/index/querylight-indexer.d.ts +14 -0
  21. package/dist/index.d.ts +11 -0
  22. package/dist/index.js +2794 -0
  23. package/dist/ingest/adapters/crawl4ai-adapter.d.ts +1 -0
  24. package/dist/ingest/adapters/directory-adapter.d.ts +2 -0
  25. package/dist/ingest/adapters/file-adapter.d.ts +16 -0
  26. package/dist/ingest/adapters/rss-adapter.d.ts +7 -0
  27. package/dist/ingest/adapters/url-adapter.d.ts +11 -0
  28. package/dist/ingest/adapters/website-adapter.d.ts +2 -0
  29. package/dist/ingest/document-utils.d.ts +24 -0
  30. package/dist/ingest/extractors/docx-extractor.d.ts +1 -0
  31. package/dist/ingest/extractors/html-extractor.d.ts +5 -0
  32. package/dist/ingest/extractors/markdown-extractor.d.ts +1 -0
  33. package/dist/ingest/extractors/pdf-extractor.d.ts +1 -0
  34. package/dist/ingest/extractors/text-extractor.d.ts +1 -0
  35. package/dist/ingest/ingest-service.d.ts +23 -0
  36. package/dist/normalize/boilerplate.d.ts +1 -0
  37. package/dist/normalize/normalize-markdown.d.ts +2 -0
  38. package/dist/query/context-builder.d.ts +8 -0
  39. package/dist/query/related-service.d.ts +6 -0
  40. package/dist/query/search-service.d.ts +31 -0
  41. package/dist/report/diff-service.d.ts +23 -0
  42. package/dist/sources/source-model.d.ts +1 -0
  43. package/dist/sources/source-store.d.ts +7 -0
  44. package/dist/types/models.d.ts +309 -0
  45. package/dist/vector/dense.d.ts +13 -0
  46. package/dist/vector/runtime.d.ts +18 -0
  47. package/dist/vector/service.d.ts +26 -0
  48. package/dist/vector/sparse.d.ts +19 -0
  49. package/dist/vector/store.d.ts +20 -0
  50. package/dist/vector/text.d.ts +3 -0
  51. package/package.json +66 -0
  52. package/scripts/sparse-encode.py +104 -0
package/dist/index.js ADDED
@@ -0,0 +1,2794 @@
1
+ // src/core/workspace.ts
2
+ import { mkdir, stat } from "fs/promises";
3
+ import path2 from "path";
4
+
5
+ // src/core/errors.ts
6
/**
 * Error raised for failures the CLI reports to the user.
 *
 * Besides the message it carries a machine-readable `code`, the process
 * `exitCode` to use, and optional free-form `details`.
 */
var CliError = class extends Error {
  code;
  exitCode;
  details;

  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {string} code - Machine-readable identifier, e.g. "WORKSPACE_ERROR".
   * @param {number} exitCode - Exit status associated with this error.
   * @param {unknown} [details] - Optional extra context.
   */
  constructor(message, code, exitCode, details) {
    super(message);
    Object.assign(this, { code, exitCode, details, name: "CliError" });
  }
};
18
+
19
+ // src/core/config.ts
20
+ import { readFile, writeFile } from "fs/promises";
21
+ import path from "path";
22
+ import YAML from "yaml";
23
/**
 * Builds a fresh copy of the built-in workspace configuration.
 *
 * A new object graph is created on every call, so callers may mutate the
 * result without affecting later calls.
 *
 * @returns {object} The default configuration.
 */
var defaultConfig = () => {
  const huggingfaceCacheDir = ".kb/models/huggingface";
  const chunkTextMode = "title-heading-text";
  const keywordField = () => ({ type: "keyword" });

  const index = {
    name: "default",
    fields: {
      text: { type: "text", weight: 1 },
      title: { type: "text", weight: 2 },
      uri: keywordField(),
      sourceId: keywordField(),
      tags: keywordField(),
      contentType: keywordField()
    },
    chunking: {
      maxChars: 1800,
      overlapChars: 200,
      minChars: 120,
      splitOnHeadings: true
    }
  };

  const rag = {
    defaultTopK: 12,
    maxContextChars: 12000,
    citationStyle: "markdown"
  };

  const retrieval = {
    defaultMode: "lexical",
    dense: {
      enabled: false,
      modelId: "Xenova/all-MiniLM-L6-v2",
      cacheDir: huggingfaceCacheDir,
      indexHashTables: 8,
      indexRandomSeed: 42,
      chunkTextMode
    },
    sparse: {
      enabled: false,
      modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
      cacheDir: huggingfaceCacheDir,
      documentTopTokens: 128,
      queryEncoding: "tokenizer-token-weights",
      documentEncoding: "masked-lm-max-log1p-relu",
      chunkTextMode
    }
  };

  const crawler = {
    defaultUserAgent: "querylight-cli/0.1",
    obeyRobotsTxt: true,
    rateLimitMs: 1000,
    renderJs: false,
    retentionDays: 365,
    fetchArticles: true
  };

  const limits = {
    maxFileSizeMb: 50,
    maxPagesPerSource: 100,
    maxTotalChunks: 100000
  };

  return { workspaceVersion: 1, index, rag, retrieval, crawler, limits };
};
81
/**
 * Writes the default configuration to `<workspacePath>/config.yaml`.
 *
 * Unless `force` is set, an existing readable config file is left untouched.
 * Any failure to read it is treated as "not present" and the file is written.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {boolean} [force=false] - Overwrite without checking for an existing file.
 * @returns {Promise<void>}
 */
async function writeDefaultConfig(workspacePath, force = false) {
  const target = path.join(workspacePath, "config.yaml");
  if (!force) {
    const alreadyReadable = await readFile(target, "utf8").then(
      () => true,
      () => false
    );
    if (alreadyReadable) {
      return;
    }
  }
  const serialized = YAML.stringify(defaultConfig());
  await writeFile(target, serialized, "utf8");
}
92
/**
 * Loads the workspace configuration and layers it over the built-in defaults.
 *
 * Top-level keys from the file override the defaults; the `index` (including
 * `fields` and `chunking`), `rag`, `retrieval` (including `dense` and
 * `sparse`), `crawler` and `limits` sections are merged key by key so a
 * partial section keeps the remaining default values.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {string} [configPath] - Explicit config file; defaults to
 *   `<workspacePath>/config.yaml`.
 * @returns {Promise<object>} The merged configuration.
 * @throws If the file cannot be read or is not valid YAML.
 */
async function loadConfig(workspacePath, configPath) {
  const resolved = configPath ?? path.join(workspacePath, "config.yaml");
  const raw = await readFile(resolved, "utf8");
  // An empty or comment-only document parses to null; fall back to an empty
  // object so the section lookups below yield the defaults instead of throwing.
  const parsed = YAML.parse(raw) ?? {};
  const defaults = defaultConfig();
  return {
    ...defaults,
    ...parsed,
    index: {
      ...defaults.index,
      ...parsed.index,
      fields: {
        ...defaults.index.fields,
        ...parsed.index?.fields ?? {}
      },
      chunking: {
        ...defaults.index.chunking,
        ...parsed.index?.chunking ?? {}
      }
    },
    rag: {
      ...defaults.rag,
      ...parsed.rag ?? {}
    },
    retrieval: {
      ...defaults.retrieval,
      ...parsed.retrieval ?? {},
      dense: {
        ...defaults.retrieval.dense,
        ...parsed.retrieval?.dense ?? {}
      },
      sparse: {
        ...defaults.retrieval.sparse,
        ...parsed.retrieval?.sparse ?? {}
      }
    },
    crawler: {
      ...defaults.crawler,
      ...parsed.crawler ?? {}
    },
    limits: {
      ...defaults.limits,
      ...parsed.limits ?? {}
    }
  };
}
138
+
139
+ // src/core/workspace.ts
140
/** Sub-directories created inside every workspace, relative to its root. */
var DIRS = [
  "sources",
  "documents",
  "chunks",
  "raw",
  "normalized",
  "indexes",
  "vectors",
  "models",
  "models/huggingface",
  "runs",
  "logs"
];

/**
 * Creates the workspace directory, its standard sub-directories and, where
 * needed, the default config file.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root; resolved to an absolute path.
 * @param {boolean} [options.force=false] - Passed through to `writeDefaultConfig`.
 * @returns {Promise<{workspacePath: string}>} The resolved workspace path.
 */
async function ensureWorkspace({
  workspacePath,
  force = false
}) {
  const root = path2.resolve(workspacePath);
  const targets = [root, ...DIRS.map((name) => path2.join(root, name))];
  // Directories are created one after another, root first.
  for (const target of targets) {
    await mkdir(target, { recursive: true });
  }
  await writeDefaultConfig(root, force);
  return { workspacePath: root };
}
165
/**
 * Verifies that `workspacePath` is a directory containing `config.yaml`.
 *
 * @param {string} workspacePath - Candidate workspace root.
 * @returns {Promise<string>} The resolved absolute workspace path.
 * @throws {CliError} `WORKSPACE_ERROR` (exit code 3) when the path is not a
 *   directory, or when the directory or its config file cannot be stat'ed.
 */
async function assertWorkspaceExists(workspacePath) {
  const root = path2.resolve(workspacePath);
  const invalidWorkspace = () => new CliError(`workspace does not exist or is invalid: ${root}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);

  let rootInfo;
  try {
    rootInfo = await stat(root);
  } catch {
    throw invalidWorkspace();
  }
  if (!rootInfo.isDirectory()) {
    throw new CliError(`workspace is not a directory: ${root}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  }

  try {
    await stat(path2.join(root, "config.yaml"));
  } catch {
    throw invalidWorkspace();
  }
  return root;
}
181
+
182
+ // src/sources/source-store.ts
183
+ import path4 from "path";
184
+
185
+ // src/core/hashing.ts
186
+ import { createHash } from "crypto";
187
/**
 * Computes the SHA-256 digest of `input`.
 *
 * @param {string | Buffer} input - Data to hash.
 * @returns {string} Lower-case hexadecimal digest.
 */
function sha256(input) {
  const hasher = createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
190
+
191
+ // src/core/ids.ts
192
/**
 * Derives a deterministic identifier from a prefix and a list of parts.
 *
 * The parts are joined with "::", hashed with `sha256`, and the first 16 hex
 * characters of the digest are appended to the prefix.
 *
 * @param {string} prefix - Leading tag, e.g. "src" or "doc".
 * @param {...string} parts - Values that together identify the entity.
 * @returns {string} `<prefix>_<16 hex chars>`.
 */
function stableId(prefix, ...parts) {
  const fingerprint = sha256(parts.join("::"));
  const shortHash = fingerprint.slice(0, 16);
  return `${prefix}_${shortHash}`;
}
195
+
196
+ // src/core/jsonl.ts
197
+ import { mkdir as mkdir2, readFile as readFile2, writeFile as writeFile2 } from "fs/promises";
198
+ import path3 from "path";
199
/**
 * Reads a JSON Lines file into an array of parsed records.
 *
 * Lines are trimmed and blank lines skipped. A missing file yields an empty
 * array; any other read error or a malformed line is thrown.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<unknown[]>} Parsed records in file order.
 */
async function readJsonl(filePath) {
  let contents;
  try {
    contents = await readFile2(filePath, "utf8");
  } catch (error) {
    if (error.code === "ENOENT") {
      return [];
    }
    throw error;
  }
  const records = [];
  for (const rawLine of contents.split("\n")) {
    const line = rawLine.trim();
    if (line.length > 0) {
      records.push(JSON.parse(line));
    }
  }
  return records;
}
210
/**
 * Writes `records` as a JSON Lines file, creating parent directories first.
 *
 * Each record is serialized on its own line and the file ends with a newline;
 * when the serialized output is empty the file is written empty.
 *
 * @param {string} filePath - Destination file.
 * @param {unknown[]} records - Records to serialize.
 * @returns {Promise<void>}
 */
async function writeJsonl(filePath, records) {
  const parentDir = path3.dirname(filePath);
  await mkdir2(parentDir, { recursive: true });
  const joined = records.map((record) => JSON.stringify(record)).join("\n");
  const contents = joined.length === 0 ? "" : `${joined}\n`;
  await writeFile2(filePath, contents, "utf8");
}
216
+
217
+ // src/sources/source-store.ts
218
/**
 * Location of the source registry inside a workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {string} Path of `sources/sources.jsonl` under the workspace.
 */
var sourcesFile = (workspacePath) => {
  const segments = [workspacePath, "sources", "sources.jsonl"];
  return path4.join(...segments);
};
219
/**
 * Reads every registered source of a workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {Promise<object[]>} Records read from the sources file.
 */
async function listSources(workspacePath) {
  const registryPath = sourcesFile(workspacePath);
  return readJsonl(registryPath);
}
222
/**
 * Registers a new source in the workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {object} source - Source definition; `id` is derived from its type
 *   and URI when absent.
 * @returns {Promise<object>} The stored source, including its id.
 * @throws {CliError} `DUPLICATE_SOURCE` (exit code 4) when a source with the
 *   same URI is already registered.
 */
async function addSource(workspacePath, source) {
  const registered = await listSources(workspacePath);
  for (const entry of registered) {
    if (entry.uri === source.uri) {
      throw new CliError(`duplicate source URI: ${source.uri}`, "DUPLICATE_SOURCE", 4 /* SourceError */);
    }
  }
  const record = {
    ...source,
    id: source.id ?? stableId("src", source.type, source.uri)
  };
  registered.push(record);
  await writeJsonl(sourcesFile(workspacePath), registered);
  return record;
}
233
/**
 * Applies a partial update to a registered source and persists the registry.
 *
 * Fields from `patch` replace the stored ones, except that `metadata` and
 * `crawl` are shallow-merged with their stored counterparts and the id is
 * always kept as `sourceId`.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {string} sourceId - Id of the source to update.
 * @param {object} patch - Fields to change.
 * @returns {Promise<object>} The updated source.
 * @throws {CliError} `SOURCE_NOT_FOUND` (exit code 4) when no source has that id.
 */
async function updateSource(workspacePath, sourceId, patch) {
  const registered = await listSources(workspacePath);
  const position = registered.findIndex((entry) => entry.id === sourceId);
  if (position < 0) {
    throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
  }
  const before = registered[position];
  const after = { ...before, ...patch };
  // The id can never be changed through a patch.
  after.id = sourceId;
  after.metadata = patch.metadata ? { ...before.metadata, ...patch.metadata } : before.metadata;
  after.crawl = patch.crawl ? { ...before.crawl ?? {}, ...patch.crawl } : before.crawl;
  registered[position] = after;
  await writeJsonl(sourcesFile(workspacePath), registered);
  return after;
}
254
/**
 * Removes a source from the workspace registry.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {string} sourceId - Id of the source to remove.
 * @returns {Promise<void>}
 * @throws {CliError} `SOURCE_NOT_FOUND` (exit code 4) when no source has that id.
 */
async function removeSource(workspacePath, sourceId) {
  const registered = await listSources(workspacePath);
  const isRegistered = registered.some((entry) => entry.id === sourceId);
  if (!isRegistered) {
    throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
  }
  const remaining = registered.filter((entry) => entry.id !== sourceId);
  await writeJsonl(sourcesFile(workspacePath), remaining);
}
262
+
263
+ // src/ingest/ingest-service.ts
264
+ import path10 from "path";
265
+
266
+ // src/chunk/chunk-store.ts
267
+ import path5 from "path";
268
/**
 * Location of the chunk store inside a workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {string} Path of `chunks/chunks.jsonl` under the workspace.
 */
function chunksFile(workspacePath) {
  const segments = [workspacePath, "chunks", "chunks.jsonl"];
  return path5.join(...segments);
}
271
/**
 * Reads every stored chunk of a workspace.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {Promise<object[]>} Records read from the chunk store.
 */
async function loadChunks(workspacePath) {
  const storePath = chunksFile(workspacePath);
  return readJsonl(storePath);
}
274
/**
 * Persists `chunks` to the workspace chunk store, ordered by id.
 *
 * The caller's array is left untouched: sorting happens on a copy, because
 * `Array.prototype.sort` reorders in place and would otherwise change the
 * order of an array the caller still owns.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {Array<{id: string}>} chunks - Chunks to store.
 * @returns {Promise<void>}
 */
async function saveChunks(workspacePath, chunks) {
  const ordered = [...chunks].sort((a, b) => a.id.localeCompare(b.id));
  await writeJsonl(chunksFile(workspacePath), ordered);
}
277
+
278
+ // src/core/files.ts
279
+ import { stat as stat2 } from "fs/promises";
280
/**
 * Reports whether `filePath` can be stat'ed.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when `stat` succeeds, `false` on any error.
 */
async function fileExists(filePath) {
  return stat2(filePath).then(
    () => true,
    () => false
  );
}
288
+
289
+ // src/core/runs.ts
290
+ import path6 from "path";
291
/**
 * Stores a run record as `runs/<run.id>.json` inside the workspace.
 *
 * The file is written through `writeJsonl` with the run as its only record.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @param {{id: string}} run - Run record to persist.
 * @returns {Promise<void>}
 */
async function writeRun(workspacePath, run) {
  const fileName = `${run.id}.json`;
  const target = path6.join(workspacePath, "runs", fileName);
  await writeJsonl(target, [run]);
}
294
/**
 * Loads all run records of a workspace, oldest first.
 *
 * Every `*.json` file in the `runs` directory is read and its first record
 * used; files without a record are skipped. Any error while listing or
 * reading (including a missing directory) results in an empty list.
 *
 * @param {string} workspacePath - Workspace root directory.
 * @returns {Promise<object[]>} Run records sorted by `createdAt`.
 */
async function listRuns(workspacePath) {
  const { readdir } = await import("fs/promises");
  const runsDir = path6.join(workspacePath, "runs");
  try {
    const names = await readdir(runsDir);
    const pending = [];
    for (const name of names) {
      if (!name.endsWith(".json")) {
        continue;
      }
      pending.push(readJsonl(path6.join(runsDir, name)).then((rows) => rows[0]));
    }
    const loaded = await Promise.all(pending);
    const present = loaded.filter((record) => record != null);
    present.sort((a, b) => a.createdAt.localeCompare(b.createdAt));
    return present;
  } catch {
    return [];
  }
}
308
+
309
+ // src/ingest/document-utils.ts
310
+ import { mkdir as mkdir3, rm, writeFile as writeFile3 } from "fs/promises";
311
+ import path7 from "path";
312
+
313
+ // src/normalize/normalize-markdown.ts
314
+ import matter from "gray-matter";
315
/**
 * Normalizes line endings and blank space in a block of text.
 *
 * Converts CRLF to LF, strips spaces and tabs at line ends, collapses runs
 * of three or more newlines to two, and trims the result.
 *
 * @param {string} text - Text to clean up.
 * @returns {string} The normalized text.
 */
function normalizeWhitespace(text) {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, "\n");
  const collapsed = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
318
/**
 * Renders a document as front matter followed by its whitespace-normalized body.
 *
 * @param {object} metadata - Values serialized into the front matter.
 * @param {string} body - Document body.
 * @returns {string} Output of `matter.stringify` for the cleaned body and metadata.
 */
function withFrontmatter(metadata, body) {
  const cleanedBody = normalizeWhitespace(body);
  return matter.stringify(cleanedBody, metadata);
}
321
+
322
+ // src/ingest/document-utils.ts
323
/**
 * Passes a metadata value through unchanged.
 *
 * @param {unknown} value - Candidate metadata value.
 * @returns {unknown} `value` itself; `undefined` stays `undefined`.
 */
function asMetadataValue(value) {
  if (value !== undefined) {
    return value;
  }
  return undefined;
}
326
/**
 * Assembles the metadata record stored with a document.
 *
 * Source metadata is overlaid by `extra`, then by the fixed fields (tags,
 * source type, source URI and timestamps). Entries whose value is
 * `undefined` are dropped; a missing publication date is stored as `null`.
 *
 * @param {object} options
 * @param {object} options.source - Owning source (`metadata`, `tags`, `type` are read).
 * @param {string} options.sourceUri
 * @param {string | null} [options.publicationDate]
 * @param {string} options.crawledAt
 * @param {string} options.indexedAt
 * @param {object} [options.extra={}] - Additional fields.
 * @returns {object} The merged metadata without `undefined` values.
 */
function buildDocumentMetadata({
  source,
  sourceUri,
  publicationDate,
  crawledAt,
  indexedAt,
  extra = {}
}) {
  const fixedFields = {
    tags: source.tags,
    sourceType: source.type,
    sourceUri,
    publicationDate: publicationDate ?? null,
    crawledAt,
    indexedAt
  };
  const combined = { ...source.metadata, ...extra, ...fixedFields };
  const definedEntries = Object.entries(combined).filter((entry) => asMetadataValue(entry[1]) !== undefined);
  return Object.fromEntries(definedEntries);
}
349
/**
 * Writes a normalized markdown document, with front matter, to `normalizedPath`.
 *
 * The parent directory is created first. The front matter holds the
 * document's identity, URIs, timestamps and content hash; a missing
 * publication date is written as `null`.
 *
 * @param {object} options - Document fields plus `normalizedPath` (destination
 *   file) and `markdown` (body).
 * @returns {Promise<void>}
 */
async function writeNormalizedDocument({
  documentId,
  sourceId,
  title,
  uri,
  sourceUri,
  publicationDate,
  crawledAt,
  indexedAt,
  contentHash,
  lastChangedAt,
  normalizedPath,
  markdown
}) {
  await mkdir3(path7.dirname(normalizedPath), { recursive: true });
  const frontmatter = {
    documentId,
    sourceId,
    title,
    uri,
    sourceUri,
    publicationDate: publicationDate ?? null,
    crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt
  };
  const rendered = withFrontmatter(frontmatter, markdown);
  await writeFile3(normalizedPath, rendered, "utf8");
}
384
/**
 * Deletes the files stored for a document.
 *
 * Removes the normalized file and, when the document has one, the raw file.
 * Both removals use `force`, so files that are already gone do not cause an error.
 *
 * @param {{rawPath?: string, normalizedPath: string}} document
 * @returns {Promise<void>}
 */
async function deleteDocumentArtifacts(document) {
  const removals = [];
  if (document.rawPath) {
    removals.push(rm(document.rawPath, { force: true }));
  }
  removals.push(rm(document.normalizedPath, { force: true }));
  await Promise.all(removals);
}
390
+
391
+ // src/ingest/adapters/directory-adapter.ts
392
+ import fg from "fast-glob";
393
+ import path8 from "path";
394
/**
 * Lists the files of a directory source that should be ingested.
 *
 * Uses the source's include patterns when any are configured, otherwise a
 * default set of markdown, text, HTML, PDF and DOCX globs. Exclude patterns
 * are passed as ignores. Dotfiles are not matched and symbolic links are not
 * followed.
 *
 * @param {object} source - Directory source; `uri` is the directory to scan.
 * @returns {Promise<string[]>} Sorted absolute file paths.
 */
async function listDirectoryFiles(source) {
  const defaultPatterns = ["**/*.md", "**/*.txt", "**/*.html", "**/*.htm", "**/*.pdf", "**/*.docx"];
  const configured = source.crawl?.includePatterns;
  const include = configured?.length ? configured : defaultPatterns;
  const ignore = source.crawl?.excludePatterns ?? [];
  const found = await fg(include, {
    cwd: source.uri,
    absolute: true,
    onlyFiles: true,
    dot: false,
    unique: true,
    ignore,
    followSymbolicLinks: false
  });
  const absolutePaths = found.map((entry) => path8.resolve(entry));
  absolutePaths.sort();
  return absolutePaths;
}
408
+
409
+ // src/ingest/adapters/file-adapter.ts
410
+ import { basename, extname, resolve } from "path";
411
+ import { mkdir as mkdir4, readFile as readFile6, stat as stat3, writeFile as writeFile4 } from "fs/promises";
412
+
413
+ // src/ingest/extractors/docx-extractor.ts
414
+ import mammoth from "mammoth";
415
/**
 * Extracts the raw text of a DOCX file via mammoth.
 *
 * @param {string} filePath - Path of the DOCX file.
 * @returns {Promise<string>} The `value` reported by `mammoth.extractRawText`.
 */
async function extractDocx(filePath) {
  const { value } = await mammoth.extractRawText({ path: filePath });
  return value;
}
419
+
420
+ // src/ingest/extractors/html-extractor.ts
421
+ import { load } from "cheerio";
422
+ import TurndownService from "turndown";
423
+
424
+ // src/normalize/boilerplate.ts
425
/**
 * Removes common page chrome from an HTML string.
 *
 * Deletes `<nav>…</nav>` and `<footer>…</footer>` blocks and every
 * occurrence of the phrase "cookie notice", all case-insensitively.
 *
 * @param {string} html - HTML source.
 * @returns {string} The HTML without those fragments.
 */
function stripBoilerplate(html) {
  const patterns = [
    /<nav[\s\S]*?<\/nav>/gi,
    /<footer[\s\S]*?<\/footer>/gi,
    /cookie notice/gi
  ];
  return patterns.reduce((remaining, pattern) => remaining.replace(pattern, ""), html);
}
428
+
429
+ // src/ingest/extractors/html-extractor.ts
430
/** Shared HTML-to-markdown converter: ATX headings and fenced code blocks. */
var turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced"
});
431
/**
 * Collapses every run of whitespace to one space and trims both ends.
 *
 * @param {string} value - Text to clean.
 * @returns {string} The single-spaced text.
 */
function cleanText(value) {
  const words = value.split(/\s+/).filter(Boolean);
  return words.join(" ");
}
434
/**
 * Picks the best available title for an HTML page.
 *
 * Candidates are tried in order: Open Graph title, Twitter title, first
 * `<h1>`, `<title>`, then `fallbackTitle`. The first non-empty one wins.
 *
 * @param {Function} $ - Document query function returned by `load`.
 * @param {string} fallbackTitle - Title used when the page offers none.
 * @returns {string} The chosen title.
 */
function chooseMeaningfulTitle($, fallbackTitle) {
  const openGraphTitle = cleanText($("meta[property='og:title']").attr("content") ?? "");
  const twitterTitle = cleanText($("meta[name='twitter:title']").attr("content") ?? "");
  const headingTitle = cleanText($("h1").first().text());
  const documentTitle = cleanText($("title").first().text());
  for (const candidate of [openGraphTitle, twitterTitle, headingTitle, documentTitle, fallbackTitle]) {
    if (candidate) {
      return candidate;
    }
  }
  return fallbackTitle;
}
444
// Custom rule: render `<a class="doc-card">` link cards as a small markdown
// section (heading, summary, section label, link target) instead of a plain link.
turndown.addRule("docCard", {
  filter(node) {
    if (node.nodeName !== "A" || typeof node.getAttribute !== "function") {
      return false;
    }
    const classNames = (node.getAttribute("class") ?? "").split(/\s+/);
    return classNames.includes("doc-card");
  },
  replacement(_content, node) {
    const card = node;
    const textOf = (selector) => cleanText(card.querySelector(selector)?.textContent ?? "");
    const href = cleanText(card.getAttribute("href") ?? "");
    const title = textOf("h3");
    const summary = textOf("p");
    const section = textOf("span");
    const heading = title ? `### ${title}` : "";
    // Empty pieces are skipped; the rest are separated by blank lines.
    const body = [heading, summary, section, href].filter(Boolean).join("\n\n");
    return `\n\n${body}\n\n`;
  }
});
467
/**
 * Converts an HTML page to markdown and determines its title.
 *
 * Boilerplate is stripped first. The content of the first `<main>` element
 * is converted when present, otherwise the whole document. The title falls
 * back to the `<title>` text, or "Untitled" when that is empty.
 *
 * @param {string} html - HTML source.
 * @returns {{markdown: string, title: string}}
 */
function extractHtmlToMarkdown(html) {
  const withoutBoilerplate = stripBoilerplate(html);
  const $ = load(withoutBoilerplate);
  const pageTitle = cleanText($("title").first().text());
  const title = chooseMeaningfulTitle($, pageTitle || "Untitled");
  const mainHtml = $("main").first().html();
  const content = mainHtml ?? $.root().html() ?? withoutBoilerplate;
  return {
    markdown: turndown.turndown(content),
    title
  };
}
478
/**
 * Parses a date string into ISO-8601 form.
 *
 * @param {string} value - Candidate date text.
 * @returns {string | null} The ISO timestamp, or `null` when the text is
 *   blank or `Date` cannot parse it.
 */
function parseDateCandidate(value) {
  const text = value.trim();
  if (text.length === 0) {
    return null;
  }
  const timestamp = new Date(text);
  if (Number.isNaN(timestamp.getTime())) {
    return null;
  }
  return timestamp.toISOString();
}
486
/**
 * Tries to find a page's publication date in its HTML.
 *
 * Checks a fixed list of `<meta>` tags and the first `<time datetime>`
 * element. If none gives a parseable date, it searches JSON-LD scripts
 * (including entries under `@graph`) for `datePublished`, `dateCreated` or
 * `dateModified`. Malformed JSON-LD blocks are ignored.
 *
 * @param {string} html - HTML source.
 * @returns {string | null} ISO timestamp of the first parseable date, or `null`.
 */
function extractPublicationDateFromHtml(html) {
  const $ = load(html);
  const metaSelectors = [
    "meta[property='article:published_time']",
    "meta[property='og:published_time']",
    "meta[name='pubdate']",
    "meta[name='publish-date']",
    "meta[name='article:published_time']",
    "meta[name='date']"
  ];
  const rawCandidates = metaSelectors.map((selector) => $(selector).attr("content"));
  rawCandidates.push($("time[datetime]").first().attr("datetime"));
  for (const rawCandidate of rawCandidates) {
    if (!rawCandidate?.trim()) {
      continue;
    }
    const isoDate = parseDateCandidate(rawCandidate);
    if (isoDate) {
      return isoDate;
    }
  }

  const dateKeys = ["datePublished", "dateCreated", "dateModified"];
  // Breadth-first walk over a parsed JSON-LD payload.
  const findDateInJsonLd = (payload) => {
    const pending = Array.isArray(payload) ? [...payload] : [payload];
    while (pending.length > 0) {
      const node = pending.shift();
      if (!node || typeof node !== "object") {
        continue;
      }
      for (const key of dateKeys) {
        if (typeof node[key] !== "string") {
          continue;
        }
        const isoDate = parseDateCandidate(node[key]);
        if (isoDate) {
          return isoDate;
        }
      }
      if (Array.isArray(node["@graph"])) {
        pending.push(...node["@graph"]);
      }
    }
    return null;
  };

  let jsonLdDate = null;
  $('script[type="application/ld+json"]').each((_index, script) => {
    if (jsonLdDate) {
      return false;
    }
    try {
      jsonLdDate = findDateInJsonLd(JSON.parse($(script).text()));
    } catch {
      // Unparseable JSON-LD: move on to the next script element.
    }
    // Returning false stops the iteration once a date has been found.
    return jsonLdDate ? false : undefined;
  });
  return jsonLdDate;
}
538
+
539
+ // src/ingest/extractors/markdown-extractor.ts
540
+ import { readFile as readFile3 } from "fs/promises";
541
/**
 * Reads a markdown file as UTF-8 text.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} The file contents.
 */
async function extractMarkdown(filePath) {
  const contents = await readFile3(filePath, "utf8");
  return contents;
}
544
+
545
+ // src/ingest/extractors/pdf-extractor.ts
546
+ import { readFile as readFile4 } from "fs/promises";
547
+ import { PDFParse } from "pdf-parse";
548
/**
 * Extracts the text of a PDF file.
 *
 * The parser is destroyed whether or not text extraction succeeds.
 *
 * @param {string} filePath - Path of the PDF file.
 * @returns {Promise<string>} The `text` returned by the parser's `getText()`.
 */
async function extractPdf(filePath) {
  const data = await readFile4(filePath);
  const pdf = new PDFParse({ data });
  try {
    const { text } = await pdf.getText();
    return text;
  } finally {
    await pdf.destroy();
  }
}
558
+
559
+ // src/ingest/extractors/text-extractor.ts
560
+ import { readFile as readFile5 } from "fs/promises";
561
/**
 * Reads a plain-text file as UTF-8.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} The file contents.
 */
async function extractText(filePath) {
  const contents = await readFile5(filePath, "utf8");
  return contents;
}
564
+
565
+ // src/ingest/adapters/file-adapter.ts
566
/**
 * Maps a file's extension (case-insensitive) to a MIME type.
 *
 * @param {string} filePath - File name or path.
 * @returns {string} The MIME type for `.md`, `.txt`, `.html`/`.htm`, `.pdf`
 *   and `.docx`; "application/octet-stream" for anything else.
 */
function mimeTypeFor(filePath) {
  const knownTypes = new Map([
    [".md", "text/markdown"],
    [".txt", "text/plain"],
    [".html", "text/html"],
    [".htm", "text/html"],
    [".pdf", "application/pdf"],
    [".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
  ]);
  const extension = extname(filePath).toLowerCase();
  return knownTypes.get(extension) ?? "application/octet-stream";
}
584
/**
 * Extracts title and markdown from a local file according to its MIME type.
 *
 * Markdown files take their title from the first `# ` heading, falling back
 * to the file name. The heading text must sit on the same line as the `#`
 * and be non-empty, so an empty `#` line no longer adopts the following
 * paragraph as the title. Other types use the file name (or, for HTML, the
 * extracted page title) and get a generated `# title` heading.
 *
 * @param {string} filePath - File to read.
 * @param {string} mimeType - MIME type as produced by `mimeTypeFor`.
 * @returns {Promise<{title: string, markdown: string, raw?: string}>} `raw`
 *   is only present for markdown, plain-text and HTML files.
 * @throws {Error} When the MIME type is not supported.
 */
async function extractFileContent(filePath, mimeType) {
  if (mimeType === "text/markdown") {
    const markdown = await extractMarkdown(filePath);
    // [ \t]+ keeps the match on one line; \S requires actual heading text.
    const title = markdown.match(/^#[ \t]+(\S.*)$/m)?.[1]?.trim() ?? basename(filePath);
    return { title, markdown, raw: markdown };
  }
  if (mimeType === "text/plain") {
    const text = await extractText(filePath);
    const title = basename(filePath);
    return { title, markdown: `# ${title}\n\n${text}`, raw: text };
  }
  if (mimeType === "text/html") {
    const raw = await readFile6(filePath, "utf8");
    const extracted = extractHtmlToMarkdown(raw);
    return { title: extracted.title, markdown: `# ${extracted.title}\n\n${extracted.markdown}`, raw };
  }
  if (mimeType === "application/pdf") {
    const text = await extractPdf(filePath);
    const title = basename(filePath);
    return { title, markdown: `# ${title}\n\n${text}` };
  }
  if (mimeType.includes("wordprocessingml")) {
    const text = await extractDocx(filePath);
    const title = basename(filePath);
    return { title, markdown: `# ${title}\n\n${text}` };
  }
  throw new Error(`unsupported file type: ${mimeType}`);
}
617
/**
 * Re-derives title and markdown from previously stored raw content.
 *
 * Markdown takes its title from the first `# ` heading, falling back to
 * `fallbackTitle`. The heading text must sit on the same line as the `#`
 * and be non-empty, so an empty `#` line no longer adopts the following
 * paragraph as the title. Plain text is wrapped under a generated heading;
 * HTML is converted with `extractHtmlToMarkdown`.
 *
 * @param {string} raw - Stored raw content.
 * @param {string} mimeType - MIME type recorded for the document.
 * @param {string} fallbackTitle - Title used when the content provides none.
 * @returns {Promise<{title: string, markdown: string}>}
 * @throws {Error} When the MIME type cannot be reprocessed from raw text.
 */
async function extractRawContent(raw, mimeType, fallbackTitle) {
  if (mimeType === "text/markdown") {
    // [ \t]+ keeps the match on one line; \S requires actual heading text.
    const title = raw.match(/^#[ \t]+(\S.*)$/m)?.[1]?.trim() ?? fallbackTitle;
    return { title, markdown: raw };
  }
  if (mimeType === "text/plain") {
    return { title: fallbackTitle, markdown: `# ${fallbackTitle}\n\n${raw}` };
  }
  if (mimeType === "text/html") {
    const extracted = extractHtmlToMarkdown(raw);
    return { title: extracted.title, markdown: `# ${extracted.title}\n\n${extracted.markdown}` };
  }
  throw new Error(`raw reprocessing is not supported for ${mimeType}`);
}
635
/**
 * Ingests a single local file: extracts its content, stores the raw and
 * normalized artifacts in the workspace, and returns the document record.
 *
 * `lastChangedAt` is carried over from `previous` when the content hash is
 * unchanged; `firstSeenAt` and `publicationDate` are carried over whenever a
 * previous record exists.
 *
 * NOTE(review): the raw copy is stored as `raw/<source.id>/<basename>`, so two
 * files with the same name in different sub-directories of one source appear
 * to share a raw path — confirm whether that is intended.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {object} options.source - Owning source (`id`, `type`, `uri` are read).
 * @param {string} options.filePath - File to ingest.
 * @param {object} [options.previous] - Document record from an earlier ingest.
 * @returns {Promise<object>} The new document record.
 */
async function ingestFile({
  workspacePath,
  source,
  filePath,
  previous
}) {
  const absolutePath = resolve(filePath);
  const { size: fileSizeBytes } = await stat3(absolutePath);
  const mimeType = mimeTypeFor(absolutePath);
  const { title, markdown, raw } = await extractFileContent(absolutePath, mimeType);

  const documentId = stableId("doc", source.id, absolutePath);
  const normalizedPath = resolve(workspacePath, "normalized", `${documentId}.md`);
  const rawPath = resolve(workspacePath, "raw", source.id, basename(absolutePath));
  const contentHash = sha256(markdown);

  // One timestamp serves as crawl time, index time and "last seen".
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const contentUnchanged = previous?.contentHash === contentHash;
  const lastChangedAt = contentUnchanged ? previous.lastChangedAt : now;
  const publicationDate = previous?.publicationDate ?? null;

  await mkdir4(resolve(workspacePath, "normalized"), { recursive: true });
  await mkdir4(resolve(workspacePath, "raw", source.id), { recursive: true });
  if (raw) {
    await writeFile4(rawPath, raw, "utf8");
  }
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri: absolutePath,
    sourceUri: source.uri,
    publicationDate,
    crawledAt: now,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: source.uri,
    publicationDate,
    crawledAt: now,
    indexedAt: now,
    extra: {
      contentType: mimeType,
      fileSizeBytes
    }
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri: absolutePath,
    sourceUri: source.uri,
    mimeType,
    rawPath: raw ? rawPath : void 0,
    normalizedPath,
    contentHash,
    metadata,
    publicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
702
/**
 * Ingests content supplied directly rather than read from a file.
 *
 * Content of a "markdown" source is stored as is; any other content is
 * wrapped under a generated `# title` heading and recorded as plain text.
 * No raw copy is kept. `lastChangedAt` is carried over from `previous` when
 * the content hash is unchanged.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {object} options.source - Owning source (`id`, `type`, `uri` are read).
 * @param {string} options.content - Document body.
 * @param {string} options.title - Document title.
 * @param {string} options.uri - Document URI.
 * @param {object} [options.previous] - Document record from an earlier ingest.
 * @returns {Promise<object>} The new document record.
 */
async function ingestInlineContent({
  workspacePath,
  source,
  content,
  title,
  uri,
  previous
}) {
  const isMarkdownSource = source.type === "markdown";
  const markdown = isMarkdownSource ? content : `# ${title}\n\n${content}`;
  const documentId = stableId("doc", source.id, uri);
  const normalizedPath = resolve(workspacePath, "normalized", `${documentId}.md`);
  const contentHash = sha256(markdown);

  // One timestamp serves as crawl time, index time and "last seen".
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const contentUnchanged = previous?.contentHash === contentHash;
  const lastChangedAt = contentUnchanged ? previous.lastChangedAt : now;
  const publicationDate = previous?.publicationDate ?? null;

  await mkdir4(resolve(workspacePath, "normalized"), { recursive: true });
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri,
    sourceUri: source.uri,
    publicationDate,
    crawledAt: now,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: source.uri,
    publicationDate,
    crawledAt: now,
    indexedAt: now
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri,
    sourceUri: source.uri,
    mimeType: isMarkdownSource ? "text/markdown" : "text/plain",
    normalizedPath,
    contentHash,
    metadata,
    publicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
759
/**
 * Rebuilds a document's normalized file and metadata from its stored raw copy.
 *
 * The crawl time and publication date of the existing record are kept;
 * `indexedAt` is refreshed and `lastChangedAt` only moves when the
 * re-extracted content hashes differently.
 *
 * @param {object} document - Existing document record.
 * @param {object} source - Source that owns the document.
 * @returns {Promise<object | null>} The updated record, or `null` when the
 *   document has no raw copy.
 */
async function reprocessStoredDocument(document, source) {
  const { rawPath } = document;
  if (!rawPath) {
    return null;
  }
  const rawContent = await readFile6(rawPath, "utf8");
  const { title, markdown } = await extractRawContent(
    rawContent,
    document.mimeType,
    document.title || basename(document.uri)
  );
  const contentHash = sha256(markdown);
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const lastChangedAt = document.contentHash === contentHash ? document.lastChangedAt : now;
  const publicationDate = document.publicationDate ?? null;

  await writeNormalizedDocument({
    documentId: document.id,
    sourceId: document.sourceId,
    title,
    uri: document.uri,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath: document.normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt: now,
    // Previously stored metadata is carried forward as extra fields.
    extra: {
      ...document.metadata,
      contentType: document.mimeType
    }
  });
  return {
    ...document,
    title,
    contentHash,
    metadata,
    lastChangedAt,
    indexedAt: now
  };
}
803
+
804
+ // src/ingest/adapters/rss-adapter.ts
805
+ import { Readable } from "stream";
806
+ import FeedParser from "feedparser";
807
+ import { parseFeed } from "feedsmith";
808
/**
 * Converts a feed date value to an ISO-8601 string.
 *
 * @param {unknown} value - A `Date`, a date string, or anything else.
 * @returns {string | null} The ISO timestamp, or `null` when the value is
 *   neither a `Date` nor a non-blank string, or does not form a valid date.
 */
function toIsoDate(value) {
  let date = null;
  if (value instanceof Date) {
    date = value;
  } else if (typeof value === "string" && value.trim().length > 0) {
    date = new Date(value);
  }
  if (date === null || Number.isNaN(date.getTime())) {
    return null;
  }
  return date.toISOString();
}
818
/**
 * Resolves a feed item link against the feed URL.
 *
 * @param {string | null | undefined} link - Link as found in the feed.
 * @param {string} [baseUrl] - URL used to resolve relative links.
 * @returns {string | null} The absolute URL, or `null` when the link is
 *   blank or cannot be parsed.
 */
function normalizeFeedLink(link, baseUrl) {
  const hasText = Boolean(link?.trim());
  if (!hasText) {
    return null;
  }
  try {
    const absolute = new URL(link, baseUrl);
    return absolute.href;
  } catch {
    return null;
  }
}
828
/**
 * Converts the items of a feedsmith-parsed feed into `{url, title,
 * publicationDate}` records.
 *
 * Items are read from `feed.items`, or `feed.entries` when `items` is not an
 * array. The link comes from the first available of `link`, `url`, `id`,
 * `guid` or the first entry of `links`. Items without a usable link are dropped.
 *
 * @param {object} feed - Parsed feed.
 * @param {string} baseUrl - URL used to resolve relative links.
 * @returns {Array<{url: string, title: string, publicationDate: string | null}>}
 */
function normalizeFeedsmithItems(feed, baseUrl) {
  let rawItems = [];
  if (Array.isArray(feed?.items)) {
    rawItems = feed.items;
  } else if (Array.isArray(feed?.entries)) {
    rawItems = feed.entries;
  }

  const normalized = [];
  for (const item of rawItems) {
    const rawLink = item?.link ?? item?.url ?? item?.id ?? item?.guid ?? item?.links?.[0]?.href;
    const url = normalizeFeedLink(rawLink, baseUrl);
    if (!url) {
      continue;
    }
    const rawDate = item?.pubDate ?? item?.published ?? item?.updated ?? item?.published_at ?? item?.date_published ?? item?.dc?.date;
    normalized.push({
      url,
      title: String(item?.title ?? item?.summary ?? url).trim(),
      publicationDate: toIsoDate(rawDate)
    });
  }
  return normalized;
}
847
/**
 * Parses a feed document with FeedParser.
 *
 * Each item's link (`link`, else `origlink`) is resolved against the feed
 * URL; items without a usable link are skipped.
 *
 * @param {string} xml - Feed document.
 * @param {string} feedUrl - URL of the feed.
 * @returns {Promise<Array<{url: string, title: string, publicationDate: string | null}>>}
 *   Resolves when the parser ends; rejects on a parser error.
 */
async function parseWithFeedparser(xml, feedUrl) {
  const parser = new FeedParser({ feedurl: feedUrl });
  const collected = [];

  // Drain every item currently buffered in the parser.
  const drain = () => {
    for (let entry = parser.read(); entry; entry = parser.read()) {
      const url = normalizeFeedLink(entry.link || entry.origlink, feedUrl);
      if (!url) {
        continue;
      }
      collected.push({
        url,
        title: String(entry.title ?? url).trim(),
        publicationDate: toIsoDate(entry.pubdate ?? entry.date)
      });
    }
  };

  return await new Promise((resolve2, reject) => {
    parser.on("error", reject);
    parser.on("readable", drain);
    parser.on("end", () => resolve2(collected));
    Readable.from([xml]).pipe(parser);
  });
}
870
/**
 * Parses a feed document into normalized items.
 *
 * feedsmith is tried first; if parsing or normalizing its result throws,
 * the document is parsed again with FeedParser.
 *
 * @param {string} xml - Feed document.
 * @param {{uri: string}} source - Feed source; `uri` is the base for relative links.
 * @returns {Promise<Array<{url: string, title: string, publicationDate: string | null}>>}
 */
async function parseRssFeedDocument(xml, source) {
  const feedUrl = source.uri;
  let items;
  try {
    const { feed } = parseFeed(xml);
    items = normalizeFeedsmithItems(feed, feedUrl);
  } catch {
    items = await parseWithFeedparser(xml, feedUrl);
  }
  return items;
}
878
+
879
+ // src/ingest/adapters/url-adapter.ts
880
+ import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
881
+ import path9 from "path";
882
/**
 * Captures the HTTP caching headers of a response.
 *
 * Missing `etag`, `last-modified` and `cache-control` headers are recorded
 * as `undefined`; `expires` keeps whatever `headers.get` returns.
 *
 * @param {{headers: {get(name: string): string | null}, status: number}} response
 * @param {string} validatedAt - Timestamp stored as `lastValidatedAt`.
 * @returns {object} Cache record for the response.
 */
function buildHttpCache(response, validatedAt) {
  const { headers } = response;
  const optionalHeader = (name) => headers.get(name) ?? void 0;
  return {
    etag: optionalHeader("etag"),
    lastModified: optionalHeader("last-modified"),
    cacheControl: optionalHeader("cache-control"),
    expires: headers.get("expires"),
    lastValidatedAt: validatedAt,
    lastStatus: response.status
  };
}
892
/**
 * Picks the first publication date that is neither `null` nor `undefined`.
 *
 * @param {string | null | undefined} preferred - Highest-priority candidate.
 * @param {string | null | undefined} fallback - Second choice.
 * @param {string | null | undefined} previous - Last resort.
 * @returns {string | null} The chosen date, or `null` when all are missing.
 */
function choosePublicationDate(preferred, fallback, previous) {
  for (const candidate of [preferred, fallback, previous]) {
    if (candidate !== null && candidate !== undefined) {
      return candidate;
    }
  }
  return null;
}
895
/**
 * Turns a fetched HTML body into a stored document.
 *
 * The body is converted to markdown, the raw HTML is saved under
 * `raw/<source.id>/<hash of url>.html`, and the normalized file is written.
 * The publication date is the caller-supplied one, else one found in the
 * HTML, else the previous record's. `lastChangedAt` is carried over from
 * `previous` when the content hash is unchanged.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {object} options.source - Owning source (`id`, `type` are read).
 * @param {string} options.url - URL the body was fetched from.
 * @param {string} options.body - HTML body.
 * @param {object} [options.previous] - Document record from an earlier crawl.
 * @param {string} options.sourceUri - URI recorded as the document's source.
 * @param {string | null} [options.publicationDate] - Date known to the caller.
 * @param {number} options.responseStatus - HTTP status stored in the metadata.
 * @returns {Promise<object>} The new document record.
 */
async function normalizeRemoteDocument({
  workspacePath,
  source,
  url,
  body,
  previous,
  sourceUri,
  publicationDate,
  responseStatus
}) {
  const { title, markdown: bodyMarkdown } = extractHtmlToMarkdown(body);
  const markdown = `# ${title}\n\n${bodyMarkdown}`;
  const documentId = stableId("doc", source.id, url);
  const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
  const rawFileName = `${sha256(url).slice(0, 12)}.html`;
  const rawPath = path9.resolve(workspacePath, "raw", source.id, rawFileName);
  const contentHash = sha256(markdown);

  // One timestamp serves as crawl time, index time and "last seen".
  const now = (/* @__PURE__ */ new Date()).toISOString();
  const contentUnchanged = previous?.contentHash === contentHash;
  const lastChangedAt = contentUnchanged ? previous.lastChangedAt : now;
  const resolvedPublicationDate = choosePublicationDate(
    publicationDate,
    extractPublicationDateFromHtml(body),
    previous?.publicationDate
  );

  await mkdir5(path9.resolve(workspacePath, "raw", source.id), { recursive: true });
  await writeFile5(rawPath, body, "utf8");
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri: url,
    sourceUri,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    indexedAt: now,
    extra: {
      status: responseStatus,
      contentType: "text/html"
    }
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri: url,
    sourceUri,
    mimeType: "text/html",
    rawPath,
    normalizedPath,
    contentHash,
    metadata,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
964
/**
 * Fetches a URL (conditionally when validators from a previous run exist) and
 * returns the resulting document record.
 *
 * Behavior:
 * - 304 with the previous raw and normalized files still on disk: the previous
 *   record is reused. Validators the 304 response does not repeat are carried
 *   over from the previous record, so later runs can stay conditional.
 * - 304 but the cached files are gone: the URL is fetched again without
 *   conditional headers, since the empty 304 body cannot be ingested.
 * - Any other non-2xx status: throws, so error pages are not stored as
 *   documents (matching how `fetchFeedText` treats failed responses).
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {object} args.source - Source record.
 * @param {string} args.url - URL to fetch.
 * @param {object} [args.previous] - Earlier record of the same document.
 * @param {string} [args.sourceUri] - Overrides `source.uri` as the recorded source URI.
 * @param {*} [args.publicationDate] - Preferred publication date.
 * @returns {Promise<object>} Document record including `httpCache`.
 * @throws {Error} When the final response status is not 2xx.
 */
async function fetchUrlDocument({
  workspacePath,
  source,
  url,
  previous,
  sourceUri,
  publicationDate
}) {
  const baseHeaders = {
    "user-agent": source.crawl?.userAgent ?? "querylight-cli/0.1"
  };
  const conditionalHeaders = { ...baseHeaders };
  if (previous?.httpCache?.etag) {
    conditionalHeaders["if-none-match"] = previous.httpCache.etag;
  }
  if (previous?.httpCache?.lastModified) {
    conditionalHeaders["if-modified-since"] = previous.httpCache.lastModified;
  }
  const effectiveSourceUri = sourceUri ?? source.uri;

  let response = await fetch(url, { headers: conditionalHeaders });
  if (response.status === 304) {
    const cachedArtifactsPresent = Boolean(previous?.rawPath) && await fileExists(previous.rawPath) && await fileExists(previous.normalizedPath);
    if (cachedArtifactsPresent) {
      const now = (/* @__PURE__ */ new Date()).toISOString();
      const revalidated = buildHttpCache(response, now);
      const resolvedPublicationDate = publicationDate ?? previous.publicationDate ?? null;
      return {
        ...previous,
        sourceUri: effectiveSourceUri,
        publicationDate: resolvedPublicationDate,
        metadata: buildDocumentMetadata({
          source,
          sourceUri: effectiveSourceUri,
          publicationDate: resolvedPublicationDate,
          crawledAt: previous.crawledAt,
          indexedAt: previous.indexedAt,
          extra: {
            ...previous.metadata,
            status: previous.metadata.status ?? 200,
            contentType: previous.mimeType
          }
        }),
        lastSeenAt: now,
        httpCache: {
          ...revalidated,
          // A 304 is not required to repeat every validator; keep the stored ones.
          etag: revalidated.etag ?? previous.httpCache?.etag,
          lastModified: revalidated.lastModified ?? previous.httpCache?.lastModified
        }
      };
    }
    // Nothing usable on disk: ask for the full representation.
    response = await fetch(url, { headers: baseHeaders });
  }
  if (!response.ok) {
    throw new Error(`failed to fetch ${url}: ${response.status}`);
  }

  const now = (/* @__PURE__ */ new Date()).toISOString();
  const nextHttpCache = buildHttpCache(response, now);
  const body = await response.text();
  const document = await normalizeRemoteDocument({
    workspacePath,
    source,
    url,
    body,
    previous,
    sourceUri: effectiveSourceUri,
    publicationDate,
    responseStatus: response.status
  });
  const contentType = response.headers.get("content-type") ?? document.mimeType;
  return {
    ...document,
    mimeType: contentType,
    metadata: buildDocumentMetadata({
      source,
      sourceUri: effectiveSourceUri,
      publicationDate: document.publicationDate ?? null,
      crawledAt: document.crawledAt,
      indexedAt: document.indexedAt,
      extra: {
        status: response.status,
        contentType
      }
    }),
    httpCache: nextHttpCache
  };
}
1034
/**
 * Re-runs HTML extraction on a document's stored raw file and rewrites its
 * normalized markdown, without fetching anything.
 *
 * @param {object} document - Existing document record.
 * @param {object} source - Source the document belongs to.
 * @returns {Promise<object|null>} Updated record, or null when the raw file is
 *   missing.
 */
async function reprocessRemoteDocument(document, source) {
  const hasRawFile = Boolean(document.rawPath) && await fileExists(document.rawPath);
  if (!hasRawFile) {
    return null;
  }
  const html = await readFile7(document.rawPath, "utf8");
  const { title, markdown: extractedMarkdown } = extractHtmlToMarkdown(html);
  const markdown = `# ${title}\n\n${extractedMarkdown}`;
  const contentHash = sha256(markdown);
  const indexedAt = (/* @__PURE__ */ new Date()).toISOString();
  // Only bump lastChangedAt when extraction now yields different content.
  const lastChangedAt = document.contentHash === contentHash ? document.lastChangedAt : indexedAt;
  const publicationDate = document.publicationDate ?? extractPublicationDateFromHtml(html);
  const { sourceUri, crawledAt } = document;

  await writeNormalizedDocument({
    documentId: document.id,
    sourceId: document.sourceId,
    title,
    uri: document.uri,
    sourceUri,
    publicationDate,
    crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt,
    normalizedPath: document.normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri,
    publicationDate,
    crawledAt,
    indexedAt,
    extra: {
      ...document.metadata,
      status: document.httpCache?.lastStatus ?? document.metadata.status ?? 200,
      contentType: document.mimeType
    }
  });
  return {
    ...document,
    title,
    contentHash,
    publicationDate,
    metadata,
    lastChangedAt,
    indexedAt
  };
}
1083
+
1084
+ // src/ingest/adapters/website-adapter.ts
1085
+ import { load as load2 } from "cheerio";
1086
/**
 * Downloads `/robots.txt` for the given site and returns the values of all
 * `Disallow` lines.
 *
 * Fixes over the previous version: the value is everything after the first
 * colon (paths containing ":" are no longer truncated) and trailing `#`
 * comments are removed before parsing.
 *
 * NOTE(review): rules are still collected from every User-agent group, not
 * only the group matching `userAgent`.
 *
 * @param {string|URL} url - Any URL on the target site.
 * @param {string} userAgent - Sent as the `user-agent` request header.
 * @returns {Promise<string[]>} Non-empty disallow values; `[]` when the file
 *   cannot be fetched (best effort).
 */
async function fetchRobotsDisallow(url, userAgent) {
  try {
    const response = await fetch(new URL("/robots.txt", url), { headers: { "user-agent": userAgent } });
    if (!response.ok) {
      return [];
    }
    const text = await response.text();
    const rules = [];
    for (const rawLine of text.split(/\r?\n/)) {
      // "#" starts a comment that runs to the end of the line.
      const line = rawLine.replace(/#.*$/, "").trim();
      const match = /^disallow\s*:(.*)$/i.exec(line);
      const value = match ? match[1].trim() : "";
      if (value.length > 0) {
        rules.push(value);
      }
    }
    return rules;
  } catch {
    // Deliberate best effort: an unreachable robots.txt imposes no rules.
    return [];
  }
}
1098
/**
 * Decodes the text content of a sitemap `<loc>` element: trims whitespace,
 * unwraps a CDATA section, and resolves XML character entities.
 */
function decodeSitemapLoc(raw) {
  const trimmed = raw.trim();
  const cdata = /^<!\[CDATA\[([\s\S]*?)\]\]>$/.exec(trimmed);
  if (cdata) {
    return cdata[1].trim();
  }
  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
  // Single pass so that "&amp;lt;" decodes to "&lt;" rather than "<".
  return trimmed.replace(/&(amp|lt|gt|quot|apos|#\d+|#x[0-9a-f]+);/gi, (entity, name) => {
    if (name[0] !== "#") {
      return named[name.toLowerCase()];
    }
    const isHex = name[1] === "x" || name[1] === "X";
    const codePoint = Number.parseInt(name.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    return codePoint <= 1114111 ? String.fromCodePoint(codePoint) : entity;
  });
}

/**
 * Downloads `/sitemap.xml` for the site and returns the URLs listed in its
 * `<loc>` elements.
 *
 * Fixes over the previous version: `<loc>` values that span lines or carry
 * surrounding whitespace are matched, and XML escapes (`&amp;` etc.) and CDATA
 * wrappers are decoded, so the returned strings are usable URLs.
 *
 * @param {string|URL} baseUrl - Any URL on the target site.
 * @param {string} userAgent - Sent as the `user-agent` request header.
 * @returns {Promise<string[]>} Listed URLs; `[]` when the sitemap cannot be
 *   fetched (best effort).
 */
async function fetchSitemapUrls(baseUrl, userAgent) {
  try {
    const response = await fetch(new URL("/sitemap.xml", baseUrl), { headers: { "user-agent": userAgent } });
    if (!response.ok) {
      return [];
    }
    const xml = await response.text();
    return [...xml.matchAll(/<loc\b[^>]*>([\s\S]*?)<\/loc>/gi)]
      .map((match) => decodeSitemapLoc(match[1]))
      .filter(Boolean);
  } catch {
    // Deliberate best effort: a missing sitemap just means no extra seeds.
    return [];
  }
}
1110
/**
 * Decides whether a URL may be crawled.
 *
 * A URL is allowed when it has the same origin as `baseUrl`, its path does not
 * start with any disallow rule (the bare "/" rule is ignored), it contains at
 * least one include pattern (when any are given), and it contains no exclude
 * pattern. Patterns are plain substrings of the full href.
 *
 * @param {URL} url - Candidate URL.
 * @param {URL} baseUrl - Crawl root.
 * @param {string[]} includePatterns
 * @param {string[]} excludePatterns
 * @param {string[]} disallowRules - Path prefixes from robots.txt.
 * @returns {boolean}
 */
function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules) {
  const sameOrigin = url.origin === baseUrl.origin;
  if (!sameOrigin) {
    return false;
  }
  const blockedByRobots = disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule));
  if (blockedByRobots) {
    return false;
  }
  const { href } = url;
  const hrefContains = (pattern) => href.includes(pattern);
  const included = includePatterns.length === 0 || includePatterns.some(hrefContains);
  return included && !excludePatterns.some(hrefContains);
}
1126
+ async function crawlWebsite(source) {
1127
+ const baseUrl = new URL(source.uri);
1128
+ const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
1129
+ const includePatterns = source.crawl?.includePatterns ?? [];
1130
+ const excludePatterns = source.crawl?.excludePatterns ?? [];
1131
+ const maxDepth = source.crawl?.maxDepth ?? 2;
1132
+ const maxPages = source.crawl?.maxPages ?? 100;
1133
+ const rateLimitMs = source.crawl?.rateLimitMs ?? 1e3;
1134
+ const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
1135
+ const queue = [{ url: source.uri, depth: 0 }];
1136
+ const seen = /* @__PURE__ */ new Set();
1137
+ const results = [];
1138
+ if (source.crawl?.useSitemap !== false) {
1139
+ for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
1140
+ queue.push({ url, depth: 1 });
1141
+ }
1142
+ }
1143
+ while (queue.length > 0 && results.length < maxPages) {
1144
+ const next = queue.shift();
1145
+ if (!next || seen.has(next.url)) {
1146
+ continue;
1147
+ }
1148
+ seen.add(next.url);
1149
+ const url = new URL(next.url);
1150
+ if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
1151
+ continue;
1152
+ }
1153
+ results.push(url.href);
1154
+ if (next.depth >= maxDepth) {
1155
+ continue;
1156
+ }
1157
+ const response = await fetch(url, { headers: { "user-agent": userAgent } });
1158
+ const html = await response.text();
1159
+ const $ = load2(html);
1160
+ $("a[href]").each((_, element) => {
1161
+ const href = $(element).attr("href");
1162
+ if (!href) {
1163
+ return;
1164
+ }
1165
+ try {
1166
+ const target = new URL(href, url);
1167
+ if (!seen.has(target.href)) {
1168
+ queue.push({ url: target.href, depth: next.depth + 1 });
1169
+ }
1170
+ } catch {
1171
+ }
1172
+ });
1173
+ if (rateLimitMs > 0) {
1174
+ await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
1175
+ }
1176
+ }
1177
+ return results;
1178
+ }
1179
+
1180
+ // src/ingest/ingest-service.ts
1181
/** Path of the JSONL file holding all document records of a workspace. */
function documentsFile(workspacePath) {
  const segments = [workspacePath, "documents", "documents.jsonl"];
  return path10.join(...segments);
}
1184
/** Reads every document record stored in the workspace's documents file. */
async function loadDocuments(workspacePath) {
  const file = documentsFile(workspacePath);
  return readJsonl(file);
}
1187
/**
 * Persists the document records, ordered by id, to the workspace's documents
 * file.
 *
 * The input array is copied before sorting; previously it was sorted in
 * place, silently reordering the caller's array.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {Array<{id: string}>} documents - Records to write (left untouched).
 * @returns {Promise<void>}
 */
async function saveDocuments(workspacePath, documents) {
  const ordered = [...documents].sort((a, b) => a.id.localeCompare(b.id));
  await writeJsonl(documentsFile(workspacePath), ordered);
}
1190
/** Indexes document records by their `id` (a later duplicate replaces an earlier one). */
function previousMap(documents) {
  const byId = /* @__PURE__ */ new Map();
  for (const document of documents) {
    byId.set(document.id, document);
  }
  return byId;
}
1193
/** Current time as an ISO-8601 string. */
function nowStamp() {
  const current = /* @__PURE__ */ new Date();
  return current.toISOString();
}
1196
/** Run identifier: the current timestamp with every ":" and "." replaced by "-". */
function runId() {
  const stamp = nowStamp();
  return stamp.split(/[:.]/).join("-");
}
1199
/**
 * Reduces document records to the fields kept in a run snapshot:
 * id, title, uri, contentHash, lastChangedAt and sourceId.
 */
function documentSnapshot(documents) {
  return documents.map(({ id, title, uri, contentHash, lastChangedAt, sourceId }) => ({
    id,
    title,
    uri,
    contentHash,
    lastChangedAt,
    sourceId
  }));
}
1209
/**
 * Tells whether an RSS document is older than its retention window.
 *
 * Only documents of `rss` sources with a parseable publication date can
 * expire. The window is `source.crawl.retentionDays`, falling back to
 * `defaultRetentionDays`.
 *
 * @param {{publicationDate?: string}} document
 * @param {{type: string, crawl?: {retentionDays?: number}}} source
 * @param {number} defaultRetentionDays
 * @returns {boolean}
 */
function shouldExpireRssDocument(document, source, defaultRetentionDays) {
  const isDatedRssItem = source.type === "rss" && Boolean(document.publicationDate);
  if (!isDatedRssItem) {
    return false;
  }
  const retentionDays = source.crawl?.retentionDays ?? defaultRetentionDays;
  const publishedMs = new Date(document.publicationDate).getTime();
  if (Number.isNaN(publishedMs)) {
    return false;
  }
  const retentionMs = retentionDays * 24 * 60 * 60 * 1e3;
  return publishedMs < Date.now() - retentionMs;
}
1221
/**
 * Removes everything stored for the given document ids: their chunks are
 * dropped from the chunk store and `deleteDocumentArtifacts` is invoked for
 * each matching record in `documents`.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {Set<string>} documentIds - Ids to purge; nothing happens when empty.
 * @param {Array<object>} documents - Records to search for matching ids.
 * @returns {Promise<void>}
 */
async function purgeDocuments(workspacePath, documentIds, documents) {
  if (documentIds.size === 0) {
    return;
  }
  const allChunks = await loadChunks(workspacePath);
  const keptChunks = allChunks.filter((chunk) => !documentIds.has(chunk.documentId));
  // Avoid rewriting the chunk store when nothing was removed.
  const removedAny = keptChunks.length !== allChunks.length;
  if (removedAny) {
    await saveChunks(workspacePath, keptChunks);
  }
  const deletions = [];
  for (const document of documents) {
    if (documentIds.has(document.id)) {
      deletions.push(deleteDocumentArtifacts(document));
    }
  }
  await Promise.all(deletions);
}
1234
/**
 * Downloads the feed document of an RSS source.
 *
 * @param {{uri: string, crawl?: {userAgent?: string}}} source
 * @returns {Promise<string>} Response body.
 * @throws {Error} `failed to fetch feed: <status>` when the response is not ok.
 */
async function fetchFeedText(source) {
  const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
  const response = await fetch(source.uri, { headers: { "user-agent": userAgent } });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`failed to fetch feed: ${response.status}`);
}
1245
/**
 * Ingests one RSS source: downloads and parses the feed, then fetches every
 * listed article and stores it in `nextDocuments`.
 *
 * A failure on a single article is counted and reported through `onFailure`;
 * it does not stop the remaining articles.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {object} args.source - RSS source record.
 * @param {Map<string, object>} args.previous - Documents from the last run, by id.
 * @param {Map<string, object>} args.nextDocuments - Receives the fetched documents.
 * @param {(uri: string, error: unknown) => void} args.onFailure - Per-article failure callback.
 * @returns {Promise<{added: number, changed: number, unchanged: number, failed: number}>}
 * @throws {Error} When `source.crawl.fetchArticles` is explicitly false, or
 *   when the feed itself cannot be fetched or parsed.
 */
async function ingestRssSource({
  workspacePath,
  source,
  previous,
  nextDocuments,
  onFailure
}) {
  const articleFetchingDisabled = source.crawl?.fetchArticles === false;
  if (articleFetchingDisabled) {
    throw new Error("rss sources require article fetching");
  }
  const feedXml = await fetchFeedText(source);
  const entries = await parseRssFeedDocument(feedXml, source);
  const tally = { added: 0, changed: 0, unchanged: 0, failed: 0 };
  for (const entry of entries) {
    try {
      const known = previous.get(stableId("doc", source.id, entry.url));
      const fetched = await fetchUrlDocument({
        workspacePath,
        source,
        url: entry.url,
        previous: known,
        sourceUri: source.uri,
        publicationDate: entry.publicationDate
      });
      nextDocuments.set(fetched.id, fetched);
      let outcome = "unchanged";
      if (!known) {
        outcome = "added";
      } else if (known.contentHash !== fetched.contentHash) {
        outcome = "changed";
      }
      tally[outcome] += 1;
    } catch (error) {
      tally.failed += 1;
      onFailure(entry.url, error);
    }
  }
  return tally;
}
1287
/**
 * Ingests every enabled source (optionally restricted to `sourceIds`), expires
 * old RSS documents, saves the document list and writes an "ingest" run record.
 *
 * Fixes over the previous version:
 * - added/changed/unchanged classification looks up the previous record by the
 *   id of the document that was actually produced. Before, inline
 *   (`markdown`/`text`) sources were looked up under an id derived from their
 *   content while being stored under `inline:<sourceId>`, so they were counted
 *   as "added" on every run.
 * - expiry resolves each document's source through a Map instead of a linear
 *   `find` per document.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {string[]} [args.sourceIds] - When given, only these sources are processed.
 * @param {boolean} [args.changedOnly=false] - Recorded in the run summary; not
 *   otherwise consulted by this function.
 * @returns {Promise<{runId: string, documents: {added: number, changed: number, unchanged: number, failed: number}, processedSources: number}>}
 */
async function ingestSources({
  workspacePath,
  sourceIds,
  changedOnly = false
}) {
  const config = await loadConfig(workspacePath);
  const defaultRetentionDays = config.crawler.retentionDays;
  const sources = (await listSources(workspacePath)).filter(
    (source) => source.enabled && (!sourceIds || sourceIds.includes(source.id))
  );
  const existing = await loadDocuments(workspacePath);
  const previous = previousMap(existing);
  const nextDocuments = new Map(existing.map((document) => [document.id, document]));
  let added = 0;
  let changed = 0;
  let unchanged = 0;
  let failed = 0;
  const failures = [];
  const recordFailure = (sourceId, uri, error) => {
    failures.push({
      sourceId,
      uri,
      message: error instanceof Error ? error.message : String(error)
    });
  };

  for (const source of sources) {
    // Previous record for a URI of this source, if one exists.
    const priorFor = (uri) => previous.get(stableId("doc", source.id, uri));
    // Runs one producer, stores its document and classifies the outcome.
    const ingestOne = async (uri, producer) => {
      try {
        const document = await producer();
        // Key the lookup on the produced id so it always matches how the
        // document is stored, whatever URI the adapter derived the id from.
        const earlier = previous.get(document.id);
        nextDocuments.set(document.id, document);
        if (!earlier) {
          added += 1;
        } else if (earlier.contentHash !== document.contentHash) {
          changed += 1;
        } else {
          unchanged += 1;
        }
      } catch (error) {
        failed += 1;
        recordFailure(source.id, uri, error);
      }
    };
    try {
      switch (source.type) {
        case "file":
          await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: priorFor(source.uri) }));
          break;
        case "directory":
          for (const filePath of await listDirectoryFiles(source)) {
            await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: priorFor(filePath) }));
          }
          break;
        case "url":
          await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: priorFor(source.uri) }));
          break;
        case "website":
          for (const url of await crawlWebsite(source)) {
            await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: priorFor(url) }));
          }
          break;
        case "rss": {
          const result = await ingestRssSource({
            workspacePath,
            source,
            previous,
            nextDocuments,
            onFailure: (uri, error) => recordFailure(source.id, uri, error)
          });
          added += result.added;
          changed += result.changed;
          unchanged += result.unchanged;
          failed += result.failed;
          break;
        }
        case "markdown":
        case "text": {
          // For inline sources `source.uri` carries the content itself.
          const inlineUri = `inline:${source.id}`;
          await ingestOne(source.uri, () => ingestInlineContent({
            workspacePath,
            source,
            title: source.name,
            content: source.uri,
            uri: inlineUri,
            previous: priorFor(inlineUri)
          }));
          break;
        }
        default:
          break;
      }
    } catch (error) {
      // Source-level failure (e.g. the crawl or the feed download itself).
      failed += 1;
      recordFailure(source.id, source.uri, error);
    }
  }

  // First occurrence wins, matching a linear search over `sources`.
  const sourceById = /* @__PURE__ */ new Map();
  for (const source of sources) {
    if (!sourceById.has(source.id)) {
      sourceById.set(source.id, source);
    }
  }
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
    const source = sourceById.get(document.sourceId);
    return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
  });
  if (expiringDocuments.length > 0) {
    const expiredIds = new Set(expiringDocuments.map((document) => document.id));
    for (const document of expiringDocuments) {
      nextDocuments.delete(document.id);
    }
    await purgeDocuments(workspacePath, expiredIds, [...existing, ...expiringDocuments]);
  }

  const finalDocuments = [...nextDocuments.values()];
  await saveDocuments(workspacePath, finalDocuments);
  const id = runId();
  const run = {
    id,
    kind: "ingest",
    createdAt: nowStamp(),
    success: failed === 0,
    summary: {
      processedSources: sources.length,
      added,
      changed,
      unchanged,
      failed,
      changedOnly
    },
    failures,
    documentsSnapshot: documentSnapshot(finalDocuments)
  };
  await writeRun(workspacePath, run);
  return {
    runId: id,
    documents: { added, changed, unchanged, failed },
    processedSources: sources.length
  };
}
1423
/**
 * Rebuilds normalized output for stored documents from their raw files,
 * optionally limited to one source and/or one document, then saves the
 * document list and writes a "reprocess" run record.
 *
 * Documents whose source is unknown, whose raw file is missing, or whose
 * reprocessor returns nothing are counted as skipped.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {string} [args.sourceId] - Only reprocess documents of this source.
 * @param {string} [args.documentId] - Only reprocess this document.
 * @returns {Promise<{runId: string, documentsReprocessed: number, documentsSkipped: number}>}
 */
async function reprocessDocuments({
  workspacePath,
  sourceId,
  documentId
}) {
  const documents = await loadDocuments(workspacePath);
  const sources = await listSources(workspacePath);
  const sourceById = new Map(sources.map((source) => [source.id, source]));
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
  const remoteTypes = ["url", "website", "rss"];
  const isTargeted = (candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId);

  let documentsReprocessed = 0;
  let documentsSkipped = 0;
  for (const document of documents.filter(isTargeted)) {
    const source = sourceById.get(document.sourceId);
    const reprocessable = Boolean(source) && Boolean(document.rawPath) && await fileExists(document.rawPath);
    if (!reprocessable) {
      documentsSkipped += 1;
      continue;
    }
    const reprocess = remoteTypes.includes(source.type) ? reprocessRemoteDocument : reprocessStoredDocument;
    const updated = await reprocess(document, source);
    if (!updated) {
      documentsSkipped += 1;
      continue;
    }
    nextDocuments.set(updated.id, updated);
    documentsReprocessed += 1;
  }

  const finalDocuments = [...nextDocuments.values()];
  await saveDocuments(workspacePath, finalDocuments);
  const id = runId();
  await writeRun(workspacePath, {
    id,
    kind: "reprocess",
    createdAt: nowStamp(),
    success: true,
    summary: {
      documentsReprocessed,
      documentsSkipped
    },
    documentsSnapshot: documentSnapshot(finalDocuments)
  });
  return { runId: id, documentsReprocessed, documentsSkipped };
}
1464
+
1465
+ // src/chunk/chunker.ts
1466
+ import { readFile as readFile8 } from "fs/promises";
1467
+ import matter2 from "gray-matter";
1468
+ import path11 from "path";
1469
/**
 * Splits markdown into sections at ATX headings (`#` to `######`).
 *
 * Each section carries the heading trail leading to it and its text, which
 * starts with the heading line itself. Empty sections are dropped.
 *
 * Fixes over the previous version:
 * - lines inside fenced code blocks (``` or ~~~) are never treated as
 *   headings, so a `# comment` in a shell snippet no longer starts a section;
 * - a trailing carriage return is ignored when matching, so headings are
 *   recognized in CRLF input (section text itself is left as-is).
 *
 * @param {string} markdown
 * @returns {Array<{headingPath: string[], text: string}>}
 */
function splitSections(markdown) {
  const lines = markdown.split("\n");
  const sections = [];
  let headingPath = [];
  let current = [];
  // Marker (run of backticks or tildes) of the currently open fence, if any.
  let openFence = null;
  const flush = () => {
    const text = current.join("\n").trim();
    if (text.length > 0) {
      sections.push({ headingPath: [...headingPath], text });
    }
    current = [];
  };
  for (const line of lines) {
    const bare = line.endsWith("\r") ? line.slice(0, -1) : line;
    const fence = /^ {0,3}(`{3,}|~{3,})/.exec(bare);
    if (fence) {
      const marker = fence[1];
      if (openFence === null) {
        openFence = marker;
      } else if (marker[0] === openFence[0] && marker.length >= openFence.length && bare.trim() === marker) {
        // A closing fence uses the same character, is at least as long as the
        // opener, and has nothing else on the line.
        openFence = null;
      }
      current.push(line);
      continue;
    }
    const match = openFence === null ? /^(#{1,6})\s+(.+)$/.exec(bare) : null;
    if (match?.[1] && match[2]) {
      flush();
      const level = match[1].length;
      // Keep the ancestors above this level, then append the new heading.
      headingPath = [...headingPath.slice(0, level - 1), match[2].trim()];
    }
    current.push(line);
  }
  flush();
  return sections;
}
1495
/**
 * Splits a section into chunks of at most `maxChars` characters, where each
 * chunk starts `overlapChars` before the end of the previous one.
 *
 * A chunk is cut at the last blank line of its window when that break lies in
 * the second half of the window and leaves more than `overlapChars` of text.
 *
 * Fixes over the previous version:
 * - once a chunk reaches the end of the text the loop stops; before, one more
 *   chunk was emitted that contained only the overlap tail already present in
 *   the previous chunk;
 * - a `maxChars` that is not a positive number returns the text unsplit
 *   instead of looping forever (`0`) or returning nothing (`undefined`);
 * - a negative `overlapChars` is treated as 0 instead of skipping text.
 *
 * @param {string} text
 * @param {number} maxChars - Maximum chunk length.
 * @param {number} overlapChars - Characters shared by consecutive chunks.
 * @returns {string[]} Trimmed, non-empty chunks.
 */
function splitLongSection(text, maxChars, overlapChars) {
  if (!(maxChars > 0) || text.length <= maxChars) {
    return [text];
  }
  const overlap = overlapChars > 0 ? overlapChars : 0;
  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const hardEnd = Math.min(text.length, start + maxChars);
    let sliceEnd = hardEnd;
    if (hardEnd < text.length) {
      const paragraphBreak = text.slice(start, hardEnd).lastIndexOf("\n\n");
      // Prefer a paragraph boundary, but only if the chunk stays reasonably
      // full and still advances past the overlap.
      if (paragraphBreak > maxChars / 2 && paragraphBreak > overlap) {
        sliceEnd = start + paragraphBreak;
      }
    }
    const slice = text.slice(start, sliceEnd).trim();
    if (slice.length === 0) {
      start = hardEnd;
      continue;
    }
    chunks.push(slice);
    if (sliceEnd >= text.length) {
      // Everything is covered; stepping back by the overlap would only
      // re-emit the tail of this chunk.
      break;
    }
    const nextStart = sliceEnd - overlap;
    // Guarantee forward progress even when the overlap is as large as the chunk.
    start = nextStart > start ? nextStart : hardEnd;
  }
  return chunks;
}
1523
/** Rough token estimate: one token per four characters, rounded up. */
function estimateTokens(text) {
  const charsPerToken = 4;
  return Math.ceil(text.length / charsPerToken);
}
1526
/**
 * Builds chunk records for one document from its normalized markdown.
 *
 * Front matter is stripped with gray-matter, the content is split into
 * heading sections, and sections longer than the configured `maxChars` are
 * split further. A section that yields a single piece shorter than
 * `min(40, minChars)` is dropped.
 *
 * @param {object} document - Document record the chunks belong to.
 * @param {string} markdown - Normalized markdown, possibly with front matter.
 * @param {object} config - Workspace config; `index.chunking` is read.
 * @param {Map<string, object>} [prior] - Existing chunks by id, used to carry
 *   over `firstSeenAt` / `lastChangedAt`.
 * @param {string} [seenAt] - Value for `lastSeenAt`; defaults to now.
 * @returns {Array<object>} Chunk records.
 */
function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__ */ new Map(), seenAt = (/* @__PURE__ */ new Date()).toISOString()) {
  const { content } = matter2(markdown);
  const detected = splitSections(content);
  // With no detectable section, treat the whole content as one titled section.
  const sections = detected.length > 0 ? detected : [{ headingPath: [document.title], text: content }];
  const built = [];
  sections.forEach((section) => {
    const { maxChars, overlapChars } = config.index.chunking;
    const pieces = splitLongSection(section.text, maxChars, overlapChars);
    const isSinglePiece = pieces.length === 1;
    pieces.forEach((piece) => {
      const text = piece.trim();
      const tooShort = text.length < Math.min(40, config.index.chunking.minChars);
      if (tooShort && isSinglePiece) {
        return;
      }
      const id = stableId("chunk", document.id, section.headingPath.join(" > "), text);
      const earlier = prior.get(id);
      const contentHash = sha256(text);
      const unchanged = earlier?.contentHash === contentHash;
      built.push({
        id,
        documentId: document.id,
        sourceId: document.sourceId,
        title: document.title,
        uri: document.uri,
        headingPath: section.headingPath,
        text,
        tokenEstimate: estimateTokens(text),
        contentHash,
        metadata: document.metadata,
        firstSeenAt: earlier?.firstSeenAt ?? document.firstSeenAt,
        lastSeenAt: seenAt,
        lastChangedAt: unchanged ? earlier.lastChangedAt : document.lastChangedAt
      });
    });
  });
  return built;
}
1560
/**
 * Rebuilds the chunks of the selected documents and saves the chunk store.
 *
 * Chunks of documents outside the selection are kept as they are; chunks of
 * selected documents are replaced by freshly built ones.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {string} [args.sourceId] - Only chunk documents of this source.
 * @param {string} [args.documentId] - Only chunk this document.
 * @returns {Promise<{chunksWritten: number}>} Total chunks in the store afterwards.
 */
async function chunkDocuments({
  workspacePath,
  sourceId,
  documentId
}) {
  const config = await loadConfig(workspacePath);
  const allDocuments = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
  const isSelected = (document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId);
  const selected = allDocuments.filter(isSelected);
  const selectedIds = new Set(selected.map((document) => document.id));
  const existingChunks = await loadChunks(workspacePath);

  const prior = /* @__PURE__ */ new Map();
  const nextChunks = /* @__PURE__ */ new Map();
  for (const chunk of existingChunks) {
    prior.set(chunk.id, chunk);
  }
  for (const chunk of existingChunks) {
    // Carry over only chunks whose document is not being rebuilt.
    if (!selectedIds.has(chunk.documentId)) {
      nextChunks.set(chunk.id, chunk);
    }
  }

  for (const document of selected) {
    const markdown = await readFile8(document.normalizedPath, "utf8");
    const rebuilt = buildChunksForDocument(document, markdown, config, prior);
    for (const chunk of rebuilt) {
      nextChunks.set(chunk.id, chunk);
    }
  }
  await saveChunks(workspacePath, [...nextChunks.values()]);
  return { chunksWritten: nextChunks.size };
}
1583
+
1584
+ // src/index/querylight-indexer.ts
1585
+ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
1586
+ import path17 from "path";
1587
+
1588
+ // src/vector/dense.ts
1589
+ import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
1590
+ import { mkdir as mkdir7 } from "fs/promises";
1591
+ import path14 from "path";
1592
+
1593
+ // src/vector/runtime.ts
1594
+ import path12 from "path";
1595
+ import { fileURLToPath } from "url";
1596
+ import { execFile, execFileSync } from "child_process";
1597
/**
 * Resolves a configured cache directory. Absolute paths are returned as given;
 * relative ones are resolved against the workspace after dropping a leading
 * ".kb/" segment.
 */
function resolveCacheDir(workspacePath, configuredPath) {
  if (path12.isAbsolute(configuredPath)) {
    return configuredPath;
  }
  const withoutKbPrefix = configuredPath.replace(/^\.kb\//, "");
  return path12.resolve(workspacePath, withoutKbPrefix);
}
1600
/** Parent of the directory containing the module identified by `importMetaUrl`. */
function packageRootFromImportMeta(importMetaUrl) {
  const modulePath = fileURLToPath(importMetaUrl);
  const moduleDir = path12.dirname(modulePath);
  return path12.resolve(moduleDir, "..");
}
1603
/**
 * Locates `scripts/sparse-encode.py`, looking first under the package root
 * derived from `importMetaUrl` and then one directory above it.
 *
 * @param {string} importMetaUrl - `import.meta.url` of the calling module.
 * @returns {Promise<string>} Absolute path of the first candidate that exists.
 * @throws {Error} When neither candidate exists.
 */
async function sparseScriptPath(importMetaUrl) {
  const root = packageRootFromImportMeta(importMetaUrl);
  const scriptSegments = ["scripts", "sparse-encode.py"];
  const candidates = [
    path12.join(root, ...scriptSegments),
    path12.join(root, "..", ...scriptSegments)
  ];
  for (let index = 0; index < candidates.length; index += 1) {
    const found = await fileExists(candidates[index]);
    if (found) {
      return path12.resolve(candidates[index]);
    }
  }
  throw new Error(`sparse helper script not found; checked ${candidates.join(", ")}`);
}
1616
/**
 * Verifies that the `uv` executable can be started by running `uv --version`.
 *
 * @returns {Promise<void>} Rejects with the `execFile` error when the command fails.
 */
async function ensureUvAvailable() {
  await new Promise((resolve2, reject) => {
    const onExit = (error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve2();
    };
    execFile("uv", ["--version"], onExit);
  });
}
1621
/**
 * Runs the sparse-encoding helper script through `uv run`, sending `payload`
 * as JSON on stdin and returning the script's stdout.
 *
 * The child process is run synchronously with `HF_HOME` pointing at the
 * resolved cache directory.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {{cacheDir: string}} args.config - Sparse model configuration.
 * @param {*} args.payload - JSON-serializable request for the script.
 * @param {string} args.importMetaUrl - Used to locate the helper script.
 * @returns {Promise<string>} UTF-8 stdout of the script.
 */
async function runSparsePython({
  workspacePath,
  config,
  payload,
  importMetaUrl
}) {
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const scriptPath = await sparseScriptPath(importMetaUrl);
  // Python packages uv must provide for the script.
  const packages = ["torch", "transformers", "huggingface_hub"];
  const uvArgs = ["run", ...packages.flatMap((name) => ["--with", name]), "python", scriptPath];
  const options = {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 1024,
    input: JSON.stringify(payload),
    env: {
      ...process.env,
      HF_HOME: cacheDir
    }
  };
  return execFileSync("uv", uvArgs, options);
}
1653
/**
 * Lazily loads `@huggingface/transformers`, points its cache at `cacheDir`,
 * enables local models, and returns its `env` and `pipeline`.
 *
 * @param {string} cacheDir - Directory assigned to `env.cacheDir`.
 * @returns {Promise<{env: object, pipeline: Function}>}
 */
async function getDenseTransformersRuntime(cacheDir) {
  const transformers = await import("@huggingface/transformers");
  const { env } = transformers;
  env.cacheDir = cacheDir;
  env.allowLocalModels = true;
  const { pipeline } = transformers;
  return { env, pipeline };
}
1662
+
1663
+ // src/vector/store.ts
1664
+ import { mkdir as mkdir6, readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
1665
+ import path13 from "path";
1666
/** Directory holding vector artifacts inside the workspace. */
function vectorsDir(workspacePath) {
  const directoryName = "vectors";
  return path13.join(workspacePath, directoryName);
}
1669
/** Directory holding model markers inside the workspace. */
function modelsDir(workspacePath) {
  const directoryName = "models";
  return path13.join(workspacePath, directoryName);
}
1672
/** Path of the latest dense vector payload. */
function denseVectorPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "dense.latest.json");
}
1675
/** Path of the metadata file written alongside the latest dense payload. */
function denseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "dense.latest.meta.json");
}
1678
/** Path of the latest sparse vector payload. */
function sparseVectorPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "sparse.latest.json");
}
1681
/** Path of the metadata file written alongside the latest sparse payload. */
function sparseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "sparse.latest.meta.json");
}
1684
/** Path of the marker file whose existence is reported as dense-model availability. */
function densePullMarker(workspacePath) {
  const directory = modelsDir(workspacePath);
  return path13.join(directory, "dense.pulled.json");
}
1687
/** Path of the marker file whose existence is reported as sparse-model availability. */
function sparsePullMarker(workspacePath) {
  const directory = modelsDir(workspacePath);
  return path13.join(directory, "sparse.pulled.json");
}
1690
/**
 * Writes the dense payload and, separately, its metadata as pretty-printed
 * JSON into the vectors directory (created when missing).
 *
 * @param {string} workspacePath - Workspace root.
 * @param {{metadata: object}} payload - Dense payload to persist.
 * @returns {Promise<void>}
 */
async function writeDensePayload(workspacePath, payload) {
  const pretty = (value) => JSON.stringify(value, null, 2);
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
  const payloadFile = denseVectorPath(workspacePath);
  await writeFile6(payloadFile, pretty(payload), "utf8");
  const metaFile = denseMetaPath(workspacePath);
  await writeFile6(metaFile, pretty(payload.metadata), "utf8");
}
1695
/** Reads and parses the latest dense payload; rejects when the file is missing or not valid JSON. */
async function readDensePayload(workspacePath) {
  const serialized = await readFile9(denseVectorPath(workspacePath), "utf8");
  return JSON.parse(serialized);
}
1698
/**
 * Writes the sparse payload and, separately, its metadata as pretty-printed
 * JSON into the vectors directory (created when missing).
 *
 * @param {string} workspacePath - Workspace root.
 * @param {{metadata: object}} payload - Sparse payload to persist.
 * @returns {Promise<void>}
 */
async function writeSparsePayload(workspacePath, payload) {
  const pretty = (value) => JSON.stringify(value, null, 2);
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
  const payloadFile = sparseVectorPath(workspacePath);
  await writeFile6(payloadFile, pretty(payload), "utf8");
  const metaFile = sparseMetaPath(workspacePath);
  await writeFile6(metaFile, pretty(payload.metadata), "utf8");
}
1703
/** Reads and parses the latest sparse payload; rejects when the file is missing or not valid JSON. */
async function readSparsePayload(workspacePath) {
  const serialized = await readFile9(sparseVectorPath(workspacePath), "utf8");
  return JSON.parse(serialized);
}
1706
/**
 * Summarizes the state of the dense and sparse models: whether each is
 * enabled, its model id and resolved cache directory, whether its pull marker
 * exists (`available`) and whether its vector artifact exists.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {{enabled: boolean, modelId: string, cacheDir: string}} dense
 * @param {{enabled: boolean, modelId: string, cacheDir: string}} sparse
 * @param {boolean} uvAvailable - Passed through into the sparse status.
 * @returns {Promise<{dense: object, sparse: object}>}
 */
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
  const sparseCacheDir = resolveCacheDir(workspacePath, sparse.cacheDir);
  const densePulled = await fileExists(densePullMarker(workspacePath));
  const denseArtifact = await fileExists(denseVectorPath(workspacePath));
  const sparsePulled = await fileExists(sparsePullMarker(workspacePath));
  const sparseArtifact = await fileExists(sparseVectorPath(workspacePath));
  const denseStatus = {
    configured: dense.enabled,
    modelId: dense.modelId,
    cacheDir: denseCacheDir,
    available: densePulled,
    artifactExists: denseArtifact
  };
  const sparseStatus = {
    configured: sparse.enabled,
    modelId: sparse.modelId,
    cacheDir: sparseCacheDir,
    uvAvailable,
    available: sparsePulled,
    artifactExists: sparseArtifact
  };
  return { dense: denseStatus, sparse: sparseStatus };
}
1727
+
1728
+ // src/vector/text.ts
1729
/** Text embedded for a chunk: title, heading trail and body, blank-line separated, empty parts omitted. */
function createDenseChunkText(chunk) {
  const parts = [chunk.title];
  for (const heading of chunk.headingPath) {
    parts.push(heading);
  }
  parts.push(chunk.text);
  return parts.filter((part) => Boolean(part)).join("\n\n");
}
1732
/** Text sparse-encoded for a chunk: title, heading trail and body, blank-line separated, empty parts omitted. */
function createSparseChunkText(chunk) {
  const parts = [chunk.title];
  for (const heading of chunk.headingPath) {
    parts.push(heading);
  }
  parts.push(chunk.text);
  return parts.filter((part) => Boolean(part)).join("\n\n");
}
1735
+
1736
+ // src/vector/dense.ts
1737
+ var denseEmbedderFactory = null;
1738
/**
 * Creates the function that turns a text into a dense embedding.
 *
 * When `denseEmbedderFactory` is set, it is used as-is. Otherwise a
 * "feature-extraction" pipeline is loaded for `modelId` and called with mean
 * pooling and normalization; the first row of its output is returned.
 *
 * @param {string} cacheDir - Model cache directory.
 * @param {string} modelId - Model identifier passed to the pipeline.
 * @returns {Promise<(text: string) => Promise<number[]>>}
 */
async function createEmbedder(cacheDir, modelId) {
  if (denseEmbedderFactory) {
    return denseEmbedderFactory(cacheDir, modelId);
  }
  const { pipeline } = await getDenseTransformersRuntime(cacheDir);
  const extractor = await pipeline("feature-extraction", modelId);
  const poolingOptions = { pooling: "mean", normalize: true };
  return async (text) => {
    const tensor = await extractor(text, poolingOptions);
    const [embedding] = tensor.tolist();
    return embedding;
  };
}
1749
/**
 * Embeds every stored chunk, builds a vector index over the embeddings and
 * writes the resulting dense payload to the workspace.
 *
 * The index dimension is taken from the first embedding with a non-zero
 * length.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {object} args.config - Dense config (`cacheDir`, `modelId`,
 *   `indexHashTables`, `indexRandomSeed` are read).
 * @returns {Promise<{metadata: object, indexState: *, chunks: Array<object>}>}
 *   The payload that was written.
 */
async function buildDenseVectors({
  workspacePath,
  config
}) {
  const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  await mkdir7(cacheDir, { recursive: true });
  const embed = await createEmbedder(cacheDir, config.modelId);

  const records = [];
  let dimensions = 0;
  for (const { id, documentId, sourceId, title, uri, headingPath, text, ...rest } of chunks.map((chunk) => ({ ...chunk, chunk }))) {
    // Embeddings are computed one chunk at a time, in stored order.
    const embedding = await embed(createDenseChunkText(rest.chunk));
    dimensions ||= embedding.length;
    records.push({
      chunkId: id,
      documentId,
      sourceId,
      title,
      uri,
      headingPath,
      text,
      embedding
    });
  }

  const { indexHashTables: hashTables, indexRandomSeed: randomSeed, modelId } = config;
  const index = new VectorFieldIndex({
    numHashTables: hashTables,
    dimensions,
    random: createSeededRandom(randomSeed)
  });
  records.forEach((record) => {
    index.insert(record.chunkId, [record.embedding]);
  });

  const metadata = {
    createdAt: (/* @__PURE__ */ new Date()).toISOString(),
    modelId,
    dimensions,
    hashTables,
    randomSeed,
    chunkCount: records.length,
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const payload = {
    metadata,
    indexState: index.indexState,
    chunks: records
  };
  await writeDensePayload(workspacePath, payload);
  return payload;
}
1798
/**
 * Runs a dense (embedding) query against the stored dense index.
 *
 * The stored payload is read, the query is embedded with the configured model,
 * and a `VectorFieldIndex` is rebuilt from the stored metadata and index state
 * before it is queried.
 *
 * @param {{ workspacePath: string, config: object, query: string, topK: number }} options
 * @returns {Promise<unknown>} whatever `VectorFieldIndex#query` returns.
 */
async function denseQuery({
  workspacePath,
  config,
  query,
  topK
}) {
  const payload = await readDensePayload(workspacePath);
  const embed = await createEmbedder(resolveCacheDir(workspacePath, config.cacheDir), config.modelId);
  const queryVector = await embed(query);

  const { metadata: stored } = payload;
  const emptyIndex = new VectorFieldIndex({
    numHashTables: stored.hashTables,
    dimensions: stored.dimensions,
    random: createSeededRandom(stored.randomSeed)
  });
  const hydrated = emptyIndex.loadState(payload.indexState);
  return hydrated.query(queryVector, topK);
}
1815
+
1816
+ // src/vector/sparse.ts
1817
+ import { SparseVectorFieldIndex } from "@tryformation/querylight-ts";
1818
+ import { mkdir as mkdir8 } from "fs/promises";
1819
+ import path15 from "path";
1820
// Optional override consulted by createSparseQueryEncoder before it loads a tokenizer.
var sparseQueryEncoderFactory = null;
// Optional override consulted by buildSparseDocuments before it runs the Python encoder.
var sparseDocumentBuilderFactory = null;
1822
/**
 * Turns a list of token ids into a sparse query vector.
 *
 * Each distinct token id is looked up in `tokenWeights`; ids whose weight is
 * missing or not greater than zero are dropped. Keys of the result are the
 * stringified token ids.
 *
 * @param {Iterable<number>} tokenIds
 * @param {Record<string, number> | number[]} tokenWeights
 * @returns {Record<string, number>}
 */
function buildSparseQueryVector(tokenIds, tokenWeights) {
  const distinctIds = [...new Set(tokenIds)];
  return distinctIds.reduce((vector, tokenId) => {
    const weight = tokenWeights[tokenId] ?? 0;
    if (weight > 0) {
      vector[String(tokenId)] = weight;
    }
    return vector;
  }, {});
}
1832
/**
 * Extracts a flat list of finite numeric token ids from tokenizer output.
 *
 * Accepted shapes:
 *  - an object with a `data` property holding an array or an ArrayBuffer view,
 *  - a nested array (only the first row is used),
 *  - a flat array.
 * Anything else, and empty arrays, yield `[]`. Values that do not convert to a
 * finite number are dropped.
 *
 * @param {unknown} value
 * @returns {number[]}
 */
function normalizeTokenIds(value) {
  const toFiniteNumbers = (items) =>
    Array.from(items, (item) => Number(item)).filter((id) => Number.isFinite(id));

  if (value && typeof value === "object" && "data" in value) {
    const { data } = value;
    if (Array.isArray(data) || ArrayBuffer.isView(data)) {
      return toFiniteNumbers(data);
    }
  }
  if (!Array.isArray(value) || value.length === 0) {
    return [];
  }
  // Batched output: only the first row belongs to the single encoded text.
  const row = Array.isArray(value[0]) ? value[0] : value;
  return toFiniteNumbers(row);
}
1853
/**
 * Creates the async text -> sparse query vector function.
 *
 * When `sparseQueryEncoderFactory` is set, it is called with the same arguments
 * and its result is returned. Otherwise the tokenizer for `modelId` is loaded
 * from "@huggingface/transformers" (with the runtime's `env.cacheDir` pointed
 * at `cacheDir`), and each query is tokenized with truncation; the resulting
 * input ids are weighted through `queryTokenWeights`.
 *
 * @param {string} cacheDir
 * @param {string} modelId
 * @param {Record<string, number> | number[]} queryTokenWeights
 * @returns {Promise<(text: string) => Promise<Record<string, number>>>}
 */
async function createSparseQueryEncoder(cacheDir, modelId, queryTokenWeights) {
  const override = sparseQueryEncoderFactory;
  if (override) {
    return override(cacheDir, modelId, queryTokenWeights);
  }
  const runtime = await getDenseTransformersRuntime(cacheDir);
  const transformers = await import("@huggingface/transformers");
  runtime.env.cacheDir = cacheDir;
  const tokenizer = await transformers.AutoTokenizer.from_pretrained(modelId);
  return async function encodeQuery(text) {
    const encoded = await tokenizer([text], {
      truncation: true,
      return_attention_mask: false,
      return_token_type_ids: false
    });
    const tokenIds = normalizeTokenIds(encoded.input_ids);
    return buildSparseQueryVector(tokenIds, queryTokenWeights);
  };
}
1870
/**
 * Encodes all chunks into sparse document vectors.
 *
 * When `sparseDocumentBuilderFactory` is set, it is called with the same
 * arguments and its result is returned. Otherwise `uv` availability is checked
 * and the Python helper is invoked with an "encode_documents" request; its JSON
 * output is parsed and the returned vectors are matched back to chunks by id.
 * Chunks without a returned vector get an empty vector.
 *
 * @param {string} workspacePath
 * @param {object} config - sparse retrieval config (reads `modelId`, `documentTopTokens`).
 * @param {object[]} chunks
 * @returns {Promise<{ queryTokenWeights: unknown, vocabularySize: unknown, chunks: object[] }>}
 */
async function buildSparseDocuments(workspacePath, config, chunks) {
  if (sparseDocumentBuilderFactory) {
    return sparseDocumentBuilderFactory(workspacePath, config, chunks);
  }
  await ensureUvAvailable();

  const request = {
    action: "encode_documents",
    model_id: config.modelId,
    top_tokens: config.documentTopTokens,
    documents: chunks.map((chunk) => ({
      chunkId: chunk.id,
      text: createSparseChunkText(chunk)
    }))
  };
  const rawOutput = await runSparsePython({
    workspacePath,
    config,
    importMetaUrl: import.meta.url,
    payload: request
  });
  const output = JSON.parse(rawOutput);

  const vectorsByChunkId = /* @__PURE__ */ new Map();
  for (const document of output.documents) {
    vectorsByChunkId.set(document.chunkId, document.vector);
  }

  const encodedChunks = chunks.map((chunk) => {
    const { id: chunkId, documentId, sourceId, title, uri, headingPath, text } = chunk;
    return {
      chunkId,
      documentId,
      sourceId,
      title,
      uri,
      headingPath,
      text,
      vector: vectorsByChunkId.get(chunkId) ?? {}
    };
  });
  return {
    queryTokenWeights: output.query_token_weights,
    vocabularySize: output.vocabularySize,
    chunks: encodedChunks
  };
}
1905
/**
 * Builds the sparse vector index for the workspace and persists it.
 *
 * Chunks are read from `<workspace>/chunks/chunks.jsonl`, encoded through
 * `buildSparseDocuments`, inserted into a `SparseVectorFieldIndex`, and the
 * payload (metadata, index state, chunk records, query token weights) is
 * written via `writeSparsePayload` and returned.
 *
 * @param {{ workspacePath: string, config: object }} options - `config` is the
 *   sparse retrieval config.
 * @returns {Promise<object>} the written payload.
 */
async function buildSparseVectors({
  workspacePath,
  config
}) {
  const chunksFile = path15.join(workspacePath, "chunks", "chunks.jsonl");
  const chunks = await readJsonl(chunksFile);
  const built = await buildSparseDocuments(workspacePath, config, chunks);

  const index = new SparseVectorFieldIndex();
  built.chunks.forEach((record) => {
    index.insert(record.chunkId, [record.vector]);
  });

  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
  const metadata = {
    createdAt,
    modelId: config.modelId,
    vocabularySize: built.vocabularySize,
    documentTopTokens: config.documentTopTokens,
    queryEncoding: config.queryEncoding,
    documentEncoding: config.documentEncoding,
    chunkCount: built.chunks.length,
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const payload = {
    metadata,
    indexState: index.indexState,
    chunks: built.chunks,
    queryTokenWeights: built.queryTokenWeights
  };
  await writeSparsePayload(workspacePath, payload);
  return payload;
}
1934
/**
 * Runs a sparse query against the stored sparse index.
 *
 * The stored payload supplies both the query token weights used to encode the
 * query and the index state that is loaded into a fresh
 * `SparseVectorFieldIndex`.
 *
 * @param {{ workspacePath: string, config: object, query: string, topK: number }} options
 * @returns {Promise<unknown>} whatever `SparseVectorFieldIndex#query` returns.
 */
async function sparseQuery({
  workspacePath,
  config,
  query,
  topK
}) {
  const payload = await readSparsePayload(workspacePath);
  const encode = await createSparseQueryEncoder(
    resolveCacheDir(workspacePath, config.cacheDir),
    config.modelId,
    payload.queryTokenWeights
  );
  const queryVector = await encode(query);
  const hydrated = new SparseVectorFieldIndex().loadState(payload.indexState);
  return hydrated.query(queryVector, topK);
}
1947
+
1948
+ // src/vector/service.ts
1949
/**
 * Builds the dense and/or sparse vector artifacts for a workspace.
 *
 * Whether each kind is built is decided as follows:
 *  - an explicit override (`denseOverride` / `sparseOverride`) always wins;
 *  - otherwise, without `buildAvailableModels`, the config `enabled` flag is used;
 *  - with `buildAvailableModels`, a kind is also built when the model status
 *    reports it as available, and sparse additionally requires `uvAvailable`.
 *
 * @param {object} options
 * @param {string} options.workspacePath
 * @param {object} options.config - full config (reads `retrieval.dense` / `retrieval.sparse`).
 * @param {boolean} [options.denseOverride]
 * @param {boolean} [options.sparseOverride]
 * @param {boolean} [options.buildAvailableModels=false]
 * @returns {Promise<{ dense?: object, sparse?: object }>}
 */
async function buildVectorArtifacts({
  workspacePath,
  config,
  denseOverride,
  sparseOverride,
  buildAvailableModels = false
}) {
  let modelStatus = null;
  if (buildAvailableModels) {
    // A failing uv check is not an error here; it only disables sparse builds.
    let uvAvailable = true;
    try {
      await ensureUvAvailable();
    } catch {
      uvAvailable = false;
    }
    modelStatus = await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
  }

  const denseFromConfig = () => {
    if (!buildAvailableModels) {
      return config.retrieval.dense.enabled;
    }
    return config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available);
  };
  const sparseFromConfig = () => {
    if (!buildAvailableModels) {
      return config.retrieval.sparse.enabled;
    }
    const wanted = config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available);
    return wanted && Boolean(modelStatus?.sparse.uvAvailable);
  };

  const denseEnabled = denseOverride ?? denseFromConfig();
  const sparseEnabled = sparseOverride ?? sparseFromConfig();

  const result = {};
  if (denseEnabled) {
    result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
  }
  if (sparseEnabled) {
    result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
  }
  return result;
}
1975
+
1976
+ // src/index/index-store.ts
1977
+ import { readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
1978
+ import path16 from "path";
1979
/**
 * Writes the lexical index state and its metadata to `<workspace>/indexes`.
 *
 * Four files are written in sequence: a timestamped index/metadata pair (the
 * stamp is `metadata.createdAt` with ":" and "." replaced by "-") and the
 * `latest.json` / `latest.meta.json` pair with the same contents.
 *
 * @param {{ workspacePath: string, indexState: unknown, metadata: { createdAt: string } }} options
 * @returns {Promise<{ indexPath: string, metadataPath: string }>} paths of the "latest" pair.
 */
async function writeIndexArtifacts({
  workspacePath,
  indexState,
  metadata
}) {
  const inIndexesDir = (fileName) => path16.join(workspacePath, "indexes", fileName);
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");

  const stampedIndexPath = inIndexesDir(`${stamp}.json`);
  const stampedMetaPath = inIndexesDir(`${stamp}.meta.json`);
  const latestIndexPath = inIndexesDir("latest.json");
  const latestMetaPath = inIndexesDir("latest.meta.json");

  const indexPayload = JSON.stringify(indexState, null, 2);
  const metaPayload = JSON.stringify(metadata, null, 2);

  const writes = [
    [stampedIndexPath, indexPayload],
    [stampedMetaPath, metaPayload],
    [latestIndexPath, indexPayload],
    [latestMetaPath, metaPayload]
  ];
  for (const [target, contents] of writes) {
    await writeFile7(target, contents, "utf8");
  }
  return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
}
1997
/**
 * Reads and parses `<workspace>/indexes/latest.json`.
 *
 * @param {string} workspacePath
 * @returns {Promise<unknown>} the parsed index state.
 */
async function readLatestIndexState(workspacePath) {
  const latestPath = path16.join(workspacePath, "indexes", "latest.json");
  const serialized = await readFile10(latestPath, "utf8");
  return JSON.parse(serialized);
}
2000
+
2001
+ // src/index/querylight-indexer.ts
2002
/**
 * Creates a BM25 text field index whose analyzer lower-cases the input and
 * tokenizes it with `KeywordTokenizer`. The same analyzer instance is passed
 * for both analyzer arguments of `TextFieldIndex`.
 *
 * @returns {TextFieldIndex}
 */
function keywordFieldIndex() {
  const filters = [new LowerCaseTextFilter()];
  const tokenizer = new KeywordTokenizer();
  const keywordAnalyzer = new Analyzer(filters, tokenizer);
  return new TextFieldIndex(keywordAnalyzer, keywordAnalyzer, RankingAlgorithm.BM25);
}
2006
/**
 * Creates the field mapping for the lexical document index.
 *
 * `text` and `title` are BM25 text fields with default analyzers; `uri`,
 * `sourceId`, `tags`, `sourceType` and every entry of `extraFields` are
 * keyword fields.
 *
 * @param {string[]} [extraFields=[]] - additional keyword field names.
 * @returns {Record<string, TextFieldIndex>}
 */
function createIndexMapping(extraFields = []) {
  const bm25TextField = () => new TextFieldIndex(undefined, undefined, RankingAlgorithm.BM25);
  const mapping = {
    text: bm25TextField(),
    title: bm25TextField()
  };
  const keywordFields = ["uri", "sourceId", "tags", "sourceType", ...extraFields];
  for (const field of keywordFields) {
    mapping[field] = keywordFieldIndex();
  }
  return mapping;
}
2021
/**
 * Converts chunk metadata into index fields.
 *
 * Every non-null entry becomes a `metadata.<key>` field holding an array of
 * lower-cased strings: array values are converted element by element, other
 * values become a single-element array.
 *
 * @param {Record<string, unknown>} metadata
 * @returns {Record<string, string[]>}
 */
function flattenMetadata(metadata) {
  const toLowerString = (item) => String(item).toLowerCase();
  const flattened = {};
  Object.entries(metadata).forEach(([key, value]) => {
    if (value === null || value === undefined) {
      return;
    }
    flattened[`metadata.${key}`] = Array.isArray(value)
      ? value.map((item) => toLowerString(item))
      : [toLowerString(value)];
  });
  return flattened;
}
2036
/**
 * Builds the lexical index for a workspace, writes it to disk and then builds
 * the vector artifacts.
 *
 * Chunks, documents and sources are read from their JSONL stores. Every chunk
 * is indexed with its text, title, lower-cased uri/sourceId/tags/sourceType
 * and one `metadata.<key>` field per metadata entry.
 *
 * @param {object} options
 * @param {string} options.workspacePath
 * @param {boolean} [options.denseOverride]
 * @param {boolean} [options.sparseOverride]
 * @param {boolean} [options.buildAvailableModels=false]
 * @returns {Promise<{ metadata: object, indexPath: string, denseBuilt: boolean, sparseBuilt: boolean }>}
 */
async function buildIndex({
  workspacePath,
  denseOverride,
  sparseOverride,
  buildAvailableModels = false
}) {
  const config = await loadConfig(workspacePath);
  const readStore = (directory, fileName) => readJsonl(path17.join(workspacePath, directory, fileName));
  const chunks = await readStore("chunks", "chunks.jsonl");
  const documents = await readStore("documents", "documents.jsonl");
  const sources = await readStore("sources", "sources.jsonl");

  // Every metadata key seen on any chunk gets its own keyword field.
  const metadataFieldNames = /* @__PURE__ */ new Set();
  for (const chunk of chunks) {
    for (const key of Object.keys(chunk.metadata)) {
      metadataFieldNames.add(`metadata.${key}`);
    }
  }
  const index = new DocumentIndex(createIndexMapping([...metadataFieldNames]));

  chunks.forEach((chunk) => {
    const chunkMetadata = chunk.metadata;
    const fields = {
      text: [chunk.text],
      title: [chunk.title],
      uri: [chunk.uri.toLowerCase()],
      sourceId: [chunk.sourceId.toLowerCase()],
      tags: Array.isArray(chunkMetadata.tags) ? chunkMetadata.tags.map((tag) => String(tag).toLowerCase()) : [],
      sourceType: [String(chunkMetadata.sourceType ?? "").toLowerCase()],
      ...flattenMetadata(chunkMetadata)
    };
    index.index({ id: chunk.id, fields });
  });

  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
  const stamp = createdAt.replace(/[:.]/g, "-");
  const metadata = {
    id: `index_${stamp}`,
    createdAt,
    querylightVersion: "0.10.0",
    kbVersion: "0.1.0",
    documentCount: documents.length,
    chunkCount: chunks.length,
    sourceCount: sources.length,
    fields: Object.keys(index.mapping),
    indexHash: sha256(JSON.stringify(index.indexState))
  };

  const { indexPath } = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
  const vectors = await buildVectorArtifacts({
    workspacePath,
    config,
    denseOverride,
    sparseOverride,
    buildAvailableModels
  });
  return {
    metadata,
    indexPath,
    denseBuilt: Boolean(vectors.dense),
    sparseBuilt: Boolean(vectors.sparse)
  };
}
2089
+
2090
+ // src/query/search-service.ts
2091
+ import { readFile as readFile11 } from "fs/promises";
2092
+ import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2093
+ import path18 from "path";
2094
/**
 * Loads the latest stored index state into a freshly mapped `DocumentIndex`.
 *
 * The mapping's extra fields are the `metadata.*` keys found in the stored
 * `fieldState` (none when `fieldState` is absent).
 *
 * @param {string} workspacePath
 * @returns {Promise<unknown>} the result of `DocumentIndex#loadState`.
 */
async function loadHydratedIndex(workspacePath) {
  const state = await readLatestIndexState(workspacePath);
  const storedFields = Object.keys(state.fieldState ?? {});
  const metadataFields = storedFields.filter((field) => field.startsWith("metadata."));
  const mapping = createIndexMapping(metadataFields);
  const querylight = await import("@tryformation/querylight-ts");
  return new querylight.DocumentIndex(mapping).loadState(state);
}
2099
/**
 * Lower-cases filter values and drops empty ones.
 *
 * @param {string[] | null | undefined} values
 * @returns {string[]} an empty array when `values` is null or undefined.
 */
function normalizeFilterValues(values) {
  return (values ?? []).reduce((normalized, value) => {
    const lowered = value.toLowerCase();
    if (lowered) {
      normalized.push(lowered);
    }
    return normalized;
  }, []);
}
2102
/**
 * Checks a value against a list of already lower-cased candidates.
 *
 * @param {string} value - compared after lower-casing.
 * @param {string[]} candidates - an empty list matches everything.
 * @returns {boolean}
 */
function matchesAny(value, candidates) {
  if (candidates.length === 0) {
    return true;
  }
  const lowered = value.toLowerCase();
  return candidates.includes(lowered);
}
2105
/**
 * Checks whether a value starts with any of the given lower-cased prefixes.
 *
 * @param {string} value - compared after lower-casing.
 * @param {string[]} prefixes - an empty list matches everything.
 * @returns {boolean}
 */
function matchesPrefix(value, prefixes) {
  if (prefixes.length === 0) {
    return true;
  }
  const lowered = value.toLowerCase();
  for (const prefix of prefixes) {
    if (lowered.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
2112
/**
 * Builds the lexical search query for a user query plus index-level filters.
 *
 * Scoring clauses: title AND-match (boost 6), text AND-match (boost 4) and
 * text OR-match (boost 2). A term filter on `sourceId`, `sourceType` or `tags`
 * is only added when exactly one value was requested for that field; every
 * metadata filter becomes a term filter on `metadata.<key>`.
 *
 * @param {string} query
 * @param {object} filters - reads `sourceId(s)`, `sourceType(s)`, `tag(s)` and `metadata`.
 * @returns {BoolQuery}
 */
function buildSearchQuery(query, filters) {
  const collect = (single, many) =>
    normalizeFilterValues([single, ...many ?? []].filter((value) => Boolean(value)));
  const sourceIds = collect(filters.sourceId, filters.sourceIds);
  const sourceTypes = collect(filters.sourceType, filters.sourceTypes);
  const tags = collect(filters.tag, filters.tags);

  const should = [
    new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
    new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
    new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
  ];

  const filter = [];
  const addSingleValueTerm = (field, values) => {
    if (values.length === 1) {
      filter.push(new TermQuery({ field, text: values[0] }));
    }
  };
  addSingleValueTerm("sourceId", sourceIds);
  addSingleValueTerm("sourceType", sourceTypes);
  addSingleValueTerm("tags", tags);
  for (const { key, value } of filters.metadata ?? []) {
    filter.push(new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }));
  }

  return new BoolQuery({ should, filter });
}
2130
/**
 * Tells whether a value is a string that the built-in date parser accepts.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isValidDate(value) {
  if (typeof value !== "string") {
    return false;
  }
  return !Number.isNaN(Date.parse(value));
}
2133
/**
 * Reads a date field from a document.
 *
 * @param {object} document
 * @param {string} field - name of the property to read.
 * @returns {string | null} the stored string when it is a valid date, otherwise null.
 */
function documentDateValue(document, field) {
  const candidate = document[field];
  if (typeof candidate !== "string") {
    return null;
  }
  return isValidDate(candidate) ? candidate : null;
}
2137
/**
 * Checks a document against every requested date range.
 *
 * A range fails when the document has no valid date in `field`, when the date
 * is before `from`, or when it is after `to`. Both bounds are inclusive and
 * optional; an empty list of ranges always matches.
 *
 * @param {object} document
 * @param {{ field: string, from?: string, to?: string }[]} dateRanges
 * @returns {boolean}
 */
function matchesDateRanges(document, dateRanges) {
  for (const { field, from, to } of dateRanges) {
    const value = documentDateValue(document, field);
    if (!value) {
      return false;
    }
    const timestamp = new Date(value).getTime();
    if (from && !(timestamp >= new Date(from).getTime())) {
      return false;
    }
    if (to && !(timestamp <= new Date(to).getTime())) {
      return false;
    }
  }
  return true;
}
2149
/**
 * Resolves the source type of a chunk.
 *
 * Precedence: the document's `sourceType`, then the source's `type`, then a
 * string `sourceType` in the chunk metadata, and finally "text". Only
 * null/undefined values are skipped.
 *
 * @param {{ metadata: Record<string, unknown> }} chunk
 * @param {{ sourceType?: string } | null | undefined} document
 * @param {{ type?: string } | null | undefined} source
 * @returns {string}
 */
function fallbackSourceType(chunk, document, source) {
  const { sourceType: metadataValue } = chunk.metadata;
  if (document?.sourceType != null) {
    return document.sourceType;
  }
  if (source?.type != null) {
    return source.type;
  }
  return typeof metadataValue === "string" ? metadataValue : "text";
}
2153
/**
 * Decides whether a chunk passes all requested filters.
 *
 * Singular and plural variants of each filter are merged and compared
 * case-insensitively. Checks run in this order: source id, source type,
 * source name, uri prefix, presence of a publication date, tags (any match),
 * metadata (all pairs must match), and finally the date ranges. A chunk
 * without a document only passes when no date range was requested.
 *
 * @param {object} chunk
 * @param {object | undefined} document - the chunk's document, if known.
 * @param {object | undefined} source - the chunk's source, if known.
 * @param {object} filters - the destructured filter options.
 * @returns {boolean}
 */
function filterChunk(chunk, document, source, {
  sourceId,
  sourceIds,
  sourceName,
  sourceNames,
  sourceType,
  sourceTypes,
  uriPrefix,
  uriPrefixes,
  hasPublicationDate,
  tag,
  tags,
  metadata,
  dateRanges
}) {
  const merge = (single, many) =>
    normalizeFilterValues([single, ...many ?? []].filter((value) => Boolean(value)));
  const wantedSourceIds = merge(sourceId, sourceIds);
  const wantedSourceNames = merge(sourceName, sourceNames);
  const wantedSourceTypes = merge(sourceType, sourceTypes);
  const wantedUriPrefixes = merge(uriPrefix, uriPrefixes);
  const wantedTags = merge(tag, tags);

  if (!matchesAny(chunk.sourceId, wantedSourceIds)) {
    return false;
  }
  if (!matchesAny(fallbackSourceType(chunk, document, source), wantedSourceTypes)) {
    return false;
  }
  if (wantedSourceNames.length > 0 && !matchesAny(source?.name ?? "", wantedSourceNames)) {
    return false;
  }
  if (!matchesPrefix(document?.uri ?? chunk.uri, wantedUriPrefixes)) {
    return false;
  }
  if (hasPublicationDate && (!document || !documentDateValue(document, "publicationDate"))) {
    return false;
  }

  if (wantedTags.length > 0) {
    const chunkTags = Array.isArray(chunk.metadata.tags)
      ? chunk.metadata.tags.map((item) => String(item).toLowerCase())
      : [];
    const sharesTag = wantedTags.some((wanted) => chunkTags.includes(wanted));
    if (!sharesTag) {
      return false;
    }
  }

  if (metadata?.length) {
    const matchesPair = ({ key, value }) => {
      const candidate = chunk.metadata[key];
      if (Array.isArray(candidate)) {
        return candidate.map((item) => String(item).toLowerCase()).includes(value.toLowerCase());
      }
      return String(candidate ?? "").toLowerCase() === value.toLowerCase();
    };
    if (!metadata.every(matchesPair)) {
      return false;
    }
  }

  return document ? matchesDateRanges(document, dateRanges) : dateRanges.length === 0;
}
2208
/**
 * Comparator that orders date strings newest first.
 *
 * Falsy values are treated as negative infinity, so entries without a date
 * sort after dated ones.
 *
 * @param {string | null | undefined} left
 * @param {string | null | undefined} right
 * @returns {number}
 */
function sortDateDescending(left, right) {
  const toTime = (value) => (value ? new Date(value).getTime() : Number.NEGATIVE_INFINITY);
  const leftTime = toTime(left);
  return toTime(right) - leftTime;
}
2213
/**
 * Picks the date used to order documents when no query is given.
 *
 * The first valid date among `publicationDate`, `lastChangedAt`, `lastSeenAt`,
 * `firstSeenAt` and `crawledAt` (in that order) is returned.
 *
 * @param {object} document
 * @returns {string | null}
 */
function latestSortDate(document) {
  const candidateFields = ["publicationDate", "lastChangedAt", "lastSeenAt", "firstSeenAt", "crawledAt"];
  for (const field of candidateFields) {
    const value = documentDateValue(document, field);
    if (value != null) {
      return value;
    }
  }
  return null;
}
2216
/**
 * Picks the chunk that represents a document in listings.
 *
 * Chunks are ranked by heading path depth (shallowest first), then by uri and
 * finally by id. The input array is not modified.
 *
 * @param {{ id: string, uri: string, headingPath: string[] }[]} chunks
 * @returns {object | undefined} undefined for an empty list.
 */
function representativeChunk(chunks) {
  const byDepthThenUriThenId = (left, right) => {
    if (left.headingPath.length !== right.headingPath.length) {
      return left.headingPath.length - right.headingPath.length;
    }
    return left.uri !== right.uri
      ? left.uri.localeCompare(right.uri)
      : left.id.localeCompare(right.id);
  };
  const ranked = [...chunks];
  ranked.sort(byDepthThenUriThenId);
  return ranked[0] ?? chunks[0] ?? undefined;
}
2227
/**
 * Removes Markdown markup from text so it can be shown as a plain snippet.
 *
 * In order: fenced code blocks are replaced by a space, images are reduced to
 * their alt text, links to their label, inline code to its content, and
 * leading heading markers and list bullets are removed.
 *
 * @param {string} text
 * @returns {string}
 */
function stripSnippetMarkdown(text) {
  return text
    .replace(/```[\s\S]*?```/g, " ")
    // Images must be handled before links: the link pattern also matches the
    // "[alt](url)" part of "![alt](url)" and would leave a stray "!" behind.
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .replace(/`([^`]+)`/g, "$1")
    .replace(/^#{1,6}\s+/gm, "")
    .replace(/^\s*[-*+]\s+/gm, "");
}
2230
/**
 * Splits text into plain-text paragraphs for snippet building.
 *
 * Markdown markup is stripped first; the text is then split on blank lines,
 * whitespace runs inside each paragraph are collapsed to single spaces, and
 * empty paragraphs are dropped.
 *
 * @param {string} text
 * @returns {string[]}
 */
function extractSnippetParagraphs(text) {
  const paragraphs = [];
  const blocks = stripSnippetMarkdown(text).split(/\n\s*\n+/);
  blocks.forEach((block) => {
    const collapsed = block.replace(/\s+/g, " ").trim();
    if (collapsed) {
      paragraphs.push(collapsed);
    }
  });
  return paragraphs;
}
2233
/**
 * Builds a snippet from consecutive paragraphs around the first query match.
 *
 * The window starts at the first paragraph containing any whitespace-separated
 * query term (case-insensitive), or at the first paragraph when nothing
 * matches. It then grows one neighbour at a time until `targetLength` is
 * reached or no paragraphs remain, preferring the following paragraph when it
 * is at least as long as the preceding one.
 *
 * @param {string[]} paragraphs
 * @param {string} query
 * @param {number} [targetLength=900]
 * @returns {string} the selected paragraphs joined by blank lines; "" for no paragraphs.
 */
function buildParagraphSnippet(paragraphs, query, targetLength = 900) {
  const count = paragraphs.length;
  if (count === 0) {
    return "";
  }
  const terms = query.toLowerCase().split(/\s+/).filter((term) => term.length > 0);
  const mentionsQuery = (paragraph) => {
    const lowered = paragraph.toLowerCase();
    return terms.some((term) => lowered.includes(term));
  };
  const lengthAt = (position) => paragraphs[position]?.length ?? 0;

  let start = Math.max(paragraphs.findIndex(mentionsQuery), 0);
  let end = start + 1;
  let totalLength = lengthAt(start);
  while (totalLength < targetLength && (start > 0 || end < count)) {
    const canExtendForward = end < count;
    const canExtendBackward = start > 0;
    const previousLength = canExtendBackward ? lengthAt(start - 1) : -1;
    const nextLength = canExtendForward ? lengthAt(end) : -1;
    if (canExtendForward && nextLength >= previousLength) {
      // +2 accounts for the blank-line separator added when joining.
      totalLength += nextLength + 2;
      end += 1;
    } else if (canExtendBackward) {
      totalLength += previousLength + 2;
      start -= 1;
    } else {
      break;
    }
  }
  return paragraphs.slice(start, end).join("\n\n").trim();
}
2262
/**
 * Builds a snippet for a single chunk text using the default target length.
 *
 * @param {string} text
 * @param {string} query
 * @returns {string}
 */
function buildSnippet(text, query) {
  const paragraphs = extractSnippetParagraphs(text);
  return buildParagraphSnippet(paragraphs, query);
}
2265
/**
 * Flattens the chunks of a document into paragraphs that remember which chunk
 * (by position in `chunks`) they came from.
 *
 * @param {{ text: string }[]} chunks
 * @returns {{ chunkIndex: number, text: string }[]}
 */
function buildDocumentParagraphs(chunks) {
  const paragraphs = [];
  chunks.forEach((candidate, chunkIndex) => {
    for (const text of extractSnippetParagraphs(candidate.text)) {
      paragraphs.push({ chunkIndex, text });
    }
  });
  return paragraphs;
}
2270
/**
 * Builds a snippet that may extend beyond the hit chunk into its neighbours.
 *
 * The anchor is the first paragraph of chunk `chunkIndex` that contains any
 * query term (case-insensitive); without a match it is the chunk's first
 * paragraph, and when the chunk has no paragraphs it is paragraph 0. The
 * window then grows one paragraph at a time until `targetLength` is reached,
 * preferring the following paragraph when it is at least as long as the
 * preceding one.
 *
 * @param {{ chunkIndex: number, text: string }[]} paragraphs
 * @param {number} chunkIndex
 * @param {string} query
 * @param {number} [targetLength=1200]
 * @returns {string}
 */
function buildExpandedParagraphSnippet(paragraphs, chunkIndex, query, targetLength = 1200) {
  const count = paragraphs.length;
  if (count === 0) {
    return "";
  }
  const terms = query.toLowerCase().split(/\s+/).filter((term) => term.length > 0);

  const ownIndexes = [];
  paragraphs.forEach((paragraph, index) => {
    if (paragraph?.chunkIndex === chunkIndex) {
      ownIndexes.push(index);
    }
  });
  const matchingIndex = ownIndexes.find((index) => {
    const lowered = paragraphs[index]?.text.toLowerCase() ?? "";
    return terms.some((term) => lowered.includes(term));
  });
  const anchorIndex = matchingIndex ?? ownIndexes[0] ?? 0;

  const lengthAt = (position) => paragraphs[position]?.text.length ?? 0;
  let start = anchorIndex;
  let end = anchorIndex + 1;
  let totalLength = lengthAt(anchorIndex);
  while (totalLength < targetLength && (start > 0 || end < count)) {
    const canExtendForward = end < count;
    const canExtendBackward = start > 0;
    const previousLength = canExtendBackward ? lengthAt(start - 1) : -1;
    const nextLength = canExtendForward ? lengthAt(end) : -1;
    if (canExtendForward && nextLength >= previousLength) {
      // +2 accounts for the blank-line separator added when joining.
      totalLength += nextLength + 2;
      end += 1;
    } else if (canExtendBackward) {
      totalLength += previousLength + 2;
      start -= 1;
    } else {
      break;
    }
  }
  const selected = paragraphs.slice(start, end).map((paragraph) => paragraph.text);
  return selected.join("\n\n").trim();
}
2300
/**
 * Builds a snippet for a hit, extending into neighbouring chunks of the same
 * document when possible.
 *
 * The document's chunks are rebuilt from its normalized file (and cached per
 * document id in `orderedChunkCache`). The function falls back to a snippet of
 * the chunk text alone when there is no document, the normalized file does not
 * exist, or the chunk id is not among the rebuilt chunks.
 *
 * @param {{ id: string, text: string }} chunk
 * @param {string} query
 * @param {{ document?: object, config: object, orderedChunkCache: Map<string, object[]> }} context
 * @returns {Promise<string>}
 */
async function buildSnippetWithAdjacentChunks(chunk, query, {
  document,
  config,
  orderedChunkCache
}) {
  const chunkOnlySnippet = () => buildSnippet(chunk.text, query);
  if (!document) {
    return chunkOnlySnippet();
  }

  let orderedChunks = orderedChunkCache.get(document.id);
  if (!orderedChunks) {
    const normalizedFileExists = await fileExists(document.normalizedPath);
    if (!normalizedFileExists) {
      return chunkOnlySnippet();
    }
    const raw = await readFile11(document.normalizedPath, "utf8");
    orderedChunks = buildChunksForDocument(document, raw, config);
    orderedChunkCache.set(document.id, orderedChunks);
  }

  const currentIndex = orderedChunks.findIndex((candidate) => candidate.id === chunk.id);
  if (currentIndex < 0) {
    return chunkOnlySnippet();
  }

  const paragraphs = buildDocumentParagraphs(orderedChunks);
  return paragraphs.length === 0
    ? buildSnippet(orderedChunks[currentIndex].text, query)
    : buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
}
2328
/**
 * Cleans a title for display: a trailing "| Querylight TS Demo" suffix
 * (case-insensitive) is removed, whitespace runs are collapsed and the result
 * is trimmed.
 *
 * @param {string} title
 * @returns {string}
 */
function normalizeDisplayTitle(title) {
  const withoutSiteSuffix = title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "");
  const singleSpaced = withoutSiteSuffix.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
2331
/**
 * Chooses the title shown for a search result.
 *
 * The deepest non-empty heading wins when it differs (case-insensitively) from
 * the document title; otherwise the document title is used, and "Untitled"
 * when neither is available.
 *
 * @param {{ title: string, headingPath: string[] }} chunk
 * @returns {string}
 */
function chooseResultTitle(chunk) {
  const documentTitle = normalizeDisplayTitle(chunk.title);
  const headings = chunk.headingPath
    .map((heading) => normalizeDisplayTitle(heading))
    .filter((heading) => Boolean(heading));
  const leafHeading = headings[headings.length - 1];

  const leafAddsInformation = Boolean(leafHeading) && leafHeading.toLowerCase() !== documentTitle.toLowerCase();
  if (leafAddsInformation) {
    return leafHeading;
  }
  return documentTitle || (leafHeading ?? "Untitled");
}
2343
/**
 * Normalizes text for equality comparison: lower-cased, every run of
 * characters outside a-z and 0-9 replaced by a single space, and trimmed.
 *
 * @param {string} value
 * @returns {string}
 */
function normalizeComparisonText(value) {
  const lowered = value.toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9]+/g, " ");
  return alphanumericOnly.replace(/\s+/g, " ").trim();
}
2346
/**
 * Reduces a uri to a lower-cased path without trailing slashes.
 *
 * For values that parse as a URL only the pathname is used ("/" when it is
 * empty after trimming). Values that do not parse are lower-cased as a whole
 * and stripped of trailing slashes.
 *
 * @param {string} uri
 * @returns {string}
 */
function normalizeUriPath(uri) {
  let pathname;
  try {
    pathname = new URL(uri).pathname;
  } catch {
    // Not an absolute URL (e.g. a relative file path): normalize the raw value.
    return uri.toLowerCase().replace(/\/+$/, "");
  }
  const withoutTrailingSlashes = pathname.replace(/\/+$/, "");
  return (withoutTrailingSlashes || "/").toLowerCase();
}
2355
/**
 * Counts the non-empty path segments of a uri; the root path counts as 0.
 *
 * @param {string} uri
 * @returns {number}
 */
function uriSpecificity(uri) {
  const normalized = normalizeUriPath(uri);
  if (normalized === "/") {
    return 0;
  }
  const segments = normalized.split("/").filter((segment) => segment.length > 0);
  return segments.length;
}
2362
/**
 * Tells whether `candidate` duplicates `existing` at a more specific uri.
 *
 * Both results must share the source id and the same non-empty normalized
 * title, have different normalized paths of which one is nested under the
 * other, and the candidate's uri must have more path segments.
 *
 * @param {{ sourceId: string, title: string, uri: string }} candidate
 * @param {{ sourceId: string, title: string, uri: string }} existing
 * @returns {boolean}
 */
function isMoreSpecificDuplicate(candidate, existing) {
  if (candidate.sourceId !== existing.sourceId) {
    return false;
  }

  const candidateTitle = normalizeComparisonText(candidate.title);
  const existingTitle = normalizeComparisonText(existing.title);
  const sameTitle = Boolean(candidateTitle) && candidateTitle === existingTitle;
  if (!sameTitle) {
    return false;
  }

  const candidatePath = normalizeUriPath(candidate.uri);
  const existingPath = normalizeUriPath(existing.uri);
  if (candidatePath === existingPath) {
    return false;
  }

  // "/docs" must only count as a parent of "/docs/x", not of "/docs-old".
  const asParentPrefix = (path) => (path === "/" ? "/" : `${path}/`);
  const candidateIsChild = candidatePath.startsWith(asParentPrefix(existingPath));
  const existingIsChild = existingPath.startsWith(asParentPrefix(candidatePath));
  if (!(candidateIsChild || existingIsChild)) {
    return false;
  }
  return uriSpecificity(candidate.uri) > uriSpecificity(existing.uri);
}
2383
/**
 * Collapses results that duplicate each other at parent/child uris.
 *
 * Results are processed in order. When a result and an already kept one are
 * duplicates (in either direction), only one survives in the kept result's
 * position: the new result replaces it if it is the more specific one.
 *
 * @param {object[]} results - expected to be ordered best first.
 * @param {number} topK - maximum number of results returned.
 * @returns {object[]}
 */
function collapseAggregateDuplicates(results, topK) {
  const kept = [];
  results.forEach((result) => {
    const slot = kept.findIndex(
      (existing) => isMoreSpecificDuplicate(result, existing) || isMoreSpecificDuplicate(existing, result)
    );
    if (slot < 0) {
      kept.push(result);
    } else if (isMoreSpecificDuplicate(result, kept[slot])) {
      kept[slot] = result;
    }
  });
  return kept.slice(0, topK);
}
2399
/**
 * Collapses chunk-level results to one result per document and re-ranks them.
 *
 * For each document the best-scoring chunk is kept; its score becomes
 * `best + 0.35 * (sum of the other chunk scores) + 0.2 * (number of other
 * chunks)`. The per-document results are sorted by that score (descending) and
 * passed through `collapseAggregateDuplicates`.
 *
 * @param {{ documentId: string, score: number }[]} results
 * @param {number} topK
 * @returns {object[]}
 */
function rerankResultsByDocument(results, topK) {
  const groups = /* @__PURE__ */ new Map();
  for (const result of results) {
    if (!groups.has(result.documentId)) {
      groups.set(result.documentId, []);
    }
    groups.get(result.documentId).push(result);
  }

  const reranked = [];
  for (const group of groups.values()) {
    const [best, ...rest] = [...group].sort((left, right) => right.score - left.score);
    if (!best) {
      continue;
    }
    let tailScore = 0;
    for (const other of rest) {
      tailScore += other.score;
    }
    const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
    reranked.push({ ...best, score: aggregateScore });
  }
  reranked.sort((left, right) => right.score - left.score);
  return collapseAggregateDuplicates(reranked, topK);
}
2421
+ async function searchIndex({
2422
+ workspacePath,
2423
+ query,
2424
+ topK,
2425
+ sourceId,
2426
+ sourceIds,
2427
+ sourceName,
2428
+ sourceNames,
2429
+ sourceType,
2430
+ sourceTypes,
2431
+ uriPrefix,
2432
+ uriPrefixes,
2433
+ hasPublicationDate,
2434
+ tag,
2435
+ tags,
2436
+ metadata,
2437
+ dateRanges = [],
2438
+ retrievalMode,
2439
+ showChunks = false
2440
+ }) {
2441
+ const config = await loadConfig(workspacePath);
2442
+ const mode = retrievalMode ?? config.retrieval.defaultMode;
2443
+ const candidateLimit = Math.max(topK * 5, 50);
2444
+ const chunks = new Map((await readJsonl(path18.join(workspacePath, "chunks", "chunks.jsonl"))).map((chunk) => [chunk.id, chunk]));
2445
+ const documents = new Map((await readJsonl(path18.join(workspacePath, "documents", "documents.jsonl"))).map((document) => [document.id, document]));
2446
+ const sources = new Map((await readJsonl(path18.join(workspacePath, "sources", "sources.jsonl"))).map((source) => [source.id, source]));
2447
+ const orderedChunkCache = /* @__PURE__ */ new Map();
2448
+ const normalizedQuery = query.trim();
2449
+ const filterIds = [...chunks.values()].filter((chunk) => filterChunk(chunk, documents.get(chunk.documentId), sources.get(chunk.sourceId), { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges })).map((chunk) => chunk.id);
2450
+ if (normalizedQuery.length === 0) {
2451
+ const chunksByDocument = /* @__PURE__ */ new Map();
2452
+ for (const chunkId of filterIds) {
2453
+ const chunk = chunks.get(chunkId);
2454
+ if (!chunk) {
2455
+ continue;
2456
+ }
2457
+ const existing = chunksByDocument.get(chunk.documentId);
2458
+ if (existing) {
2459
+ existing.push(chunk);
2460
+ } else {
2461
+ chunksByDocument.set(chunk.documentId, [chunk]);
2462
+ }
2463
+ }
2464
+ const latestResults = await Promise.all(
2465
+ [...chunksByDocument.entries()].sort(([leftDocumentId], [rightDocumentId]) => {
2466
+ const leftDocument = documents.get(leftDocumentId);
2467
+ const rightDocument = documents.get(rightDocumentId);
2468
+ return sortDateDescending(leftDocument ? latestSortDate(leftDocument) : null, rightDocument ? latestSortDate(rightDocument) : null);
2469
+ }).slice(0, topK).map(async ([documentId, documentChunks]) => {
2470
+ const document = documents.get(documentId);
2471
+ const chunk = representativeChunk(documentChunks);
2472
+ if (!chunk || !document) {
2473
+ return null;
2474
+ }
2475
+ return {
2476
+ chunkId: chunk.id,
2477
+ documentId: chunk.documentId,
2478
+ sourceId: chunk.sourceId,
2479
+ sourceType: document.sourceType,
2480
+ score: 0,
2481
+ title: chooseResultTitle(chunk),
2482
+ uri: chunk.uri,
2483
+ headingPath: chunk.headingPath,
2484
+ snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
2485
+ document,
2486
+ config,
2487
+ orderedChunkCache
2488
+ }),
2489
+ text: showChunks ? chunk.text : void 0,
2490
+ publicationDate: document.publicationDate ?? null,
2491
+ firstSeenAt: document.firstSeenAt,
2492
+ lastSeenAt: document.lastSeenAt,
2493
+ lastChangedAt: document.lastChangedAt,
2494
+ metadata: chunk.metadata
2495
+ };
2496
+ })
2497
+ );
2498
+ return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
2499
+ }
2500
+ const lexicalHits = async () => {
2501
+ const index = await loadHydratedIndex(workspacePath);
2502
+ const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
2503
+ return all.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit);
2504
+ };
2505
+ const denseHits = async () => {
2506
+ if (!await fileExists(denseVectorPath(workspacePath))) {
2507
+ throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
2508
+ }
2509
+ return denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
2510
+ };
2511
+ const sparseHits = async () => {
2512
+ if (!await fileExists(sparseVectorPath(workspacePath))) {
2513
+ throw new CliError("sparse vector index is not built; run `qli models pull --sparse` and `qli rebuild`", "SPARSE_INDEX_MISSING", 7 /* QueryError */);
2514
+ }
2515
+ return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
2516
+ };
2517
+ let hits;
2518
+ if (mode === "lexical") {
2519
+ hits = await lexicalHits();
2520
+ } else if (mode === "dense") {
2521
+ hits = await denseHits();
2522
+ } else if (mode === "sparse") {
2523
+ hits = await sparseHits();
2524
+ } else {
2525
+ const rankings = [await lexicalHits()];
2526
+ if (await fileExists(denseVectorPath(workspacePath))) {
2527
+ rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
2528
+ }
2529
+ if (await fileExists(sparseVectorPath(workspacePath))) {
2530
+ rankings.push(await sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((sparse) => sparse.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
2531
+ }
2532
+ hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
2533
+ }
2534
+ const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
2535
+ const chunk = chunks.get(chunkId);
2536
+ if (!chunk) {
2537
+ return null;
2538
+ }
2539
+ return {
2540
+ chunkId,
2541
+ documentId: chunk.documentId,
2542
+ sourceId: chunk.sourceId,
2543
+ sourceType: documents.get(chunk.documentId)?.sourceType ?? "text",
2544
+ score,
2545
+ title: chooseResultTitle(chunk),
2546
+ uri: chunk.uri,
2547
+ headingPath: chunk.headingPath,
2548
+ snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2549
+ document: documents.get(chunk.documentId),
2550
+ config,
2551
+ orderedChunkCache
2552
+ }),
2553
+ text: showChunks ? chunk.text : void 0,
2554
+ publicationDate: documents.get(chunk.documentId)?.publicationDate ?? null,
2555
+ firstSeenAt: documents.get(chunk.documentId)?.firstSeenAt ?? chunk.firstSeenAt,
2556
+ lastSeenAt: documents.get(chunk.documentId)?.lastSeenAt ?? chunk.lastSeenAt,
2557
+ lastChangedAt: documents.get(chunk.documentId)?.lastChangedAt ?? chunk.lastChangedAt,
2558
+ metadata: chunk.metadata
2559
+ };
2560
+ }));
2561
+ const results = rawResults.filter((result) => result != null);
2562
+ return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
2563
+ }
2564
+
2565
+ // src/query/related-service.ts
2566
+ import path19 from "path";
2567
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Nullish or missing components count as 0, so vectors of different lengths
 * are compared as if the shorter one were zero-padded.
 *
 * @param {ArrayLike<number>} left - First vector.
 * @param {ArrayLike<number>} right - Second vector.
 * @returns {number} `dot / (|left| * |right|)`, or 0 when either vector has
 *   zero magnitude.
 */
function cosineSimilarity(left, right) {
  // Walk the longer vector: stopping at `left.length` would leave trailing
  // components of `right` out of its norm, making the result depend on the
  // argument order (and potentially exceed 1).
  const length = Math.max(left.length, right.length);
  let dot = 0;
  let leftNorm = 0;
  let rightNorm = 0;
  for (let index = 0; index < length; index += 1) {
    const leftValue = left[index] ?? 0;
    const rightValue = right[index] ?? 0;
    dot += leftValue * rightValue;
    leftNorm += leftValue * leftValue;
    rightNorm += rightValue * rightValue;
  }
  // A zero vector has no direction; report "no similarity" instead of NaN.
  if (leftNorm === 0 || rightNorm === 0) {
    return 0;
  }
  return dot / (Math.sqrt(leftNorm) * Math.sqrt(rightNorm));
}
2583
/**
 * Scales a vector to unit Euclidean length.
 *
 * @param {number[]} values - Vector components.
 * @returns {number[]} A new array holding each component divided by the
 *   vector's magnitude, or all zeros when the magnitude is 0.
 */
function normalizeVector(values) {
  let sumOfSquares = 0;
  values.forEach((component) => {
    sumOfSquares += component * component;
  });
  const magnitude = Math.sqrt(sumOfSquares);
  // A zero-magnitude vector cannot be scaled; map it to zeros rather than NaN.
  const rescale = magnitude === 0 ? () => 0 : (component) => component / magnitude;
  return values.map((component) => rescale(component));
}
2590
/**
 * Averages the `embedding` arrays of several records component by component
 * and passes the mean vector through `normalizeVector`.
 *
 * @param {Array<{embedding: number[]}>} records - Records to average.
 * @param {number} dimensions - Number of components to read from each embedding;
 *   missing components count as 0.
 * @returns {number[]} The result of `normalizeVector` applied to the mean vector.
 */
function averageEmbeddings(records, dimensions) {
  const sums = new Array(dimensions).fill(0);
  for (const record of records) {
    let slot = 0;
    while (slot < dimensions) {
      sums[slot] = (sums[slot] ?? 0) + (record.embedding[slot] ?? 0);
      slot += 1;
    }
  }
  // Divide by at least 1 so an empty record list yields zeros, not NaN.
  const divisor = Math.max(records.length, 1);
  const mean = sums.map((sum) => sum / divisor);
  return normalizeVector(mean);
}
2599
/**
 * Finds the single document identified by `selector`.
 *
 * The selector is trimmed and lower-cased, then compared against each
 * document's lower-cased `id`, `uri` and (when present) `canonicalUri`.
 *
 * @param {Array<object>} documents - Candidate document records.
 * @param {string} selector - Document id or URI supplied by the caller.
 * @returns {object} The only matching document.
 * @throws {CliError} `DOCUMENT_NOT_FOUND` when nothing matches, or
 *   `DOCUMENT_SELECTOR_AMBIGUOUS` when more than one document matches.
 */
function resolveDocumentSelector(documents, selector) {
  const wanted = selector.trim().toLowerCase();
  const matchesSelector = (candidate) => {
    if (candidate.id.toLowerCase() === wanted) {
      return true;
    }
    if (candidate.uri.toLowerCase() === wanted) {
      return true;
    }
    return candidate.canonicalUri?.toLowerCase() === wanted;
  };
  const hits = documents.filter(matchesSelector);
  switch (hits.length) {
    case 0:
      throw new CliError(`document not found: ${selector}`, "DOCUMENT_NOT_FOUND", 2 /* InvalidArguments */);
    case 1:
      return hits[0];
    default:
      throw new CliError(`document selector is ambiguous: ${selector}`, "DOCUMENT_SELECTOR_AMBIGUOUS", 2 /* InvalidArguments */);
  }
}
2612
/**
 * Builds one vector per document from its dense chunk records.
 *
 * Chunks are grouped by `documentId`; each document that owns at least one
 * chunk gets an entry whose embedding comes from `averageEmbeddings`.
 * Documents without chunks are left out.
 *
 * @param {Array<{id: string}>} documents - Document records.
 * @param {Array<{documentId: string}>} denseChunks - Dense chunk records.
 * @param {number} dimensions - Forwarded to `averageEmbeddings`.
 * @returns {Map<string, {document: object, embedding: *}>} Entries keyed by
 *   document id, in the order the documents were given.
 */
function buildDocumentVectors(documents, denseChunks, dimensions) {
  const chunksByDocument = /* @__PURE__ */ new Map();
  for (const chunk of denseChunks) {
    const bucket = chunksByDocument.get(chunk.documentId);
    if (bucket) {
      bucket.push(chunk);
      continue;
    }
    chunksByDocument.set(chunk.documentId, [chunk]);
  }
  const vectors = /* @__PURE__ */ new Map();
  for (const document of documents) {
    const records = chunksByDocument.get(document.id);
    if (records?.length) {
      vectors.set(document.id, { document, embedding: averageEmbeddings(records, dimensions) });
    }
  }
  return vectors;
}
2630
/**
 * Ranks the other documents in a workspace by similarity to one selected
 * document, using the vectors produced by `buildDocumentVectors`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} options.document - Selector handed to `resolveDocumentSelector`.
 * @param {number} options.topK - Maximum number of results to return.
 * @returns {Promise<{sourceDocument: object, retrievalMode: "dense", results: object[]}>}
 *   The selected document plus the highest-scoring other documents, best first.
 * @throws {CliError} `DENSE_RETRIEVAL_DISABLED`, `DENSE_INDEX_MISSING` or
 *   `DOCUMENT_VECTOR_MISSING`, plus whatever `resolveDocumentSelector` throws.
 */
async function findRelatedDocuments({
  workspacePath,
  document,
  topK
}) {
  const { retrieval } = await loadConfig(workspacePath);
  if (!retrieval.dense.enabled) {
    throw new CliError("dense retrieval is disabled in config; enable retrieval.dense.enabled and rebuild", "DENSE_RETRIEVAL_DISABLED", 7 /* QueryError */);
  }
  const hasDenseIndex = await fileExists(denseVectorPath(workspacePath));
  if (!hasDenseIndex) {
    throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
  }
  const documentsFile = path19.join(workspacePath, "documents", "documents.jsonl");
  const documents = await readJsonl(documentsFile);
  // Resolve the selector before loading the dense payload so an unknown or
  // ambiguous selector is reported first.
  const selected = resolveDocumentSelector(documents, document);
  const densePayload = await readDensePayload(workspacePath);
  const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
  const origin = vectors.get(selected.id);
  if (!origin) {
    throw new CliError(`dense vectors are missing for document: ${document}`, "DOCUMENT_VECTOR_MISSING", 7 /* QueryError */);
  }
  const scored = [];
  for (const candidate of vectors.values()) {
    // The selected document is never listed as related to itself.
    if (candidate.document.id === selected.id) {
      continue;
    }
    scored.push({
      documentId: candidate.document.id,
      sourceId: candidate.document.sourceId,
      score: cosineSimilarity(origin.embedding, candidate.embedding),
      title: candidate.document.title,
      uri: candidate.document.uri,
      metadata: candidate.document.metadata
    });
  }
  scored.sort((first, second) => second.score - first.score);
  const sourceDocument = {
    documentId: selected.id,
    sourceId: selected.sourceId,
    title: selected.title,
    uri: selected.uri
  };
  return {
    sourceDocument,
    retrievalMode: "dense",
    results: scored.slice(0, topK)
  };
}
2669
+
2670
+ // src/query/context-builder.ts
2671
/**
 * Runs a search and packs the returned chunks into a markdown context document.
 *
 * Results are taken in the order `searchIndex` returns them until adding the
 * next chunk would push the accumulated text length past `maxChars`. The first
 * result is always included, even if it alone exceeds the budget.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} options.query - Search query forwarded to `searchIndex`.
 * @param {number} options.topK - Result count forwarded to `searchIndex`.
 * @param {number} options.maxChars - Budget for the summed chunk text lengths.
 * @param {string} [options.retrievalMode] - Forwarded to `searchIndex`.
 * @returns {Promise<{markdown: string, sources: object[], retrievalMode: string}>}
 *   The rendered markdown, the sources it contains, and the retrieval mode
 *   reported by the search.
 */
async function createContext({
  workspacePath,
  query,
  topK,
  maxChars,
  retrievalMode
}) {
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
  const sources = [];
  let total = 0;
  for (const result of search.results) {
    const text = result.text ?? "";
    // Stop at the first chunk that would overflow the budget, but never
    // return an empty context when there is at least one result.
    if (total + text.length > maxChars && sources.length > 0) {
      break;
    }
    total += text.length;
    sources.push({
      chunkId: result.chunkId,
      documentId: result.documentId,
      sourceId: result.sourceId,
      title: result.title,
      uri: result.uri,
      headingPath: result.headingPath,
      text,
      metadata: result.metadata
    });
  }
  // Optional lines are added conditionally instead of filtering out every
  // empty string afterwards: a blanket filter also removes the blank
  // separator lines that keep the metadata, the chunk text and the next
  // source heading apart.
  const markdown = [
    "# Context",
    "",
    ...sources.flatMap((source, index) => [
      `## Source ${index + 1}`,
      `Title: ${source.title}`,
      `URL: ${source.uri}`,
      `Chunk ID: ${source.chunkId}`,
      ...(source.headingPath.length > 0 ? [`Heading Path: ${source.headingPath.join(" > ")}`] : []),
      "",
      ...(source.text !== "" ? [source.text, ""] : [])
    ])
  ].join("\n");
  return { markdown, sources, retrievalMode: search.retrievalMode };
}
2714
+
2715
+ // src/report/diff-service.ts
2716
+ import path20 from "path";
2717
/**
 * Picks the run whose snapshot serves as the comparison baseline.
 *
 * @param {Array<{createdAt: string}>} runs - Recorded runs.
 * @param {string} [since] - Omitted, the literal "last-run", or a cutoff that
 *   is compared against each run's `createdAt` with `<`.
 * @returns {object|undefined} With a cutoff, the last run in the list created
 *   before it, falling back to the last run overall; otherwise the last run.
 *   `undefined` when `runs` is empty.
 */
function chooseBaselineRun(runs, since) {
  const latestRun = runs.at(-1);
  const hasCutoff = Boolean(since) && since !== "last-run";
  if (!hasCutoff) {
    return latestRun;
  }
  const earlierRuns = runs.filter((run) => run.createdAt < since);
  return earlierRuns.at(-1) ?? latestRun;
}
2726
/**
 * Lists the workspace documents that differ from a baseline run.
 *
 * A document is reported when it is absent from the baseline snapshot, when
 * its `contentHash` differs from the snapshot's, or when `since` is set and
 * the document's `lastChangedAt` is `>= since`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} [options.sourceId] - Restrict the report to one source.
 * @param {string} [options.documentId] - Restrict the report to one document.
 * @param {string} [options.since] - Passed to `chooseBaselineRun` and also
 *   used as the `lastChangedAt` cutoff.
 * @returns {Promise<{changedDocuments: object[]}>} One entry per reported
 *   document with its previous (possibly undefined) and current hash.
 */
async function diffWorkspace({
  workspacePath,
  sourceId,
  documentId,
  since
}) {
  const documentsFile = path20.join(workspacePath, "documents", "documents.jsonl");
  const current = await readJsonl(documentsFile);
  const runs = await listRuns(workspacePath);
  const baseline = chooseBaselineRun(runs, since);
  const previous = /* @__PURE__ */ new Map();
  for (const snapshot of baseline?.documentsSnapshot ?? []) {
    previous.set(snapshot.id, snapshot);
  }
  const changedDocuments = [];
  for (const document of current) {
    if (sourceId && document.sourceId !== sourceId) {
      continue;
    }
    if (documentId && document.id !== documentId) {
      continue;
    }
    const prior = previous.get(document.id);
    // NOTE(review): `since` is compared as-is, including the "last-run"
    // sentinel; presumably timestamps are ISO strings so that comparison is
    // simply false - confirm.
    const hasChanged = !prior || prior.contentHash !== document.contentHash || (since && document.lastChangedAt >= since);
    if (!hasChanged) {
      continue;
    }
    changedDocuments.push({
      id: document.id,
      title: document.title,
      uri: document.uri,
      sourceId: document.sourceId,
      previousHash: prior?.contentHash,
      currentHash: document.contentHash
    });
  }
  return { changedDocuments };
}
2748
/**
 * Renders a diff result as a markdown change report.
 *
 * The "Added Documents" and "Removed or Missing Documents" sections always
 * contain fixed placeholder text; only the changed documents are listed.
 *
 * @param {{changedDocuments: Array<{id: string, title: string, uri: string, sourceId: string}>}} diff
 *   Diff result whose changed documents are listed.
 * @returns {string} The report, with lines joined by "\n".
 */
function renderChangeReport(diff) {
  const { changedDocuments } = diff;
  const header = [
    "# Knowledge Base Change Report",
    "",
    "## Summary",
    "",
    `Changed documents: ${changedDocuments.length}`,
    ""
  ];
  const added = [
    "## Added Documents",
    "",
    "_No added documents in this simple report._",
    ""
  ];
  const changed = [
    "## Changed Documents",
    "",
    ...changedDocuments.map(({ title, uri, id }) => `- ${title} (${uri}) [${id}]`),
    ""
  ];
  const removed = [
    "## Removed or Missing Documents",
    "",
    "_Removal tracking is not available for this report._",
    ""
  ];
  const notable = [
    "## Notable Changed Sections",
    "",
    ...changedDocuments.map(({ sourceId, title }) => `- ${sourceId}: ${title}`)
  ];
  return header.concat(added, changed, removed, notable).join("\n");
}
2773
+ export {
2774
+ addSource,
2775
+ assertWorkspaceExists,
2776
+ buildChunksForDocument,
2777
+ buildIndex,
2778
+ chunkDocuments,
2779
+ createContext,
2780
+ createIndexMapping,
2781
+ defaultConfig,
2782
+ diffWorkspace,
2783
+ ensureWorkspace,
2784
+ findRelatedDocuments,
2785
+ ingestSources,
2786
+ listSources,
2787
+ loadConfig,
2788
+ removeSource,
2789
+ renderChangeReport,
2790
+ reprocessDocuments,
2791
+ searchIndex,
2792
+ updateSource,
2793
+ writeDefaultConfig
2794
+ };