@tryformation/querylight-cli 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +7 -0
- package/LICENSE +21 -0
- package/README.md +391 -0
- package/dist/chunk/chunk-store.d.ts +4 -0
- package/dist/chunk/chunker.d.ts +9 -0
- package/dist/cli/format.d.ts +4 -0
- package/dist/cli/main.d.ts +2 -0
- package/dist/cli/main.js +3523 -0
- package/dist/cli/run-cli.d.ts +5 -0
- package/dist/core/config.d.ts +4 -0
- package/dist/core/constants.d.ts +3 -0
- package/dist/core/errors.d.ts +17 -0
- package/dist/core/files.d.ts +1 -0
- package/dist/core/hashing.d.ts +1 -0
- package/dist/core/ids.d.ts +1 -0
- package/dist/core/jsonl.d.ts +2 -0
- package/dist/core/runs.d.ts +3 -0
- package/dist/core/workspace.d.ts +7 -0
- package/dist/index/index-store.d.ts +11 -0
- package/dist/index/querylight-indexer.d.ts +14 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +2794 -0
- package/dist/ingest/adapters/crawl4ai-adapter.d.ts +1 -0
- package/dist/ingest/adapters/directory-adapter.d.ts +2 -0
- package/dist/ingest/adapters/file-adapter.d.ts +16 -0
- package/dist/ingest/adapters/rss-adapter.d.ts +7 -0
- package/dist/ingest/adapters/url-adapter.d.ts +11 -0
- package/dist/ingest/adapters/website-adapter.d.ts +2 -0
- package/dist/ingest/document-utils.d.ts +24 -0
- package/dist/ingest/extractors/docx-extractor.d.ts +1 -0
- package/dist/ingest/extractors/html-extractor.d.ts +5 -0
- package/dist/ingest/extractors/markdown-extractor.d.ts +1 -0
- package/dist/ingest/extractors/pdf-extractor.d.ts +1 -0
- package/dist/ingest/extractors/text-extractor.d.ts +1 -0
- package/dist/ingest/ingest-service.d.ts +23 -0
- package/dist/normalize/boilerplate.d.ts +1 -0
- package/dist/normalize/normalize-markdown.d.ts +2 -0
- package/dist/query/context-builder.d.ts +8 -0
- package/dist/query/related-service.d.ts +6 -0
- package/dist/query/search-service.d.ts +31 -0
- package/dist/report/diff-service.d.ts +23 -0
- package/dist/sources/source-model.d.ts +1 -0
- package/dist/sources/source-store.d.ts +7 -0
- package/dist/types/models.d.ts +309 -0
- package/dist/vector/dense.d.ts +13 -0
- package/dist/vector/runtime.d.ts +18 -0
- package/dist/vector/service.d.ts +26 -0
- package/dist/vector/sparse.d.ts +19 -0
- package/dist/vector/store.d.ts +20 -0
- package/dist/vector/text.d.ts +3 -0
- package/package.json +66 -0
- package/scripts/sparse-encode.py +104 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,2794 @@
|
|
|
1
|
+
// src/core/workspace.ts
|
|
2
|
+
import { mkdir, stat } from "fs/promises";
|
|
3
|
+
import path2 from "path";
|
|
4
|
+
|
|
5
|
+
// src/core/errors.ts
|
|
6
|
+
/**
 * Error raised for failures the CLI reports with a dedicated exit status.
 *
 * Besides the usual `message`, each instance carries a machine-readable
 * `code`, the `exitCode` to terminate with, and optional `details`.
 */
var CliError = class extends Error {
  code;
  exitCode;
  details;

  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Machine-readable error code (e.g. "WORKSPACE_ERROR").
   * @param {number} exitCode - Process exit status associated with the failure.
   * @param {*} [details] - Optional extra payload attached to the error.
   */
  constructor(message, code, exitCode, details) {
    super(message);
    Object.assign(this, { code, exitCode, details, name: "CliError" });
  }
};
|
|
18
|
+
|
|
19
|
+
// src/core/config.ts
|
|
20
|
+
import { readFile, writeFile } from "fs/promises";
|
|
21
|
+
import path from "path";
|
|
22
|
+
import YAML from "yaml";
|
|
23
|
+
/**
 * Build the built-in workspace configuration.
 *
 * Every call returns a freshly constructed object graph, so the result can be
 * merged into or mutated without affecting later calls.
 */
var defaultConfig = () => {
  const textField = (weight) => ({ type: "text", weight });
  const keywordField = () => ({ type: "keyword" });
  const modelCacheDir = ".kb/models/huggingface";
  const chunkTextMode = "title-heading-text";

  const index = {
    name: "default",
    fields: {
      text: textField(1),
      title: textField(2),
      uri: keywordField(),
      sourceId: keywordField(),
      tags: keywordField(),
      contentType: keywordField()
    },
    chunking: {
      maxChars: 1800,
      overlapChars: 200,
      minChars: 120,
      splitOnHeadings: true
    }
  };

  const rag = {
    defaultTopK: 12,
    maxContextChars: 12000,
    citationStyle: "markdown"
  };

  const retrieval = {
    defaultMode: "lexical",
    dense: {
      enabled: false,
      modelId: "Xenova/all-MiniLM-L6-v2",
      cacheDir: modelCacheDir,
      indexHashTables: 8,
      indexRandomSeed: 42,
      chunkTextMode
    },
    sparse: {
      enabled: false,
      modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
      cacheDir: modelCacheDir,
      documentTopTokens: 128,
      queryEncoding: "tokenizer-token-weights",
      documentEncoding: "masked-lm-max-log1p-relu",
      chunkTextMode
    }
  };

  const crawler = {
    defaultUserAgent: "querylight-cli/0.1",
    obeyRobotsTxt: true,
    rateLimitMs: 1000,
    renderJs: false,
    retentionDays: 365,
    fetchArticles: true
  };

  const limits = {
    maxFileSizeMb: 50,
    maxPagesPerSource: 100,
    maxTotalChunks: 100000
  };

  return { workspaceVersion: 1, index, rag, retrieval, crawler, limits };
};
|
|
81
|
+
/**
 * Write the default configuration to `<workspacePath>/config.yaml`.
 *
 * With `force` the file is always (over)written. Without it the file is only
 * created when it does not exist yet; an existing config is left untouched.
 *
 * @param {string} workspacePath - Directory that holds `config.yaml`.
 * @param {boolean} [force=false] - Overwrite an existing config file.
 * @returns {Promise<void>}
 * @throws Any file-system error other than "file already exists".
 */
async function writeDefaultConfig(workspacePath, force = false) {
  const configPath = path.join(workspacePath, "config.yaml");
  const contents = YAML.stringify(defaultConfig());
  if (force) {
    await writeFile(configPath, contents, "utf8");
    return;
  }
  try {
    // "wx" creates the file exclusively, so the existence check and the write
    // are one atomic step and an existing config can never be clobbered.
    await writeFile(configPath, contents, { encoding: "utf8", flag: "wx" });
  } catch (error) {
    // Only "already exists" is expected here; anything else is a real failure.
    if (error?.code !== "EEXIST") {
      throw error;
    }
  }
}
|
|
92
|
+
/**
 * Load the workspace configuration and layer it over the built-in defaults.
 *
 * Top-level keys from the file override the defaults; the nested sections
 * (`index.fields`, `index.chunking`, `rag`, `retrieval.dense`,
 * `retrieval.sparse`, `crawler`, `limits`) are merged key by key so a partial
 * config file keeps the remaining default values.
 *
 * @param {string} workspacePath - Workspace directory; used when `configPath` is not given.
 * @param {string} [configPath] - Explicit config file location.
 * @returns {Promise<object>} The merged configuration.
 * @throws When the file cannot be read or is not valid YAML.
 */
async function loadConfig(workspacePath, configPath) {
  const resolved = configPath ?? path.join(workspacePath, "config.yaml");
  const raw = await readFile(resolved, "utf8");
  // An empty (or comment-only) YAML document parses to null; treat it as
  // "no overrides" instead of crashing on `parsed.index` below.
  const parsed = YAML.parse(raw) ?? {};
  const defaults = defaultConfig();
  return {
    ...defaults,
    ...parsed,
    index: {
      ...defaults.index,
      ...parsed.index,
      fields: {
        ...defaults.index.fields,
        ...parsed.index?.fields ?? {}
      },
      chunking: {
        ...defaults.index.chunking,
        ...parsed.index?.chunking ?? {}
      }
    },
    rag: {
      ...defaults.rag,
      ...parsed.rag ?? {}
    },
    retrieval: {
      ...defaults.retrieval,
      ...parsed.retrieval ?? {},
      dense: {
        ...defaults.retrieval.dense,
        ...parsed.retrieval?.dense ?? {}
      },
      sparse: {
        ...defaults.retrieval.sparse,
        ...parsed.retrieval?.sparse ?? {}
      }
    },
    crawler: {
      ...defaults.crawler,
      ...parsed.crawler ?? {}
    },
    limits: {
      ...defaults.limits,
      ...parsed.limits ?? {}
    }
  };
}
|
|
138
|
+
|
|
139
|
+
// src/core/workspace.ts
|
|
140
|
+
// Sub-directories (relative to the workspace root) created by ensureWorkspace.
var DIRS = [
  // stored records
  "sources", "documents", "chunks",
  // document contents
  "raw", "normalized",
  // search structures
  "indexes", "vectors",
  // model files
  "models", "models/huggingface",
  // bookkeeping
  "runs", "logs"
];
|
|
153
|
+
/**
 * Create the workspace directory tree and make sure a config file exists.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace location; resolved to an absolute path.
 * @param {boolean} [options.force=false] - Forwarded to writeDefaultConfig.
 * @returns {Promise<{workspacePath: string}>} The resolved workspace path.
 */
async function ensureWorkspace({ workspacePath, force = false }) {
  const root = path2.resolve(workspacePath);
  const targets = [root, ...DIRS.map((name) => path2.join(root, name))];
  // Directories are created one after another, root first.
  for (const target of targets) {
    await mkdir(target, { recursive: true });
  }
  await writeDefaultConfig(root, force);
  return { workspacePath: root };
}
|
|
165
|
+
/**
 * Verify that `workspacePath` is a directory containing a `config.yaml`.
 *
 * @param {string} workspacePath - Workspace location to check.
 * @returns {Promise<string>} The resolved absolute workspace path.
 * @throws {CliError} Code "WORKSPACE_ERROR" (exit code 3) when the path is not a
 *   directory, or when it or its `config.yaml` cannot be stat'ed.
 */
async function assertWorkspaceExists(workspacePath) {
  const root = path2.resolve(workspacePath);
  const workspaceError = (message) => new CliError(message, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  try {
    const rootInfo = await stat(root);
    if (rootInfo.isDirectory()) {
      await stat(path2.join(root, "config.yaml"));
      return root;
    }
  } catch {
    throw workspaceError(`workspace does not exist or is invalid: ${root}`);
  }
  // Reached only when the path exists but is not a directory.
  throw workspaceError(`workspace is not a directory: ${root}`);
}
|
|
181
|
+
|
|
182
|
+
// src/sources/source-store.ts
|
|
183
|
+
import path4 from "path";
|
|
184
|
+
|
|
185
|
+
// src/core/hashing.ts
|
|
186
|
+
import { createHash } from "crypto";
|
|
187
|
+
/**
 * Compute the SHA-256 digest of `input`.
 *
 * @param {string|Buffer} input - Data to hash.
 * @returns {string} Lower-case hexadecimal digest.
 */
function sha256(input) {
  const hasher = createHash("sha256");
  hasher.update(input);
  return hasher.digest("hex");
}
|
|
190
|
+
|
|
191
|
+
// src/core/ids.ts
|
|
192
|
+
/**
 * Derive a deterministic identifier from `parts`.
 *
 * The parts are joined with "::" and hashed; the id is the prefix, an
 * underscore, and the first 16 hex characters of the SHA-256 digest.
 *
 * @param {string} prefix - Leading tag of the id (e.g. "src", "doc").
 * @param {...*} parts - Values that determine the id.
 * @returns {string} Identifier of the form `<prefix>_<16 hex chars>`.
 */
function stableId(prefix, ...parts) {
  const fingerprint = sha256(parts.join("::"));
  const shortFingerprint = fingerprint.slice(0, 16);
  return `${prefix}_${shortFingerprint}`;
}
|
|
195
|
+
|
|
196
|
+
// src/core/jsonl.ts
|
|
197
|
+
import { mkdir as mkdir2, readFile as readFile2, writeFile as writeFile2 } from "fs/promises";
|
|
198
|
+
import path3 from "path";
|
|
199
|
+
/**
 * Read a JSON Lines file: one JSON value per non-blank line.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<Array>} Parsed records in file order; `[]` when the file does not exist.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and 1-based line number; the parser's error is attached as `cause`.
 * @throws Any read error other than ENOENT.
 */
async function readJsonl(filePath) {
  let raw;
  try {
    raw = await readFile2(filePath, "utf8");
  } catch (error) {
    // A missing file simply means "no records yet".
    if (error?.code === "ENOENT") {
      return [];
    }
    throw error;
  }
  const records = [];
  const lines = raw.split("\n");
  for (let lineIndex = 0; lineIndex < lines.length; lineIndex += 1) {
    // trim() also removes the "\r" of CRLF line endings.
    const line = lines[lineIndex].trim();
    if (line.length === 0) {
      continue;
    }
    try {
      records.push(JSON.parse(line));
    } catch (error) {
      // Keep the SyntaxError type but say where the bad line is.
      throw new SyntaxError(
        `invalid JSON in ${filePath} at line ${lineIndex + 1}: ${error.message}`,
        { cause: error }
      );
    }
  }
  return records;
}
|
|
210
|
+
/**
 * Write `records` as a JSON Lines file, creating parent directories as needed.
 *
 * Each record becomes one line; the file ends with a newline unless there is
 * nothing to write, in which case it is empty.
 *
 * @param {string} filePath - Destination file.
 * @param {Array} records - Values to serialize with JSON.stringify.
 * @returns {Promise<void>}
 */
async function writeJsonl(filePath, records) {
  const directory = path3.dirname(filePath);
  await mkdir2(directory, { recursive: true });
  const body = records.map((record) => JSON.stringify(record)).join("\n");
  let payload = "";
  if (body.length > 0) {
    payload = body + "\n";
  }
  await writeFile2(filePath, payload, "utf8");
}
|
|
216
|
+
|
|
217
|
+
// src/sources/source-store.ts
|
|
218
|
+
// Location of the source registry (`sources/sources.jsonl`) inside a workspace.
var sourcesFile = (workspacePath) => {
  const segments = ["sources", "sources.jsonl"];
  return path4.join(workspacePath, ...segments);
};
|
|
219
|
+
/**
 * Read every source registered in the workspace.
 *
 * @param {string} workspacePath - Workspace root.
 * @returns {Promise<Array>} Records from the source registry file.
 */
async function listSources(workspacePath) {
  const registryPath = sourcesFile(workspacePath);
  return readJsonl(registryPath);
}
|
|
222
|
+
/**
 * Register a new source in the workspace.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {object} source - Source to add; `id` is derived from `type` and `uri` when absent.
 * @returns {Promise<object>} The stored source, including its id.
 * @throws {CliError} Code "DUPLICATE_SOURCE" (exit code 4) when a source with the same URI exists.
 */
async function addSource(workspacePath, source) {
  const registered = await listSources(workspacePath);
  for (const candidate of registered) {
    if (candidate.uri === source.uri) {
      throw new CliError(`duplicate source URI: ${source.uri}`, "DUPLICATE_SOURCE", 4 /* SourceError */);
    }
  }
  const stored = {
    ...source,
    id: source.id ?? stableId("src", source.type, source.uri)
  };
  await writeJsonl(sourcesFile(workspacePath), [...registered, stored]);
  return stored;
}
|
|
233
|
+
/**
 * Apply `patch` to a registered source and persist the registry.
 *
 * `metadata` and `crawl` are merged key by key with the stored values; every
 * other patched key replaces the stored one. The id cannot be changed.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {string} sourceId - Id of the source to update.
 * @param {object} patch - Fields to change.
 * @returns {Promise<object>} The updated source record.
 * @throws {CliError} Code "SOURCE_NOT_FOUND" (exit code 4) when no source has that id.
 */
async function updateSource(workspacePath, sourceId, patch) {
  const registered = await listSources(workspacePath);
  const position = registered.findIndex((candidate) => candidate.id === sourceId);
  if (position === -1) {
    throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
  }
  const current = registered[position];

  let metadata = current.metadata;
  if (patch.metadata) {
    metadata = { ...current.metadata, ...patch.metadata };
  }
  let crawl = current.crawl;
  if (patch.crawl) {
    crawl = { ...(current.crawl ?? {}), ...patch.crawl };
  }

  const updated = { ...current, ...patch, id: sourceId, metadata, crawl };
  registered[position] = updated;
  await writeJsonl(sourcesFile(workspacePath), registered);
  return updated;
}
|
|
254
|
+
/**
 * Remove a source from the workspace registry.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {string} sourceId - Id of the source to remove.
 * @returns {Promise<void>}
 * @throws {CliError} Code "SOURCE_NOT_FOUND" (exit code 4) when no source has that id.
 */
async function removeSource(workspacePath, sourceId) {
  const registered = await listSources(workspacePath);
  const remaining = registered.filter((candidate) => candidate.id !== sourceId);
  const nothingRemoved = remaining.length === registered.length;
  if (nothingRemoved) {
    throw new CliError(`source not found: ${sourceId}`, "SOURCE_NOT_FOUND", 4 /* SourceError */);
  }
  await writeJsonl(sourcesFile(workspacePath), remaining);
}
|
|
262
|
+
|
|
263
|
+
// src/ingest/ingest-service.ts
|
|
264
|
+
import path10 from "path";
|
|
265
|
+
|
|
266
|
+
// src/chunk/chunk-store.ts
|
|
267
|
+
import path5 from "path";
|
|
268
|
+
/**
 * Location of the chunk store (`chunks/chunks.jsonl`) inside a workspace.
 *
 * @param {string} workspacePath - Workspace root.
 * @returns {string} Path of the chunk store file.
 */
function chunksFile(workspacePath) {
  const location = [workspacePath, "chunks", "chunks.jsonl"];
  return path5.join(...location);
}
|
|
271
|
+
/**
 * Read all stored chunks of a workspace.
 *
 * @param {string} workspacePath - Workspace root.
 * @returns {Promise<Array>} Records from the chunk store file.
 */
async function loadChunks(workspacePath) {
  const storePath = chunksFile(workspacePath);
  return readJsonl(storePath);
}
|
|
274
|
+
/**
 * Persist `chunks` to the workspace chunk store, ordered by chunk id.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {Array<{id: string}>} chunks - Chunks to store; the array itself is left untouched.
 * @returns {Promise<void>}
 */
async function saveChunks(workspacePath, chunks) {
  // Array.prototype.sort works in place; sort a copy so the caller's array
  // keeps its order.
  const ordered = [...chunks].sort((a, b) => a.id.localeCompare(b.id));
  await writeJsonl(chunksFile(workspacePath), ordered);
}
|
|
277
|
+
|
|
278
|
+
// src/core/files.ts
|
|
279
|
+
import { stat as stat2 } from "fs/promises";
|
|
280
|
+
/**
 * Report whether `filePath` can be stat'ed.
 *
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} `true` when stat succeeds, `false` on any stat failure.
 */
async function fileExists(filePath) {
  return stat2(filePath).then(
    () => true,
    () => false
  );
}
|
|
288
|
+
|
|
289
|
+
// src/core/runs.ts
|
|
290
|
+
import path6 from "path";
|
|
291
|
+
/**
 * Store a run record as `runs/<run.id>.json` inside the workspace.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {object} run - Run record; `run.id` determines the file name.
 * @returns {Promise<void>}
 */
async function writeRun(workspacePath, run) {
  const target = path6.join(workspacePath, "runs", `${run.id}.json`);
  await writeJsonl(target, [run]);
}
|
|
294
|
+
/**
 * List the run records stored under `runs/`, ordered by `createdAt`.
 *
 * Only `*.json` files are considered and the first record of each is used.
 * Any failure while listing, reading or sorting results in an empty list.
 *
 * @param {string} workspacePath - Workspace root.
 * @returns {Promise<Array>} Run records, or `[]` on failure.
 */
async function listRuns(workspacePath) {
  const fsPromises = await import("fs/promises");
  const runsDir = path6.join(workspacePath, "runs");
  try {
    const names = await fsPromises.readdir(runsDir);
    const runFiles = names.filter((name) => name.endsWith(".json"));
    const firstRecords = await Promise.all(
      runFiles.map((name) => readJsonl(path6.join(runsDir, name)).then((runs) => runs[0]))
    );
    const present = firstRecords.filter((record) => record !== null && record !== undefined);
    present.sort((left, right) => left.createdAt.localeCompare(right.createdAt));
    return present;
  } catch {
    return [];
  }
}
|
|
308
|
+
|
|
309
|
+
// src/ingest/document-utils.ts
|
|
310
|
+
import { mkdir as mkdir3, rm, writeFile as writeFile3 } from "fs/promises";
|
|
311
|
+
import path7 from "path";
|
|
312
|
+
|
|
313
|
+
// src/normalize/normalize-markdown.ts
|
|
314
|
+
import matter from "gray-matter";
|
|
315
|
+
/**
 * Tidy the whitespace of a text block.
 *
 * CRLF becomes LF, spaces/tabs before a newline are dropped, runs of three or
 * more newlines collapse to a single blank line, and the result is trimmed.
 *
 * @param {string} text - Text to clean.
 * @returns {string} The normalized text.
 */
function normalizeWhitespace(text) {
  const unixNewlines = text.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, "\n");
  const collapsed = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
  return collapsed.trim();
}
|
|
318
|
+
/**
 * Serialize a document as front matter followed by its whitespace-normalized body.
 *
 * @param {object} metadata - Values written into the front matter block.
 * @param {string} body - Document body.
 * @returns {string} The combined document text produced by gray-matter.
 */
function withFrontmatter(metadata, body) {
  const cleanedBody = normalizeWhitespace(body);
  return matter.stringify(cleanedBody, metadata);
}
|
|
321
|
+
|
|
322
|
+
// src/ingest/document-utils.ts
|
|
323
|
+
/**
 * Pass a metadata value through unchanged (`undefined` stays `undefined`).
 *
 * @param {*} value - Candidate metadata value.
 * @returns {*} The same value.
 */
function asMetadataValue(value) {
  if (value === undefined) {
    return undefined;
  }
  return value;
}
|
|
326
|
+
/**
 * Assemble the metadata object stored with a document.
 *
 * Source metadata comes first, then `extra`, then the fixed fields (`tags`,
 * `sourceType`, `sourceUri`, `publicationDate`, `crawledAt`, `indexedAt`);
 * later entries win. Keys whose value is `undefined` are dropped.
 *
 * @param {object} args
 * @param {object} args.source - Source record; `metadata`, `tags` and `type` are read.
 * @param {string} args.sourceUri - URI recorded as `sourceUri`.
 * @param {string|null} [args.publicationDate] - Stored as `null` when absent.
 * @param {string} args.crawledAt - Crawl timestamp.
 * @param {string} args.indexedAt - Index timestamp.
 * @param {object} [args.extra={}] - Additional metadata entries.
 * @returns {object} The merged metadata without `undefined` values.
 */
function buildDocumentMetadata({
  source,
  sourceUri,
  publicationDate,
  crawledAt,
  indexedAt,
  extra = {}
}) {
  const fixedFields = {
    tags: source.tags,
    sourceType: source.type,
    sourceUri,
    publicationDate: publicationDate ?? null,
    crawledAt,
    indexedAt
  };
  const merged = { ...source.metadata, ...extra, ...fixedFields };
  const definedEntries = Object.entries(merged).filter(
    (entry) => asMetadataValue(entry[1]) !== undefined
  );
  return Object.fromEntries(definedEntries);
}
|
|
349
|
+
/**
 * Write a normalized Markdown document (front matter + body) to `normalizedPath`.
 *
 * The parent directory is created when missing. All arguments except
 * `normalizedPath` and `markdown` end up in the front matter.
 *
 * @param {object} args
 * @param {string} args.normalizedPath - Destination file.
 * @param {string} args.markdown - Document body.
 * @param {string|null} [args.publicationDate] - Stored as `null` when absent.
 * @returns {Promise<void>}
 */
async function writeNormalizedDocument({
  documentId,
  sourceId,
  title,
  uri,
  sourceUri,
  publicationDate,
  crawledAt,
  indexedAt,
  contentHash,
  lastChangedAt,
  normalizedPath,
  markdown
}) {
  const targetDirectory = path7.dirname(normalizedPath);
  await mkdir3(targetDirectory, { recursive: true });

  const frontmatter = {
    documentId,
    sourceId,
    title,
    uri,
    sourceUri,
    publicationDate: publicationDate ?? null,
    crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt
  };
  const contents = withFrontmatter(frontmatter, markdown);
  await writeFile3(normalizedPath, contents, "utf8");
}
|
|
384
|
+
/**
 * Delete the files stored for a document: its raw copy (when it has one) and
 * its normalized Markdown file. Missing files are ignored.
 *
 * @param {object} document - Document record; `rawPath` and `normalizedPath` are read.
 * @returns {Promise<void>}
 */
async function deleteDocumentArtifacts(document) {
  const removals = [];
  if (document.rawPath) {
    removals.push(rm(document.rawPath, { force: true }));
  }
  removals.push(rm(document.normalizedPath, { force: true }));
  await Promise.all(removals);
}
|
|
390
|
+
|
|
391
|
+
// src/ingest/adapters/directory-adapter.ts
|
|
392
|
+
import fg from "fast-glob";
|
|
393
|
+
import path8 from "path";
|
|
394
|
+
/**
 * List the files of a directory source that match its crawl patterns.
 *
 * Falls back to Markdown, text, HTML, PDF and DOCX globs when the source has no
 * include patterns. Dot files are skipped and symbolic links are not followed.
 *
 * @param {object} source - Source record; `uri` is the directory, `crawl` holds the patterns.
 * @returns {Promise<string[]>} Sorted absolute file paths.
 */
async function listDirectoryFiles(source) {
  const defaultPatterns = ["**/*.md", "**/*.txt", "**/*.html", "**/*.htm", "**/*.pdf", "**/*.docx"];
  const configured = source.crawl?.includePatterns;
  const include = configured?.length ? configured : defaultPatterns;

  const globOptions = {
    cwd: source.uri,
    absolute: true,
    onlyFiles: true,
    dot: false,
    unique: true,
    ignore: source.crawl?.excludePatterns ?? [],
    followSymbolicLinks: false
  };
  const matches = await fg(include, globOptions);

  const absolutePaths = matches.map((match) => path8.resolve(match));
  absolutePaths.sort();
  return absolutePaths;
}
|
|
408
|
+
|
|
409
|
+
// src/ingest/adapters/file-adapter.ts
|
|
410
|
+
import { basename, extname, resolve } from "path";
|
|
411
|
+
import { mkdir as mkdir4, readFile as readFile6, stat as stat3, writeFile as writeFile4 } from "fs/promises";
|
|
412
|
+
|
|
413
|
+
// src/ingest/extractors/docx-extractor.ts
|
|
414
|
+
import mammoth from "mammoth";
|
|
415
|
+
/**
 * Extract the raw text of a DOCX file via mammoth.
 *
 * @param {string} filePath - DOCX file to read.
 * @returns {Promise<string>} The `value` of mammoth's raw-text result.
 */
async function extractDocx(filePath) {
  const { value } = await mammoth.extractRawText({ path: filePath });
  return value;
}
|
|
419
|
+
|
|
420
|
+
// src/ingest/extractors/html-extractor.ts
|
|
421
|
+
import { load } from "cheerio";
|
|
422
|
+
import TurndownService from "turndown";
|
|
423
|
+
|
|
424
|
+
// src/normalize/boilerplate.ts
|
|
425
|
+
/**
 * Remove page chrome from an HTML string: `<nav>` and `<footer>` elements and
 * the phrase "cookie notice" (all matched case-insensitively).
 *
 * @param {string} html - HTML markup.
 * @returns {string} The markup with those parts removed.
 */
function stripBoilerplate(html) {
  const boilerplatePatterns = [
    /<nav[\s\S]*?<\/nav>/gi,
    /<footer[\s\S]*?<\/footer>/gi,
    /cookie notice/gi
  ];
  return boilerplatePatterns.reduce((markup, pattern) => markup.replace(pattern, ""), html);
}
|
|
428
|
+
|
|
429
|
+
// src/ingest/extractors/html-extractor.ts
|
|
430
|
+
// Shared HTML-to-Markdown converter, configured with headingStyle "atx" and codeBlockStyle "fenced".
var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
431
|
+
/**
 * Collapse every run of whitespace into a single space and trim the ends.
 *
 * @param {string} value - Text to clean.
 * @returns {string} Single-spaced, trimmed text.
 */
function cleanText(value) {
  const singleSpaced = value.split(/\s+/).join(" ");
  return singleSpaced.trim();
}
|
|
434
|
+
/**
 * Pick a title for an HTML document.
 *
 * Preference order: `og:title` meta, `twitter:title` meta, first `<h1>`,
 * `<title>`; the first non-empty one wins, otherwise `fallbackTitle`.
 *
 * @param {Function} $ - Cheerio query function for the loaded document.
 * @param {string} fallbackTitle - Returned when no candidate has text.
 * @returns {string} The chosen title.
 */
function chooseMeaningfulTitle($, fallbackTitle) {
  const openGraphTitle = cleanText($("meta[property='og:title']").attr("content") ?? "");
  if (openGraphTitle) {
    return openGraphTitle;
  }
  const twitterTitle = cleanText($("meta[name='twitter:title']").attr("content") ?? "");
  if (twitterTitle) {
    return twitterTitle;
  }
  const firstHeading = cleanText($("h1").first().text());
  if (firstHeading) {
    return firstHeading;
  }
  const documentTitle = cleanText($("title").first().text());
  if (documentTitle) {
    return documentTitle;
  }
  return fallbackTitle;
}
|
|
444
|
+
// Turndown rule for `<a class="doc-card">` links: they are rendered as a
// blank-line separated block of heading (from <h3>), summary (<p>), section
// (<span>) and href, with empty pieces left out.
turndown.addRule("docCard", {
  filter(node) {
    if (node.nodeName !== "A" || typeof node.getAttribute !== "function") {
      return false;
    }
    const classNames = (node.getAttribute("class") ?? "").split(/\s+/);
    return classNames.includes("doc-card");
  },
  replacement(_content, node) {
    const card = node;
    const textOf = (selector) => cleanText(card.querySelector(selector)?.textContent ?? "");

    const href = cleanText(card.getAttribute("href") ?? "");
    const title = textOf("h3");
    const summary = textOf("p");
    const section = textOf("span");

    const pieces = [];
    if (title) {
      pieces.push(`### ${title}`);
    }
    for (const piece of [summary, section, href]) {
      if (piece) {
        pieces.push(piece);
      }
    }
    return `\n\n${pieces.join("\n\n")}\n\n`;
  }
});
|
|
467
|
+
/**
 * Convert an HTML page to Markdown and determine its title.
 *
 * Boilerplate is stripped first. The content of the first `<main>` element is
 * converted when present, otherwise the whole document.
 *
 * @param {string} html - HTML markup.
 * @returns {{markdown: string, title: string}} Converted body and chosen title.
 */
function extractHtmlToMarkdown(html) {
  const cleaned = stripBoilerplate(html);
  const $ = load(cleaned);

  const documentTitle = cleanText($("title").first().text());
  const title = chooseMeaningfulTitle($, documentTitle || "Untitled");

  const contentHtml = $("main").first().html() ?? $.root().html() ?? cleaned;
  const markdown = turndown.turndown(contentHtml);
  return { markdown, title };
}
|
|
478
|
+
/**
 * Parse a date string into ISO-8601 form.
 *
 * @param {string} value - Candidate date text.
 * @returns {string|null} `Date#toISOString()` output, or `null` when the text
 *   is blank or `Date` cannot parse it.
 */
function parseDateCandidate(value) {
  const text = value.trim();
  if (text.length === 0) {
    return null;
  }
  const date = new Date(text);
  if (Number.isNaN(date.getTime())) {
    return null;
  }
  return date.toISOString();
}
|
|
486
|
+
/**
 * Find a publication date in an HTML page.
 *
 * Meta tags and the first `<time datetime>` are tried first, in a fixed order.
 * If none parses, JSON-LD scripts are searched (including `@graph` members)
 * for `datePublished`, `dateCreated` or `dateModified`. Scripts whose JSON
 * cannot be processed are skipped.
 *
 * @param {string} html - HTML markup.
 * @returns {string|null} ISO-8601 date, or `null` when nothing usable is found.
 */
function extractPublicationDateFromHtml(html) {
  const $ = load(html);

  const metaSelectors = [
    "meta[property='article:published_time']",
    "meta[property='og:published_time']",
    "meta[name='pubdate']",
    "meta[name='publish-date']",
    "meta[name='article:published_time']",
    "meta[name='date']"
  ];
  const rawCandidates = metaSelectors.map((selector) => $(selector).attr("content"));
  rawCandidates.push($("time[datetime]").first().attr("datetime"));

  for (const candidate of rawCandidates) {
    if (!candidate?.trim()) {
      continue;
    }
    const parsed = parseDateCandidate(candidate);
    if (parsed) {
      return parsed;
    }
  }

  const dateKeys = ["datePublished", "dateCreated", "dateModified"];

  // Breadth-first walk over a parsed JSON-LD payload; returns the first
  // parseable date or null.
  const findJsonLdDate = (payload) => {
    const pending = Array.isArray(payload) ? [...payload] : [payload];
    while (pending.length > 0) {
      const node = pending.shift();
      if (!node || typeof node !== "object") {
        continue;
      }
      for (const key of dateKeys) {
        if (typeof node[key] !== "string") {
          continue;
        }
        const normalized = parseDateCandidate(node[key]);
        if (normalized) {
          return normalized;
        }
      }
      if (Array.isArray(node["@graph"])) {
        pending.push(...node["@graph"]);
      }
    }
    return null;
  };

  let jsonLdDate = null;
  $('script[type="application/ld+json"]').each((_, element) => {
    if (jsonLdDate) {
      return false;
    }
    try {
      jsonLdDate = findJsonLdDate(JSON.parse($(element).text()));
    } catch (error) {
      // Unusable JSON-LD: move on to the next script.
      void error;
    }
    // Returning false stops the iteration once a date has been found.
    return jsonLdDate ? false : undefined;
  });
  return jsonLdDate;
}
|
|
538
|
+
|
|
539
|
+
// src/ingest/extractors/markdown-extractor.ts
|
|
540
|
+
import { readFile as readFile3 } from "fs/promises";
|
|
541
|
+
/**
 * Read a Markdown file as UTF-8 text.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} The file contents.
 */
async function extractMarkdown(filePath) {
  const contents = await readFile3(filePath, { encoding: "utf8" });
  return contents;
}
|
|
544
|
+
|
|
545
|
+
// src/ingest/extractors/pdf-extractor.ts
|
|
546
|
+
import { readFile as readFile4 } from "fs/promises";
|
|
547
|
+
import { PDFParse } from "pdf-parse";
|
|
548
|
+
/**
 * Extract the text of a PDF file.
 *
 * The parser is destroyed afterwards whether or not extraction succeeds.
 *
 * @param {string} filePath - PDF file to read.
 * @returns {Promise<string>} The `text` of the parser's result.
 */
async function extractPdf(filePath) {
  const data = await readFile4(filePath);
  const parser = new PDFParse({ data });
  try {
    const { text } = await parser.getText();
    return text;
  } finally {
    await parser.destroy();
  }
}
|
|
558
|
+
|
|
559
|
+
// src/ingest/extractors/text-extractor.ts
|
|
560
|
+
import { readFile as readFile5 } from "fs/promises";
|
|
561
|
+
/**
 * Read a plain-text file as UTF-8 text.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string>} The file contents.
 */
async function extractText(filePath) {
  const contents = await readFile5(filePath, { encoding: "utf8" });
  return contents;
}
|
|
564
|
+
|
|
565
|
+
// src/ingest/adapters/file-adapter.ts
|
|
566
|
+
/**
 * Map a file's extension (case-insensitive) to a MIME type.
 *
 * @param {string} filePath - File path or name.
 * @returns {string} The MIME type; "application/octet-stream" for unknown extensions.
 */
function mimeTypeFor(filePath) {
  const knownTypes = new Map([
    [".md", "text/markdown"],
    [".txt", "text/plain"],
    [".html", "text/html"],
    [".htm", "text/html"],
    [".pdf", "application/pdf"],
    [".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
  ]);
  const extension = extname(filePath).toLowerCase();
  return knownTypes.get(extension) ?? "application/octet-stream";
}
|
|
584
|
+
/**
 * Extract title and Markdown from a file according to its MIME type.
 *
 * Markdown is used as is (title from its first `# ` heading, else the file
 * name). Plain text, PDF and DOCX are placed under a heading made of the file
 * name. HTML is converted and placed under its chosen title. `raw` is only
 * returned for Markdown, plain text and HTML.
 *
 * @param {string} filePath - File to extract.
 * @param {string} mimeType - MIME type selecting the extractor.
 * @returns {Promise<{title: string, markdown: string, raw?: string}>}
 * @throws {Error} For MIME types without an extractor.
 */
async function extractFileContent(filePath, mimeType) {
  const fileName = () => basename(filePath);
  const underHeading = (heading, body) => `# ${heading}\n\n${body}`;

  switch (mimeType) {
    case "text/markdown": {
      const markdown = await extractMarkdown(filePath);
      const heading = markdown.match(/^#\s+(.+)$/m)?.[1]?.trim();
      return { title: heading ?? fileName(), markdown, raw: markdown };
    }
    case "text/plain": {
      const text = await extractText(filePath);
      const title = fileName();
      return { title, markdown: underHeading(title, text), raw: text };
    }
    case "text/html": {
      const raw = await readFile6(filePath, "utf8");
      const { title, markdown } = extractHtmlToMarkdown(raw);
      return { title, markdown: underHeading(title, markdown), raw };
    }
    case "application/pdf": {
      const text = await extractPdf(filePath);
      const title = fileName();
      return { title, markdown: underHeading(title, text) };
    }
    default:
      break;
  }

  if (mimeType.includes("wordprocessingml")) {
    const text = await extractDocx(filePath);
    const title = fileName();
    return { title, markdown: underHeading(title, text) };
  }
  throw new Error(`unsupported file type: ${mimeType}`);
}
|
|
617
|
+
/**
 * Re-derive title and Markdown from previously stored raw content.
 *
 * @param {string} raw - Stored raw content.
 * @param {string} mimeType - "text/markdown", "text/plain" or "text/html".
 * @param {string} fallbackTitle - Title used when the content does not provide one.
 * @returns {Promise<{title: string, markdown: string}>}
 * @throws {Error} For any other MIME type.
 */
async function extractRawContent(raw, mimeType, fallbackTitle) {
  const underHeading = (heading, body) => `# ${heading}\n\n${body}`;

  switch (mimeType) {
    case "text/markdown": {
      const heading = raw.match(/^#\s+(.+)$/m)?.[1]?.trim();
      return { title: heading ?? fallbackTitle, markdown: raw };
    }
    case "text/plain":
      return { title: fallbackTitle, markdown: underHeading(fallbackTitle, raw) };
    case "text/html": {
      const { title, markdown } = extractHtmlToMarkdown(raw);
      return { title, markdown: underHeading(title, markdown) };
    }
    default:
      throw new Error(`raw reprocessing is not supported for ${mimeType}`);
  }
}
|
|
635
|
+
/**
 * Ingest a single local file: extract its content, store the raw and
 * normalized copies in the workspace, and return the document record.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root; `raw/` and `normalized/` are written below it.
 * @param {object} args.source - Source record; `id`, `type` and `uri` are read here,
 *   `tags` and `metadata` by the metadata builder.
 * @param {string} args.filePath - File to ingest; resolved to an absolute path.
 * @param {object} [args.previous] - Earlier record of the same document. Supplies
 *   `publicationDate`, `firstSeenAt` and, when the content hash is unchanged, `lastChangedAt`.
 * @returns {Promise<object>} The document record for the ingested file.
 * @throws When the file cannot be read or its type has no extractor.
 */
async function ingestFile({
  workspacePath,
  source,
  filePath,
  previous
}) {
  const resolved = resolve(filePath);
  const fileStat = await stat3(resolved);
  const mimeType = mimeTypeFor(resolved);
  const extracted = await extractFileContent(resolved, mimeType);
  const documentId = stableId("doc", source.id, resolved);
  const normalizedPath = resolve(workspacePath, "normalized", `${documentId}.md`);
  // The raw copy is keyed by document id as well as file name: one source can
  // contain several files with the same basename in different directories
  // (README.md, index.html, ...), and a name-only key made them overwrite each
  // other. Raw files written under the old name-only scheme are left in place.
  const rawPath = resolve(workspacePath, "raw", source.id, `${documentId}-${basename(resolved)}`);
  const contentHash = sha256(extracted.markdown);
  const now = (/* @__PURE__ */ new Date()).toISOString();
  // Only move lastChangedAt forward when the extracted content really changed.
  const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
  const indexedAt = now;
  const crawledAt = now;
  const publicationDate = previous?.publicationDate ?? null;
  await mkdir4(resolve(workspacePath, "normalized"), { recursive: true });
  await mkdir4(resolve(workspacePath, "raw", source.id), { recursive: true });
  if (extracted.raw) {
    await writeFile4(rawPath, extracted.raw, "utf8");
  }
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title: extracted.title,
    uri: resolved,
    sourceUri: source.uri,
    publicationDate,
    crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown: extracted.markdown
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title: extracted.title,
    uri: resolved,
    sourceUri: source.uri,
    mimeType,
    // Extractors that return no raw text leave the record without a rawPath.
    rawPath: extracted.raw ? rawPath : void 0,
    normalizedPath,
    contentHash,
    metadata: buildDocumentMetadata({
      source,
      sourceUri: source.uri,
      publicationDate,
      crawledAt,
      indexedAt,
      extra: {
        contentType: mimeType,
        fileSizeBytes: fileStat.size
      }
    }),
    publicationDate,
    crawledAt,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt
  };
}
|
|
702
|
+
/**
 * Ingest content supplied directly (not read from a file) as a document.
 *
 * For sources of type "markdown" the content is stored as is; otherwise it is
 * placed under a heading made of `title`. Only the normalized document is
 * written; no raw copy is kept.
 *
 * @param {object} args
 * @param {string} args.workspacePath - Workspace root.
 * @param {object} args.source - Source record the content belongs to.
 * @param {string} args.content - Document text.
 * @param {string} args.title - Document title.
 * @param {string} args.uri - Document URI; together with the source id it determines the document id.
 * @param {object} [args.previous] - Earlier record of the same document, if any.
 * @returns {Promise<object>} The document record.
 */
async function ingestInlineContent({
  workspacePath,
  source,
  content,
  title,
  uri,
  previous
}) {
  const isMarkdownSource = source.type === "markdown";
  const markdown = isMarkdownSource ? content : `# ${title}\n\n${content}`;
  const documentId = stableId("doc", source.id, uri);
  const normalizedPath = resolve(workspacePath, "normalized", `${documentId}.md`);
  const contentHash = sha256(markdown);
  const now = (/* @__PURE__ */ new Date()).toISOString();

  let lastChangedAt = now;
  if (previous?.contentHash === contentHash) {
    lastChangedAt = previous.lastChangedAt;
  }
  const publicationDate = previous?.publicationDate ?? null;
  const timestamps = { crawledAt: now, indexedAt: now };

  await mkdir4(resolve(workspacePath, "normalized"), { recursive: true });
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri,
    sourceUri: source.uri,
    publicationDate,
    ...timestamps,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: source.uri,
    publicationDate,
    ...timestamps
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri,
    sourceUri: source.uri,
    mimeType: isMarkdownSource ? "text/markdown" : "text/plain",
    normalizedPath,
    contentHash,
    metadata,
    publicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
|
|
759
|
+
/**
 * Rebuild a document's normalized file and record from its stored raw copy.
 *
 * @param {object} document - Existing document record.
 * @param {object} source - Source record the document belongs to.
 * @returns {Promise<object|null>} The refreshed record, or `null` when the
 *   document has no `rawPath`.
 * @throws When the raw file cannot be read or its MIME type cannot be reprocessed.
 */
async function reprocessStoredDocument(document, source) {
  if (!document.rawPath) {
    return null;
  }
  const raw = await readFile6(document.rawPath, "utf8");
  const { title, markdown } = await extractRawContent(
    raw,
    document.mimeType,
    document.title || basename(document.uri)
  );
  const contentHash = sha256(markdown);
  const indexedAt = (/* @__PURE__ */ new Date()).toISOString();
  // lastChangedAt only moves when the re-extracted content differs.
  const unchanged = document.contentHash === contentHash;
  const lastChangedAt = unchanged ? document.lastChangedAt : indexedAt;
  const publicationDate = document.publicationDate ?? null;

  await writeNormalizedDocument({
    documentId: document.id,
    sourceId: document.sourceId,
    title,
    uri: document.uri,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt,
    contentHash,
    lastChangedAt,
    normalizedPath: document.normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt,
    extra: { ...document.metadata, contentType: document.mimeType }
  });
  return { ...document, title, contentHash, metadata, lastChangedAt, indexedAt };
}
|
|
803
|
+
|
|
804
|
+
// src/ingest/adapters/rss-adapter.ts
|
|
805
|
+
import { Readable } from "stream";
|
|
806
|
+
import FeedParser from "feedparser";
|
|
807
|
+
import { parseFeed } from "feedsmith";
|
|
808
|
+
/**
 * Converts a feed date into an ISO-8601 string.
 *
 * @param {unknown} value A `Date` or a non-blank date string.
 * @returns {string|null} ISO timestamp, or `null` for any other input or an
 *   unparseable date.
 */
function toIsoDate(value) {
  let candidate = null;
  if (value instanceof Date) {
    candidate = value;
  } else if (typeof value === "string" && value.trim().length > 0) {
    candidate = new Date(value);
  }
  if (candidate === null || Number.isNaN(candidate.getTime())) {
    return null;
  }
  return candidate.toISOString();
}
|
|
818
|
+
/**
 * Resolves a feed item link against the feed URL.
 *
 * @param {unknown} link Candidate link; anything that is not a non-blank
 *   string is rejected instead of throwing.
 * @param {string|URL} [baseUrl] Base used to resolve relative links.
 * @returns {string|null} Absolute URL string, or `null` when the link is
 *   missing, not a string, or cannot be parsed.
 */
function normalizeFeedLink(link, baseUrl) {
  // Feed parsers may hand back objects here; `.trim()` on those would throw.
  if (typeof link !== "string" || link.trim().length === 0) {
    return null;
  }
  try {
    return new URL(link, baseUrl).href;
  } catch {
    return null;
  }
}
|
|
828
|
+
/**
 * Maps a feed object to `{ url, title, publicationDate }` entries, reading
 * `feed.items` or, failing that, `feed.entries`.
 *
 * Link candidates are tried in priority order (`link`, `url`, `id`, `guid`,
 * `guid.value`, first `links[].href`). Non-string and blank candidates are
 * skipped, so an empty `link` or an object-shaped `guid` no longer hides a
 * usable link further down the list.
 *
 * @param {object} feed Parsed feed.
 * @param {string} baseUrl Feed URL used to resolve relative links.
 * @returns {Array<{url: string, title: string, publicationDate: string|null}>}
 *   Items without any resolvable link are dropped.
 */
function normalizeFeedsmithItems(feed, baseUrl) {
  const rawItems = Array.isArray(feed?.items) ? feed.items : Array.isArray(feed?.entries) ? feed.entries : [];
  const normalized = [];
  for (const item of rawItems) {
    const candidates = [
      item?.link,
      item?.url,
      item?.id,
      item?.guid,
      item?.guid?.value,
      item?.links?.[0]?.href
    ];
    let link = null;
    for (const candidate of candidates) {
      // Only strings are handed to normalizeFeedLink.
      if (typeof candidate !== "string" || candidate.trim().length === 0) {
        continue;
      }
      link = normalizeFeedLink(candidate, baseUrl);
      if (link) {
        break;
      }
    }
    if (!link) {
      continue;
    }
    normalized.push({
      url: link,
      title: String(item?.title ?? item?.summary ?? link).trim(),
      publicationDate: toIsoDate(
        item?.pubDate ?? item?.published ?? item?.updated ?? item?.published_at ?? item?.date_published ?? item?.dc?.date
      )
    });
  }
  return normalized;
}
|
|
847
|
+
/**
 * Parses feed XML with the streaming `feedparser` library.
 *
 * @param {string} xml Feed document.
 * @param {string} feedUrl Feed URL; passed to the parser and used to resolve
 *   relative item links.
 * @returns {Promise<Array<{url: string, title: string, publicationDate: string|null}>>}
 *   Resolves on the parser's `end` event; rejects on its `error` event.
 */
async function parseWithFeedparser(xml, feedUrl) {
  const feedParser = new FeedParser({ feedurl: feedUrl });
  const collected = [];

  // Pull every entry that is currently buffered in the parser.
  const drain = () => {
    for (let entry = feedParser.read(); entry; entry = feedParser.read()) {
      const url = normalizeFeedLink(entry.link || entry.origlink, feedUrl);
      if (url) {
        collected.push({
          url,
          title: String(entry.title ?? url).trim(),
          publicationDate: toIsoDate(entry.pubdate ?? entry.date)
        });
      }
    }
  };

  return await new Promise((resolve2, reject) => {
    feedParser.on("error", reject);
    feedParser.on("readable", drain);
    feedParser.on("end", () => resolve2(collected));
    Readable.from([xml]).pipe(feedParser);
  });
}
|
|
870
|
+
/**
 * Parses a feed document into normalized items.
 *
 * `parseFeed` is tried first; if it or the normalization step throws, the
 * document is parsed again with `parseWithFeedparser`.
 *
 * @param {string} xml Feed document.
 * @param {{uri: string}} source Feed source; `uri` is the base for links.
 * @returns {Promise<Array<object>>} Normalized feed items.
 */
async function parseRssFeedDocument(xml, source) {
  let items;
  try {
    const { feed } = parseFeed(xml);
    items = normalizeFeedsmithItems(feed, source.uri);
  } catch {
    return parseWithFeedparser(xml, source.uri);
  }
  return items;
}
|
|
878
|
+
|
|
879
|
+
// src/ingest/adapters/url-adapter.ts
|
|
880
|
+
import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
|
|
881
|
+
import path9 from "path";
|
|
882
|
+
/**
 * Captures the HTTP caching headers of a response for later conditional requests.
 *
 * Absent headers are stored as `undefined` for every header field. `expires`
 * previously leaked `null`, unlike its siblings.
 *
 * @param {{headers: {get(name: string): string|null}, status: number}} response
 *   Fetch-style response.
 * @param {string} validatedAt Timestamp recorded as `lastValidatedAt`.
 * @returns {object} Cache record with `etag`, `lastModified`, `cacheControl`,
 *   `expires`, `lastValidatedAt` and `lastStatus`.
 */
function buildHttpCache(response, validatedAt) {
  const header = (name) => response.headers.get(name) ?? void 0;
  return {
    etag: header("etag"),
    lastModified: header("last-modified"),
    cacheControl: header("cache-control"),
    expires: header("expires"),
    lastValidatedAt: validatedAt,
    lastStatus: response.status
  };
}
|
|
892
|
+
/**
 * Picks the first publication date that is neither `null` nor `undefined`.
 *
 * @param {*} preferred Highest-priority value.
 * @param {*} fallback Used when `preferred` is nullish.
 * @param {*} previous Used when both others are nullish.
 * @returns {*} The chosen value, or `null` when all three are nullish.
 */
function choosePublicationDate(preferred, fallback, previous) {
  for (const candidate of [preferred, fallback, previous]) {
    if (candidate !== null && candidate !== undefined) {
      return candidate;
    }
  }
  return null;
}
|
|
895
|
+
/**
 * Converts a fetched HTML body into a stored document: writes the raw HTML
 * under `raw/<sourceId>/`, writes the normalized markdown, and returns the
 * document record.
 *
 * @param {object} options
 * @param {string} options.workspacePath Workspace root.
 * @param {object} options.source Source definition (`id`, `type` are read).
 * @param {string} options.url Page URL; also the seed of the document id.
 * @param {string} options.body Raw HTML.
 * @param {object} [options.previous] Prior record for this document, if any.
 * @param {string} options.sourceUri URI recorded as the document's origin.
 * @param {string|null} [options.publicationDate] Date supplied by the caller;
 *   takes precedence over the date found in the HTML and the prior record.
 * @param {number} options.responseStatus HTTP status stored in the metadata.
 * @returns {Promise<object>} The new document record.
 */
async function normalizeRemoteDocument({
  workspacePath,
  source,
  url,
  body,
  previous,
  sourceUri,
  publicationDate,
  responseStatus
}) {
  const { title, markdown: extractedMarkdown } = extractHtmlToMarkdown(body);
  const markdown = `# ${title}\n\n${extractedMarkdown}`;
  const documentId = stableId("doc", source.id, url);
  const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
  const rawDirectory = path9.resolve(workspacePath, "raw", source.id);
  const rawPath = path9.resolve(rawDirectory, `${sha256(url).slice(0, 12)}.html`);
  const contentHash = sha256(markdown);

  // One timestamp serves as crawl, index, first-seen and last-seen time.
  const now = new Date().toISOString();
  const unchanged = previous?.contentHash === contentHash;
  const lastChangedAt = unchanged ? previous.lastChangedAt : now;
  const resolvedPublicationDate = choosePublicationDate(
    publicationDate,
    extractPublicationDateFromHtml(body),
    previous?.publicationDate
  );

  await mkdir5(rawDirectory, { recursive: true });
  await writeFile5(rawPath, body, "utf8");
  await writeNormalizedDocument({
    documentId,
    sourceId: source.id,
    title,
    uri: url,
    sourceUri,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    indexedAt: now,
    contentHash,
    lastChangedAt,
    normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    indexedAt: now,
    extra: { status: responseStatus, contentType: "text/html" }
  });
  return {
    id: documentId,
    sourceId: source.id,
    sourceType: source.type,
    title,
    uri: url,
    sourceUri,
    mimeType: "text/html",
    rawPath,
    normalizedPath,
    contentHash,
    metadata,
    publicationDate: resolvedPublicationDate,
    crawledAt: now,
    firstSeenAt: previous?.firstSeenAt ?? now,
    lastSeenAt: now,
    lastChangedAt,
    indexedAt: now
  };
}
|
|
964
|
+
/**
 * Fetches a URL and turns it into a document record, using the previous
 * record's ETag / Last-Modified values for a conditional request.
 *
 * On `304 Not Modified` the previous record is reused, provided its raw and
 * normalized files still exist. If they are gone the page is requested again
 * without validators; the original code normalized the empty 304 body instead.
 *
 * @param {object} options
 * @param {string} options.workspacePath Workspace root.
 * @param {object} options.source Source definition.
 * @param {string} options.url URL to fetch.
 * @param {object} [options.previous] Prior record for this URL, if any.
 * @param {string} [options.sourceUri] Origin URI; defaults to `source.uri`.
 * @param {string|null} [options.publicationDate] Date supplied by the caller.
 * @returns {Promise<object>} Document record including the new `httpCache`.
 */
async function fetchUrlDocument({
  workspacePath,
  source,
  url,
  previous,
  sourceUri,
  publicationDate
}) {
  const baseHeaders = {
    "user-agent": source.crawl?.userAgent ?? "querylight-cli/0.1"
  };
  const conditionalHeaders = { ...baseHeaders };
  const cached = previous?.httpCache;
  if (cached?.etag) {
    conditionalHeaders["if-none-match"] = cached.etag;
  }
  if (cached?.lastModified) {
    conditionalHeaders["if-modified-since"] = cached.lastModified;
  }
  const effectiveSourceUri = sourceUri ?? source.uri;

  let response = await fetch(url, { headers: conditionalHeaders });
  if (response.status === 304) {
    const reusable = Boolean(previous?.rawPath)
      && await fileExists(previous.rawPath)
      && await fileExists(previous.normalizedPath);
    if (reusable) {
      const validatedAt = new Date().toISOString();
      const keptPublicationDate = publicationDate ?? previous.publicationDate ?? null;
      return {
        ...previous,
        sourceUri: effectiveSourceUri,
        publicationDate: keptPublicationDate,
        metadata: buildDocumentMetadata({
          source,
          sourceUri: effectiveSourceUri,
          publicationDate: keptPublicationDate,
          crawledAt: previous.crawledAt,
          indexedAt: previous.indexedAt,
          extra: {
            ...previous.metadata,
            status: previous.metadata.status ?? 200,
            contentType: previous.mimeType
          }
        }),
        lastSeenAt: validatedAt,
        httpCache: buildHttpCache(response, validatedAt)
      };
    }
    // A 304 has no body and the cached files are gone: fetch the full page.
    response = await fetch(url, { headers: baseHeaders });
  }

  const now = new Date().toISOString();
  const nextHttpCache = buildHttpCache(response, now);
  const body = await response.text();
  const document = await normalizeRemoteDocument({
    workspacePath,
    source,
    url,
    body,
    previous,
    sourceUri: effectiveSourceUri,
    publicationDate,
    responseStatus: response.status
  });
  const contentType = response.headers.get("content-type") ?? document.mimeType;
  return {
    ...document,
    mimeType: contentType,
    metadata: buildDocumentMetadata({
      source,
      sourceUri: effectiveSourceUri,
      publicationDate: document.publicationDate ?? null,
      crawledAt: document.crawledAt,
      indexedAt: document.indexedAt,
      extra: {
        status: response.status,
        contentType
      }
    }),
    httpCache: nextHttpCache
  };
}
|
|
1034
|
+
/**
 * Re-extracts a remote (HTML) document from its saved raw file and rewrites
 * the normalized markdown.
 *
 * @param {object} document Stored document record.
 * @param {object} source Source definition, forwarded to `buildDocumentMetadata`.
 * @returns {Promise<object|null>} The refreshed record, or `null` when the raw
 *   file is not recorded or no longer exists.
 */
async function reprocessRemoteDocument(document, source) {
  const hasRaw = Boolean(document.rawPath) && await fileExists(document.rawPath);
  if (!hasRaw) {
    return null;
  }
  const html = await readFile7(document.rawPath, "utf8");
  const { title, markdown: extractedMarkdown } = extractHtmlToMarkdown(html);
  const markdown = `# ${title}\n\n${extractedMarkdown}`;
  const contentHash = sha256(markdown);
  const timestamp = new Date().toISOString();
  // The change stamp only moves when the extracted content actually differs.
  const lastChangedAt = document.contentHash === contentHash ? document.lastChangedAt : timestamp;
  // A stored publication date wins; the HTML is consulted only without one.
  const publicationDate = document.publicationDate ?? extractPublicationDateFromHtml(html);

  await writeNormalizedDocument({
    documentId: document.id,
    sourceId: document.sourceId,
    title,
    uri: document.uri,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt: timestamp,
    contentHash,
    lastChangedAt,
    normalizedPath: document.normalizedPath,
    markdown
  });

  const metadata = buildDocumentMetadata({
    source,
    sourceUri: document.sourceUri,
    publicationDate,
    crawledAt: document.crawledAt,
    indexedAt: timestamp,
    extra: {
      ...document.metadata,
      status: document.httpCache?.lastStatus ?? document.metadata.status ?? 200,
      contentType: document.mimeType
    }
  });
  return {
    ...document,
    title,
    contentHash,
    publicationDate,
    metadata,
    lastChangedAt,
    indexedAt: timestamp
  };
}
|
|
1083
|
+
|
|
1084
|
+
// src/ingest/adapters/website-adapter.ts
|
|
1085
|
+
import { load as load2 } from "cheerio";
|
|
1086
|
+
/**
 * Downloads `/robots.txt` for the site and returns the `Disallow` paths that
 * apply to this crawler.
 *
 * Rules are read per user-agent group. If a group names this crawler (its
 * token appears in `userAgent`, case-insensitively), only those groups are
 * used; otherwise the `*` groups apply. Rules that appear before any
 * `User-agent` line are treated as applying to everyone. `#` comments are
 * stripped and everything after the first `:` is kept, so paths containing
 * `:` survive intact.
 *
 * @param {string|URL} url Any URL on the site.
 * @param {string} userAgent User-agent header value of the crawler.
 * @returns {Promise<string[]>} Disallowed path prefixes; empty when the file
 *   is missing, not OK, or the request fails.
 */
async function fetchRobotsDisallow(url, userAgent) {
  let text;
  try {
    const response = await fetch(new URL("/robots.txt", url), { headers: { "user-agent": userAgent } });
    if (!response.ok) {
      return [];
    }
    text = await response.text();
  } catch {
    // robots.txt is optional; an unreachable file means "no restrictions".
    return [];
  }

  const ownAgent = String(userAgent ?? "").toLowerCase();
  const wildcardRules = [];
  const specificRules = [];
  let groupAgents = [];
  let readingAgents = false;
  let sawAgentLine = false;
  let matchedSpecific = false;

  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.replace(/#.*$/, "").trim();
    const separator = line.indexOf(":");
    if (separator === -1) {
      continue;
    }
    const field = line.slice(0, separator).trim().toLowerCase();
    const value = line.slice(separator + 1).trim();

    if (field === "user-agent") {
      // Consecutive User-agent lines share one group; a rule line ends it.
      if (!readingAgents) {
        groupAgents = [];
        readingAgents = true;
      }
      const agent = value.toLowerCase();
      groupAgents.push(agent);
      sawAgentLine = true;
      if (agent !== "*" && agent.length > 0 && ownAgent.includes(agent)) {
        matchedSpecific = true;
      }
      continue;
    }

    readingAgents = false;
    if (field !== "disallow" || value.length === 0) {
      continue;
    }
    const namesUs = groupAgents.some((agent) => agent !== "*" && agent.length > 0 && ownAgent.includes(agent));
    if (namesUs) {
      specificRules.push(value);
    } else if (!sawAgentLine || groupAgents.includes("*")) {
      wildcardRules.push(value);
    }
  }
  return matchedSpecific ? specificRules : wildcardRules;
}
|
|
1098
|
+
/**
 * Downloads `/sitemap.xml` and returns the contents of its `<loc>` elements.
 *
 * Values may span several lines, are trimmed, have a CDATA wrapper removed,
 * and have the five predefined XML entities decoded (so `&amp;` in a query
 * string becomes `&`).
 *
 * @param {string|URL} baseUrl Any URL on the site.
 * @param {string} userAgent User-agent header value of the crawler.
 * @returns {Promise<string[]>} Listed locations; empty when the sitemap is
 *   missing, not OK, or the request fails.
 */
async function fetchSitemapUrls(baseUrl, userAgent) {
  let xml;
  try {
    const response = await fetch(new URL("/sitemap.xml", baseUrl), { headers: { "user-agent": userAgent } });
    if (!response.ok) {
      return [];
    }
    xml = await response.text();
  } catch {
    // A sitemap is optional; failing to load it must not stop the crawl.
    return [];
  }

  const entities = { "&lt;": "<", "&gt;": ">", "&quot;": "\"", "&apos;": "'", "&amp;": "&" };
  const urls = [];
  for (const [, rawLoc] of xml.matchAll(/<loc\s*>([\s\S]*?)<\/loc\s*>/g)) {
    const loc = rawLoc
      .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, "$1")
      // Single pass, so "&amp;lt;" decodes to "&lt;" and not to "<".
      .replace(/&(?:lt|gt|quot|apos|amp);/g, (entity) => entities[entity])
      .trim();
    if (loc.length > 0) {
      urls.push(loc);
    }
  }
  return urls;
}
|
|
1110
|
+
/**
 * Decides whether the crawler may visit a URL.
 *
 * A URL passes when it shares the base origin, its path does not start with
 * any disallow rule (the bare `/` rule is ignored), it contains at least one
 * include pattern if any are configured, and it contains no exclude pattern.
 *
 * @param {URL} url Candidate URL.
 * @param {URL} baseUrl Crawl root.
 * @param {string[]} includePatterns Substrings of which one must occur in the href.
 * @param {string[]} excludePatterns Substrings that must not occur in the href.
 * @param {string[]} disallowRules Path prefixes taken from robots.txt.
 * @returns {boolean} `true` when the URL may be crawled.
 */
function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules) {
  const sameOrigin = url.origin === baseUrl.origin;
  if (!sameOrigin) {
    return false;
  }
  for (const rule of disallowRules) {
    if (rule !== "/" && url.pathname.startsWith(rule)) {
      return false;
    }
  }
  const { href } = url;
  const occursInHref = (pattern) => href.includes(pattern);
  const needsInclude = includePatterns.length > 0;
  if (needsInclude && !includePatterns.some(occursInHref)) {
    return false;
  }
  return !excludePatterns.some(occursInHref);
}
|
|
1126
|
+
/**
 * Parses a URL for crawling and drops its fragment, so `page#a` and `page#b`
 * count as the same page.
 *
 * @param {string} value URL or reference to parse.
 * @param {string|URL} [base] Base used to resolve relative references.
 * @returns {URL|null} Parsed URL without fragment, or `null` when invalid.
 */
function parseCrawlUrl(value, base) {
  try {
    const parsed = new URL(value, base);
    parsed.hash = "";
    return parsed;
  } catch {
    return null;
  }
}

/**
 * Breadth-first crawl of a website starting at `source.uri`, optionally
 * seeded with the sitemap, honouring robots rules, include/exclude patterns,
 * `maxDepth`, `maxPages` and a delay between page fetches.
 *
 * Queue entries that are not valid URLs are skipped instead of aborting the
 * crawl. Pages are de-duplicated on their fragment-less URL. A page whose
 * download fails stays in the result, so the later ingest step reports the
 * failure; only its link discovery is skipped.
 *
 * @param {object} source Website source; reads `uri` and the optional `crawl` settings.
 * @returns {Promise<string[]>} Absolute URLs of the pages to ingest, in discovery order.
 */
async function crawlWebsite(source) {
  const baseUrl = new URL(source.uri);
  const crawl = source.crawl ?? {};
  const userAgent = crawl.userAgent ?? "querylight-cli/0.1";
  const includePatterns = crawl.includePatterns ?? [];
  const excludePatterns = crawl.excludePatterns ?? [];
  const maxDepth = crawl.maxDepth ?? 2;
  const maxPages = crawl.maxPages ?? 100;
  const rateLimitMs = crawl.rateLimitMs ?? 1e3;
  const disallowRules = crawl.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);

  const queue = [{ url: source.uri, depth: 0 }];
  if (crawl.useSitemap !== false) {
    for (const url of await fetchSitemapUrls(baseUrl, userAgent)) {
      queue.push({ url, depth: 1 });
    }
  }

  const seen = new Set();
  const results = [];
  // A cursor avoids the O(n) cost of queue.shift() on large crawls.
  for (let cursor = 0; cursor < queue.length && results.length < maxPages; cursor += 1) {
    const next = queue[cursor];
    const url = parseCrawlUrl(next.url);
    if (!url || seen.has(url.href)) {
      continue;
    }
    seen.add(url.href);
    if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
      continue;
    }
    results.push(url.href);
    if (next.depth >= maxDepth) {
      continue;
    }

    let html = null;
    try {
      const response = await fetch(url, { headers: { "user-agent": userAgent } });
      html = await response.text();
    } catch {
      // Link discovery is best effort; see the function comment.
      html = null;
    }
    if (html !== null) {
      const $ = load2(html);
      $("a[href]").each((_, element) => {
        const href = $(element).attr("href");
        if (!href) {
          return;
        }
        const target = parseCrawlUrl(href, url);
        if (target && !seen.has(target.href)) {
          queue.push({ url: target.href, depth: next.depth + 1 });
        }
      });
    }
    if (rateLimitMs > 0) {
      await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
    }
  }
  return results;
}
|
|
1179
|
+
|
|
1180
|
+
// src/ingest/ingest-service.ts
|
|
1181
|
+
/**
 * Location of the document catalogue inside a workspace.
 *
 * @param {string} workspacePath Workspace root.
 * @returns {string} Path to `documents/documents.jsonl`.
 */
function documentsFile(workspacePath) {
  const segments = ["documents", "documents.jsonl"];
  return path10.join(workspacePath, ...segments);
}
|
|
1184
|
+
/**
 * Reads the workspace's document catalogue.
 *
 * @param {string} workspacePath Workspace root.
 * @returns {Promise<object[]>} Whatever `readJsonl` yields for the catalogue file.
 */
async function loadDocuments(workspacePath) {
  const catalogue = documentsFile(workspacePath);
  return readJsonl(catalogue);
}
|
|
1187
|
+
/**
 * Writes the document catalogue, ordered by document id.
 *
 * @param {string} workspacePath Workspace root.
 * @param {object[]} documents Records to persist. NOTE: the array is sorted in
 *   place, so the caller's array ends up in id order as well.
 * @returns {Promise<void>}
 */
async function saveDocuments(workspacePath, documents) {
  const target = documentsFile(workspacePath);
  const byId = (left, right) => left.id.localeCompare(right.id);
  documents.sort(byId);
  await writeJsonl(target, documents);
}
|
|
1190
|
+
/**
 * Indexes document records by id.
 *
 * @param {object[]} documents Records to index.
 * @returns {Map<string, object>} Map from `id` to record; a later record
 *   replaces an earlier one with the same id.
 */
function previousMap(documents) {
  const lookup = new Map();
  for (const record of documents) {
    lookup.set(record.id, record);
  }
  return lookup;
}
|
|
1193
|
+
/**
 * Current time as an ISO-8601 string.
 *
 * @returns {string} e.g. `2024-01-02T03:04:05.678Z`.
 */
function nowStamp() {
  const current = new Date();
  return current.toISOString();
}
|
|
1196
|
+
/**
 * Builds a run identifier from the current timestamp, replacing every `:`
 * and `.` with `-`.
 *
 * @returns {string} e.g. `2024-01-02T03-04-05-678Z`.
 */
function runId() {
  const stamp = nowStamp();
  return stamp.replace(/[:.]/g, "-");
}
|
|
1199
|
+
/**
 * Reduces document records to the fields stored in a run snapshot.
 *
 * @param {object[]} documents Full document records.
 * @returns {Array<{id, title, uri, contentHash, lastChangedAt, sourceId}>}
 *   One trimmed copy per input record, in the same order.
 */
function documentSnapshot(documents) {
  const toSnapshot = ({ id, title, uri, contentHash, lastChangedAt, sourceId }) => ({
    id,
    title,
    uri,
    contentHash,
    lastChangedAt,
    sourceId
  });
  return documents.map(toSnapshot);
}
|
|
1209
|
+
/**
 * Tells whether an RSS document has outlived its retention period.
 *
 * @param {object} document Record whose `publicationDate` is inspected.
 * @param {object} source Owning source; only `type === "rss"` can expire, and
 *   `crawl.retentionDays` overrides the default.
 * @param {number} defaultRetentionDays Retention used without a source override.
 * @returns {boolean} `true` when the publication date is older than the
 *   retention window; `false` for non-RSS sources and missing or unparseable dates.
 */
function shouldExpireRssDocument(document, source, defaultRetentionDays) {
  const isDatedRssDocument = source.type === "rss" && Boolean(document.publicationDate);
  if (!isDatedRssDocument) {
    return false;
  }
  const publishedMs = new Date(document.publicationDate).getTime();
  if (Number.isNaN(publishedMs)) {
    return false;
  }
  const retentionDays = source.crawl?.retentionDays ?? defaultRetentionDays;
  const retentionMs = retentionDays * 24 * 60 * 60 * 1e3;
  return publishedMs < Date.now() - retentionMs;
}
|
|
1221
|
+
/**
 * Removes the chunks and stored artifacts of the given documents.
 *
 * @param {string} workspacePath Workspace root.
 * @param {Set<string>} documentIds Ids of the documents to purge.
 * @param {object[]} documents Candidate records; `deleteDocumentArtifacts` is
 *   called for each one whose id is in `documentIds`.
 * @returns {Promise<void>} Resolves once chunks are saved (only if any were
 *   removed) and all artifact deletions have settled successfully.
 */
async function purgeDocuments(workspacePath, documentIds, documents) {
  if (documentIds.size === 0) {
    return;
  }
  const allChunks = await loadChunks(workspacePath);
  const survivors = allChunks.filter(({ documentId }) => !documentIds.has(documentId));
  const removedAny = survivors.length !== allChunks.length;
  if (removedAny) {
    await saveChunks(workspacePath, survivors);
  }
  const deletions = [];
  for (const record of documents) {
    if (documentIds.has(record.id)) {
      deletions.push(deleteDocumentArtifacts(record));
    }
  }
  await Promise.all(deletions);
}
|
|
1234
|
+
/**
 * Downloads the feed document of an RSS source.
 *
 * @param {object} source Feed source; `uri` is fetched with the configured
 *   (or default) user agent.
 * @returns {Promise<string>} Response body.
 * @throws {Error} `failed to fetch feed: <status>` when the response is not OK.
 */
async function fetchFeedText(source) {
  const userAgent = source.crawl?.userAgent ?? "querylight-cli/0.1";
  const response = await fetch(source.uri, { headers: { "user-agent": userAgent } });
  if (response.ok) {
    return response.text();
  }
  throw new Error(`failed to fetch feed: ${response.status}`);
}
|
|
1245
|
+
/**
 * Ingests every article linked from an RSS source.
 *
 * @param {object} options
 * @param {string} options.workspacePath Workspace root.
 * @param {object} options.source RSS source definition.
 * @param {Map<string, object>} options.previous Prior records by document id.
 * @param {Map<string, object>} options.nextDocuments Receives each fetched document.
 * @param {(uri: string, error: unknown) => void} options.onFailure Called for
 *   each article that could not be fetched.
 * @returns {Promise<{added: number, changed: number, unchanged: number, failed: number}>}
 * @throws {Error} When the source sets `crawl.fetchArticles` to `false`, or
 *   when the feed itself cannot be fetched or parsed.
 */
async function ingestRssSource({
  workspacePath,
  source,
  previous,
  nextDocuments,
  onFailure
}) {
  if (source.crawl?.fetchArticles === false) {
    throw new Error("rss sources require article fetching");
  }
  const feedXml = await fetchFeedText(source);
  const feedItems = await parseRssFeedDocument(feedXml, source);
  const tally = { added: 0, changed: 0, unchanged: 0, failed: 0 };

  for (const { url, publicationDate } of feedItems) {
    try {
      const earlier = previous.get(stableId("doc", source.id, url));
      const fetched = await fetchUrlDocument({
        workspacePath,
        source,
        url,
        previous: earlier,
        sourceUri: source.uri,
        publicationDate
      });
      nextDocuments.set(fetched.id, fetched);
      let outcome = "unchanged";
      if (!earlier) {
        outcome = "added";
      } else if (earlier.contentHash !== fetched.contentHash) {
        outcome = "changed";
      }
      tally[outcome] += 1;
    } catch (error) {
      tally.failed += 1;
      onFailure(url, error);
    }
  }
  return tally;
}
|
|
1287
|
+
/**
 * Ingests all enabled sources (optionally restricted to `sourceIds`), expires
 * old RSS documents, saves the catalogue and records an ingest run.
 *
 * Each produced document is classified as added / changed / unchanged by
 * comparing it with the prior record that has the SAME document id. The
 * original probed with the URI handed to `ingestOne`, which for inline
 * markdown/text sources was the content itself, so those documents always
 * counted as "added" and failure records carried the whole content.
 *
 * @param {object} options
 * @param {string} options.workspacePath Workspace root.
 * @param {string[]} [options.sourceIds] When given, only these sources run.
 * @param {boolean} [options.changedOnly=false] Recorded in the run summary only.
 * @returns {Promise<{runId: string, documents: {added: number, changed: number,
 *   unchanged: number, failed: number}, processedSources: number}>}
 */
async function ingestSources({
  workspacePath,
  sourceIds,
  changedOnly = false
}) {
  const config = await loadConfig(workspacePath);
  const defaultRetentionDays = config.crawler.retentionDays;
  const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
  const existing = await loadDocuments(workspacePath);
  const previous = previousMap(existing);
  const nextDocuments = new Map(existing.map((document) => [document.id, document]));
  let added = 0;
  let changed = 0;
  let unchanged = 0;
  let failed = 0;
  const failures = [];
  const describeError = (error) => (error instanceof Error ? error.message : String(error));

  for (const source of sources) {
    const priorFor = (uri) => previous.get(stableId("doc", source.id, uri));
    const recordFailure = (uri, error) => {
      failures.push({
        sourceId: source.id,
        uri,
        message: describeError(error)
      });
    };
    // Runs one producer; a failure is recorded and never aborts the source.
    const ingestOne = async (uri, producer) => {
      try {
        const document = await producer();
        const earlier = previous.get(document.id);
        nextDocuments.set(document.id, document);
        if (!earlier) {
          added += 1;
        } else if (earlier.contentHash !== document.contentHash) {
          changed += 1;
        } else {
          unchanged += 1;
        }
      } catch (error) {
        failed += 1;
        recordFailure(uri, error);
      }
    };

    try {
      if (source.type === "file") {
        await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: priorFor(source.uri) }));
        continue;
      }
      if (source.type === "directory") {
        for (const filePath of await listDirectoryFiles(source)) {
          await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: priorFor(filePath) }));
        }
        continue;
      }
      if (source.type === "url") {
        await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: priorFor(source.uri) }));
        continue;
      }
      if (source.type === "website") {
        for (const url of await crawlWebsite(source)) {
          await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: priorFor(url) }));
        }
        continue;
      }
      if (source.type === "rss") {
        const result = await ingestRssSource({
          workspacePath,
          source,
          previous,
          nextDocuments,
          onFailure: recordFailure
        });
        added += result.added;
        changed += result.changed;
        unchanged += result.unchanged;
        failed += result.failed;
        continue;
      }
      if (source.type === "markdown" || source.type === "text") {
        // For inline sources `source.uri` holds the content; use a synthetic URI.
        const inlineUri = `inline:${source.id}`;
        await ingestOne(inlineUri, () => ingestInlineContent({
          workspacePath,
          source,
          title: source.name,
          content: source.uri,
          uri: inlineUri,
          previous: priorFor(inlineUri)
        }));
      }
    } catch (error) {
      // Source-level failure (feed fetch, crawl, directory listing, ...).
      failed += 1;
      recordFailure(source.uri, error);
    }
  }

  const sourceById = new Map(sources.map((source) => [source.id, source]));
  const expiringDocuments = [...nextDocuments.values()].filter((document) => {
    const source = sourceById.get(document.sourceId);
    return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
  });
  if (expiringDocuments.length > 0) {
    const expiredIds = new Set(expiringDocuments.map((document) => document.id));
    for (const document of expiringDocuments) {
      nextDocuments.delete(document.id);
    }
    // The Set drops records present in both lists, so none is purged twice.
    await purgeDocuments(workspacePath, expiredIds, [...new Set([...existing, ...expiringDocuments])]);
  }

  const finalDocuments = [...nextDocuments.values()];
  await saveDocuments(workspacePath, finalDocuments);
  const id = runId();
  const run = {
    id,
    kind: "ingest",
    createdAt: nowStamp(),
    success: failed === 0,
    summary: {
      processedSources: sources.length,
      added,
      changed,
      unchanged,
      failed,
      changedOnly
    },
    failures,
    documentsSnapshot: documentSnapshot(finalDocuments)
  };
  await writeRun(workspacePath, run);
  return {
    runId: id,
    documents: { added, changed, unchanged, failed },
    processedSources: sources.length
  };
}
|
|
1423
|
+
/**
 * Re-runs extraction for stored documents (optionally limited to one source
 * and/or one document), saves the catalogue and records a reprocess run.
 *
 * @param {object} options
 * @param {string} options.workspacePath Workspace root.
 * @param {string} [options.sourceId] Only documents of this source.
 * @param {string} [options.documentId] Only this document.
 * @returns {Promise<{runId: string, documentsReprocessed: number, documentsSkipped: number}>}
 *   A document is skipped when its source is unknown, its raw file is missing,
 *   or the reprocessor returns nothing.
 */
async function reprocessDocuments({
  workspacePath,
  sourceId,
  documentId
}) {
  const documents = await loadDocuments(workspacePath);
  const sources = await listSources(workspacePath);
  const sourceMap = new Map();
  for (const source of sources) {
    sourceMap.set(source.id, source);
  }
  const nextDocuments = new Map(documents.map((document) => [document.id, document]));
  // Source types whose raw artifact is fetched HTML.
  const remoteTypes = ["url", "website", "rss"];
  const isTargeted = (candidate) => {
    if (sourceId && candidate.sourceId !== sourceId) {
      return false;
    }
    return !documentId || candidate.id === documentId;
  };

  let documentsReprocessed = 0;
  let documentsSkipped = 0;
  for (const document of documents.filter(isTargeted)) {
    const source = sourceMap.get(document.sourceId);
    const hasRaw = Boolean(source) && Boolean(document.rawPath) && await fileExists(document.rawPath);
    let updated = null;
    if (hasRaw) {
      updated = remoteTypes.includes(source.type)
        ? await reprocessRemoteDocument(document, source)
        : await reprocessStoredDocument(document, source);
    }
    if (!updated) {
      documentsSkipped += 1;
      continue;
    }
    nextDocuments.set(updated.id, updated);
    documentsReprocessed += 1;
  }

  const finalDocuments = [...nextDocuments.values()];
  await saveDocuments(workspacePath, finalDocuments);
  const id = runId();
  await writeRun(workspacePath, {
    id,
    kind: "reprocess",
    createdAt: nowStamp(),
    success: true,
    summary: {
      documentsReprocessed,
      documentsSkipped
    },
    documentsSnapshot: documentSnapshot(finalDocuments)
  });
  return { runId: id, documentsReprocessed, documentsSkipped };
}
|
|
1464
|
+
|
|
1465
|
+
// src/chunk/chunker.ts
|
|
1466
|
+
import { readFile as readFile8 } from "fs/promises";
|
|
1467
|
+
import matter2 from "gray-matter";
|
|
1468
|
+
import path11 from "path";
|
|
1469
|
+
/**
 * Splits markdown into sections at ATX headings (`#` through `######`).
 *
 * Each section carries the heading trail leading to it and its text,
 * including the heading line itself. Lines inside fenced code blocks
 * (three or more backticks or tildes) are never treated as headings, so a
 * `# comment` in a code sample does not start a new section. Both LF and
 * CRLF line endings are accepted; the output uses LF.
 *
 * @param {string} markdown Markdown body without front matter.
 * @returns {Array<{headingPath: string[], text: string}>} Non-empty sections
 *   in document order.
 */
function splitSections(markdown) {
  const sections = [];
  let headingPath = [];
  let current = [];
  // Marker of the fence currently open (e.g. three backticks), or null.
  let openFence = null;

  const flush = () => {
    const text = current.join("\n").trim();
    if (text.length > 0) {
      sections.push({ headingPath: [...headingPath], text });
    }
    current = [];
  };

  for (const line of markdown.split(/\r?\n/)) {
    const fence = /^ {0,3}(`{3,}|~{3,})(.*)$/.exec(line);
    if (openFence !== null) {
      // A fence closes on the same character, at least as long, with nothing after.
      const closes = fence !== null
        && fence[1][0] === openFence[0]
        && fence[1].length >= openFence.length
        && fence[2].trim() === "";
      if (closes) {
        openFence = null;
      }
      current.push(line);
      continue;
    }
    if (fence !== null) {
      openFence = fence[1];
      current.push(line);
      continue;
    }
    const match = /^(#{1,6})\s+(.+)$/.exec(line);
    if (match?.[1] && match[2]) {
      flush();
      const level = match[1].length;
      // Keep the ancestors above this level, then append the new heading.
      headingPath = [...headingPath.slice(0, level - 1), match[2].trim()];
    }
    current.push(line);
  }
  flush();
  return sections;
}
|
|
1495
|
+
/**
 * Splits a long section into overlapping pieces of at most `maxChars`
 * characters, preferring to cut at a paragraph break in the second half of
 * the window.
 *
 * The loop stops once a piece reaches the end of the text. The original
 * emitted one more piece holding only the last `overlapChars` characters,
 * all of which were already in the previous piece. A non-positive `maxChars`
 * returns the text unsplit instead of looping forever.
 *
 * @param {string} text Section text.
 * @param {number} maxChars Maximum piece length.
 * @param {number} overlapChars Characters repeated at the start of the next
 *   piece; negative or non-finite values count as 0.
 * @returns {string[]} Trimmed, non-empty pieces; `[text]` when no split is needed.
 */
function splitLongSection(text, maxChars, overlapChars) {
  if (text.length <= maxChars || !(maxChars > 0)) {
    return [text];
  }
  const overlap = Number.isFinite(overlapChars) ? Math.max(0, overlapChars) : 0;
  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const hardEnd = Math.min(text.length, start + maxChars);
    let sliceEnd = hardEnd;
    if (hardEnd < text.length) {
      const paragraphBreak = text.slice(start, hardEnd).lastIndexOf("\n\n");
      // Cut early only if the piece keeps more than half a window and more than the overlap.
      if (paragraphBreak > maxChars / 2 && paragraphBreak > overlap) {
        sliceEnd = start + paragraphBreak;
      }
    }
    const slice = text.slice(start, sliceEnd).trim();
    if (slice.length > 0) {
      chunks.push(slice);
    }
    if (sliceEnd >= text.length) {
      // The tail is covered; stepping back by the overlap would only repeat it.
      break;
    }
    const nextStart = sliceEnd - overlap;
    // Always move forward, even when the overlap is as large as the window.
    start = slice.length > 0 && nextStart > start ? nextStart : hardEnd;
  }
  return chunks;
}
|
|
1523
|
+
/**
 * Rough token count for a text: one token per four characters, rounded up.
 *
 * @param {string} text Text to measure.
 * @returns {number} Estimated token count.
 */
function estimateTokens(text) {
  const charsPerToken = 4;
  const { length } = text;
  return Math.ceil(length / charsPerToken);
}
|
|
1526
|
+
/**
 * Builds the chunk records for one document from its normalized markdown.
 *
 * The front matter is stripped, the body is split into heading sections and
 * long sections into pieces. A section that yields a single piece shorter
 * than `min(40, minChars)` is dropped. Chunk ids come from the document id,
 * the heading trail and the text.
 *
 * @param {object} document Document record the chunks belong to.
 * @param {string} markdown Normalized markdown, front matter included.
 * @param {object} config Reads `index.chunking.{maxChars, overlapChars, minChars}`.
 * @param {Map<string, object>} [prior] Existing chunks by id, used to carry
 *   over `firstSeenAt` / `lastChangedAt`.
 * @param {string} [seenAt] Timestamp stored as `lastSeenAt`; defaults to now.
 * @returns {object[]} Chunk records in document order.
 */
function buildChunksForDocument(document, markdown, config, prior = new Map(), seenAt = new Date().toISOString()) {
  const { content } = matter2(markdown);
  const detected = splitSections(content);
  // Without any section, treat the whole body as one section named after the document.
  const sections = detected.length > 0 ? detected : [{ headingPath: [document.title], text: content }];
  const { maxChars, overlapChars, minChars } = config.index.chunking;
  const tinyThreshold = Math.min(40, minChars);

  return sections.flatMap((section) => {
    const pieces = splitLongSection(section.text, maxChars, overlapChars);
    const headingKey = section.headingPath.join(" > ");
    const isLonePiece = pieces.length === 1;
    return pieces
      .map((piece) => piece.trim())
      .filter((text) => !(isLonePiece && text.length < tinyThreshold))
      .map((text) => {
        const id = stableId("chunk", document.id, headingKey, text);
        const priorChunk = prior.get(id);
        const contentHash = sha256(text);
        const sameContent = priorChunk?.contentHash === contentHash;
        return {
          id,
          documentId: document.id,
          sourceId: document.sourceId,
          title: document.title,
          uri: document.uri,
          headingPath: section.headingPath,
          text,
          tokenEstimate: estimateTokens(text),
          contentHash,
          metadata: document.metadata,
          firstSeenAt: priorChunk?.firstSeenAt ?? document.firstSeenAt,
          lastSeenAt: seenAt,
          lastChangedAt: sameContent ? priorChunk.lastChangedAt : document.lastChangedAt
        };
      });
  });
}
|
|
1560
|
+
/**
 * Re-chunks the documents of a workspace and persists the result.
 *
 * Documents are read from `documents/documents.jsonl`. When `sourceId`
 * and/or `documentId` are given, only matching documents are re-chunked;
 * stored chunks belonging to other documents are carried over unchanged.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {string} [options.sourceId] - Restrict to documents of this source.
 * @param {string} [options.documentId] - Restrict to this document.
 * @returns {Promise<{chunksWritten: number}>} Total number of chunks saved.
 */
async function chunkDocuments({
  workspacePath,
  sourceId,
  documentId
}) {
  const config = await loadConfig(workspacePath);
  const documentsFile = path11.join(workspacePath, "documents", "documents.jsonl");
  const allDocuments = await readJsonl(documentsFile);
  const isTargeted = (document) => {
    if (sourceId && document.sourceId !== sourceId) {
      return false;
    }
    if (documentId && document.id !== documentId) {
      return false;
    }
    return true;
  };
  const targets = allDocuments.filter(isTargeted);
  const targetedDocumentIds = new Set(targets.map((document) => document.id));
  const existingChunks = await loadChunks(workspacePath);
  const prior = new Map();
  const nextChunks = new Map();
  for (const chunk of existingChunks) {
    prior.set(chunk.id, chunk);
    // Chunks of documents that are not being re-chunked are kept as they are.
    if (!targetedDocumentIds.has(chunk.documentId)) {
      nextChunks.set(chunk.id, chunk);
    }
  }
  for (const document of targets) {
    const raw = await readFile8(document.normalizedPath, "utf8");
    const rebuilt = buildChunksForDocument(document, raw, config, prior);
    for (const chunk of rebuilt) {
      nextChunks.set(chunk.id, chunk);
    }
  }
  await saveChunks(workspacePath, [...nextChunks.values()]);
  return { chunksWritten: nextChunks.size };
}
|
|
1583
|
+
|
|
1584
|
+
// src/index/querylight-indexer.ts
|
|
1585
|
+
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1586
|
+
import path17 from "path";
|
|
1587
|
+
|
|
1588
|
+
// src/vector/dense.ts
|
|
1589
|
+
import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
|
|
1590
|
+
import { mkdir as mkdir7 } from "fs/promises";
|
|
1591
|
+
import path14 from "path";
|
|
1592
|
+
|
|
1593
|
+
// src/vector/runtime.ts
|
|
1594
|
+
import path12 from "path";
|
|
1595
|
+
import { fileURLToPath } from "url";
|
|
1596
|
+
import { execFile, execFileSync } from "child_process";
|
|
1597
|
+
/**
 * Resolves a configured cache directory to an absolute path.
 *
 * Absolute paths are returned untouched. Relative paths have a leading
 * `.kb/` removed and are then resolved against the workspace directory.
 *
 * @param {string} workspacePath - Workspace directory.
 * @param {string} configuredPath - Cache directory from the configuration.
 * @returns {string} Cache directory path.
 */
function resolveCacheDir(workspacePath, configuredPath) {
  if (path12.isAbsolute(configuredPath)) {
    return configuredPath;
  }
  const withoutKbPrefix = configuredPath.replace(/^\.kb\//, "");
  return path12.resolve(workspacePath, withoutKbPrefix);
}
|
|
1600
|
+
/**
 * Returns the parent of the directory that contains the given module.
 *
 * @param {string} importMetaUrl - A `file:` URL, typically `import.meta.url`.
 * @returns {string} Absolute path one level above the module's directory.
 */
function packageRootFromImportMeta(importMetaUrl) {
  const modulePath = fileURLToPath(importMetaUrl);
  const moduleDir = path12.dirname(modulePath);
  return path12.resolve(moduleDir, "..");
}
|
|
1603
|
+
/**
 * Locates the `scripts/sparse-encode.py` helper.
 *
 * Two locations are probed in order: inside the package root derived from
 * `importMetaUrl`, and one directory above it.
 *
 * @param {string} importMetaUrl - `import.meta.url` of the calling module.
 * @returns {Promise<string>} Absolute path of the first candidate that exists.
 * @throws {Error} When neither candidate exists.
 */
async function sparseScriptPath(importMetaUrl) {
  const base = packageRootFromImportMeta(importMetaUrl);
  const scriptSegments = ["scripts", "sparse-encode.py"];
  const candidates = [
    path12.join(base, ...scriptSegments),
    path12.join(base, "..", ...scriptSegments)
  ];
  for (let position = 0; position < candidates.length; position += 1) {
    const candidate = candidates[position];
    const found = await fileExists(candidate);
    if (found) {
      return path12.resolve(candidate);
    }
  }
  throw new Error(`sparse helper script not found; checked ${candidates.join(", ")}`);
}
|
|
1616
|
+
/**
 * Checks that the `uv` executable can be run by invoking `uv --version`.
 *
 * @returns {Promise<void>} Resolves when the command succeeds; rejects with
 *   the `execFile` error otherwise.
 */
async function ensureUvAvailable() {
  await new Promise((resolve2, reject) => {
    const onExit = (error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve2();
    };
    execFile("uv", ["--version"], onExit);
  });
}
|
|
1621
|
+
/**
 * Runs the sparse-encoding Python helper through `uv run` and returns its
 * standard output.
 *
 * The payload is serialized to JSON and passed on stdin. `HF_HOME` is set
 * to the resolved cache directory for the child process.
 *
 * Note: the child is started with `execFileSync`, so the call blocks until
 * the process exits even though this function is declared `async`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.config - Sparse retrieval config; `cacheDir` is read.
 * @param {object} options.payload - Request object sent to the script.
 * @param {string} options.importMetaUrl - Used to locate the helper script.
 * @returns {Promise<string>} The script's stdout decoded as UTF-8.
 */
async function runSparsePython({
  workspacePath,
  config,
  payload,
  importMetaUrl
}) {
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const scriptPath = await sparseScriptPath(importMetaUrl);
  const pythonPackages = ["torch", "transformers", "huggingface_hub"];
  const uvArguments = ["run"];
  for (const packageName of pythonPackages) {
    uvArguments.push("--with", packageName);
  }
  uvArguments.push("python", scriptPath);
  const spawnOptions = {
    encoding: "utf8",
    maxBuffer: 1024 * 1024 * 1024,
    input: JSON.stringify(payload),
    env: {
      ...process.env,
      HF_HOME: cacheDir
    }
  };
  return execFileSync("uv", uvArguments, spawnOptions);
}
|
|
1653
|
+
/**
 * Lazily imports `@huggingface/transformers` and configures it.
 *
 * Sets `env.cacheDir` to the given directory and enables local models.
 *
 * @param {string} cacheDir - Directory assigned to `env.cacheDir`.
 * @returns {Promise<{env: object, pipeline: Function}>} The library's `env` and `pipeline` exports.
 */
async function getDenseTransformersRuntime(cacheDir) {
  const library = await import("@huggingface/transformers");
  const { env } = library;
  env.cacheDir = cacheDir;
  env.allowLocalModels = true;
  return { env, pipeline: library.pipeline };
}
|
|
1662
|
+
|
|
1663
|
+
// src/vector/store.ts
|
|
1664
|
+
import { mkdir as mkdir6, readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
|
|
1665
|
+
import path13 from "path";
|
|
1666
|
+
/**
 * Directory holding vector artifacts inside a workspace.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} `<workspacePath>/vectors`.
 */
function vectorsDir(workspacePath) {
  const subdirectory = "vectors";
  return path13.join(workspacePath, subdirectory);
}
|
|
1669
|
+
/**
 * Directory holding model marker files inside a workspace.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} `<workspacePath>/models`.
 */
function modelsDir(workspacePath) {
  const subdirectory = "models";
  return path13.join(workspacePath, subdirectory);
}
|
|
1672
|
+
/**
 * Path of the latest dense vector payload file.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} Path to `dense.latest.json` inside the vectors directory.
 */
function denseVectorPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "dense.latest.json");
}
|
|
1675
|
+
/**
 * Path of the metadata file that accompanies the latest dense payload.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} Path to `dense.latest.meta.json` inside the vectors directory.
 */
function denseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "dense.latest.meta.json");
}
|
|
1678
|
+
/**
 * Path of the latest sparse vector payload file.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} Path to `sparse.latest.json` inside the vectors directory.
 */
function sparseVectorPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "sparse.latest.json");
}
|
|
1681
|
+
/**
 * Path of the metadata file that accompanies the latest sparse payload.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} Path to `sparse.latest.meta.json` inside the vectors directory.
 */
function sparseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path13.join(directory, "sparse.latest.meta.json");
}
|
|
1684
|
+
/**
 * Path of the marker file `dense.pulled.json` in the models directory.
 * `buildModelStatus` reports the dense model as available when it exists.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} Marker file path.
 */
function densePullMarker(workspacePath) {
  const directory = modelsDir(workspacePath);
  return path13.join(directory, "dense.pulled.json");
}
|
|
1687
|
+
/**
 * Path of the marker file `sparse.pulled.json` in the models directory.
 * `buildModelStatus` reports the sparse model as available when it exists.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {string} Marker file path.
 */
function sparsePullMarker(workspacePath) {
  const directory = modelsDir(workspacePath);
  return path13.join(directory, "sparse.pulled.json");
}
|
|
1690
|
+
/**
 * Persists a dense vector payload and, separately, its metadata.
 *
 * Creates the vectors directory if needed, then writes the full payload
 * followed by `payload.metadata`, both as pretty-printed JSON.
 *
 * @param {string} workspacePath - Workspace directory.
 * @param {object} payload - Dense payload; must carry a `metadata` property.
 * @returns {Promise<void>}
 */
async function writeDensePayload(workspacePath, payload) {
  const toPrettyJson = (value) => JSON.stringify(value, null, 2);
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
  await writeFile6(denseVectorPath(workspacePath), toPrettyJson(payload), "utf8");
  await writeFile6(denseMetaPath(workspacePath), toPrettyJson(payload.metadata), "utf8");
}
|
|
1695
|
+
/**
 * Loads and parses the latest dense vector payload.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {Promise<object>} Parsed JSON content of the dense payload file.
 */
async function readDensePayload(workspacePath) {
  const filePath = denseVectorPath(workspacePath);
  const serialized = await readFile9(filePath, "utf8");
  return JSON.parse(serialized);
}
|
|
1698
|
+
/**
 * Persists a sparse vector payload and, separately, its metadata.
 *
 * Creates the vectors directory if needed, then writes the full payload
 * followed by `payload.metadata`, both as pretty-printed JSON.
 *
 * @param {string} workspacePath - Workspace directory.
 * @param {object} payload - Sparse payload; must carry a `metadata` property.
 * @returns {Promise<void>}
 */
async function writeSparsePayload(workspacePath, payload) {
  const toPrettyJson = (value) => JSON.stringify(value, null, 2);
  await mkdir6(vectorsDir(workspacePath), { recursive: true });
  await writeFile6(sparseVectorPath(workspacePath), toPrettyJson(payload), "utf8");
  await writeFile6(sparseMetaPath(workspacePath), toPrettyJson(payload.metadata), "utf8");
}
|
|
1703
|
+
/**
 * Loads and parses the latest sparse vector payload.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {Promise<object>} Parsed JSON content of the sparse payload file.
 */
async function readSparsePayload(workspacePath) {
  const filePath = sparseVectorPath(workspacePath);
  const serialized = await readFile9(filePath, "utf8");
  return JSON.parse(serialized);
}
|
|
1706
|
+
/**
 * Summarizes the state of the dense and sparse retrieval models.
 *
 * For each model it reports whether it is enabled in the config, its model
 * id, the resolved cache directory, whether its pull marker file exists
 * (`available`) and whether its vector artifact exists (`artifactExists`).
 * The sparse entry additionally echoes `uvAvailable`.
 *
 * @param {string} workspacePath - Workspace directory.
 * @param {object} dense - Dense config (`enabled`, `modelId`, `cacheDir`).
 * @param {object} sparse - Sparse config (`enabled`, `modelId`, `cacheDir`).
 * @param {boolean} uvAvailable - Result of the caller's `uv` check.
 * @returns {Promise<{dense: object, sparse: object}>}
 */
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
  const sparseCacheDir = resolveCacheDir(workspacePath, sparse.cacheDir);
  // The file checks run one after another, dense first.
  const densePulled = await fileExists(densePullMarker(workspacePath));
  const denseArtifact = await fileExists(denseVectorPath(workspacePath));
  const sparsePulled = await fileExists(sparsePullMarker(workspacePath));
  const sparseArtifact = await fileExists(sparseVectorPath(workspacePath));
  const denseStatus = {
    configured: dense.enabled,
    modelId: dense.modelId,
    cacheDir: denseCacheDir,
    available: densePulled,
    artifactExists: denseArtifact
  };
  const sparseStatus = {
    configured: sparse.enabled,
    modelId: sparse.modelId,
    cacheDir: sparseCacheDir,
    uvAvailable,
    available: sparsePulled,
    artifactExists: sparseArtifact
  };
  return { dense: denseStatus, sparse: sparseStatus };
}
|
|
1727
|
+
|
|
1728
|
+
// src/vector/text.ts
|
|
1729
|
+
/**
 * Builds the text that is embedded for a chunk in the dense index.
 *
 * Title, every heading-path entry and the chunk text are joined with blank
 * lines; falsy parts are left out.
 *
 * @param {{title: string, headingPath: string[], text: string}} chunk
 * @returns {string}
 */
function createDenseChunkText(chunk) {
  const { title, headingPath, text } = chunk;
  const parts = [];
  for (const part of [title, ...headingPath, text]) {
    if (part) {
      parts.push(part);
    }
  }
  return parts.join("\n\n");
}
|
|
1732
|
+
/**
 * Builds the text that is encoded for a chunk in the sparse index.
 *
 * Title, every heading-path entry and the chunk text are joined with blank
 * lines; falsy parts are left out.
 *
 * @param {{title: string, headingPath: string[], text: string}} chunk
 * @returns {string}
 */
function createSparseChunkText(chunk) {
  const candidates = [chunk.title, ...chunk.headingPath, chunk.text];
  const present = candidates.filter((part) => Boolean(part));
  return present.join("\n\n");
}
|
|
1735
|
+
|
|
1736
|
+
// src/vector/dense.ts
|
|
1737
|
+
var denseEmbedderFactory = null;
|
|
1738
|
+
/**
 * Creates a function that turns text into a dense embedding vector.
 *
 * When `denseEmbedderFactory` is set, it is used instead of the real model.
 * Otherwise a `feature-extraction` pipeline is loaded for `modelId` and
 * called with mean pooling and normalization.
 *
 * @param {string} cacheDir - Model cache directory.
 * @param {string} modelId - Model identifier passed to the pipeline.
 * @returns {Promise<(text: string) => Promise<number[]>>} Embedding function.
 */
async function createEmbedder(cacheDir, modelId) {
  if (denseEmbedderFactory) {
    return denseEmbedderFactory(cacheDir, modelId);
  }
  const runtime = await getDenseTransformersRuntime(cacheDir);
  const extractor = await runtime.pipeline("feature-extraction", modelId);
  const embed = async (text) => {
    const output = await extractor(text, { pooling: "mean", normalize: true });
    // Only the first row of the returned list is used.
    const rows = output.tolist();
    return rows[0];
  };
  return embed;
}
|
|
1749
|
+
/**
 * Embeds every chunk of the workspace, builds a `VectorFieldIndex` over the
 * embeddings and writes the resulting dense payload to disk.
 *
 * Chunks are read from `chunks/chunks.jsonl` and embedded one at a time.
 * The vector dimension is taken from the first embedding with a non-zero
 * length.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.config - Dense config (`cacheDir`, `modelId`, `indexHashTables`, `indexRandomSeed`).
 * @returns {Promise<object>} The payload that was written (`metadata`, `indexState`, `chunks`).
 */
async function buildDenseVectors({
  workspacePath,
  config
}) {
  const chunksFile = path14.join(workspacePath, "chunks", "chunks.jsonl");
  const chunks = await readJsonl(chunksFile);
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  await mkdir7(cacheDir, { recursive: true });
  const embed = await createEmbedder(cacheDir, config.modelId);
  const records = [];
  let dimensions = 0;
  for (const chunk of chunks) {
    const embedding = await embed(createDenseChunkText(chunk));
    if (!dimensions) {
      dimensions = embedding.length;
    }
    const { id: chunkId, documentId, sourceId, title, uri, headingPath, text } = chunk;
    records.push({ chunkId, documentId, sourceId, title, uri, headingPath, text, embedding });
  }
  const index = new VectorFieldIndex({
    numHashTables: config.indexHashTables,
    dimensions,
    random: createSeededRandom(config.indexRandomSeed)
  });
  records.forEach((record) => {
    index.insert(record.chunkId, [record.embedding]);
  });
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
  const indexHash = sha256(JSON.stringify(index.indexState));
  const metadata = {
    createdAt,
    modelId: config.modelId,
    dimensions,
    hashTables: config.indexHashTables,
    randomSeed: config.indexRandomSeed,
    chunkCount: records.length,
    indexHash
  };
  const payload = {
    metadata,
    indexState: index.indexState,
    chunks: records
  };
  await writeDensePayload(workspacePath, payload);
  return payload;
}
|
|
1798
|
+
/**
 * Runs a dense (embedding) query against the stored dense index.
 *
 * The index is rebuilt from the persisted payload using the hash-table
 * count, dimension and random seed recorded in its metadata.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.config - Dense config (`cacheDir`, `modelId`).
 * @param {string} options.query - Query text to embed.
 * @param {number} options.topK - Passed as the second argument to `index.query`.
 * @returns {Promise<*>} Whatever `VectorFieldIndex#query` returns.
 */
async function denseQuery({
  workspacePath,
  config,
  query,
  topK
}) {
  const payload = await readDensePayload(workspacePath);
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const embed = await createEmbedder(cacheDir, config.modelId);
  const vector = await embed(query);
  const { hashTables, dimensions, randomSeed } = payload.metadata;
  const emptyIndex = new VectorFieldIndex({
    numHashTables: hashTables,
    dimensions,
    random: createSeededRandom(randomSeed)
  });
  const index = emptyIndex.loadState(payload.indexState);
  return index.query(vector, topK);
}
|
|
1815
|
+
|
|
1816
|
+
// src/vector/sparse.ts
|
|
1817
|
+
import { SparseVectorFieldIndex } from "@tryformation/querylight-ts";
|
|
1818
|
+
import { mkdir as mkdir8 } from "fs/promises";
|
|
1819
|
+
import path15 from "path";
|
|
1820
|
+
var sparseQueryEncoderFactory = null;
|
|
1821
|
+
var sparseDocumentBuilderFactory = null;
|
|
1822
|
+
/**
 * Builds a sparse query vector from token ids and a per-token weight table.
 *
 * Each distinct token id is looked up in `tokenWeights`; ids whose weight
 * is missing or not greater than zero are left out.
 *
 * @param {Iterable<number>} tokenIds - Token ids of the query (duplicates allowed).
 * @param {object|number[]} tokenWeights - Weight lookup indexed by token id.
 * @returns {Object<string, number>} Map from token id (as string) to weight.
 */
function buildSparseQueryVector(tokenIds, tokenWeights) {
  const vector = {};
  new Set(tokenIds).forEach((tokenId) => {
    const weight = tokenWeights[tokenId] ?? 0;
    if (!(weight > 0)) {
      return;
    }
    vector[String(tokenId)] = weight;
  });
  return vector;
}
|
|
1832
|
+
/**
 * Converts tokenizer output into a flat array of finite numbers.
 *
 * Accepted shapes:
 * - an object with a `data` property that is an array or a typed-array view,
 * - a nested array, of which only the first row is used,
 * - a flat array.
 * Anything else yields an empty array. Entries that do not convert to a
 * finite number are dropped.
 *
 * @param {*} value - Raw `input_ids` value.
 * @returns {number[]}
 */
function normalizeTokenIds(value) {
  const toFiniteNumbers = (items) => Array.from(items, (item) => Number(item)).filter((item) => Number.isFinite(item));
  if (value && typeof value === "object" && "data" in value) {
    const { data } = value;
    if (Array.isArray(data) || ArrayBuffer.isView(data)) {
      return toFiniteNumbers(data);
    }
  }
  if (!Array.isArray(value) || value.length === 0) {
    return [];
  }
  const [firstEntry] = value;
  const row = Array.isArray(firstEntry) ? firstEntry : value;
  return toFiniteNumbers(row);
}
|
|
1853
|
+
/**
 * Creates a function that encodes query text into a sparse vector.
 *
 * When `sparseQueryEncoderFactory` is set, it is used instead. Otherwise
 * the tokenizer for `modelId` is loaded and each query is tokenized; the
 * token ids are then weighted with `queryTokenWeights`.
 *
 * @param {string} cacheDir - Model cache directory.
 * @param {string} modelId - Model whose tokenizer is loaded.
 * @param {object|number[]} queryTokenWeights - Weight lookup by token id.
 * @returns {Promise<(text: string) => Promise<Object<string, number>>>}
 */
async function createSparseQueryEncoder(cacheDir, modelId, queryTokenWeights) {
  if (sparseQueryEncoderFactory) {
    return sparseQueryEncoderFactory(cacheDir, modelId, queryTokenWeights);
  }
  const runtime = await getDenseTransformersRuntime(cacheDir);
  const { AutoTokenizer } = await import("@huggingface/transformers");
  runtime.env.cacheDir = cacheDir;
  const tokenizer = await AutoTokenizer.from_pretrained(modelId);
  const tokenizerOptions = () => ({
    truncation: true,
    return_attention_mask: false,
    return_token_type_ids: false
  });
  const encode = async (text) => {
    const features = await tokenizer([text], tokenizerOptions());
    const tokenIds = normalizeTokenIds(features.input_ids);
    return buildSparseQueryVector(tokenIds, queryTokenWeights);
  };
  return encode;
}
|
|
1870
|
+
/**
 * Encodes all chunks into sparse vectors via the Python helper.
 *
 * When `sparseDocumentBuilderFactory` is set, it is used instead. Otherwise
 * `uv` must be available; the helper is called with the
 * `encode_documents` action and its JSON output is merged back onto the
 * chunk records. Chunks missing from the helper output get an empty vector.
 *
 * @param {string} workspacePath - Workspace directory.
 * @param {object} config - Sparse config (`modelId`, `documentTopTokens`, `cacheDir`).
 * @param {object[]} chunks - Chunk records to encode.
 * @returns {Promise<{queryTokenWeights: *, vocabularySize: *, chunks: object[]}>}
 */
async function buildSparseDocuments(workspacePath, config, chunks) {
  if (sparseDocumentBuilderFactory) {
    return sparseDocumentBuilderFactory(workspacePath, config, chunks);
  }
  await ensureUvAvailable();
  const documents = chunks.map((chunk) => ({
    chunkId: chunk.id,
    text: createSparseChunkText(chunk)
  }));
  const rawOutput = await runSparsePython({
    workspacePath,
    config,
    importMetaUrl: import.meta.url,
    payload: {
      action: "encode_documents",
      model_id: config.modelId,
      top_tokens: config.documentTopTokens,
      documents
    }
  });
  const output = JSON.parse(rawOutput);
  const vectorByChunkId = new Map();
  for (const encoded of output.documents) {
    vectorByChunkId.set(encoded.chunkId, encoded.vector);
  }
  const encodedChunks = chunks.map((chunk) => {
    const { id: chunkId, documentId, sourceId, title, uri, headingPath, text } = chunk;
    const vector = vectorByChunkId.get(chunkId) ?? {};
    return { chunkId, documentId, sourceId, title, uri, headingPath, text, vector };
  });
  // NOTE(review): the output keys mix snake_case (`query_token_weights`) and
  // camelCase (`vocabularySize`) — confirm against the helper script's output.
  return {
    queryTokenWeights: output.query_token_weights,
    vocabularySize: output.vocabularySize,
    chunks: encodedChunks
  };
}
|
|
1905
|
+
/**
 * Encodes every chunk of the workspace as a sparse vector, builds a
 * `SparseVectorFieldIndex` and writes the resulting sparse payload to disk.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.config - Sparse config (`modelId`, `documentTopTokens`, `queryEncoding`, `documentEncoding`).
 * @returns {Promise<object>} The payload that was written (`metadata`, `indexState`, `chunks`, `queryTokenWeights`).
 */
async function buildSparseVectors({
  workspacePath,
  config
}) {
  const chunksFile = path15.join(workspacePath, "chunks", "chunks.jsonl");
  const chunks = await readJsonl(chunksFile);
  const built = await buildSparseDocuments(workspacePath, config, chunks);
  const index = new SparseVectorFieldIndex();
  built.chunks.forEach((record) => {
    index.insert(record.chunkId, [record.vector]);
  });
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
  const { modelId, documentTopTokens, queryEncoding, documentEncoding } = config;
  const metadata = {
    createdAt,
    modelId,
    vocabularySize: built.vocabularySize,
    documentTopTokens,
    queryEncoding,
    documentEncoding,
    chunkCount: built.chunks.length,
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const payload = {
    metadata,
    indexState: index.indexState,
    chunks: built.chunks,
    queryTokenWeights: built.queryTokenWeights
  };
  await writeSparsePayload(workspacePath, payload);
  return payload;
}
|
|
1934
|
+
/**
 * Runs a sparse query against the stored sparse index.
 *
 * The query is encoded with the token weights saved in the payload and the
 * index is restored from the payload's `indexState`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.config - Sparse config (`cacheDir`, `modelId`).
 * @param {string} options.query - Query text.
 * @param {number} options.topK - Passed as the second argument to `index.query`.
 * @returns {Promise<*>} Whatever `SparseVectorFieldIndex#query` returns.
 */
async function sparseQuery({
  workspacePath,
  config,
  query,
  topK
}) {
  const payload = await readSparsePayload(workspacePath);
  const { indexState, queryTokenWeights } = payload;
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
  const encode = await createSparseQueryEncoder(cacheDir, config.modelId, queryTokenWeights);
  const vector = await encode(query);
  const emptyIndex = new SparseVectorFieldIndex();
  const index = emptyIndex.loadState(indexState);
  return index.query(vector, topK);
}
|
|
1947
|
+
|
|
1948
|
+
// src/vector/service.ts
|
|
1949
|
+
/**
 * Builds the dense and/or sparse vector artifacts for a workspace.
 *
 * Which artifacts are built is decided as follows:
 * - an explicit `denseOverride` / `sparseOverride` (non-nullish) wins;
 * - otherwise the config's `enabled` flag is used;
 * - with `buildAvailableModels`, a model whose pull marker exists also
 *   counts as enabled, and sparse additionally requires `uv` to be runnable.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.config - Workspace config; `config.retrieval.dense` / `.sparse` are read.
 * @param {boolean} [options.denseOverride] - Force dense building on or off.
 * @param {boolean} [options.sparseOverride] - Force sparse building on or off.
 * @param {boolean} [options.buildAvailableModels=false] - Also build for models that were pulled but are not enabled.
 * @returns {Promise<{dense?: object, sparse?: object}>} Payloads of the artifacts that were built.
 */
async function buildVectorArtifacts({
  workspacePath,
  config,
  denseOverride,
  sparseOverride,
  buildAvailableModels = false
}) {
  const detectUv = async () => {
    try {
      await ensureUvAvailable();
      return true;
    } catch {
      return false;
    }
  };
  let modelStatus = null;
  if (buildAvailableModels) {
    const denseConfig = config.retrieval.dense;
    const sparseConfig = config.retrieval.sparse;
    const uvAvailable = await detectUv();
    modelStatus = await buildModelStatus(workspacePath, denseConfig, sparseConfig, uvAvailable);
  }
  const denseFromConfig = () => {
    if (!buildAvailableModels) {
      return config.retrieval.dense.enabled;
    }
    return config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available);
  };
  const sparseFromConfig = () => {
    if (!buildAvailableModels) {
      return config.retrieval.sparse.enabled;
    }
    const wanted = config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available);
    return wanted && Boolean(modelStatus?.sparse.uvAvailable);
  };
  const denseEnabled = denseOverride ?? denseFromConfig();
  const sparseEnabled = sparseOverride ?? sparseFromConfig();
  const result = {};
  if (denseEnabled) {
    result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense });
  }
  if (sparseEnabled) {
    result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse });
  }
  return result;
}
|
|
1975
|
+
|
|
1976
|
+
// src/index/index-store.ts
|
|
1977
|
+
import { readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
|
|
1978
|
+
import path16 from "path";
|
|
1979
|
+
/**
 * Writes an index state and its metadata into the workspace's `indexes`
 * directory.
 *
 * Four files are written in order: a timestamped index file, a timestamped
 * metadata file, `latest.json` and `latest.meta.json`. The timestamp is
 * `metadata.createdAt` with `:` and `.` replaced by `-`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {object} options.indexState - Serializable index state.
 * @param {object} options.metadata - Index metadata; `createdAt` must be a string.
 * @returns {Promise<{indexPath: string, metadataPath: string}>} Paths of the two `latest` files.
 */
async function writeIndexArtifacts({
  workspacePath,
  indexState,
  metadata
}) {
  const stamp = metadata.createdAt.replace(/[:.]/g, "-");
  const inIndexesDir = (fileName) => path16.join(workspacePath, "indexes", fileName);
  const indexPath = inIndexesDir(`${stamp}.json`);
  const metaPath = inIndexesDir(`${stamp}.meta.json`);
  const latestIndexPath = inIndexesDir("latest.json");
  const latestMetaPath = inIndexesDir("latest.meta.json");
  const indexPayload = JSON.stringify(indexState, null, 2);
  const metaPayload = JSON.stringify(metadata, null, 2);
  const writes = [
    [indexPath, indexPayload],
    [metaPath, metaPayload],
    [latestIndexPath, indexPayload],
    [latestMetaPath, metaPayload]
  ];
  for (const [targetPath, contents] of writes) {
    await writeFile7(targetPath, contents, "utf8");
  }
  return { indexPath: latestIndexPath, metadataPath: latestMetaPath };
}
|
|
1997
|
+
/**
 * Loads and parses `indexes/latest.json` from the workspace.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {Promise<object>} Parsed index state.
 */
async function readLatestIndexState(workspacePath) {
  const latestPath = path16.join(workspacePath, "indexes", "latest.json");
  const serialized = await readFile10(latestPath, "utf8");
  return JSON.parse(serialized);
}
|
|
2000
|
+
|
|
2001
|
+
// src/index/querylight-indexer.ts
|
|
2002
|
+
/**
 * Creates a BM25 text field index whose analyzer lower-cases the input and
 * tokenizes it with `KeywordTokenizer`. The same analyzer instance is
 * passed for both analyzer arguments.
 *
 * @returns {TextFieldIndex}
 */
function keywordFieldIndex() {
  const filters = [new LowerCaseTextFilter()];
  const tokenizer = new KeywordTokenizer();
  const keywordAnalyzer = new Analyzer(filters, tokenizer);
  return new TextFieldIndex(keywordAnalyzer, keywordAnalyzer, RankingAlgorithm.BM25);
}
|
|
2006
|
+
/**
 * Creates the field mapping for the document index.
 *
 * `text` and `title` use a BM25 text index with default analyzers; `uri`,
 * `sourceId`, `tags`, `sourceType` and every extra field use a keyword
 * index.
 *
 * @param {string[]} [extraFields=[]] - Additional field names to map as keyword fields.
 * @returns {Object<string, TextFieldIndex>} Mapping from field name to index.
 */
function createIndexMapping(extraFields = []) {
  const defaultTextIndex = () => new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
  const mapping = {
    text: defaultTextIndex(),
    title: defaultTextIndex()
  };
  const keywordFields = ["uri", "sourceId", "tags", "sourceType", ...extraFields];
  keywordFields.forEach((field) => {
    mapping[field] = keywordFieldIndex();
  });
  return mapping;
}
|
|
2021
|
+
/**
 * Converts a metadata object into index fields.
 *
 * Each key becomes `metadata.<key>`; its value becomes an array of
 * lower-cased strings (array values are mapped element-wise, scalars are
 * wrapped). Keys whose value is `null` or `undefined` are omitted.
 *
 * @param {object} metadata - Chunk metadata.
 * @returns {Object<string, string[]>}
 */
function flattenMetadata(metadata) {
  const lowerCased = (item) => String(item).toLowerCase();
  const flattened = {};
  Object.entries(metadata).forEach(([key, value]) => {
    if (value == null) {
      return;
    }
    const values = Array.isArray(value) ? value.map((item) => lowerCased(item)) : [lowerCased(value)];
    flattened[`metadata.${key}`] = values;
  });
  return flattened;
}
|
|
2036
|
+
/**
 * Builds the lexical index for a workspace, writes it to disk and then
 * builds the vector artifacts.
 *
 * Chunks, documents and sources are read from their `.jsonl` files. Every
 * chunk is indexed with its text, title, lower-cased uri/sourceId, tags,
 * source type and flattened metadata fields.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace directory.
 * @param {boolean} [options.denseOverride] - Forwarded to `buildVectorArtifacts`.
 * @param {boolean} [options.sparseOverride] - Forwarded to `buildVectorArtifacts`.
 * @param {boolean} [options.buildAvailableModels=false] - Forwarded to `buildVectorArtifacts`.
 * @returns {Promise<{metadata: object, indexPath: string, denseBuilt: boolean, sparseBuilt: boolean}>}
 */
async function buildIndex({
  workspacePath,
  denseOverride,
  sparseOverride,
  buildAvailableModels = false
}) {
  const config = await loadConfig(workspacePath);
  const readCollection = (directory, fileName) => readJsonl(path17.join(workspacePath, directory, fileName));
  const chunks = await readCollection("chunks", "chunks.jsonl");
  const documents = await readCollection("documents", "documents.jsonl");
  const sources = await readCollection("sources", "sources.jsonl");
  // Collect every metadata key used by any chunk, in first-seen order.
  const metadataFieldSet = new Set();
  for (const chunk of chunks) {
    for (const key of Object.keys(chunk.metadata)) {
      metadataFieldSet.add(`metadata.${key}`);
    }
  }
  const index = new DocumentIndex(createIndexMapping([...metadataFieldSet]));
  const lowerCased = (value) => String(value).toLowerCase();
  for (const chunk of chunks) {
    const { metadata: chunkMetadata } = chunk;
    const tags = Array.isArray(chunkMetadata.tags) ? chunkMetadata.tags.map((tag) => lowerCased(tag)) : [];
    index.index({
      id: chunk.id,
      fields: {
        text: [chunk.text],
        title: [chunk.title],
        uri: [chunk.uri.toLowerCase()],
        sourceId: [chunk.sourceId.toLowerCase()],
        tags,
        sourceType: [lowerCased(chunkMetadata.sourceType ?? "")],
        ...flattenMetadata(chunkMetadata)
      }
    });
  }
  const createdAt = (/* @__PURE__ */ new Date()).toISOString();
  const stamp = createdAt.replace(/[:.]/g, "-");
  const metadata = {
    id: `index_${stamp}`,
    createdAt,
    querylightVersion: "0.10.0",
    kbVersion: "0.1.0",
    documentCount: documents.length,
    chunkCount: chunks.length,
    sourceCount: sources.length,
    fields: Object.keys(index.mapping),
    indexHash: sha256(JSON.stringify(index.indexState))
  };
  const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
  const vectors = await buildVectorArtifacts({
    workspacePath,
    config,
    denseOverride,
    sparseOverride,
    buildAvailableModels
  });
  const { indexPath } = artifacts;
  return {
    metadata,
    indexPath,
    denseBuilt: Boolean(vectors.dense),
    sparseBuilt: Boolean(vectors.sparse)
  };
}
|
|
2089
|
+
|
|
2090
|
+
// src/query/search-service.ts
|
|
2091
|
+
import { readFile as readFile11 } from "fs/promises";
|
|
2092
|
+
import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
|
|
2093
|
+
import path18 from "path";
|
|
2094
|
+
/**
 * Loads the latest persisted index state into a `DocumentIndex`.
 *
 * The mapping is rebuilt with one keyword field per `metadata.*` entry
 * found in the state's `fieldState`.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {Promise<*>} The result of `DocumentIndex#loadState`.
 */
async function loadHydratedIndex(workspacePath) {
  const state = await readLatestIndexState(workspacePath);
  const storedFields = Object.keys(state.fieldState ?? {});
  const metadataFields = storedFields.filter((field) => field.startsWith("metadata."));
  const mapping = createIndexMapping(metadataFields);
  const querylight = await import("@tryformation/querylight-ts");
  const emptyIndex = new querylight.DocumentIndex(mapping);
  return emptyIndex.loadState(state);
}
|
|
2099
|
+
/**
 * Lower-cases filter values and drops empty strings.
 *
 * @param {string[]|null|undefined} values - Raw filter values.
 * @returns {string[]} Normalized values; empty when `values` is nullish.
 */
function normalizeFilterValues(values) {
  return (values ?? []).reduce((normalized, value) => {
    const lowered = value.toLowerCase();
    if (lowered) {
      normalized.push(lowered);
    }
    return normalized;
  }, []);
}
|
|
2102
|
+
/**
 * Tests whether a value is one of the allowed candidates.
 *
 * The value is lower-cased before comparison; candidates are compared as
 * given. An empty candidate list matches everything.
 *
 * @param {string} value - Value to test.
 * @param {string[]} candidates - Allowed values.
 * @returns {boolean}
 */
function matchesAny(value, candidates) {
  if (candidates.length === 0) {
    return true;
  }
  const normalized = value.toLowerCase();
  return candidates.includes(normalized);
}
|
|
2105
|
+
/**
 * Tests whether a value starts with any of the given prefixes.
 *
 * The value is lower-cased before comparison; prefixes are compared as
 * given. An empty prefix list matches everything.
 *
 * @param {string} value - Value to test.
 * @param {string[]} prefixes - Accepted prefixes.
 * @returns {boolean}
 */
function matchesPrefix(value, prefixes) {
  const unrestricted = prefixes.length === 0;
  if (unrestricted) {
    return true;
  }
  const normalized = value.toLowerCase();
  const firstMatch = prefixes.findIndex((prefix) => normalized.startsWith(prefix));
  return firstMatch !== -1;
}
|
|
2112
|
+
/**
 * Builds the lexical `BoolQuery` for a search.
 *
 * Scoring clauses: title match (AND, boost 6), text match (AND, boost 4)
 * and text match (OR, boost 2). Filter clauses are added for source id,
 * source type and tag only when exactly one value is given for that filter;
 * every metadata filter becomes a term query on `metadata.<key>`.
 *
 * @param {string} query - Query text.
 * @param {object} filters - Search filters (`sourceId(s)`, `sourceType(s)`, `tag(s)`, `metadata`).
 * @returns {BoolQuery}
 */
function buildSearchQuery(query, filters) {
  const collect = (single, many) => normalizeFilterValues([single, ...many ?? []].filter((value) => Boolean(value)));
  const sourceIds = collect(filters.sourceId, filters.sourceIds);
  const sourceTypes = collect(filters.sourceType, filters.sourceTypes);
  const tags = collect(filters.tag, filters.tags);
  const should = [
    new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
    new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
    new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
  ];
  // A term filter is only emitted when the filter has exactly one value.
  const singleValueTerm = (field, values) => values.length === 1 ? [new TermQuery({ field, text: values[0] })] : [];
  const filter = [
    ...singleValueTerm("sourceId", sourceIds),
    ...singleValueTerm("sourceType", sourceTypes),
    ...singleValueTerm("tags", tags),
    ...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
  ];
  return new BoolQuery({ should, filter });
}
|
|
2130
|
+
/**
 * Tests whether a value is a string that `Date` can parse.
 *
 * @param {*} value - Candidate date value.
 * @returns {boolean} `true` for strings that parse to a valid date.
 */
function isValidDate(value) {
  if (typeof value !== "string") {
    return false;
  }
  const time = new Date(value).getTime();
  return !Number.isNaN(time);
}
|
|
2133
|
+
/**
 * Reads a date field from a document.
 *
 * @param {object} document - Document record.
 * @param {string} field - Name of the date field.
 * @returns {string|null} The field value when it is a valid date string, otherwise `null`.
 */
function documentDateValue(document, field) {
  const value = document[field];
  if (typeof value !== "string") {
    return null;
  }
  return isValidDate(value) ? value : null;
}
|
|
2137
|
+
/**
 * Tests whether a document satisfies every date range.
 *
 * For each range the named date field must hold a valid date that is not
 * before `from` and not after `to` (both bounds inclusive and optional).
 * A document without a valid date in the field fails that range.
 *
 * @param {object} document - Document record.
 * @param {{field: string, from?: string, to?: string}[]} dateRanges - Ranges to check.
 * @returns {boolean}
 */
function matchesDateRanges(document, dateRanges) {
  for (const { field, from, to } of dateRanges) {
    const value = documentDateValue(document, field);
    if (!value) {
      return false;
    }
    const timestamp = new Date(value).getTime();
    const lowerBound = from ? new Date(from).getTime() : null;
    const upperBound = to ? new Date(to).getTime() : null;
    const notBeforeStart = lowerBound == null || timestamp >= lowerBound;
    const notAfterEnd = upperBound == null || timestamp <= upperBound;
    if (!notBeforeStart || !notAfterEnd) {
      return false;
    }
  }
  return true;
}
|
|
2149
|
+
/**
 * Determines the source type of a chunk.
 *
 * Preference order: `document.sourceType`, `source.type`, a string
 * `chunk.metadata.sourceType`, and finally the literal `"text"`.
 *
 * @param {object} chunk - Chunk record; `metadata.sourceType` is read.
 * @param {object} [document] - Document the chunk belongs to, if known.
 * @param {object} [source] - Source the chunk belongs to, if known.
 * @returns {*} The first non-nullish candidate.
 */
function fallbackSourceType(chunk, document, source) {
  const rawMetadataType = chunk.metadata.sourceType;
  const metadataSourceType = typeof rawMetadataType === "string" ? rawMetadataType : void 0;
  const candidates = [document?.sourceType, source?.type, metadataSourceType];
  for (const candidate of candidates) {
    if (candidate != null) {
      return candidate;
    }
  }
  return "text";
}
|
|
2153
|
+
/**
 * Decides whether a chunk passes the search filters.
 *
 * Checks, in order: source id, source type, source name, URI prefix,
 * publication-date presence, tags (any requested tag matches), metadata
 * key/value pairs (all must match) and date ranges. Singular and plural
 * filter variants (`tag` / `tags`, ...) are merged.
 *
 * @param {object} chunk - Chunk record (`sourceId`, `uri`, `metadata` are read).
 * @param {object} [document] - Document the chunk belongs to, if known.
 * @param {object} [source] - Source the chunk belongs to, if known.
 * @param {object} filters - Search filters; every property is optional.
 * @param {{field: string, from?: string, to?: string}[]} [filters.dateRanges=[]]
 *   Date ranges the document must satisfy. Omitting it means "no date
 *   restriction", like the other filters (previously this threw a TypeError).
 * @returns {boolean} `true` when the chunk passes every filter.
 */
function filterChunk(chunk, document, source, {
  sourceId,
  sourceIds,
  sourceName,
  sourceNames,
  sourceType,
  sourceTypes,
  uriPrefix,
  uriPrefixes,
  hasPublicationDate,
  tag,
  tags,
  metadata,
  dateRanges = []
}) {
  const normalizedSourceIds = normalizeFilterValues([sourceId, ...sourceIds ?? []].filter((value) => Boolean(value)));
  const normalizedSourceNames = normalizeFilterValues([sourceName, ...sourceNames ?? []].filter((value) => Boolean(value)));
  const normalizedSourceTypes = normalizeFilterValues([sourceType, ...sourceTypes ?? []].filter((value) => Boolean(value)));
  const normalizedUriPrefixes = normalizeFilterValues([uriPrefix, ...uriPrefixes ?? []].filter((value) => Boolean(value)));
  const normalizedTags = normalizeFilterValues([tag, ...tags ?? []].filter((value) => Boolean(value)));
  if (!matchesAny(chunk.sourceId, normalizedSourceIds)) {
    return false;
  }
  if (!matchesAny(fallbackSourceType(chunk, document, source), normalizedSourceTypes)) {
    return false;
  }
  if (normalizedSourceNames.length > 0 && !matchesAny(source?.name ?? "", normalizedSourceNames)) {
    return false;
  }
  if (!matchesPrefix(document?.uri ?? chunk.uri, normalizedUriPrefixes)) {
    return false;
  }
  if (hasPublicationDate && (!document || !documentDateValue(document, "publicationDate"))) {
    return false;
  }
  if (normalizedTags.length > 0) {
    const tags2 = Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map(String).map((value) => value.toLowerCase()) : [];
    if (!normalizedTags.some((tag2) => tags2.includes(tag2))) {
      return false;
    }
  }
  if (metadata?.length) {
    const metadataMatches = metadata.every(({ key, value }) => {
      const candidate = chunk.metadata[key];
      return Array.isArray(candidate) ? candidate.map(String).map((item) => item.toLowerCase()).includes(value.toLowerCase()) : String(candidate ?? "").toLowerCase() === value.toLowerCase();
    });
    if (!metadataMatches) {
      return false;
    }
  }
  // Without a document there are no dates to compare, so any requested
  // date range excludes the chunk.
  if (!document) {
    return dateRanges.length === 0;
  }
  return matchesDateRanges(document, dateRanges);
}
|
|
2208
|
+
/**
 * Comparator that orders date strings from newest to oldest.
 *
 * Falsy values are treated as negative infinity, so they sort after any
 * real date.
 *
 * @param {string|null|undefined} left - First date.
 * @param {string|null|undefined} right - Second date.
 * @returns {number} Positive when `right` is newer than `left`.
 */
function sortDateDescending(left, right) {
  const toTime = (value) => value ? new Date(value).getTime() : Number.NEGATIVE_INFINITY;
  const leftTime = toTime(left);
  const rightTime = toTime(right);
  return rightTime - leftTime;
}
|
|
2213
|
+
/**
 * Picks the date used to sort a document by recency.
 *
 * The first available value among `publicationDate`, `lastChangedAt`,
 * `lastSeenAt`, `firstSeenAt` and `crawledAt` is returned.
 *
 * @param {object} document - Document record.
 * @returns {string|null} The chosen date string, or the result for `crawledAt` when none is set.
 */
function latestSortDate(document) {
  const fieldsByPriority = ["publicationDate", "lastChangedAt", "lastSeenAt", "firstSeenAt", "crawledAt"];
  let value;
  for (const field of fieldsByPriority) {
    value = documentDateValue(document, field);
    if (value != null) {
      break;
    }
  }
  return value;
}
|
|
2216
|
+
/**
 * Picks one chunk to represent a group of chunks.
 *
 * Chunks are ranked by heading-path depth (shallowest first), then by
 * `uri`, then by `id`; the input array is not modified.
 *
 * @param {object[]} chunks - Candidate chunks.
 * @returns {object|undefined} The top-ranked chunk, or `undefined` for an empty list.
 */
function representativeChunk(chunks) {
  const compareChunks = (left, right) => {
    const depthDifference = left.headingPath.length - right.headingPath.length;
    if (depthDifference !== 0) {
      return depthDifference;
    }
    return left.uri !== right.uri ? left.uri.localeCompare(right.uri) : left.id.localeCompare(right.id);
  };
  const ranked = [...chunks];
  ranked.sort(compareChunks);
  return ranked[0] ?? chunks[0] ?? void 0;
}
|
|
2227
|
+
/**
 * Removes common markdown syntax so text can be shown as a plain snippet.
 *
 * Steps, in order:
 * 1. fenced code blocks are replaced by a single space;
 * 2. images `![alt](url)` are reduced to their alt text;
 * 3. links `[label](url)` are reduced to their label;
 * 4. inline code loses its backticks;
 * 5. heading markers (`#` to `######`) at line start are removed;
 * 6. list bullets (`-`, `*`, `+`) at line start are removed.
 *
 * Images must be handled before links: the link pattern also matches the
 * `[alt](url)` part of an image and would otherwise leave a stray `!`
 * in front of the alt text.
 *
 * @param {string} text - Markdown text.
 * @returns {string} Text with the markdown syntax above stripped.
 */
function stripSnippetMarkdown(text) {
  return text
    .replace(/```[\s\S]*?```/g, " ")
    .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1")
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .replace(/`([^`]+)`/g, "$1")
    .replace(/^#{1,6}\s+/gm, "")
    .replace(/^\s*[-*+]\s+/gm, "");
}
|
|
2230
|
+
/**
 * Splits Markdown text into plain-text paragraphs for snippet building.
 *
 * The text is first passed through `stripSnippetMarkdown`, then split on
 * blank lines; whitespace inside each paragraph is collapsed to single
 * spaces and empty paragraphs are discarded.
 *
 * @param {string} text - Markdown source.
 * @returns {string[]} Non-empty, single-line paragraphs in document order.
 */
function extractSnippetParagraphs(text) {
  const paragraphs = [];
  for (const block of stripSnippetMarkdown(text).split(/\n\s*\n+/)) {
    const flattened = block.replace(/\s+/g, " ").trim();
    if (flattened) {
      paragraphs.push(flattened);
    }
  }
  return paragraphs;
}
|
|
2233
|
+
/**
 * Builds a snippet from consecutive paragraphs around the first query match.
 *
 * The window starts at the first paragraph containing any whitespace-separated
 * query term (case-insensitive), or at the first paragraph when nothing
 * matches. It then grows one neighbouring paragraph at a time, taking the
 * longer neighbour and preferring the following one on ties, until the
 * accumulated length reaches `targetLength` or no neighbours remain.
 *
 * @param {string[]} paragraphs - Plain-text paragraphs in document order.
 * @param {string} query - Search query.
 * @param {number} [targetLength=900] - Length at which the window stops growing.
 * @returns {string} Selected paragraphs joined by blank lines, or "" for no paragraphs.
 */
function buildParagraphSnippet(paragraphs, query, targetLength = 900) {
  if (paragraphs.length === 0) {
    return "";
  }
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
  const firstHit = paragraphs.findIndex((paragraph) => {
    const haystack = paragraph.toLowerCase();
    return terms.some((term) => haystack.includes(term));
  });
  let first = Math.max(firstHit, 0);
  let afterLast = first + 1;
  let size = paragraphs[first]?.length ?? 0;
  while (size < targetLength) {
    const canGrowUp = first > 0;
    const canGrowDown = afterLast < paragraphs.length;
    if (!canGrowUp && !canGrowDown) {
      break;
    }
    // -1 marks "no neighbour on that side" so it always loses the comparison.
    const above = canGrowUp ? paragraphs[first - 1]?.length ?? 0 : -1;
    const below = canGrowDown ? paragraphs[afterLast]?.length ?? 0 : -1;
    if (canGrowDown && below >= above) {
      size += below + 2; // +2 accounts for the "\n\n" separator
      afterLast += 1;
    } else if (canGrowUp) {
      size += above + 2;
      first -= 1;
    } else {
      break;
    }
  }
  return paragraphs.slice(first, afterLast).join("\n\n").trim();
}
|
|
2262
|
+
/**
 * Builds a query-focused snippet from a single piece of Markdown text.
 *
 * @param {string} text - Markdown source of one chunk.
 * @param {string} query - Search query used to locate the anchor paragraph.
 * @returns {string} Snippet produced by `buildParagraphSnippet` with its default target length.
 */
function buildSnippet(text, query) {
  const paragraphs = extractSnippetParagraphs(text);
  return buildParagraphSnippet(paragraphs, query);
}
|
|
2265
|
+
/**
 * Flattens a document's chunks into one ordered paragraph list.
 *
 * @param {Array<{text: string}>} chunks - Chunks in document order.
 * @returns {Array<{chunkIndex: number, text: string}>} One entry per paragraph,
 *   tagged with the position of the chunk it came from.
 */
function buildDocumentParagraphs(chunks) {
  return chunks.flatMap((candidate, chunkIndex) => {
    const tagged = [];
    for (const text of extractSnippetParagraphs(candidate.text)) {
      tagged.push({ chunkIndex, text });
    }
    return tagged;
  });
}
|
|
2270
|
+
/**
 * Builds a snippet anchored in one chunk that may extend into neighbouring chunks.
 *
 * The anchor is the first paragraph of chunk `chunkIndex` containing any
 * query term (case-insensitive); if none matches, the chunk's first
 * paragraph; if the chunk has no paragraphs, paragraph 0. The window then
 * grows across the whole paragraph list, taking the longer neighbour and
 * preferring the following one on ties, until `targetLength` is reached or
 * the list is exhausted.
 *
 * @param {Array<{chunkIndex: number, text: string}>} paragraphs - Document paragraphs in order.
 * @param {number} chunkIndex - Position of the chunk the snippet is for.
 * @param {string} query - Search query.
 * @param {number} [targetLength=1200] - Length at which the window stops growing.
 * @returns {string} Selected paragraph texts joined by blank lines, or "" for no paragraphs.
 */
function buildExpandedParagraphSnippet(paragraphs, chunkIndex, query, targetLength = 1200) {
  if (paragraphs.length === 0) {
    return "";
  }
  const terms = query.toLowerCase().split(/\s+/).filter(Boolean);
  const ownIndexes = [];
  paragraphs.forEach((paragraph, position) => {
    if (paragraph.chunkIndex === chunkIndex) {
      ownIndexes.push(position);
    }
  });
  const matchingIndex = ownIndexes.find((position) => {
    const haystack = paragraphs[position]?.text.toLowerCase() ?? "";
    return terms.some((term) => haystack.includes(term));
  });
  const anchor = matchingIndex ?? ownIndexes[0] ?? 0;
  let first = anchor;
  let afterLast = anchor + 1;
  let size = paragraphs[anchor]?.text.length ?? 0;
  while (size < targetLength) {
    const canGrowUp = first > 0;
    const canGrowDown = afterLast < paragraphs.length;
    if (!canGrowUp && !canGrowDown) {
      break;
    }
    // -1 marks "no neighbour on that side" so it always loses the comparison.
    const above = canGrowUp ? paragraphs[first - 1]?.text.length ?? 0 : -1;
    const below = canGrowDown ? paragraphs[afterLast]?.text.length ?? 0 : -1;
    if (canGrowDown && below >= above) {
      size += below + 2; // +2 accounts for the "\n\n" separator
      afterLast += 1;
    } else if (canGrowUp) {
      size += above + 2;
      first -= 1;
    } else {
      break;
    }
  }
  const selected = paragraphs.slice(first, afterLast);
  return selected.map((paragraph) => paragraph.text).join("\n\n").trim();
}
|
|
2300
|
+
/**
 * Builds a snippet for a chunk, extending into neighbouring chunks of the
 * same document when the document's normalized file is available.
 *
 * Falls back to a snippet built from the chunk's own text when there is no
 * document, the normalized file does not exist, or the chunk cannot be found
 * among the re-derived chunks. Re-derived chunk lists are memoised per
 * document id in `orderedChunkCache`.
 *
 * @param {{id: string, text: string}} chunk - Chunk the snippet is for.
 * @param {string} query - Search query.
 * @param {object} options
 * @param {object} [options.document] - Owning document record (`id`, `normalizedPath`).
 * @param {object} options.config - Configuration forwarded to `buildChunksForDocument`.
 * @param {Map<string, object[]>} options.orderedChunkCache - Cache of chunk lists keyed by document id.
 * @returns {Promise<string>} The snippet text.
 */
async function buildSnippetWithAdjacentChunks(chunk, query, {
  document,
  config,
  orderedChunkCache
}) {
  const chunkOnlySnippet = () => buildSnippet(chunk.text, query);
  if (!document) {
    return chunkOnlySnippet();
  }
  let siblings = orderedChunkCache.get(document.id);
  if (!siblings) {
    const available = await fileExists(document.normalizedPath);
    if (!available) {
      return chunkOnlySnippet();
    }
    const contents = await readFile11(document.normalizedPath, "utf8");
    siblings = buildChunksForDocument(document, contents, config);
    orderedChunkCache.set(document.id, siblings);
  }
  const position = siblings.findIndex((candidate) => candidate.id === chunk.id);
  if (position < 0) {
    return chunkOnlySnippet();
  }
  const documentParagraphs = buildDocumentParagraphs(siblings);
  if (documentParagraphs.length === 0) {
    // Uses the re-derived chunk's text here, not the text of the chunk passed in.
    return buildSnippet(siblings[position].text, query);
  }
  return buildExpandedParagraphSnippet(documentParagraphs, position, query);
}
|
|
2328
|
+
/**
 * Cleans a title for display.
 *
 * Strips a trailing "| Querylight TS Demo" suffix (case-insensitive),
 * collapses whitespace runs to single spaces and trims the ends.
 *
 * @param {string} title - Raw title or heading text.
 * @returns {string} The cleaned title.
 */
function normalizeDisplayTitle(title) {
  const withoutSiteSuffix = title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "");
  const singleSpaced = withoutSiteSuffix.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
2331
|
+
/**
 * Chooses the title shown for a search result.
 *
 * Prefers the deepest non-empty heading when it differs (case-insensitively)
 * from the document title; otherwise the document title; otherwise the
 * heading, or "Untitled" when nothing is available.
 *
 * @param {{title: string, headingPath: string[]}} chunk - Chunk to title.
 * @returns {string} The display title.
 */
function chooseResultTitle(chunk) {
  const documentTitle = normalizeDisplayTitle(chunk.title);
  const cleanedHeadings = [];
  for (const heading of chunk.headingPath) {
    const cleaned = normalizeDisplayTitle(heading);
    if (cleaned) {
      cleanedHeadings.push(cleaned);
    }
  }
  const leaf = cleanedHeadings.at(-1);
  const leafAddsInformation = Boolean(leaf) && leaf.toLowerCase() !== documentTitle.toLowerCase();
  if (leafAddsInformation) {
    return leaf;
  }
  return documentTitle || (leaf ?? "Untitled");
}
|
|
2343
|
+
/**
 * Reduces text to a canonical form for equality comparison.
 *
 * The value is Unicode-normalized (NFKC), lower-cased, and every run of
 * characters that is not a letter, combining mark or number is replaced by a
 * single space; leading and trailing spaces are removed. Letters and digits
 * from any script are preserved, so non-English titles do not collapse to
 * truncated or empty strings. Pure-ASCII input produces the same result as an
 * `[a-z0-9]` filter.
 *
 * @param {string} value - Text to normalize.
 * @returns {string} Lower-case words separated by single spaces.
 */
function normalizeComparisonText(value) {
  // Whitespace is not a letter/mark/number, so this single pass also
  // collapses whitespace runs.
  return value.normalize("NFKC").toLowerCase().replace(/[^\p{L}\p{M}\p{N}]+/gu, " ").trim();
}
|
|
2346
|
+
/**
 * Extracts a lower-cased path without trailing slashes from a URI.
 *
 * For values that `URL` can parse, the pathname is used and an empty result
 * becomes "/". For anything else the whole string is lower-cased and its
 * trailing slashes are removed.
 *
 * @param {string} uri - Absolute URL or arbitrary path-like string.
 * @returns {string} The normalized path.
 */
function normalizeUriPath(uri) {
  let pathname;
  try {
    pathname = new URL(uri).pathname;
  } catch {
    // Not an absolute URL: fall back to treating the raw string as the path.
    return uri.toLowerCase().replace(/\/+$/, "");
  }
  const withoutTrailingSlash = pathname.replace(/\/+$/, "");
  const nonEmpty = withoutTrailingSlash === "" ? "/" : withoutTrailingSlash;
  return nonEmpty.toLowerCase();
}
|
|
2355
|
+
/**
 * Measures how deep a URI's path is.
 *
 * @param {string} uri - URI to inspect.
 * @returns {number} Number of non-empty path segments; 0 for the root path.
 */
function uriSpecificity(uri) {
  const normalizedPath = normalizeUriPath(uri);
  return normalizedPath === "/" ? 0 : normalizedPath.split("/").filter(Boolean).length;
}
|
|
2362
|
+
/**
 * Tells whether `candidate` duplicates `existing` at a deeper URI.
 *
 * Two results count as duplicates when they share a source id, have the same
 * non-empty normalized title, have different normalized paths, and one path
 * is nested under the other. The function returns true only when the
 * candidate's URI has more path segments than the existing one.
 *
 * @param {{sourceId: string, title: string, uri: string}} candidate - Result being considered.
 * @param {{sourceId: string, title: string, uri: string}} existing - Result already kept.
 * @returns {boolean} True when `candidate` is the more specific duplicate.
 */
function isMoreSpecificDuplicate(candidate, existing) {
  if (candidate.sourceId !== existing.sourceId) {
    return false;
  }
  const titleOfCandidate = normalizeComparisonText(candidate.title);
  const titleOfExisting = normalizeComparisonText(existing.title);
  const sameTitle = Boolean(titleOfCandidate) && titleOfCandidate === titleOfExisting;
  if (!sameTitle) {
    return false;
  }
  const pathOfCandidate = normalizeUriPath(candidate.uri);
  const pathOfExisting = normalizeUriPath(existing.uri);
  if (pathOfCandidate === pathOfExisting) {
    return false;
  }
  // The root path is its own prefix; any other parent needs a trailing "/"
  // so that "/docs" does not count as a parent of "/docs-old".
  const isNestedUnder = (child, parent) => child.startsWith(parent === "/" ? "/" : `${parent}/`);
  const related = isNestedUnder(pathOfCandidate, pathOfExisting) || isNestedUnder(pathOfExisting, pathOfCandidate);
  if (!related) {
    return false;
  }
  return uriSpecificity(candidate.uri) > uriSpecificity(existing.uri);
}
|
|
2383
|
+
/**
 * Removes results that duplicate another result at a parent/child URI.
 *
 * Results are processed in order. When a result duplicates one already kept
 * (in either direction), only the more specific of the two stays, in the
 * position of the one that was kept first.
 *
 * @param {object[]} results - Ranked results.
 * @param {number} topK - Maximum number of results to return.
 * @returns {object[]} De-duplicated results, truncated to `topK`.
 */
function collapseAggregateDuplicates(results, topK) {
  const kept = [];
  for (const candidate of results) {
    const slot = kept.findIndex(
      (earlier) => isMoreSpecificDuplicate(candidate, earlier) || isMoreSpecificDuplicate(earlier, candidate)
    );
    if (slot < 0) {
      kept.push(candidate);
    } else if (isMoreSpecificDuplicate(candidate, kept[slot])) {
      kept[slot] = candidate;
    }
  }
  return kept.slice(0, topK);
}
|
|
2399
|
+
/**
 * Collapses chunk-level hits into one result per document and re-ranks them.
 *
 * Each document is represented by its best-scoring hit. Its score becomes
 * best + 0.35 * (sum of the other hits' scores) + 0.2 * (number of other
 * hits). The per-document results are sorted by that score, then passed
 * through `collapseAggregateDuplicates`.
 *
 * @param {Array<{documentId: string, score: number}>} results - Chunk-level hits.
 * @param {number} topK - Maximum number of results to return.
 * @returns {object[]} At most `topK` document-level results.
 */
function rerankResultsByDocument(results, topK) {
  const hitsByDocument = new Map();
  for (const hit of results) {
    const bucket = hitsByDocument.get(hit.documentId);
    if (bucket) {
      bucket.push(hit);
    } else {
      hitsByDocument.set(hit.documentId, [hit]);
    }
  }
  const perDocument = [];
  for (const hits of hitsByDocument.values()) {
    const [best, ...others] = [...hits].sort((a, b) => b.score - a.score);
    if (!best) {
      continue;
    }
    let tailScore = 0;
    for (const other of others) {
      tailScore += other.score;
    }
    const aggregateScore = best.score + tailScore * 0.35 + (hits.length - 1) * 0.2;
    perDocument.push({ ...best, score: aggregateScore });
  }
  perDocument.sort((a, b) => b.score - a.score);
  return collapseAggregateDuplicates(perDocument, topK);
}
|
|
2421
|
+
/**
 * Searches a workspace and returns ranked, document-level results.
 *
 * Chunks, documents and sources are loaded from the workspace's JSONL files
 * and every chunk is pre-filtered with `filterChunk` using the given filter
 * options.
 *
 * - Empty query: no retrieval is run. Documents with at least one matching
 *   chunk are listed newest first (see `latestSortDate`), one representative
 *   chunk each, with score 0 and `retrievalMode: "lexical"`.
 * - Non-empty query: hits come from the lexical, dense or sparse retriever
 *   according to the mode (`retrievalMode` or the configured default). Any
 *   other mode runs lexical retrieval plus whichever vector indexes exist on
 *   disk and fuses them with reciprocal rank fusion (lexical weighted 3,
 *   others 1). Hits outside the pre-filtered set are dropped and the rest are
 *   collapsed per document by `rerankResultsByDocument`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} options.query - Search query; surrounding whitespace is ignored.
 * @param {number} options.topK - Maximum number of results.
 * @param {Array} [options.dateRanges=[]] - Date range filters passed to `filterChunk`.
 * @param {string} [options.retrievalMode] - Overrides the configured default mode.
 * @param {boolean} [options.showChunks=false] - Include the chunk text in each result.
 * The remaining options (`sourceId(s)`, `sourceName(s)`, `sourceType(s)`,
 * `uriPrefix(es)`, `hasPublicationDate`, `tag(s)`, `metadata`) are forwarded
 * unchanged to `filterChunk`, and partly to `buildSearchQuery`.
 * @returns {Promise<{retrievalMode: string, results: object[]}>}
 * @throws {CliError} In dense or sparse mode when the matching vector index file is missing.
 */
async function searchIndex({
  workspacePath,
  query,
  topK,
  sourceId,
  sourceIds,
  sourceName,
  sourceNames,
  sourceType,
  sourceTypes,
  uriPrefix,
  uriPrefixes,
  hasPublicationDate,
  tag,
  tags,
  metadata,
  dateRanges = [],
  retrievalMode,
  showChunks = false
}) {
  const config = await loadConfig(workspacePath);
  const mode = retrievalMode ?? config.retrieval.defaultMode;
  const candidateLimit = Math.max(topK * 5, 50);
  // The three stores are independent files, so read them concurrently.
  const [chunkRecords, documentRecords, sourceRecords] = await Promise.all([
    readJsonl(path18.join(workspacePath, "chunks", "chunks.jsonl")),
    readJsonl(path18.join(workspacePath, "documents", "documents.jsonl")),
    readJsonl(path18.join(workspacePath, "sources", "sources.jsonl"))
  ]);
  const chunks = new Map(chunkRecords.map((chunk) => [chunk.id, chunk]));
  const documents = new Map(documentRecords.map((document) => [document.id, document]));
  const sources = new Map(sourceRecords.map((source) => [source.id, source]));
  const orderedChunkCache = new Map();
  const normalizedQuery = query.trim();
  const filterOptions = { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges };
  const filteredChunks = [...chunks.values()].filter(
    (chunk) => filterChunk(chunk, documents.get(chunk.documentId), sources.get(chunk.sourceId), filterOptions)
  );

  if (normalizedQuery.length === 0) {
    const chunksByDocument = new Map();
    for (const chunk of filteredChunks) {
      const existing = chunksByDocument.get(chunk.documentId);
      if (existing) {
        existing.push(chunk);
      } else {
        chunksByDocument.set(chunk.documentId, [chunk]);
      }
    }
    const newestFirst = [...chunksByDocument.entries()].sort(([leftDocumentId], [rightDocumentId]) => {
      const leftDocument = documents.get(leftDocumentId);
      const rightDocument = documents.get(rightDocumentId);
      return sortDateDescending(
        leftDocument ? latestSortDate(leftDocument) : null,
        rightDocument ? latestSortDate(rightDocument) : null
      );
    });
    const latestResults = await Promise.all(
      newestFirst.slice(0, topK).map(async ([documentId, documentChunks]) => {
        const document = documents.get(documentId);
        const chunk = representativeChunk(documentChunks);
        if (!chunk || !document) {
          return null;
        }
        return {
          chunkId: chunk.id,
          documentId: chunk.documentId,
          sourceId: chunk.sourceId,
          sourceType: document.sourceType,
          score: 0,
          title: chooseResultTitle(chunk),
          uri: chunk.uri,
          headingPath: chunk.headingPath,
          // With no query, the document title is used to anchor the snippet.
          snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
            document,
            config,
            orderedChunkCache
          }),
          text: showChunks ? chunk.text : void 0,
          publicationDate: document.publicationDate ?? null,
          firstSeenAt: document.firstSeenAt,
          lastSeenAt: document.lastSeenAt,
          lastChangedAt: document.lastChangedAt,
          metadata: chunk.metadata
        };
      })
    );
    return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
  }

  // Set membership keeps hit filtering linear instead of scanning the id
  // list once per hit.
  const allowedChunkIds = new Set(filteredChunks.map((chunk) => chunk.id));
  const keepAllowed = (ranked) => ranked.filter(([chunkId]) => allowedChunkIds.has(chunkId)).slice(0, candidateLimit);

  const lexicalHits = async () => {
    const index = await loadHydratedIndex(workspacePath);
    const all = await index.searchRequest({
      query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }),
      limit: candidateLimit
    });
    return keepAllowed(all);
  };
  const queryDense = async () => keepAllowed(
    await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit })
  );
  const querySparse = async () => keepAllowed(
    await sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit })
  );
  const denseHits = async () => {
    if (!await fileExists(denseVectorPath(workspacePath))) {
      throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
    }
    return queryDense();
  };
  const sparseHits = async () => {
    if (!await fileExists(sparseVectorPath(workspacePath))) {
      throw new CliError("sparse vector index is not built; run `qli models pull --sparse` and `qli rebuild`", "SPARSE_INDEX_MISSING", 7 /* QueryError */);
    }
    return querySparse();
  };

  let hits;
  if (mode === "lexical") {
    hits = await lexicalHits();
  } else if (mode === "dense") {
    hits = await denseHits();
  } else if (mode === "sparse") {
    hits = await sparseHits();
  } else {
    // Any other mode: lexical always runs, vector retrievers only when their
    // index file exists (a missing index is not an error here).
    const rankings = [await lexicalHits()];
    if (await fileExists(denseVectorPath(workspacePath))) {
      rankings.push(await queryDense());
    }
    if (await fileExists(sparseVectorPath(workspacePath))) {
      rankings.push(await querySparse());
    }
    hits = reciprocalRankFusion(rankings, {
      rankConstant: 20,
      weights: rankings.map((_, index) => index === 0 ? 3 : 1)
    }).slice(0, candidateLimit);
  }

  const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
    const chunk = chunks.get(chunkId);
    if (!chunk) {
      return null;
    }
    const document = documents.get(chunk.documentId);
    return {
      chunkId,
      documentId: chunk.documentId,
      sourceId: chunk.sourceId,
      sourceType: document?.sourceType ?? "text",
      score,
      title: chooseResultTitle(chunk),
      uri: chunk.uri,
      headingPath: chunk.headingPath,
      snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
        document,
        config,
        orderedChunkCache
      }),
      text: showChunks ? chunk.text : void 0,
      publicationDate: document?.publicationDate ?? null,
      firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
      lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
      lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
      metadata: chunk.metadata
    };
  }));
  const results = rawResults.filter((result) => result != null);
  return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
}
|
|
2564
|
+
|
|
2565
|
+
// src/query/related-service.ts
|
|
2566
|
+
import path19 from "path";
|
|
2567
|
+
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * Vectors of different lengths are compared as if the shorter one were
 * padded with zeros, so the result is symmetric in its arguments.
 *
 * @param {ArrayLike<number>} left - First vector.
 * @param {ArrayLike<number>} right - Second vector.
 * @returns {number} Similarity in [-1, 1], or 0 when either vector has zero magnitude.
 */
function cosineSimilarity(left, right) {
  // Iterate over the longer vector: stopping at left.length would leave the
  // extra components of a longer `right` out of its norm.
  const length = Math.max(left.length, right.length);
  let dot = 0;
  let leftNorm = 0;
  let rightNorm = 0;
  for (let index = 0; index < length; index += 1) {
    const leftValue = left[index] ?? 0;
    const rightValue = right[index] ?? 0;
    dot += leftValue * rightValue;
    leftNorm += leftValue * leftValue;
    rightNorm += rightValue * rightValue;
  }
  if (leftNorm === 0 || rightNorm === 0) {
    return 0;
  }
  return dot / (Math.sqrt(leftNorm) * Math.sqrt(rightNorm));
}
|
|
2583
|
+
/**
 * Scales a vector to unit length.
 *
 * @param {number[]} values - Vector components.
 * @returns {number[]} A new array with the components divided by the vector's
 *   Euclidean norm, or all zeros when the norm is 0.
 */
function normalizeVector(values) {
  let squaredSum = 0;
  values.forEach((component) => {
    squaredSum += component * component;
  });
  const magnitude = Math.sqrt(squaredSum);
  const scale = magnitude === 0 ? () => 0 : (component) => component / magnitude;
  return values.map(scale);
}
|
|
2590
|
+
/**
 * Averages a set of embeddings component-wise and normalizes the result.
 *
 * Components missing from a record's embedding count as 0.
 *
 * @param {Array<{embedding: number[]}>} records - Records carrying embeddings.
 * @param {number} dimensions - Number of components in the output vector.
 * @returns {number[]} Unit-length mean vector (all zeros when the mean is the zero vector).
 */
function averageEmbeddings(records, dimensions) {
  const sums = new Array(dimensions).fill(0);
  records.forEach(({ embedding }) => {
    for (let component = 0; component < dimensions; component += 1) {
      sums[component] += embedding[component] ?? 0;
    }
  });
  // Divide by at least 1 so an empty record list yields zeros rather than NaN.
  const divisor = Math.max(records.length, 1);
  const mean = sums.map((sum) => sum / divisor);
  return normalizeVector(mean);
}
|
|
2599
|
+
/**
 * Resolves a user-supplied selector to exactly one document.
 *
 * The selector is trimmed and compared case-insensitively against each
 * document's `id`, `uri` and, when present, `canonicalUri`.
 *
 * @param {Array<{id: string, uri: string, canonicalUri?: string}>} documents - Known documents.
 * @param {string} selector - Document id or URI.
 * @returns {object} The single matching document.
 * @throws {CliError} DOCUMENT_NOT_FOUND when nothing matches,
 *   DOCUMENT_SELECTOR_AMBIGUOUS when more than one document matches.
 */
function resolveDocumentSelector(documents, selector) {
  const wanted = selector.trim().toLowerCase();
  const candidates = [];
  for (const entry of documents) {
    if (entry.id.toLowerCase() === wanted) {
      candidates.push(entry);
    } else if (entry.uri.toLowerCase() === wanted) {
      candidates.push(entry);
    } else if (entry.canonicalUri?.toLowerCase() === wanted) {
      candidates.push(entry);
    }
  }
  switch (candidates.length) {
    case 0:
      throw new CliError(`document not found: ${selector}`, "DOCUMENT_NOT_FOUND", 2 /* InvalidArguments */);
    case 1:
      return candidates[0];
    default:
      throw new CliError(`document selector is ambiguous: ${selector}`, "DOCUMENT_SELECTOR_AMBIGUOUS", 2 /* InvalidArguments */);
  }
}
|
|
2612
|
+
/**
 * Builds one averaged embedding per document from chunk-level embeddings.
 *
 * Documents without any dense chunk are left out of the result.
 *
 * @param {Array<{id: string}>} documents - Documents to build vectors for.
 * @param {Array<{documentId: string, embedding: number[]}>} denseChunks - Chunk embeddings.
 * @param {number} dimensions - Embedding size passed to `averageEmbeddings`.
 * @returns {Map<string, {document: object, embedding: number[]}>} Vectors keyed by document id.
 */
function buildDocumentVectors(documents, denseChunks, dimensions) {
  const chunksByDocument = new Map();
  for (const denseChunk of denseChunks) {
    const bucket = chunksByDocument.get(denseChunk.documentId);
    if (bucket) {
      bucket.push(denseChunk);
    } else {
      chunksByDocument.set(denseChunk.documentId, [denseChunk]);
    }
  }
  const vectors = new Map();
  for (const document of documents) {
    const records = chunksByDocument.get(document.id);
    if (!records?.length) {
      continue;
    }
    vectors.set(document.id, { document, embedding: averageEmbeddings(records, dimensions) });
  }
  return vectors;
}
|
|
2630
|
+
/**
 * Finds the documents most similar to a given document using dense vectors.
 *
 * Each document is represented by the normalized mean of its chunk
 * embeddings; candidates are ranked by cosine similarity to the selected
 * document, which is itself excluded.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} options.document - Selector (id or URI) of the reference document.
 * @param {number} options.topK - Maximum number of related documents.
 * @returns {Promise<{sourceDocument: object, retrievalMode: "dense", results: object[]}>}
 * @throws {CliError} When dense retrieval is disabled, the dense index file is
 *   missing, the selector does not resolve to one document, or the selected
 *   document has no dense vectors.
 */
async function findRelatedDocuments({
  workspacePath,
  document,
  topK
}) {
  const config = await loadConfig(workspacePath);
  if (!config.retrieval.dense.enabled) {
    throw new CliError("dense retrieval is disabled in config; enable retrieval.dense.enabled and rebuild", "DENSE_RETRIEVAL_DISABLED", 7 /* QueryError */);
  }
  const indexAvailable = await fileExists(denseVectorPath(workspacePath));
  if (!indexAvailable) {
    throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
  }
  const documents = await readJsonl(path19.join(workspacePath, "documents", "documents.jsonl"));
  const selected = resolveDocumentSelector(documents, document);
  const densePayload = await readDensePayload(workspacePath);
  const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
  const reference = vectors.get(selected.id);
  if (!reference) {
    throw new CliError(`dense vectors are missing for document: ${document}`, "DOCUMENT_VECTOR_MISSING", 7 /* QueryError */);
  }
  const scored = [];
  for (const candidate of vectors.values()) {
    if (candidate.document.id === selected.id) {
      continue;
    }
    scored.push({
      documentId: candidate.document.id,
      sourceId: candidate.document.sourceId,
      score: cosineSimilarity(reference.embedding, candidate.embedding),
      title: candidate.document.title,
      uri: candidate.document.uri,
      metadata: candidate.document.metadata
    });
  }
  scored.sort((a, b) => b.score - a.score);
  const sourceDocument = {
    documentId: selected.id,
    sourceId: selected.sourceId,
    title: selected.title,
    uri: selected.uri
  };
  return { sourceDocument, retrievalMode: "dense", results: scored.slice(0, topK) };
}
|
|
2669
|
+
|
|
2670
|
+
// src/query/context-builder.ts
|
|
2671
|
+
/**
 * Runs a search and packs the top results into a Markdown context document.
 *
 * Results are added in rank order until adding the next chunk text would
 * exceed `maxChars`; the first result is always included, even when it alone
 * exceeds the budget.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} options.query - Search query.
 * @param {number} options.topK - Maximum number of search results to consider.
 * @param {number} options.maxChars - Character budget for the combined chunk texts.
 * @param {string} [options.retrievalMode] - Retrieval mode forwarded to `searchIndex`.
 * @returns {Promise<{markdown: string, sources: object[], retrievalMode: string}>}
 */
async function createContext({
  workspacePath,
  query,
  topK,
  maxChars,
  retrievalMode
}) {
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
  const sources = [];
  let usedChars = 0;
  for (const { chunkId, documentId, sourceId, title, uri, headingPath, text: chunkText, metadata } of search.results) {
    const text = chunkText ?? "";
    const overBudget = usedChars + text.length > maxChars;
    if (overBudget && sources.length > 0) {
      break;
    }
    usedChars += text.length;
    sources.push({ chunkId, documentId, sourceId, title, uri, headingPath, text, metadata });
  }
  const lines = ["# Context", ""];
  sources.forEach((source, position) => {
    lines.push(
      `## Source ${position + 1}`,
      `Title: ${source.title}`,
      `URL: ${source.uri}`,
      `Chunk ID: ${source.chunkId}`
    );
    if (source.headingPath.length > 0) {
      lines.push(`Heading Path: ${source.headingPath.join(" > ")}`);
    }
    // Empty lines are never emitted inside a source section, so an empty
    // chunk text is skipped and the text directly follows the header lines.
    if (source.text !== "") {
      lines.push(source.text);
    }
  });
  return { markdown: lines.join("\n"), sources, retrievalMode: search.retrievalMode };
}
|
|
2714
|
+
|
|
2715
|
+
// src/report/diff-service.ts
|
|
2716
|
+
import path20 from "path";
|
|
2717
|
+
/**
 * Picks the run whose snapshot serves as the comparison baseline.
 *
 * Without `since`, or with the literal "last-run", the last run in the list
 * is used. Otherwise the last run whose `createdAt` sorts before `since` is
 * used, falling back to the last run when no run precedes `since`.
 *
 * @param {Array<{createdAt: string}>} runs - Runs as returned by the caller's run listing.
 * @param {string} [since] - "last-run" or a timestamp comparable with `createdAt`.
 * @returns {object|undefined} The baseline run, or undefined when there are no runs.
 */
function chooseBaselineRun(runs, since) {
  const latest = runs.at(-1);
  if (!since || since === "last-run") {
    return latest;
  }
  const earlierRuns = runs.filter((run) => run.createdAt < since);
  return earlierRuns.at(-1) ?? latest;
}
|
|
2726
|
+
/**
 * Lists documents that differ from a baseline run's snapshot.
 *
 * A document is reported when it is absent from the baseline snapshot, its
 * content hash differs from the snapshot's, or `since` is given and the
 * document's `lastChangedAt` is not before it.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace root directory.
 * @param {string} [options.sourceId] - Restrict the report to one source.
 * @param {string} [options.documentId] - Restrict the report to one document.
 * @param {string} [options.since] - "last-run" or a timestamp; also selects the baseline run.
 * @returns {Promise<{changedDocuments: object[]}>} Changed documents with previous and current hashes.
 */
async function diffWorkspace({
  workspacePath,
  sourceId,
  documentId,
  since
}) {
  const current = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
  const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
  const snapshot = baseline?.documentsSnapshot ?? [];
  const previous = new Map(snapshot.map((document) => [document.id, document]));
  const changedDocuments = [];
  for (const document of current) {
    if (sourceId && document.sourceId !== sourceId) {
      continue;
    }
    if (documentId && document.id !== documentId) {
      continue;
    }
    const prior = previous.get(document.id);
    const changed = !prior || prior.contentHash !== document.contentHash || (since && document.lastChangedAt >= since);
    if (!changed) {
      continue;
    }
    changedDocuments.push({
      id: document.id,
      title: document.title,
      uri: document.uri,
      sourceId: document.sourceId,
      previousHash: prior?.contentHash,
      currentHash: document.contentHash
    });
  }
  return { changedDocuments };
}
|
|
2748
|
+
/**
 * Renders a workspace diff as a Markdown change report.
 *
 * The "Added Documents" and "Removed or Missing Documents" sections contain
 * fixed placeholder text; changed documents are listed twice, once with URI
 * and id and once grouped under "Notable Changed Sections" with their source id.
 *
 * @param {{changedDocuments: Array<{id: string, title: string, uri: string, sourceId: string}>}} diff
 *   Diff whose `changedDocuments` entries are rendered.
 * @returns {string} The Markdown report.
 */
function renderChangeReport(diff) {
  const lines = [];
  const section = (heading, body) => {
    lines.push(heading, "", ...body);
  };
  section("# Knowledge Base Change Report", []);
  section("## Summary", [`Changed documents: ${diff.changedDocuments.length}`, ""]);
  section("## Added Documents", ["_No added documents in this simple report._", ""]);
  section(
    "## Changed Documents",
    [...diff.changedDocuments.map((document) => `- ${document.title} (${document.uri}) [${document.id}]`), ""]
  );
  section("## Removed or Missing Documents", ["_Removal tracking is not available for this report._", ""]);
  section(
    "## Notable Changed Sections",
    diff.changedDocuments.map((document) => `- ${document.sourceId}: ${document.title}`)
  );
  return lines.join("\n");
}
|
|
2773
|
+
export {
|
|
2774
|
+
addSource,
|
|
2775
|
+
assertWorkspaceExists,
|
|
2776
|
+
buildChunksForDocument,
|
|
2777
|
+
buildIndex,
|
|
2778
|
+
chunkDocuments,
|
|
2779
|
+
createContext,
|
|
2780
|
+
createIndexMapping,
|
|
2781
|
+
defaultConfig,
|
|
2782
|
+
diffWorkspace,
|
|
2783
|
+
ensureWorkspace,
|
|
2784
|
+
findRelatedDocuments,
|
|
2785
|
+
ingestSources,
|
|
2786
|
+
listSources,
|
|
2787
|
+
loadConfig,
|
|
2788
|
+
removeSource,
|
|
2789
|
+
renderChangeReport,
|
|
2790
|
+
reprocessDocuments,
|
|
2791
|
+
searchIndex,
|
|
2792
|
+
updateSource,
|
|
2793
|
+
writeDefaultConfig
|
|
2794
|
+
};
|