@tryformation/querylight-cli 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -11
- package/dist/chunk/chunker.d.ts +3 -1
- package/dist/cli/main.js +1163 -285
- package/dist/cli/run-cli.d.ts +4 -1
- package/dist/core/concurrency.d.ts +1 -0
- package/dist/core/constants.d.ts +3 -1
- package/dist/core/gzip-json.d.ts +3 -0
- package/dist/core/progress.d.ts +4 -0
- package/dist/core/urls.d.ts +1 -0
- package/dist/index/index-store.d.ts +3 -0
- package/dist/index/querylight-indexer.d.ts +3 -1
- package/dist/index.js +540 -141
- package/dist/ingest/adapters/website-adapter.d.ts +6 -1
- package/dist/ingest/adapters/website-feed-discovery.d.ts +6 -0
- package/dist/ingest/extractors/html-extractor.d.ts +1 -0
- package/dist/ingest/ingest-service.d.ts +5 -2
- package/dist/types/models.d.ts +2 -2
- package/dist/vector/dense.d.ts +3 -1
- package/dist/vector/runtime.d.ts +2 -0
- package/dist/vector/service.d.ts +20 -2
- package/dist/vector/sparse.d.ts +3 -1
- package/dist/vector/store.d.ts +8 -2
- package/package.json +1 -1
package/dist/cli/main.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
|
-
import { Command } from "commander";
|
|
4
|
+
import { Command, Option } from "commander";
|
|
5
5
|
import { stat as stat4 } from "fs/promises";
|
|
6
6
|
import path21 from "path";
|
|
7
7
|
|
|
@@ -14,6 +14,17 @@ import path4 from "path";
|
|
|
14
14
|
import { readFile, writeFile } from "fs/promises";
|
|
15
15
|
import path from "path";
|
|
16
16
|
import YAML from "yaml";
|
|
17
|
+
|
|
18
|
+
// src/core/constants.ts
|
|
19
|
+
var PACKAGE_VERSION = "0.2.1";
|
|
20
|
+
var DEFAULT_WORKSPACE = ".kb";
|
|
21
|
+
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
|
+
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
23
|
+
|
|
24
|
+
// src/core/config.ts
|
|
25
|
+
/**
 * Rewrites the legacy workspace-local model cache path to the shared default.
 *
 * @param {string} configuredPath - Cache directory as written in the config.
 * @returns {string} The shared default when the legacy value is configured,
 *   otherwise the configured path unchanged.
 */
function normalizeModelCacheDir(configuredPath) {
  if (configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR) {
    return DEFAULT_SHARED_MODEL_CACHE_DIR;
  }
  return configuredPath;
}
|
|
17
28
|
var defaultConfig = () => ({
|
|
18
29
|
workspaceVersion: 1,
|
|
19
30
|
index: {
|
|
@@ -41,17 +52,17 @@ var defaultConfig = () => ({
|
|
|
41
52
|
retrieval: {
|
|
42
53
|
defaultMode: "lexical",
|
|
43
54
|
dense: {
|
|
44
|
-
enabled:
|
|
55
|
+
enabled: true,
|
|
45
56
|
modelId: "Xenova/all-MiniLM-L6-v2",
|
|
46
|
-
cacheDir:
|
|
57
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
47
58
|
indexHashTables: 8,
|
|
48
59
|
indexRandomSeed: 42,
|
|
49
60
|
chunkTextMode: "title-heading-text"
|
|
50
61
|
},
|
|
51
62
|
sparse: {
|
|
52
|
-
enabled:
|
|
63
|
+
enabled: true,
|
|
53
64
|
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
|
|
54
|
-
cacheDir:
|
|
65
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
55
66
|
documentTopTokens: 128,
|
|
56
67
|
queryEncoding: "tokenizer-token-weights",
|
|
57
68
|
documentEncoding: "masked-lm-max-log1p-relu",
|
|
@@ -62,6 +73,7 @@ var defaultConfig = () => ({
|
|
|
62
73
|
defaultUserAgent: "querylight-cli/0.1",
|
|
63
74
|
obeyRobotsTxt: true,
|
|
64
75
|
rateLimitMs: 1e3,
|
|
76
|
+
maxConcurrentRequests: 5,
|
|
65
77
|
renderJs: false,
|
|
66
78
|
retentionDays: 365,
|
|
67
79
|
fetchArticles: true
|
|
@@ -112,11 +124,13 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
112
124
|
...parsed.retrieval ?? {},
|
|
113
125
|
dense: {
|
|
114
126
|
...defaults.retrieval.dense,
|
|
115
|
-
...parsed.retrieval?.dense ?? {}
|
|
127
|
+
...parsed.retrieval?.dense ?? {},
|
|
128
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
|
|
116
129
|
},
|
|
117
130
|
sparse: {
|
|
118
131
|
...defaults.retrieval.sparse,
|
|
119
|
-
...parsed.retrieval?.sparse ?? {}
|
|
132
|
+
...parsed.retrieval?.sparse ?? {},
|
|
133
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
|
|
120
134
|
}
|
|
121
135
|
},
|
|
122
136
|
crawler: {
|
|
@@ -162,6 +176,14 @@ async function writeJsonl(filePath, records) {
|
|
|
162
176
|
` : "", "utf8");
|
|
163
177
|
}
|
|
164
178
|
|
|
179
|
+
// src/core/progress.ts
|
|
180
|
+
/**
 * Forwards a message to the optional progress callback at the "info" level.
 *
 * @param {((level: string, message: string) => void) | null | undefined} progress
 *   Callback to notify; nothing happens when it is null or undefined.
 * @param {string} message - Text to report.
 */
function reportProgress(progress, message) {
  if (progress === null || progress === undefined) {
    return;
  }
  progress("info", message);
}
|
|
183
|
+
/**
 * Forwards a message to the optional progress callback at the "detail" level.
 *
 * @param {((level: string, message: string) => void) | null | undefined} progress
 *   Callback to notify; nothing happens when it is null or undefined.
 * @param {string} message - Text to report.
 */
function reportProgressDetail(progress, message) {
  if (progress === null || progress === undefined) {
    return;
  }
  progress("detail", message);
}
|
|
186
|
+
|
|
165
187
|
// src/chunk/chunk-store.ts
|
|
166
188
|
import path3 from "path";
|
|
167
189
|
function chunksFile(workspacePath) {
|
|
@@ -269,11 +291,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
|
|
|
269
291
|
async function chunkDocuments({
|
|
270
292
|
workspacePath,
|
|
271
293
|
sourceId,
|
|
272
|
-
documentId
|
|
294
|
+
documentId,
|
|
295
|
+
progress
|
|
273
296
|
}) {
|
|
274
297
|
const config = await loadConfig(workspacePath);
|
|
275
298
|
const documents = await readJsonl(path4.join(workspacePath, "documents", "documents.jsonl"));
|
|
276
299
|
const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
|
|
300
|
+
reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
|
|
277
301
|
const targetedDocumentIds = new Set(filtered.map((document) => document.id));
|
|
278
302
|
const existingChunks = await loadChunks(workspacePath);
|
|
279
303
|
const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
|
|
@@ -281,19 +305,17 @@ async function chunkDocuments({
|
|
|
281
305
|
existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
|
|
282
306
|
);
|
|
283
307
|
for (const document of filtered) {
|
|
308
|
+
reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
|
|
284
309
|
const raw = await readFile3(document.normalizedPath, "utf8");
|
|
285
310
|
for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
|
|
286
311
|
nextChunks.set(chunk.id, chunk);
|
|
287
312
|
}
|
|
288
313
|
}
|
|
289
314
|
await saveChunks(workspacePath, [...nextChunks.values()]);
|
|
315
|
+
reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
|
|
290
316
|
return { chunksWritten: nextChunks.size };
|
|
291
317
|
}
|
|
292
318
|
|
|
293
|
-
// src/core/constants.ts
|
|
294
|
-
var PACKAGE_VERSION = "0.1.0";
|
|
295
|
-
var DEFAULT_WORKSPACE = ".kb";
|
|
296
|
-
|
|
297
319
|
// src/core/errors.ts
|
|
298
320
|
var CliError = class extends Error {
|
|
299
321
|
constructor(message, code, exitCode, details) {
|
|
@@ -319,8 +341,6 @@ var DIRS = [
|
|
|
319
341
|
"normalized",
|
|
320
342
|
"indexes",
|
|
321
343
|
"vectors",
|
|
322
|
-
"models",
|
|
323
|
-
"models/huggingface",
|
|
324
344
|
"runs",
|
|
325
345
|
"logs"
|
|
326
346
|
];
|
|
@@ -358,11 +378,12 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
|
|
|
358
378
|
import path11 from "path";
|
|
359
379
|
|
|
360
380
|
// src/vector/dense.ts
|
|
361
|
-
import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
|
|
381
|
+
import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
|
|
362
382
|
import { mkdir as mkdir4 } from "fs/promises";
|
|
363
383
|
import path8 from "path";
|
|
364
384
|
|
|
365
385
|
// src/vector/runtime.ts
|
|
386
|
+
import os from "os";
|
|
366
387
|
import path6 from "path";
|
|
367
388
|
import { fileURLToPath } from "url";
|
|
368
389
|
import { execFile, execFileSync } from "child_process";
|
|
@@ -379,7 +400,22 @@ async function fileExists(filePath) {
|
|
|
379
400
|
}
|
|
380
401
|
|
|
381
402
|
// src/vector/runtime.ts
|
|
403
|
+
/**
 * Resolves the shared querylight home directory.
 *
 * Uses the `QLI_HOME` environment variable when it is set to a non-empty
 * value, otherwise `<user home>/.qli`.
 *
 * @returns {string} Absolute path of the home directory.
 */
function resolveQliHomeDir() {
  // An empty QLI_HOME (e.g. `QLI_HOME= qli ...`) must count as unset:
  // path.resolve("") returns the current working directory, which would
  // silently relocate the shared directory into whatever folder the CLI runs in.
  const configuredHome = process.env.QLI_HOME || path6.join(os.homedir(), ".qli");
  return path6.resolve(configuredHome);
}
|
|
382
406
|
/**
 * Turns a configured cache directory into a concrete filesystem path.
 *
 * Resolution order:
 *   1. `~/.qli` and `~/.qli/...` are mapped onto resolveQliHomeDir().
 *   2. `~` and `~/...` are mapped onto the user's home directory.
 *   3. Absolute paths are returned as given.
 *   4. Anything else is resolved against `workspacePath`, after dropping a
 *      leading `.kb/` segment.
 *
 * @param {string} workspacePath - Workspace root used for relative paths.
 * @param {string} configuredPath - Cache directory from the configuration.
 * @returns {string} The resolved cache directory.
 */
function resolveCacheDir(workspacePath, configuredPath) {
  if (configuredPath === "~/.qli") {
    return resolveQliHomeDir();
  }
  if (configuredPath.startsWith("~/.qli/")) {
    const insideQliHome = configuredPath.slice("~/.qli/".length);
    return path6.join(resolveQliHomeDir(), insideQliHome);
  }
  if (configuredPath === "~") {
    return os.homedir();
  }
  if (configuredPath.startsWith("~/")) {
    const insideUserHome = configuredPath.slice(2);
    return path6.join(os.homedir(), insideUserHome);
  }
  if (path6.isAbsolute(configuredPath)) {
    return configuredPath;
  }
  const workspaceRelative = configuredPath.replace(/^\.kb\//, "");
  return path6.resolve(workspacePath, workspaceRelative);
}
|
|
385
421
|
function packageRootFromImportMeta(importMetaUrl) {
|
|
@@ -403,6 +439,14 @@ async function ensureUvAvailable() {
|
|
|
403
439
|
execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
|
|
404
440
|
});
|
|
405
441
|
}
|
|
442
|
+
/**
 * Reports whether ensureUvAvailable() succeeds, without throwing.
 *
 * @returns {Promise<boolean>} true when the check resolves, false when it rejects.
 */
async function isUvAvailable() {
  let available = true;
  try {
    await ensureUvAvailable();
  } catch {
    // Any failure of the probe is reported as "not available".
    available = false;
  }
  return available;
}
|
|
406
450
|
async function runSparsePython({
|
|
407
451
|
workspacePath,
|
|
408
452
|
config,
|
|
@@ -446,55 +490,114 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
446
490
|
}
|
|
447
491
|
|
|
448
492
|
// src/vector/store.ts
|
|
449
|
-
import { mkdir as mkdir3,
|
|
493
|
+
import { mkdir as mkdir3, rm, writeFile as writeFile4 } from "fs/promises";
|
|
450
494
|
import path7 from "path";
|
|
495
|
+
|
|
496
|
+
// src/core/gzip-json.ts
|
|
497
|
+
import { readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
|
|
498
|
+
import { promisify } from "util";
|
|
499
|
+
import { gunzip, gzip } from "zlib";
|
|
500
|
+
var gzipAsync = promisify(gzip);
|
|
501
|
+
var gunzipAsync = promisify(gunzip);
|
|
502
|
+
/**
 * Serialises a value as 2-space-indented JSON, gzips it and writes it to disk.
 *
 * @param {string} filePath - Destination file.
 * @param {unknown} value - JSON-serialisable value.
 * @returns {Promise<void>}
 */
async function writeGzipJson(filePath, value) {
  const jsonBytes = Buffer.from(JSON.stringify(value, null, 2), "utf8");
  const compressed = await gzipAsync(jsonBytes);
  await writeFile3(filePath, compressed);
}
|
|
506
|
+
/**
 * Reads a JSON document, preferring the gzip artifact over the legacy
 * uncompressed file.
 *
 * The files are read directly instead of being probed first, so there is no
 * window between an existence check and the read, and a missing file is
 * detected from the read error itself.
 *
 * @param {string} gzipPath - Path of the gzip-compressed JSON file.
 * @param {string} [legacyPath] - Optional path of the plain JSON file used
 *   when the gzip file does not exist.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws The ENOENT error for `gzipPath` when neither file exists; any other
 *   read, decompression or parse error is propagated unchanged.
 */
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
  let compressed;
  try {
    compressed = await readFile4(gzipPath);
  } catch (gzipError) {
    if (gzipError?.code !== "ENOENT" || !legacyPath) {
      throw gzipError;
    }
    let legacyText;
    try {
      legacyText = await readFile4(legacyPath, "utf8");
    } catch (legacyError) {
      // Neither artifact exists: report the primary (gzip) path to the caller.
      throw legacyError?.code === "ENOENT" ? gzipError : legacyError;
    }
    return JSON.parse(legacyText);
  }
  return JSON.parse((await gunzipAsync(compressed)).toString("utf8"));
}
|
|
516
|
+
/**
 * Picks which artifact path to use: the gzip file when it exists, else the
 * legacy file when one is given and exists, else the gzip path again.
 *
 * @param {string} gzipPath - Preferred gzip artifact path.
 * @param {string} [legacyPath] - Optional legacy artifact path.
 * @returns {Promise<string>} The selected path.
 */
async function resolveExistingGzipOrFilePath(gzipPath, legacyPath) {
  const gzipPresent = await fileExists(gzipPath);
  if (!gzipPresent && legacyPath && await fileExists(legacyPath)) {
    return legacyPath;
  }
  return gzipPath;
}
|
|
525
|
+
|
|
526
|
+
// src/vector/store.ts
|
|
451
527
|
/**
 * Returns the `vectors` directory inside a workspace.
 *
 * @param {string} workspacePath - Workspace root.
 * @returns {string} `<workspacePath>/vectors`.
 */
function vectorsDir(workspacePath) {
  const directoryName = "vectors";
  return path7.join(workspacePath, directoryName);
}
|
|
454
|
-
function
|
|
455
|
-
return path7.join(
|
|
530
|
+
/**
 * Returns the `models/status` directory under the querylight home directory.
 *
 * @returns {string} `<resolveQliHomeDir()>/models/status`.
 */
function sharedModelStateDir() {
  const qliHome = resolveQliHomeDir();
  return path7.join(qliHome, "models", "status");
}
|
|
457
533
|
function denseVectorPath(workspacePath) {
|
|
458
|
-
return path7.join(vectorsDir(workspacePath), "dense.latest.json");
|
|
534
|
+
return path7.join(vectorsDir(workspacePath), "dense.latest.json.gz");
|
|
459
535
|
}
|
|
460
536
|
function denseMetaPath(workspacePath) {
|
|
461
|
-
return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json");
|
|
537
|
+
return path7.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
|
|
462
538
|
}
|
|
463
539
|
function sparseVectorPath(workspacePath) {
|
|
464
|
-
return path7.join(vectorsDir(workspacePath), "sparse.latest.json");
|
|
540
|
+
return path7.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
|
|
465
541
|
}
|
|
466
542
|
/** Path of the gzip-compressed sparse metadata artifact in the vectors directory. */
function sparseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path7.join(directory, "sparse.latest.meta.json.gz");
}

/** Path of the legacy (uncompressed) dense vector artifact. */
function legacyDenseVectorPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path7.join(directory, "dense.latest.json");
}

/** Path of the legacy (uncompressed) dense metadata artifact. */
function legacyDenseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path7.join(directory, "dense.latest.meta.json");
}

/** Path of the legacy (uncompressed) sparse vector artifact. */
function legacySparseVectorPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path7.join(directory, "sparse.latest.json");
}

/** Path of the legacy (uncompressed) sparse metadata artifact. */
function legacySparseMetaPath(workspacePath) {
  const directory = vectorsDir(workspacePath);
  return path7.join(directory, "sparse.latest.meta.json");
}
|
|
469
|
-
function
|
|
470
|
-
|
|
557
|
+
/**
 * Builds the path of the marker file recording that a model was pulled.
 *
 * The file lives under `<sharedModelStateDir()>/<type>/` and is named from the
 * URI-encoded model id plus the first 16 characters of `sha256()` applied to
 * the resolved cache directory.
 *
 * @param {string} type - Marker sub-directory (callers pass "dense" or "sparse").
 * @param {string} workspacePath - Workspace root used to resolve `cacheDir`.
 * @param {string} modelId - Model identifier.
 * @param {string} cacheDir - Configured cache directory.
 * @returns {string} Marker file path.
 */
function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
  const cacheKey = sha256(resolveCacheDir(workspacePath, cacheDir)).slice(0, 16);
  const markerFileName = `${encodeURIComponent(modelId)}.${cacheKey}.json`;
  return path7.join(sharedModelStateDir(), type, markerFileName);
}
|
|
562
|
+
/**
 * Marker file path for the dense model; delegates to pullMarkerPath with
 * the "dense" type.
 */
function densePullMarker(workspacePath, modelId, cacheDir) {
  const markerType = "dense";
  return pullMarkerPath(markerType, workspacePath, modelId, cacheDir);
}
|
|
472
|
-
function sparsePullMarker(workspacePath) {
|
|
473
|
-
return
|
|
565
|
+
/**
 * Marker file path for the sparse model; delegates to pullMarkerPath with
 * the "sparse" type.
 */
function sparsePullMarker(workspacePath, modelId, cacheDir) {
  const markerType = "sparse";
  return pullMarkerPath(markerType, workspacePath, modelId, cacheDir);
}
|
|
475
568
|
async function writeDensePayload(workspacePath, payload) {
|
|
476
569
|
await mkdir3(vectorsDir(workspacePath), { recursive: true });
|
|
477
|
-
await
|
|
478
|
-
await
|
|
570
|
+
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
571
|
+
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
572
|
+
await Promise.all([
|
|
573
|
+
rm(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
574
|
+
rm(legacyDenseMetaPath(workspacePath), { force: true })
|
|
575
|
+
]);
|
|
479
576
|
}
|
|
480
577
|
async function readDensePayload(workspacePath) {
|
|
481
|
-
return
|
|
578
|
+
return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
|
|
482
579
|
}
|
|
483
580
|
async function writeSparsePayload(workspacePath, payload) {
|
|
484
581
|
await mkdir3(vectorsDir(workspacePath), { recursive: true });
|
|
485
|
-
await
|
|
486
|
-
await
|
|
582
|
+
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
583
|
+
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
584
|
+
await Promise.all([
|
|
585
|
+
rm(legacySparseVectorPath(workspacePath), { force: true }),
|
|
586
|
+
rm(legacySparseMetaPath(workspacePath), { force: true })
|
|
587
|
+
]);
|
|
487
588
|
}
|
|
488
589
|
async function readSparsePayload(workspacePath) {
|
|
489
|
-
return
|
|
590
|
+
return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
|
|
490
591
|
}
|
|
491
|
-
async function writeDensePullMarker(workspacePath, value) {
|
|
492
|
-
|
|
493
|
-
await
|
|
592
|
+
/**
 * Writes the dense-model pull marker as 2-space-indented JSON, creating the
 * marker directory first.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {{ modelId: string, cacheDir: string }} model - Model settings that
 *   determine the marker location.
 * @param {unknown} value - JSON-serialisable marker content.
 * @returns {Promise<void>}
 */
async function writeDensePullMarker(workspacePath, model, value) {
  const target = densePullMarker(workspacePath, model.modelId, model.cacheDir);
  const parentDir = path7.dirname(target);
  await mkdir3(parentDir, { recursive: true });
  const serialized = JSON.stringify(value, null, 2);
  await writeFile4(target, serialized, "utf8");
}
|
|
495
|
-
async function writeSparsePullMarker(workspacePath, value) {
|
|
496
|
-
|
|
497
|
-
await
|
|
597
|
+
/**
 * Writes the sparse-model pull marker as 2-space-indented JSON, creating the
 * marker directory first.
 *
 * @param {string} workspacePath - Workspace root.
 * @param {{ modelId: string, cacheDir: string }} model - Model settings that
 *   determine the marker location.
 * @param {unknown} value - JSON-serialisable marker content.
 * @returns {Promise<void>}
 */
async function writeSparsePullMarker(workspacePath, model, value) {
  const target = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
  const parentDir = path7.dirname(target);
  await mkdir3(parentDir, { recursive: true });
  const serialized = JSON.stringify(value, null, 2);
  await writeFile4(target, serialized, "utf8");
}
|
|
499
602
|
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
500
603
|
const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
|
|
@@ -504,30 +607,72 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
|
504
607
|
configured: dense.enabled,
|
|
505
608
|
modelId: dense.modelId,
|
|
506
609
|
cacheDir: denseCacheDir,
|
|
507
|
-
available: await fileExists(densePullMarker(workspacePath)),
|
|
508
|
-
artifactExists: await fileExists(denseVectorPath(workspacePath))
|
|
610
|
+
available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
|
|
611
|
+
artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
|
|
509
612
|
},
|
|
510
613
|
sparse: {
|
|
511
614
|
configured: sparse.enabled,
|
|
512
615
|
modelId: sparse.modelId,
|
|
513
616
|
cacheDir: sparseCacheDir,
|
|
514
617
|
uvAvailable,
|
|
515
|
-
available: await fileExists(sparsePullMarker(workspacePath)),
|
|
516
|
-
artifactExists: await fileExists(sparseVectorPath(workspacePath))
|
|
618
|
+
available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
|
|
619
|
+
artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
|
|
517
620
|
}
|
|
518
621
|
};
|
|
519
622
|
}
|
|
520
623
|
|
|
521
624
|
// src/vector/text.ts
|
|
625
|
+
// Lower-cased English and German section headings that isLowSignalHeading
// treats as carrying no topical signal. The German "Überblick" is listed both
// with the umlaut and as the ASCII spelling "uberblick".
var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
  "choose this instead of", "how xyz runs it",
  "naechste schritte", "next steps",
  "overview", "passend wenn",
  "problem", "right fit",
  "waehlen sie das stattdessen", "was sie bekommen",
  "what you get", "wie xyz es umsetzt",
  "uberblick", "\xFCberblick"
]);
|
|
641
|
+
/**
 * Canonical form used to compare headings: surrounding whitespace removed,
 * then lower-cased.
 *
 * @param {string} value - Raw heading text.
 * @returns {string} Normalised heading.
 */
function normalizeHeading(value) {
  const trimmed = value.trim();
  return trimmed.toLowerCase();
}
|
|
644
|
+
/**
 * Tells whether a heading, once normalised, is listed in LOW_SIGNAL_HEADINGS.
 *
 * @param {string} value - Raw heading text.
 * @returns {boolean}
 */
function isLowSignalHeading(value) {
  const lookupKey = normalizeHeading(value);
  return LOW_SIGNAL_HEADINGS.has(lookupKey);
}
|
|
647
|
+
/**
 * Removes the first non-blank line of `text` when it is a Markdown ATX heading
 * (`#` to `######`) whose title equals `heading` after normalisation.
 *
 * @param {string} text - Chunk text, possibly starting with a heading line.
 * @param {string} heading - Heading to strip.
 * @returns {string} The trimmed text without the heading line, or `text`
 *   unchanged when the first non-blank line is not that heading.
 */
function stripLeadingHeading(text, heading) {
  const lines = text.split("\n");
  const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
  if (firstContentIndex < 0) {
    return text;
  }
  // trimEnd() drops the "\r" left behind by CRLF line endings. Without it the
  // pattern can never match such a line, because "." does not match "\r" and
  // "$" only matches at the very end of the string.
  const headingLine = lines[firstContentIndex].trimEnd();
  const match = /^#{1,6}\s+(.+)$/.exec(headingLine);
  if (!match || normalizeHeading(match[1]) !== normalizeHeading(heading)) {
    return text;
  }
  lines.splice(firstContentIndex, 1);
  return lines.join("\n").trim();
}
|
|
660
|
+
/**
 * Builds the text that is encoded for a chunk: the title, the headings that
 * add information, and the chunk body, joined by blank lines.
 *
 * A heading is treated as redundant when it is a low-signal heading or repeats
 * the title. Redundant headings are left out of the heading list, and the
 * deepest one is stripped from the start of the body if it appears there.
 *
 * @param {{ title: string, headingPath: string[], text: string }} chunk
 * @returns {string} Non-empty parts joined with "\n\n".
 */
function createVectorText(chunk) {
  // Normalise the title once instead of once per heading comparison.
  const normalizedTitle = normalizeHeading(chunk.title);
  const isRedundant = (heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizedTitle;
  const meaningfulHeadings = chunk.headingPath.filter((heading) => !isRedundant(heading));
  const textHeading = [...chunk.headingPath].reverse().find(isRedundant);
  // Trim in both branches: stripLeadingHeading returns its input untouched
  // when the text does not start with the heading, which would otherwise leak
  // surrounding whitespace (or a whitespace-only body) into the result.
  const body = (textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text).trim();
  return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
}
|
|
522
666
|
function createDenseChunkText(chunk) {
|
|
523
|
-
return
|
|
667
|
+
return createVectorText(chunk);
|
|
524
668
|
}
|
|
525
669
|
function createSparseChunkText(chunk) {
|
|
526
|
-
return
|
|
670
|
+
return createVectorText(chunk);
|
|
527
671
|
}
|
|
528
672
|
|
|
529
673
|
// src/vector/dense.ts
|
|
530
674
|
var denseEmbedderFactory = null;
|
|
675
|
+
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
531
676
|
async function createEmbedder(cacheDir, modelId) {
|
|
532
677
|
if (denseEmbedderFactory) {
|
|
533
678
|
return denseEmbedderFactory(cacheDir, modelId);
|
|
@@ -539,6 +684,9 @@ async function createEmbedder(cacheDir, modelId) {
|
|
|
539
684
|
return output.tolist()[0];
|
|
540
685
|
};
|
|
541
686
|
}
|
|
687
|
+
/**
 * Scores every stored chunk against the query vector with cosineSimilarity
 * and returns the best matches.
 *
 * @param {{ chunks: Array<{ chunkId: string, embedding: number[] }> }} payload
 * @param {number[]} vector - Query embedding.
 * @param {number} topK - Upper bound passed to `slice` for the result length.
 * @returns {Array<[string, number]>} `[chunkId, score]` pairs, highest score first.
 */
function exactDenseQuery(payload, vector, topK) {
  const scored = [];
  for (const chunk of payload.chunks) {
    scored.push([chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]);
  }
  scored.sort((a, b) => b[1] - a[1]);
  return scored.slice(0, topK);
}
|
|
542
690
|
async function pullDenseModel(workspacePath, config) {
|
|
543
691
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
544
692
|
await mkdir4(cacheDir, { recursive: true });
|
|
@@ -547,7 +695,8 @@ async function pullDenseModel(workspacePath, config) {
|
|
|
547
695
|
}
|
|
548
696
|
async function buildDenseVectors({
|
|
549
697
|
workspacePath,
|
|
550
|
-
config
|
|
698
|
+
config,
|
|
699
|
+
progress
|
|
551
700
|
}) {
|
|
552
701
|
const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
553
702
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
@@ -555,6 +704,7 @@ async function buildDenseVectors({
|
|
|
555
704
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
556
705
|
const records = [];
|
|
557
706
|
let dimensions = 0;
|
|
707
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
558
708
|
for (const chunk of chunks) {
|
|
559
709
|
const embedding = await embed(createDenseChunkText(chunk));
|
|
560
710
|
dimensions ||= embedding.length;
|
|
@@ -568,7 +718,11 @@ async function buildDenseVectors({
|
|
|
568
718
|
text: chunk.text,
|
|
569
719
|
embedding
|
|
570
720
|
});
|
|
721
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
722
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
723
|
+
}
|
|
571
724
|
}
|
|
725
|
+
reportProgress(progress, "Building dense vector index");
|
|
572
726
|
const index = new VectorFieldIndex({
|
|
573
727
|
numHashTables: config.indexHashTables,
|
|
574
728
|
dimensions,
|
|
@@ -592,6 +746,7 @@ async function buildDenseVectors({
|
|
|
592
746
|
chunks: records
|
|
593
747
|
};
|
|
594
748
|
await writeDensePayload(workspacePath, payload);
|
|
749
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
595
750
|
return payload;
|
|
596
751
|
}
|
|
597
752
|
async function denseQuery({
|
|
@@ -604,12 +759,19 @@ async function denseQuery({
|
|
|
604
759
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
605
760
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
606
761
|
const vector = await embed(query);
|
|
762
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
763
|
+
return exactDenseQuery(payload, vector, topK);
|
|
764
|
+
}
|
|
607
765
|
const index = new VectorFieldIndex({
|
|
608
766
|
numHashTables: payload.metadata.hashTables,
|
|
609
767
|
dimensions: payload.metadata.dimensions,
|
|
610
768
|
random: createSeededRandom(payload.metadata.randomSeed)
|
|
611
769
|
}).loadState(payload.indexState);
|
|
612
|
-
|
|
770
|
+
const approximateHits = index.query(vector, topK);
|
|
771
|
+
if (approximateHits.length >= topK) {
|
|
772
|
+
return approximateHits;
|
|
773
|
+
}
|
|
774
|
+
return exactDenseQuery(payload, vector, topK);
|
|
613
775
|
}
|
|
614
776
|
|
|
615
777
|
// src/vector/sparse.ts
|
|
@@ -717,10 +879,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
|
|
|
717
879
|
}
|
|
718
880
|
async function buildSparseVectors({
|
|
719
881
|
workspacePath,
|
|
720
|
-
config
|
|
882
|
+
config,
|
|
883
|
+
progress
|
|
721
884
|
}) {
|
|
722
885
|
const chunks = await readJsonl(path9.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
886
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
|
|
723
887
|
const built = await buildSparseDocuments(workspacePath, config, chunks);
|
|
888
|
+
reportProgress(progress, "Building sparse vector index");
|
|
724
889
|
const index = new SparseVectorFieldIndex();
|
|
725
890
|
for (const record of built.chunks) {
|
|
726
891
|
index.insert(record.chunkId, [record.vector]);
|
|
@@ -742,6 +907,7 @@ async function buildSparseVectors({
|
|
|
742
907
|
queryTokenWeights: built.queryTokenWeights
|
|
743
908
|
};
|
|
744
909
|
await writeSparsePayload(workspacePath, payload);
|
|
910
|
+
reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
|
|
745
911
|
return payload;
|
|
746
912
|
}
|
|
747
913
|
async function sparseQuery({
|
|
@@ -759,6 +925,7 @@ async function sparseQuery({
|
|
|
759
925
|
}
|
|
760
926
|
|
|
761
927
|
// src/vector/service.ts
|
|
928
|
+
var pullModelsOverrideForTests = null;
|
|
762
929
|
function resolveModelPullPlan({
|
|
763
930
|
pullDenseFlag,
|
|
764
931
|
pullSparseFlag,
|
|
@@ -775,90 +942,136 @@ function resolveModelPullPlan({
|
|
|
775
942
|
pullSparse: uvAvailable
|
|
776
943
|
};
|
|
777
944
|
}
|
|
945
|
+
/**
 * Decides which enabled models still need to be pulled.
 *
 * Dense is pulled when it is enabled and not yet available. Sparse
 * additionally requires `status.sparse.uvAvailable`.
 *
 * @param {{ config: object, status: object }} options - Loaded configuration
 *   and the model status to compare it with.
 * @returns {{ pullDense: boolean, pullSparse: boolean }}
 */
function resolveMissingConfiguredModelPullPlan({ config, status }) {
  const { dense, sparse } = config.retrieval;
  const denseMissing = dense.enabled && !status.dense.available;
  const sparseMissing = sparse.enabled && status.sparse.uvAvailable && !status.sparse.available;
  return { pullDense: denseMissing, pullSparse: sparseMissing };
}
|
|
778
954
|
async function buildVectorArtifacts({
|
|
779
955
|
workspacePath,
|
|
780
956
|
config,
|
|
781
957
|
denseOverride,
|
|
782
958
|
sparseOverride,
|
|
783
|
-
buildAvailableModels = false
|
|
959
|
+
buildAvailableModels = false,
|
|
960
|
+
progress
|
|
784
961
|
}) {
|
|
785
|
-
const
|
|
786
|
-
|
|
787
|
-
await ensureUvAvailable();
|
|
788
|
-
return true;
|
|
789
|
-
} catch {
|
|
790
|
-
return false;
|
|
791
|
-
}
|
|
792
|
-
})()) : null;
|
|
962
|
+
const uvAvailable = await isUvAvailable();
|
|
963
|
+
const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
|
|
793
964
|
const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
|
|
794
|
-
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
|
|
795
|
-
const
|
|
965
|
+
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
|
|
966
|
+
const result = {};
|
|
796
967
|
if (denseEnabled) {
|
|
797
|
-
|
|
968
|
+
reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
|
|
969
|
+
result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
|
|
970
|
+
}
|
|
971
|
+
if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
|
|
972
|
+
reportProgress(progress, "Skipping sparse vectors because uv is not available");
|
|
798
973
|
}
|
|
799
974
|
if (sparseEnabled) {
|
|
800
|
-
|
|
975
|
+
reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
|
|
976
|
+
result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
|
|
801
977
|
}
|
|
802
|
-
return
|
|
978
|
+
return result;
|
|
803
979
|
}
|
|
804
980
|
async function pullModels({
|
|
805
981
|
workspacePath,
|
|
806
982
|
config,
|
|
807
983
|
pullDense,
|
|
808
|
-
pullSparse
|
|
984
|
+
pullSparse,
|
|
985
|
+
progress
|
|
809
986
|
}) {
|
|
987
|
+
if (pullModelsOverrideForTests) {
|
|
988
|
+
await pullModelsOverrideForTests({ workspacePath, config, pullDense, pullSparse, progress });
|
|
989
|
+
return;
|
|
990
|
+
}
|
|
810
991
|
if (pullDense) {
|
|
992
|
+
reportProgress(progress, `Pulling dense model ${config.retrieval.dense.modelId}`);
|
|
811
993
|
await pullDenseModel(workspacePath, config.retrieval.dense);
|
|
812
|
-
await writeDensePullMarker(workspacePath, {
|
|
994
|
+
await writeDensePullMarker(workspacePath, config.retrieval.dense, {
|
|
813
995
|
pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
814
|
-
modelId: config.retrieval.dense.modelId
|
|
996
|
+
modelId: config.retrieval.dense.modelId,
|
|
997
|
+
cacheDir: config.retrieval.dense.cacheDir
|
|
815
998
|
});
|
|
999
|
+
reportProgress(progress, `Dense model ready: ${config.retrieval.dense.modelId}`);
|
|
816
1000
|
}
|
|
817
1001
|
if (pullSparse) {
|
|
1002
|
+
reportProgress(progress, `Pulling sparse model ${config.retrieval.sparse.modelId}`);
|
|
818
1003
|
await pullSparseModel(workspacePath, config.retrieval.sparse);
|
|
819
|
-
await writeSparsePullMarker(workspacePath, {
|
|
1004
|
+
await writeSparsePullMarker(workspacePath, config.retrieval.sparse, {
|
|
820
1005
|
pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
821
|
-
modelId: config.retrieval.sparse.modelId
|
|
1006
|
+
modelId: config.retrieval.sparse.modelId,
|
|
1007
|
+
cacheDir: config.retrieval.sparse.cacheDir
|
|
822
1008
|
});
|
|
1009
|
+
reportProgress(progress, `Sparse model ready: ${config.retrieval.sparse.modelId}`);
|
|
823
1010
|
}
|
|
824
1011
|
}
|
|
825
1012
|
async function getModelStatus(workspacePath, config) {
|
|
826
|
-
|
|
827
|
-
try {
|
|
828
|
-
await ensureUvAvailable();
|
|
829
|
-
uvAvailable = true;
|
|
830
|
-
} catch {
|
|
831
|
-
uvAvailable = false;
|
|
832
|
-
}
|
|
1013
|
+
const uvAvailable = await isUvAvailable();
|
|
833
1014
|
return buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
|
|
834
1015
|
}
|
|
835
1016
|
|
|
836
1017
|
// src/index/index-store.ts
|
|
837
|
-
import {
|
|
1018
|
+
import { mkdir as mkdir6, rm as rm2 } from "fs/promises";
|
|
838
1019
|
import path10 from "path";
|
|
1020
|
+
/** Gzip index artifact for a specific build stamp. */
function versionedIndexPath(workspacePath, stamp) {
  const fileName = `${stamp}.json.gz`;
  return path10.join(workspacePath, "indexes", fileName);
}

/** Legacy (uncompressed) index artifact for a specific build stamp. */
function versionedLegacyIndexPath(workspacePath, stamp) {
  const fileName = `${stamp}.json`;
  return path10.join(workspacePath, "indexes", fileName);
}

/** Gzip metadata artifact for a specific build stamp. */
function versionedMetaPath(workspacePath, stamp) {
  const fileName = `${stamp}.meta.json.gz`;
  return path10.join(workspacePath, "indexes", fileName);
}

/** Legacy (uncompressed) metadata artifact for a specific build stamp. */
function versionedLegacyMetaPath(workspacePath, stamp) {
  const fileName = `${stamp}.meta.json`;
  return path10.join(workspacePath, "indexes", fileName);
}

/** Gzip artifact holding the latest index state. */
function latestIndexPath(workspacePath) {
  const fileName = "latest.json.gz";
  return path10.join(workspacePath, "indexes", fileName);
}

/** Legacy (uncompressed) artifact holding the latest index state. */
function legacyLatestIndexPath(workspacePath) {
  const fileName = "latest.json";
  return path10.join(workspacePath, "indexes", fileName);
}

/** Gzip artifact holding the latest index metadata. */
function latestMetaPath(workspacePath) {
  const fileName = "latest.meta.json.gz";
  return path10.join(workspacePath, "indexes", fileName);
}

/** Legacy (uncompressed) artifact holding the latest index metadata. */
function legacyLatestMetaPath(workspacePath) {
  const fileName = "latest.meta.json";
  return path10.join(workspacePath, "indexes", fileName);
}
|
|
839
1044
|
async function writeIndexArtifacts({
|
|
840
1045
|
workspacePath,
|
|
841
1046
|
indexState,
|
|
842
1047
|
metadata
|
|
843
1048
|
}) {
|
|
844
1049
|
const stamp = metadata.createdAt.replace(/[:.]/g, "-");
|
|
845
|
-
const indexPath =
|
|
846
|
-
const metaPath =
|
|
847
|
-
const
|
|
848
|
-
const
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
await
|
|
852
|
-
await
|
|
853
|
-
await
|
|
854
|
-
await
|
|
855
|
-
|
|
1050
|
+
const indexPath = versionedIndexPath(workspacePath, stamp);
|
|
1051
|
+
const metaPath = versionedMetaPath(workspacePath, stamp);
|
|
1052
|
+
const latestIndexArtifactPath = latestIndexPath(workspacePath);
|
|
1053
|
+
const latestMetadataArtifactPath = latestMetaPath(workspacePath);
|
|
1054
|
+
await mkdir6(path10.join(workspacePath, "indexes"), { recursive: true });
|
|
1055
|
+
await writeGzipJson(indexPath, indexState);
|
|
1056
|
+
await writeGzipJson(metaPath, metadata);
|
|
1057
|
+
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
1058
|
+
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
1059
|
+
await Promise.all([
|
|
1060
|
+
rm2(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
1061
|
+
rm2(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
1062
|
+
rm2(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
1063
|
+
rm2(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
1064
|
+
]);
|
|
1065
|
+
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
856
1066
|
}
|
|
857
1067
|
async function readLatestIndexState(workspacePath) {
|
|
858
|
-
return
|
|
1068
|
+
return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
|
|
859
1069
|
}
|
|
860
1070
|
async function readLatestIndexMetadata(workspacePath) {
|
|
861
|
-
return
|
|
1071
|
+
return readJsonFromGzipOrFile(latestMetaPath(workspacePath), legacyLatestMetaPath(workspacePath));
|
|
1072
|
+
}
|
|
1073
|
+
async function resolveLatestIndexArtifactPath(workspacePath) {
|
|
1074
|
+
return resolveExistingGzipOrFilePath(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
|
|
862
1075
|
}
|
|
863
1076
|
|
|
864
1077
|
// src/index/querylight-indexer.ts
|
|
@@ -900,14 +1113,17 @@ async function buildIndex({
|
|
|
900
1113
|
workspacePath,
|
|
901
1114
|
denseOverride,
|
|
902
1115
|
sparseOverride,
|
|
903
|
-
buildAvailableModels = false
|
|
1116
|
+
buildAvailableModels = false,
|
|
1117
|
+
progress
|
|
904
1118
|
}) {
|
|
905
1119
|
const config = await loadConfig(workspacePath);
|
|
1120
|
+
reportProgress(progress, "Loading documents, chunks, and sources");
|
|
906
1121
|
const chunks = await readJsonl(path11.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
907
1122
|
const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
|
|
908
1123
|
const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
|
|
909
1124
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
910
1125
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
1126
|
+
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
911
1127
|
for (const chunk of chunks) {
|
|
912
1128
|
index.index({
|
|
913
1129
|
id: chunk.id,
|
|
@@ -922,6 +1138,7 @@ async function buildIndex({
|
|
|
922
1138
|
}
|
|
923
1139
|
});
|
|
924
1140
|
}
|
|
1141
|
+
reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
925
1142
|
const createdAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
926
1143
|
const metadata = {
|
|
927
1144
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
@@ -934,14 +1151,17 @@ async function buildIndex({
|
|
|
934
1151
|
fields: Object.keys(index.mapping),
|
|
935
1152
|
indexHash: sha256(JSON.stringify(index.indexState))
|
|
936
1153
|
};
|
|
1154
|
+
reportProgress(progress, "Writing lexical index artifacts");
|
|
937
1155
|
const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
|
|
938
1156
|
const vectors = await buildVectorArtifacts({
|
|
939
1157
|
workspacePath,
|
|
940
1158
|
config,
|
|
941
1159
|
denseOverride,
|
|
942
1160
|
sparseOverride,
|
|
943
|
-
buildAvailableModels
|
|
1161
|
+
buildAvailableModels,
|
|
1162
|
+
progress
|
|
944
1163
|
});
|
|
1164
|
+
reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
|
|
945
1165
|
return {
|
|
946
1166
|
metadata,
|
|
947
1167
|
indexPath: artifacts.indexPath,
|
|
@@ -953,6 +1173,27 @@ async function buildIndex({
|
|
|
953
1173
|
// src/ingest/ingest-service.ts
|
|
954
1174
|
import path17 from "path";
|
|
955
1175
|
|
|
1176
|
+
// src/core/concurrency.ts
|
|
1177
|
+
async function mapWithConcurrency(items, limit, worker) {
|
|
1178
|
+
if (items.length === 0) {
|
|
1179
|
+
return;
|
|
1180
|
+
}
|
|
1181
|
+
const concurrency = Math.max(1, Math.floor(limit));
|
|
1182
|
+
let nextIndex = 0;
|
|
1183
|
+
await Promise.all(
|
|
1184
|
+
Array.from({ length: Math.min(concurrency, items.length) }, async () => {
|
|
1185
|
+
while (true) {
|
|
1186
|
+
const index = nextIndex;
|
|
1187
|
+
nextIndex += 1;
|
|
1188
|
+
if (index >= items.length) {
|
|
1189
|
+
return;
|
|
1190
|
+
}
|
|
1191
|
+
await worker(items[index], index);
|
|
1192
|
+
}
|
|
1193
|
+
})
|
|
1194
|
+
);
|
|
1195
|
+
}
|
|
1196
|
+
|
|
956
1197
|
// src/core/runs.ts
|
|
957
1198
|
import path12 from "path";
|
|
958
1199
|
async function writeRun(workspacePath, run) {
|
|
@@ -1021,7 +1262,7 @@ async function removeSource(workspacePath, sourceId) {
|
|
|
1021
1262
|
}
|
|
1022
1263
|
|
|
1023
1264
|
// src/ingest/document-utils.ts
|
|
1024
|
-
import { mkdir as
|
|
1265
|
+
import { mkdir as mkdir7, rm as rm3, writeFile as writeFile5 } from "fs/promises";
|
|
1025
1266
|
import path14 from "path";
|
|
1026
1267
|
|
|
1027
1268
|
// src/normalize/normalize-markdown.ts
|
|
@@ -1074,7 +1315,7 @@ async function writeNormalizedDocument({
|
|
|
1074
1315
|
normalizedPath,
|
|
1075
1316
|
markdown
|
|
1076
1317
|
}) {
|
|
1077
|
-
await
|
|
1318
|
+
await mkdir7(path14.dirname(normalizedPath), { recursive: true });
|
|
1078
1319
|
await writeFile5(
|
|
1079
1320
|
normalizedPath,
|
|
1080
1321
|
withFrontmatter(
|
|
@@ -1097,8 +1338,8 @@ async function writeNormalizedDocument({
|
|
|
1097
1338
|
}
|
|
1098
1339
|
async function deleteDocumentArtifacts(document) {
|
|
1099
1340
|
await Promise.all([
|
|
1100
|
-
document.rawPath ?
|
|
1101
|
-
|
|
1341
|
+
document.rawPath ? rm3(document.rawPath, { force: true }) : Promise.resolve(),
|
|
1342
|
+
rm3(document.normalizedPath, { force: true })
|
|
1102
1343
|
]);
|
|
1103
1344
|
}
|
|
1104
1345
|
|
|
@@ -1122,13 +1363,13 @@ async function listDirectoryFiles(source) {
|
|
|
1122
1363
|
|
|
1123
1364
|
// src/ingest/adapters/file-adapter.ts
|
|
1124
1365
|
import { basename, extname, resolve } from "path";
|
|
1125
|
-
import { mkdir as
|
|
1366
|
+
import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile6 } from "fs/promises";
|
|
1126
1367
|
|
|
1127
1368
|
// src/ingest/extractors/docx-extractor.ts
|
|
1128
1369
|
import mammoth from "mammoth";
|
|
1129
1370
|
async function extractDocx(filePath) {
|
|
1130
|
-
const
|
|
1131
|
-
return
|
|
1371
|
+
const result = await mammoth.extractRawText({ path: filePath });
|
|
1372
|
+
return result.value;
|
|
1132
1373
|
}
|
|
1133
1374
|
|
|
1134
1375
|
// src/ingest/extractors/html-extractor.ts
|
|
@@ -1142,9 +1383,41 @@ function stripBoilerplate(html) {
|
|
|
1142
1383
|
|
|
1143
1384
|
// src/ingest/extractors/html-extractor.ts
|
|
1144
1385
|
var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
1386
|
+
var LOW_SIGNAL_SECTION_SELECTORS = [
|
|
1387
|
+
"script",
|
|
1388
|
+
"style",
|
|
1389
|
+
"noscript",
|
|
1390
|
+
"template",
|
|
1391
|
+
"[data-blog-service-recommendations]",
|
|
1392
|
+
"[data-blog-related-posts]"
|
|
1393
|
+
].join(", ");
|
|
1145
1394
|
function cleanText(value) {
|
|
1146
1395
|
return value.replace(/\s+/g, " ").trim();
|
|
1147
1396
|
}
|
|
1397
|
+
function pruneLowSignalContent($) {
|
|
1398
|
+
$(LOW_SIGNAL_SECTION_SELECTORS).remove();
|
|
1399
|
+
$("form").each((_, element) => {
|
|
1400
|
+
const action = cleanText($(element).attr("action") ?? "");
|
|
1401
|
+
if (action.includes("substack.com/subscribe")) {
|
|
1402
|
+
$(element).closest("section").remove();
|
|
1403
|
+
}
|
|
1404
|
+
});
|
|
1405
|
+
}
|
|
1406
|
+
function stripEscapedJsonPayloads(markdown) {
|
|
1407
|
+
return markdown.split("\n").filter((line) => {
|
|
1408
|
+
const trimmed = line.trim();
|
|
1409
|
+
if (trimmed.length === 0) {
|
|
1410
|
+
return true;
|
|
1411
|
+
}
|
|
1412
|
+
if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
|
|
1413
|
+
return false;
|
|
1414
|
+
}
|
|
1415
|
+
if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
|
|
1416
|
+
return false;
|
|
1417
|
+
}
|
|
1418
|
+
return true;
|
|
1419
|
+
}).join("\n").replace(/\n{3,}/g, "\n\n").trim();
|
|
1420
|
+
}
|
|
1148
1421
|
function chooseMeaningfulTitle($, fallbackTitle) {
|
|
1149
1422
|
const candidates = [
|
|
1150
1423
|
cleanText($("meta[property='og:title']").attr("content") ?? ""),
|
|
@@ -1181,14 +1454,27 @@ ${parts.join("\n\n")}
|
|
|
1181
1454
|
function extractHtmlToMarkdown(html) {
|
|
1182
1455
|
const cleaned = stripBoilerplate(html);
|
|
1183
1456
|
const $ = load(cleaned);
|
|
1457
|
+
pruneLowSignalContent($);
|
|
1184
1458
|
const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
|
|
1185
1459
|
const title = chooseMeaningfulTitle($, fallbackTitle);
|
|
1186
1460
|
const root = $("main").first().html() ?? $.root().html() ?? cleaned;
|
|
1187
1461
|
return {
|
|
1188
|
-
markdown: turndown.turndown(root),
|
|
1462
|
+
markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
|
|
1189
1463
|
title
|
|
1190
1464
|
};
|
|
1191
1465
|
}
|
|
1466
|
+
function extractCanonicalUriFromHtml(html, baseUrl) {
|
|
1467
|
+
const $ = load(html);
|
|
1468
|
+
const href = $("link[rel='canonical']").first().attr("href")?.trim();
|
|
1469
|
+
if (!href) {
|
|
1470
|
+
return null;
|
|
1471
|
+
}
|
|
1472
|
+
try {
|
|
1473
|
+
return new URL(href, baseUrl).href;
|
|
1474
|
+
} catch {
|
|
1475
|
+
return null;
|
|
1476
|
+
}
|
|
1477
|
+
}
|
|
1192
1478
|
function parseDateCandidate(value) {
|
|
1193
1479
|
const trimmed = value.trim();
|
|
1194
1480
|
if (!trimmed) {
|
|
@@ -1251,16 +1537,16 @@ function extractPublicationDateFromHtml(html) {
|
|
|
1251
1537
|
}
|
|
1252
1538
|
|
|
1253
1539
|
// src/ingest/extractors/markdown-extractor.ts
|
|
1254
|
-
import { readFile as
|
|
1540
|
+
import { readFile as readFile5 } from "fs/promises";
|
|
1255
1541
|
async function extractMarkdown(filePath) {
|
|
1256
|
-
return
|
|
1542
|
+
return readFile5(filePath, "utf8");
|
|
1257
1543
|
}
|
|
1258
1544
|
|
|
1259
1545
|
// src/ingest/extractors/pdf-extractor.ts
|
|
1260
|
-
import { readFile as
|
|
1546
|
+
import { readFile as readFile6 } from "fs/promises";
|
|
1261
1547
|
import { PDFParse } from "pdf-parse";
|
|
1262
1548
|
async function extractPdf(filePath) {
|
|
1263
|
-
const buffer = await
|
|
1549
|
+
const buffer = await readFile6(filePath);
|
|
1264
1550
|
const parser = new PDFParse({ data: buffer });
|
|
1265
1551
|
try {
|
|
1266
1552
|
const parsed = await parser.getText();
|
|
@@ -1271,9 +1557,9 @@ async function extractPdf(filePath) {
|
|
|
1271
1557
|
}
|
|
1272
1558
|
|
|
1273
1559
|
// src/ingest/extractors/text-extractor.ts
|
|
1274
|
-
import { readFile as
|
|
1560
|
+
import { readFile as readFile7 } from "fs/promises";
|
|
1275
1561
|
async function extractText(filePath) {
|
|
1276
|
-
return
|
|
1562
|
+
return readFile7(filePath, "utf8");
|
|
1277
1563
|
}
|
|
1278
1564
|
|
|
1279
1565
|
// src/ingest/adapters/file-adapter.ts
|
|
@@ -1308,7 +1594,7 @@ async function extractFileContent(filePath, mimeType) {
|
|
|
1308
1594
|
${text}`, raw: text };
|
|
1309
1595
|
}
|
|
1310
1596
|
if (mimeType === "text/html") {
|
|
1311
|
-
const raw = await
|
|
1597
|
+
const raw = await readFile8(filePath, "utf8");
|
|
1312
1598
|
const extracted = extractHtmlToMarkdown(raw);
|
|
1313
1599
|
return { title: extracted.title, markdown: `# ${extracted.title}
|
|
1314
1600
|
|
|
@@ -1364,8 +1650,8 @@ async function ingestFile({
|
|
|
1364
1650
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
1365
1651
|
const indexedAt = now;
|
|
1366
1652
|
const crawledAt = now;
|
|
1367
|
-
await
|
|
1368
|
-
await
|
|
1653
|
+
await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
|
|
1654
|
+
await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1369
1655
|
if (extracted.raw) {
|
|
1370
1656
|
await writeFile6(rawPath, extracted.raw, "utf8");
|
|
1371
1657
|
}
|
|
@@ -1430,7 +1716,7 @@ ${content}`;
|
|
|
1430
1716
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
1431
1717
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
1432
1718
|
const indexedAt = now;
|
|
1433
|
-
await
|
|
1719
|
+
await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
|
|
1434
1720
|
await writeNormalizedDocument({
|
|
1435
1721
|
documentId,
|
|
1436
1722
|
sourceId: source.id,
|
|
@@ -1474,7 +1760,7 @@ async function reprocessStoredDocument(document, source) {
|
|
|
1474
1760
|
if (!document.rawPath) {
|
|
1475
1761
|
return null;
|
|
1476
1762
|
}
|
|
1477
|
-
const raw = await
|
|
1763
|
+
const raw = await readFile8(document.rawPath, "utf8");
|
|
1478
1764
|
const fallbackTitle = document.title || basename(document.uri);
|
|
1479
1765
|
const extracted = await extractRawContent(raw, document.mimeType, fallbackTitle);
|
|
1480
1766
|
const contentHash = sha256(extracted.markdown);
|
|
@@ -1591,8 +1877,21 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
1591
1877
|
}
|
|
1592
1878
|
|
|
1593
1879
|
// src/ingest/adapters/url-adapter.ts
|
|
1594
|
-
import { mkdir as
|
|
1880
|
+
import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
|
|
1595
1881
|
import path16 from "path";
|
|
1882
|
+
|
|
1883
|
+
// src/core/urls.ts
|
|
1884
|
+
function normalizeRemoteUrl(uri) {
|
|
1885
|
+
try {
|
|
1886
|
+
const parsed = new URL(uri);
|
|
1887
|
+
parsed.hash = "";
|
|
1888
|
+
return parsed.href;
|
|
1889
|
+
} catch {
|
|
1890
|
+
return uri;
|
|
1891
|
+
}
|
|
1892
|
+
}
|
|
1893
|
+
|
|
1894
|
+
// src/ingest/adapters/url-adapter.ts
|
|
1596
1895
|
function buildHttpCache(response2, validatedAt) {
|
|
1597
1896
|
return {
|
|
1598
1897
|
etag: response2.headers.get("etag") ?? void 0,
|
|
@@ -1617,25 +1916,26 @@ async function normalizeRemoteDocument({
|
|
|
1617
1916
|
responseStatus
|
|
1618
1917
|
}) {
|
|
1619
1918
|
const extracted = extractHtmlToMarkdown(body);
|
|
1919
|
+
const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
|
|
1620
1920
|
const markdown = `# ${extracted.title}
|
|
1621
1921
|
|
|
1622
1922
|
${extracted.markdown}`;
|
|
1623
|
-
const documentId = stableId("doc", source.id,
|
|
1923
|
+
const documentId = stableId("doc", source.id, canonicalUri);
|
|
1624
1924
|
const normalizedPath = path16.resolve(workspacePath, "normalized", `${documentId}.md`);
|
|
1625
|
-
const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(
|
|
1925
|
+
const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
|
|
1626
1926
|
const contentHash = sha256(markdown);
|
|
1627
1927
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
1628
1928
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
1629
1929
|
const indexedAt = now;
|
|
1630
1930
|
const crawledAt = now;
|
|
1631
1931
|
const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
|
|
1632
|
-
await
|
|
1932
|
+
await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1633
1933
|
await writeFile7(rawPath, body, "utf8");
|
|
1634
1934
|
await writeNormalizedDocument({
|
|
1635
1935
|
documentId,
|
|
1636
1936
|
sourceId: source.id,
|
|
1637
1937
|
title: extracted.title,
|
|
1638
|
-
uri:
|
|
1938
|
+
uri: canonicalUri,
|
|
1639
1939
|
sourceUri,
|
|
1640
1940
|
publicationDate: resolvedPublicationDate,
|
|
1641
1941
|
crawledAt,
|
|
@@ -1650,8 +1950,9 @@ ${extracted.markdown}`;
|
|
|
1650
1950
|
sourceId: source.id,
|
|
1651
1951
|
sourceType: source.type,
|
|
1652
1952
|
title: extracted.title,
|
|
1653
|
-
uri:
|
|
1953
|
+
uri: canonicalUri,
|
|
1654
1954
|
sourceUri,
|
|
1955
|
+
canonicalUri,
|
|
1655
1956
|
mimeType: "text/html",
|
|
1656
1957
|
rawPath,
|
|
1657
1958
|
normalizedPath,
|
|
@@ -1749,7 +2050,7 @@ async function reprocessRemoteDocument(document, source) {
|
|
|
1749
2050
|
if (!document.rawPath || !await fileExists(document.rawPath)) {
|
|
1750
2051
|
return null;
|
|
1751
2052
|
}
|
|
1752
|
-
const raw = await
|
|
2053
|
+
const raw = await readFile9(document.rawPath, "utf8");
|
|
1753
2054
|
const extracted = extractHtmlToMarkdown(raw);
|
|
1754
2055
|
const markdown = `# ${extracted.title}
|
|
1755
2056
|
|
|
@@ -1825,6 +2126,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1825
2126
|
if (url.origin !== baseUrl.origin) {
|
|
1826
2127
|
return false;
|
|
1827
2128
|
}
|
|
2129
|
+
if (url.search.length > 0) {
|
|
2130
|
+
return false;
|
|
2131
|
+
}
|
|
2132
|
+
if (url.pathname.endsWith(".xml")) {
|
|
2133
|
+
return false;
|
|
2134
|
+
}
|
|
2135
|
+
if (url.pathname.includes("/cdn-cgi/")) {
|
|
2136
|
+
return false;
|
|
2137
|
+
}
|
|
2138
|
+
if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
|
|
2139
|
+
return false;
|
|
2140
|
+
}
|
|
1828
2141
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
1829
2142
|
return false;
|
|
1830
2143
|
}
|
|
@@ -1837,56 +2150,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1837
2150
|
}
|
|
1838
2151
|
return true;
|
|
1839
2152
|
}
|
|
1840
|
-
|
|
2153
|
+
function delay(ms) {
|
|
2154
|
+
return new Promise((resolve2) => setTimeout(resolve2, ms));
|
|
2155
|
+
}
|
|
2156
|
+
async function crawlWebsite(source, defaults, progress) {
|
|
1841
2157
|
const baseUrl = new URL(source.uri);
|
|
1842
|
-
const userAgent = source.crawl?.userAgent ??
|
|
2158
|
+
const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
|
|
1843
2159
|
const includePatterns = source.crawl?.includePatterns ?? [];
|
|
1844
2160
|
const excludePatterns = source.crawl?.excludePatterns ?? [];
|
|
1845
2161
|
const maxDepth = source.crawl?.maxDepth ?? 2;
|
|
1846
2162
|
const maxPages = source.crawl?.maxPages ?? 100;
|
|
1847
|
-
const rateLimitMs = source.crawl?.rateLimitMs ??
|
|
2163
|
+
const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
|
|
2164
|
+
const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
|
|
1848
2165
|
const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
|
|
1849
|
-
const queue = [{ url: source.uri, depth: 0 }];
|
|
1850
2166
|
const seen = /* @__PURE__ */ new Set();
|
|
1851
2167
|
const results = [];
|
|
2168
|
+
let currentLevel = [normalizeRemoteUrl(source.uri)];
|
|
1852
2169
|
if (source.crawl?.useSitemap !== false) {
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
}
|
|
1862
|
-
|
|
1863
|
-
const
|
|
1864
|
-
|
|
1865
|
-
|
|
2170
|
+
const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
|
|
2171
|
+
reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
|
|
2172
|
+
currentLevel = [
|
|
2173
|
+
...currentLevel,
|
|
2174
|
+
...sitemapUrls
|
|
2175
|
+
];
|
|
2176
|
+
}
|
|
2177
|
+
for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
|
|
2178
|
+
reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
|
|
2179
|
+
const nextLevelCandidates = [];
|
|
2180
|
+
const allowedUrls = [];
|
|
2181
|
+
for (const candidate of currentLevel) {
|
|
2182
|
+
const normalizedCandidate = normalizeRemoteUrl(candidate);
|
|
2183
|
+
if (seen.has(normalizedCandidate)) {
|
|
2184
|
+
continue;
|
|
2185
|
+
}
|
|
2186
|
+
seen.add(normalizedCandidate);
|
|
2187
|
+
const url = new URL(normalizedCandidate);
|
|
2188
|
+
if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
|
|
2189
|
+
continue;
|
|
2190
|
+
}
|
|
2191
|
+
allowedUrls.push(normalizedCandidate);
|
|
2192
|
+
results.push(normalizedCandidate);
|
|
2193
|
+
reportProgress(progress, `Discovered ${normalizedCandidate}`);
|
|
2194
|
+
if (results.length >= maxPages) {
|
|
2195
|
+
break;
|
|
2196
|
+
}
|
|
1866
2197
|
}
|
|
1867
|
-
|
|
1868
|
-
if (
|
|
1869
|
-
|
|
2198
|
+
reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
|
|
2199
|
+
if (depth >= maxDepth || results.length >= maxPages) {
|
|
2200
|
+
break;
|
|
1870
2201
|
}
|
|
1871
|
-
|
|
1872
|
-
|
|
1873
|
-
|
|
1874
|
-
|
|
1875
|
-
const
|
|
1876
|
-
|
|
1877
|
-
|
|
1878
|
-
|
|
1879
|
-
|
|
1880
|
-
const target = new URL(href, url);
|
|
1881
|
-
if (!seen.has(target.href)) {
|
|
1882
|
-
queue.push({ url: target.href, depth: next.depth + 1 });
|
|
2202
|
+
await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
|
|
2203
|
+
const page = new URL(pageUrl);
|
|
2204
|
+
const response2 = await fetch(page, { headers: { "user-agent": userAgent } });
|
|
2205
|
+
const html = await response2.text();
|
|
2206
|
+
const $ = load2(html);
|
|
2207
|
+
$("a[href]").each((_, element) => {
|
|
2208
|
+
const href = $(element).attr("href");
|
|
2209
|
+
if (!href) {
|
|
2210
|
+
return;
|
|
1883
2211
|
}
|
|
1884
|
-
|
|
2212
|
+
try {
|
|
2213
|
+
nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
|
|
2214
|
+
} catch {
|
|
2215
|
+
}
|
|
2216
|
+
});
|
|
2217
|
+
if (rateLimitMs > 0) {
|
|
2218
|
+
await delay(rateLimitMs);
|
|
1885
2219
|
}
|
|
1886
2220
|
});
|
|
1887
|
-
|
|
1888
|
-
await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
|
|
1889
|
-
}
|
|
2221
|
+
currentLevel = nextLevelCandidates;
|
|
1890
2222
|
}
|
|
1891
2223
|
return results;
|
|
1892
2224
|
}
|
|
@@ -1961,6 +2293,8 @@ async function ingestRssSource({
|
|
|
1961
2293
|
source,
|
|
1962
2294
|
previous,
|
|
1963
2295
|
nextDocuments,
|
|
2296
|
+
maxConcurrentRequests,
|
|
2297
|
+
onDocumentProcessed,
|
|
1964
2298
|
onFailure
|
|
1965
2299
|
}) {
|
|
1966
2300
|
if (source.crawl?.fetchArticles === false) {
|
|
@@ -1968,11 +2302,12 @@ async function ingestRssSource({
|
|
|
1968
2302
|
}
|
|
1969
2303
|
const xml = await fetchFeedText(source);
|
|
1970
2304
|
const items = await parseRssFeedDocument(xml, source);
|
|
2305
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
1971
2306
|
let added = 0;
|
|
1972
2307
|
let changed = 0;
|
|
1973
2308
|
let unchanged = 0;
|
|
1974
2309
|
let failed = 0;
|
|
1975
|
-
|
|
2310
|
+
await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
|
|
1976
2311
|
try {
|
|
1977
2312
|
const probe = previous.get(stableId("doc", source.id, item.url));
|
|
1978
2313
|
const document = await fetchUrlDocument({
|
|
@@ -1983,28 +2318,40 @@ async function ingestRssSource({
|
|
|
1983
2318
|
sourceUri: source.uri,
|
|
1984
2319
|
publicationDate: item.publicationDate
|
|
1985
2320
|
});
|
|
2321
|
+
if (processedDocumentIds.has(document.id)) {
|
|
2322
|
+
return;
|
|
2323
|
+
}
|
|
2324
|
+
processedDocumentIds.add(document.id);
|
|
2325
|
+
const existingDocument = probe ?? previous.get(document.id);
|
|
1986
2326
|
nextDocuments.set(document.id, document);
|
|
1987
|
-
if (!
|
|
2327
|
+
if (!existingDocument) {
|
|
1988
2328
|
added += 1;
|
|
1989
|
-
|
|
2329
|
+
onDocumentProcessed?.(document.uri, "added");
|
|
2330
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
1990
2331
|
changed += 1;
|
|
2332
|
+
onDocumentProcessed?.(document.uri, "changed");
|
|
1991
2333
|
} else {
|
|
1992
2334
|
unchanged += 1;
|
|
2335
|
+
onDocumentProcessed?.(document.uri, "unchanged");
|
|
1993
2336
|
}
|
|
1994
2337
|
} catch (error) {
|
|
1995
2338
|
failed += 1;
|
|
1996
2339
|
onFailure(item.url, error);
|
|
1997
2340
|
}
|
|
1998
|
-
}
|
|
2341
|
+
});
|
|
1999
2342
|
return { added, changed, unchanged, failed };
|
|
2000
2343
|
}
|
|
2001
2344
|
async function ingestSources({
|
|
2002
2345
|
workspacePath,
|
|
2003
2346
|
sourceIds,
|
|
2004
|
-
changedOnly = false
|
|
2347
|
+
changedOnly = false,
|
|
2348
|
+
progress
|
|
2005
2349
|
}) {
|
|
2006
2350
|
const config = await loadConfig(workspacePath);
|
|
2007
2351
|
const defaultRetentionDays = config.crawler.retentionDays;
|
|
2352
|
+
const defaultUserAgent = config.crawler.defaultUserAgent;
|
|
2353
|
+
const defaultRateLimitMs = config.crawler.rateLimitMs;
|
|
2354
|
+
const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
|
|
2008
2355
|
const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
|
|
2009
2356
|
const existing = await loadDocuments(workspacePath);
|
|
2010
2357
|
const previous = previousMap(existing);
|
|
@@ -2014,20 +2361,38 @@ async function ingestSources({
|
|
|
2014
2361
|
let unchanged = 0;
|
|
2015
2362
|
let failed = 0;
|
|
2016
2363
|
const failures = [];
|
|
2364
|
+
reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
2017
2365
|
for (const source of sources) {
|
|
2366
|
+
const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
|
|
2367
|
+
const sourceBefore = { added, changed, unchanged, failed };
|
|
2368
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
2369
|
+
const reportDocumentOutcome = (uri, outcome) => {
|
|
2370
|
+
const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
|
|
2371
|
+
reportProgress(progress, `${label} ${uri}`);
|
|
2372
|
+
};
|
|
2018
2373
|
const ingestOne = async (uri, producer) => {
|
|
2019
2374
|
try {
|
|
2020
2375
|
const probeId = stableId("doc", source.id, uri);
|
|
2021
2376
|
const earlier = previous.get(probeId);
|
|
2022
2377
|
const document = await producer();
|
|
2378
|
+
if (processedDocumentIds.has(document.id)) {
|
|
2379
|
+
reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
|
|
2380
|
+
return null;
|
|
2381
|
+
}
|
|
2382
|
+
processedDocumentIds.add(document.id);
|
|
2383
|
+
const existingDocument = earlier ?? previous.get(document.id);
|
|
2023
2384
|
nextDocuments.set(document.id, document);
|
|
2024
|
-
if (!
|
|
2385
|
+
if (!existingDocument) {
|
|
2025
2386
|
added += 1;
|
|
2026
|
-
|
|
2387
|
+
reportDocumentOutcome(document.uri, "added");
|
|
2388
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
2027
2389
|
changed += 1;
|
|
2390
|
+
reportDocumentOutcome(document.uri, "changed");
|
|
2028
2391
|
} else {
|
|
2029
2392
|
unchanged += 1;
|
|
2393
|
+
reportDocumentOutcome(document.uri, "unchanged");
|
|
2030
2394
|
}
|
|
2395
|
+
return document;
|
|
2031
2396
|
} catch (error) {
|
|
2032
2397
|
failed += 1;
|
|
2033
2398
|
failures.push({
|
|
@@ -2035,50 +2400,69 @@ async function ingestSources({
|
|
|
2035
2400
|
uri,
|
|
2036
2401
|
message: error instanceof Error ? error.message : String(error)
|
|
2037
2402
|
});
|
|
2403
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
2404
|
+
return null;
|
|
2038
2405
|
}
|
|
2039
2406
|
};
|
|
2040
2407
|
try {
|
|
2408
|
+
reportProgress(progress, `Source ${source.name} (${source.type})`);
|
|
2041
2409
|
if (source.type === "file") {
|
|
2410
|
+
reportProgress(progress, `Reading file ${source.uri}`);
|
|
2042
2411
|
await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
for (const filePath of
|
|
2412
|
+
} else if (source.type === "directory") {
|
|
2413
|
+
const files = await listDirectoryFiles(source);
|
|
2414
|
+
reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
|
|
2415
|
+
for (const filePath of files) {
|
|
2416
|
+
reportProgress(progress, `Reading file ${filePath}`);
|
|
2047
2417
|
await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
|
|
2048
2418
|
}
|
|
2049
|
-
|
|
2050
|
-
|
|
2051
|
-
if (source.type === "url") {
|
|
2419
|
+
} else if (source.type === "url") {
|
|
2420
|
+
reportProgress(progress, `Fetching ${source.uri}`);
|
|
2052
2421
|
await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2422
|
+
} else if (source.type === "website") {
|
|
2423
|
+
reportProgress(progress, `Crawling ${source.uri}`);
|
|
2424
|
+
const urls = await crawlWebsite(source, {
|
|
2425
|
+
userAgent: defaultUserAgent,
|
|
2426
|
+
rateLimitMs: defaultRateLimitMs,
|
|
2427
|
+
maxConcurrentRequests
|
|
2428
|
+
}, progress);
|
|
2429
|
+
reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
|
|
2430
|
+
const seenCanonicalUrls = /* @__PURE__ */ new Set();
|
|
2431
|
+
await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
|
|
2432
|
+
if (seenCanonicalUrls.has(url)) {
|
|
2433
|
+
reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
|
|
2434
|
+
return;
|
|
2435
|
+
}
|
|
2436
|
+
reportProgress(progress, `Fetching ${url}`);
|
|
2437
|
+
const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
|
|
2438
|
+
if (document) {
|
|
2439
|
+
seenCanonicalUrls.add(document.uri);
|
|
2440
|
+
}
|
|
2441
|
+
});
|
|
2442
|
+
} else if (source.type === "rss") {
|
|
2443
|
+
reportProgress(progress, `Fetching feed ${source.uri}`);
|
|
2444
|
+
const result = await ingestRssSource({
|
|
2063
2445
|
workspacePath,
|
|
2064
2446
|
source,
|
|
2065
2447
|
previous,
|
|
2066
2448
|
nextDocuments,
|
|
2449
|
+
maxConcurrentRequests,
|
|
2450
|
+
onDocumentProcessed: reportDocumentOutcome,
|
|
2067
2451
|
onFailure: (uri, error) => {
|
|
2068
2452
|
failures.push({
|
|
2069
2453
|
sourceId: source.id,
|
|
2070
2454
|
uri,
|
|
2071
2455
|
message: error instanceof Error ? error.message : String(error)
|
|
2072
2456
|
});
|
|
2457
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
2073
2458
|
}
|
|
2074
2459
|
});
|
|
2075
|
-
added +=
|
|
2076
|
-
changed +=
|
|
2077
|
-
unchanged +=
|
|
2078
|
-
failed +=
|
|
2079
|
-
|
|
2080
|
-
|
|
2081
|
-
if (source.type === "markdown" || source.type === "text") {
|
|
2460
|
+
added += result.added;
|
|
2461
|
+
changed += result.changed;
|
|
2462
|
+
unchanged += result.unchanged;
|
|
2463
|
+
failed += result.failed;
|
|
2464
|
+
} else if (source.type === "markdown" || source.type === "text") {
|
|
2465
|
+
reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
|
|
2082
2466
|
await ingestOne(source.uri, () => ingestInlineContent({
|
|
2083
2467
|
workspacePath,
|
|
2084
2468
|
source,
|
|
@@ -2095,13 +2479,19 @@ async function ingestSources({
|
|
|
2095
2479
|
uri: source.uri,
|
|
2096
2480
|
message: error instanceof Error ? error.message : String(error)
|
|
2097
2481
|
});
|
|
2482
|
+
reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
|
|
2098
2483
|
}
|
|
2484
|
+
reportProgress(
|
|
2485
|
+
progress,
|
|
2486
|
+
`Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
|
|
2487
|
+
);
|
|
2099
2488
|
}
|
|
2100
2489
|
const expiringDocuments = [...nextDocuments.values()].filter((document) => {
|
|
2101
2490
|
const source = sources.find((candidate) => candidate.id === document.sourceId);
|
|
2102
2491
|
return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
|
|
2103
2492
|
});
|
|
2104
2493
|
if (expiringDocuments.length > 0) {
|
|
2494
|
+
reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
|
|
2105
2495
|
const expiredIds = new Set(expiringDocuments.map((document) => document.id));
|
|
2106
2496
|
for (const document of expiringDocuments) {
|
|
2107
2497
|
nextDocuments.delete(document.id);
|
|
@@ -2128,6 +2518,7 @@ async function ingestSources({
|
|
|
2128
2518
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
2129
2519
|
};
|
|
2130
2520
|
await writeRun(workspacePath, run);
|
|
2521
|
+
reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
|
|
2131
2522
|
return {
|
|
2132
2523
|
runId: id,
|
|
2133
2524
|
documents: { added, changed, unchanged, failed },
|
|
@@ -2137,7 +2528,8 @@ async function ingestSources({
|
|
|
2137
2528
|
async function reprocessDocuments({
|
|
2138
2529
|
workspacePath,
|
|
2139
2530
|
sourceId,
|
|
2140
|
-
documentId
|
|
2531
|
+
documentId,
|
|
2532
|
+
progress
|
|
2141
2533
|
}) {
|
|
2142
2534
|
const documents = await loadDocuments(workspacePath);
|
|
2143
2535
|
const sources = await listSources(workspacePath);
|
|
@@ -2145,15 +2537,20 @@ async function reprocessDocuments({
|
|
|
2145
2537
|
const nextDocuments = new Map(documents.map((document) => [document.id, document]));
|
|
2146
2538
|
let documentsReprocessed = 0;
|
|
2147
2539
|
let documentsSkipped = 0;
|
|
2148
|
-
|
|
2540
|
+
const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
|
|
2541
|
+
reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
|
|
2542
|
+
for (const document of targets) {
|
|
2543
|
+
reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
|
|
2149
2544
|
const source = sourceMap.get(document.sourceId);
|
|
2150
2545
|
if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
|
|
2151
2546
|
documentsSkipped += 1;
|
|
2547
|
+
reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
|
|
2152
2548
|
continue;
|
|
2153
2549
|
}
|
|
2154
2550
|
const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
|
|
2155
2551
|
if (!updated) {
|
|
2156
2552
|
documentsSkipped += 1;
|
|
2553
|
+
reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
|
|
2157
2554
|
continue;
|
|
2158
2555
|
}
|
|
2159
2556
|
nextDocuments.set(updated.id, updated);
|
|
@@ -2173,15 +2570,217 @@ async function reprocessDocuments({
|
|
|
2173
2570
|
},
|
|
2174
2571
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
2175
2572
|
});
|
|
2573
|
+
reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
|
|
2176
2574
|
return { runId: id, documentsReprocessed, documentsSkipped };
|
|
2177
2575
|
}
|
|
2178
2576
|
|
|
2577
|
+
// src/ingest/adapters/website-feed-discovery.ts
|
|
2578
|
+
import { load as load3 } from "cheerio";
|
|
2579
|
+
var COMMON_FEED_PATHS = [
|
|
2580
|
+
"/feed",
|
|
2581
|
+
"/feed.xml",
|
|
2582
|
+
"/rss",
|
|
2583
|
+
"/rss.xml",
|
|
2584
|
+
"/atom.xml",
|
|
2585
|
+
"/index.xml",
|
|
2586
|
+
"/blog/feed",
|
|
2587
|
+
"/blog/feed.xml",
|
|
2588
|
+
"/blog/rss.xml",
|
|
2589
|
+
"/blog/atom.xml",
|
|
2590
|
+
"/blog/index.xml",
|
|
2591
|
+
"/news/feed",
|
|
2592
|
+
"/news/feed.xml",
|
|
2593
|
+
"/news/rss.xml",
|
|
2594
|
+
"/news/atom.xml",
|
|
2595
|
+
"/news/index.xml"
|
|
2596
|
+
];
|
|
2597
|
+
/**
 * Resolves a link target against the page URL and keeps it only when the
 * result is an http(s) address.
 *
 * @param {string} href - Raw link target, absolute or relative.
 * @param {string|URL} baseUrl - URL the link is resolved against.
 * @returns {string|null} The absolute URL, or null when it cannot be parsed
 *   or uses a protocol other than http/https.
 */
function normalizeCandidateUrl(href, baseUrl) {
  let absolute;
  try {
    absolute = new URL(href, baseUrl);
  } catch {
    // Unparseable input is simply not a candidate.
    return null;
  }
  const isWebProtocol = absolute.protocol === "http:" || absolute.protocol === "https:";
  return isWebProtocol ? absolute.href : null;
}
|
|
2608
|
+
/**
 * Heuristic: does a link's declared type or its target suggest a feed?
 *
 * @param {string|undefined} typeHint - The link's `type` attribute, if any.
 * @param {string} href - The link target.
 * @returns {boolean} True when the type mentions rss/atom/xml, or the target
 *   contains "/feed", "/rss" or "/atom", or ends in ".xml" (case-insensitive).
 */
function looksLikeFeedLink(typeHint, href) {
  const declaredType = typeHint?.toLowerCase() ?? "";
  const target = href.toLowerCase();
  if (["rss", "atom", "xml"].some((marker) => declaredType.includes(marker))) {
    return true;
  }
  if (["/feed", "/rss", "/atom"].some((marker) => target.includes(marker))) {
    return true;
  }
  return target.endsWith(".xml");
}
|
|
2613
|
+
/**
 * Collects feed candidates declared in a page via `<link rel="alternate">`.
 *
 * @param {string} html - Page markup to scan.
 * @param {string|URL} baseUrl - URL used to resolve relative link targets.
 * @returns {Array<{url: string, discoveredBy: "declared", order: number, typeHint: (string|undefined)}>}
 *   One entry per accepted link; `order` is the link's index among all
 *   `link[href]` elements.
 */
function extractDeclaredFeedCandidates(html, baseUrl) {
  const $ = load3(html);
  const found = [];
  $("link[href]").each((position, node) => {
    const link = $(node);
    const relTokens = (link.attr("rel") ?? "")
      .split(/\s+/)
      .map((token) => token.trim().toLowerCase())
      .filter(Boolean);
    const href = link.attr("href");
    if (!href || !relTokens.includes("alternate")) {
      return;
    }
    const typeHint = link.attr("type") ?? void 0;
    if (!looksLikeFeedLink(typeHint, href)) {
      return;
    }
    const url = normalizeCandidateUrl(href, baseUrl);
    if (!url) {
      return;
    }
    found.push({ url, discoveredBy: "declared", order: position, typeHint });
  });
  return found;
}
|
|
2639
|
+
/**
 * Builds one guessed feed candidate per entry of COMMON_FEED_PATHS.
 *
 * @param {string|URL} baseUrl - Site URL the paths are resolved against.
 * @returns {Array<{url: string, discoveredBy: "common", order: number}>}
 *   Candidates in list order; `order` is the path's index in the list.
 */
function buildCommonFeedCandidates(baseUrl) {
  const guesses = [];
  for (const [order, pathname] of COMMON_FEED_PATHS.entries()) {
    const { href } = new URL(pathname, baseUrl);
    guesses.push({ url: href, discoveredBy: "common", order });
  }
  return guesses;
}
|
|
2646
|
+
/**
 * Removes candidates whose `url` was already seen, keeping the first
 * occurrence and the original relative order.
 *
 * @param {Iterable<{url: string}>} candidates
 * @returns {Array<{url: string}>} A new array; the input is not modified.
 */
function dedupeCandidates(candidates) {
  // Map iteration follows insertion order, so first-seen order is preserved.
  const firstByUrl = new Map();
  for (const candidate of candidates) {
    if (!firstByUrl.has(candidate.url)) {
      firstByUrl.set(candidate.url, candidate);
    }
  }
  return [...firstByUrl.values()];
}
|
|
2658
|
+
/**
 * Decides whether an HTTP response looks like an RSS, Atom or RDF feed.
 *
 * The response is accepted when its content type mentions "rss" or "atom",
 * or when the body contains a `<rss`, `<feed` or `<rdf:rdf` tag opener
 * (case-insensitive). A generic XML content type carries no weight by itself:
 * the body markers decide in that case, as they do for any other type.
 *
 * @param {string|null|undefined} contentType - Value of the content-type header.
 * @param {string} body - Response body text.
 * @returns {boolean}
 */
function looksLikeFeedDocument(contentType, body) {
  const type = contentType?.toLowerCase() ?? "";
  if (type.includes("rss") || type.includes("atom")) {
    // Header is conclusive; skip lowercasing a potentially large body.
    return true;
  }
  const lowerBody = body.toLowerCase();
  return ["<rss", "<feed", "<rdf:rdf"].some((marker) => lowerBody.includes(marker));
}
|
|
2663
|
+
/**
 * Tells whether a path segment is a non-empty string containing at least one
 * ASCII letter.
 *
 * @param {unknown} segment
 * @returns {boolean}
 */
function hasStablePrefixSegment(segment) {
  if (typeof segment !== "string" || segment.length === 0) {
    return false;
  }
  return /[a-z]/i.test(segment);
}
|
|
2666
|
+
/**
 * Derives the path prefix shared by a feed's items on the website's origin.
 *
 * Items that fail to parse or live on another origin are ignored. The prefix
 * grows one leading segment at a time while the segment contains a letter and
 * every remaining item has the same segment at that position.
 *
 * @param {string[]} itemUrls - Item URLs taken from the feed.
 * @param {string} websiteOrigin - Origin the items must match.
 * @returns {string|undefined} A prefix such as "/blog/", or undefined when
 *   fewer than two items qualify or no leading segment is shared.
 */
function deriveExcludePrefix(itemUrls, websiteOrigin) {
  const segmentLists = [];
  for (const itemUrl of itemUrls) {
    let parsed;
    try {
      parsed = new URL(itemUrl);
    } catch {
      continue;
    }
    if (parsed.origin === websiteOrigin) {
      segmentLists.push(parsed.pathname.split("/").filter(Boolean));
    }
  }
  if (segmentLists.length < 2) {
    return void 0;
  }
  const [reference] = segmentLists;
  const isStable = (segment) => typeof segment === "string" && segment.length > 0 && /[a-z]/i.test(segment);
  let shared = 0;
  for (; shared < reference.length; shared += 1) {
    const segment = reference[shared];
    if (!isStable(segment)) {
      break;
    }
    if (segmentLists.some((list) => list[shared] !== segment)) {
      break;
    }
  }
  return shared === 0 ? void 0 : `/${reference.slice(0, shared).join("/")}/`;
}
|
|
2698
|
+
/**
 * Ranks a feed candidate; higher scores are tried first.
 *
 * Declared candidates start at 1000 and guessed ones at 100. The score then
 * loses the candidate's `order` and 10 per path segment, gains 25 for an
 * rss/atom type hint and 50 for a well-known root feed path, and loses 200
 * when the path contains "comments".
 *
 * @param {{url: string, discoveredBy: string, order: number, typeHint?: string}} candidate
 * @returns {number}
 */
function scoreCandidate(candidate) {
  const { pathname } = new URL(candidate.url);
  const depth = pathname.split("/").filter(Boolean).length;
  const hint = candidate.typeHint?.toLowerCase();
  const rootFeedPaths = ["/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml"];

  const base = candidate.discoveredBy === "declared" ? 1e3 : 100;
  const typeBonus = hint?.includes("rss") || hint?.includes("atom") ? 25 : 0;
  const rootBonus = rootFeedPaths.includes(pathname) ? 50 : 0;
  const commentsPenalty = pathname.includes("comments") ? 200 : 0;

  return base - candidate.order - depth * 10 + typeBonus + rootBonus - commentsPenalty;
}
|
|
2715
|
+
/**
 * Fetches a feed candidate and confirms that it really is a feed.
 *
 * @param {{url: string, discoveredBy: string}} candidate - Candidate to probe.
 * @param {URL} websiteUrl - Website URL; its origin scopes the exclude prefix.
 * @param {string} userAgent - Value sent as the user-agent header.
 * @returns {Promise<{feedUrl: string, discoveredBy: string, excludePrefix: (string|undefined)}|null>}
 *   Feed details, or null when the request is not OK, the response does not
 *   look like a feed, or any step throws.
 */
async function validateCandidate(candidate, websiteUrl, userAgent) {
  try {
    const reply = await fetch(candidate.url, { headers: { "user-agent": userAgent } });
    if (!reply.ok) {
      return null;
    }
    const payload = await reply.text();
    const contentType = reply.headers.get("content-type");
    if (!looksLikeFeedDocument(contentType, payload)) {
      return null;
    }
    // Stand-in source record handed to the feed parser.
    const epoch = "1970-01-01T00:00:00.000Z";
    const placeholderSource = {
      id: "src_detected_feed",
      type: "rss",
      uri: candidate.url,
      name: "Detected Feed",
      enabled: true,
      tags: [],
      metadata: {},
      createdAt: epoch,
      updatedAt: epoch
    };
    const entries = await parseRssFeedDocument(payload, placeholderSource);
    const entryUrls = entries.map((entry) => entry.url);
    return {
      feedUrl: candidate.url,
      discoveredBy: candidate.discoveredBy,
      excludePrefix: deriveExcludePrefix(entryUrls, websiteUrl.origin)
    };
  } catch {
    // Best effort: a failing candidate is treated as "not a feed".
    return null;
  }
}
|
|
2746
|
+
/**
 * Looks for a feed belonging to a website.
 *
 * Fetches the page, gathers feed links declared in its markup plus the
 * common guessed paths, removes duplicates, orders them by descending score
 * and returns the first candidate that validates.
 *
 * @param {string|URL} websiteUrl - Website address.
 * @param {string} userAgent - Value sent as the user-agent header.
 * @returns {Promise<object|null>} The validated feed, or null when the page
 *   request is not OK, no candidate validates, or any step throws.
 */
async function discoverWebsiteFeed(websiteUrl, userAgent) {
  try {
    const baseUrl = new URL(websiteUrl);
    const homepage = await fetch(baseUrl, { headers: { "user-agent": userAgent } });
    if (!homepage.ok) {
      return null;
    }
    const html = await homepage.text();
    const declared = extractDeclaredFeedCandidates(html, baseUrl);
    const guessed = buildCommonFeedCandidates(baseUrl);
    const ranked = dedupeCandidates([...declared, ...guessed]);
    ranked.sort((left, right) => scoreCandidate(right) - scoreCandidate(left));
    // Candidates are probed one at a time, best score first.
    for (const candidate of ranked) {
      const feed = await validateCandidate(candidate, baseUrl, userAgent);
      if (feed) {
        return feed;
      }
    }
    return null;
  } catch {
    return null;
  }
}
|
|
2769
|
+
|
|
2179
2770
|
// src/query/search-service.ts
|
|
2180
|
-
import { readFile as
|
|
2771
|
+
import { readFile as readFile10 } from "fs/promises";
|
|
2181
2772
|
import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
|
|
2182
2773
|
import path18 from "path";
|
|
2183
2774
|
/**
 * Loads the latest stored index state into a DocumentIndex.
 *
 * @param {string} workspacePath - Workspace whose index state is read.
 * @returns {Promise<object>} Result of `DocumentIndex#loadState` for the state.
 * @throws {CliError} INDEX_MISSING when reading the state fails with ENOENT;
 *   any other read error is rethrown unchanged.
 */
async function loadHydratedIndex(workspacePath) {
  let state;
  try {
    state = await readLatestIndexState(workspacePath);
  } catch (error) {
    const indexMissing = error.code === "ENOENT";
    throw indexMissing
      ? new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */)
      : error;
  }
  // Only the "metadata." fields recorded in the state feed the mapping.
  const metadataFields = Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata."));
  const mapping = createIndexMapping(metadataFields);
  const { DocumentIndex } = await import("@tryformation/querylight-ts");
  return new DocumentIndex(mapping).loadState(state);
}
|
|
@@ -2399,7 +2998,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2399
2998
|
if (!await fileExists(document.normalizedPath)) {
|
|
2400
2999
|
return buildSnippet(chunk.text, query);
|
|
2401
3000
|
}
|
|
2402
|
-
const raw = await
|
|
3001
|
+
const raw = await readFile10(document.normalizedPath, "utf8");
|
|
2403
3002
|
orderedChunks = buildChunksForDocument(document, raw, config);
|
|
2404
3003
|
orderedChunkCache.set(document.id, orderedChunks);
|
|
2405
3004
|
}
|
|
@@ -2417,9 +3016,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2417
3016
|
/**
 * Cleans a title for display: drops a trailing "| Querylight TS Demo" suffix
 * (case-insensitive), collapses whitespace runs to single spaces and trims.
 *
 * @param {string} title
 * @returns {string}
 */
function normalizeDisplayTitle(title) {
  const withoutSuffix = title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "");
  const collapsed = withoutSuffix.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
3019
|
+
var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
|
|
3020
|
+
"choose this instead of",
|
|
3021
|
+
"how xyz runs it",
|
|
3022
|
+
"naechste schritte",
|
|
3023
|
+
"next steps",
|
|
3024
|
+
"overview",
|
|
3025
|
+
"passend wenn",
|
|
3026
|
+
"problem",
|
|
3027
|
+
"right fit",
|
|
3028
|
+
"waehlen sie das stattdessen",
|
|
3029
|
+
"was sie bekommen",
|
|
3030
|
+
"what you get",
|
|
3031
|
+
"wie xyz es umsetzt",
|
|
3032
|
+
"uberblick",
|
|
3033
|
+
"\xFCberblick"
|
|
3034
|
+
]);
|
|
2420
3035
|
function chooseResultTitle(chunk) {
|
|
2421
3036
|
const documentTitle = normalizeDisplayTitle(chunk.title);
|
|
2422
|
-
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(
|
|
3037
|
+
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
|
|
2423
3038
|
const leafHeading = headings.at(-1);
|
|
2424
3039
|
if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
|
|
2425
3040
|
return leafHeading;
|
|
@@ -2441,6 +3056,9 @@ function normalizeUriPath(uri) {
|
|
|
2441
3056
|
return uri.toLowerCase().replace(/\/+$/, "");
|
|
2442
3057
|
}
|
|
2443
3058
|
}
|
|
3059
|
+
/**
 * Produces a comparison key for a URI: the output of normalizeRemoteUrl,
 * lowercased, with trailing slashes removed.
 *
 * @param {string} uri
 * @returns {string}
 */
function normalizeUriIdentity(uri) {
  const canonical = normalizeRemoteUrl(uri);
  const lowered = canonical.toLowerCase();
  return lowered.replace(/\/+$/, "");
}
|
|
2444
3062
|
function uriSpecificity(uri) {
|
|
2445
3063
|
const normalized = normalizeUriPath(uri);
|
|
2446
3064
|
if (normalized === "/") {
|
|
@@ -2457,6 +3075,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
|
|
|
2457
3075
|
if (!candidateTitle || candidateTitle !== existingTitle) {
|
|
2458
3076
|
return false;
|
|
2459
3077
|
}
|
|
3078
|
+
const candidateIdentity = normalizeUriIdentity(candidate.uri);
|
|
3079
|
+
const existingIdentity = normalizeUriIdentity(existing.uri);
|
|
3080
|
+
if (candidateIdentity === existingIdentity) {
|
|
3081
|
+
return candidate.uri.length < existing.uri.length;
|
|
3082
|
+
}
|
|
2460
3083
|
const candidatePath = normalizeUriPath(candidate.uri);
|
|
2461
3084
|
const existingPath = normalizeUriPath(existing.uri);
|
|
2462
3085
|
if (candidatePath === existingPath) {
|
|
@@ -2471,28 +3094,28 @@ function isMoreSpecificDuplicate(candidate, existing) {
|
|
|
2471
3094
|
}
|
|
2472
3095
|
/**
 * Collapses results that duplicate each other, then truncates to `topK`.
 *
 * Each result is compared, in both directions, against the entries kept so
 * far. A result with no duplicate is appended. One with a duplicate replaces
 * the kept entry only when it is the more specific of the two; otherwise it
 * is dropped.
 *
 * @param {object[]} results - Results in ranking order.
 * @param {number} topK - Maximum number of results returned.
 * @returns {object[]}
 */
function collapseAggregateDuplicates(results, topK) {
  const kept = [];
  for (const result of results) {
    const slot = kept.findIndex(
      (existing) => isMoreSpecificDuplicate(result, existing) || isMoreSpecificDuplicate(existing, result)
    );
    if (slot === -1) {
      kept.push(result);
    } else if (isMoreSpecificDuplicate(result, kept[slot])) {
      kept[slot] = result;
    }
  }
  return kept.slice(0, topK);
}
|
|
2488
3111
|
function rerankResultsByDocument(results, topK) {
|
|
2489
3112
|
const byDocument = /* @__PURE__ */ new Map();
|
|
2490
|
-
for (const
|
|
2491
|
-
const existing = byDocument.get(
|
|
3113
|
+
for (const result of results) {
|
|
3114
|
+
const existing = byDocument.get(result.documentId);
|
|
2492
3115
|
if (existing) {
|
|
2493
|
-
existing.push(
|
|
3116
|
+
existing.push(result);
|
|
2494
3117
|
} else {
|
|
2495
|
-
byDocument.set(
|
|
3118
|
+
byDocument.set(result.documentId, [result]);
|
|
2496
3119
|
}
|
|
2497
3120
|
}
|
|
2498
3121
|
const reranked = [...byDocument.values()].flatMap((group) => {
|
|
@@ -2501,7 +3124,7 @@ function rerankResultsByDocument(results, topK) {
|
|
|
2501
3124
|
if (!best) {
|
|
2502
3125
|
return [];
|
|
2503
3126
|
}
|
|
2504
|
-
const tailScore = rest.reduce((sum,
|
|
3127
|
+
const tailScore = rest.reduce((sum, result) => sum + result.score, 0);
|
|
2505
3128
|
const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
|
|
2506
3129
|
return [{ ...best, score: aggregateScore }];
|
|
2507
3130
|
}).sort((left, right) => right.score - left.score);
|
|
@@ -2569,7 +3192,6 @@ async function searchIndex({
|
|
|
2569
3192
|
score: 0,
|
|
2570
3193
|
title: chooseResultTitle(chunk),
|
|
2571
3194
|
uri: chunk.uri,
|
|
2572
|
-
headingPath: chunk.headingPath,
|
|
2573
3195
|
snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
|
|
2574
3196
|
document,
|
|
2575
3197
|
config,
|
|
@@ -2584,7 +3206,7 @@ async function searchIndex({
|
|
|
2584
3206
|
};
|
|
2585
3207
|
})
|
|
2586
3208
|
);
|
|
2587
|
-
return { retrievalMode: "lexical", results: latestResults.filter((
|
|
3209
|
+
return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
|
|
2588
3210
|
}
|
|
2589
3211
|
const lexicalHits = async () => {
|
|
2590
3212
|
const index = await loadHydratedIndex(workspacePath);
|
|
@@ -2633,7 +3255,6 @@ async function searchIndex({
|
|
|
2633
3255
|
score,
|
|
2634
3256
|
title: chooseResultTitle(chunk),
|
|
2635
3257
|
uri: chunk.uri,
|
|
2636
|
-
headingPath: chunk.headingPath,
|
|
2637
3258
|
snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
|
|
2638
3259
|
document: documents.get(chunk.documentId),
|
|
2639
3260
|
config,
|
|
@@ -2647,13 +3268,13 @@ async function searchIndex({
|
|
|
2647
3268
|
metadata: chunk.metadata
|
|
2648
3269
|
};
|
|
2649
3270
|
}));
|
|
2650
|
-
const results = rawResults.filter((
|
|
3271
|
+
const results = rawResults.filter((result) => result != null);
|
|
2651
3272
|
return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
|
|
2652
3273
|
}
|
|
2653
3274
|
|
|
2654
3275
|
// src/query/related-service.ts
|
|
2655
3276
|
import path19 from "path";
|
|
2656
|
-
function
|
|
3277
|
+
function cosineSimilarity2(left, right) {
|
|
2657
3278
|
let dot = 0;
|
|
2658
3279
|
let leftNorm = 0;
|
|
2659
3280
|
let rightNorm = 0;
|
|
@@ -2739,7 +3360,7 @@ async function findRelatedDocuments({
|
|
|
2739
3360
|
const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
|
|
2740
3361
|
documentId: candidate.document.id,
|
|
2741
3362
|
sourceId: candidate.document.sourceId,
|
|
2742
|
-
score:
|
|
3363
|
+
score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
|
|
2743
3364
|
title: candidate.document.title,
|
|
2744
3365
|
uri: candidate.document.uri,
|
|
2745
3366
|
metadata: candidate.document.metadata
|
|
@@ -2767,21 +3388,20 @@ async function createContext({
|
|
|
2767
3388
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
2768
3389
|
const sources = [];
|
|
2769
3390
|
let total = 0;
|
|
2770
|
-
for (const
|
|
2771
|
-
const text =
|
|
3391
|
+
for (const result of search.results) {
|
|
3392
|
+
const text = result.text ?? "";
|
|
2772
3393
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
2773
3394
|
break;
|
|
2774
3395
|
}
|
|
2775
3396
|
total += text.length;
|
|
2776
3397
|
sources.push({
|
|
2777
|
-
chunkId:
|
|
2778
|
-
documentId:
|
|
2779
|
-
sourceId:
|
|
2780
|
-
title:
|
|
2781
|
-
uri:
|
|
2782
|
-
headingPath: result2.headingPath,
|
|
3398
|
+
chunkId: result.chunkId,
|
|
3399
|
+
documentId: result.documentId,
|
|
3400
|
+
sourceId: result.sourceId,
|
|
3401
|
+
title: result.title,
|
|
3402
|
+
uri: result.uri,
|
|
2783
3403
|
text,
|
|
2784
|
-
metadata:
|
|
3404
|
+
metadata: result.metadata
|
|
2785
3405
|
});
|
|
2786
3406
|
}
|
|
2787
3407
|
const markdown = [
|
|
@@ -2792,7 +3412,6 @@ async function createContext({
|
|
|
2792
3412
|
`Title: ${source.title}`,
|
|
2793
3413
|
`URL: ${source.uri}`,
|
|
2794
3414
|
`Chunk ID: ${source.chunkId}`,
|
|
2795
|
-
source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
|
|
2796
3415
|
"",
|
|
2797
3416
|
source.text,
|
|
2798
3417
|
""
|
|
@@ -2871,27 +3490,30 @@ function formatSourcesTable(sources) {
|
|
|
2871
3490
|
return table.toString();
|
|
2872
3491
|
}
|
|
2873
3492
|
function formatSearchResults(results) {
|
|
2874
|
-
return results.map((
|
|
2875
|
-
`${index + 1}. ${colors.bold(
|
|
2876
|
-
` ${
|
|
2877
|
-
` Source
|
|
2878
|
-
|
|
2879
|
-
`
|
|
2880
|
-
|
|
2881
|
-
|
|
3493
|
+
return results.map((result, index) => [
|
|
3494
|
+
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3495
|
+
` URL: ${result.uri}`,
|
|
3496
|
+
` Source: ${result.sourceType} | Published: ${result.publicationDate ?? "n/a"} | Score: ${result.score.toFixed(3)}`,
|
|
3497
|
+
"",
|
|
3498
|
+
...result.snippet.split("\n").map((line) => line.length > 0 ? ` ${line}` : "")
|
|
3499
|
+
].join("\n")).join(`
|
|
3500
|
+
|
|
3501
|
+
${colors.dim("---")}
|
|
3502
|
+
|
|
3503
|
+
`);
|
|
2882
3504
|
}
|
|
2883
3505
|
function formatRelatedDocuments(results) {
|
|
2884
|
-
return results.map((
|
|
2885
|
-
`${index + 1}. ${colors.bold(
|
|
2886
|
-
` ${
|
|
2887
|
-
` Similarity: ${
|
|
3506
|
+
return results.map((result, index) => [
|
|
3507
|
+
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3508
|
+
` ${result.uri}`,
|
|
3509
|
+
` Similarity: ${result.score.toFixed(3)}`
|
|
2888
3510
|
].join("\n")).join("\n\n");
|
|
2889
3511
|
}
|
|
2890
3512
|
|
|
2891
3513
|
// src/cli/run-cli.ts
|
|
2892
3514
|
var SOURCE_TYPES = /* @__PURE__ */ new Set(["url", "website", "rss", "file", "directory", "markdown", "text"]);
|
|
2893
3515
|
var RETRIEVAL_MODES = /* @__PURE__ */ new Set(["lexical", "dense", "sparse", "hybrid"]);
|
|
2894
|
-
var SOURCE_TYPE_LIST = ["
|
|
3516
|
+
var SOURCE_TYPE_LIST = ["page", "website", "rss", "file", "directory", "markdown", "text"];
|
|
2895
3517
|
var RETRIEVAL_MODE_LIST = ["lexical", "dense", "sparse", "hybrid"];
|
|
2896
3518
|
var SEARCH_DATE_FIELDS = ["publicationDate", "firstSeenAt", "lastSeenAt", "lastChangedAt", "crawledAt"];
|
|
2897
3519
|
function parseKeyValue(input) {
|
|
@@ -2914,11 +3536,46 @@ function parseOptionalNumber(input, optionName) {
|
|
|
2914
3536
|
}
|
|
2915
3537
|
return value;
|
|
2916
3538
|
}
|
|
3539
|
+
/**
 * Parses an optional option value that must be an integer of at least 1.
 *
 * @param {string|undefined} input - Raw option value.
 * @param {string} optionName - Flag name used in the error message.
 * @returns {number|undefined} The parsed value, or undefined when
 *   parseOptionalNumber yields undefined.
 * @throws {CliError} INVALID_ARGUMENT when the parsed number is not an
 *   integer or is below 1.
 */
function parseOptionalPositiveInteger(input, optionName) {
  const parsed = parseOptionalNumber(input, optionName);
  const acceptable = parsed === void 0 || (Number.isInteger(parsed) && parsed >= 1);
  if (!acceptable) {
    throw new CliError(`invalid positive integer for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  return parsed;
}
|
|
2917
3549
|
/**
 * Assigns `target[key] = value` unless the value is undefined.
 *
 * @param {object} target - Object that receives the property.
 * @param {string} key - Property name.
 * @param {*} value - Value to store; undefined leaves the target untouched.
 * @returns {void}
 */
function setWhenDefined(target, key, value) {
  if (value === void 0) {
    return;
  }
  target[key] = value;
}
|
|
3554
|
+
/**
 * Returns a copy of a pattern list with one extra pattern appended when it is
 * truthy and not already present.
 *
 * @param {string[]|undefined|null} existing - Current patterns; not modified.
 * @param {string|undefined} extra - Pattern to add.
 * @returns {string[]|undefined} The merged list, or undefined when it is empty.
 */
function mergePatterns(existing, extra) {
  const combined = [...(existing ?? [])];
  const shouldAppend = Boolean(extra) && !combined.includes(extra);
  if (shouldAppend) {
    combined.push(extra);
  }
  return combined.length === 0 ? void 0 : combined;
}
|
|
3561
|
+
/**
 * Renders the human-readable summary printed after a website source is added.
 *
 * @param {{primarySource: {id: string}, detectedFeed?: {url: string, source?: {id: string}, wasAdded?: boolean, excludePrefix?: string}}} result
 * @returns {string} Newline-joined lines: the added source, then either a
 *   "no feed" notice or the detected feed and, when set, the excluded prefix.
 */
function formatWebsiteSourceAdd(result) {
  const { primarySource, detectedFeed } = result;
  const lines = [`Added source ${primarySource.id}`];
  if (detectedFeed) {
    const { url, source, wasAdded, excludePrefix } = detectedFeed;
    if (!source) {
      lines.push(`Detected feed ${url}.`);
    } else if (wasAdded) {
      lines.push(`Detected feed ${url} and added source ${source.id}.`);
    } else {
      lines.push(`Detected feed ${url}. Source ${source.id} already exists.`);
    }
    if (excludePrefix) {
      lines.push(`Excluded ${excludePrefix} from the website crawl.`);
    }
  } else {
    lines.push("No feed detected during website registration.");
  }
  return lines.join("\n");
}
|
|
2922
3579
|
function createSourceCrawlConfig(type, options, defaults) {
|
|
2923
3580
|
if (!["url", "website", "directory", "rss"].includes(type)) {
|
|
2924
3581
|
return void 0;
|
|
@@ -2926,6 +3583,7 @@ function createSourceCrawlConfig(type, options, defaults) {
|
|
|
2926
3583
|
const crawl = {};
|
|
2927
3584
|
setWhenDefined(crawl, "maxDepth", parseOptionalNumber(options.maxDepth, "--max-depth"));
|
|
2928
3585
|
setWhenDefined(crawl, "maxPages", parseOptionalNumber(options.maxPages, "--max-pages"));
|
|
3586
|
+
setWhenDefined(crawl, "maxConcurrentRequests", parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests"));
|
|
2929
3587
|
setWhenDefined(crawl, "includePatterns", options.include);
|
|
2930
3588
|
setWhenDefined(crawl, "excludePatterns", options.exclude);
|
|
2931
3589
|
setWhenDefined(crawl, "obeyRobotsTxt", options.robots);
|
|
@@ -2944,14 +3602,48 @@ function createSourceCrawlConfig(type, options, defaults) {
|
|
|
2944
3602
|
}
|
|
2945
3603
|
return Object.keys(crawl).length > 0 ? crawl : void 0;
|
|
2946
3604
|
}
|
|
3605
|
+
/**
 * Rejects `source add` options that the chosen source type does not support.
 *
 * Rules are checked in a fixed order and the first violation is reported.
 *
 * @param {string} type - Source type being added.
 * @param {object} options - Parsed command options.
 * @returns {void}
 * @throws {CliError} INVALID_ARGUMENT naming the first unsupported option.
 */
function validateSourceAddOptions(type, options) {
  const isWebsite = type === "website";
  const allowsConcurrency = ["website", "rss"].includes(type);
  const allowsPatterns = ["website", "directory"].includes(type);
  // [flag, was it supplied?, does this source type support it?]
  const rules = [
    ["--max-depth", options.maxDepth !== void 0, isWebsite],
    ["--max-pages", options.maxPages !== void 0, isWebsite],
    ["--max-concurrent-requests", options.maxConcurrentRequests !== void 0, allowsConcurrency],
    ["--render-js", Boolean(options.renderJs), isWebsite],
    ["--no-robots", options.robots === false, isWebsite],
    ["--rate-limit-ms", options.rateLimitMs !== void 0, isWebsite],
    ["--include", options.include !== void 0, allowsPatterns],
    ["--exclude", options.exclude !== void 0, allowsPatterns],
    ["--retention-days", options.retentionDays !== void 0, type === "rss"]
  ];
  const violation = rules.find(([, supplied, supported]) => supplied && !supported);
  if (violation) {
    const [optionName] = violation;
    throw new CliError(`${optionName} is not supported for source type ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
}
|
|
2947
3637
|
function allowedSourceConfigFields(source) {
|
|
2948
3638
|
const fields = /* @__PURE__ */ new Set(["name", "tag", "metadata"]);
|
|
2949
3639
|
if (source.type === "rss") {
|
|
2950
3640
|
fields.add("retentionDays");
|
|
3641
|
+
fields.add("maxConcurrentRequests");
|
|
2951
3642
|
}
|
|
2952
3643
|
if (source.type === "website") {
|
|
2953
3644
|
fields.add("maxDepth");
|
|
2954
3645
|
fields.add("maxPages");
|
|
3646
|
+
fields.add("maxConcurrentRequests");
|
|
2955
3647
|
fields.add("include");
|
|
2956
3648
|
fields.add("exclude");
|
|
2957
3649
|
}
|
|
@@ -2987,6 +3679,10 @@ function buildSourceConfigPatch(source, options) {
|
|
|
2987
3679
|
checkAllowed("maxPages", "--max-pages");
|
|
2988
3680
|
crawlPatch.maxPages = parseOptionalNumber(options.maxPages, "--max-pages");
|
|
2989
3681
|
}
|
|
3682
|
+
if (options.maxConcurrentRequests !== void 0) {
|
|
3683
|
+
checkAllowed("maxConcurrentRequests", "--max-concurrent-requests");
|
|
3684
|
+
crawlPatch.maxConcurrentRequests = parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests");
|
|
3685
|
+
}
|
|
2990
3686
|
if (options.include !== void 0) {
|
|
2991
3687
|
checkAllowed("include", "--include");
|
|
2992
3688
|
crawlPatch.includePatterns = options.include;
|
|
@@ -3016,6 +3712,50 @@ function response(command, workspace, data, error) {
|
|
|
3016
3712
|
}
|
|
3017
3713
|
/**
 * Records a value in the capture buffers and notifies the matching listener.
 *
 * @param {{stdout: Array, stderr: Array, onStdout?: Function, onStderr?: Function}} capture
 * @param {*} value - Value to record.
 * @param {boolean} [stderr=false] - Route to the stderr buffer and listener
 *   instead of the stdout ones.
 * @returns {void}
 */
function writeOutput(capture, value, stderr = false) {
  if (stderr) {
    capture.stderr.push(value);
    capture.onStderr?.(value);
  } else {
    capture.stdout.push(value);
    capture.onStdout?.(value);
  }
}
|
|
3721
|
+
/**
 * Creates the progress callback for long-running commands.
 *
 * @param {object} capture - Output capture passed on to writeOutput.
 * @param {{json?: boolean, silent?: boolean, quiet?: boolean, verbose?: boolean}} options
 * @returns {((level: string, message: string) => void)|undefined} Undefined
 *   when json, silent or quiet is set. Otherwise a handler that writes each
 *   message to stderr, dropping "detail" messages unless verbose is set.
 */
function createProgressHandler(capture, options) {
  const muted = options.json || options.silent || options.quiet;
  if (muted) {
    return void 0;
  }
  return (level, message) => {
    const suppressed = level === "detail" && !options.verbose;
    if (!suppressed) {
      writeOutput(capture, message, true);
    }
  };
}
|
|
3732
|
+
/**
 * Runs the ingest pipeline: ingest sources, chunk documents, build the index.
 *
 * @param {object} params
 * @param {string} params.workspace - Workspace path.
 * @param {string} [params.sourceId] - Restrict ingest and chunking to one source.
 * @param {boolean} [params.changedOnly] - Passed through to ingestSources.
 * @param {boolean} [params.dense] - When truthy, passes a dense override of true.
 * @param {boolean} [params.sparse] - When truthy, passes a sparse override of true.
 * @param {Function} [params.progress] - Optional progress callback.
 * @returns {Promise<{ingest: object, chunk: object, indexPath: *, metadata: *}>}
 */
async function runIngestCommand({ workspace, sourceId, changedOnly, dense, sparse, progress }) {
  const announce = (message) => progress?.("info", message);

  announce("Ingest step 1/3: fetch and normalize");
  const sourceIds = sourceId ? [sourceId] : void 0;
  const ingest = await ingestSources({ workspacePath: workspace, sourceIds, changedOnly, progress });

  announce("Ingest step 2/3: chunk affected documents");
  const chunk = await chunkDocuments({ workspacePath: workspace, sourceId, progress });

  announce("Ingest step 3/3: refresh index");
  const { indexPath, metadata } = await buildIndex({
    workspacePath: workspace,
    // A falsy flag is passed as undefined rather than as an explicit false.
    denseOverride: dense ? true : void 0,
    sparseOverride: sparse ? true : void 0,
    buildAvailableModels: true,
    progress
  });

  announce("Ingest complete");
  return { ingest, chunk, indexPath, metadata };
}
|
|
3020
3760
|
function parseRetrievalMode(input) {
|
|
3021
3761
|
if (!input) {
|
|
@@ -3030,10 +3770,11 @@ function parseSourceType(input) {
|
|
|
3030
3770
|
if (!input) {
|
|
3031
3771
|
return void 0;
|
|
3032
3772
|
}
|
|
3033
|
-
|
|
3773
|
+
const normalized = input === "page" ? "url" : input;
|
|
3774
|
+
if (!SOURCE_TYPES.has(normalized)) {
|
|
3034
3775
|
throw new CliError(`unsupported source type: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3035
3776
|
}
|
|
3036
|
-
return
|
|
3777
|
+
return normalized;
|
|
3037
3778
|
}
|
|
3038
3779
|
function parseCommaSeparatedList(input) {
|
|
3039
3780
|
const values = (input ?? "").split(",").map((value) => value.trim()).filter(Boolean);
|
|
@@ -3094,56 +3835,96 @@ function workspaceFromArgv(argv) {
|
|
|
3094
3835
|
}
|
|
3095
3836
|
return path21.resolve(DEFAULT_WORKSPACE);
|
|
3096
3837
|
}
|
|
3097
|
-
async function runCli(argv) {
|
|
3098
|
-
const capture = { stdout: [], stderr: [] };
|
|
3838
|
+
async function runCli(argv, io = {}) {
|
|
3839
|
+
const capture = { stdout: [], stderr: [], ...io };
|
|
3099
3840
|
const program = new Command();
|
|
3100
|
-
program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--verbose", "Print more operational detail when a command supports it.").
|
|
3841
|
+
program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
|
|
3101
3842
|
program.addHelpText("after", `
|
|
3102
3843
|
Workflow:
|
|
3103
3844
|
1. Initialize a workspace with qli init
|
|
3104
3845
|
2. Register one or more sources with qli source add
|
|
3105
|
-
3.
|
|
3846
|
+
3. Refresh the workspace with qli ingest
|
|
3106
3847
|
4. Query it with qli search, qli related, or qli context
|
|
3107
3848
|
|
|
3108
3849
|
Examples:
|
|
3109
3850
|
qli init
|
|
3110
3851
|
qli source add directory ./docs --name "Product Docs" --tag docs
|
|
3111
|
-
qli
|
|
3852
|
+
qli ingest
|
|
3853
|
+
qli rebuild --silent
|
|
3112
3854
|
qli search "api authentication" --top-k 8
|
|
3113
3855
|
qli context "How do API keys work?" --top-k 8 --max-chars 8000
|
|
3114
3856
|
|
|
3857
|
+
Long-running commands print progress to stderr by default. Use --silent to suppress it.
|
|
3858
|
+
Use --json when another tool needs stable structured output.
|
|
3859
|
+
|
|
3115
3860
|
Use qli <command> --help for command-specific options and examples.`);
|
|
3116
|
-
program.command("init").description("Create a new workspace with the default directory layout and config.").option("--force").addHelpText("after", `
|
|
3861
|
+
program.command("init").description("Create a new workspace with the default directory layout and config, then pull missing retrieval models.").option("--force").addHelpText("after", `
|
|
3117
3862
|
Examples:
|
|
3118
3863
|
qli init
|
|
3119
3864
|
qli init --workspace ./kb
|
|
3120
|
-
qli init --workspace /tmp/querylight --force
|
|
3865
|
+
qli init --workspace /tmp/querylight --force
|
|
3866
|
+
|
|
3867
|
+
Notes:
|
|
3868
|
+
init enables dense and sparse retrieval in new workspaces.
|
|
3869
|
+
init pulls missing model assets for enabled retrieval modes.
|
|
3870
|
+
Sparse model downloads require uv. If uv is not available, init skips the sparse pull.`).action(async function command(options) {
|
|
3871
|
+
const global = this.optsWithGlobals();
|
|
3121
3872
|
const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
|
|
3122
|
-
const
|
|
3123
|
-
|
|
3873
|
+
const result = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
|
|
3874
|
+
const config = await loadConfig(workspace, global.config);
|
|
3875
|
+
const status = await getModelStatus(workspace, config);
|
|
3876
|
+
const { pullDense, pullSparse } = resolveMissingConfiguredModelPullPlan({ config, status });
|
|
3877
|
+
if (pullDense || pullSparse) {
|
|
3878
|
+
await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
|
|
3879
|
+
}
|
|
3880
|
+
emit(this.optsWithGlobals().json, capture, response("init", workspace, result), `Initialized workspace at ${workspace}`);
|
|
3124
3881
|
});
|
|
3125
3882
|
const source = program.command("source");
|
|
3126
3883
|
source.description("Register, inspect, and manage workspace sources.");
|
|
3127
|
-
source.command("add").description("Add a source definition. The source is enabled immediately.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
|
|
3884
|
+
source.command("add").description("Add a source definition. The source is enabled immediately. Use `page` for one page and `website` for multi-page crawling and feed detection.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--max-concurrent-requests <n>", "Maximum remote requests in flight for a website or feed source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
|
|
3128
3885
|
Examples:
|
|
3129
3886
|
qli source add directory ./docs --name "Local Docs" --tag docs
|
|
3130
3887
|
qli source add file ./docs/auth.md --name "Auth Guide"
|
|
3131
|
-
qli source add
|
|
3888
|
+
qli source add page https://example.com/docs/auth --name "Auth Page"
|
|
3132
3889
|
qli source add website https://example.com --name "Docs Site" --max-depth 2 --max-pages 50 --include /docs/
|
|
3890
|
+
qli source add website https://example.com --name "Docs Site" --max-concurrent-requests 8
|
|
3891
|
+
qli source add website https://example.com --name "Example Site" --json
|
|
3133
3892
|
qli source add rss https://example.com/feed.xml --name "Release Feed"
|
|
3893
|
+
qli source add rss https://example.com/feed.xml --name "Release Feed" --max-concurrent-requests 3
|
|
3134
3894
|
qli source add rss https://example.com/feed.xml --name "Release Feed" --retention-days 30
|
|
3135
3895
|
|
|
3136
3896
|
Notes:
|
|
3897
|
+
page stores one page. It does not crawl links or detect feeds.
|
|
3898
|
+
Website sources may detect one blog or news feed during registration.
|
|
3899
|
+
When a feed is added, qli also excludes the feed item prefix from the website crawl when it can infer one.
|
|
3900
|
+
Website and RSS sources default to 5 remote requests in flight per source unless config.yaml or source settings override it.
|
|
3901
|
+
Use --json when automation needs the full list of created sources.
|
|
3137
3902
|
RSS sources store retention per feed.
|
|
3138
|
-
When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(
|
|
3903
|
+
When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(typeInput, uri, options) {
|
|
3904
|
+
const type = parseSourceType(typeInput);
|
|
3905
|
+
if (!type) {
|
|
3906
|
+
throw new CliError(`unsupported source type: ${typeInput}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3907
|
+
}
|
|
3139
3908
|
if (!SOURCE_TYPES.has(type)) {
|
|
3140
3909
|
throw new CliError(`unsupported source type: ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3141
3910
|
}
|
|
3911
|
+
validateSourceAddOptions(type, options);
|
|
3142
3912
|
const global = this.optsWithGlobals();
|
|
3143
3913
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3144
3914
|
const config = await loadConfig(workspace, global.config);
|
|
3145
3915
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
3146
|
-
const
|
|
3916
|
+
const initialCrawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
|
|
3917
|
+
let crawl = initialCrawl;
|
|
3918
|
+
let detectedFeed = null;
|
|
3919
|
+
if (type === "website") {
|
|
3920
|
+
detectedFeed = await discoverWebsiteFeed(uri, config.crawler.defaultUserAgent);
|
|
3921
|
+
if (detectedFeed?.excludePrefix) {
|
|
3922
|
+
crawl = {
|
|
3923
|
+
...crawl ?? {},
|
|
3924
|
+
excludePatterns: mergePatterns(crawl?.excludePatterns, detectedFeed.excludePrefix)
|
|
3925
|
+
};
|
|
3926
|
+
}
|
|
3927
|
+
}
|
|
3147
3928
|
const stored = await addSource(workspace, {
|
|
3148
3929
|
type,
|
|
3149
3930
|
uri: ["file", "directory"].includes(type) ? path21.resolve(uri) : uri,
|
|
@@ -3155,11 +3936,50 @@ Notes:
|
|
|
3155
3936
|
createdAt: now,
|
|
3156
3937
|
updatedAt: now
|
|
3157
3938
|
});
|
|
3158
|
-
|
|
3939
|
+
if (type !== "website") {
|
|
3940
|
+
emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
|
|
3941
|
+
return;
|
|
3942
|
+
}
|
|
3943
|
+
let feedSource;
|
|
3944
|
+
let feedWasAdded = false;
|
|
3945
|
+
if (detectedFeed) {
|
|
3946
|
+
const existingSources = await listSources(workspace);
|
|
3947
|
+
feedSource = existingSources.find((source2) => source2.uri === detectedFeed?.feedUrl);
|
|
3948
|
+
if (!feedSource) {
|
|
3949
|
+
feedSource = await addSource(workspace, {
|
|
3950
|
+
type: "rss",
|
|
3951
|
+
uri: detectedFeed.feedUrl,
|
|
3952
|
+
name: `${options.name} Feed`,
|
|
3953
|
+
enabled: true,
|
|
3954
|
+
tags: options.tag ?? [],
|
|
3955
|
+
metadata: normalizeMetadata(options.metadata),
|
|
3956
|
+
crawl: {
|
|
3957
|
+
retentionDays: config.crawler.retentionDays,
|
|
3958
|
+
fetchArticles: true
|
|
3959
|
+
},
|
|
3960
|
+
createdAt: now,
|
|
3961
|
+
updatedAt: now
|
|
3962
|
+
});
|
|
3963
|
+
feedWasAdded = true;
|
|
3964
|
+
}
|
|
3965
|
+
}
|
|
3966
|
+
const result = {
|
|
3967
|
+
primarySource: stored,
|
|
3968
|
+
addedSources: [stored, ...feedWasAdded && feedSource ? [feedSource] : []],
|
|
3969
|
+
detectedFeed: detectedFeed ? {
|
|
3970
|
+
url: detectedFeed.feedUrl,
|
|
3971
|
+
discoveredBy: detectedFeed.discoveredBy,
|
|
3972
|
+
excludePrefix: detectedFeed.excludePrefix,
|
|
3973
|
+
source: feedSource,
|
|
3974
|
+
wasAdded: feedWasAdded
|
|
3975
|
+
} : null
|
|
3976
|
+
};
|
|
3977
|
+
emit(global.json, capture, response("source add", workspace, result), formatWebsiteSourceAdd(result));
|
|
3159
3978
|
});
|
|
3160
|
-
source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
|
|
3979
|
+
source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--max-concurrent-requests <n>", "Set the remote request concurrency limit for website or feed sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
|
|
3161
3980
|
Examples:
|
|
3162
3981
|
qli source config src_123 --retention-days 30
|
|
3982
|
+
qli source config src_123 --max-concurrent-requests 2
|
|
3163
3983
|
qli source config src_123 --name "Docs Feed" --tag rss docs
|
|
3164
3984
|
qli source config src_123 --include /docs/ --exclude /docs/archive/
|
|
3165
3985
|
qli source config src_123 --metadata team=docs owner=platform --json
|
|
@@ -3218,35 +4038,56 @@ Examples:
|
|
|
3218
4038
|
const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
|
|
3219
4039
|
emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
|
|
3220
4040
|
});
|
|
3221
|
-
program.command("ingest").description("Fetch
|
|
4041
|
+
program.command("ingest").description("Fetch source content, update affected chunks, and refresh retrieval indexes.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
|
|
3222
4042
|
Examples:
|
|
3223
4043
|
qli ingest
|
|
3224
4044
|
qli ingest --source src_123
|
|
3225
|
-
qli ingest --changed-only
|
|
4045
|
+
qli ingest --changed-only
|
|
4046
|
+
qli ingest --dense --sparse
|
|
4047
|
+
qli ingest --silent`).action(async function command(options) {
|
|
3226
4048
|
const global = this.optsWithGlobals();
|
|
3227
4049
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3228
|
-
const
|
|
3229
|
-
|
|
4050
|
+
const result = await runIngestCommand({
|
|
4051
|
+
workspace,
|
|
4052
|
+
sourceId: options.source,
|
|
4053
|
+
changedOnly: Boolean(options.changedOnly),
|
|
4054
|
+
dense: Boolean(options.dense),
|
|
4055
|
+
sparse: Boolean(options.sparse),
|
|
4056
|
+
progress: createProgressHandler(capture, global)
|
|
4057
|
+
});
|
|
4058
|
+
emit(global.json, capture, response("ingest", workspace, result), `Processed ${result.ingest.processedSources} sources, wrote ${result.chunk.chunksWritten} chunks`);
|
|
3230
4059
|
});
|
|
3231
4060
|
program.command("chunk").description("Split normalized documents into retrieval chunks.").option("--source <sourceId>", "Only chunk documents from one source.").option("--document <documentId>", "Only chunk one document.").addHelpText("after", `
|
|
3232
4061
|
Examples:
|
|
3233
4062
|
qli chunk
|
|
3234
4063
|
qli chunk --source src_123
|
|
3235
|
-
qli chunk --document doc_123
|
|
4064
|
+
qli chunk --document doc_123
|
|
4065
|
+
qli chunk --silent`).action(async function command(options) {
|
|
3236
4066
|
const global = this.optsWithGlobals();
|
|
3237
4067
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3238
|
-
const
|
|
3239
|
-
|
|
4068
|
+
const result = await chunkDocuments({
|
|
4069
|
+
workspacePath: workspace,
|
|
4070
|
+
sourceId: options.source,
|
|
4071
|
+
documentId: options.document,
|
|
4072
|
+
progress: createProgressHandler(capture, global)
|
|
4073
|
+
});
|
|
4074
|
+
emit(global.json, capture, response("chunk", workspace, result), `Wrote ${result.chunksWritten} chunks`);
|
|
3240
4075
|
});
|
|
3241
4076
|
program.command("reprocess").description("Re-run normalization for existing documents without fetching sources again.").option("--source <sourceId>", "Only reprocess documents from one source.").option("--document <documentId>", "Only reprocess one document.").addHelpText("after", `
|
|
3242
4077
|
Examples:
|
|
3243
4078
|
qli reprocess
|
|
3244
4079
|
qli reprocess --source src_123
|
|
3245
|
-
qli reprocess --document doc_123
|
|
4080
|
+
qli reprocess --document doc_123
|
|
4081
|
+
qli reprocess --silent`).action(async function command(options) {
|
|
3246
4082
|
const global = this.optsWithGlobals();
|
|
3247
4083
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3248
|
-
const
|
|
3249
|
-
|
|
4084
|
+
const result = await reprocessDocuments({
|
|
4085
|
+
workspacePath: workspace,
|
|
4086
|
+
sourceId: options.source,
|
|
4087
|
+
documentId: options.document,
|
|
4088
|
+
progress: createProgressHandler(capture, global)
|
|
4089
|
+
});
|
|
4090
|
+
emit(global.json, capture, response("reprocess", workspace, result), `Reprocessed ${result.documentsReprocessed} documents`);
|
|
3250
4091
|
});
|
|
3251
4092
|
const index = program.command("index");
|
|
3252
4093
|
index.description("Build and inspect retrieval indexes.");
|
|
@@ -3254,33 +4095,47 @@ Examples:
|
|
|
3254
4095
|
Examples:
|
|
3255
4096
|
qli index build
|
|
3256
4097
|
qli index build --dense
|
|
3257
|
-
qli index build --dense --sparse
|
|
4098
|
+
qli index build --dense --sparse
|
|
4099
|
+
qli index build --silent`).action(async function command(options) {
|
|
3258
4100
|
const global = this.optsWithGlobals();
|
|
3259
4101
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3260
|
-
const
|
|
4102
|
+
const result = await buildIndex({
|
|
3261
4103
|
workspacePath: workspace,
|
|
3262
4104
|
denseOverride: options.dense ? true : void 0,
|
|
3263
|
-
sparseOverride: options.sparse ? true : void 0
|
|
4105
|
+
sparseOverride: options.sparse ? true : void 0,
|
|
4106
|
+
progress: createProgressHandler(capture, global)
|
|
3264
4107
|
});
|
|
3265
|
-
emit(global.json, capture, response("index build", workspace,
|
|
4108
|
+
emit(global.json, capture, response("index build", workspace, result), `Built index at ${result.indexPath}`);
|
|
3266
4109
|
});
|
|
3267
4110
|
program.command("rebuild").description("Run ingest, chunk, and index build in one command.").option("--source <sourceId>", "Only rebuild data for one source.").option("--changed-only", "Only ingest changed content before chunking and indexing.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
|
|
3268
4111
|
Examples:
|
|
3269
4112
|
qli rebuild
|
|
3270
4113
|
qli rebuild --changed-only
|
|
3271
4114
|
qli rebuild --source src_123
|
|
3272
|
-
qli rebuild --dense --sparse
|
|
4115
|
+
qli rebuild --dense --sparse
|
|
4116
|
+
qli rebuild --silent`).action(async function command(options) {
|
|
3273
4117
|
const global = this.optsWithGlobals();
|
|
3274
4118
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3275
|
-
const
|
|
3276
|
-
|
|
4119
|
+
const progress = createProgressHandler(capture, global);
|
|
4120
|
+
progress?.("info", "Rebuild step 1/3: ingest");
|
|
4121
|
+
const ingest = await ingestSources({
|
|
4122
|
+
workspacePath: workspace,
|
|
4123
|
+
sourceIds: options.source ? [options.source] : void 0,
|
|
4124
|
+
changedOnly: Boolean(options.changedOnly),
|
|
4125
|
+
progress
|
|
4126
|
+
});
|
|
4127
|
+
progress?.("info", "Rebuild step 2/3: chunk");
|
|
4128
|
+
const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, progress });
|
|
4129
|
+
progress?.("info", "Rebuild step 3/3: index");
|
|
3277
4130
|
const indexBuild = await buildIndex({
|
|
3278
4131
|
workspacePath: workspace,
|
|
3279
4132
|
denseOverride: options.dense ? true : void 0,
|
|
3280
4133
|
sparseOverride: options.sparse ? true : void 0,
|
|
3281
|
-
buildAvailableModels: true
|
|
4134
|
+
buildAvailableModels: true,
|
|
4135
|
+
progress
|
|
3282
4136
|
});
|
|
3283
4137
|
const data = { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
|
|
4138
|
+
progress?.("info", "Rebuild complete");
|
|
3284
4139
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
3285
4140
|
});
|
|
3286
4141
|
program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
@@ -3291,7 +4146,7 @@ Examples:
|
|
|
3291
4146
|
qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
|
|
3292
4147
|
qli search "billing" --metadata team=support
|
|
3293
4148
|
qli search "embedding model" --retrieval hybrid --show-chunks
|
|
3294
|
-
qli search --source-type rss,
|
|
4149
|
+
qli search --source-type rss,page --top-k 25 --json
|
|
3295
4150
|
|
|
3296
4151
|
Notes:
|
|
3297
4152
|
lexical works without vector models.
|
|
@@ -3299,7 +4154,7 @@ Notes:
|
|
|
3299
4154
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
3300
4155
|
const global = this.optsWithGlobals();
|
|
3301
4156
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3302
|
-
const
|
|
4157
|
+
const result = await searchIndex({
|
|
3303
4158
|
workspacePath: workspace,
|
|
3304
4159
|
query: query ?? "",
|
|
3305
4160
|
topK: Number(options.topK),
|
|
@@ -3314,7 +4169,7 @@ Notes:
|
|
|
3314
4169
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
3315
4170
|
showChunks: Boolean(options.showChunks)
|
|
3316
4171
|
});
|
|
3317
|
-
emit(global.json, capture, response("search", workspace,
|
|
4172
|
+
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
|
|
3318
4173
|
});
|
|
3319
4174
|
program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
|
|
3320
4175
|
Examples:
|
|
@@ -3326,12 +4181,12 @@ Dense vectors usually produce better related-document results. Pull models and r
|
|
|
3326
4181
|
qli rebuild --dense`).action(async function command(document, options) {
|
|
3327
4182
|
const global = this.optsWithGlobals();
|
|
3328
4183
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3329
|
-
const
|
|
4184
|
+
const result = await findRelatedDocuments({
|
|
3330
4185
|
workspacePath: workspace,
|
|
3331
4186
|
document,
|
|
3332
4187
|
topK: Number(options.topK)
|
|
3333
4188
|
});
|
|
3334
|
-
emit(global.json, capture, response("related", workspace,
|
|
4189
|
+
emit(global.json, capture, response("related", workspace, result), formatRelatedDocuments(result.results));
|
|
3335
4190
|
});
|
|
3336
4191
|
program.command("context").description("Assemble retrieval context for an external LLM, agent, or prompt pipeline.").argument("<query>").option("--top-k <n>", "Maximum number of source passages to consider.", "12").option("--max-chars <n>", "Maximum output length for the rendered context block.", "12000").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).addHelpText("after", `
|
|
3337
4192
|
Examples:
|
|
@@ -3342,14 +4197,14 @@ Examples:
|
|
|
3342
4197
|
Use --json when another tool needs structured access to the raw passages and metadata.`).action(async function command(query, options) {
|
|
3343
4198
|
const global = this.optsWithGlobals();
|
|
3344
4199
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3345
|
-
const
|
|
4200
|
+
const result = await createContext({
|
|
3346
4201
|
workspacePath: workspace,
|
|
3347
4202
|
query,
|
|
3348
4203
|
topK: Number(options.topK),
|
|
3349
4204
|
maxChars: Number(options.maxChars),
|
|
3350
4205
|
retrievalMode: parseRetrievalMode(options.retrieval)
|
|
3351
4206
|
});
|
|
3352
|
-
emit(global.json, capture, response("context", workspace,
|
|
4207
|
+
emit(global.json, capture, response("context", workspace, result), result.markdown);
|
|
3353
4208
|
});
|
|
3354
4209
|
const models = program.command("models");
|
|
3355
4210
|
models.description("Inspect and download retrieval model assets.");
|
|
@@ -3358,7 +4213,9 @@ Examples:
|
|
|
3358
4213
|
qli models pull
|
|
3359
4214
|
qli models pull --dense
|
|
3360
4215
|
qli models pull --sparse
|
|
4216
|
+
qli models pull --silent
|
|
3361
4217
|
|
|
4218
|
+
Pulled model assets are shared under ~/.qli by default.
|
|
3362
4219
|
If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
|
|
3363
4220
|
const global = this.optsWithGlobals();
|
|
3364
4221
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
@@ -3369,17 +4226,27 @@ If you plan to use related, dense search, or hybrid retrieval, pull the models a
|
|
|
3369
4226
|
pullSparseFlag: Boolean(options.sparse),
|
|
3370
4227
|
uvAvailable: status.sparse.uvAvailable
|
|
3371
4228
|
});
|
|
3372
|
-
await pullModels({ workspacePath: workspace, config, pullDense, pullSparse });
|
|
4229
|
+
await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
|
|
3373
4230
|
const data = {
|
|
3374
|
-
dense: pullDense ? {
|
|
3375
|
-
|
|
4231
|
+
dense: pullDense ? {
|
|
4232
|
+
pulled: true,
|
|
4233
|
+
modelId: config.retrieval.dense.modelId,
|
|
4234
|
+
cacheDir: resolveCacheDir(workspace, config.retrieval.dense.cacheDir)
|
|
4235
|
+
} : void 0,
|
|
4236
|
+
sparse: pullSparse ? {
|
|
4237
|
+
pulled: true,
|
|
4238
|
+
modelId: config.retrieval.sparse.modelId,
|
|
4239
|
+
cacheDir: resolveCacheDir(workspace, config.retrieval.sparse.cacheDir)
|
|
4240
|
+
} : void 0
|
|
3376
4241
|
};
|
|
3377
4242
|
emit(global.json, capture, response("models pull", workspace, data), "Pulled available models");
|
|
3378
4243
|
});
|
|
3379
|
-
models.command("status").description("Show whether model runtimes and artifacts are available
|
|
4244
|
+
models.command("status").description("Show whether shared model assets, runtimes, and workspace vector artifacts are available.").addHelpText("after", `
|
|
3380
4245
|
Examples:
|
|
3381
4246
|
qli models status
|
|
3382
|
-
qli models status --json
|
|
4247
|
+
qli models status --json
|
|
4248
|
+
|
|
4249
|
+
The cacheDir fields show the resolved model cache path for the current workspace config.`).action(async function command() {
|
|
3383
4250
|
const global = this.optsWithGlobals();
|
|
3384
4251
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3385
4252
|
const config = await loadConfig(workspace, global.config);
|
|
@@ -3394,8 +4261,8 @@ Examples:
|
|
|
3394
4261
|
qli diff --since 2026-05-01`).action(async function command(options) {
|
|
3395
4262
|
const global = this.optsWithGlobals();
|
|
3396
4263
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3397
|
-
const
|
|
3398
|
-
emit(global.json, capture, response("diff", workspace,
|
|
4264
|
+
const result = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
|
|
4265
|
+
emit(global.json, capture, response("diff", workspace, result), JSON.stringify(result, null, 2));
|
|
3399
4266
|
});
|
|
3400
4267
|
const report = program.command("report");
|
|
3401
4268
|
report.description("Render higher-level reports from workspace data.");
|
|
@@ -3427,7 +4294,7 @@ Examples:
|
|
|
3427
4294
|
try {
|
|
3428
4295
|
const meta = await readLatestIndexMetadata(workspace);
|
|
3429
4296
|
latestIndex = meta.createdAt;
|
|
3430
|
-
indexSize = (await stat4(
|
|
4297
|
+
indexSize = (await stat4(await resolveLatestIndexArtifactPath(workspace))).size;
|
|
3431
4298
|
} catch {
|
|
3432
4299
|
latestIndex = void 0;
|
|
3433
4300
|
}
|
|
@@ -3476,8 +4343,11 @@ Examples:
|
|
|
3476
4343
|
checks.push("dense runtime importable");
|
|
3477
4344
|
}
|
|
3478
4345
|
if (config.retrieval.sparse.enabled) {
|
|
3479
|
-
await
|
|
3480
|
-
|
|
4346
|
+
if (await isUvAvailable()) {
|
|
4347
|
+
checks.push("uv available for sparse runtime");
|
|
4348
|
+
} else {
|
|
4349
|
+
checks.push("uv missing for sparse runtime");
|
|
4350
|
+
}
|
|
3481
4351
|
}
|
|
3482
4352
|
try {
|
|
3483
4353
|
await readLatestIndexMetadata(workspace);
|
|
@@ -3511,13 +4381,21 @@ function emit(asJson, capture, body, human) {
|
|
|
3511
4381
|
}
|
|
3512
4382
|
|
|
3513
4383
|
// src/cli/main.ts
|
|
3514
|
-
|
|
3515
|
-
|
|
3516
|
-
|
|
4384
|
+
try {
|
|
4385
|
+
const result = await runCli(process.argv.slice(2), {
|
|
4386
|
+
onStdout(value) {
|
|
4387
|
+
process.stdout.write(`${value}
|
|
3517
4388
|
`);
|
|
3518
|
-
}
|
|
3519
|
-
|
|
3520
|
-
|
|
4389
|
+
},
|
|
4390
|
+
onStderr(value) {
|
|
4391
|
+
process.stderr.write(`${value}
|
|
4392
|
+
`);
|
|
4393
|
+
}
|
|
4394
|
+
});
|
|
4395
|
+
process.exitCode = result.exitCode;
|
|
4396
|
+
} catch (error) {
|
|
4397
|
+
const message = error instanceof Error ? error.stack ?? error.message : String(error);
|
|
4398
|
+
process.stderr.write(`${message}
|
|
3521
4399
|
`);
|
|
4400
|
+
process.exitCode = 1;
|
|
3522
4401
|
}
|
|
3523
|
-
process.exit(result.exitCode);
|