@tryformation/querylight-cli 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -9
- package/dist/chunk/chunker.d.ts +3 -1
- package/dist/cli/main.js +1031 -237
- package/dist/cli/run-cli.d.ts +4 -1
- package/dist/core/concurrency.d.ts +1 -0
- package/dist/core/constants.d.ts +3 -1
- package/dist/core/progress.d.ts +4 -0
- package/dist/core/urls.d.ts +1 -0
- package/dist/index/querylight-indexer.d.ts +3 -1
- package/dist/index.js +441 -114
- package/dist/ingest/adapters/website-adapter.d.ts +6 -1
- package/dist/ingest/adapters/website-feed-discovery.d.ts +6 -0
- package/dist/ingest/extractors/html-extractor.d.ts +1 -0
- package/dist/ingest/ingest-service.d.ts +5 -2
- package/dist/types/models.d.ts +2 -2
- package/dist/vector/dense.d.ts +3 -1
- package/dist/vector/runtime.d.ts +2 -0
- package/dist/vector/service.d.ts +20 -2
- package/dist/vector/sparse.d.ts +3 -1
- package/dist/vector/store.d.ts +8 -2
- package/package.json +1 -1
package/dist/cli/main.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
|
-
import { Command } from "commander";
|
|
4
|
+
import { Command, Option } from "commander";
|
|
5
5
|
import { stat as stat4 } from "fs/promises";
|
|
6
6
|
import path21 from "path";
|
|
7
7
|
|
|
@@ -14,6 +14,17 @@ import path4 from "path";
|
|
|
14
14
|
import { readFile, writeFile } from "fs/promises";
|
|
15
15
|
import path from "path";
|
|
16
16
|
import YAML from "yaml";
|
|
17
|
+
|
|
18
|
+
// src/core/constants.ts
|
|
19
|
+
var PACKAGE_VERSION = "0.2.0";
|
|
20
|
+
var DEFAULT_WORKSPACE = ".kb";
|
|
21
|
+
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
|
+
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
23
|
+
|
|
24
|
+
// src/core/config.ts
|
|
25
|
+
function normalizeModelCacheDir(configuredPath) {
|
|
26
|
+
return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
|
|
27
|
+
}
|
|
17
28
|
var defaultConfig = () => ({
|
|
18
29
|
workspaceVersion: 1,
|
|
19
30
|
index: {
|
|
@@ -41,17 +52,17 @@ var defaultConfig = () => ({
|
|
|
41
52
|
retrieval: {
|
|
42
53
|
defaultMode: "lexical",
|
|
43
54
|
dense: {
|
|
44
|
-
enabled:
|
|
55
|
+
enabled: true,
|
|
45
56
|
modelId: "Xenova/all-MiniLM-L6-v2",
|
|
46
|
-
cacheDir:
|
|
57
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
47
58
|
indexHashTables: 8,
|
|
48
59
|
indexRandomSeed: 42,
|
|
49
60
|
chunkTextMode: "title-heading-text"
|
|
50
61
|
},
|
|
51
62
|
sparse: {
|
|
52
|
-
enabled:
|
|
63
|
+
enabled: true,
|
|
53
64
|
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
|
|
54
|
-
cacheDir:
|
|
65
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
55
66
|
documentTopTokens: 128,
|
|
56
67
|
queryEncoding: "tokenizer-token-weights",
|
|
57
68
|
documentEncoding: "masked-lm-max-log1p-relu",
|
|
@@ -62,6 +73,7 @@ var defaultConfig = () => ({
|
|
|
62
73
|
defaultUserAgent: "querylight-cli/0.1",
|
|
63
74
|
obeyRobotsTxt: true,
|
|
64
75
|
rateLimitMs: 1e3,
|
|
76
|
+
maxConcurrentRequests: 5,
|
|
65
77
|
renderJs: false,
|
|
66
78
|
retentionDays: 365,
|
|
67
79
|
fetchArticles: true
|
|
@@ -112,11 +124,13 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
112
124
|
...parsed.retrieval ?? {},
|
|
113
125
|
dense: {
|
|
114
126
|
...defaults.retrieval.dense,
|
|
115
|
-
...parsed.retrieval?.dense ?? {}
|
|
127
|
+
...parsed.retrieval?.dense ?? {},
|
|
128
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
|
|
116
129
|
},
|
|
117
130
|
sparse: {
|
|
118
131
|
...defaults.retrieval.sparse,
|
|
119
|
-
...parsed.retrieval?.sparse ?? {}
|
|
132
|
+
...parsed.retrieval?.sparse ?? {},
|
|
133
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
|
|
120
134
|
}
|
|
121
135
|
},
|
|
122
136
|
crawler: {
|
|
@@ -162,6 +176,14 @@ async function writeJsonl(filePath, records) {
|
|
|
162
176
|
` : "", "utf8");
|
|
163
177
|
}
|
|
164
178
|
|
|
179
|
+
// src/core/progress.ts
|
|
180
|
+
function reportProgress(progress, message) {
|
|
181
|
+
progress?.("info", message);
|
|
182
|
+
}
|
|
183
|
+
function reportProgressDetail(progress, message) {
|
|
184
|
+
progress?.("detail", message);
|
|
185
|
+
}
|
|
186
|
+
|
|
165
187
|
// src/chunk/chunk-store.ts
|
|
166
188
|
import path3 from "path";
|
|
167
189
|
function chunksFile(workspacePath) {
|
|
@@ -269,11 +291,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
|
|
|
269
291
|
async function chunkDocuments({
|
|
270
292
|
workspacePath,
|
|
271
293
|
sourceId,
|
|
272
|
-
documentId
|
|
294
|
+
documentId,
|
|
295
|
+
progress
|
|
273
296
|
}) {
|
|
274
297
|
const config = await loadConfig(workspacePath);
|
|
275
298
|
const documents = await readJsonl(path4.join(workspacePath, "documents", "documents.jsonl"));
|
|
276
299
|
const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
|
|
300
|
+
reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
|
|
277
301
|
const targetedDocumentIds = new Set(filtered.map((document) => document.id));
|
|
278
302
|
const existingChunks = await loadChunks(workspacePath);
|
|
279
303
|
const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
|
|
@@ -281,19 +305,17 @@ async function chunkDocuments({
|
|
|
281
305
|
existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
|
|
282
306
|
);
|
|
283
307
|
for (const document of filtered) {
|
|
308
|
+
reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
|
|
284
309
|
const raw = await readFile3(document.normalizedPath, "utf8");
|
|
285
310
|
for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
|
|
286
311
|
nextChunks.set(chunk.id, chunk);
|
|
287
312
|
}
|
|
288
313
|
}
|
|
289
314
|
await saveChunks(workspacePath, [...nextChunks.values()]);
|
|
315
|
+
reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
|
|
290
316
|
return { chunksWritten: nextChunks.size };
|
|
291
317
|
}
|
|
292
318
|
|
|
293
|
-
// src/core/constants.ts
|
|
294
|
-
var PACKAGE_VERSION = "0.1.0";
|
|
295
|
-
var DEFAULT_WORKSPACE = ".kb";
|
|
296
|
-
|
|
297
319
|
// src/core/errors.ts
|
|
298
320
|
var CliError = class extends Error {
|
|
299
321
|
constructor(message, code, exitCode, details) {
|
|
@@ -319,8 +341,6 @@ var DIRS = [
|
|
|
319
341
|
"normalized",
|
|
320
342
|
"indexes",
|
|
321
343
|
"vectors",
|
|
322
|
-
"models",
|
|
323
|
-
"models/huggingface",
|
|
324
344
|
"runs",
|
|
325
345
|
"logs"
|
|
326
346
|
];
|
|
@@ -358,11 +378,12 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
|
|
|
358
378
|
import path11 from "path";
|
|
359
379
|
|
|
360
380
|
// src/vector/dense.ts
|
|
361
|
-
import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
|
|
381
|
+
import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
|
|
362
382
|
import { mkdir as mkdir4 } from "fs/promises";
|
|
363
383
|
import path8 from "path";
|
|
364
384
|
|
|
365
385
|
// src/vector/runtime.ts
|
|
386
|
+
import os from "os";
|
|
366
387
|
import path6 from "path";
|
|
367
388
|
import { fileURLToPath } from "url";
|
|
368
389
|
import { execFile, execFileSync } from "child_process";
|
|
@@ -379,7 +400,22 @@ async function fileExists(filePath) {
|
|
|
379
400
|
}
|
|
380
401
|
|
|
381
402
|
// src/vector/runtime.ts
|
|
403
|
+
function resolveQliHomeDir() {
|
|
404
|
+
return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
|
|
405
|
+
}
|
|
382
406
|
function resolveCacheDir(workspacePath, configuredPath) {
|
|
407
|
+
if (configuredPath === "~/.qli") {
|
|
408
|
+
return resolveQliHomeDir();
|
|
409
|
+
}
|
|
410
|
+
if (configuredPath.startsWith("~/.qli/")) {
|
|
411
|
+
return path6.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
|
|
412
|
+
}
|
|
413
|
+
if (configuredPath === "~") {
|
|
414
|
+
return os.homedir();
|
|
415
|
+
}
|
|
416
|
+
if (configuredPath.startsWith("~/")) {
|
|
417
|
+
return path6.join(os.homedir(), configuredPath.slice(2));
|
|
418
|
+
}
|
|
383
419
|
return path6.isAbsolute(configuredPath) ? configuredPath : path6.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
|
|
384
420
|
}
|
|
385
421
|
function packageRootFromImportMeta(importMetaUrl) {
|
|
@@ -403,6 +439,14 @@ async function ensureUvAvailable() {
|
|
|
403
439
|
execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
|
|
404
440
|
});
|
|
405
441
|
}
|
|
442
|
+
async function isUvAvailable() {
|
|
443
|
+
try {
|
|
444
|
+
await ensureUvAvailable();
|
|
445
|
+
return true;
|
|
446
|
+
} catch {
|
|
447
|
+
return false;
|
|
448
|
+
}
|
|
449
|
+
}
|
|
406
450
|
async function runSparsePython({
|
|
407
451
|
workspacePath,
|
|
408
452
|
config,
|
|
@@ -451,8 +495,8 @@ import path7 from "path";
|
|
|
451
495
|
function vectorsDir(workspacePath) {
|
|
452
496
|
return path7.join(workspacePath, "vectors");
|
|
453
497
|
}
|
|
454
|
-
function
|
|
455
|
-
return path7.join(
|
|
498
|
+
function sharedModelStateDir() {
|
|
499
|
+
return path7.join(resolveQliHomeDir(), "models", "status");
|
|
456
500
|
}
|
|
457
501
|
function denseVectorPath(workspacePath) {
|
|
458
502
|
return path7.join(vectorsDir(workspacePath), "dense.latest.json");
|
|
@@ -466,11 +510,16 @@ function sparseVectorPath(workspacePath) {
|
|
|
466
510
|
function sparseMetaPath(workspacePath) {
|
|
467
511
|
return path7.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
|
|
468
512
|
}
|
|
469
|
-
function
|
|
470
|
-
|
|
513
|
+
function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
|
|
514
|
+
const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
|
|
515
|
+
const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
|
|
516
|
+
return path7.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
|
|
471
517
|
}
|
|
472
|
-
function
|
|
473
|
-
return
|
|
518
|
+
function densePullMarker(workspacePath, modelId, cacheDir) {
|
|
519
|
+
return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
|
|
520
|
+
}
|
|
521
|
+
function sparsePullMarker(workspacePath, modelId, cacheDir) {
|
|
522
|
+
return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
|
|
474
523
|
}
|
|
475
524
|
async function writeDensePayload(workspacePath, payload) {
|
|
476
525
|
await mkdir3(vectorsDir(workspacePath), { recursive: true });
|
|
@@ -488,13 +537,15 @@ async function writeSparsePayload(workspacePath, payload) {
|
|
|
488
537
|
async function readSparsePayload(workspacePath) {
|
|
489
538
|
return JSON.parse(await readFile4(sparseVectorPath(workspacePath), "utf8"));
|
|
490
539
|
}
|
|
491
|
-
async function writeDensePullMarker(workspacePath, value) {
|
|
492
|
-
|
|
493
|
-
await
|
|
540
|
+
async function writeDensePullMarker(workspacePath, model, value) {
|
|
541
|
+
const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
542
|
+
await mkdir3(path7.dirname(markerPath), { recursive: true });
|
|
543
|
+
await writeFile3(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
494
544
|
}
|
|
495
|
-
async function writeSparsePullMarker(workspacePath, value) {
|
|
496
|
-
|
|
497
|
-
await
|
|
545
|
+
async function writeSparsePullMarker(workspacePath, model, value) {
|
|
546
|
+
const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
547
|
+
await mkdir3(path7.dirname(markerPath), { recursive: true });
|
|
548
|
+
await writeFile3(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
498
549
|
}
|
|
499
550
|
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
500
551
|
const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
|
|
@@ -504,7 +555,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
|
504
555
|
configured: dense.enabled,
|
|
505
556
|
modelId: dense.modelId,
|
|
506
557
|
cacheDir: denseCacheDir,
|
|
507
|
-
available: await fileExists(densePullMarker(workspacePath)),
|
|
558
|
+
available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
|
|
508
559
|
artifactExists: await fileExists(denseVectorPath(workspacePath))
|
|
509
560
|
},
|
|
510
561
|
sparse: {
|
|
@@ -512,22 +563,64 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
|
512
563
|
modelId: sparse.modelId,
|
|
513
564
|
cacheDir: sparseCacheDir,
|
|
514
565
|
uvAvailable,
|
|
515
|
-
available: await fileExists(sparsePullMarker(workspacePath)),
|
|
566
|
+
available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
|
|
516
567
|
artifactExists: await fileExists(sparseVectorPath(workspacePath))
|
|
517
568
|
}
|
|
518
569
|
};
|
|
519
570
|
}
|
|
520
571
|
|
|
521
572
|
// src/vector/text.ts
|
|
573
|
+
var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
|
|
574
|
+
"choose this instead of",
|
|
575
|
+
"how xyz runs it",
|
|
576
|
+
"naechste schritte",
|
|
577
|
+
"next steps",
|
|
578
|
+
"overview",
|
|
579
|
+
"passend wenn",
|
|
580
|
+
"problem",
|
|
581
|
+
"right fit",
|
|
582
|
+
"waehlen sie das stattdessen",
|
|
583
|
+
"was sie bekommen",
|
|
584
|
+
"what you get",
|
|
585
|
+
"wie xyz es umsetzt",
|
|
586
|
+
"uberblick",
|
|
587
|
+
"\xFCberblick"
|
|
588
|
+
]);
|
|
589
|
+
function normalizeHeading(value) {
|
|
590
|
+
return value.trim().toLowerCase();
|
|
591
|
+
}
|
|
592
|
+
function isLowSignalHeading(value) {
|
|
593
|
+
return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
|
|
594
|
+
}
|
|
595
|
+
function stripLeadingHeading(text, heading) {
|
|
596
|
+
const lines = text.split("\n");
|
|
597
|
+
const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
|
|
598
|
+
if (firstContentIndex < 0) {
|
|
599
|
+
return text;
|
|
600
|
+
}
|
|
601
|
+
const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
|
|
602
|
+
if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
|
|
603
|
+
return text;
|
|
604
|
+
}
|
|
605
|
+
const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
|
|
606
|
+
return next;
|
|
607
|
+
}
|
|
608
|
+
function createVectorText(chunk) {
|
|
609
|
+
const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
|
|
610
|
+
const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
|
|
611
|
+
const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
|
|
612
|
+
return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
|
|
613
|
+
}
|
|
522
614
|
function createDenseChunkText(chunk) {
|
|
523
|
-
return
|
|
615
|
+
return createVectorText(chunk);
|
|
524
616
|
}
|
|
525
617
|
function createSparseChunkText(chunk) {
|
|
526
|
-
return
|
|
618
|
+
return createVectorText(chunk);
|
|
527
619
|
}
|
|
528
620
|
|
|
529
621
|
// src/vector/dense.ts
|
|
530
622
|
var denseEmbedderFactory = null;
|
|
623
|
+
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
531
624
|
async function createEmbedder(cacheDir, modelId) {
|
|
532
625
|
if (denseEmbedderFactory) {
|
|
533
626
|
return denseEmbedderFactory(cacheDir, modelId);
|
|
@@ -539,6 +632,9 @@ async function createEmbedder(cacheDir, modelId) {
|
|
|
539
632
|
return output.tolist()[0];
|
|
540
633
|
};
|
|
541
634
|
}
|
|
635
|
+
function exactDenseQuery(payload, vector, topK) {
|
|
636
|
+
return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
|
|
637
|
+
}
|
|
542
638
|
async function pullDenseModel(workspacePath, config) {
|
|
543
639
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
544
640
|
await mkdir4(cacheDir, { recursive: true });
|
|
@@ -547,7 +643,8 @@ async function pullDenseModel(workspacePath, config) {
|
|
|
547
643
|
}
|
|
548
644
|
async function buildDenseVectors({
|
|
549
645
|
workspacePath,
|
|
550
|
-
config
|
|
646
|
+
config,
|
|
647
|
+
progress
|
|
551
648
|
}) {
|
|
552
649
|
const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
553
650
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
@@ -555,6 +652,7 @@ async function buildDenseVectors({
|
|
|
555
652
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
556
653
|
const records = [];
|
|
557
654
|
let dimensions = 0;
|
|
655
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
558
656
|
for (const chunk of chunks) {
|
|
559
657
|
const embedding = await embed(createDenseChunkText(chunk));
|
|
560
658
|
dimensions ||= embedding.length;
|
|
@@ -568,7 +666,11 @@ async function buildDenseVectors({
|
|
|
568
666
|
text: chunk.text,
|
|
569
667
|
embedding
|
|
570
668
|
});
|
|
669
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
670
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
671
|
+
}
|
|
571
672
|
}
|
|
673
|
+
reportProgress(progress, "Building dense vector index");
|
|
572
674
|
const index = new VectorFieldIndex({
|
|
573
675
|
numHashTables: config.indexHashTables,
|
|
574
676
|
dimensions,
|
|
@@ -592,6 +694,7 @@ async function buildDenseVectors({
|
|
|
592
694
|
chunks: records
|
|
593
695
|
};
|
|
594
696
|
await writeDensePayload(workspacePath, payload);
|
|
697
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
595
698
|
return payload;
|
|
596
699
|
}
|
|
597
700
|
async function denseQuery({
|
|
@@ -604,12 +707,19 @@ async function denseQuery({
|
|
|
604
707
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
605
708
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
606
709
|
const vector = await embed(query);
|
|
710
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
711
|
+
return exactDenseQuery(payload, vector, topK);
|
|
712
|
+
}
|
|
607
713
|
const index = new VectorFieldIndex({
|
|
608
714
|
numHashTables: payload.metadata.hashTables,
|
|
609
715
|
dimensions: payload.metadata.dimensions,
|
|
610
716
|
random: createSeededRandom(payload.metadata.randomSeed)
|
|
611
717
|
}).loadState(payload.indexState);
|
|
612
|
-
|
|
718
|
+
const approximateHits = index.query(vector, topK);
|
|
719
|
+
if (approximateHits.length >= topK) {
|
|
720
|
+
return approximateHits;
|
|
721
|
+
}
|
|
722
|
+
return exactDenseQuery(payload, vector, topK);
|
|
613
723
|
}
|
|
614
724
|
|
|
615
725
|
// src/vector/sparse.ts
|
|
@@ -717,10 +827,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
|
|
|
717
827
|
}
|
|
718
828
|
async function buildSparseVectors({
|
|
719
829
|
workspacePath,
|
|
720
|
-
config
|
|
830
|
+
config,
|
|
831
|
+
progress
|
|
721
832
|
}) {
|
|
722
833
|
const chunks = await readJsonl(path9.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
834
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
|
|
723
835
|
const built = await buildSparseDocuments(workspacePath, config, chunks);
|
|
836
|
+
reportProgress(progress, "Building sparse vector index");
|
|
724
837
|
const index = new SparseVectorFieldIndex();
|
|
725
838
|
for (const record of built.chunks) {
|
|
726
839
|
index.insert(record.chunkId, [record.vector]);
|
|
@@ -742,6 +855,7 @@ async function buildSparseVectors({
|
|
|
742
855
|
queryTokenWeights: built.queryTokenWeights
|
|
743
856
|
};
|
|
744
857
|
await writeSparsePayload(workspacePath, payload);
|
|
858
|
+
reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
|
|
745
859
|
return payload;
|
|
746
860
|
}
|
|
747
861
|
async function sparseQuery({
|
|
@@ -759,6 +873,7 @@ async function sparseQuery({
|
|
|
759
873
|
}
|
|
760
874
|
|
|
761
875
|
// src/vector/service.ts
|
|
876
|
+
var pullModelsOverrideForTests = null;
|
|
762
877
|
function resolveModelPullPlan({
|
|
763
878
|
pullDenseFlag,
|
|
764
879
|
pullSparseFlag,
|
|
@@ -775,61 +890,75 @@ function resolveModelPullPlan({
|
|
|
775
890
|
pullSparse: uvAvailable
|
|
776
891
|
};
|
|
777
892
|
}
|
|
893
|
+
function resolveMissingConfiguredModelPullPlan({
|
|
894
|
+
config,
|
|
895
|
+
status
|
|
896
|
+
}) {
|
|
897
|
+
return {
|
|
898
|
+
pullDense: config.retrieval.dense.enabled && !status.dense.available,
|
|
899
|
+
pullSparse: config.retrieval.sparse.enabled && status.sparse.uvAvailable && !status.sparse.available
|
|
900
|
+
};
|
|
901
|
+
}
|
|
778
902
|
async function buildVectorArtifacts({
|
|
779
903
|
workspacePath,
|
|
780
904
|
config,
|
|
781
905
|
denseOverride,
|
|
782
906
|
sparseOverride,
|
|
783
|
-
buildAvailableModels = false
|
|
907
|
+
buildAvailableModels = false,
|
|
908
|
+
progress
|
|
784
909
|
}) {
|
|
785
|
-
const
|
|
786
|
-
|
|
787
|
-
await ensureUvAvailable();
|
|
788
|
-
return true;
|
|
789
|
-
} catch {
|
|
790
|
-
return false;
|
|
791
|
-
}
|
|
792
|
-
})()) : null;
|
|
910
|
+
const uvAvailable = await isUvAvailable();
|
|
911
|
+
const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
|
|
793
912
|
const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
|
|
794
|
-
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
|
|
795
|
-
const
|
|
913
|
+
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
|
|
914
|
+
const result = {};
|
|
796
915
|
if (denseEnabled) {
|
|
797
|
-
|
|
916
|
+
reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
|
|
917
|
+
result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
|
|
918
|
+
}
|
|
919
|
+
if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
|
|
920
|
+
reportProgress(progress, "Skipping sparse vectors because uv is not available");
|
|
798
921
|
}
|
|
799
922
|
if (sparseEnabled) {
|
|
800
|
-
|
|
923
|
+
reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
|
|
924
|
+
result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
|
|
801
925
|
}
|
|
802
|
-
return
|
|
926
|
+
return result;
|
|
803
927
|
}
|
|
804
928
|
async function pullModels({
|
|
805
929
|
workspacePath,
|
|
806
930
|
config,
|
|
807
931
|
pullDense,
|
|
808
|
-
pullSparse
|
|
932
|
+
pullSparse,
|
|
933
|
+
progress
|
|
809
934
|
}) {
|
|
935
|
+
if (pullModelsOverrideForTests) {
|
|
936
|
+
await pullModelsOverrideForTests({ workspacePath, config, pullDense, pullSparse, progress });
|
|
937
|
+
return;
|
|
938
|
+
}
|
|
810
939
|
if (pullDense) {
|
|
940
|
+
reportProgress(progress, `Pulling dense model ${config.retrieval.dense.modelId}`);
|
|
811
941
|
await pullDenseModel(workspacePath, config.retrieval.dense);
|
|
812
|
-
await writeDensePullMarker(workspacePath, {
|
|
942
|
+
await writeDensePullMarker(workspacePath, config.retrieval.dense, {
|
|
813
943
|
pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
814
|
-
modelId: config.retrieval.dense.modelId
|
|
944
|
+
modelId: config.retrieval.dense.modelId,
|
|
945
|
+
cacheDir: config.retrieval.dense.cacheDir
|
|
815
946
|
});
|
|
947
|
+
reportProgress(progress, `Dense model ready: ${config.retrieval.dense.modelId}`);
|
|
816
948
|
}
|
|
817
949
|
if (pullSparse) {
|
|
950
|
+
reportProgress(progress, `Pulling sparse model ${config.retrieval.sparse.modelId}`);
|
|
818
951
|
await pullSparseModel(workspacePath, config.retrieval.sparse);
|
|
819
|
-
await writeSparsePullMarker(workspacePath, {
|
|
952
|
+
await writeSparsePullMarker(workspacePath, config.retrieval.sparse, {
|
|
820
953
|
pulledAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
821
|
-
modelId: config.retrieval.sparse.modelId
|
|
954
|
+
modelId: config.retrieval.sparse.modelId,
|
|
955
|
+
cacheDir: config.retrieval.sparse.cacheDir
|
|
822
956
|
});
|
|
957
|
+
reportProgress(progress, `Sparse model ready: ${config.retrieval.sparse.modelId}`);
|
|
823
958
|
}
|
|
824
959
|
}
|
|
825
960
|
async function getModelStatus(workspacePath, config) {
|
|
826
|
-
|
|
827
|
-
try {
|
|
828
|
-
await ensureUvAvailable();
|
|
829
|
-
uvAvailable = true;
|
|
830
|
-
} catch {
|
|
831
|
-
uvAvailable = false;
|
|
832
|
-
}
|
|
961
|
+
const uvAvailable = await isUvAvailable();
|
|
833
962
|
return buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable);
|
|
834
963
|
}
|
|
835
964
|
|
|
@@ -900,14 +1029,17 @@ async function buildIndex({
|
|
|
900
1029
|
workspacePath,
|
|
901
1030
|
denseOverride,
|
|
902
1031
|
sparseOverride,
|
|
903
|
-
buildAvailableModels = false
|
|
1032
|
+
buildAvailableModels = false,
|
|
1033
|
+
progress
|
|
904
1034
|
}) {
|
|
905
1035
|
const config = await loadConfig(workspacePath);
|
|
1036
|
+
reportProgress(progress, "Loading documents, chunks, and sources");
|
|
906
1037
|
const chunks = await readJsonl(path11.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
907
1038
|
const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
|
|
908
1039
|
const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
|
|
909
1040
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
910
1041
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
1042
|
+
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
911
1043
|
for (const chunk of chunks) {
|
|
912
1044
|
index.index({
|
|
913
1045
|
id: chunk.id,
|
|
@@ -922,6 +1054,7 @@ async function buildIndex({
|
|
|
922
1054
|
}
|
|
923
1055
|
});
|
|
924
1056
|
}
|
|
1057
|
+
reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
925
1058
|
const createdAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
926
1059
|
const metadata = {
|
|
927
1060
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
@@ -934,14 +1067,17 @@ async function buildIndex({
|
|
|
934
1067
|
fields: Object.keys(index.mapping),
|
|
935
1068
|
indexHash: sha256(JSON.stringify(index.indexState))
|
|
936
1069
|
};
|
|
1070
|
+
reportProgress(progress, "Writing lexical index artifacts");
|
|
937
1071
|
const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
|
|
938
1072
|
const vectors = await buildVectorArtifacts({
|
|
939
1073
|
workspacePath,
|
|
940
1074
|
config,
|
|
941
1075
|
denseOverride,
|
|
942
1076
|
sparseOverride,
|
|
943
|
-
buildAvailableModels
|
|
1077
|
+
buildAvailableModels,
|
|
1078
|
+
progress
|
|
944
1079
|
});
|
|
1080
|
+
reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
|
|
945
1081
|
return {
|
|
946
1082
|
metadata,
|
|
947
1083
|
indexPath: artifacts.indexPath,
|
|
@@ -953,6 +1089,27 @@ async function buildIndex({
|
|
|
953
1089
|
// src/ingest/ingest-service.ts
|
|
954
1090
|
import path17 from "path";
|
|
955
1091
|
|
|
1092
|
+
// src/core/concurrency.ts
|
|
1093
|
+
async function mapWithConcurrency(items, limit, worker) {
|
|
1094
|
+
if (items.length === 0) {
|
|
1095
|
+
return;
|
|
1096
|
+
}
|
|
1097
|
+
const concurrency = Math.max(1, Math.floor(limit));
|
|
1098
|
+
let nextIndex = 0;
|
|
1099
|
+
await Promise.all(
|
|
1100
|
+
Array.from({ length: Math.min(concurrency, items.length) }, async () => {
|
|
1101
|
+
while (true) {
|
|
1102
|
+
const index = nextIndex;
|
|
1103
|
+
nextIndex += 1;
|
|
1104
|
+
if (index >= items.length) {
|
|
1105
|
+
return;
|
|
1106
|
+
}
|
|
1107
|
+
await worker(items[index], index);
|
|
1108
|
+
}
|
|
1109
|
+
})
|
|
1110
|
+
);
|
|
1111
|
+
}
|
|
1112
|
+
|
|
956
1113
|
// src/core/runs.ts
|
|
957
1114
|
import path12 from "path";
|
|
958
1115
|
async function writeRun(workspacePath, run) {
|
|
@@ -1127,8 +1284,8 @@ import { mkdir as mkdir7, readFile as readFile9, stat as stat3, writeFile as wri
|
|
|
1127
1284
|
// src/ingest/extractors/docx-extractor.ts
|
|
1128
1285
|
import mammoth from "mammoth";
|
|
1129
1286
|
async function extractDocx(filePath) {
|
|
1130
|
-
const
|
|
1131
|
-
return
|
|
1287
|
+
const result = await mammoth.extractRawText({ path: filePath });
|
|
1288
|
+
return result.value;
|
|
1132
1289
|
}
|
|
1133
1290
|
|
|
1134
1291
|
// src/ingest/extractors/html-extractor.ts
|
|
@@ -1142,9 +1299,41 @@ function stripBoilerplate(html) {
|
|
|
1142
1299
|
|
|
1143
1300
|
// src/ingest/extractors/html-extractor.ts
|
|
1144
1301
|
var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
1302
|
+
var LOW_SIGNAL_SECTION_SELECTORS = [
|
|
1303
|
+
"script",
|
|
1304
|
+
"style",
|
|
1305
|
+
"noscript",
|
|
1306
|
+
"template",
|
|
1307
|
+
"[data-blog-service-recommendations]",
|
|
1308
|
+
"[data-blog-related-posts]"
|
|
1309
|
+
].join(", ");
|
|
1145
1310
|
function cleanText(value) {
|
|
1146
1311
|
return value.replace(/\s+/g, " ").trim();
|
|
1147
1312
|
}
|
|
1313
|
+
function pruneLowSignalContent($) {
|
|
1314
|
+
$(LOW_SIGNAL_SECTION_SELECTORS).remove();
|
|
1315
|
+
$("form").each((_, element) => {
|
|
1316
|
+
const action = cleanText($(element).attr("action") ?? "");
|
|
1317
|
+
if (action.includes("substack.com/subscribe")) {
|
|
1318
|
+
$(element).closest("section").remove();
|
|
1319
|
+
}
|
|
1320
|
+
});
|
|
1321
|
+
}
|
|
1322
|
+
function stripEscapedJsonPayloads(markdown) {
|
|
1323
|
+
return markdown.split("\n").filter((line) => {
|
|
1324
|
+
const trimmed = line.trim();
|
|
1325
|
+
if (trimmed.length === 0) {
|
|
1326
|
+
return true;
|
|
1327
|
+
}
|
|
1328
|
+
if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
|
|
1329
|
+
return false;
|
|
1330
|
+
}
|
|
1331
|
+
if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
|
|
1332
|
+
return false;
|
|
1333
|
+
}
|
|
1334
|
+
return true;
|
|
1335
|
+
}).join("\n").replace(/\n{3,}/g, "\n\n").trim();
|
|
1336
|
+
}
|
|
1148
1337
|
function chooseMeaningfulTitle($, fallbackTitle) {
|
|
1149
1338
|
const candidates = [
|
|
1150
1339
|
cleanText($("meta[property='og:title']").attr("content") ?? ""),
|
|
@@ -1181,14 +1370,27 @@ ${parts.join("\n\n")}
|
|
|
1181
1370
|
/**
 * Converts an HTML page into Markdown plus a display title.
 *
 * Boilerplate is stripped, low-signal elements are pruned, and the content of
 * the first <main> element (falling back to the whole document, then to the
 * stripped HTML string) is converted to Markdown. Serialized JSON payload
 * lines are removed from the result.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{ markdown: string, title: string }}
 */
function extractHtmlToMarkdown(html) {
  const cleaned = stripBoilerplate(html);
  const $ = load(cleaned);
  // Prune before reading the title or body so removed widgets influence neither.
  pruneLowSignalContent($);
  const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
  const title = chooseMeaningfulTitle($, fallbackTitle);
  const root = $("main").first().html() ?? $.root().html() ?? cleaned;
  const markdown = stripEscapedJsonPayloads(turndown.turndown(root));
  return {
    markdown,
    title
  };
}
|
|
1382
|
+
/**
 * Reads the page's declared canonical URL, resolved against `baseUrl`.
 *
 * @param {string} html - Page HTML.
 * @param {string|URL} baseUrl - URL the page was fetched from; used to resolve
 *   relative canonical hrefs.
 * @returns {string|null} Absolute http(s) canonical URL, or null when the page
 *   declares none, the href cannot be resolved, or it is not an http(s) URL.
 */
function extractCanonicalUriFromHtml(html, baseUrl) {
  const $ = load(html);
  const href = $("link[rel='canonical']").first().attr("href")?.trim();
  if (!href) {
    return null;
  }
  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  // A canonical such as "mailto:" or "javascript:" can never identify a
  // fetchable document, so treat it as absent.
  if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
    return null;
  }
  return resolved.href;
}
|
|
1192
1394
|
function parseDateCandidate(value) {
|
|
1193
1395
|
const trimmed = value.trim();
|
|
1194
1396
|
if (!trimmed) {
|
|
@@ -1593,6 +1795,19 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
1593
1795
|
// src/ingest/adapters/url-adapter.ts
|
|
1594
1796
|
import { mkdir as mkdir8, readFile as readFile10, writeFile as writeFile7 } from "fs/promises";
|
|
1595
1797
|
import path16 from "path";
|
|
1798
|
+
|
|
1799
|
+
// src/core/urls.ts
|
|
1800
|
+
/**
 * Normalizes a remote URL for identity comparison by dropping its fragment.
 *
 * @param {string} uri - URL to normalize.
 * @returns {string} The serialized URL without "#fragment"; the input is
 *   returned unchanged when it cannot be parsed as an absolute URL.
 */
function normalizeRemoteUrl(uri) {
  let parsed;
  try {
    parsed = new URL(uri);
  } catch {
    return uri;
  }
  parsed.hash = "";
  return parsed.href;
}
|
|
1809
|
+
|
|
1810
|
+
// src/ingest/adapters/url-adapter.ts
|
|
1596
1811
|
function buildHttpCache(response2, validatedAt) {
|
|
1597
1812
|
return {
|
|
1598
1813
|
etag: response2.headers.get("etag") ?? void 0,
|
|
@@ -1617,12 +1832,13 @@ async function normalizeRemoteDocument({
|
|
|
1617
1832
|
responseStatus
|
|
1618
1833
|
}) {
|
|
1619
1834
|
const extracted = extractHtmlToMarkdown(body);
|
|
1835
|
+
const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
|
|
1620
1836
|
const markdown = `# ${extracted.title}
|
|
1621
1837
|
|
|
1622
1838
|
${extracted.markdown}`;
|
|
1623
|
-
const documentId = stableId("doc", source.id,
|
|
1839
|
+
const documentId = stableId("doc", source.id, canonicalUri);
|
|
1624
1840
|
const normalizedPath = path16.resolve(workspacePath, "normalized", `${documentId}.md`);
|
|
1625
|
-
const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(
|
|
1841
|
+
const rawPath = path16.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
|
|
1626
1842
|
const contentHash = sha256(markdown);
|
|
1627
1843
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
1628
1844
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
@@ -1635,7 +1851,7 @@ ${extracted.markdown}`;
|
|
|
1635
1851
|
documentId,
|
|
1636
1852
|
sourceId: source.id,
|
|
1637
1853
|
title: extracted.title,
|
|
1638
|
-
uri:
|
|
1854
|
+
uri: canonicalUri,
|
|
1639
1855
|
sourceUri,
|
|
1640
1856
|
publicationDate: resolvedPublicationDate,
|
|
1641
1857
|
crawledAt,
|
|
@@ -1650,8 +1866,9 @@ ${extracted.markdown}`;
|
|
|
1650
1866
|
sourceId: source.id,
|
|
1651
1867
|
sourceType: source.type,
|
|
1652
1868
|
title: extracted.title,
|
|
1653
|
-
uri:
|
|
1869
|
+
uri: canonicalUri,
|
|
1654
1870
|
sourceUri,
|
|
1871
|
+
canonicalUri,
|
|
1655
1872
|
mimeType: "text/html",
|
|
1656
1873
|
rawPath,
|
|
1657
1874
|
normalizedPath,
|
|
@@ -1825,6 +2042,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1825
2042
|
if (url.origin !== baseUrl.origin) {
|
|
1826
2043
|
return false;
|
|
1827
2044
|
}
|
|
2045
|
+
if (url.search.length > 0) {
|
|
2046
|
+
return false;
|
|
2047
|
+
}
|
|
2048
|
+
if (url.pathname.endsWith(".xml")) {
|
|
2049
|
+
return false;
|
|
2050
|
+
}
|
|
2051
|
+
if (url.pathname.includes("/cdn-cgi/")) {
|
|
2052
|
+
return false;
|
|
2053
|
+
}
|
|
2054
|
+
if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
|
|
2055
|
+
return false;
|
|
2056
|
+
}
|
|
1828
2057
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
1829
2058
|
return false;
|
|
1830
2059
|
}
|
|
@@ -1837,56 +2066,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1837
2066
|
}
|
|
1838
2067
|
return true;
|
|
1839
2068
|
}
|
|
1840
|
-
|
|
2069
|
+
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
2072
|
+
/**
 * Discovers the pages of a website source, level by level.
 *
 * Level 0 is the source URI plus, unless `crawl.useSitemap` is false, every
 * sitemap URL. Each level's candidates are de-duplicated, filtered through
 * `isAllowed`, and recorded; the accepted pages are then fetched to collect
 * the links that form the next level. Crawling stops at `maxDepth` or once
 * `maxPages` URLs were collected.
 *
 * Per-source `crawl` settings win over `defaults`.
 *
 * @param source - Website source (reads `uri` and the optional `crawl` settings).
 * @param defaults - Fallbacks: `userAgent`, `rateLimitMs`, `maxConcurrentRequests`.
 * @param progress - Progress reporter handle forwarded to `reportProgress`.
 * @returns {Promise<string[]>} Normalized URLs in discovery order.
 */
async function crawlWebsite(source, defaults, progress) {
  const baseUrl = new URL(source.uri);
  const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
  const includePatterns = source.crawl?.includePatterns ?? [];
  const excludePatterns = source.crawl?.excludePatterns ?? [];
  const maxDepth = source.crawl?.maxDepth ?? 2;
  const maxPages = source.crawl?.maxPages ?? 100;
  const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
  const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
  const seen = /* @__PURE__ */ new Set();
  const results = [];
  let currentLevel = [normalizeRemoteUrl(source.uri)];
  if (source.crawl?.useSitemap !== false) {
    const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
    reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
    currentLevel = [
      ...currentLevel,
      ...sitemapUrls
    ];
  }
  for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
    reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
    const nextLevelCandidates = [];
    const allowedUrls = [];
    for (const candidate of currentLevel) {
      const normalizedCandidate = normalizeRemoteUrl(candidate);
      if (seen.has(normalizedCandidate)) {
        continue;
      }
      seen.add(normalizedCandidate);
      let url;
      try {
        url = new URL(normalizedCandidate);
      } catch {
        // normalizeRemoteUrl returns unparsable input unchanged (e.g. a
        // malformed sitemap entry); skip it instead of aborting the crawl.
        continue;
      }
      if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
        continue;
      }
      allowedUrls.push(normalizedCandidate);
      results.push(normalizedCandidate);
      reportProgress(progress, `Discovered ${normalizedCandidate}`);
      if (results.length >= maxPages) {
        break;
      }
    }
    reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
    if (depth >= maxDepth || results.length >= maxPages) {
      // The deepest level is recorded but never fetched for further links.
      break;
    }
    await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
      const page = new URL(pageUrl);
      try {
        const response2 = await fetch(page, { headers: { "user-agent": userAgent } });
        const html = await response2.text();
        const $ = load2(html);
        $("a[href]").each((_, element) => {
          const href = $(element).attr("href");
          if (!href) {
            return;
          }
          try {
            nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
          } catch {
            // Unresolvable href: nothing to enqueue.
          }
        });
      } catch (error) {
        // One unreachable page must not discard everything discovered so far;
        // the URL stays in `results`, only its outgoing links are lost.
        reportProgress(progress, `Failed to extract links from ${pageUrl}: ${error instanceof Error ? error.message : String(error)}`);
      }
      if (rateLimitMs > 0) {
        await delay(rateLimitMs);
      }
    });
    currentLevel = nextLevelCandidates;
  }
  return results;
}
|
|
@@ -1961,6 +2209,8 @@ async function ingestRssSource({
|
|
|
1961
2209
|
source,
|
|
1962
2210
|
previous,
|
|
1963
2211
|
nextDocuments,
|
|
2212
|
+
maxConcurrentRequests,
|
|
2213
|
+
onDocumentProcessed,
|
|
1964
2214
|
onFailure
|
|
1965
2215
|
}) {
|
|
1966
2216
|
if (source.crawl?.fetchArticles === false) {
|
|
@@ -1968,11 +2218,12 @@ async function ingestRssSource({
|
|
|
1968
2218
|
}
|
|
1969
2219
|
const xml = await fetchFeedText(source);
|
|
1970
2220
|
const items = await parseRssFeedDocument(xml, source);
|
|
2221
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
1971
2222
|
let added = 0;
|
|
1972
2223
|
let changed = 0;
|
|
1973
2224
|
let unchanged = 0;
|
|
1974
2225
|
let failed = 0;
|
|
1975
|
-
|
|
2226
|
+
await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
|
|
1976
2227
|
try {
|
|
1977
2228
|
const probe = previous.get(stableId("doc", source.id, item.url));
|
|
1978
2229
|
const document = await fetchUrlDocument({
|
|
@@ -1983,28 +2234,40 @@ async function ingestRssSource({
|
|
|
1983
2234
|
sourceUri: source.uri,
|
|
1984
2235
|
publicationDate: item.publicationDate
|
|
1985
2236
|
});
|
|
2237
|
+
if (processedDocumentIds.has(document.id)) {
|
|
2238
|
+
return;
|
|
2239
|
+
}
|
|
2240
|
+
processedDocumentIds.add(document.id);
|
|
2241
|
+
const existingDocument = probe ?? previous.get(document.id);
|
|
1986
2242
|
nextDocuments.set(document.id, document);
|
|
1987
|
-
if (!
|
|
2243
|
+
if (!existingDocument) {
|
|
1988
2244
|
added += 1;
|
|
1989
|
-
|
|
2245
|
+
onDocumentProcessed?.(document.uri, "added");
|
|
2246
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
1990
2247
|
changed += 1;
|
|
2248
|
+
onDocumentProcessed?.(document.uri, "changed");
|
|
1991
2249
|
} else {
|
|
1992
2250
|
unchanged += 1;
|
|
2251
|
+
onDocumentProcessed?.(document.uri, "unchanged");
|
|
1993
2252
|
}
|
|
1994
2253
|
} catch (error) {
|
|
1995
2254
|
failed += 1;
|
|
1996
2255
|
onFailure(item.url, error);
|
|
1997
2256
|
}
|
|
1998
|
-
}
|
|
2257
|
+
});
|
|
1999
2258
|
return { added, changed, unchanged, failed };
|
|
2000
2259
|
}
|
|
2001
2260
|
async function ingestSources({
|
|
2002
2261
|
workspacePath,
|
|
2003
2262
|
sourceIds,
|
|
2004
|
-
changedOnly = false
|
|
2263
|
+
changedOnly = false,
|
|
2264
|
+
progress
|
|
2005
2265
|
}) {
|
|
2006
2266
|
const config = await loadConfig(workspacePath);
|
|
2007
2267
|
const defaultRetentionDays = config.crawler.retentionDays;
|
|
2268
|
+
const defaultUserAgent = config.crawler.defaultUserAgent;
|
|
2269
|
+
const defaultRateLimitMs = config.crawler.rateLimitMs;
|
|
2270
|
+
const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
|
|
2008
2271
|
const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
|
|
2009
2272
|
const existing = await loadDocuments(workspacePath);
|
|
2010
2273
|
const previous = previousMap(existing);
|
|
@@ -2014,20 +2277,38 @@ async function ingestSources({
|
|
|
2014
2277
|
let unchanged = 0;
|
|
2015
2278
|
let failed = 0;
|
|
2016
2279
|
const failures = [];
|
|
2280
|
+
reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
2017
2281
|
for (const source of sources) {
|
|
2282
|
+
const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
|
|
2283
|
+
const sourceBefore = { added, changed, unchanged, failed };
|
|
2284
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
2285
|
+
const reportDocumentOutcome = (uri, outcome) => {
|
|
2286
|
+
const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
|
|
2287
|
+
reportProgress(progress, `${label} ${uri}`);
|
|
2288
|
+
};
|
|
2018
2289
|
const ingestOne = async (uri, producer) => {
|
|
2019
2290
|
try {
|
|
2020
2291
|
const probeId = stableId("doc", source.id, uri);
|
|
2021
2292
|
const earlier = previous.get(probeId);
|
|
2022
2293
|
const document = await producer();
|
|
2294
|
+
if (processedDocumentIds.has(document.id)) {
|
|
2295
|
+
reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
|
|
2296
|
+
return null;
|
|
2297
|
+
}
|
|
2298
|
+
processedDocumentIds.add(document.id);
|
|
2299
|
+
const existingDocument = earlier ?? previous.get(document.id);
|
|
2023
2300
|
nextDocuments.set(document.id, document);
|
|
2024
|
-
if (!
|
|
2301
|
+
if (!existingDocument) {
|
|
2025
2302
|
added += 1;
|
|
2026
|
-
|
|
2303
|
+
reportDocumentOutcome(document.uri, "added");
|
|
2304
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
2027
2305
|
changed += 1;
|
|
2306
|
+
reportDocumentOutcome(document.uri, "changed");
|
|
2028
2307
|
} else {
|
|
2029
2308
|
unchanged += 1;
|
|
2309
|
+
reportDocumentOutcome(document.uri, "unchanged");
|
|
2030
2310
|
}
|
|
2311
|
+
return document;
|
|
2031
2312
|
} catch (error) {
|
|
2032
2313
|
failed += 1;
|
|
2033
2314
|
failures.push({
|
|
@@ -2035,50 +2316,69 @@ async function ingestSources({
|
|
|
2035
2316
|
uri,
|
|
2036
2317
|
message: error instanceof Error ? error.message : String(error)
|
|
2037
2318
|
});
|
|
2319
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
2320
|
+
return null;
|
|
2038
2321
|
}
|
|
2039
2322
|
};
|
|
2040
2323
|
try {
|
|
2324
|
+
reportProgress(progress, `Source ${source.name} (${source.type})`);
|
|
2041
2325
|
if (source.type === "file") {
|
|
2326
|
+
reportProgress(progress, `Reading file ${source.uri}`);
|
|
2042
2327
|
await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
2043
|
-
|
|
2044
|
-
|
|
2045
|
-
|
|
2046
|
-
for (const filePath of
|
|
2328
|
+
} else if (source.type === "directory") {
|
|
2329
|
+
const files = await listDirectoryFiles(source);
|
|
2330
|
+
reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
|
|
2331
|
+
for (const filePath of files) {
|
|
2332
|
+
reportProgress(progress, `Reading file ${filePath}`);
|
|
2047
2333
|
await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
|
|
2048
2334
|
}
|
|
2049
|
-
|
|
2050
|
-
|
|
2051
|
-
if (source.type === "url") {
|
|
2335
|
+
} else if (source.type === "url") {
|
|
2336
|
+
reportProgress(progress, `Fetching ${source.uri}`);
|
|
2052
2337
|
await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2059
|
-
|
|
2060
|
-
|
|
2061
|
-
|
|
2062
|
-
|
|
2338
|
+
} else if (source.type === "website") {
|
|
2339
|
+
reportProgress(progress, `Crawling ${source.uri}`);
|
|
2340
|
+
const urls = await crawlWebsite(source, {
|
|
2341
|
+
userAgent: defaultUserAgent,
|
|
2342
|
+
rateLimitMs: defaultRateLimitMs,
|
|
2343
|
+
maxConcurrentRequests
|
|
2344
|
+
}, progress);
|
|
2345
|
+
reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
|
|
2346
|
+
const seenCanonicalUrls = /* @__PURE__ */ new Set();
|
|
2347
|
+
await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
|
|
2348
|
+
if (seenCanonicalUrls.has(url)) {
|
|
2349
|
+
reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
|
|
2350
|
+
return;
|
|
2351
|
+
}
|
|
2352
|
+
reportProgress(progress, `Fetching ${url}`);
|
|
2353
|
+
const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
|
|
2354
|
+
if (document) {
|
|
2355
|
+
seenCanonicalUrls.add(document.uri);
|
|
2356
|
+
}
|
|
2357
|
+
});
|
|
2358
|
+
} else if (source.type === "rss") {
|
|
2359
|
+
reportProgress(progress, `Fetching feed ${source.uri}`);
|
|
2360
|
+
const result = await ingestRssSource({
|
|
2063
2361
|
workspacePath,
|
|
2064
2362
|
source,
|
|
2065
2363
|
previous,
|
|
2066
2364
|
nextDocuments,
|
|
2365
|
+
maxConcurrentRequests,
|
|
2366
|
+
onDocumentProcessed: reportDocumentOutcome,
|
|
2067
2367
|
onFailure: (uri, error) => {
|
|
2068
2368
|
failures.push({
|
|
2069
2369
|
sourceId: source.id,
|
|
2070
2370
|
uri,
|
|
2071
2371
|
message: error instanceof Error ? error.message : String(error)
|
|
2072
2372
|
});
|
|
2373
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
2073
2374
|
}
|
|
2074
2375
|
});
|
|
2075
|
-
added +=
|
|
2076
|
-
changed +=
|
|
2077
|
-
unchanged +=
|
|
2078
|
-
failed +=
|
|
2079
|
-
|
|
2080
|
-
|
|
2081
|
-
if (source.type === "markdown" || source.type === "text") {
|
|
2376
|
+
added += result.added;
|
|
2377
|
+
changed += result.changed;
|
|
2378
|
+
unchanged += result.unchanged;
|
|
2379
|
+
failed += result.failed;
|
|
2380
|
+
} else if (source.type === "markdown" || source.type === "text") {
|
|
2381
|
+
reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
|
|
2082
2382
|
await ingestOne(source.uri, () => ingestInlineContent({
|
|
2083
2383
|
workspacePath,
|
|
2084
2384
|
source,
|
|
@@ -2095,13 +2395,19 @@ async function ingestSources({
|
|
|
2095
2395
|
uri: source.uri,
|
|
2096
2396
|
message: error instanceof Error ? error.message : String(error)
|
|
2097
2397
|
});
|
|
2398
|
+
reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
|
|
2098
2399
|
}
|
|
2400
|
+
reportProgress(
|
|
2401
|
+
progress,
|
|
2402
|
+
`Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
|
|
2403
|
+
);
|
|
2099
2404
|
}
|
|
2100
2405
|
const expiringDocuments = [...nextDocuments.values()].filter((document) => {
|
|
2101
2406
|
const source = sources.find((candidate) => candidate.id === document.sourceId);
|
|
2102
2407
|
return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
|
|
2103
2408
|
});
|
|
2104
2409
|
if (expiringDocuments.length > 0) {
|
|
2410
|
+
reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
|
|
2105
2411
|
const expiredIds = new Set(expiringDocuments.map((document) => document.id));
|
|
2106
2412
|
for (const document of expiringDocuments) {
|
|
2107
2413
|
nextDocuments.delete(document.id);
|
|
@@ -2128,6 +2434,7 @@ async function ingestSources({
|
|
|
2128
2434
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
2129
2435
|
};
|
|
2130
2436
|
await writeRun(workspacePath, run);
|
|
2437
|
+
reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
|
|
2131
2438
|
return {
|
|
2132
2439
|
runId: id,
|
|
2133
2440
|
documents: { added, changed, unchanged, failed },
|
|
@@ -2137,7 +2444,8 @@ async function ingestSources({
|
|
|
2137
2444
|
async function reprocessDocuments({
|
|
2138
2445
|
workspacePath,
|
|
2139
2446
|
sourceId,
|
|
2140
|
-
documentId
|
|
2447
|
+
documentId,
|
|
2448
|
+
progress
|
|
2141
2449
|
}) {
|
|
2142
2450
|
const documents = await loadDocuments(workspacePath);
|
|
2143
2451
|
const sources = await listSources(workspacePath);
|
|
@@ -2145,15 +2453,20 @@ async function reprocessDocuments({
|
|
|
2145
2453
|
const nextDocuments = new Map(documents.map((document) => [document.id, document]));
|
|
2146
2454
|
let documentsReprocessed = 0;
|
|
2147
2455
|
let documentsSkipped = 0;
|
|
2148
|
-
|
|
2456
|
+
const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
|
|
2457
|
+
reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
|
|
2458
|
+
for (const document of targets) {
|
|
2459
|
+
reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
|
|
2149
2460
|
const source = sourceMap.get(document.sourceId);
|
|
2150
2461
|
if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
|
|
2151
2462
|
documentsSkipped += 1;
|
|
2463
|
+
reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
|
|
2152
2464
|
continue;
|
|
2153
2465
|
}
|
|
2154
2466
|
const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
|
|
2155
2467
|
if (!updated) {
|
|
2156
2468
|
documentsSkipped += 1;
|
|
2469
|
+
reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
|
|
2157
2470
|
continue;
|
|
2158
2471
|
}
|
|
2159
2472
|
nextDocuments.set(updated.id, updated);
|
|
@@ -2173,15 +2486,217 @@ async function reprocessDocuments({
|
|
|
2173
2486
|
},
|
|
2174
2487
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
2175
2488
|
});
|
|
2489
|
+
reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
|
|
2176
2490
|
return { runId: id, documentsReprocessed, documentsSkipped };
|
|
2177
2491
|
}
|
|
2178
2492
|
|
|
2493
|
+
// src/ingest/adapters/website-feed-discovery.ts
|
|
2494
|
+
import { load as load3 } from "cheerio";
|
|
2495
|
+
// Conventional feed locations probed when looking for a site's feed, in probe
// order: site root first, then the /blog and /news sections.
var COMMON_FEED_PATHS = [
  "/feed",
  "/feed.xml",
  "/rss",
  "/rss.xml",
  "/atom.xml",
  "/index.xml",
  "/blog/feed",
  "/blog/feed.xml",
  "/blog/rss.xml",
  "/blog/atom.xml",
  "/blog/index.xml",
  "/news/feed",
  "/news/feed.xml",
  "/news/rss.xml",
  "/news/atom.xml",
  "/news/index.xml"
];
|
|
2513
|
+
/**
 * Resolves a possibly relative href against `baseUrl`.
 *
 * @param {string} href - Link target as found in the page.
 * @param {string|URL} baseUrl - Base used for relative hrefs.
 * @returns {string|null} Absolute URL, or null when the href cannot be
 *   resolved or does not use http/https.
 */
function normalizeCandidateUrl(href, baseUrl) {
  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  const isHttp = resolved.protocol === "http:" || resolved.protocol === "https:";
  return isHttp ? resolved.href : null;
}
|
|
2524
|
+
/**
 * Heuristic: does a <link> look like it points at an RSS/Atom feed?
 *
 * Matches case-insensitively when the MIME type mentions rss, atom or xml, or
 * when the href contains "/feed", "/rss" or "/atom" or ends in ".xml".
 *
 * @param {string|undefined} typeHint - The link's `type` attribute, if any.
 * @param {string} href - The link's `href` attribute.
 * @returns {boolean}
 */
function looksLikeFeedLink(typeHint, href) {
  const type = typeHint?.toLowerCase() ?? "";
  const lowerHref = href.toLowerCase();
  const typeMatches = ["rss", "atom", "xml"].some((marker) => type.includes(marker));
  if (typeMatches) {
    return true;
  }
  return ["/feed", "/rss", "/atom"].some((marker) => lowerHref.includes(marker)) || lowerHref.endsWith(".xml");
}
|
|
2529
|
+
/**
 * Collects feed candidates a page declares via <link rel="alternate">.
 *
 * A link qualifies when its `rel` tokens include "alternate"
 * (case-insensitive), it looks like a feed link, and its href resolves to an
 * http(s) URL.
 *
 * @param {string} html - Page HTML.
 * @param {string|URL} baseUrl - Base for resolving relative hrefs.
 * @returns {Array<{url: string, discoveredBy: "declared", order: number, typeHint: (string|undefined)}>}
 *   Candidates in document order; `order` is the link's index among all
 *   `link[href]` elements.
 */
function extractDeclaredFeedCandidates(html, baseUrl) {
  const $ = load3(html);
  const candidates = [];
  $("link[href]").each((index, element) => {
    const link = $(element);
    const relTokens = (link.attr("rel") ?? "").split(/\s+/).map((value) => value.trim().toLowerCase()).filter(Boolean);
    const href = link.attr("href");
    if (!href || !relTokens.includes("alternate")) {
      return;
    }
    const typeHint = link.attr("type") ?? void 0;
    if (!looksLikeFeedLink(typeHint, href)) {
      return;
    }
    const url = normalizeCandidateUrl(href, baseUrl);
    if (!url) {
      return;
    }
    candidates.push({
      url,
      discoveredBy: "declared",
      order: index,
      typeHint
    });
  });
  return candidates;
}
|
|
2555
|
+
/**
 * Builds one feed candidate per entry of COMMON_FEED_PATHS, resolved against
 * `baseUrl`.
 *
 * @param {string|URL} baseUrl - Website URL the paths are resolved against.
 * @returns {Array<{url: string, discoveredBy: "common", order: number}>}
 *   Candidates in list order; `order` is the path's index in the list.
 */
function buildCommonFeedCandidates(baseUrl) {
  return COMMON_FEED_PATHS.map((pathname, order) => {
    const { href } = new URL(pathname, baseUrl);
    return { url: href, discoveredBy: "common", order };
  });
}
|
|
2562
|
+
/**
 * Removes candidates whose `url` was already seen, keeping the first
 * occurrence and the original order.
 *
 * @param {Array<{url: string}>} candidates
 * @returns {Array<{url: string}>} New array holding the surviving candidate objects.
 */
function dedupeCandidates(candidates) {
  const seenUrls = /* @__PURE__ */ new Set();
  return candidates.filter(({ url }) => {
    if (seenUrls.has(url)) {
      return false;
    }
    seenUrls.add(url);
    return true;
  });
}
|
|
2574
|
+
/**
 * Heuristic: is a fetched response an RSS/Atom/RDF feed?
 *
 * True when the content type mentions rss or atom, or when the body contains
 * an `<rss`, `<feed` or `<rdf:rdf` tag opener (all case-insensitive). A
 * generic XML content type alone is not sufficient.
 *
 * @param {string|null|undefined} contentType - Response Content-Type header.
 * @param {string} body - Response body text.
 * @returns {boolean}
 */
function looksLikeFeedDocument(contentType, body) {
  const type = contentType?.toLowerCase() ?? "";
  if (type.includes("rss") || type.includes("atom")) {
    return true;
  }
  const lowerBody = body.toLowerCase();
  return ["<rss", "<feed", "<rdf:rdf"].some((marker) => lowerBody.includes(marker));
}
|
|
2579
|
+
/**
 * Whether a path segment may be part of a derived path prefix: it must be a
 * non-empty string containing at least one ASCII letter (so purely numeric
 * segments such as "2024" are rejected).
 *
 * @param {unknown} segment
 * @returns {boolean}
 */
function hasStablePrefixSegment(segment) {
  if (typeof segment !== "string" || segment.length === 0) {
    return false;
  }
  return /[a-z]/i.test(segment);
}
|
|
2582
|
+
/**
 * Derives the path prefix shared by a feed's items on the website's origin.
 *
 * Only item URLs on `websiteOrigin` are considered and at least two are
 * required. The prefix is the longest run of leading path segments that is
 * identical across all of them and passes `hasStablePrefixSegment`.
 *
 * @param {string[]} itemUrls - Feed item URLs.
 * @param {string} websiteOrigin - Origin of the website, e.g. "https://example.com".
 * @returns {string|undefined} Prefix such as "/blog/", or undefined when no
 *   prefix can be derived.
 */
function deriveExcludePrefix(itemUrls, websiteOrigin) {
  const paths = [];
  for (const itemUrl of itemUrls) {
    try {
      const parsed = new URL(itemUrl);
      if (parsed.origin === websiteOrigin) {
        paths.push(parsed.pathname.split("/").filter(Boolean));
      }
    } catch {
      // Unparsable item URL: it cannot contribute to the prefix.
    }
  }
  if (paths.length < 2) {
    return void 0;
  }
  const [first] = paths;
  let commonLength = 0;
  while (commonLength < first.length) {
    const segment = first[commonLength];
    const sharedByAll = hasStablePrefixSegment(segment) && paths.every((segments) => segments[commonLength] === segment);
    if (!sharedByAll) {
      break;
    }
    commonLength += 1;
  }
  if (commonLength === 0) {
    return void 0;
  }
  return `/${first.slice(0, commonLength).join("/")}/`;
}
|
|
2614
|
+
/**
 * Ranks a feed candidate; higher scores are tried first.
 *
 * Declared candidates start at 1000, common-path guesses at 100. Later
 * discovery order and deeper paths cost points, an RSS/Atom type hint and a
 * well-known root feed path add points, and paths mentioning "comments" are
 * penalised heavily.
 *
 * @param {{url: string, discoveredBy: string, order: number, typeHint?: string}} candidate
 * @returns {number}
 */
function scoreCandidate(candidate) {
  const { pathname } = new URL(candidate.url);
  const pathDepth = pathname.split("/").filter(Boolean).length;
  const typeHint = candidate.typeHint?.toLowerCase() ?? "";
  let score = candidate.discoveredBy === "declared" ? 1e3 : 100;
  score -= candidate.order;
  score -= pathDepth * 10;
  if (typeHint.includes("rss") || typeHint.includes("atom")) {
    score += 25;
  }
  if (["/feed", "/feed.xml", "/rss", "/rss.xml", "/atom.xml", "/index.xml"].includes(pathname)) {
    score += 50;
  }
  if (pathname.includes("comments")) {
    score -= 200;
  }
  return score;
}
|
|
2631
|
+
/**
 * Fetches a feed candidate and confirms it really is a parseable feed.
 *
 * Validation is best-effort: a non-OK response, a body that does not look
 * like a feed, and any network or parse error all yield null.
 *
 * @param {{url: string, discoveredBy: string}} candidate - Candidate to probe.
 * @param {URL} websiteUrl - Website being inspected; its origin scopes the
 *   derived exclude prefix.
 * @param {string} userAgent - User-Agent header value for the request.
 * @returns {Promise<{feedUrl: string, discoveredBy: string, excludePrefix: (string|undefined)}|null>}
 */
async function validateCandidate(candidate, websiteUrl, userAgent) {
  try {
    const response2 = await fetch(candidate.url, { headers: { "user-agent": userAgent } });
    if (!response2.ok) {
      return null;
    }
    const body = await response2.text();
    if (!looksLikeFeedDocument(response2.headers.get("content-type"), body)) {
      return null;
    }
    // Throwaway source record: the parser needs one, but it is never persisted.
    const probeSource = {
      id: "src_detected_feed",
      type: "rss",
      uri: candidate.url,
      name: "Detected Feed",
      enabled: true,
      tags: [],
      metadata: {},
      createdAt: "1970-01-01T00:00:00.000Z",
      updatedAt: "1970-01-01T00:00:00.000Z"
    };
    const items = await parseRssFeedDocument(body, probeSource);
    const itemUrls = items.map((item) => item.url);
    return {
      feedUrl: candidate.url,
      discoveredBy: candidate.discoveredBy,
      excludePrefix: deriveExcludePrefix(itemUrls, websiteUrl.origin)
    };
  } catch {
    // Deliberate: an unusable candidate is simply not a feed.
    return null;
  }
}
|
|
2662
|
+
/**
 * Tries to find a working feed for a website.
 *
 * Fetches the page, gathers declared and common-path candidates, removes
 * duplicates, orders them by descending score, and returns the first
 * candidate that validates. Candidates are probed one at a time so a
 * higher-ranked feed always wins. Discovery is best-effort: any failure
 * yields null.
 *
 * @param {string|URL} websiteUrl - Website to inspect.
 * @param {string} userAgent - User-Agent header value for all requests.
 * @returns {Promise<object|null>} The validated feed description, or null.
 */
async function discoverWebsiteFeed(websiteUrl, userAgent) {
  try {
    const baseUrl = new URL(websiteUrl);
    const response2 = await fetch(baseUrl, { headers: { "user-agent": userAgent } });
    if (!response2.ok) {
      return null;
    }
    const html = await response2.text();
    const unique = dedupeCandidates([
      ...extractDeclaredFeedCandidates(html, baseUrl),
      ...buildCommonFeedCandidates(baseUrl)
    ]);
    // Score each candidate once instead of on every comparison; sort is
    // stable, so ties keep their discovery order.
    const ranked = unique
      .map((candidate) => ({ candidate, score: scoreCandidate(candidate) }))
      .sort((left, right) => right.score - left.score);
    for (const { candidate } of ranked) {
      const validated = await validateCandidate(candidate, baseUrl, userAgent);
      if (validated) {
        return validated;
      }
    }
    return null;
  } catch {
    // Deliberate: feed discovery must never fail the caller.
    return null;
  }
}
|
|
2685
|
+
|
|
2179
2686
|
// src/query/search-service.ts
|
|
2180
2687
|
import { readFile as readFile11 } from "fs/promises";
|
|
2181
2688
|
import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
|
|
2182
2689
|
import path18 from "path";
|
|
2183
2690
|
/**
 * Loads the latest persisted lexical index state and hydrates a DocumentIndex
 * from it.
 *
 * @param {string} workspacePath - Workspace directory.
 * @returns {Promise<object>} The value returned by `DocumentIndex#loadState`.
 * @throws {CliError} With code "INDEX_MISSING" when no index state exists yet
 *   (the read failed with ENOENT). Any other read error is rethrown unchanged.
 */
async function loadHydratedIndex(workspacePath) {
  let state;
  try {
    state = await readLatestIndexState(workspacePath);
  } catch (error) {
    // Optional chaining keeps a non-object throw from masking the real error.
    if (error?.code === "ENOENT") {
      throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
    }
    throw error;
  }
  // Only "metadata.*" fields are forwarded to the mapping factory.
  const metadataFields = Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata."));
  const mapping = createIndexMapping(metadataFields);
  const { DocumentIndex } = await import("@tryformation/querylight-ts");
  return new DocumentIndex(mapping).loadState(state);
}
|
|
@@ -2417,9 +2932,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2417
2932
|
/**
 * Prepares a title for display: strips a trailing "| Querylight TS Demo" site
 * suffix (case-insensitive), collapses whitespace runs and trims.
 *
 * @param {string} title
 * @returns {string}
 */
function normalizeDisplayTitle(title) {
  const withoutSiteSuffix = title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "");
  return withoutSiteSuffix.replace(/\s+/g, " ").trim();
}
|
|
2935
|
+
var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
|
|
2936
|
+
"choose this instead of",
|
|
2937
|
+
"how xyz runs it",
|
|
2938
|
+
"naechste schritte",
|
|
2939
|
+
"next steps",
|
|
2940
|
+
"overview",
|
|
2941
|
+
"passend wenn",
|
|
2942
|
+
"problem",
|
|
2943
|
+
"right fit",
|
|
2944
|
+
"waehlen sie das stattdessen",
|
|
2945
|
+
"was sie bekommen",
|
|
2946
|
+
"what you get",
|
|
2947
|
+
"wie xyz es umsetzt",
|
|
2948
|
+
"uberblick",
|
|
2949
|
+
"\xFCberblick"
|
|
2950
|
+
]);
|
|
2420
2951
|
function chooseResultTitle(chunk) {
|
|
2421
2952
|
const documentTitle = normalizeDisplayTitle(chunk.title);
|
|
2422
|
-
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(
|
|
2953
|
+
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
|
|
2423
2954
|
const leafHeading = headings.at(-1);
|
|
2424
2955
|
if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
|
|
2425
2956
|
return leafHeading;
|
|
@@ -2441,6 +2972,9 @@ function normalizeUriPath(uri) {
|
|
|
2441
2972
|
return uri.toLowerCase().replace(/\/+$/, "");
|
|
2442
2973
|
}
|
|
2443
2974
|
}
|
|
2975
|
+
function normalizeUriIdentity(uri) {
|
|
2976
|
+
return normalizeRemoteUrl(uri).toLowerCase().replace(/\/+$/, "");
|
|
2977
|
+
}
|
|
2444
2978
|
function uriSpecificity(uri) {
|
|
2445
2979
|
const normalized = normalizeUriPath(uri);
|
|
2446
2980
|
if (normalized === "/") {
|
|
@@ -2457,6 +2991,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
|
|
|
2457
2991
|
if (!candidateTitle || candidateTitle !== existingTitle) {
|
|
2458
2992
|
return false;
|
|
2459
2993
|
}
|
|
2994
|
+
const candidateIdentity = normalizeUriIdentity(candidate.uri);
|
|
2995
|
+
const existingIdentity = normalizeUriIdentity(existing.uri);
|
|
2996
|
+
if (candidateIdentity === existingIdentity) {
|
|
2997
|
+
return candidate.uri.length < existing.uri.length;
|
|
2998
|
+
}
|
|
2460
2999
|
const candidatePath = normalizeUriPath(candidate.uri);
|
|
2461
3000
|
const existingPath = normalizeUriPath(existing.uri);
|
|
2462
3001
|
if (candidatePath === existingPath) {
|
|
@@ -2471,28 +3010,28 @@ function isMoreSpecificDuplicate(candidate, existing) {
|
|
|
2471
3010
|
}
|
|
2472
3011
|
function collapseAggregateDuplicates(results, topK) {
|
|
2473
3012
|
const deduped = [];
|
|
2474
|
-
for (const
|
|
3013
|
+
for (const result of results) {
|
|
2475
3014
|
const duplicateIndex = deduped.findIndex(
|
|
2476
|
-
(existing) => isMoreSpecificDuplicate(
|
|
3015
|
+
(existing) => isMoreSpecificDuplicate(result, existing) || isMoreSpecificDuplicate(existing, result)
|
|
2477
3016
|
);
|
|
2478
3017
|
if (duplicateIndex < 0) {
|
|
2479
|
-
deduped.push(
|
|
3018
|
+
deduped.push(result);
|
|
2480
3019
|
continue;
|
|
2481
3020
|
}
|
|
2482
|
-
if (isMoreSpecificDuplicate(
|
|
2483
|
-
deduped[duplicateIndex] =
|
|
3021
|
+
if (isMoreSpecificDuplicate(result, deduped[duplicateIndex])) {
|
|
3022
|
+
deduped[duplicateIndex] = result;
|
|
2484
3023
|
}
|
|
2485
3024
|
}
|
|
2486
3025
|
return deduped.slice(0, topK);
|
|
2487
3026
|
}
|
|
2488
3027
|
function rerankResultsByDocument(results, topK) {
|
|
2489
3028
|
const byDocument = /* @__PURE__ */ new Map();
|
|
2490
|
-
for (const
|
|
2491
|
-
const existing = byDocument.get(
|
|
3029
|
+
for (const result of results) {
|
|
3030
|
+
const existing = byDocument.get(result.documentId);
|
|
2492
3031
|
if (existing) {
|
|
2493
|
-
existing.push(
|
|
3032
|
+
existing.push(result);
|
|
2494
3033
|
} else {
|
|
2495
|
-
byDocument.set(
|
|
3034
|
+
byDocument.set(result.documentId, [result]);
|
|
2496
3035
|
}
|
|
2497
3036
|
}
|
|
2498
3037
|
const reranked = [...byDocument.values()].flatMap((group) => {
|
|
@@ -2501,7 +3040,7 @@ function rerankResultsByDocument(results, topK) {
|
|
|
2501
3040
|
if (!best) {
|
|
2502
3041
|
return [];
|
|
2503
3042
|
}
|
|
2504
|
-
const tailScore = rest.reduce((sum,
|
|
3043
|
+
const tailScore = rest.reduce((sum, result) => sum + result.score, 0);
|
|
2505
3044
|
const aggregateScore = best.score + tailScore * 0.35 + (group.length - 1) * 0.2;
|
|
2506
3045
|
return [{ ...best, score: aggregateScore }];
|
|
2507
3046
|
}).sort((left, right) => right.score - left.score);
|
|
@@ -2569,7 +3108,6 @@ async function searchIndex({
|
|
|
2569
3108
|
score: 0,
|
|
2570
3109
|
title: chooseResultTitle(chunk),
|
|
2571
3110
|
uri: chunk.uri,
|
|
2572
|
-
headingPath: chunk.headingPath,
|
|
2573
3111
|
snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
|
|
2574
3112
|
document,
|
|
2575
3113
|
config,
|
|
@@ -2584,7 +3122,7 @@ async function searchIndex({
|
|
|
2584
3122
|
};
|
|
2585
3123
|
})
|
|
2586
3124
|
);
|
|
2587
|
-
return { retrievalMode: "lexical", results: latestResults.filter((
|
|
3125
|
+
return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
|
|
2588
3126
|
}
|
|
2589
3127
|
const lexicalHits = async () => {
|
|
2590
3128
|
const index = await loadHydratedIndex(workspacePath);
|
|
@@ -2633,7 +3171,6 @@ async function searchIndex({
|
|
|
2633
3171
|
score,
|
|
2634
3172
|
title: chooseResultTitle(chunk),
|
|
2635
3173
|
uri: chunk.uri,
|
|
2636
|
-
headingPath: chunk.headingPath,
|
|
2637
3174
|
snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
|
|
2638
3175
|
document: documents.get(chunk.documentId),
|
|
2639
3176
|
config,
|
|
@@ -2647,13 +3184,13 @@ async function searchIndex({
|
|
|
2647
3184
|
metadata: chunk.metadata
|
|
2648
3185
|
};
|
|
2649
3186
|
}));
|
|
2650
|
-
const results = rawResults.filter((
|
|
3187
|
+
const results = rawResults.filter((result) => result != null);
|
|
2651
3188
|
return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
|
|
2652
3189
|
}
|
|
2653
3190
|
|
|
2654
3191
|
// src/query/related-service.ts
|
|
2655
3192
|
import path19 from "path";
|
|
2656
|
-
function
|
|
3193
|
+
function cosineSimilarity2(left, right) {
|
|
2657
3194
|
let dot = 0;
|
|
2658
3195
|
let leftNorm = 0;
|
|
2659
3196
|
let rightNorm = 0;
|
|
@@ -2739,7 +3276,7 @@ async function findRelatedDocuments({
|
|
|
2739
3276
|
const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
|
|
2740
3277
|
documentId: candidate.document.id,
|
|
2741
3278
|
sourceId: candidate.document.sourceId,
|
|
2742
|
-
score:
|
|
3279
|
+
score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
|
|
2743
3280
|
title: candidate.document.title,
|
|
2744
3281
|
uri: candidate.document.uri,
|
|
2745
3282
|
metadata: candidate.document.metadata
|
|
@@ -2767,21 +3304,20 @@ async function createContext({
|
|
|
2767
3304
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
2768
3305
|
const sources = [];
|
|
2769
3306
|
let total = 0;
|
|
2770
|
-
for (const
|
|
2771
|
-
const text =
|
|
3307
|
+
for (const result of search.results) {
|
|
3308
|
+
const text = result.text ?? "";
|
|
2772
3309
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
2773
3310
|
break;
|
|
2774
3311
|
}
|
|
2775
3312
|
total += text.length;
|
|
2776
3313
|
sources.push({
|
|
2777
|
-
chunkId:
|
|
2778
|
-
documentId:
|
|
2779
|
-
sourceId:
|
|
2780
|
-
title:
|
|
2781
|
-
uri:
|
|
2782
|
-
headingPath: result2.headingPath,
|
|
3314
|
+
chunkId: result.chunkId,
|
|
3315
|
+
documentId: result.documentId,
|
|
3316
|
+
sourceId: result.sourceId,
|
|
3317
|
+
title: result.title,
|
|
3318
|
+
uri: result.uri,
|
|
2783
3319
|
text,
|
|
2784
|
-
metadata:
|
|
3320
|
+
metadata: result.metadata
|
|
2785
3321
|
});
|
|
2786
3322
|
}
|
|
2787
3323
|
const markdown = [
|
|
@@ -2792,7 +3328,6 @@ async function createContext({
|
|
|
2792
3328
|
`Title: ${source.title}`,
|
|
2793
3329
|
`URL: ${source.uri}`,
|
|
2794
3330
|
`Chunk ID: ${source.chunkId}`,
|
|
2795
|
-
source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
|
|
2796
3331
|
"",
|
|
2797
3332
|
source.text,
|
|
2798
3333
|
""
|
|
@@ -2871,27 +3406,30 @@ function formatSourcesTable(sources) {
|
|
|
2871
3406
|
return table.toString();
|
|
2872
3407
|
}
|
|
2873
3408
|
function formatSearchResults(results) {
|
|
2874
|
-
return results.map((
|
|
2875
|
-
`${index + 1}. ${colors.bold(
|
|
2876
|
-
` ${
|
|
2877
|
-
` Source
|
|
2878
|
-
|
|
2879
|
-
`
|
|
2880
|
-
|
|
2881
|
-
|
|
3409
|
+
return results.map((result, index) => [
|
|
3410
|
+
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3411
|
+
` URL: ${result.uri}`,
|
|
3412
|
+
` Source: ${result.sourceType} | Published: ${result.publicationDate ?? "n/a"} | Score: ${result.score.toFixed(3)}`,
|
|
3413
|
+
"",
|
|
3414
|
+
...result.snippet.split("\n").map((line) => line.length > 0 ? ` ${line}` : "")
|
|
3415
|
+
].join("\n")).join(`
|
|
3416
|
+
|
|
3417
|
+
${colors.dim("---")}
|
|
3418
|
+
|
|
3419
|
+
`);
|
|
2882
3420
|
}
|
|
2883
3421
|
function formatRelatedDocuments(results) {
|
|
2884
|
-
return results.map((
|
|
2885
|
-
`${index + 1}. ${colors.bold(
|
|
2886
|
-
` ${
|
|
2887
|
-
` Similarity: ${
|
|
3422
|
+
return results.map((result, index) => [
|
|
3423
|
+
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3424
|
+
` ${result.uri}`,
|
|
3425
|
+
` Similarity: ${result.score.toFixed(3)}`
|
|
2888
3426
|
].join("\n")).join("\n\n");
|
|
2889
3427
|
}
|
|
2890
3428
|
|
|
2891
3429
|
// src/cli/run-cli.ts
|
|
2892
3430
|
var SOURCE_TYPES = /* @__PURE__ */ new Set(["url", "website", "rss", "file", "directory", "markdown", "text"]);
|
|
2893
3431
|
var RETRIEVAL_MODES = /* @__PURE__ */ new Set(["lexical", "dense", "sparse", "hybrid"]);
|
|
2894
|
-
var SOURCE_TYPE_LIST = ["
|
|
3432
|
+
var SOURCE_TYPE_LIST = ["page", "website", "rss", "file", "directory", "markdown", "text"];
|
|
2895
3433
|
var RETRIEVAL_MODE_LIST = ["lexical", "dense", "sparse", "hybrid"];
|
|
2896
3434
|
var SEARCH_DATE_FIELDS = ["publicationDate", "firstSeenAt", "lastSeenAt", "lastChangedAt", "crawledAt"];
|
|
2897
3435
|
function parseKeyValue(input) {
|
|
@@ -2914,11 +3452,46 @@ function parseOptionalNumber(input, optionName) {
|
|
|
2914
3452
|
}
|
|
2915
3453
|
return value;
|
|
2916
3454
|
}
|
|
3455
|
+
function parseOptionalPositiveInteger(input, optionName) {
|
|
3456
|
+
const value = parseOptionalNumber(input, optionName);
|
|
3457
|
+
if (value === void 0) {
|
|
3458
|
+
return void 0;
|
|
3459
|
+
}
|
|
3460
|
+
if (!Number.isInteger(value) || value < 1) {
|
|
3461
|
+
throw new CliError(`invalid positive integer for ${optionName}: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3462
|
+
}
|
|
3463
|
+
return value;
|
|
3464
|
+
}
|
|
2917
3465
|
function setWhenDefined(target, key, value) {
|
|
2918
3466
|
if (value !== void 0) {
|
|
2919
3467
|
target[key] = value;
|
|
2920
3468
|
}
|
|
2921
3469
|
}
|
|
3470
|
+
function mergePatterns(existing, extra) {
|
|
3471
|
+
const merged = [...existing ?? []];
|
|
3472
|
+
if (extra && !merged.includes(extra)) {
|
|
3473
|
+
merged.push(extra);
|
|
3474
|
+
}
|
|
3475
|
+
return merged.length > 0 ? merged : void 0;
|
|
3476
|
+
}
|
|
3477
|
+
function formatWebsiteSourceAdd(result) {
|
|
3478
|
+
const lines = [`Added source ${result.primarySource.id}`];
|
|
3479
|
+
if (!result.detectedFeed) {
|
|
3480
|
+
lines.push("No feed detected during website registration.");
|
|
3481
|
+
return lines.join("\n");
|
|
3482
|
+
}
|
|
3483
|
+
if (result.detectedFeed.source && result.detectedFeed.wasAdded) {
|
|
3484
|
+
lines.push(`Detected feed ${result.detectedFeed.url} and added source ${result.detectedFeed.source.id}.`);
|
|
3485
|
+
} else if (result.detectedFeed.source) {
|
|
3486
|
+
lines.push(`Detected feed ${result.detectedFeed.url}. Source ${result.detectedFeed.source.id} already exists.`);
|
|
3487
|
+
} else {
|
|
3488
|
+
lines.push(`Detected feed ${result.detectedFeed.url}.`);
|
|
3489
|
+
}
|
|
3490
|
+
if (result.detectedFeed.excludePrefix) {
|
|
3491
|
+
lines.push(`Excluded ${result.detectedFeed.excludePrefix} from the website crawl.`);
|
|
3492
|
+
}
|
|
3493
|
+
return lines.join("\n");
|
|
3494
|
+
}
|
|
2922
3495
|
function createSourceCrawlConfig(type, options, defaults) {
|
|
2923
3496
|
if (!["url", "website", "directory", "rss"].includes(type)) {
|
|
2924
3497
|
return void 0;
|
|
@@ -2926,6 +3499,7 @@ function createSourceCrawlConfig(type, options, defaults) {
|
|
|
2926
3499
|
const crawl = {};
|
|
2927
3500
|
setWhenDefined(crawl, "maxDepth", parseOptionalNumber(options.maxDepth, "--max-depth"));
|
|
2928
3501
|
setWhenDefined(crawl, "maxPages", parseOptionalNumber(options.maxPages, "--max-pages"));
|
|
3502
|
+
setWhenDefined(crawl, "maxConcurrentRequests", parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests"));
|
|
2929
3503
|
setWhenDefined(crawl, "includePatterns", options.include);
|
|
2930
3504
|
setWhenDefined(crawl, "excludePatterns", options.exclude);
|
|
2931
3505
|
setWhenDefined(crawl, "obeyRobotsTxt", options.robots);
|
|
@@ -2944,14 +3518,48 @@ function createSourceCrawlConfig(type, options, defaults) {
|
|
|
2944
3518
|
}
|
|
2945
3519
|
return Object.keys(crawl).length > 0 ? crawl : void 0;
|
|
2946
3520
|
}
|
|
3521
|
+
function validateSourceAddOptions(type, options) {
|
|
3522
|
+
const reject = (optionName) => {
|
|
3523
|
+
throw new CliError(`${optionName} is not supported for source type ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3524
|
+
};
|
|
3525
|
+
if (options.maxDepth !== void 0 && type !== "website") {
|
|
3526
|
+
reject("--max-depth");
|
|
3527
|
+
}
|
|
3528
|
+
if (options.maxPages !== void 0 && type !== "website") {
|
|
3529
|
+
reject("--max-pages");
|
|
3530
|
+
}
|
|
3531
|
+
if (options.maxConcurrentRequests !== void 0 && !["website", "rss"].includes(type)) {
|
|
3532
|
+
reject("--max-concurrent-requests");
|
|
3533
|
+
}
|
|
3534
|
+
if (options.renderJs && type !== "website") {
|
|
3535
|
+
reject("--render-js");
|
|
3536
|
+
}
|
|
3537
|
+
if (options.robots === false && type !== "website") {
|
|
3538
|
+
reject("--no-robots");
|
|
3539
|
+
}
|
|
3540
|
+
if (options.rateLimitMs !== void 0 && type !== "website") {
|
|
3541
|
+
reject("--rate-limit-ms");
|
|
3542
|
+
}
|
|
3543
|
+
if (options.include !== void 0 && !["website", "directory"].includes(type)) {
|
|
3544
|
+
reject("--include");
|
|
3545
|
+
}
|
|
3546
|
+
if (options.exclude !== void 0 && !["website", "directory"].includes(type)) {
|
|
3547
|
+
reject("--exclude");
|
|
3548
|
+
}
|
|
3549
|
+
if (options.retentionDays !== void 0 && type !== "rss") {
|
|
3550
|
+
reject("--retention-days");
|
|
3551
|
+
}
|
|
3552
|
+
}
|
|
2947
3553
|
function allowedSourceConfigFields(source) {
|
|
2948
3554
|
const fields = /* @__PURE__ */ new Set(["name", "tag", "metadata"]);
|
|
2949
3555
|
if (source.type === "rss") {
|
|
2950
3556
|
fields.add("retentionDays");
|
|
3557
|
+
fields.add("maxConcurrentRequests");
|
|
2951
3558
|
}
|
|
2952
3559
|
if (source.type === "website") {
|
|
2953
3560
|
fields.add("maxDepth");
|
|
2954
3561
|
fields.add("maxPages");
|
|
3562
|
+
fields.add("maxConcurrentRequests");
|
|
2955
3563
|
fields.add("include");
|
|
2956
3564
|
fields.add("exclude");
|
|
2957
3565
|
}
|
|
@@ -2987,6 +3595,10 @@ function buildSourceConfigPatch(source, options) {
|
|
|
2987
3595
|
checkAllowed("maxPages", "--max-pages");
|
|
2988
3596
|
crawlPatch.maxPages = parseOptionalNumber(options.maxPages, "--max-pages");
|
|
2989
3597
|
}
|
|
3598
|
+
if (options.maxConcurrentRequests !== void 0) {
|
|
3599
|
+
checkAllowed("maxConcurrentRequests", "--max-concurrent-requests");
|
|
3600
|
+
crawlPatch.maxConcurrentRequests = parseOptionalPositiveInteger(options.maxConcurrentRequests, "--max-concurrent-requests");
|
|
3601
|
+
}
|
|
2990
3602
|
if (options.include !== void 0) {
|
|
2991
3603
|
checkAllowed("include", "--include");
|
|
2992
3604
|
crawlPatch.includePatterns = options.include;
|
|
@@ -3016,6 +3628,50 @@ function response(command, workspace, data, error) {
|
|
|
3016
3628
|
}
|
|
3017
3629
|
function writeOutput(capture, value, stderr = false) {
|
|
3018
3630
|
(stderr ? capture.stderr : capture.stdout).push(value);
|
|
3631
|
+
if (stderr) {
|
|
3632
|
+
capture.onStderr?.(value);
|
|
3633
|
+
return;
|
|
3634
|
+
}
|
|
3635
|
+
capture.onStdout?.(value);
|
|
3636
|
+
}
|
|
3637
|
+
function createProgressHandler(capture, options) {
|
|
3638
|
+
if (options.json || options.silent || options.quiet) {
|
|
3639
|
+
return void 0;
|
|
3640
|
+
}
|
|
3641
|
+
return (level, message) => {
|
|
3642
|
+
if (level === "detail" && !options.verbose) {
|
|
3643
|
+
return;
|
|
3644
|
+
}
|
|
3645
|
+
writeOutput(capture, message, true);
|
|
3646
|
+
};
|
|
3647
|
+
}
|
|
3648
|
+
async function runIngestCommand({
|
|
3649
|
+
workspace,
|
|
3650
|
+
sourceId,
|
|
3651
|
+
changedOnly,
|
|
3652
|
+
dense,
|
|
3653
|
+
sparse,
|
|
3654
|
+
progress
|
|
3655
|
+
}) {
|
|
3656
|
+
progress?.("info", "Ingest step 1/3: fetch and normalize");
|
|
3657
|
+
const ingest = await ingestSources({
|
|
3658
|
+
workspacePath: workspace,
|
|
3659
|
+
sourceIds: sourceId ? [sourceId] : void 0,
|
|
3660
|
+
changedOnly,
|
|
3661
|
+
progress
|
|
3662
|
+
});
|
|
3663
|
+
progress?.("info", "Ingest step 2/3: chunk affected documents");
|
|
3664
|
+
const chunk = await chunkDocuments({ workspacePath: workspace, sourceId, progress });
|
|
3665
|
+
progress?.("info", "Ingest step 3/3: refresh index");
|
|
3666
|
+
const indexBuild = await buildIndex({
|
|
3667
|
+
workspacePath: workspace,
|
|
3668
|
+
denseOverride: dense ? true : void 0,
|
|
3669
|
+
sparseOverride: sparse ? true : void 0,
|
|
3670
|
+
buildAvailableModels: true,
|
|
3671
|
+
progress
|
|
3672
|
+
});
|
|
3673
|
+
progress?.("info", "Ingest complete");
|
|
3674
|
+
return { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
|
|
3019
3675
|
}
|
|
3020
3676
|
function parseRetrievalMode(input) {
|
|
3021
3677
|
if (!input) {
|
|
@@ -3030,10 +3686,11 @@ function parseSourceType(input) {
|
|
|
3030
3686
|
if (!input) {
|
|
3031
3687
|
return void 0;
|
|
3032
3688
|
}
|
|
3033
|
-
|
|
3689
|
+
const normalized = input === "page" ? "url" : input;
|
|
3690
|
+
if (!SOURCE_TYPES.has(normalized)) {
|
|
3034
3691
|
throw new CliError(`unsupported source type: ${input}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3035
3692
|
}
|
|
3036
|
-
return
|
|
3693
|
+
return normalized;
|
|
3037
3694
|
}
|
|
3038
3695
|
function parseCommaSeparatedList(input) {
|
|
3039
3696
|
const values = (input ?? "").split(",").map((value) => value.trim()).filter(Boolean);
|
|
@@ -3094,56 +3751,96 @@ function workspaceFromArgv(argv) {
|
|
|
3094
3751
|
}
|
|
3095
3752
|
return path21.resolve(DEFAULT_WORKSPACE);
|
|
3096
3753
|
}
|
|
3097
|
-
async function runCli(argv) {
|
|
3098
|
-
const capture = { stdout: [], stderr: [] };
|
|
3754
|
+
async function runCli(argv, io = {}) {
|
|
3755
|
+
const capture = { stdout: [], stderr: [], ...io };
|
|
3099
3756
|
const program = new Command();
|
|
3100
|
-
program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--verbose", "Print more operational detail when a command supports it.").
|
|
3757
|
+
program.name("qli").description("Build and query a local Querylight workspace from files, directories, URLs, websites, and feeds.").showHelpAfterError().option("--workspace <path>", "Workspace directory. Defaults to .kb in the current directory.", DEFAULT_WORKSPACE).option("--config <path>", "Optional config file override. Useful for testing alternate retrieval settings.").option("--json", "Return a stable JSON envelope for automation and agents.").option("--silent", "Suppress progress logging for long-running commands.").option("--verbose", "Print more operational detail when a command supports it.").addOption(new Option("--quiet", "Deprecated alias for --silent.").hideHelp());
|
|
3101
3758
|
program.addHelpText("after", `
|
|
3102
3759
|
Workflow:
|
|
3103
3760
|
1. Initialize a workspace with qli init
|
|
3104
3761
|
2. Register one or more sources with qli source add
|
|
3105
|
-
3.
|
|
3762
|
+
3. Refresh the workspace with qli ingest
|
|
3106
3763
|
4. Query it with qli search, qli related, or qli context
|
|
3107
3764
|
|
|
3108
3765
|
Examples:
|
|
3109
3766
|
qli init
|
|
3110
3767
|
qli source add directory ./docs --name "Product Docs" --tag docs
|
|
3111
|
-
qli
|
|
3768
|
+
qli ingest
|
|
3769
|
+
qli rebuild --silent
|
|
3112
3770
|
qli search "api authentication" --top-k 8
|
|
3113
3771
|
qli context "How do API keys work?" --top-k 8 --max-chars 8000
|
|
3114
3772
|
|
|
3773
|
+
Long-running commands print progress to stderr by default. Use --silent to suppress it.
|
|
3774
|
+
Use --json when another tool needs stable structured output.
|
|
3775
|
+
|
|
3115
3776
|
Use qli <command> --help for command-specific options and examples.`);
|
|
3116
|
-
program.command("init").description("Create a new workspace with the default directory layout and config.").option("--force").addHelpText("after", `
|
|
3777
|
+
program.command("init").description("Create a new workspace with the default directory layout and config, then pull missing retrieval models.").option("--force").addHelpText("after", `
|
|
3117
3778
|
Examples:
|
|
3118
3779
|
qli init
|
|
3119
3780
|
qli init --workspace ./kb
|
|
3120
|
-
qli init --workspace /tmp/querylight --force
|
|
3781
|
+
qli init --workspace /tmp/querylight --force
|
|
3782
|
+
|
|
3783
|
+
Notes:
|
|
3784
|
+
init enables dense and sparse retrieval in new workspaces.
|
|
3785
|
+
init pulls missing model assets for enabled retrieval modes.
|
|
3786
|
+
Sparse model downloads require uv. If uv is not available, init skips the sparse pull.`).action(async function command(options) {
|
|
3787
|
+
const global = this.optsWithGlobals();
|
|
3121
3788
|
const workspace = await resolveWorkspace({ workspace: this.optsWithGlobals().workspace });
|
|
3122
|
-
const
|
|
3123
|
-
|
|
3789
|
+
const result = await ensureWorkspace({ workspacePath: workspace, force: Boolean(options.force) });
|
|
3790
|
+
const config = await loadConfig(workspace, global.config);
|
|
3791
|
+
const status = await getModelStatus(workspace, config);
|
|
3792
|
+
const { pullDense, pullSparse } = resolveMissingConfiguredModelPullPlan({ config, status });
|
|
3793
|
+
if (pullDense || pullSparse) {
|
|
3794
|
+
await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
|
|
3795
|
+
}
|
|
3796
|
+
emit(this.optsWithGlobals().json, capture, response("init", workspace, result), `Initialized workspace at ${workspace}`);
|
|
3124
3797
|
});
|
|
3125
3798
|
const source = program.command("source");
|
|
3126
3799
|
source.description("Register, inspect, and manage workspace sources.");
|
|
3127
|
-
source.command("add").description("Add a source definition. The source is enabled immediately.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
|
|
3800
|
+
source.command("add").description("Add a source definition. The source is enabled immediately. Use `page` for one page and `website` for multi-page crawling and feed detection.").argument("<type>", `Source type: ${SOURCE_TYPE_LIST.join(", ")}`).argument("<uri>", "Local path, URL, feed URL, or inline content depending on the source type.").requiredOption("--name <name>").option("--tag <tag...>", "Optional tags used later for filtering during search.").option("--metadata <key=value...>", "Extra metadata fields stored on the source.").option("--max-depth <n>", "Maximum crawl depth for website sources.").option("--max-pages <n>", "Maximum number of pages to ingest from a website source.").option("--max-concurrent-requests <n>", "Maximum remote requests in flight for a website or feed source.").option("--include <pattern...>", "Only include matching paths or URLs.").option("--exclude <pattern...>", "Skip matching paths or URLs.").option("--render-js", "Render pages with JavaScript before extraction when supported.").option("--no-robots", "Ignore robots.txt for website crawling. Use only when you control the target site or have permission.").option("--rate-limit-ms <n>", "Delay between website requests.").option("--retention-days <n>", "Retention window in days for RSS items. Defaults to the workspace crawler retention setting.").addHelpText("after", `
|
|
3128
3801
|
Examples:
|
|
3129
3802
|
qli source add directory ./docs --name "Local Docs" --tag docs
|
|
3130
3803
|
qli source add file ./docs/auth.md --name "Auth Guide"
|
|
3131
|
-
qli source add
|
|
3804
|
+
qli source add page https://example.com/docs/auth --name "Auth Page"
|
|
3132
3805
|
qli source add website https://example.com --name "Docs Site" --max-depth 2 --max-pages 50 --include /docs/
|
|
3806
|
+
qli source add website https://example.com --name "Docs Site" --max-concurrent-requests 8
|
|
3807
|
+
qli source add website https://example.com --name "Example Site" --json
|
|
3133
3808
|
qli source add rss https://example.com/feed.xml --name "Release Feed"
|
|
3809
|
+
qli source add rss https://example.com/feed.xml --name "Release Feed" --max-concurrent-requests 3
|
|
3134
3810
|
qli source add rss https://example.com/feed.xml --name "Release Feed" --retention-days 30
|
|
3135
3811
|
|
|
3136
3812
|
Notes:
|
|
3813
|
+
page stores one page. It does not crawl links or detect feeds.
|
|
3814
|
+
Website sources may detect one blog or news feed during registration.
|
|
3815
|
+
When a feed is added, qli also excludes the feed item prefix from the website crawl when it can infer one.
|
|
3816
|
+
Website and RSS sources default to 5 remote requests in flight per source unless config.yaml or source settings override it.
|
|
3817
|
+
Use --json when automation needs the full list of created sources.
|
|
3137
3818
|
RSS sources store retention per feed.
|
|
3138
|
-
When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(
|
|
3819
|
+
When you omit --retention-days for RSS, qli stores the workspace default from config.yaml.`).action(async function command(typeInput, uri, options) {
|
|
3820
|
+
const type = parseSourceType(typeInput);
|
|
3821
|
+
if (!type) {
|
|
3822
|
+
throw new CliError(`unsupported source type: ${typeInput}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3823
|
+
}
|
|
3139
3824
|
if (!SOURCE_TYPES.has(type)) {
|
|
3140
3825
|
throw new CliError(`unsupported source type: ${type}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3141
3826
|
}
|
|
3827
|
+
validateSourceAddOptions(type, options);
|
|
3142
3828
|
const global = this.optsWithGlobals();
|
|
3143
3829
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3144
3830
|
const config = await loadConfig(workspace, global.config);
|
|
3145
3831
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
3146
|
-
const
|
|
3832
|
+
const initialCrawl = createSourceCrawlConfig(type, options, { retentionDays: config.crawler.retentionDays });
|
|
3833
|
+
let crawl = initialCrawl;
|
|
3834
|
+
let detectedFeed = null;
|
|
3835
|
+
if (type === "website") {
|
|
3836
|
+
detectedFeed = await discoverWebsiteFeed(uri, config.crawler.defaultUserAgent);
|
|
3837
|
+
if (detectedFeed?.excludePrefix) {
|
|
3838
|
+
crawl = {
|
|
3839
|
+
...crawl ?? {},
|
|
3840
|
+
excludePatterns: mergePatterns(crawl?.excludePatterns, detectedFeed.excludePrefix)
|
|
3841
|
+
};
|
|
3842
|
+
}
|
|
3843
|
+
}
|
|
3147
3844
|
const stored = await addSource(workspace, {
|
|
3148
3845
|
type,
|
|
3149
3846
|
uri: ["file", "directory"].includes(type) ? path21.resolve(uri) : uri,
|
|
@@ -3155,11 +3852,50 @@ Notes:
|
|
|
3155
3852
|
createdAt: now,
|
|
3156
3853
|
updatedAt: now
|
|
3157
3854
|
});
|
|
3158
|
-
|
|
3855
|
+
if (type !== "website") {
|
|
3856
|
+
emit(global.json, capture, response("source add", workspace, stored), `Added source ${stored.id}`);
|
|
3857
|
+
return;
|
|
3858
|
+
}
|
|
3859
|
+
let feedSource;
|
|
3860
|
+
let feedWasAdded = false;
|
|
3861
|
+
if (detectedFeed) {
|
|
3862
|
+
const existingSources = await listSources(workspace);
|
|
3863
|
+
feedSource = existingSources.find((source2) => source2.uri === detectedFeed?.feedUrl);
|
|
3864
|
+
if (!feedSource) {
|
|
3865
|
+
feedSource = await addSource(workspace, {
|
|
3866
|
+
type: "rss",
|
|
3867
|
+
uri: detectedFeed.feedUrl,
|
|
3868
|
+
name: `${options.name} Feed`,
|
|
3869
|
+
enabled: true,
|
|
3870
|
+
tags: options.tag ?? [],
|
|
3871
|
+
metadata: normalizeMetadata(options.metadata),
|
|
3872
|
+
crawl: {
|
|
3873
|
+
retentionDays: config.crawler.retentionDays,
|
|
3874
|
+
fetchArticles: true
|
|
3875
|
+
},
|
|
3876
|
+
createdAt: now,
|
|
3877
|
+
updatedAt: now
|
|
3878
|
+
});
|
|
3879
|
+
feedWasAdded = true;
|
|
3880
|
+
}
|
|
3881
|
+
}
|
|
3882
|
+
const result = {
|
|
3883
|
+
primarySource: stored,
|
|
3884
|
+
addedSources: [stored, ...feedWasAdded && feedSource ? [feedSource] : []],
|
|
3885
|
+
detectedFeed: detectedFeed ? {
|
|
3886
|
+
url: detectedFeed.feedUrl,
|
|
3887
|
+
discoveredBy: detectedFeed.discoveredBy,
|
|
3888
|
+
excludePrefix: detectedFeed.excludePrefix,
|
|
3889
|
+
source: feedSource,
|
|
3890
|
+
wasAdded: feedWasAdded
|
|
3891
|
+
} : null
|
|
3892
|
+
};
|
|
3893
|
+
emit(global.json, capture, response("source add", workspace, result), formatWebsiteSourceAdd(result));
|
|
3159
3894
|
});
|
|
3160
|
-
source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
|
|
3895
|
+
source.command("config").description("Edit supported settings on an existing source.").argument("<sourceId>", "Source id from qli source list.").option("--name <name>", "Update the source name.").option("--tag <tag...>", "Replace source tags with the provided values.").option("--metadata <key=value...>", "Merge metadata keys into the existing source metadata.").option("--max-depth <n>", "Set website crawl depth.").option("--max-pages <n>", "Set the page limit for website sources.").option("--max-concurrent-requests <n>", "Set the remote request concurrency limit for website or feed sources.").option("--include <pattern...>", "Set include patterns for website or directory sources.").option("--exclude <pattern...>", "Set exclude patterns for website or directory sources.").option("--retention-days <n>", "Set RSS retention in days for this feed.").addHelpText("after", `
|
|
3161
3896
|
Examples:
|
|
3162
3897
|
qli source config src_123 --retention-days 30
|
|
3898
|
+
qli source config src_123 --max-concurrent-requests 2
|
|
3163
3899
|
qli source config src_123 --name "Docs Feed" --tag rss docs
|
|
3164
3900
|
qli source config src_123 --include /docs/ --exclude /docs/archive/
|
|
3165
3901
|
qli source config src_123 --metadata team=docs owner=platform --json
|
|
@@ -3218,35 +3954,56 @@ Examples:
|
|
|
3218
3954
|
const updated = await updateSource(workspace, sourceId, { enabled: true, updatedAt: (/* @__PURE__ */ new Date()).toISOString() });
|
|
3219
3955
|
emit(global.json, capture, response("source enable", workspace, updated), `Enabled source ${sourceId}`);
|
|
3220
3956
|
});
|
|
3221
|
-
program.command("ingest").description("Fetch
|
|
3957
|
+
program.command("ingest").description("Fetch source content, update affected chunks, and refresh retrieval indexes.").option("--source <sourceId>", "Only ingest one source.").option("--changed-only", "Skip content that has not changed since the last run.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
|
|
3222
3958
|
Examples:
|
|
3223
3959
|
qli ingest
|
|
3224
3960
|
qli ingest --source src_123
|
|
3225
|
-
qli ingest --changed-only
|
|
3961
|
+
qli ingest --changed-only
|
|
3962
|
+
qli ingest --dense --sparse
|
|
3963
|
+
qli ingest --silent`).action(async function command(options) {
|
|
3226
3964
|
const global = this.optsWithGlobals();
|
|
3227
3965
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3228
|
-
const
|
|
3229
|
-
|
|
3966
|
+
const result = await runIngestCommand({
|
|
3967
|
+
workspace,
|
|
3968
|
+
sourceId: options.source,
|
|
3969
|
+
changedOnly: Boolean(options.changedOnly),
|
|
3970
|
+
dense: Boolean(options.dense),
|
|
3971
|
+
sparse: Boolean(options.sparse),
|
|
3972
|
+
progress: createProgressHandler(capture, global)
|
|
3973
|
+
});
|
|
3974
|
+
emit(global.json, capture, response("ingest", workspace, result), `Processed ${result.ingest.processedSources} sources, wrote ${result.chunk.chunksWritten} chunks`);
|
|
3230
3975
|
});
|
|
3231
3976
|
program.command("chunk").description("Split normalized documents into retrieval chunks.").option("--source <sourceId>", "Only chunk documents from one source.").option("--document <documentId>", "Only chunk one document.").addHelpText("after", `
|
|
3232
3977
|
Examples:
|
|
3233
3978
|
qli chunk
|
|
3234
3979
|
qli chunk --source src_123
|
|
3235
|
-
qli chunk --document doc_123
|
|
3980
|
+
qli chunk --document doc_123
|
|
3981
|
+
qli chunk --silent`).action(async function command(options) {
|
|
3236
3982
|
const global = this.optsWithGlobals();
|
|
3237
3983
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3238
|
-
const
|
|
3239
|
-
|
|
3984
|
+
const result = await chunkDocuments({
|
|
3985
|
+
workspacePath: workspace,
|
|
3986
|
+
sourceId: options.source,
|
|
3987
|
+
documentId: options.document,
|
|
3988
|
+
progress: createProgressHandler(capture, global)
|
|
3989
|
+
});
|
|
3990
|
+
emit(global.json, capture, response("chunk", workspace, result), `Wrote ${result.chunksWritten} chunks`);
|
|
3240
3991
|
});
|
|
3241
3992
|
program.command("reprocess").description("Re-run normalization for existing documents without fetching sources again.").option("--source <sourceId>", "Only reprocess documents from one source.").option("--document <documentId>", "Only reprocess one document.").addHelpText("after", `
|
|
3242
3993
|
Examples:
|
|
3243
3994
|
qli reprocess
|
|
3244
3995
|
qli reprocess --source src_123
|
|
3245
|
-
qli reprocess --document doc_123
|
|
3996
|
+
qli reprocess --document doc_123
|
|
3997
|
+
qli reprocess --silent`).action(async function command(options) {
|
|
3246
3998
|
const global = this.optsWithGlobals();
|
|
3247
3999
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3248
|
-
const
|
|
3249
|
-
|
|
4000
|
+
const result = await reprocessDocuments({
|
|
4001
|
+
workspacePath: workspace,
|
|
4002
|
+
sourceId: options.source,
|
|
4003
|
+
documentId: options.document,
|
|
4004
|
+
progress: createProgressHandler(capture, global)
|
|
4005
|
+
});
|
|
4006
|
+
emit(global.json, capture, response("reprocess", workspace, result), `Reprocessed ${result.documentsReprocessed} documents`);
|
|
3250
4007
|
});
|
|
3251
4008
|
const index = program.command("index");
|
|
3252
4009
|
index.description("Build and inspect retrieval indexes.");
|
|
@@ -3254,33 +4011,47 @@ Examples:
|
|
|
3254
4011
|
Examples:
|
|
3255
4012
|
qli index build
|
|
3256
4013
|
qli index build --dense
|
|
3257
|
-
qli index build --dense --sparse
|
|
4014
|
+
qli index build --dense --sparse
|
|
4015
|
+
qli index build --silent`).action(async function command(options) {
|
|
3258
4016
|
const global = this.optsWithGlobals();
|
|
3259
4017
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3260
|
-
const
|
|
4018
|
+
const result = await buildIndex({
|
|
3261
4019
|
workspacePath: workspace,
|
|
3262
4020
|
denseOverride: options.dense ? true : void 0,
|
|
3263
|
-
sparseOverride: options.sparse ? true : void 0
|
|
4021
|
+
sparseOverride: options.sparse ? true : void 0,
|
|
4022
|
+
progress: createProgressHandler(capture, global)
|
|
3264
4023
|
});
|
|
3265
|
-
emit(global.json, capture, response("index build", workspace,
|
|
4024
|
+
emit(global.json, capture, response("index build", workspace, result), `Built index at ${result.indexPath}`);
|
|
3266
4025
|
});
|
|
3267
4026
|
program.command("rebuild").description("Run ingest, chunk, and index build in one command.").option("--source <sourceId>", "Only rebuild data for one source.").option("--changed-only", "Only ingest changed content before chunking and indexing.").option("--dense", "Force a dense vector build if the dense model is available.").option("--sparse", "Force a sparse vector build if the sparse runtime is available.").addHelpText("after", `
|
|
3268
4027
|
Examples:
|
|
3269
4028
|
qli rebuild
|
|
3270
4029
|
qli rebuild --changed-only
|
|
3271
4030
|
qli rebuild --source src_123
|
|
3272
|
-
qli rebuild --dense --sparse
|
|
4031
|
+
qli rebuild --dense --sparse
|
|
4032
|
+
qli rebuild --silent`).action(async function command(options) {
|
|
3273
4033
|
const global = this.optsWithGlobals();
|
|
3274
4034
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3275
|
-
const
|
|
3276
|
-
|
|
4035
|
+
const progress = createProgressHandler(capture, global);
|
|
4036
|
+
progress?.("info", "Rebuild step 1/3: ingest");
|
|
4037
|
+
const ingest = await ingestSources({
|
|
4038
|
+
workspacePath: workspace,
|
|
4039
|
+
sourceIds: options.source ? [options.source] : void 0,
|
|
4040
|
+
changedOnly: Boolean(options.changedOnly),
|
|
4041
|
+
progress
|
|
4042
|
+
});
|
|
4043
|
+
progress?.("info", "Rebuild step 2/3: chunk");
|
|
4044
|
+
const chunk = await chunkDocuments({ workspacePath: workspace, sourceId: options.source, progress });
|
|
4045
|
+
progress?.("info", "Rebuild step 3/3: index");
|
|
3277
4046
|
const indexBuild = await buildIndex({
|
|
3278
4047
|
workspacePath: workspace,
|
|
3279
4048
|
denseOverride: options.dense ? true : void 0,
|
|
3280
4049
|
sparseOverride: options.sparse ? true : void 0,
|
|
3281
|
-
buildAvailableModels: true
|
|
4050
|
+
buildAvailableModels: true,
|
|
4051
|
+
progress
|
|
3282
4052
|
});
|
|
3283
4053
|
const data = { ingest, chunk, indexPath: indexBuild.indexPath, metadata: indexBuild.metadata };
|
|
4054
|
+
progress?.("info", "Rebuild complete");
|
|
3284
4055
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
3285
4056
|
});
|
|
3286
4057
|
program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
@@ -3291,7 +4062,7 @@ Examples:
|
|
|
3291
4062
|
qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
|
|
3292
4063
|
qli search "billing" --metadata team=support
|
|
3293
4064
|
qli search "embedding model" --retrieval hybrid --show-chunks
|
|
3294
|
-
qli search --source-type rss,
|
|
4065
|
+
qli search --source-type rss,page --top-k 25 --json
|
|
3295
4066
|
|
|
3296
4067
|
Notes:
|
|
3297
4068
|
lexical works without vector models.
|
|
@@ -3299,7 +4070,7 @@ Notes:
|
|
|
3299
4070
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
3300
4071
|
const global = this.optsWithGlobals();
|
|
3301
4072
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3302
|
-
const
|
|
4073
|
+
const result = await searchIndex({
|
|
3303
4074
|
workspacePath: workspace,
|
|
3304
4075
|
query: query ?? "",
|
|
3305
4076
|
topK: Number(options.topK),
|
|
@@ -3314,7 +4085,7 @@ Notes:
|
|
|
3314
4085
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
3315
4086
|
showChunks: Boolean(options.showChunks)
|
|
3316
4087
|
});
|
|
3317
|
-
emit(global.json, capture, response("search", workspace,
|
|
4088
|
+
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
|
|
3318
4089
|
});
|
|
3319
4090
|
program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
|
|
3320
4091
|
Examples:
|
|
@@ -3326,12 +4097,12 @@ Dense vectors usually produce better related-document results. Pull models and r
|
|
|
3326
4097
|
qli rebuild --dense`).action(async function command(document, options) {
|
|
3327
4098
|
const global = this.optsWithGlobals();
|
|
3328
4099
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3329
|
-
const
|
|
4100
|
+
const result = await findRelatedDocuments({
|
|
3330
4101
|
workspacePath: workspace,
|
|
3331
4102
|
document,
|
|
3332
4103
|
topK: Number(options.topK)
|
|
3333
4104
|
});
|
|
3334
|
-
emit(global.json, capture, response("related", workspace,
|
|
4105
|
+
emit(global.json, capture, response("related", workspace, result), formatRelatedDocuments(result.results));
|
|
3335
4106
|
});
|
|
3336
4107
|
program.command("context").description("Assemble retrieval context for an external LLM, agent, or prompt pipeline.").argument("<query>").option("--top-k <n>", "Maximum number of source passages to consider.", "12").option("--max-chars <n>", "Maximum output length for the rendered context block.", "12000").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).addHelpText("after", `
|
|
3337
4108
|
Examples:
|
|
@@ -3342,14 +4113,14 @@ Examples:
|
|
|
3342
4113
|
Use --json when another tool needs structured access to the raw passages and metadata.`).action(async function command(query, options) {
|
|
3343
4114
|
const global = this.optsWithGlobals();
|
|
3344
4115
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3345
|
-
const
|
|
4116
|
+
const result = await createContext({
|
|
3346
4117
|
workspacePath: workspace,
|
|
3347
4118
|
query,
|
|
3348
4119
|
topK: Number(options.topK),
|
|
3349
4120
|
maxChars: Number(options.maxChars),
|
|
3350
4121
|
retrievalMode: parseRetrievalMode(options.retrieval)
|
|
3351
4122
|
});
|
|
3352
|
-
emit(global.json, capture, response("context", workspace,
|
|
4123
|
+
emit(global.json, capture, response("context", workspace, result), result.markdown);
|
|
3353
4124
|
});
|
|
3354
4125
|
const models = program.command("models");
|
|
3355
4126
|
models.description("Inspect and download retrieval model assets.");
|
|
@@ -3358,7 +4129,9 @@ Examples:
|
|
|
3358
4129
|
qli models pull
|
|
3359
4130
|
qli models pull --dense
|
|
3360
4131
|
qli models pull --sparse
|
|
4132
|
+
qli models pull --silent
|
|
3361
4133
|
|
|
4134
|
+
Pulled model assets are shared under ~/.qli by default.
|
|
3362
4135
|
If you plan to use related, dense search, or hybrid retrieval, pull the models and rebuild the index first.`).action(async function command(options) {
|
|
3363
4136
|
const global = this.optsWithGlobals();
|
|
3364
4137
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
@@ -3369,17 +4142,27 @@ If you plan to use related, dense search, or hybrid retrieval, pull the models a
|
|
|
3369
4142
|
pullSparseFlag: Boolean(options.sparse),
|
|
3370
4143
|
uvAvailable: status.sparse.uvAvailable
|
|
3371
4144
|
});
|
|
3372
|
-
await pullModels({ workspacePath: workspace, config, pullDense, pullSparse });
|
|
4145
|
+
await pullModels({ workspacePath: workspace, config, pullDense, pullSparse, progress: createProgressHandler(capture, global) });
|
|
3373
4146
|
const data = {
|
|
3374
|
-
dense: pullDense ? {
|
|
3375
|
-
|
|
4147
|
+
dense: pullDense ? {
|
|
4148
|
+
pulled: true,
|
|
4149
|
+
modelId: config.retrieval.dense.modelId,
|
|
4150
|
+
cacheDir: resolveCacheDir(workspace, config.retrieval.dense.cacheDir)
|
|
4151
|
+
} : void 0,
|
|
4152
|
+
sparse: pullSparse ? {
|
|
4153
|
+
pulled: true,
|
|
4154
|
+
modelId: config.retrieval.sparse.modelId,
|
|
4155
|
+
cacheDir: resolveCacheDir(workspace, config.retrieval.sparse.cacheDir)
|
|
4156
|
+
} : void 0
|
|
3376
4157
|
};
|
|
3377
4158
|
emit(global.json, capture, response("models pull", workspace, data), "Pulled available models");
|
|
3378
4159
|
});
|
|
3379
|
-
models.command("status").description("Show whether model runtimes and artifacts are available
|
|
4160
|
+
models.command("status").description("Show whether shared model assets, runtimes, and workspace vector artifacts are available.").addHelpText("after", `
|
|
3380
4161
|
Examples:
|
|
3381
4162
|
qli models status
|
|
3382
|
-
qli models status --json
|
|
4163
|
+
qli models status --json
|
|
4164
|
+
|
|
4165
|
+
The cacheDir fields show the resolved model cache path for the current workspace config.`).action(async function command() {
|
|
3383
4166
|
const global = this.optsWithGlobals();
|
|
3384
4167
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3385
4168
|
const config = await loadConfig(workspace, global.config);
|
|
@@ -3394,8 +4177,8 @@ Examples:
|
|
|
3394
4177
|
qli diff --since 2026-05-01`).action(async function command(options) {
|
|
3395
4178
|
const global = this.optsWithGlobals();
|
|
3396
4179
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
3397
|
-
const
|
|
3398
|
-
emit(global.json, capture, response("diff", workspace,
|
|
4180
|
+
const result = await diffWorkspace({ workspacePath: workspace, sourceId: options.source, documentId: options.document, since: options.since });
|
|
4181
|
+
emit(global.json, capture, response("diff", workspace, result), JSON.stringify(result, null, 2));
|
|
3399
4182
|
});
|
|
3400
4183
|
const report = program.command("report");
|
|
3401
4184
|
report.description("Render higher-level reports from workspace data.");
|
|
@@ -3476,8 +4259,11 @@ Examples:
|
|
|
3476
4259
|
checks.push("dense runtime importable");
|
|
3477
4260
|
}
|
|
3478
4261
|
if (config.retrieval.sparse.enabled) {
|
|
3479
|
-
await
|
|
3480
|
-
|
|
4262
|
+
if (await isUvAvailable()) {
|
|
4263
|
+
checks.push("uv available for sparse runtime");
|
|
4264
|
+
} else {
|
|
4265
|
+
checks.push("uv missing for sparse runtime");
|
|
4266
|
+
}
|
|
3481
4267
|
}
|
|
3482
4268
|
try {
|
|
3483
4269
|
await readLatestIndexMetadata(workspace);
|
|
@@ -3511,13 +4297,21 @@ function emit(asJson, capture, body, human) {
|
|
|
3511
4297
|
}
|
|
3512
4298
|
|
|
3513
4299
|
// src/cli/main.ts
|
|
3514
|
-
|
|
3515
|
-
|
|
3516
|
-
|
|
4300
|
+
try {
|
|
4301
|
+
const result = await runCli(process.argv.slice(2), {
|
|
4302
|
+
onStdout(value) {
|
|
4303
|
+
process.stdout.write(`${value}
|
|
3517
4304
|
`);
|
|
3518
|
-
}
|
|
3519
|
-
|
|
3520
|
-
|
|
4305
|
+
},
|
|
4306
|
+
onStderr(value) {
|
|
4307
|
+
process.stderr.write(`${value}
|
|
4308
|
+
`);
|
|
4309
|
+
}
|
|
4310
|
+
});
|
|
4311
|
+
process.exitCode = result.exitCode;
|
|
4312
|
+
} catch (error) {
|
|
4313
|
+
const message = error instanceof Error ? error.stack ?? error.message : String(error);
|
|
4314
|
+
process.stderr.write(`${message}
|
|
3521
4315
|
`);
|
|
4316
|
+
process.exitCode = 1;
|
|
3522
4317
|
}
|
|
3523
|
-
process.exit(result.exitCode);
|