@tryformation/querylight-cli 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -9
- package/dist/chunk/chunker.d.ts +3 -1
- package/dist/cli/main.js +1031 -237
- package/dist/cli/run-cli.d.ts +4 -1
- package/dist/core/concurrency.d.ts +1 -0
- package/dist/core/constants.d.ts +3 -1
- package/dist/core/progress.d.ts +4 -0
- package/dist/core/urls.d.ts +1 -0
- package/dist/index/querylight-indexer.d.ts +3 -1
- package/dist/index.js +441 -114
- package/dist/ingest/adapters/website-adapter.d.ts +6 -1
- package/dist/ingest/adapters/website-feed-discovery.d.ts +6 -0
- package/dist/ingest/extractors/html-extractor.d.ts +1 -0
- package/dist/ingest/ingest-service.d.ts +5 -2
- package/dist/types/models.d.ts +2 -2
- package/dist/vector/dense.d.ts +3 -1
- package/dist/vector/runtime.d.ts +2 -0
- package/dist/vector/service.d.ts +20 -2
- package/dist/vector/sparse.d.ts +3 -1
- package/dist/vector/store.d.ts +8 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -20,6 +20,15 @@ var CliError = class extends Error {
|
|
|
20
20
|
import { readFile, writeFile } from "fs/promises";
|
|
21
21
|
import path from "path";
|
|
22
22
|
import YAML from "yaml";
|
|
23
|
+
|
|
24
|
+
// src/core/constants.ts
// Default model cache location, written with a leading "~" that is expanded elsewhere.
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
// Cache location that older workspace configs may still contain.
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";

// src/core/config.ts
/**
 * Maps the legacy workspace-local model cache path onto the shared default.
 *
 * @param {string} configuredPath - Cache directory as written in the config.
 * @returns {string} The shared default when `configuredPath` is exactly the
 *   legacy value; otherwise `configuredPath` unchanged.
 */
function normalizeModelCacheDir(configuredPath) {
  if (configuredPath !== LEGACY_WORKSPACE_MODEL_CACHE_DIR) {
    return configuredPath;
  }
  return DEFAULT_SHARED_MODEL_CACHE_DIR;
}
|
|
23
32
|
var defaultConfig = () => ({
|
|
24
33
|
workspaceVersion: 1,
|
|
25
34
|
index: {
|
|
@@ -47,17 +56,17 @@ var defaultConfig = () => ({
|
|
|
47
56
|
retrieval: {
|
|
48
57
|
defaultMode: "lexical",
|
|
49
58
|
dense: {
|
|
50
|
-
enabled:
|
|
59
|
+
enabled: true,
|
|
51
60
|
modelId: "Xenova/all-MiniLM-L6-v2",
|
|
52
|
-
cacheDir:
|
|
61
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
53
62
|
indexHashTables: 8,
|
|
54
63
|
indexRandomSeed: 42,
|
|
55
64
|
chunkTextMode: "title-heading-text"
|
|
56
65
|
},
|
|
57
66
|
sparse: {
|
|
58
|
-
enabled:
|
|
67
|
+
enabled: true,
|
|
59
68
|
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
|
|
60
|
-
cacheDir:
|
|
69
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
61
70
|
documentTopTokens: 128,
|
|
62
71
|
queryEncoding: "tokenizer-token-weights",
|
|
63
72
|
documentEncoding: "masked-lm-max-log1p-relu",
|
|
@@ -68,6 +77,7 @@ var defaultConfig = () => ({
|
|
|
68
77
|
defaultUserAgent: "querylight-cli/0.1",
|
|
69
78
|
obeyRobotsTxt: true,
|
|
70
79
|
rateLimitMs: 1e3,
|
|
80
|
+
maxConcurrentRequests: 5,
|
|
71
81
|
renderJs: false,
|
|
72
82
|
retentionDays: 365,
|
|
73
83
|
fetchArticles: true
|
|
@@ -118,11 +128,13 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
118
128
|
...parsed.retrieval ?? {},
|
|
119
129
|
dense: {
|
|
120
130
|
...defaults.retrieval.dense,
|
|
121
|
-
...parsed.retrieval?.dense ?? {}
|
|
131
|
+
...parsed.retrieval?.dense ?? {},
|
|
132
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
|
|
122
133
|
},
|
|
123
134
|
sparse: {
|
|
124
135
|
...defaults.retrieval.sparse,
|
|
125
|
-
...parsed.retrieval?.sparse ?? {}
|
|
136
|
+
...parsed.retrieval?.sparse ?? {},
|
|
137
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
|
|
126
138
|
}
|
|
127
139
|
},
|
|
128
140
|
crawler: {
|
|
@@ -145,8 +157,6 @@ var DIRS = [
|
|
|
145
157
|
"normalized",
|
|
146
158
|
"indexes",
|
|
147
159
|
"vectors",
|
|
148
|
-
"models",
|
|
149
|
-
"models/huggingface",
|
|
150
160
|
"runs",
|
|
151
161
|
"logs"
|
|
152
162
|
];
|
|
@@ -275,6 +285,27 @@ async function saveChunks(workspacePath, chunks) {
|
|
|
275
285
|
await writeJsonl(chunksFile(workspacePath), chunks.sort((a, b) => a.id.localeCompare(b.id)));
|
|
276
286
|
}
|
|
277
287
|
|
|
288
|
+
// src/core/concurrency.ts
|
|
289
|
+
/**
 * Invokes `worker(item, index)` for every entry of `items`, keeping at most
 * `limit` invocations in flight at any time.
 *
 * Worker return values are discarded; the returned promise resolves to
 * `undefined` once every item has been handed to a worker and all workers have
 * finished. Because the workers are joined with `Promise.all`, the returned
 * promise rejects as soon as any worker invocation rejects.
 *
 * @param {Array} items - Items to process; an empty array resolves immediately.
 * @param {number} limit - Maximum number of concurrent invocations. Fractions
 *   are floored, values below 1 are raised to 1, and a missing or non-numeric
 *   value falls back to 1 (sequential processing).
 * @param {(item: any, index: number) => Promise<any>} worker - Async callback.
 * @returns {Promise<void>}
 */
async function mapWithConcurrency(items, limit, worker) {
  if (items.length === 0) {
    return;
  }
  const requested = Math.floor(Number(limit));
  // A NaN limit would propagate through Math.max/Math.min into
  // Array.from({ length: NaN }), which creates zero workers and silently
  // skips every item. Fall back to one worker instead.
  const concurrency = Number.isNaN(requested) ? 1 : Math.max(1, requested);
  const workerCount = Math.min(concurrency, items.length);
  let nextIndex = 0;
  // Each runner claims the next unprocessed index synchronously (no await
  // between the read and the increment), so no item is processed twice.
  const runWorker = async () => {
    while (nextIndex < items.length) {
      const index = nextIndex;
      nextIndex += 1;
      await worker(items[index], index);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
}
|
|
308
|
+
|
|
278
309
|
// src/core/files.ts
|
|
279
310
|
import { stat as stat2 } from "fs/promises";
|
|
280
311
|
async function fileExists(filePath) {
|
|
@@ -286,6 +317,14 @@ async function fileExists(filePath) {
|
|
|
286
317
|
}
|
|
287
318
|
}
|
|
288
319
|
|
|
320
|
+
// src/core/progress.ts
|
|
321
|
+
/**
 * Forwards a message to the optional progress callback at the "info" level.
 *
 * @param {Function|null|undefined} progress - Called as
 *   `progress("info", message)`; nothing happens when it is null or undefined.
 * @param {string} message - Text to report.
 */
function reportProgress(progress, message) {
  if (progress === null || progress === undefined) {
    return;
  }
  progress("info", message);
}
|
|
324
|
+
/**
 * Forwards a message to the optional progress callback at the "detail" level.
 *
 * @param {Function|null|undefined} progress - Called as
 *   `progress("detail", message)`; nothing happens when it is null or undefined.
 * @param {string} message - Text to report.
 */
function reportProgressDetail(progress, message) {
  if (progress === null || progress === undefined) {
    return;
  }
  progress("detail", message);
}
|
|
327
|
+
|
|
289
328
|
// src/core/runs.ts
|
|
290
329
|
import path6 from "path";
|
|
291
330
|
async function writeRun(workspacePath, run) {
|
|
@@ -428,9 +467,41 @@ function stripBoilerplate(html) {
|
|
|
428
467
|
|
|
429
468
|
// src/ingest/extractors/html-extractor.ts
|
|
430
469
|
var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
470
|
+
// Comma-separated selector list of elements removed before HTML is converted
// to Markdown: non-content tags first, then two blog widget data attributes.
var LOW_SIGNAL_SECTION_SELECTORS = [
  ...["script", "style", "noscript", "template"],
  ...["[data-blog-service-recommendations]", "[data-blog-related-posts]"]
].join(", ");
|
|
431
478
|
/**
 * Collapses every run of whitespace to a single space and trims both ends.
 *
 * @param {string} value - Raw text.
 * @returns {string} The normalized text.
 */
function cleanText(value) {
  const collapsed = value.replace(/\s+/g, " ");
  return collapsed.trim();
}
|
|
481
|
+
/**
 * Removes low-signal markup from a loaded document in place.
 *
 * Everything matching LOW_SIGNAL_SECTION_SELECTORS is removed, and for every
 * form whose `action` contains "substack.com/subscribe" the closest enclosing
 * `<section>` is removed as well.
 *
 * @param {Function} $ - Selector function for the loaded document (the result
 *   of `load(...)`); it is mutated.
 */
function pruneLowSignalContent($) {
  $(LOW_SIGNAL_SECTION_SELECTORS).remove();
  $("form").each((_index, formElement) => {
    const form = $(formElement);
    const action = cleanText(form.attr("action") ?? "");
    if (!action.includes("substack.com/subscribe")) {
      return;
    }
    form.closest("section").remove();
  });
}
|
|
490
|
+
/**
 * Drops long lines that look like escaped JSON payloads from Markdown text.
 *
 * A line is dropped when its trimmed length exceeds 300 characters and it
 * either starts like an (optionally escaped) JSON array of objects, or contains
 * both the escaped keys `\"permalink\":` and `\"title\":`. After filtering,
 * runs of three or more newlines are collapsed to two and the result is
 * trimmed.
 *
 * @param {string} markdown - Markdown text.
 * @returns {string} The Markdown without payload lines.
 */
function stripEscapedJsonPayloads(markdown) {
  const minimumPayloadLength = 300;
  const escapedArrayStart = /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i;
  const looksLikePayload = (line) => {
    const text = line.trim();
    if (text.length <= minimumPayloadLength) {
      return false;
    }
    if (escapedArrayStart.test(text)) {
      return true;
    }
    return text.includes('\\"permalink\\":') && text.includes('\\"title\\":');
  };
  const keptLines = [];
  for (const line of markdown.split("\n")) {
    if (!looksLikePayload(line)) {
      keptLines.push(line);
    }
  }
  return keptLines.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
|
|
434
505
|
function chooseMeaningfulTitle($, fallbackTitle) {
|
|
435
506
|
const candidates = [
|
|
436
507
|
cleanText($("meta[property='og:title']").attr("content") ?? ""),
|
|
@@ -467,14 +538,27 @@ ${parts.join("\n\n")}
|
|
|
467
538
|
/**
 * Converts an HTML document to Markdown and picks a title for it.
 *
 * The HTML is passed through stripBoilerplate, loaded, and pruned of
 * low-signal content. The Markdown is produced from the first `<main>`
 * element when there is one, otherwise from the document root, and finally
 * has escaped JSON payload lines removed.
 *
 * @param {string} html - Raw HTML.
 * @returns {{ markdown: string, title: string }} The Markdown and the title;
 *   "Untitled" is the fallback title offered to chooseMeaningfulTitle when the
 *   `<title>` element is empty.
 */
function extractHtmlToMarkdown(html) {
  const cleaned = stripBoilerplate(html);
  const $ = load(cleaned);
  pruneLowSignalContent($);
  const documentTitle = cleanText($("title").first().text());
  const title = chooseMeaningfulTitle($, documentTitle || "Untitled");
  const mainHtml = $("main").first().html();
  const root = mainHtml ?? $.root().html() ?? cleaned;
  const markdown = stripEscapedJsonPayloads(turndown.turndown(root));
  return { markdown, title };
}
|
|
550
|
+
/**
 * Reads the first `<link rel='canonical'>` of an HTML document and resolves
 * its href against `baseUrl`.
 *
 * @param {string} html - Raw HTML.
 * @param {string|URL} baseUrl - Base used to resolve a relative href.
 * @returns {string|null} The absolute canonical URL, or null when there is no
 *   non-empty href or it cannot be parsed as a URL.
 */
function extractCanonicalUriFromHtml(html, baseUrl) {
  const $ = load(html);
  const rawHref = $("link[rel='canonical']").first().attr("href");
  const href = rawHref?.trim();
  if (!href) {
    return null;
  }
  let resolved;
  try {
    resolved = new URL(href, baseUrl);
  } catch {
    return null;
  }
  return resolved.href;
}
|
|
478
562
|
function parseDateCandidate(value) {
|
|
479
563
|
const trimmed = value.trim();
|
|
480
564
|
if (!trimmed) {
|
|
@@ -879,6 +963,19 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
879
963
|
// src/ingest/adapters/url-adapter.ts
|
|
880
964
|
import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
|
|
881
965
|
import path9 from "path";
|
|
966
|
+
|
|
967
|
+
// src/core/urls.ts
|
|
968
|
+
/**
 * Normalizes a URL string by parsing it and dropping its fragment.
 *
 * @param {string} uri - URL to normalize.
 * @returns {string} The serialized URL without its `#fragment`, or `uri`
 *   unchanged when it cannot be parsed as an absolute URL.
 */
function normalizeRemoteUrl(uri) {
  let parsed;
  try {
    parsed = new URL(uri);
  } catch {
    return uri;
  }
  parsed.hash = "";
  return parsed.href;
}
|
|
977
|
+
|
|
978
|
+
// src/ingest/adapters/url-adapter.ts
|
|
882
979
|
function buildHttpCache(response, validatedAt) {
|
|
883
980
|
return {
|
|
884
981
|
etag: response.headers.get("etag") ?? void 0,
|
|
@@ -903,12 +1000,13 @@ async function normalizeRemoteDocument({
|
|
|
903
1000
|
responseStatus
|
|
904
1001
|
}) {
|
|
905
1002
|
const extracted = extractHtmlToMarkdown(body);
|
|
1003
|
+
const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
|
|
906
1004
|
const markdown = `# ${extracted.title}
|
|
907
1005
|
|
|
908
1006
|
${extracted.markdown}`;
|
|
909
|
-
const documentId = stableId("doc", source.id,
|
|
1007
|
+
const documentId = stableId("doc", source.id, canonicalUri);
|
|
910
1008
|
const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
|
|
911
|
-
const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(
|
|
1009
|
+
const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
|
|
912
1010
|
const contentHash = sha256(markdown);
|
|
913
1011
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
914
1012
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
@@ -921,7 +1019,7 @@ ${extracted.markdown}`;
|
|
|
921
1019
|
documentId,
|
|
922
1020
|
sourceId: source.id,
|
|
923
1021
|
title: extracted.title,
|
|
924
|
-
uri:
|
|
1022
|
+
uri: canonicalUri,
|
|
925
1023
|
sourceUri,
|
|
926
1024
|
publicationDate: resolvedPublicationDate,
|
|
927
1025
|
crawledAt,
|
|
@@ -936,8 +1034,9 @@ ${extracted.markdown}`;
|
|
|
936
1034
|
sourceId: source.id,
|
|
937
1035
|
sourceType: source.type,
|
|
938
1036
|
title: extracted.title,
|
|
939
|
-
uri:
|
|
1037
|
+
uri: canonicalUri,
|
|
940
1038
|
sourceUri,
|
|
1039
|
+
canonicalUri,
|
|
941
1040
|
mimeType: "text/html",
|
|
942
1041
|
rawPath,
|
|
943
1042
|
normalizedPath,
|
|
@@ -1111,6 +1210,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1111
1210
|
if (url.origin !== baseUrl.origin) {
|
|
1112
1211
|
return false;
|
|
1113
1212
|
}
|
|
1213
|
+
if (url.search.length > 0) {
|
|
1214
|
+
return false;
|
|
1215
|
+
}
|
|
1216
|
+
if (url.pathname.endsWith(".xml")) {
|
|
1217
|
+
return false;
|
|
1218
|
+
}
|
|
1219
|
+
if (url.pathname.includes("/cdn-cgi/")) {
|
|
1220
|
+
return false;
|
|
1221
|
+
}
|
|
1222
|
+
if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
|
|
1223
|
+
return false;
|
|
1224
|
+
}
|
|
1114
1225
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
1115
1226
|
return false;
|
|
1116
1227
|
}
|
|
@@ -1123,56 +1234,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1123
1234
|
}
|
|
1124
1235
|
return true;
|
|
1125
1236
|
}
|
|
1126
|
-
|
|
1237
|
+
/**
 * Returns a promise that resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function delay(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
1240
|
+
/**
 * Discovers page URLs for a website source, one link depth at a time.
 *
 * Depth 0 consists of the source URI plus, unless `source.crawl.useSitemap`
 * is false, the URLs returned by fetchSitemapUrls. At every depth each unseen
 * candidate that passes isAllowed is appended to the result list; the accepted
 * pages are then fetched (at most `maxConcurrentRequests` at a time) and the
 * links found in them become the candidates of the next depth. Discovery stops
 * at `maxDepth`, when `maxPages` results have been collected, or when no
 * candidates remain.
 *
 * @param {object} source - Source record; `source.uri` is the start URL and
 *   `source.crawl` holds optional per-source overrides.
 * @param {{ userAgent: string, rateLimitMs: number, maxConcurrentRequests: number }} defaults
 *   Values used when `source.crawl` does not override them.
 * @param {Function} [progress] - Optional progress callback.
 * @returns {Promise<string[]>} Normalized URLs of the discovered pages.
 */
async function crawlWebsite(source, defaults, progress) {
  const baseUrl = new URL(source.uri);
  const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
  const includePatterns = source.crawl?.includePatterns ?? [];
  const excludePatterns = source.crawl?.excludePatterns ?? [];
  const maxDepth = source.crawl?.maxDepth ?? 2;
  const maxPages = source.crawl?.maxPages ?? 100;
  const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
  const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
  const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
  const seen = new Set();
  const results = [];
  let currentLevel = [normalizeRemoteUrl(source.uri)];
  if (source.crawl?.useSitemap !== false) {
    const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
    reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
    currentLevel = [
      ...currentLevel,
      ...sitemapUrls
    ];
  }
  for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
    reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
    const nextLevelCandidates = [];
    const allowedUrls = [];
    for (const candidate of currentLevel) {
      const normalizedCandidate = normalizeRemoteUrl(candidate);
      if (seen.has(normalizedCandidate)) {
        continue;
      }
      seen.add(normalizedCandidate);
      let url;
      try {
        url = new URL(normalizedCandidate);
      } catch {
        // normalizeRemoteUrl hands unparseable input back unchanged, so a
        // malformed sitemap entry can reach this point. Skip it rather than
        // letting one bad entry abort the whole crawl.
        reportProgressDetail(progress, `Skipped invalid URL ${normalizedCandidate}`);
        continue;
      }
      if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
        continue;
      }
      allowedUrls.push(normalizedCandidate);
      results.push(normalizedCandidate);
      reportProgress(progress, `Discovered ${normalizedCandidate}`);
      if (results.length >= maxPages) {
        break;
      }
    }
    reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
    if (depth >= maxDepth || results.length >= maxPages) {
      break;
    }
    await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
      try {
        const page = new URL(pageUrl);
        const response = await fetch(page, { headers: { "user-agent": userAgent } });
        const html = await response.text();
        const $ = load2(html);
        $("a[href]").each((_, element) => {
          const href = $(element).attr("href");
          if (!href) {
            return;
          }
          try {
            nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
          } catch {
            // Hrefs that cannot be resolved to a URL are deliberately ignored.
          }
        });
      } catch (error) {
        // The page is already in `results`. A failure here only means its
        // links cannot be followed, so report it and keep the URLs found so
        // far instead of rejecting the whole crawl.
        reportProgressDetail(progress, `Failed to extract links from ${pageUrl}: ${error instanceof Error ? error.message : String(error)}`);
      }
      if (rateLimitMs > 0) {
        await delay(rateLimitMs);
      }
    });
    currentLevel = nextLevelCandidates;
  }
  return results;
}
|
|
@@ -1247,6 +1377,8 @@ async function ingestRssSource({
|
|
|
1247
1377
|
source,
|
|
1248
1378
|
previous,
|
|
1249
1379
|
nextDocuments,
|
|
1380
|
+
maxConcurrentRequests,
|
|
1381
|
+
onDocumentProcessed,
|
|
1250
1382
|
onFailure
|
|
1251
1383
|
}) {
|
|
1252
1384
|
if (source.crawl?.fetchArticles === false) {
|
|
@@ -1254,11 +1386,12 @@ async function ingestRssSource({
|
|
|
1254
1386
|
}
|
|
1255
1387
|
const xml = await fetchFeedText(source);
|
|
1256
1388
|
const items = await parseRssFeedDocument(xml, source);
|
|
1389
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
1257
1390
|
let added = 0;
|
|
1258
1391
|
let changed = 0;
|
|
1259
1392
|
let unchanged = 0;
|
|
1260
1393
|
let failed = 0;
|
|
1261
|
-
|
|
1394
|
+
await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
|
|
1262
1395
|
try {
|
|
1263
1396
|
const probe = previous.get(stableId("doc", source.id, item.url));
|
|
1264
1397
|
const document = await fetchUrlDocument({
|
|
@@ -1269,28 +1402,40 @@ async function ingestRssSource({
|
|
|
1269
1402
|
sourceUri: source.uri,
|
|
1270
1403
|
publicationDate: item.publicationDate
|
|
1271
1404
|
});
|
|
1405
|
+
if (processedDocumentIds.has(document.id)) {
|
|
1406
|
+
return;
|
|
1407
|
+
}
|
|
1408
|
+
processedDocumentIds.add(document.id);
|
|
1409
|
+
const existingDocument = probe ?? previous.get(document.id);
|
|
1272
1410
|
nextDocuments.set(document.id, document);
|
|
1273
|
-
if (!
|
|
1411
|
+
if (!existingDocument) {
|
|
1274
1412
|
added += 1;
|
|
1275
|
-
|
|
1413
|
+
onDocumentProcessed?.(document.uri, "added");
|
|
1414
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
1276
1415
|
changed += 1;
|
|
1416
|
+
onDocumentProcessed?.(document.uri, "changed");
|
|
1277
1417
|
} else {
|
|
1278
1418
|
unchanged += 1;
|
|
1419
|
+
onDocumentProcessed?.(document.uri, "unchanged");
|
|
1279
1420
|
}
|
|
1280
1421
|
} catch (error) {
|
|
1281
1422
|
failed += 1;
|
|
1282
1423
|
onFailure(item.url, error);
|
|
1283
1424
|
}
|
|
1284
|
-
}
|
|
1425
|
+
});
|
|
1285
1426
|
return { added, changed, unchanged, failed };
|
|
1286
1427
|
}
|
|
1287
1428
|
async function ingestSources({
|
|
1288
1429
|
workspacePath,
|
|
1289
1430
|
sourceIds,
|
|
1290
|
-
changedOnly = false
|
|
1431
|
+
changedOnly = false,
|
|
1432
|
+
progress
|
|
1291
1433
|
}) {
|
|
1292
1434
|
const config = await loadConfig(workspacePath);
|
|
1293
1435
|
const defaultRetentionDays = config.crawler.retentionDays;
|
|
1436
|
+
const defaultUserAgent = config.crawler.defaultUserAgent;
|
|
1437
|
+
const defaultRateLimitMs = config.crawler.rateLimitMs;
|
|
1438
|
+
const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
|
|
1294
1439
|
const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
|
|
1295
1440
|
const existing = await loadDocuments(workspacePath);
|
|
1296
1441
|
const previous = previousMap(existing);
|
|
@@ -1300,20 +1445,38 @@ async function ingestSources({
|
|
|
1300
1445
|
let unchanged = 0;
|
|
1301
1446
|
let failed = 0;
|
|
1302
1447
|
const failures = [];
|
|
1448
|
+
reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
1303
1449
|
for (const source of sources) {
|
|
1450
|
+
const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
|
|
1451
|
+
const sourceBefore = { added, changed, unchanged, failed };
|
|
1452
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
1453
|
+
const reportDocumentOutcome = (uri, outcome) => {
|
|
1454
|
+
const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
|
|
1455
|
+
reportProgress(progress, `${label} ${uri}`);
|
|
1456
|
+
};
|
|
1304
1457
|
const ingestOne = async (uri, producer) => {
|
|
1305
1458
|
try {
|
|
1306
1459
|
const probeId = stableId("doc", source.id, uri);
|
|
1307
1460
|
const earlier = previous.get(probeId);
|
|
1308
1461
|
const document = await producer();
|
|
1462
|
+
if (processedDocumentIds.has(document.id)) {
|
|
1463
|
+
reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
|
|
1464
|
+
return null;
|
|
1465
|
+
}
|
|
1466
|
+
processedDocumentIds.add(document.id);
|
|
1467
|
+
const existingDocument = earlier ?? previous.get(document.id);
|
|
1309
1468
|
nextDocuments.set(document.id, document);
|
|
1310
|
-
if (!
|
|
1469
|
+
if (!existingDocument) {
|
|
1311
1470
|
added += 1;
|
|
1312
|
-
|
|
1471
|
+
reportDocumentOutcome(document.uri, "added");
|
|
1472
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
1313
1473
|
changed += 1;
|
|
1474
|
+
reportDocumentOutcome(document.uri, "changed");
|
|
1314
1475
|
} else {
|
|
1315
1476
|
unchanged += 1;
|
|
1477
|
+
reportDocumentOutcome(document.uri, "unchanged");
|
|
1316
1478
|
}
|
|
1479
|
+
return document;
|
|
1317
1480
|
} catch (error) {
|
|
1318
1481
|
failed += 1;
|
|
1319
1482
|
failures.push({
|
|
@@ -1321,50 +1484,69 @@ async function ingestSources({
|
|
|
1321
1484
|
uri,
|
|
1322
1485
|
message: error instanceof Error ? error.message : String(error)
|
|
1323
1486
|
});
|
|
1487
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
1488
|
+
return null;
|
|
1324
1489
|
}
|
|
1325
1490
|
};
|
|
1326
1491
|
try {
|
|
1492
|
+
reportProgress(progress, `Source ${source.name} (${source.type})`);
|
|
1327
1493
|
if (source.type === "file") {
|
|
1494
|
+
reportProgress(progress, `Reading file ${source.uri}`);
|
|
1328
1495
|
await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
for (const filePath of
|
|
1496
|
+
} else if (source.type === "directory") {
|
|
1497
|
+
const files = await listDirectoryFiles(source);
|
|
1498
|
+
reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
|
|
1499
|
+
for (const filePath of files) {
|
|
1500
|
+
reportProgress(progress, `Reading file ${filePath}`);
|
|
1333
1501
|
await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
|
|
1334
1502
|
}
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
if (source.type === "url") {
|
|
1503
|
+
} else if (source.type === "url") {
|
|
1504
|
+
reportProgress(progress, `Fetching ${source.uri}`);
|
|
1338
1505
|
await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1506
|
+
} else if (source.type === "website") {
|
|
1507
|
+
reportProgress(progress, `Crawling ${source.uri}`);
|
|
1508
|
+
const urls = await crawlWebsite(source, {
|
|
1509
|
+
userAgent: defaultUserAgent,
|
|
1510
|
+
rateLimitMs: defaultRateLimitMs,
|
|
1511
|
+
maxConcurrentRequests
|
|
1512
|
+
}, progress);
|
|
1513
|
+
reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
|
|
1514
|
+
const seenCanonicalUrls = /* @__PURE__ */ new Set();
|
|
1515
|
+
await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
|
|
1516
|
+
if (seenCanonicalUrls.has(url)) {
|
|
1517
|
+
reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
|
|
1518
|
+
return;
|
|
1519
|
+
}
|
|
1520
|
+
reportProgress(progress, `Fetching ${url}`);
|
|
1521
|
+
const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
|
|
1522
|
+
if (document) {
|
|
1523
|
+
seenCanonicalUrls.add(document.uri);
|
|
1524
|
+
}
|
|
1525
|
+
});
|
|
1526
|
+
} else if (source.type === "rss") {
|
|
1527
|
+
reportProgress(progress, `Fetching feed ${source.uri}`);
|
|
1348
1528
|
const result = await ingestRssSource({
|
|
1349
1529
|
workspacePath,
|
|
1350
1530
|
source,
|
|
1351
1531
|
previous,
|
|
1352
1532
|
nextDocuments,
|
|
1533
|
+
maxConcurrentRequests,
|
|
1534
|
+
onDocumentProcessed: reportDocumentOutcome,
|
|
1353
1535
|
onFailure: (uri, error) => {
|
|
1354
1536
|
failures.push({
|
|
1355
1537
|
sourceId: source.id,
|
|
1356
1538
|
uri,
|
|
1357
1539
|
message: error instanceof Error ? error.message : String(error)
|
|
1358
1540
|
});
|
|
1541
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
1359
1542
|
}
|
|
1360
1543
|
});
|
|
1361
1544
|
added += result.added;
|
|
1362
1545
|
changed += result.changed;
|
|
1363
1546
|
unchanged += result.unchanged;
|
|
1364
1547
|
failed += result.failed;
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
if (source.type === "markdown" || source.type === "text") {
|
|
1548
|
+
} else if (source.type === "markdown" || source.type === "text") {
|
|
1549
|
+
reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
|
|
1368
1550
|
await ingestOne(source.uri, () => ingestInlineContent({
|
|
1369
1551
|
workspacePath,
|
|
1370
1552
|
source,
|
|
@@ -1381,13 +1563,19 @@ async function ingestSources({
|
|
|
1381
1563
|
uri: source.uri,
|
|
1382
1564
|
message: error instanceof Error ? error.message : String(error)
|
|
1383
1565
|
});
|
|
1566
|
+
reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
|
|
1384
1567
|
}
|
|
1568
|
+
reportProgress(
|
|
1569
|
+
progress,
|
|
1570
|
+
`Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
|
|
1571
|
+
);
|
|
1385
1572
|
}
|
|
1386
1573
|
const expiringDocuments = [...nextDocuments.values()].filter((document) => {
|
|
1387
1574
|
const source = sources.find((candidate) => candidate.id === document.sourceId);
|
|
1388
1575
|
return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
|
|
1389
1576
|
});
|
|
1390
1577
|
if (expiringDocuments.length > 0) {
|
|
1578
|
+
reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
|
|
1391
1579
|
const expiredIds = new Set(expiringDocuments.map((document) => document.id));
|
|
1392
1580
|
for (const document of expiringDocuments) {
|
|
1393
1581
|
nextDocuments.delete(document.id);
|
|
@@ -1414,6 +1602,7 @@ async function ingestSources({
|
|
|
1414
1602
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
1415
1603
|
};
|
|
1416
1604
|
await writeRun(workspacePath, run);
|
|
1605
|
+
reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
|
|
1417
1606
|
return {
|
|
1418
1607
|
runId: id,
|
|
1419
1608
|
documents: { added, changed, unchanged, failed },
|
|
@@ -1423,7 +1612,8 @@ async function ingestSources({
|
|
|
1423
1612
|
async function reprocessDocuments({
|
|
1424
1613
|
workspacePath,
|
|
1425
1614
|
sourceId,
|
|
1426
|
-
documentId
|
|
1615
|
+
documentId,
|
|
1616
|
+
progress
|
|
1427
1617
|
}) {
|
|
1428
1618
|
const documents = await loadDocuments(workspacePath);
|
|
1429
1619
|
const sources = await listSources(workspacePath);
|
|
@@ -1431,15 +1621,20 @@ async function reprocessDocuments({
|
|
|
1431
1621
|
const nextDocuments = new Map(documents.map((document) => [document.id, document]));
|
|
1432
1622
|
let documentsReprocessed = 0;
|
|
1433
1623
|
let documentsSkipped = 0;
|
|
1434
|
-
|
|
1624
|
+
const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
|
|
1625
|
+
reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
|
|
1626
|
+
for (const document of targets) {
|
|
1627
|
+
reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
|
|
1435
1628
|
const source = sourceMap.get(document.sourceId);
|
|
1436
1629
|
if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
|
|
1437
1630
|
documentsSkipped += 1;
|
|
1631
|
+
reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
|
|
1438
1632
|
continue;
|
|
1439
1633
|
}
|
|
1440
1634
|
const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
|
|
1441
1635
|
if (!updated) {
|
|
1442
1636
|
documentsSkipped += 1;
|
|
1637
|
+
reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
|
|
1443
1638
|
continue;
|
|
1444
1639
|
}
|
|
1445
1640
|
nextDocuments.set(updated.id, updated);
|
|
@@ -1459,6 +1654,7 @@ async function reprocessDocuments({
|
|
|
1459
1654
|
},
|
|
1460
1655
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
1461
1656
|
});
|
|
1657
|
+
reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
|
|
1462
1658
|
return { runId: id, documentsReprocessed, documentsSkipped };
|
|
1463
1659
|
}
|
|
1464
1660
|
|
|
@@ -1560,11 +1756,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
|
|
|
1560
1756
|
async function chunkDocuments({
|
|
1561
1757
|
workspacePath,
|
|
1562
1758
|
sourceId,
|
|
1563
|
-
documentId
|
|
1759
|
+
documentId,
|
|
1760
|
+
progress
|
|
1564
1761
|
}) {
|
|
1565
1762
|
const config = await loadConfig(workspacePath);
|
|
1566
1763
|
const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
|
|
1567
1764
|
const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
|
|
1765
|
+
reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
|
|
1568
1766
|
const targetedDocumentIds = new Set(filtered.map((document) => document.id));
|
|
1569
1767
|
const existingChunks = await loadChunks(workspacePath);
|
|
1570
1768
|
const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
|
|
@@ -1572,12 +1770,14 @@ async function chunkDocuments({
|
|
|
1572
1770
|
existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
|
|
1573
1771
|
);
|
|
1574
1772
|
for (const document of filtered) {
|
|
1773
|
+
reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
|
|
1575
1774
|
const raw = await readFile8(document.normalizedPath, "utf8");
|
|
1576
1775
|
for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
|
|
1577
1776
|
nextChunks.set(chunk.id, chunk);
|
|
1578
1777
|
}
|
|
1579
1778
|
}
|
|
1580
1779
|
await saveChunks(workspacePath, [...nextChunks.values()]);
|
|
1780
|
+
reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
|
|
1581
1781
|
return { chunksWritten: nextChunks.size };
|
|
1582
1782
|
}
|
|
1583
1783
|
|
|
@@ -1586,15 +1786,31 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
|
|
|
1586
1786
|
import path17 from "path";
|
|
1587
1787
|
|
|
1588
1788
|
// src/vector/dense.ts
|
|
1589
|
-
import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
|
|
1789
|
+
import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
|
|
1590
1790
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
1591
1791
|
import path14 from "path";
|
|
1592
1792
|
|
|
1593
1793
|
// src/vector/runtime.ts
|
|
1794
|
+
import os from "os";
|
|
1594
1795
|
import path12 from "path";
|
|
1595
1796
|
import { fileURLToPath } from "url";
|
|
1596
1797
|
import { execFile, execFileSync } from "child_process";
|
|
1798
|
+
function resolveQliHomeDir() {
|
|
1799
|
+
return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
|
|
1800
|
+
}
|
|
1597
1801
|
function resolveCacheDir(workspacePath, configuredPath) {
|
|
1802
|
+
if (configuredPath === "~/.qli") {
|
|
1803
|
+
return resolveQliHomeDir();
|
|
1804
|
+
}
|
|
1805
|
+
if (configuredPath.startsWith("~/.qli/")) {
|
|
1806
|
+
return path12.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
|
|
1807
|
+
}
|
|
1808
|
+
if (configuredPath === "~") {
|
|
1809
|
+
return os.homedir();
|
|
1810
|
+
}
|
|
1811
|
+
if (configuredPath.startsWith("~/")) {
|
|
1812
|
+
return path12.join(os.homedir(), configuredPath.slice(2));
|
|
1813
|
+
}
|
|
1598
1814
|
return path12.isAbsolute(configuredPath) ? configuredPath : path12.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
|
|
1599
1815
|
}
|
|
1600
1816
|
function packageRootFromImportMeta(importMetaUrl) {
|
|
@@ -1618,6 +1834,14 @@ async function ensureUvAvailable() {
|
|
|
1618
1834
|
execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
|
|
1619
1835
|
});
|
|
1620
1836
|
}
|
|
1837
|
+
async function isUvAvailable() {
|
|
1838
|
+
try {
|
|
1839
|
+
await ensureUvAvailable();
|
|
1840
|
+
return true;
|
|
1841
|
+
} catch {
|
|
1842
|
+
return false;
|
|
1843
|
+
}
|
|
1844
|
+
}
|
|
1621
1845
|
async function runSparsePython({
|
|
1622
1846
|
workspacePath,
|
|
1623
1847
|
config,
|
|
@@ -1666,8 +1890,8 @@ import path13 from "path";
|
|
|
1666
1890
|
function vectorsDir(workspacePath) {
|
|
1667
1891
|
return path13.join(workspacePath, "vectors");
|
|
1668
1892
|
}
|
|
1669
|
-
function
|
|
1670
|
-
return path13.join(
|
|
1893
|
+
function sharedModelStateDir() {
|
|
1894
|
+
return path13.join(resolveQliHomeDir(), "models", "status");
|
|
1671
1895
|
}
|
|
1672
1896
|
function denseVectorPath(workspacePath) {
|
|
1673
1897
|
return path13.join(vectorsDir(workspacePath), "dense.latest.json");
|
|
@@ -1681,11 +1905,16 @@ function sparseVectorPath(workspacePath) {
|
|
|
1681
1905
|
function sparseMetaPath(workspacePath) {
|
|
1682
1906
|
return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
|
|
1683
1907
|
}
|
|
1684
|
-
function
|
|
1685
|
-
|
|
1908
|
+
function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
|
|
1909
|
+
const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
|
|
1910
|
+
const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
|
|
1911
|
+
return path13.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
|
|
1686
1912
|
}
|
|
1687
|
-
function
|
|
1688
|
-
return
|
|
1913
|
+
function densePullMarker(workspacePath, modelId, cacheDir) {
|
|
1914
|
+
return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
|
|
1915
|
+
}
|
|
1916
|
+
function sparsePullMarker(workspacePath, modelId, cacheDir) {
|
|
1917
|
+
return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
|
|
1689
1918
|
}
|
|
1690
1919
|
async function writeDensePayload(workspacePath, payload) {
|
|
1691
1920
|
await mkdir6(vectorsDir(workspacePath), { recursive: true });
|
|
@@ -1711,7 +1940,7 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
|
1711
1940
|
configured: dense.enabled,
|
|
1712
1941
|
modelId: dense.modelId,
|
|
1713
1942
|
cacheDir: denseCacheDir,
|
|
1714
|
-
available: await fileExists(densePullMarker(workspacePath)),
|
|
1943
|
+
available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
|
|
1715
1944
|
artifactExists: await fileExists(denseVectorPath(workspacePath))
|
|
1716
1945
|
},
|
|
1717
1946
|
sparse: {
|
|
@@ -1719,22 +1948,64 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
|
1719
1948
|
modelId: sparse.modelId,
|
|
1720
1949
|
cacheDir: sparseCacheDir,
|
|
1721
1950
|
uvAvailable,
|
|
1722
|
-
available: await fileExists(sparsePullMarker(workspacePath)),
|
|
1951
|
+
available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
|
|
1723
1952
|
artifactExists: await fileExists(sparseVectorPath(workspacePath))
|
|
1724
1953
|
}
|
|
1725
1954
|
};
|
|
1726
1955
|
}
|
|
1727
1956
|
|
|
1728
1957
|
// src/vector/text.ts
|
|
1958
|
+
var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
|
|
1959
|
+
"choose this instead of",
|
|
1960
|
+
"how xyz runs it",
|
|
1961
|
+
"naechste schritte",
|
|
1962
|
+
"next steps",
|
|
1963
|
+
"overview",
|
|
1964
|
+
"passend wenn",
|
|
1965
|
+
"problem",
|
|
1966
|
+
"right fit",
|
|
1967
|
+
"waehlen sie das stattdessen",
|
|
1968
|
+
"was sie bekommen",
|
|
1969
|
+
"what you get",
|
|
1970
|
+
"wie xyz es umsetzt",
|
|
1971
|
+
"uberblick",
|
|
1972
|
+
"\xFCberblick"
|
|
1973
|
+
]);
|
|
1974
|
+
function normalizeHeading(value) {
|
|
1975
|
+
return value.trim().toLowerCase();
|
|
1976
|
+
}
|
|
1977
|
+
function isLowSignalHeading(value) {
|
|
1978
|
+
return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
|
|
1979
|
+
}
|
|
1980
|
+
function stripLeadingHeading(text, heading) {
|
|
1981
|
+
const lines = text.split("\n");
|
|
1982
|
+
const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
|
|
1983
|
+
if (firstContentIndex < 0) {
|
|
1984
|
+
return text;
|
|
1985
|
+
}
|
|
1986
|
+
const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
|
|
1987
|
+
if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
|
|
1988
|
+
return text;
|
|
1989
|
+
}
|
|
1990
|
+
const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
|
|
1991
|
+
return next;
|
|
1992
|
+
}
|
|
1993
|
+
function createVectorText(chunk) {
|
|
1994
|
+
const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
|
|
1995
|
+
const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
|
|
1996
|
+
const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
|
|
1997
|
+
return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
|
|
1998
|
+
}
|
|
1729
1999
|
function createDenseChunkText(chunk) {
|
|
1730
|
-
return
|
|
2000
|
+
return createVectorText(chunk);
|
|
1731
2001
|
}
|
|
1732
2002
|
function createSparseChunkText(chunk) {
|
|
1733
|
-
return
|
|
2003
|
+
return createVectorText(chunk);
|
|
1734
2004
|
}
|
|
1735
2005
|
|
|
1736
2006
|
// src/vector/dense.ts
|
|
1737
2007
|
var denseEmbedderFactory = null;
|
|
2008
|
+
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
1738
2009
|
async function createEmbedder(cacheDir, modelId) {
|
|
1739
2010
|
if (denseEmbedderFactory) {
|
|
1740
2011
|
return denseEmbedderFactory(cacheDir, modelId);
|
|
@@ -1746,9 +2017,13 @@ async function createEmbedder(cacheDir, modelId) {
|
|
|
1746
2017
|
return output.tolist()[0];
|
|
1747
2018
|
};
|
|
1748
2019
|
}
|
|
2020
|
+
function exactDenseQuery(payload, vector, topK) {
|
|
2021
|
+
return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
|
|
2022
|
+
}
|
|
1749
2023
|
async function buildDenseVectors({
|
|
1750
2024
|
workspacePath,
|
|
1751
|
-
config
|
|
2025
|
+
config,
|
|
2026
|
+
progress
|
|
1752
2027
|
}) {
|
|
1753
2028
|
const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
1754
2029
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
@@ -1756,6 +2031,7 @@ async function buildDenseVectors({
|
|
|
1756
2031
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
1757
2032
|
const records = [];
|
|
1758
2033
|
let dimensions = 0;
|
|
2034
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
1759
2035
|
for (const chunk of chunks) {
|
|
1760
2036
|
const embedding = await embed(createDenseChunkText(chunk));
|
|
1761
2037
|
dimensions ||= embedding.length;
|
|
@@ -1769,7 +2045,11 @@ async function buildDenseVectors({
|
|
|
1769
2045
|
text: chunk.text,
|
|
1770
2046
|
embedding
|
|
1771
2047
|
});
|
|
2048
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
2049
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
2050
|
+
}
|
|
1772
2051
|
}
|
|
2052
|
+
reportProgress(progress, "Building dense vector index");
|
|
1773
2053
|
const index = new VectorFieldIndex({
|
|
1774
2054
|
numHashTables: config.indexHashTables,
|
|
1775
2055
|
dimensions,
|
|
@@ -1793,6 +2073,7 @@ async function buildDenseVectors({
|
|
|
1793
2073
|
chunks: records
|
|
1794
2074
|
};
|
|
1795
2075
|
await writeDensePayload(workspacePath, payload);
|
|
2076
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
1796
2077
|
return payload;
|
|
1797
2078
|
}
|
|
1798
2079
|
async function denseQuery({
|
|
@@ -1805,12 +2086,19 @@ async function denseQuery({
|
|
|
1805
2086
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
1806
2087
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
1807
2088
|
const vector = await embed(query);
|
|
2089
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
2090
|
+
return exactDenseQuery(payload, vector, topK);
|
|
2091
|
+
}
|
|
1808
2092
|
const index = new VectorFieldIndex({
|
|
1809
2093
|
numHashTables: payload.metadata.hashTables,
|
|
1810
2094
|
dimensions: payload.metadata.dimensions,
|
|
1811
2095
|
random: createSeededRandom(payload.metadata.randomSeed)
|
|
1812
2096
|
}).loadState(payload.indexState);
|
|
1813
|
-
|
|
2097
|
+
const approximateHits = index.query(vector, topK);
|
|
2098
|
+
if (approximateHits.length >= topK) {
|
|
2099
|
+
return approximateHits;
|
|
2100
|
+
}
|
|
2101
|
+
return exactDenseQuery(payload, vector, topK);
|
|
1814
2102
|
}
|
|
1815
2103
|
|
|
1816
2104
|
// src/vector/sparse.ts
|
|
@@ -1904,10 +2192,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
|
|
|
1904
2192
|
}
|
|
1905
2193
|
async function buildSparseVectors({
|
|
1906
2194
|
workspacePath,
|
|
1907
|
-
config
|
|
2195
|
+
config,
|
|
2196
|
+
progress
|
|
1908
2197
|
}) {
|
|
1909
2198
|
const chunks = await readJsonl(path15.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
2199
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
|
|
1910
2200
|
const built = await buildSparseDocuments(workspacePath, config, chunks);
|
|
2201
|
+
reportProgress(progress, "Building sparse vector index");
|
|
1911
2202
|
const index = new SparseVectorFieldIndex();
|
|
1912
2203
|
for (const record of built.chunks) {
|
|
1913
2204
|
index.insert(record.chunkId, [record.vector]);
|
|
@@ -1929,6 +2220,7 @@ async function buildSparseVectors({
|
|
|
1929
2220
|
queryTokenWeights: built.queryTokenWeights
|
|
1930
2221
|
};
|
|
1931
2222
|
await writeSparsePayload(workspacePath, payload);
|
|
2223
|
+
reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
|
|
1932
2224
|
return payload;
|
|
1933
2225
|
}
|
|
1934
2226
|
async function sparseQuery({
|
|
@@ -1951,24 +2243,24 @@ async function buildVectorArtifacts({
|
|
|
1951
2243
|
config,
|
|
1952
2244
|
denseOverride,
|
|
1953
2245
|
sparseOverride,
|
|
1954
|
-
buildAvailableModels = false
|
|
2246
|
+
buildAvailableModels = false,
|
|
2247
|
+
progress
|
|
1955
2248
|
}) {
|
|
1956
|
-
const
|
|
1957
|
-
|
|
1958
|
-
await ensureUvAvailable();
|
|
1959
|
-
return true;
|
|
1960
|
-
} catch {
|
|
1961
|
-
return false;
|
|
1962
|
-
}
|
|
1963
|
-
})()) : null;
|
|
2249
|
+
const uvAvailable = await isUvAvailable();
|
|
2250
|
+
const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
|
|
1964
2251
|
const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
|
|
1965
|
-
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
|
|
2252
|
+
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
|
|
1966
2253
|
const result = {};
|
|
1967
2254
|
if (denseEnabled) {
|
|
1968
|
-
|
|
2255
|
+
reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
|
|
2256
|
+
result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
|
|
2257
|
+
}
|
|
2258
|
+
if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
|
|
2259
|
+
reportProgress(progress, "Skipping sparse vectors because uv is not available");
|
|
1969
2260
|
}
|
|
1970
2261
|
if (sparseEnabled) {
|
|
1971
|
-
|
|
2262
|
+
reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
|
|
2263
|
+
result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
|
|
1972
2264
|
}
|
|
1973
2265
|
return result;
|
|
1974
2266
|
}
|
|
@@ -2037,14 +2329,17 @@ async function buildIndex({
|
|
|
2037
2329
|
workspacePath,
|
|
2038
2330
|
denseOverride,
|
|
2039
2331
|
sparseOverride,
|
|
2040
|
-
buildAvailableModels = false
|
|
2332
|
+
buildAvailableModels = false,
|
|
2333
|
+
progress
|
|
2041
2334
|
}) {
|
|
2042
2335
|
const config = await loadConfig(workspacePath);
|
|
2336
|
+
reportProgress(progress, "Loading documents, chunks, and sources");
|
|
2043
2337
|
const chunks = await readJsonl(path17.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
2044
2338
|
const documents = await readJsonl(path17.join(workspacePath, "documents", "documents.jsonl"));
|
|
2045
2339
|
const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
|
|
2046
2340
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
2047
2341
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
2342
|
+
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
2048
2343
|
for (const chunk of chunks) {
|
|
2049
2344
|
index.index({
|
|
2050
2345
|
id: chunk.id,
|
|
@@ -2059,6 +2354,7 @@ async function buildIndex({
|
|
|
2059
2354
|
}
|
|
2060
2355
|
});
|
|
2061
2356
|
}
|
|
2357
|
+
reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
2062
2358
|
const createdAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2063
2359
|
const metadata = {
|
|
2064
2360
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
@@ -2071,14 +2367,17 @@ async function buildIndex({
|
|
|
2071
2367
|
fields: Object.keys(index.mapping),
|
|
2072
2368
|
indexHash: sha256(JSON.stringify(index.indexState))
|
|
2073
2369
|
};
|
|
2370
|
+
reportProgress(progress, "Writing lexical index artifacts");
|
|
2074
2371
|
const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
|
|
2075
2372
|
const vectors = await buildVectorArtifacts({
|
|
2076
2373
|
workspacePath,
|
|
2077
2374
|
config,
|
|
2078
2375
|
denseOverride,
|
|
2079
2376
|
sparseOverride,
|
|
2080
|
-
buildAvailableModels
|
|
2377
|
+
buildAvailableModels,
|
|
2378
|
+
progress
|
|
2081
2379
|
});
|
|
2380
|
+
reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
|
|
2082
2381
|
return {
|
|
2083
2382
|
metadata,
|
|
2084
2383
|
indexPath: artifacts.indexPath,
|
|
@@ -2092,7 +2391,15 @@ import { readFile as readFile11 } from "fs/promises";
|
|
|
2092
2391
|
import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
|
|
2093
2392
|
import path18 from "path";
|
|
2094
2393
|
async function loadHydratedIndex(workspacePath) {
|
|
2095
|
-
|
|
2394
|
+
let state;
|
|
2395
|
+
try {
|
|
2396
|
+
state = await readLatestIndexState(workspacePath);
|
|
2397
|
+
} catch (error) {
|
|
2398
|
+
if (error.code === "ENOENT") {
|
|
2399
|
+
throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
|
|
2400
|
+
}
|
|
2401
|
+
throw error;
|
|
2402
|
+
}
|
|
2096
2403
|
const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
|
|
2097
2404
|
return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
|
|
2098
2405
|
}
|
|
@@ -2328,9 +2635,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2328
2635
|
function normalizeDisplayTitle(title) {
|
|
2329
2636
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
2330
2637
|
}
|
|
2638
|
+
var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
|
|
2639
|
+
"choose this instead of",
|
|
2640
|
+
"how xyz runs it",
|
|
2641
|
+
"naechste schritte",
|
|
2642
|
+
"next steps",
|
|
2643
|
+
"overview",
|
|
2644
|
+
"passend wenn",
|
|
2645
|
+
"problem",
|
|
2646
|
+
"right fit",
|
|
2647
|
+
"waehlen sie das stattdessen",
|
|
2648
|
+
"was sie bekommen",
|
|
2649
|
+
"what you get",
|
|
2650
|
+
"wie xyz es umsetzt",
|
|
2651
|
+
"uberblick",
|
|
2652
|
+
"\xFCberblick"
|
|
2653
|
+
]);
|
|
2331
2654
|
function chooseResultTitle(chunk) {
|
|
2332
2655
|
const documentTitle = normalizeDisplayTitle(chunk.title);
|
|
2333
|
-
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(
|
|
2656
|
+
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
|
|
2334
2657
|
const leafHeading = headings.at(-1);
|
|
2335
2658
|
if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
|
|
2336
2659
|
return leafHeading;
|
|
@@ -2352,6 +2675,9 @@ function normalizeUriPath(uri) {
|
|
|
2352
2675
|
return uri.toLowerCase().replace(/\/+$/, "");
|
|
2353
2676
|
}
|
|
2354
2677
|
}
|
|
2678
|
+
function normalizeUriIdentity(uri) {
|
|
2679
|
+
return normalizeRemoteUrl(uri).toLowerCase().replace(/\/+$/, "");
|
|
2680
|
+
}
|
|
2355
2681
|
function uriSpecificity(uri) {
|
|
2356
2682
|
const normalized = normalizeUriPath(uri);
|
|
2357
2683
|
if (normalized === "/") {
|
|
@@ -2368,6 +2694,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
|
|
|
2368
2694
|
if (!candidateTitle || candidateTitle !== existingTitle) {
|
|
2369
2695
|
return false;
|
|
2370
2696
|
}
|
|
2697
|
+
const candidateIdentity = normalizeUriIdentity(candidate.uri);
|
|
2698
|
+
const existingIdentity = normalizeUriIdentity(existing.uri);
|
|
2699
|
+
if (candidateIdentity === existingIdentity) {
|
|
2700
|
+
return candidate.uri.length < existing.uri.length;
|
|
2701
|
+
}
|
|
2371
2702
|
const candidatePath = normalizeUriPath(candidate.uri);
|
|
2372
2703
|
const existingPath = normalizeUriPath(existing.uri);
|
|
2373
2704
|
if (candidatePath === existingPath) {
|
|
@@ -2480,7 +2811,6 @@ async function searchIndex({
|
|
|
2480
2811
|
score: 0,
|
|
2481
2812
|
title: chooseResultTitle(chunk),
|
|
2482
2813
|
uri: chunk.uri,
|
|
2483
|
-
headingPath: chunk.headingPath,
|
|
2484
2814
|
snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
|
|
2485
2815
|
document,
|
|
2486
2816
|
config,
|
|
@@ -2544,7 +2874,6 @@ async function searchIndex({
|
|
|
2544
2874
|
score,
|
|
2545
2875
|
title: chooseResultTitle(chunk),
|
|
2546
2876
|
uri: chunk.uri,
|
|
2547
|
-
headingPath: chunk.headingPath,
|
|
2548
2877
|
snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
|
|
2549
2878
|
document: documents.get(chunk.documentId),
|
|
2550
2879
|
config,
|
|
@@ -2564,7 +2893,7 @@ async function searchIndex({
|
|
|
2564
2893
|
|
|
2565
2894
|
// src/query/related-service.ts
|
|
2566
2895
|
import path19 from "path";
|
|
2567
|
-
function
|
|
2896
|
+
function cosineSimilarity2(left, right) {
|
|
2568
2897
|
let dot = 0;
|
|
2569
2898
|
let leftNorm = 0;
|
|
2570
2899
|
let rightNorm = 0;
|
|
@@ -2650,7 +2979,7 @@ async function findRelatedDocuments({
|
|
|
2650
2979
|
const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
|
|
2651
2980
|
documentId: candidate.document.id,
|
|
2652
2981
|
sourceId: candidate.document.sourceId,
|
|
2653
|
-
score:
|
|
2982
|
+
score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
|
|
2654
2983
|
title: candidate.document.title,
|
|
2655
2984
|
uri: candidate.document.uri,
|
|
2656
2985
|
metadata: candidate.document.metadata
|
|
@@ -2690,7 +3019,6 @@ async function createContext({
|
|
|
2690
3019
|
sourceId: result.sourceId,
|
|
2691
3020
|
title: result.title,
|
|
2692
3021
|
uri: result.uri,
|
|
2693
|
-
headingPath: result.headingPath,
|
|
2694
3022
|
text,
|
|
2695
3023
|
metadata: result.metadata
|
|
2696
3024
|
});
|
|
@@ -2703,7 +3031,6 @@ async function createContext({
|
|
|
2703
3031
|
`Title: ${source.title}`,
|
|
2704
3032
|
`URL: ${source.uri}`,
|
|
2705
3033
|
`Chunk ID: ${source.chunkId}`,
|
|
2706
|
-
source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
|
|
2707
3034
|
"",
|
|
2708
3035
|
source.text,
|
|
2709
3036
|
""
|