@tryformation/querylight-cli 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -11
- package/dist/chunk/chunker.d.ts +3 -1
- package/dist/cli/main.js +1163 -285
- package/dist/cli/run-cli.d.ts +4 -1
- package/dist/core/concurrency.d.ts +1 -0
- package/dist/core/constants.d.ts +3 -1
- package/dist/core/gzip-json.d.ts +3 -0
- package/dist/core/progress.d.ts +4 -0
- package/dist/core/urls.d.ts +1 -0
- package/dist/index/index-store.d.ts +3 -0
- package/dist/index/querylight-indexer.d.ts +3 -1
- package/dist/index.js +540 -141
- package/dist/ingest/adapters/website-adapter.d.ts +6 -1
- package/dist/ingest/adapters/website-feed-discovery.d.ts +6 -0
- package/dist/ingest/extractors/html-extractor.d.ts +1 -0
- package/dist/ingest/ingest-service.d.ts +5 -2
- package/dist/types/models.d.ts +2 -2
- package/dist/vector/dense.d.ts +3 -1
- package/dist/vector/runtime.d.ts +2 -0
- package/dist/vector/service.d.ts +20 -2
- package/dist/vector/sparse.d.ts +3 -1
- package/dist/vector/store.d.ts +8 -2
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -20,6 +20,15 @@ var CliError = class extends Error {
|
|
|
20
20
|
import { readFile, writeFile } from "fs/promises";
|
|
21
21
|
import path from "path";
|
|
22
22
|
import YAML from "yaml";
|
|
23
|
+
|
|
24
|
+
// src/core/constants.ts
|
|
25
|
+
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
26
|
+
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
27
|
+
|
|
28
|
+
// src/core/config.ts
|
|
29
|
+
function normalizeModelCacheDir(configuredPath) {
|
|
30
|
+
return configuredPath === LEGACY_WORKSPACE_MODEL_CACHE_DIR ? DEFAULT_SHARED_MODEL_CACHE_DIR : configuredPath;
|
|
31
|
+
}
|
|
23
32
|
var defaultConfig = () => ({
|
|
24
33
|
workspaceVersion: 1,
|
|
25
34
|
index: {
|
|
@@ -47,17 +56,17 @@ var defaultConfig = () => ({
|
|
|
47
56
|
retrieval: {
|
|
48
57
|
defaultMode: "lexical",
|
|
49
58
|
dense: {
|
|
50
|
-
enabled:
|
|
59
|
+
enabled: true,
|
|
51
60
|
modelId: "Xenova/all-MiniLM-L6-v2",
|
|
52
|
-
cacheDir:
|
|
61
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
53
62
|
indexHashTables: 8,
|
|
54
63
|
indexRandomSeed: 42,
|
|
55
64
|
chunkTextMode: "title-heading-text"
|
|
56
65
|
},
|
|
57
66
|
sparse: {
|
|
58
|
-
enabled:
|
|
67
|
+
enabled: true,
|
|
59
68
|
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
|
|
60
|
-
cacheDir:
|
|
69
|
+
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
61
70
|
documentTopTokens: 128,
|
|
62
71
|
queryEncoding: "tokenizer-token-weights",
|
|
63
72
|
documentEncoding: "masked-lm-max-log1p-relu",
|
|
@@ -68,6 +77,7 @@ var defaultConfig = () => ({
|
|
|
68
77
|
defaultUserAgent: "querylight-cli/0.1",
|
|
69
78
|
obeyRobotsTxt: true,
|
|
70
79
|
rateLimitMs: 1e3,
|
|
80
|
+
maxConcurrentRequests: 5,
|
|
71
81
|
renderJs: false,
|
|
72
82
|
retentionDays: 365,
|
|
73
83
|
fetchArticles: true
|
|
@@ -118,11 +128,13 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
118
128
|
...parsed.retrieval ?? {},
|
|
119
129
|
dense: {
|
|
120
130
|
...defaults.retrieval.dense,
|
|
121
|
-
...parsed.retrieval?.dense ?? {}
|
|
131
|
+
...parsed.retrieval?.dense ?? {},
|
|
132
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.dense?.cacheDir ?? defaults.retrieval.dense.cacheDir)
|
|
122
133
|
},
|
|
123
134
|
sparse: {
|
|
124
135
|
...defaults.retrieval.sparse,
|
|
125
|
-
...parsed.retrieval?.sparse ?? {}
|
|
136
|
+
...parsed.retrieval?.sparse ?? {},
|
|
137
|
+
cacheDir: normalizeModelCacheDir(parsed.retrieval?.sparse?.cacheDir ?? defaults.retrieval.sparse.cacheDir)
|
|
126
138
|
}
|
|
127
139
|
},
|
|
128
140
|
crawler: {
|
|
@@ -145,8 +157,6 @@ var DIRS = [
|
|
|
145
157
|
"normalized",
|
|
146
158
|
"indexes",
|
|
147
159
|
"vectors",
|
|
148
|
-
"models",
|
|
149
|
-
"models/huggingface",
|
|
150
160
|
"runs",
|
|
151
161
|
"logs"
|
|
152
162
|
];
|
|
@@ -275,6 +285,27 @@ async function saveChunks(workspacePath, chunks) {
|
|
|
275
285
|
await writeJsonl(chunksFile(workspacePath), chunks.sort((a, b) => a.id.localeCompare(b.id)));
|
|
276
286
|
}
|
|
277
287
|
|
|
288
|
+
// src/core/concurrency.ts
|
|
289
|
+
async function mapWithConcurrency(items, limit, worker) {
|
|
290
|
+
if (items.length === 0) {
|
|
291
|
+
return;
|
|
292
|
+
}
|
|
293
|
+
const concurrency = Math.max(1, Math.floor(limit));
|
|
294
|
+
let nextIndex = 0;
|
|
295
|
+
await Promise.all(
|
|
296
|
+
Array.from({ length: Math.min(concurrency, items.length) }, async () => {
|
|
297
|
+
while (true) {
|
|
298
|
+
const index = nextIndex;
|
|
299
|
+
nextIndex += 1;
|
|
300
|
+
if (index >= items.length) {
|
|
301
|
+
return;
|
|
302
|
+
}
|
|
303
|
+
await worker(items[index], index);
|
|
304
|
+
}
|
|
305
|
+
})
|
|
306
|
+
);
|
|
307
|
+
}
|
|
308
|
+
|
|
278
309
|
// src/core/files.ts
|
|
279
310
|
import { stat as stat2 } from "fs/promises";
|
|
280
311
|
async function fileExists(filePath) {
|
|
@@ -286,6 +317,14 @@ async function fileExists(filePath) {
|
|
|
286
317
|
}
|
|
287
318
|
}
|
|
288
319
|
|
|
320
|
+
// src/core/progress.ts
|
|
321
|
+
function reportProgress(progress, message) {
|
|
322
|
+
progress?.("info", message);
|
|
323
|
+
}
|
|
324
|
+
function reportProgressDetail(progress, message) {
|
|
325
|
+
progress?.("detail", message);
|
|
326
|
+
}
|
|
327
|
+
|
|
289
328
|
// src/core/runs.ts
|
|
290
329
|
import path6 from "path";
|
|
291
330
|
async function writeRun(workspacePath, run) {
|
|
@@ -428,9 +467,41 @@ function stripBoilerplate(html) {
|
|
|
428
467
|
|
|
429
468
|
// src/ingest/extractors/html-extractor.ts
|
|
430
469
|
var turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
|
470
|
+
var LOW_SIGNAL_SECTION_SELECTORS = [
|
|
471
|
+
"script",
|
|
472
|
+
"style",
|
|
473
|
+
"noscript",
|
|
474
|
+
"template",
|
|
475
|
+
"[data-blog-service-recommendations]",
|
|
476
|
+
"[data-blog-related-posts]"
|
|
477
|
+
].join(", ");
|
|
431
478
|
function cleanText(value) {
|
|
432
479
|
return value.replace(/\s+/g, " ").trim();
|
|
433
480
|
}
|
|
481
|
+
function pruneLowSignalContent($) {
|
|
482
|
+
$(LOW_SIGNAL_SECTION_SELECTORS).remove();
|
|
483
|
+
$("form").each((_, element) => {
|
|
484
|
+
const action = cleanText($(element).attr("action") ?? "");
|
|
485
|
+
if (action.includes("substack.com/subscribe")) {
|
|
486
|
+
$(element).closest("section").remove();
|
|
487
|
+
}
|
|
488
|
+
});
|
|
489
|
+
}
|
|
490
|
+
function stripEscapedJsonPayloads(markdown) {
|
|
491
|
+
return markdown.split("\n").filter((line) => {
|
|
492
|
+
const trimmed = line.trim();
|
|
493
|
+
if (trimmed.length === 0) {
|
|
494
|
+
return true;
|
|
495
|
+
}
|
|
496
|
+
if (trimmed.length > 300 && /^"?\\?\[\{\\?"[a-z0-9_]+\\?":/i.test(trimmed)) {
|
|
497
|
+
return false;
|
|
498
|
+
}
|
|
499
|
+
if (trimmed.length > 300 && trimmed.includes('\\"permalink\\":') && trimmed.includes('\\"title\\":')) {
|
|
500
|
+
return false;
|
|
501
|
+
}
|
|
502
|
+
return true;
|
|
503
|
+
}).join("\n").replace(/\n{3,}/g, "\n\n").trim();
|
|
504
|
+
}
|
|
434
505
|
function chooseMeaningfulTitle($, fallbackTitle) {
|
|
435
506
|
const candidates = [
|
|
436
507
|
cleanText($("meta[property='og:title']").attr("content") ?? ""),
|
|
@@ -467,14 +538,27 @@ ${parts.join("\n\n")}
|
|
|
467
538
|
function extractHtmlToMarkdown(html) {
|
|
468
539
|
const cleaned = stripBoilerplate(html);
|
|
469
540
|
const $ = load(cleaned);
|
|
541
|
+
pruneLowSignalContent($);
|
|
470
542
|
const fallbackTitle = cleanText($("title").first().text()) || "Untitled";
|
|
471
543
|
const title = chooseMeaningfulTitle($, fallbackTitle);
|
|
472
544
|
const root = $("main").first().html() ?? $.root().html() ?? cleaned;
|
|
473
545
|
return {
|
|
474
|
-
markdown: turndown.turndown(root),
|
|
546
|
+
markdown: stripEscapedJsonPayloads(turndown.turndown(root)),
|
|
475
547
|
title
|
|
476
548
|
};
|
|
477
549
|
}
|
|
550
|
+
function extractCanonicalUriFromHtml(html, baseUrl) {
|
|
551
|
+
const $ = load(html);
|
|
552
|
+
const href = $("link[rel='canonical']").first().attr("href")?.trim();
|
|
553
|
+
if (!href) {
|
|
554
|
+
return null;
|
|
555
|
+
}
|
|
556
|
+
try {
|
|
557
|
+
return new URL(href, baseUrl).href;
|
|
558
|
+
} catch {
|
|
559
|
+
return null;
|
|
560
|
+
}
|
|
561
|
+
}
|
|
478
562
|
function parseDateCandidate(value) {
|
|
479
563
|
const trimmed = value.trim();
|
|
480
564
|
if (!trimmed) {
|
|
@@ -879,6 +963,19 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
879
963
|
// src/ingest/adapters/url-adapter.ts
|
|
880
964
|
import { mkdir as mkdir5, readFile as readFile7, writeFile as writeFile5 } from "fs/promises";
|
|
881
965
|
import path9 from "path";
|
|
966
|
+
|
|
967
|
+
// src/core/urls.ts
|
|
968
|
+
function normalizeRemoteUrl(uri) {
|
|
969
|
+
try {
|
|
970
|
+
const parsed = new URL(uri);
|
|
971
|
+
parsed.hash = "";
|
|
972
|
+
return parsed.href;
|
|
973
|
+
} catch {
|
|
974
|
+
return uri;
|
|
975
|
+
}
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
// src/ingest/adapters/url-adapter.ts
|
|
882
979
|
function buildHttpCache(response, validatedAt) {
|
|
883
980
|
return {
|
|
884
981
|
etag: response.headers.get("etag") ?? void 0,
|
|
@@ -903,12 +1000,13 @@ async function normalizeRemoteDocument({
|
|
|
903
1000
|
responseStatus
|
|
904
1001
|
}) {
|
|
905
1002
|
const extracted = extractHtmlToMarkdown(body);
|
|
1003
|
+
const canonicalUri = normalizeRemoteUrl(extractCanonicalUriFromHtml(body, url) ?? url);
|
|
906
1004
|
const markdown = `# ${extracted.title}
|
|
907
1005
|
|
|
908
1006
|
${extracted.markdown}`;
|
|
909
|
-
const documentId = stableId("doc", source.id,
|
|
1007
|
+
const documentId = stableId("doc", source.id, canonicalUri);
|
|
910
1008
|
const normalizedPath = path9.resolve(workspacePath, "normalized", `${documentId}.md`);
|
|
911
|
-
const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(
|
|
1009
|
+
const rawPath = path9.resolve(workspacePath, "raw", source.id, `${sha256(canonicalUri).slice(0, 12)}.html`);
|
|
912
1010
|
const contentHash = sha256(markdown);
|
|
913
1011
|
const now = (/* @__PURE__ */ new Date()).toISOString();
|
|
914
1012
|
const lastChangedAt = previous?.contentHash === contentHash ? previous.lastChangedAt : now;
|
|
@@ -921,7 +1019,7 @@ ${extracted.markdown}`;
|
|
|
921
1019
|
documentId,
|
|
922
1020
|
sourceId: source.id,
|
|
923
1021
|
title: extracted.title,
|
|
924
|
-
uri:
|
|
1022
|
+
uri: canonicalUri,
|
|
925
1023
|
sourceUri,
|
|
926
1024
|
publicationDate: resolvedPublicationDate,
|
|
927
1025
|
crawledAt,
|
|
@@ -936,8 +1034,9 @@ ${extracted.markdown}`;
|
|
|
936
1034
|
sourceId: source.id,
|
|
937
1035
|
sourceType: source.type,
|
|
938
1036
|
title: extracted.title,
|
|
939
|
-
uri:
|
|
1037
|
+
uri: canonicalUri,
|
|
940
1038
|
sourceUri,
|
|
1039
|
+
canonicalUri,
|
|
941
1040
|
mimeType: "text/html",
|
|
942
1041
|
rawPath,
|
|
943
1042
|
normalizedPath,
|
|
@@ -1111,6 +1210,18 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1111
1210
|
if (url.origin !== baseUrl.origin) {
|
|
1112
1211
|
return false;
|
|
1113
1212
|
}
|
|
1213
|
+
if (url.search.length > 0) {
|
|
1214
|
+
return false;
|
|
1215
|
+
}
|
|
1216
|
+
if (url.pathname.endsWith(".xml")) {
|
|
1217
|
+
return false;
|
|
1218
|
+
}
|
|
1219
|
+
if (url.pathname.includes("/cdn-cgi/")) {
|
|
1220
|
+
return false;
|
|
1221
|
+
}
|
|
1222
|
+
if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
|
|
1223
|
+
return false;
|
|
1224
|
+
}
|
|
1114
1225
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
1115
1226
|
return false;
|
|
1116
1227
|
}
|
|
@@ -1123,56 +1234,75 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1123
1234
|
}
|
|
1124
1235
|
return true;
|
|
1125
1236
|
}
|
|
1126
|
-
|
|
1237
|
+
function delay(ms) {
|
|
1238
|
+
return new Promise((resolve2) => setTimeout(resolve2, ms));
|
|
1239
|
+
}
|
|
1240
|
+
async function crawlWebsite(source, defaults, progress) {
|
|
1127
1241
|
const baseUrl = new URL(source.uri);
|
|
1128
|
-
const userAgent = source.crawl?.userAgent ??
|
|
1242
|
+
const userAgent = source.crawl?.userAgent ?? defaults.userAgent;
|
|
1129
1243
|
const includePatterns = source.crawl?.includePatterns ?? [];
|
|
1130
1244
|
const excludePatterns = source.crawl?.excludePatterns ?? [];
|
|
1131
1245
|
const maxDepth = source.crawl?.maxDepth ?? 2;
|
|
1132
1246
|
const maxPages = source.crawl?.maxPages ?? 100;
|
|
1133
|
-
const rateLimitMs = source.crawl?.rateLimitMs ??
|
|
1247
|
+
const rateLimitMs = source.crawl?.rateLimitMs ?? defaults.rateLimitMs;
|
|
1248
|
+
const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaults.maxConcurrentRequests;
|
|
1134
1249
|
const disallowRules = source.crawl?.obeyRobotsTxt === false ? [] : await fetchRobotsDisallow(baseUrl, userAgent);
|
|
1135
|
-
const queue = [{ url: source.uri, depth: 0 }];
|
|
1136
1250
|
const seen = /* @__PURE__ */ new Set();
|
|
1137
1251
|
const results = [];
|
|
1252
|
+
let currentLevel = [normalizeRemoteUrl(source.uri)];
|
|
1138
1253
|
if (source.crawl?.useSitemap !== false) {
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
}
|
|
1148
|
-
|
|
1149
|
-
const
|
|
1150
|
-
|
|
1151
|
-
|
|
1254
|
+
const sitemapUrls = (await fetchSitemapUrls(baseUrl, userAgent)).map((url) => normalizeRemoteUrl(url));
|
|
1255
|
+
reportProgress(progress, `Discovered ${sitemapUrls.length} sitemap URL${sitemapUrls.length === 1 ? "" : "s"} for ${source.uri}`);
|
|
1256
|
+
currentLevel = [
|
|
1257
|
+
...currentLevel,
|
|
1258
|
+
...sitemapUrls
|
|
1259
|
+
];
|
|
1260
|
+
}
|
|
1261
|
+
for (let depth = 0; depth <= maxDepth && currentLevel.length > 0 && results.length < maxPages; depth += 1) {
|
|
1262
|
+
reportProgress(progress, `Crawl depth ${depth}: evaluating ${currentLevel.length} candidate URL${currentLevel.length === 1 ? "" : "s"}`);
|
|
1263
|
+
const nextLevelCandidates = [];
|
|
1264
|
+
const allowedUrls = [];
|
|
1265
|
+
for (const candidate of currentLevel) {
|
|
1266
|
+
const normalizedCandidate = normalizeRemoteUrl(candidate);
|
|
1267
|
+
if (seen.has(normalizedCandidate)) {
|
|
1268
|
+
continue;
|
|
1269
|
+
}
|
|
1270
|
+
seen.add(normalizedCandidate);
|
|
1271
|
+
const url = new URL(normalizedCandidate);
|
|
1272
|
+
if (!isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules)) {
|
|
1273
|
+
continue;
|
|
1274
|
+
}
|
|
1275
|
+
allowedUrls.push(normalizedCandidate);
|
|
1276
|
+
results.push(normalizedCandidate);
|
|
1277
|
+
reportProgress(progress, `Discovered ${normalizedCandidate}`);
|
|
1278
|
+
if (results.length >= maxPages) {
|
|
1279
|
+
break;
|
|
1280
|
+
}
|
|
1152
1281
|
}
|
|
1153
|
-
|
|
1154
|
-
if (
|
|
1155
|
-
|
|
1282
|
+
reportProgress(progress, `Crawl depth ${depth}: queued ${allowedUrls.length} page${allowedUrls.length === 1 ? "" : "s"} for link extraction`);
|
|
1283
|
+
if (depth >= maxDepth || results.length >= maxPages) {
|
|
1284
|
+
break;
|
|
1156
1285
|
}
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
const
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1286
|
+
await mapWithConcurrency(allowedUrls, maxConcurrentRequests, async (pageUrl) => {
|
|
1287
|
+
const page = new URL(pageUrl);
|
|
1288
|
+
const response = await fetch(page, { headers: { "user-agent": userAgent } });
|
|
1289
|
+
const html = await response.text();
|
|
1290
|
+
const $ = load2(html);
|
|
1291
|
+
$("a[href]").each((_, element) => {
|
|
1292
|
+
const href = $(element).attr("href");
|
|
1293
|
+
if (!href) {
|
|
1294
|
+
return;
|
|
1295
|
+
}
|
|
1296
|
+
try {
|
|
1297
|
+
nextLevelCandidates.push(normalizeRemoteUrl(new URL(href, page).href));
|
|
1298
|
+
} catch {
|
|
1169
1299
|
}
|
|
1170
|
-
}
|
|
1300
|
+
});
|
|
1301
|
+
if (rateLimitMs > 0) {
|
|
1302
|
+
await delay(rateLimitMs);
|
|
1171
1303
|
}
|
|
1172
1304
|
});
|
|
1173
|
-
|
|
1174
|
-
await new Promise((resolve2) => setTimeout(resolve2, rateLimitMs));
|
|
1175
|
-
}
|
|
1305
|
+
currentLevel = nextLevelCandidates;
|
|
1176
1306
|
}
|
|
1177
1307
|
return results;
|
|
1178
1308
|
}
|
|
@@ -1247,6 +1377,8 @@ async function ingestRssSource({
|
|
|
1247
1377
|
source,
|
|
1248
1378
|
previous,
|
|
1249
1379
|
nextDocuments,
|
|
1380
|
+
maxConcurrentRequests,
|
|
1381
|
+
onDocumentProcessed,
|
|
1250
1382
|
onFailure
|
|
1251
1383
|
}) {
|
|
1252
1384
|
if (source.crawl?.fetchArticles === false) {
|
|
@@ -1254,11 +1386,12 @@ async function ingestRssSource({
|
|
|
1254
1386
|
}
|
|
1255
1387
|
const xml = await fetchFeedText(source);
|
|
1256
1388
|
const items = await parseRssFeedDocument(xml, source);
|
|
1389
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
1257
1390
|
let added = 0;
|
|
1258
1391
|
let changed = 0;
|
|
1259
1392
|
let unchanged = 0;
|
|
1260
1393
|
let failed = 0;
|
|
1261
|
-
|
|
1394
|
+
await mapWithConcurrency(items, maxConcurrentRequests, async (item) => {
|
|
1262
1395
|
try {
|
|
1263
1396
|
const probe = previous.get(stableId("doc", source.id, item.url));
|
|
1264
1397
|
const document = await fetchUrlDocument({
|
|
@@ -1269,28 +1402,40 @@ async function ingestRssSource({
|
|
|
1269
1402
|
sourceUri: source.uri,
|
|
1270
1403
|
publicationDate: item.publicationDate
|
|
1271
1404
|
});
|
|
1405
|
+
if (processedDocumentIds.has(document.id)) {
|
|
1406
|
+
return;
|
|
1407
|
+
}
|
|
1408
|
+
processedDocumentIds.add(document.id);
|
|
1409
|
+
const existingDocument = probe ?? previous.get(document.id);
|
|
1272
1410
|
nextDocuments.set(document.id, document);
|
|
1273
|
-
if (!
|
|
1411
|
+
if (!existingDocument) {
|
|
1274
1412
|
added += 1;
|
|
1275
|
-
|
|
1413
|
+
onDocumentProcessed?.(document.uri, "added");
|
|
1414
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
1276
1415
|
changed += 1;
|
|
1416
|
+
onDocumentProcessed?.(document.uri, "changed");
|
|
1277
1417
|
} else {
|
|
1278
1418
|
unchanged += 1;
|
|
1419
|
+
onDocumentProcessed?.(document.uri, "unchanged");
|
|
1279
1420
|
}
|
|
1280
1421
|
} catch (error) {
|
|
1281
1422
|
failed += 1;
|
|
1282
1423
|
onFailure(item.url, error);
|
|
1283
1424
|
}
|
|
1284
|
-
}
|
|
1425
|
+
});
|
|
1285
1426
|
return { added, changed, unchanged, failed };
|
|
1286
1427
|
}
|
|
1287
1428
|
async function ingestSources({
|
|
1288
1429
|
workspacePath,
|
|
1289
1430
|
sourceIds,
|
|
1290
|
-
changedOnly = false
|
|
1431
|
+
changedOnly = false,
|
|
1432
|
+
progress
|
|
1291
1433
|
}) {
|
|
1292
1434
|
const config = await loadConfig(workspacePath);
|
|
1293
1435
|
const defaultRetentionDays = config.crawler.retentionDays;
|
|
1436
|
+
const defaultUserAgent = config.crawler.defaultUserAgent;
|
|
1437
|
+
const defaultRateLimitMs = config.crawler.rateLimitMs;
|
|
1438
|
+
const defaultMaxConcurrentRequests = config.crawler.maxConcurrentRequests;
|
|
1294
1439
|
const sources = (await listSources(workspacePath)).filter((source) => source.enabled && (!sourceIds || sourceIds.includes(source.id)));
|
|
1295
1440
|
const existing = await loadDocuments(workspacePath);
|
|
1296
1441
|
const previous = previousMap(existing);
|
|
@@ -1300,20 +1445,38 @@ async function ingestSources({
|
|
|
1300
1445
|
let unchanged = 0;
|
|
1301
1446
|
let failed = 0;
|
|
1302
1447
|
const failures = [];
|
|
1448
|
+
reportProgress(progress, `Ingesting ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
1303
1449
|
for (const source of sources) {
|
|
1450
|
+
const maxConcurrentRequests = source.crawl?.maxConcurrentRequests ?? defaultMaxConcurrentRequests;
|
|
1451
|
+
const sourceBefore = { added, changed, unchanged, failed };
|
|
1452
|
+
const processedDocumentIds = /* @__PURE__ */ new Set();
|
|
1453
|
+
const reportDocumentOutcome = (uri, outcome) => {
|
|
1454
|
+
const label = outcome === "unchanged" ? "Unchanged" : outcome === "changed" ? "Updated" : "Added";
|
|
1455
|
+
reportProgress(progress, `${label} ${uri}`);
|
|
1456
|
+
};
|
|
1304
1457
|
const ingestOne = async (uri, producer) => {
|
|
1305
1458
|
try {
|
|
1306
1459
|
const probeId = stableId("doc", source.id, uri);
|
|
1307
1460
|
const earlier = previous.get(probeId);
|
|
1308
1461
|
const document = await producer();
|
|
1462
|
+
if (processedDocumentIds.has(document.id)) {
|
|
1463
|
+
reportProgressDetail(progress, `Skipped duplicate alias ${uri} -> ${document.uri}`);
|
|
1464
|
+
return null;
|
|
1465
|
+
}
|
|
1466
|
+
processedDocumentIds.add(document.id);
|
|
1467
|
+
const existingDocument = earlier ?? previous.get(document.id);
|
|
1309
1468
|
nextDocuments.set(document.id, document);
|
|
1310
|
-
if (!
|
|
1469
|
+
if (!existingDocument) {
|
|
1311
1470
|
added += 1;
|
|
1312
|
-
|
|
1471
|
+
reportDocumentOutcome(document.uri, "added");
|
|
1472
|
+
} else if (existingDocument.contentHash !== document.contentHash) {
|
|
1313
1473
|
changed += 1;
|
|
1474
|
+
reportDocumentOutcome(document.uri, "changed");
|
|
1314
1475
|
} else {
|
|
1315
1476
|
unchanged += 1;
|
|
1477
|
+
reportDocumentOutcome(document.uri, "unchanged");
|
|
1316
1478
|
}
|
|
1479
|
+
return document;
|
|
1317
1480
|
} catch (error) {
|
|
1318
1481
|
failed += 1;
|
|
1319
1482
|
failures.push({
|
|
@@ -1321,50 +1484,69 @@ async function ingestSources({
|
|
|
1321
1484
|
uri,
|
|
1322
1485
|
message: error instanceof Error ? error.message : String(error)
|
|
1323
1486
|
});
|
|
1487
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
1488
|
+
return null;
|
|
1324
1489
|
}
|
|
1325
1490
|
};
|
|
1326
1491
|
try {
|
|
1492
|
+
reportProgress(progress, `Source ${source.name} (${source.type})`);
|
|
1327
1493
|
if (source.type === "file") {
|
|
1494
|
+
reportProgress(progress, `Reading file ${source.uri}`);
|
|
1328
1495
|
await ingestOne(source.uri, () => ingestFile({ workspacePath, source, filePath: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
for (const filePath of
|
|
1496
|
+
} else if (source.type === "directory") {
|
|
1497
|
+
const files = await listDirectoryFiles(source);
|
|
1498
|
+
reportProgress(progress, `Scanning ${files.length} file${files.length === 1 ? "" : "s"} from ${source.uri}`);
|
|
1499
|
+
for (const filePath of files) {
|
|
1500
|
+
reportProgress(progress, `Reading file ${filePath}`);
|
|
1333
1501
|
await ingestOne(filePath, () => ingestFile({ workspacePath, source, filePath, previous: previous.get(stableId("doc", source.id, filePath)) }));
|
|
1334
1502
|
}
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
if (source.type === "url") {
|
|
1503
|
+
} else if (source.type === "url") {
|
|
1504
|
+
reportProgress(progress, `Fetching ${source.uri}`);
|
|
1338
1505
|
await ingestOne(source.uri, () => fetchUrlDocument({ workspacePath, source, url: source.uri, previous: previous.get(stableId("doc", source.id, source.uri)) }));
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1506
|
+
} else if (source.type === "website") {
|
|
1507
|
+
reportProgress(progress, `Crawling ${source.uri}`);
|
|
1508
|
+
const urls = await crawlWebsite(source, {
|
|
1509
|
+
userAgent: defaultUserAgent,
|
|
1510
|
+
rateLimitMs: defaultRateLimitMs,
|
|
1511
|
+
maxConcurrentRequests
|
|
1512
|
+
}, progress);
|
|
1513
|
+
reportProgress(progress, `Fetched ${urls.length} page${urls.length === 1 ? "" : "s"} from crawl`);
|
|
1514
|
+
const seenCanonicalUrls = /* @__PURE__ */ new Set();
|
|
1515
|
+
await mapWithConcurrency(urls, maxConcurrentRequests, async (url) => {
|
|
1516
|
+
if (seenCanonicalUrls.has(url)) {
|
|
1517
|
+
reportProgressDetail(progress, `Skipped canonical duplicate ${url}`);
|
|
1518
|
+
return;
|
|
1519
|
+
}
|
|
1520
|
+
reportProgress(progress, `Fetching ${url}`);
|
|
1521
|
+
const document = await ingestOne(url, () => fetchUrlDocument({ workspacePath, source, url, previous: previous.get(stableId("doc", source.id, url)) }));
|
|
1522
|
+
if (document) {
|
|
1523
|
+
seenCanonicalUrls.add(document.uri);
|
|
1524
|
+
}
|
|
1525
|
+
});
|
|
1526
|
+
} else if (source.type === "rss") {
|
|
1527
|
+
reportProgress(progress, `Fetching feed ${source.uri}`);
|
|
1348
1528
|
const result = await ingestRssSource({
|
|
1349
1529
|
workspacePath,
|
|
1350
1530
|
source,
|
|
1351
1531
|
previous,
|
|
1352
1532
|
nextDocuments,
|
|
1533
|
+
maxConcurrentRequests,
|
|
1534
|
+
onDocumentProcessed: reportDocumentOutcome,
|
|
1353
1535
|
onFailure: (uri, error) => {
|
|
1354
1536
|
failures.push({
|
|
1355
1537
|
sourceId: source.id,
|
|
1356
1538
|
uri,
|
|
1357
1539
|
message: error instanceof Error ? error.message : String(error)
|
|
1358
1540
|
});
|
|
1541
|
+
reportProgressDetail(progress, `Failed ${uri}: ${error instanceof Error ? error.message : String(error)}`);
|
|
1359
1542
|
}
|
|
1360
1543
|
});
|
|
1361
1544
|
added += result.added;
|
|
1362
1545
|
changed += result.changed;
|
|
1363
1546
|
unchanged += result.unchanged;
|
|
1364
1547
|
failed += result.failed;
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
if (source.type === "markdown" || source.type === "text") {
|
|
1548
|
+
} else if (source.type === "markdown" || source.type === "text") {
|
|
1549
|
+
reportProgress(progress, `Processing inline ${source.type} source ${source.id}`);
|
|
1368
1550
|
await ingestOne(source.uri, () => ingestInlineContent({
|
|
1369
1551
|
workspacePath,
|
|
1370
1552
|
source,
|
|
@@ -1381,13 +1563,19 @@ async function ingestSources({
|
|
|
1381
1563
|
uri: source.uri,
|
|
1382
1564
|
message: error instanceof Error ? error.message : String(error)
|
|
1383
1565
|
});
|
|
1566
|
+
reportProgressDetail(progress, `Failed source ${source.name}: ${error instanceof Error ? error.message : String(error)}`);
|
|
1384
1567
|
}
|
|
1568
|
+
reportProgress(
|
|
1569
|
+
progress,
|
|
1570
|
+
`Finished ${source.name}: +${added - sourceBefore.added} added, ${changed - sourceBefore.changed} changed, ${unchanged - sourceBefore.unchanged} unchanged, ${failed - sourceBefore.failed} failed`
|
|
1571
|
+
);
|
|
1385
1572
|
}
|
|
1386
1573
|
const expiringDocuments = [...nextDocuments.values()].filter((document) => {
|
|
1387
1574
|
const source = sources.find((candidate) => candidate.id === document.sourceId);
|
|
1388
1575
|
return source ? shouldExpireRssDocument(document, source, defaultRetentionDays) : false;
|
|
1389
1576
|
});
|
|
1390
1577
|
if (expiringDocuments.length > 0) {
|
|
1578
|
+
reportProgress(progress, `Removing ${expiringDocuments.length} expired RSS document${expiringDocuments.length === 1 ? "" : "s"}`);
|
|
1391
1579
|
const expiredIds = new Set(expiringDocuments.map((document) => document.id));
|
|
1392
1580
|
for (const document of expiringDocuments) {
|
|
1393
1581
|
nextDocuments.delete(document.id);
|
|
@@ -1414,6 +1602,7 @@ async function ingestSources({
|
|
|
1414
1602
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
1415
1603
|
};
|
|
1416
1604
|
await writeRun(workspacePath, run);
|
|
1605
|
+
reportProgress(progress, `Ingest complete: ${added} added, ${changed} changed, ${unchanged} unchanged, ${failed} failed`);
|
|
1417
1606
|
return {
|
|
1418
1607
|
runId: id,
|
|
1419
1608
|
documents: { added, changed, unchanged, failed },
|
|
@@ -1423,7 +1612,8 @@ async function ingestSources({
|
|
|
1423
1612
|
async function reprocessDocuments({
|
|
1424
1613
|
workspacePath,
|
|
1425
1614
|
sourceId,
|
|
1426
|
-
documentId
|
|
1615
|
+
documentId,
|
|
1616
|
+
progress
|
|
1427
1617
|
}) {
|
|
1428
1618
|
const documents = await loadDocuments(workspacePath);
|
|
1429
1619
|
const sources = await listSources(workspacePath);
|
|
@@ -1431,15 +1621,20 @@ async function reprocessDocuments({
|
|
|
1431
1621
|
const nextDocuments = new Map(documents.map((document) => [document.id, document]));
|
|
1432
1622
|
let documentsReprocessed = 0;
|
|
1433
1623
|
let documentsSkipped = 0;
|
|
1434
|
-
|
|
1624
|
+
const targets = documents.filter((candidate) => (!sourceId || candidate.sourceId === sourceId) && (!documentId || candidate.id === documentId));
|
|
1625
|
+
reportProgress(progress, `Reprocessing ${targets.length} document${targets.length === 1 ? "" : "s"}`);
|
|
1626
|
+
for (const document of targets) {
|
|
1627
|
+
reportProgressDetail(progress, `Reprocessing ${document.id} (${document.title})`);
|
|
1435
1628
|
const source = sourceMap.get(document.sourceId);
|
|
1436
1629
|
if (!source || !document.rawPath || !await fileExists(document.rawPath)) {
|
|
1437
1630
|
documentsSkipped += 1;
|
|
1631
|
+
reportProgressDetail(progress, `Skipped ${document.id}: raw source not available`);
|
|
1438
1632
|
continue;
|
|
1439
1633
|
}
|
|
1440
1634
|
const updated = source.type === "url" || source.type === "website" || source.type === "rss" ? await reprocessRemoteDocument(document, source) : await reprocessStoredDocument(document, source);
|
|
1441
1635
|
if (!updated) {
|
|
1442
1636
|
documentsSkipped += 1;
|
|
1637
|
+
reportProgressDetail(progress, `Skipped ${document.id}: source type could not be reprocessed`);
|
|
1443
1638
|
continue;
|
|
1444
1639
|
}
|
|
1445
1640
|
nextDocuments.set(updated.id, updated);
|
|
@@ -1459,6 +1654,7 @@ async function reprocessDocuments({
|
|
|
1459
1654
|
},
|
|
1460
1655
|
documentsSnapshot: documentSnapshot(finalDocuments)
|
|
1461
1656
|
});
|
|
1657
|
+
reportProgress(progress, `Reprocess complete: ${documentsReprocessed} updated, ${documentsSkipped} skipped`);
|
|
1462
1658
|
return { runId: id, documentsReprocessed, documentsSkipped };
|
|
1463
1659
|
}
|
|
1464
1660
|
|
|
@@ -1560,11 +1756,13 @@ function buildChunksForDocument(document, markdown, config, prior = /* @__PURE__
|
|
|
1560
1756
|
async function chunkDocuments({
|
|
1561
1757
|
workspacePath,
|
|
1562
1758
|
sourceId,
|
|
1563
|
-
documentId
|
|
1759
|
+
documentId,
|
|
1760
|
+
progress
|
|
1564
1761
|
}) {
|
|
1565
1762
|
const config = await loadConfig(workspacePath);
|
|
1566
1763
|
const documents = await readJsonl(path11.join(workspacePath, "documents", "documents.jsonl"));
|
|
1567
1764
|
const filtered = documents.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId));
|
|
1765
|
+
reportProgress(progress, `Chunking ${filtered.length} document${filtered.length === 1 ? "" : "s"}`);
|
|
1568
1766
|
const targetedDocumentIds = new Set(filtered.map((document) => document.id));
|
|
1569
1767
|
const existingChunks = await loadChunks(workspacePath);
|
|
1570
1768
|
const prior = new Map(existingChunks.map((chunk) => [chunk.id, chunk]));
|
|
@@ -1572,12 +1770,14 @@ async function chunkDocuments({
|
|
|
1572
1770
|
existingChunks.filter((chunk) => !targetedDocumentIds.has(chunk.documentId)).map((chunk) => [chunk.id, chunk])
|
|
1573
1771
|
);
|
|
1574
1772
|
for (const document of filtered) {
|
|
1773
|
+
reportProgressDetail(progress, `Chunking ${document.id} (${document.title})`);
|
|
1575
1774
|
const raw = await readFile8(document.normalizedPath, "utf8");
|
|
1576
1775
|
for (const chunk of buildChunksForDocument(document, raw, config, prior)) {
|
|
1577
1776
|
nextChunks.set(chunk.id, chunk);
|
|
1578
1777
|
}
|
|
1579
1778
|
}
|
|
1580
1779
|
await saveChunks(workspacePath, [...nextChunks.values()]);
|
|
1780
|
+
reportProgress(progress, `Chunking complete: ${nextChunks.size} chunk${nextChunks.size === 1 ? "" : "s"} written`);
|
|
1581
1781
|
return { chunksWritten: nextChunks.size };
|
|
1582
1782
|
}
|
|
1583
1783
|
|
|
@@ -1586,15 +1786,31 @@ import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, Ranking
|
|
|
1586
1786
|
import path17 from "path";
|
|
1587
1787
|
|
|
1588
1788
|
// src/vector/dense.ts
|
|
1589
|
-
import { VectorFieldIndex, createSeededRandom } from "@tryformation/querylight-ts";
|
|
1789
|
+
import { VectorFieldIndex, cosineSimilarity, createSeededRandom } from "@tryformation/querylight-ts";
|
|
1590
1790
|
import { mkdir as mkdir7 } from "fs/promises";
|
|
1591
1791
|
import path14 from "path";
|
|
1592
1792
|
|
|
1593
1793
|
// src/vector/runtime.ts
|
|
1794
|
+
import os from "os";
|
|
1594
1795
|
import path12 from "path";
|
|
1595
1796
|
import { fileURLToPath } from "url";
|
|
1596
1797
|
import { execFile, execFileSync } from "child_process";
|
|
1798
|
+
function resolveQliHomeDir() {
|
|
1799
|
+
return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
|
|
1800
|
+
}
|
|
1597
1801
|
function resolveCacheDir(workspacePath, configuredPath) {
|
|
1802
|
+
if (configuredPath === "~/.qli") {
|
|
1803
|
+
return resolveQliHomeDir();
|
|
1804
|
+
}
|
|
1805
|
+
if (configuredPath.startsWith("~/.qli/")) {
|
|
1806
|
+
return path12.join(resolveQliHomeDir(), configuredPath.slice("~/.qli/".length));
|
|
1807
|
+
}
|
|
1808
|
+
if (configuredPath === "~") {
|
|
1809
|
+
return os.homedir();
|
|
1810
|
+
}
|
|
1811
|
+
if (configuredPath.startsWith("~/")) {
|
|
1812
|
+
return path12.join(os.homedir(), configuredPath.slice(2));
|
|
1813
|
+
}
|
|
1598
1814
|
return path12.isAbsolute(configuredPath) ? configuredPath : path12.resolve(workspacePath, configuredPath.replace(/^\.kb\//, ""));
|
|
1599
1815
|
}
|
|
1600
1816
|
function packageRootFromImportMeta(importMetaUrl) {
|
|
@@ -1618,6 +1834,14 @@ async function ensureUvAvailable() {
|
|
|
1618
1834
|
execFile("uv", ["--version"], (error) => error ? reject(error) : resolve2());
|
|
1619
1835
|
});
|
|
1620
1836
|
}
|
|
1837
|
+
async function isUvAvailable() {
|
|
1838
|
+
try {
|
|
1839
|
+
await ensureUvAvailable();
|
|
1840
|
+
return true;
|
|
1841
|
+
} catch {
|
|
1842
|
+
return false;
|
|
1843
|
+
}
|
|
1844
|
+
}
|
|
1621
1845
|
async function runSparsePython({
|
|
1622
1846
|
workspacePath,
|
|
1623
1847
|
config,
|
|
@@ -1661,47 +1885,95 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
1661
1885
|
}
|
|
1662
1886
|
|
|
1663
1887
|
// src/vector/store.ts
|
|
1664
|
-
import { mkdir as mkdir6,
|
|
1888
|
+
import { mkdir as mkdir6, rm as rm2, writeFile as writeFile7 } from "fs/promises";
|
|
1665
1889
|
import path13 from "path";
|
|
1890
|
+
|
|
1891
|
+
// src/core/gzip-json.ts
|
|
1892
|
+
import { readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
|
|
1893
|
+
import { promisify } from "util";
|
|
1894
|
+
import { gunzip, gzip } from "zlib";
|
|
1895
|
+
var gzipAsync = promisify(gzip);
|
|
1896
|
+
var gunzipAsync = promisify(gunzip);
|
|
1897
|
+
async function writeGzipJson(filePath, value) {
|
|
1898
|
+
const payload = JSON.stringify(value, null, 2);
|
|
1899
|
+
await writeFile6(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
|
|
1900
|
+
}
|
|
1901
|
+
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
|
|
1902
|
+
if (await fileExists(gzipPath)) {
|
|
1903
|
+
const payload = await readFile9(gzipPath);
|
|
1904
|
+
return JSON.parse((await gunzipAsync(payload)).toString("utf8"));
|
|
1905
|
+
}
|
|
1906
|
+
if (legacyPath && await fileExists(legacyPath)) {
|
|
1907
|
+
return JSON.parse(await readFile9(legacyPath, "utf8"));
|
|
1908
|
+
}
|
|
1909
|
+
return JSON.parse(await readFile9(gzipPath, "utf8"));
|
|
1910
|
+
}
|
|
1911
|
+
|
|
1912
|
+
// src/vector/store.ts
|
|
1666
1913
|
function vectorsDir(workspacePath) {
|
|
1667
1914
|
return path13.join(workspacePath, "vectors");
|
|
1668
1915
|
}
|
|
1669
|
-
function
|
|
1670
|
-
return path13.join(
|
|
1916
|
+
function sharedModelStateDir() {
|
|
1917
|
+
return path13.join(resolveQliHomeDir(), "models", "status");
|
|
1671
1918
|
}
|
|
1672
1919
|
function denseVectorPath(workspacePath) {
|
|
1673
|
-
return path13.join(vectorsDir(workspacePath), "dense.latest.json");
|
|
1920
|
+
return path13.join(vectorsDir(workspacePath), "dense.latest.json.gz");
|
|
1674
1921
|
}
|
|
1675
1922
|
function denseMetaPath(workspacePath) {
|
|
1676
|
-
return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json");
|
|
1923
|
+
return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json.gz");
|
|
1677
1924
|
}
|
|
1678
1925
|
function sparseVectorPath(workspacePath) {
|
|
1679
|
-
return path13.join(vectorsDir(workspacePath), "sparse.latest.json");
|
|
1926
|
+
return path13.join(vectorsDir(workspacePath), "sparse.latest.json.gz");
|
|
1680
1927
|
}
|
|
1681
1928
|
function sparseMetaPath(workspacePath) {
|
|
1929
|
+
return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json.gz");
|
|
1930
|
+
}
|
|
1931
|
+
function legacyDenseVectorPath(workspacePath) {
|
|
1932
|
+
return path13.join(vectorsDir(workspacePath), "dense.latest.json");
|
|
1933
|
+
}
|
|
1934
|
+
function legacyDenseMetaPath(workspacePath) {
|
|
1935
|
+
return path13.join(vectorsDir(workspacePath), "dense.latest.meta.json");
|
|
1936
|
+
}
|
|
1937
|
+
function legacySparseVectorPath(workspacePath) {
|
|
1938
|
+
return path13.join(vectorsDir(workspacePath), "sparse.latest.json");
|
|
1939
|
+
}
|
|
1940
|
+
function legacySparseMetaPath(workspacePath) {
|
|
1682
1941
|
return path13.join(vectorsDir(workspacePath), "sparse.latest.meta.json");
|
|
1683
1942
|
}
|
|
1684
|
-
function
|
|
1685
|
-
|
|
1943
|
+
function pullMarkerPath(type, workspacePath, modelId, cacheDir) {
|
|
1944
|
+
const resolvedCacheDir = resolveCacheDir(workspacePath, cacheDir);
|
|
1945
|
+
const cacheKey = sha256(resolvedCacheDir).slice(0, 16);
|
|
1946
|
+
return path13.join(sharedModelStateDir(), type, `${encodeURIComponent(modelId)}.${cacheKey}.json`);
|
|
1947
|
+
}
|
|
1948
|
+
function densePullMarker(workspacePath, modelId, cacheDir) {
|
|
1949
|
+
return pullMarkerPath("dense", workspacePath, modelId, cacheDir);
|
|
1686
1950
|
}
|
|
1687
|
-
function sparsePullMarker(workspacePath) {
|
|
1688
|
-
return
|
|
1951
|
+
function sparsePullMarker(workspacePath, modelId, cacheDir) {
|
|
1952
|
+
return pullMarkerPath("sparse", workspacePath, modelId, cacheDir);
|
|
1689
1953
|
}
|
|
1690
1954
|
async function writeDensePayload(workspacePath, payload) {
|
|
1691
1955
|
await mkdir6(vectorsDir(workspacePath), { recursive: true });
|
|
1692
|
-
await
|
|
1693
|
-
await
|
|
1956
|
+
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
1957
|
+
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
1958
|
+
await Promise.all([
|
|
1959
|
+
rm2(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
1960
|
+
rm2(legacyDenseMetaPath(workspacePath), { force: true })
|
|
1961
|
+
]);
|
|
1694
1962
|
}
|
|
1695
1963
|
async function readDensePayload(workspacePath) {
|
|
1696
|
-
return
|
|
1964
|
+
return readJsonFromGzipOrFile(denseVectorPath(workspacePath), legacyDenseVectorPath(workspacePath));
|
|
1697
1965
|
}
|
|
1698
1966
|
async function writeSparsePayload(workspacePath, payload) {
|
|
1699
1967
|
await mkdir6(vectorsDir(workspacePath), { recursive: true });
|
|
1700
|
-
await
|
|
1701
|
-
await
|
|
1968
|
+
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
1969
|
+
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
1970
|
+
await Promise.all([
|
|
1971
|
+
rm2(legacySparseVectorPath(workspacePath), { force: true }),
|
|
1972
|
+
rm2(legacySparseMetaPath(workspacePath), { force: true })
|
|
1973
|
+
]);
|
|
1702
1974
|
}
|
|
1703
1975
|
async function readSparsePayload(workspacePath) {
|
|
1704
|
-
return
|
|
1976
|
+
return readJsonFromGzipOrFile(sparseVectorPath(workspacePath), legacySparseVectorPath(workspacePath));
|
|
1705
1977
|
}
|
|
1706
1978
|
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
1707
1979
|
const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
|
|
@@ -1711,30 +1983,72 @@ async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
|
1711
1983
|
configured: dense.enabled,
|
|
1712
1984
|
modelId: dense.modelId,
|
|
1713
1985
|
cacheDir: denseCacheDir,
|
|
1714
|
-
available: await fileExists(densePullMarker(workspacePath)),
|
|
1715
|
-
artifactExists: await fileExists(denseVectorPath(workspacePath))
|
|
1986
|
+
available: await fileExists(densePullMarker(workspacePath, dense.modelId, dense.cacheDir)),
|
|
1987
|
+
artifactExists: await fileExists(denseVectorPath(workspacePath)) || await fileExists(legacyDenseVectorPath(workspacePath))
|
|
1716
1988
|
},
|
|
1717
1989
|
sparse: {
|
|
1718
1990
|
configured: sparse.enabled,
|
|
1719
1991
|
modelId: sparse.modelId,
|
|
1720
1992
|
cacheDir: sparseCacheDir,
|
|
1721
1993
|
uvAvailable,
|
|
1722
|
-
available: await fileExists(sparsePullMarker(workspacePath)),
|
|
1723
|
-
artifactExists: await fileExists(sparseVectorPath(workspacePath))
|
|
1994
|
+
available: await fileExists(sparsePullMarker(workspacePath, sparse.modelId, sparse.cacheDir)),
|
|
1995
|
+
artifactExists: await fileExists(sparseVectorPath(workspacePath)) || await fileExists(legacySparseVectorPath(workspacePath))
|
|
1724
1996
|
}
|
|
1725
1997
|
};
|
|
1726
1998
|
}
|
|
1727
1999
|
|
|
1728
2000
|
// src/vector/text.ts
|
|
2001
|
+
var LOW_SIGNAL_HEADINGS = /* @__PURE__ */ new Set([
|
|
2002
|
+
"choose this instead of",
|
|
2003
|
+
"how xyz runs it",
|
|
2004
|
+
"naechste schritte",
|
|
2005
|
+
"next steps",
|
|
2006
|
+
"overview",
|
|
2007
|
+
"passend wenn",
|
|
2008
|
+
"problem",
|
|
2009
|
+
"right fit",
|
|
2010
|
+
"waehlen sie das stattdessen",
|
|
2011
|
+
"was sie bekommen",
|
|
2012
|
+
"what you get",
|
|
2013
|
+
"wie xyz es umsetzt",
|
|
2014
|
+
"uberblick",
|
|
2015
|
+
"\xFCberblick"
|
|
2016
|
+
]);
|
|
2017
|
+
function normalizeHeading(value) {
|
|
2018
|
+
return value.trim().toLowerCase();
|
|
2019
|
+
}
|
|
2020
|
+
function isLowSignalHeading(value) {
|
|
2021
|
+
return LOW_SIGNAL_HEADINGS.has(normalizeHeading(value));
|
|
2022
|
+
}
|
|
2023
|
+
function stripLeadingHeading(text, heading) {
|
|
2024
|
+
const lines = text.split("\n");
|
|
2025
|
+
const firstContentIndex = lines.findIndex((line) => line.trim().length > 0);
|
|
2026
|
+
if (firstContentIndex < 0) {
|
|
2027
|
+
return text;
|
|
2028
|
+
}
|
|
2029
|
+
const match = /^(#{1,6})\s+(.+)$/.exec(lines[firstContentIndex] ?? "");
|
|
2030
|
+
if (!match?.[2] || normalizeHeading(match[2]) !== normalizeHeading(heading)) {
|
|
2031
|
+
return text;
|
|
2032
|
+
}
|
|
2033
|
+
const next = [...lines.slice(0, firstContentIndex), ...lines.slice(firstContentIndex + 1)].join("\n").trim();
|
|
2034
|
+
return next;
|
|
2035
|
+
}
|
|
2036
|
+
function createVectorText(chunk) {
|
|
2037
|
+
const meaningfulHeadings = chunk.headingPath.filter((heading) => !isLowSignalHeading(heading) && normalizeHeading(heading) !== normalizeHeading(chunk.title));
|
|
2038
|
+
const textHeading = [...chunk.headingPath].reverse().find((heading) => isLowSignalHeading(heading) || normalizeHeading(heading) === normalizeHeading(chunk.title));
|
|
2039
|
+
const body = textHeading ? stripLeadingHeading(chunk.text, textHeading) : chunk.text.trim();
|
|
2040
|
+
return [chunk.title, ...meaningfulHeadings, body].filter(Boolean).join("\n\n");
|
|
2041
|
+
}
|
|
1729
2042
|
function createDenseChunkText(chunk) {
|
|
1730
|
-
return
|
|
2043
|
+
return createVectorText(chunk);
|
|
1731
2044
|
}
|
|
1732
2045
|
function createSparseChunkText(chunk) {
|
|
1733
|
-
return
|
|
2046
|
+
return createVectorText(chunk);
|
|
1734
2047
|
}
|
|
1735
2048
|
|
|
1736
2049
|
// src/vector/dense.ts
|
|
1737
2050
|
var denseEmbedderFactory = null;
|
|
2051
|
+
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
1738
2052
|
async function createEmbedder(cacheDir, modelId) {
|
|
1739
2053
|
if (denseEmbedderFactory) {
|
|
1740
2054
|
return denseEmbedderFactory(cacheDir, modelId);
|
|
@@ -1746,9 +2060,13 @@ async function createEmbedder(cacheDir, modelId) {
|
|
|
1746
2060
|
return output.tolist()[0];
|
|
1747
2061
|
};
|
|
1748
2062
|
}
|
|
2063
|
+
function exactDenseQuery(payload, vector, topK) {
|
|
2064
|
+
return payload.chunks.map((chunk) => [chunk.chunkId, cosineSimilarity(vector, chunk.embedding)]).sort((left, right) => right[1] - left[1]).slice(0, topK);
|
|
2065
|
+
}
|
|
1749
2066
|
async function buildDenseVectors({
|
|
1750
2067
|
workspacePath,
|
|
1751
|
-
config
|
|
2068
|
+
config,
|
|
2069
|
+
progress
|
|
1752
2070
|
}) {
|
|
1753
2071
|
const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
1754
2072
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
@@ -1756,6 +2074,7 @@ async function buildDenseVectors({
|
|
|
1756
2074
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
1757
2075
|
const records = [];
|
|
1758
2076
|
let dimensions = 0;
|
|
2077
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
1759
2078
|
for (const chunk of chunks) {
|
|
1760
2079
|
const embedding = await embed(createDenseChunkText(chunk));
|
|
1761
2080
|
dimensions ||= embedding.length;
|
|
@@ -1769,7 +2088,11 @@ async function buildDenseVectors({
|
|
|
1769
2088
|
text: chunk.text,
|
|
1770
2089
|
embedding
|
|
1771
2090
|
});
|
|
2091
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
2092
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
2093
|
+
}
|
|
1772
2094
|
}
|
|
2095
|
+
reportProgress(progress, "Building dense vector index");
|
|
1773
2096
|
const index = new VectorFieldIndex({
|
|
1774
2097
|
numHashTables: config.indexHashTables,
|
|
1775
2098
|
dimensions,
|
|
@@ -1793,6 +2116,7 @@ async function buildDenseVectors({
|
|
|
1793
2116
|
chunks: records
|
|
1794
2117
|
};
|
|
1795
2118
|
await writeDensePayload(workspacePath, payload);
|
|
2119
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
1796
2120
|
return payload;
|
|
1797
2121
|
}
|
|
1798
2122
|
async function denseQuery({
|
|
@@ -1805,12 +2129,19 @@ async function denseQuery({
|
|
|
1805
2129
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
1806
2130
|
const embed = await createEmbedder(cacheDir, config.modelId);
|
|
1807
2131
|
const vector = await embed(query);
|
|
2132
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
2133
|
+
return exactDenseQuery(payload, vector, topK);
|
|
2134
|
+
}
|
|
1808
2135
|
const index = new VectorFieldIndex({
|
|
1809
2136
|
numHashTables: payload.metadata.hashTables,
|
|
1810
2137
|
dimensions: payload.metadata.dimensions,
|
|
1811
2138
|
random: createSeededRandom(payload.metadata.randomSeed)
|
|
1812
2139
|
}).loadState(payload.indexState);
|
|
1813
|
-
|
|
2140
|
+
const approximateHits = index.query(vector, topK);
|
|
2141
|
+
if (approximateHits.length >= topK) {
|
|
2142
|
+
return approximateHits;
|
|
2143
|
+
}
|
|
2144
|
+
return exactDenseQuery(payload, vector, topK);
|
|
1814
2145
|
}
|
|
1815
2146
|
|
|
1816
2147
|
// src/vector/sparse.ts
|
|
@@ -1904,10 +2235,13 @@ async function buildSparseDocuments(workspacePath, config, chunks) {
|
|
|
1904
2235
|
}
|
|
1905
2236
|
async function buildSparseVectors({
|
|
1906
2237
|
workspacePath,
|
|
1907
|
-
config
|
|
2238
|
+
config,
|
|
2239
|
+
progress
|
|
1908
2240
|
}) {
|
|
1909
2241
|
const chunks = await readJsonl(path15.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
2242
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for sparse retrieval`);
|
|
1910
2243
|
const built = await buildSparseDocuments(workspacePath, config, chunks);
|
|
2244
|
+
reportProgress(progress, "Building sparse vector index");
|
|
1911
2245
|
const index = new SparseVectorFieldIndex();
|
|
1912
2246
|
for (const record of built.chunks) {
|
|
1913
2247
|
index.insert(record.chunkId, [record.vector]);
|
|
@@ -1929,6 +2263,7 @@ async function buildSparseVectors({
|
|
|
1929
2263
|
queryTokenWeights: built.queryTokenWeights
|
|
1930
2264
|
};
|
|
1931
2265
|
await writeSparsePayload(workspacePath, payload);
|
|
2266
|
+
reportProgress(progress, `Sparse vectors written for ${built.chunks.length} chunk${built.chunks.length === 1 ? "" : "s"}`);
|
|
1932
2267
|
return payload;
|
|
1933
2268
|
}
|
|
1934
2269
|
async function sparseQuery({
|
|
@@ -1951,51 +2286,80 @@ async function buildVectorArtifacts({
|
|
|
1951
2286
|
config,
|
|
1952
2287
|
denseOverride,
|
|
1953
2288
|
sparseOverride,
|
|
1954
|
-
buildAvailableModels = false
|
|
2289
|
+
buildAvailableModels = false,
|
|
2290
|
+
progress
|
|
1955
2291
|
}) {
|
|
1956
|
-
const
|
|
1957
|
-
|
|
1958
|
-
await ensureUvAvailable();
|
|
1959
|
-
return true;
|
|
1960
|
-
} catch {
|
|
1961
|
-
return false;
|
|
1962
|
-
}
|
|
1963
|
-
})()) : null;
|
|
2292
|
+
const uvAvailable = await isUvAvailable();
|
|
2293
|
+
const modelStatus = buildAvailableModels ? await buildModelStatus(workspacePath, config.retrieval.dense, config.retrieval.sparse, uvAvailable) : null;
|
|
1964
2294
|
const denseEnabled = denseOverride ?? (buildAvailableModels ? config.retrieval.dense.enabled || Boolean(modelStatus?.dense.available) : config.retrieval.dense.enabled);
|
|
1965
|
-
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled);
|
|
2295
|
+
const sparseEnabled = sparseOverride ?? (buildAvailableModels ? (config.retrieval.sparse.enabled || Boolean(modelStatus?.sparse.available)) && Boolean(modelStatus?.sparse.uvAvailable) : config.retrieval.sparse.enabled && uvAvailable);
|
|
1966
2296
|
const result = {};
|
|
1967
2297
|
if (denseEnabled) {
|
|
1968
|
-
|
|
2298
|
+
reportProgress(progress, `Building dense vectors with ${config.retrieval.dense.modelId}`);
|
|
2299
|
+
result.dense = await buildDenseVectors({ workspacePath, config: config.retrieval.dense, progress });
|
|
2300
|
+
}
|
|
2301
|
+
if ((sparseOverride || config.retrieval.sparse.enabled) && !uvAvailable) {
|
|
2302
|
+
reportProgress(progress, "Skipping sparse vectors because uv is not available");
|
|
1969
2303
|
}
|
|
1970
2304
|
if (sparseEnabled) {
|
|
1971
|
-
|
|
2305
|
+
reportProgress(progress, `Building sparse vectors with ${config.retrieval.sparse.modelId}`);
|
|
2306
|
+
result.sparse = await buildSparseVectors({ workspacePath, config: config.retrieval.sparse, progress });
|
|
1972
2307
|
}
|
|
1973
2308
|
return result;
|
|
1974
2309
|
}
|
|
1975
2310
|
|
|
1976
2311
|
// src/index/index-store.ts
|
|
1977
|
-
import {
|
|
2312
|
+
import { mkdir as mkdir9, rm as rm3 } from "fs/promises";
|
|
1978
2313
|
import path16 from "path";
|
|
2314
|
+
function versionedIndexPath(workspacePath, stamp) {
|
|
2315
|
+
return path16.join(workspacePath, "indexes", `${stamp}.json.gz`);
|
|
2316
|
+
}
|
|
2317
|
+
function versionedLegacyIndexPath(workspacePath, stamp) {
|
|
2318
|
+
return path16.join(workspacePath, "indexes", `${stamp}.json`);
|
|
2319
|
+
}
|
|
2320
|
+
function versionedMetaPath(workspacePath, stamp) {
|
|
2321
|
+
return path16.join(workspacePath, "indexes", `${stamp}.meta.json.gz`);
|
|
2322
|
+
}
|
|
2323
|
+
function versionedLegacyMetaPath(workspacePath, stamp) {
|
|
2324
|
+
return path16.join(workspacePath, "indexes", `${stamp}.meta.json`);
|
|
2325
|
+
}
|
|
2326
|
+
function latestIndexPath(workspacePath) {
|
|
2327
|
+
return path16.join(workspacePath, "indexes", "latest.json.gz");
|
|
2328
|
+
}
|
|
2329
|
+
function legacyLatestIndexPath(workspacePath) {
|
|
2330
|
+
return path16.join(workspacePath, "indexes", "latest.json");
|
|
2331
|
+
}
|
|
2332
|
+
function latestMetaPath(workspacePath) {
|
|
2333
|
+
return path16.join(workspacePath, "indexes", "latest.meta.json.gz");
|
|
2334
|
+
}
|
|
2335
|
+
function legacyLatestMetaPath(workspacePath) {
|
|
2336
|
+
return path16.join(workspacePath, "indexes", "latest.meta.json");
|
|
2337
|
+
}
|
|
1979
2338
|
async function writeIndexArtifacts({
|
|
1980
2339
|
workspacePath,
|
|
1981
2340
|
indexState,
|
|
1982
2341
|
metadata
|
|
1983
2342
|
}) {
|
|
1984
2343
|
const stamp = metadata.createdAt.replace(/[:.]/g, "-");
|
|
1985
|
-
const indexPath =
|
|
1986
|
-
const metaPath =
|
|
1987
|
-
const
|
|
1988
|
-
const
|
|
1989
|
-
|
|
1990
|
-
|
|
1991
|
-
await
|
|
1992
|
-
await
|
|
1993
|
-
await
|
|
1994
|
-
await
|
|
1995
|
-
|
|
2344
|
+
const indexPath = versionedIndexPath(workspacePath, stamp);
|
|
2345
|
+
const metaPath = versionedMetaPath(workspacePath, stamp);
|
|
2346
|
+
const latestIndexArtifactPath = latestIndexPath(workspacePath);
|
|
2347
|
+
const latestMetadataArtifactPath = latestMetaPath(workspacePath);
|
|
2348
|
+
await mkdir9(path16.join(workspacePath, "indexes"), { recursive: true });
|
|
2349
|
+
await writeGzipJson(indexPath, indexState);
|
|
2350
|
+
await writeGzipJson(metaPath, metadata);
|
|
2351
|
+
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
2352
|
+
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
2353
|
+
await Promise.all([
|
|
2354
|
+
rm3(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
2355
|
+
rm3(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
2356
|
+
rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
2357
|
+
rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
2358
|
+
]);
|
|
2359
|
+
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
1996
2360
|
}
|
|
1997
2361
|
async function readLatestIndexState(workspacePath) {
|
|
1998
|
-
return
|
|
2362
|
+
return readJsonFromGzipOrFile(latestIndexPath(workspacePath), legacyLatestIndexPath(workspacePath));
|
|
1999
2363
|
}
|
|
2000
2364
|
|
|
2001
2365
|
// src/index/querylight-indexer.ts
|
|
@@ -2037,14 +2401,17 @@ async function buildIndex({
|
|
|
2037
2401
|
workspacePath,
|
|
2038
2402
|
denseOverride,
|
|
2039
2403
|
sparseOverride,
|
|
2040
|
-
buildAvailableModels = false
|
|
2404
|
+
buildAvailableModels = false,
|
|
2405
|
+
progress
|
|
2041
2406
|
}) {
|
|
2042
2407
|
const config = await loadConfig(workspacePath);
|
|
2408
|
+
reportProgress(progress, "Loading documents, chunks, and sources");
|
|
2043
2409
|
const chunks = await readJsonl(path17.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
2044
2410
|
const documents = await readJsonl(path17.join(workspacePath, "documents", "documents.jsonl"));
|
|
2045
2411
|
const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
|
|
2046
2412
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
2047
2413
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
2414
|
+
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
2048
2415
|
for (const chunk of chunks) {
|
|
2049
2416
|
index.index({
|
|
2050
2417
|
id: chunk.id,
|
|
@@ -2059,6 +2426,7 @@ async function buildIndex({
|
|
|
2059
2426
|
}
|
|
2060
2427
|
});
|
|
2061
2428
|
}
|
|
2429
|
+
reportProgressDetail(progress, `Indexed ${documents.length} document${documents.length === 1 ? "" : "s"} across ${sources.length} source${sources.length === 1 ? "" : "s"}`);
|
|
2062
2430
|
const createdAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2063
2431
|
const metadata = {
|
|
2064
2432
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
@@ -2071,14 +2439,17 @@ async function buildIndex({
|
|
|
2071
2439
|
fields: Object.keys(index.mapping),
|
|
2072
2440
|
indexHash: sha256(JSON.stringify(index.indexState))
|
|
2073
2441
|
};
|
|
2442
|
+
reportProgress(progress, "Writing lexical index artifacts");
|
|
2074
2443
|
const artifacts = await writeIndexArtifacts({ workspacePath, indexState: index.indexState, metadata });
|
|
2075
2444
|
const vectors = await buildVectorArtifacts({
|
|
2076
2445
|
workspacePath,
|
|
2077
2446
|
config,
|
|
2078
2447
|
denseOverride,
|
|
2079
2448
|
sparseOverride,
|
|
2080
|
-
buildAvailableModels
|
|
2449
|
+
buildAvailableModels,
|
|
2450
|
+
progress
|
|
2081
2451
|
});
|
|
2452
|
+
reportProgress(progress, `Index build complete: dense=${Boolean(vectors.dense)}, sparse=${Boolean(vectors.sparse)}`);
|
|
2082
2453
|
return {
|
|
2083
2454
|
metadata,
|
|
2084
2455
|
indexPath: artifacts.indexPath,
|
|
@@ -2088,11 +2459,19 @@ async function buildIndex({
|
|
|
2088
2459
|
}
|
|
2089
2460
|
|
|
2090
2461
|
// src/query/search-service.ts
|
|
2091
|
-
import { readFile as
|
|
2462
|
+
import { readFile as readFile10 } from "fs/promises";
|
|
2092
2463
|
import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
|
|
2093
2464
|
import path18 from "path";
|
|
2094
2465
|
async function loadHydratedIndex(workspacePath) {
|
|
2095
|
-
|
|
2466
|
+
let state;
|
|
2467
|
+
try {
|
|
2468
|
+
state = await readLatestIndexState(workspacePath);
|
|
2469
|
+
} catch (error) {
|
|
2470
|
+
if (error.code === "ENOENT") {
|
|
2471
|
+
throw new CliError("lexical index is not built; run `qli rebuild` or `qli chunk` followed by `qli index build`", "INDEX_MISSING", 7 /* QueryError */);
|
|
2472
|
+
}
|
|
2473
|
+
throw error;
|
|
2474
|
+
}
|
|
2096
2475
|
const mapping = createIndexMapping(Object.keys(state.fieldState ?? {}).filter((field) => field.startsWith("metadata.")));
|
|
2097
2476
|
return new (await import("@tryformation/querylight-ts")).DocumentIndex(mapping).loadState(state);
|
|
2098
2477
|
}
|
|
@@ -2310,7 +2689,7 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2310
2689
|
if (!await fileExists(document.normalizedPath)) {
|
|
2311
2690
|
return buildSnippet(chunk.text, query);
|
|
2312
2691
|
}
|
|
2313
|
-
const raw = await
|
|
2692
|
+
const raw = await readFile10(document.normalizedPath, "utf8");
|
|
2314
2693
|
orderedChunks = buildChunksForDocument(document, raw, config);
|
|
2315
2694
|
orderedChunkCache.set(document.id, orderedChunks);
|
|
2316
2695
|
}
|
|
@@ -2328,9 +2707,25 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2328
2707
|
function normalizeDisplayTitle(title) {
|
|
2329
2708
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
2330
2709
|
}
|
|
2710
|
+
var LOW_SIGNAL_RESULT_TITLES = /* @__PURE__ */ new Set([
|
|
2711
|
+
"choose this instead of",
|
|
2712
|
+
"how xyz runs it",
|
|
2713
|
+
"naechste schritte",
|
|
2714
|
+
"next steps",
|
|
2715
|
+
"overview",
|
|
2716
|
+
"passend wenn",
|
|
2717
|
+
"problem",
|
|
2718
|
+
"right fit",
|
|
2719
|
+
"waehlen sie das stattdessen",
|
|
2720
|
+
"was sie bekommen",
|
|
2721
|
+
"what you get",
|
|
2722
|
+
"wie xyz es umsetzt",
|
|
2723
|
+
"uberblick",
|
|
2724
|
+
"\xFCberblick"
|
|
2725
|
+
]);
|
|
2331
2726
|
function chooseResultTitle(chunk) {
|
|
2332
2727
|
const documentTitle = normalizeDisplayTitle(chunk.title);
|
|
2333
|
-
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter(
|
|
2728
|
+
const headings = chunk.headingPath.map((heading) => normalizeDisplayTitle(heading)).filter((heading) => heading.length > 0 && !LOW_SIGNAL_RESULT_TITLES.has(heading.toLowerCase()));
|
|
2334
2729
|
const leafHeading = headings.at(-1);
|
|
2335
2730
|
if (leafHeading && leafHeading.toLowerCase() !== documentTitle.toLowerCase()) {
|
|
2336
2731
|
return leafHeading;
|
|
@@ -2352,6 +2747,9 @@ function normalizeUriPath(uri) {
|
|
|
2352
2747
|
return uri.toLowerCase().replace(/\/+$/, "");
|
|
2353
2748
|
}
|
|
2354
2749
|
}
|
|
2750
|
+
function normalizeUriIdentity(uri) {
|
|
2751
|
+
return normalizeRemoteUrl(uri).toLowerCase().replace(/\/+$/, "");
|
|
2752
|
+
}
|
|
2355
2753
|
function uriSpecificity(uri) {
|
|
2356
2754
|
const normalized = normalizeUriPath(uri);
|
|
2357
2755
|
if (normalized === "/") {
|
|
@@ -2368,6 +2766,11 @@ function isMoreSpecificDuplicate(candidate, existing) {
|
|
|
2368
2766
|
if (!candidateTitle || candidateTitle !== existingTitle) {
|
|
2369
2767
|
return false;
|
|
2370
2768
|
}
|
|
2769
|
+
const candidateIdentity = normalizeUriIdentity(candidate.uri);
|
|
2770
|
+
const existingIdentity = normalizeUriIdentity(existing.uri);
|
|
2771
|
+
if (candidateIdentity === existingIdentity) {
|
|
2772
|
+
return candidate.uri.length < existing.uri.length;
|
|
2773
|
+
}
|
|
2371
2774
|
const candidatePath = normalizeUriPath(candidate.uri);
|
|
2372
2775
|
const existingPath = normalizeUriPath(existing.uri);
|
|
2373
2776
|
if (candidatePath === existingPath) {
|
|
@@ -2480,7 +2883,6 @@ async function searchIndex({
|
|
|
2480
2883
|
score: 0,
|
|
2481
2884
|
title: chooseResultTitle(chunk),
|
|
2482
2885
|
uri: chunk.uri,
|
|
2483
|
-
headingPath: chunk.headingPath,
|
|
2484
2886
|
snippet: await buildSnippetWithAdjacentChunks(chunk, document.title, {
|
|
2485
2887
|
document,
|
|
2486
2888
|
config,
|
|
@@ -2544,7 +2946,6 @@ async function searchIndex({
|
|
|
2544
2946
|
score,
|
|
2545
2947
|
title: chooseResultTitle(chunk),
|
|
2546
2948
|
uri: chunk.uri,
|
|
2547
|
-
headingPath: chunk.headingPath,
|
|
2548
2949
|
snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
|
|
2549
2950
|
document: documents.get(chunk.documentId),
|
|
2550
2951
|
config,
|
|
@@ -2564,7 +2965,7 @@ async function searchIndex({
|
|
|
2564
2965
|
|
|
2565
2966
|
// src/query/related-service.ts
|
|
2566
2967
|
import path19 from "path";
|
|
2567
|
-
function
|
|
2968
|
+
function cosineSimilarity2(left, right) {
|
|
2568
2969
|
let dot = 0;
|
|
2569
2970
|
let leftNorm = 0;
|
|
2570
2971
|
let rightNorm = 0;
|
|
@@ -2650,7 +3051,7 @@ async function findRelatedDocuments({
|
|
|
2650
3051
|
const results = [...vectors.values()].filter((candidate) => candidate.document.id !== selected.id).map((candidate) => ({
|
|
2651
3052
|
documentId: candidate.document.id,
|
|
2652
3053
|
sourceId: candidate.document.sourceId,
|
|
2653
|
-
score:
|
|
3054
|
+
score: cosineSimilarity2(sourceVector.embedding, candidate.embedding),
|
|
2654
3055
|
title: candidate.document.title,
|
|
2655
3056
|
uri: candidate.document.uri,
|
|
2656
3057
|
metadata: candidate.document.metadata
|
|
@@ -2690,7 +3091,6 @@ async function createContext({
|
|
|
2690
3091
|
sourceId: result.sourceId,
|
|
2691
3092
|
title: result.title,
|
|
2692
3093
|
uri: result.uri,
|
|
2693
|
-
headingPath: result.headingPath,
|
|
2694
3094
|
text,
|
|
2695
3095
|
metadata: result.metadata
|
|
2696
3096
|
});
|
|
@@ -2703,7 +3103,6 @@ async function createContext({
|
|
|
2703
3103
|
`Title: ${source.title}`,
|
|
2704
3104
|
`URL: ${source.uri}`,
|
|
2705
3105
|
`Chunk ID: ${source.chunkId}`,
|
|
2706
|
-
source.headingPath.length > 0 ? `Heading Path: ${source.headingPath.join(" > ")}` : "",
|
|
2707
3106
|
"",
|
|
2708
3107
|
source.text,
|
|
2709
3108
|
""
|