@tryformation/querylight-cli 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -3
- package/dist/cli/format.d.ts +2 -2
- package/dist/cli/main.js +694 -135
- package/dist/core/constants.d.ts +1 -1
- package/dist/index/querylight-indexer.d.ts +2 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +592 -123
- package/dist/query/search-service.d.ts +14 -1
- package/dist/server/search-api.d.ts +15 -0
- package/dist/types/models.d.ts +36 -1
- package/dist/vector/dense.d.ts +6 -1
- package/package.json +2 -2
- package/scripts/sparse-encode.py +29 -8
package/dist/index.js
CHANGED
|
@@ -57,7 +57,7 @@ var defaultConfig = () => ({
|
|
|
57
57
|
defaultMode: "lexical",
|
|
58
58
|
dense: {
|
|
59
59
|
enabled: true,
|
|
60
|
-
modelId: "Xenova/
|
|
60
|
+
modelId: "Xenova/paraphrase-MiniLM-L3-v2",
|
|
61
61
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
62
62
|
indexHashTables: 8,
|
|
63
63
|
indexRandomSeed: 42,
|
|
@@ -65,7 +65,7 @@ var defaultConfig = () => ({
|
|
|
65
65
|
},
|
|
66
66
|
sparse: {
|
|
67
67
|
enabled: true,
|
|
68
|
-
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-
|
|
68
|
+
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
|
|
69
69
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
70
70
|
documentTopTokens: 128,
|
|
71
71
|
queryEncoding: "tokenizer-token-weights",
|
|
@@ -1213,13 +1213,17 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1213
1213
|
if (url.search.length > 0) {
|
|
1214
1214
|
return false;
|
|
1215
1215
|
}
|
|
1216
|
-
|
|
1216
|
+
const pathname = url.pathname.toLowerCase();
|
|
1217
|
+
if (pathname.endsWith(".xml")) {
|
|
1217
1218
|
return false;
|
|
1218
1219
|
}
|
|
1219
|
-
if (
|
|
1220
|
+
if (pathname.endsWith(".pdf")) {
|
|
1220
1221
|
return false;
|
|
1221
1222
|
}
|
|
1222
|
-
if (
|
|
1223
|
+
if (pathname.includes("/cdn-cgi/")) {
|
|
1224
|
+
return false;
|
|
1225
|
+
}
|
|
1226
|
+
if (pathname === "/search" || pathname === "/search/" || pathname.endsWith("/search/")) {
|
|
1223
1227
|
return false;
|
|
1224
1228
|
}
|
|
1225
1229
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
@@ -1782,7 +1786,7 @@ async function chunkDocuments({
|
|
|
1782
1786
|
}
|
|
1783
1787
|
|
|
1784
1788
|
// src/index/querylight-indexer.ts
|
|
1785
|
-
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1789
|
+
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1786
1790
|
import path17 from "path";
|
|
1787
1791
|
|
|
1788
1792
|
// src/vector/dense.ts
|
|
@@ -2058,15 +2062,26 @@ function createSparseChunkText(chunk) {
|
|
|
2058
2062
|
// src/vector/dense.ts
|
|
2059
2063
|
var denseEmbedderFactory = null;
|
|
2060
2064
|
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
2065
|
+
/**
 * Coerce the value produced by a dense-embedder factory into the object shape
 * used by callers (`embedder.embed(text)` and the optional `embedder.dispose()`).
 *
 * A bare function is wrapped as `{ embed: fn }`; an object that already exposes
 * an `embed` method is returned unchanged (same reference).
 *
 * @param {Function|{embed: Function, dispose?: Function}} embedder value returned by the factory
 * @returns {{embed: Function, dispose?: Function}} embedder object
 * @throws {TypeError} when the value is neither a function nor an object with an `embed` method
 */
function normalizeDenseEmbedder(embedder) {
  if (typeof embedder === "function") {
    return { embed: embedder };
  }
  // Fail here with a clear message instead of letting a later `embedder.embed(...)`
  // or `embedder.dispose?.()` blow up with an opaque property-access error.
  if (embedder === null || typeof embedder !== "object" || typeof embedder.embed !== "function") {
    throw new TypeError("dense embedder factory must return a function or an object with an embed() method");
  }
  return embedder;
}
|
|
2061
2071
|
async function createEmbedder(cacheDir, modelId) {
|
|
2062
2072
|
if (denseEmbedderFactory) {
|
|
2063
|
-
return denseEmbedderFactory(cacheDir, modelId);
|
|
2073
|
+
return normalizeDenseEmbedder(await denseEmbedderFactory(cacheDir, modelId));
|
|
2064
2074
|
}
|
|
2065
2075
|
const runtime = await getDenseTransformersRuntime(cacheDir);
|
|
2066
2076
|
const extractor = await runtime.pipeline("feature-extraction", modelId);
|
|
2067
|
-
return
|
|
2068
|
-
|
|
2069
|
-
|
|
2077
|
+
return {
|
|
2078
|
+
async embed(text) {
|
|
2079
|
+
const output = await extractor(text, { pooling: "mean", normalize: true });
|
|
2080
|
+
return output.tolist()[0];
|
|
2081
|
+
},
|
|
2082
|
+
async dispose() {
|
|
2083
|
+
await extractor.dispose();
|
|
2084
|
+
}
|
|
2070
2085
|
};
|
|
2071
2086
|
}
|
|
2072
2087
|
function exactDenseQuery(payload, vector, topK) {
|
|
@@ -2080,53 +2095,57 @@ async function buildDenseVectors({
|
|
|
2080
2095
|
const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
2081
2096
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
2082
2097
|
await mkdir7(cacheDir, { recursive: true });
|
|
2083
|
-
const
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
const
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2098
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
2099
|
+
try {
|
|
2100
|
+
const records = [];
|
|
2101
|
+
let dimensions = 0;
|
|
2102
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
2103
|
+
for (const chunk of chunks) {
|
|
2104
|
+
const embedding = await embedder.embed(createDenseChunkText(chunk));
|
|
2105
|
+
dimensions ||= embedding.length;
|
|
2106
|
+
records.push({
|
|
2107
|
+
chunkId: chunk.id,
|
|
2108
|
+
documentId: chunk.documentId,
|
|
2109
|
+
sourceId: chunk.sourceId,
|
|
2110
|
+
title: chunk.title,
|
|
2111
|
+
uri: chunk.uri,
|
|
2112
|
+
headingPath: chunk.headingPath,
|
|
2113
|
+
text: chunk.text,
|
|
2114
|
+
embedding
|
|
2115
|
+
});
|
|
2116
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
2117
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
2118
|
+
}
|
|
2102
2119
|
}
|
|
2120
|
+
reportProgress(progress, "Building dense vector index");
|
|
2121
|
+
const index = new VectorFieldIndex({
|
|
2122
|
+
numHashTables: config.indexHashTables,
|
|
2123
|
+
dimensions,
|
|
2124
|
+
random: createSeededRandom(config.indexRandomSeed)
|
|
2125
|
+
});
|
|
2126
|
+
for (const record of records) {
|
|
2127
|
+
index.insert(record.chunkId, [record.embedding]);
|
|
2128
|
+
}
|
|
2129
|
+
const metadata = {
|
|
2130
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2131
|
+
modelId: config.modelId,
|
|
2132
|
+
dimensions,
|
|
2133
|
+
hashTables: config.indexHashTables,
|
|
2134
|
+
randomSeed: config.indexRandomSeed,
|
|
2135
|
+
chunkCount: records.length,
|
|
2136
|
+
indexHash: sha256(JSON.stringify(index.indexState))
|
|
2137
|
+
};
|
|
2138
|
+
const payload = {
|
|
2139
|
+
metadata,
|
|
2140
|
+
indexState: index.indexState,
|
|
2141
|
+
chunks: records
|
|
2142
|
+
};
|
|
2143
|
+
await writeDensePayload(workspacePath, payload);
|
|
2144
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
2145
|
+
return payload;
|
|
2146
|
+
} finally {
|
|
2147
|
+
await embedder.dispose?.();
|
|
2103
2148
|
}
|
|
2104
|
-
reportProgress(progress, "Building dense vector index");
|
|
2105
|
-
const index = new VectorFieldIndex({
|
|
2106
|
-
numHashTables: config.indexHashTables,
|
|
2107
|
-
dimensions,
|
|
2108
|
-
random: createSeededRandom(config.indexRandomSeed)
|
|
2109
|
-
});
|
|
2110
|
-
for (const record of records) {
|
|
2111
|
-
index.insert(record.chunkId, [record.embedding]);
|
|
2112
|
-
}
|
|
2113
|
-
const metadata = {
|
|
2114
|
-
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2115
|
-
modelId: config.modelId,
|
|
2116
|
-
dimensions,
|
|
2117
|
-
hashTables: config.indexHashTables,
|
|
2118
|
-
randomSeed: config.indexRandomSeed,
|
|
2119
|
-
chunkCount: records.length,
|
|
2120
|
-
indexHash: sha256(JSON.stringify(index.indexState))
|
|
2121
|
-
};
|
|
2122
|
-
const payload = {
|
|
2123
|
-
metadata,
|
|
2124
|
-
indexState: index.indexState,
|
|
2125
|
-
chunks: records
|
|
2126
|
-
};
|
|
2127
|
-
await writeDensePayload(workspacePath, payload);
|
|
2128
|
-
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
2129
|
-
return payload;
|
|
2130
2149
|
}
|
|
2131
2150
|
async function denseQuery({
|
|
2132
2151
|
workspacePath,
|
|
@@ -2136,21 +2155,25 @@ async function denseQuery({
|
|
|
2136
2155
|
}) {
|
|
2137
2156
|
const payload = await readDensePayload(workspacePath);
|
|
2138
2157
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
2139
|
-
const
|
|
2140
|
-
|
|
2141
|
-
|
|
2158
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
2159
|
+
try {
|
|
2160
|
+
const vector = await embedder.embed(query);
|
|
2161
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
2162
|
+
return exactDenseQuery(payload, vector, topK);
|
|
2163
|
+
}
|
|
2164
|
+
const index = new VectorFieldIndex({
|
|
2165
|
+
numHashTables: payload.metadata.hashTables,
|
|
2166
|
+
dimensions: payload.metadata.dimensions,
|
|
2167
|
+
random: createSeededRandom(payload.metadata.randomSeed)
|
|
2168
|
+
}).loadState(payload.indexState);
|
|
2169
|
+
const approximateHits = index.query(vector, topK);
|
|
2170
|
+
if (approximateHits.length >= topK) {
|
|
2171
|
+
return approximateHits;
|
|
2172
|
+
}
|
|
2142
2173
|
return exactDenseQuery(payload, vector, topK);
|
|
2174
|
+
} finally {
|
|
2175
|
+
await embedder.dispose?.();
|
|
2143
2176
|
}
|
|
2144
|
-
const index = new VectorFieldIndex({
|
|
2145
|
-
numHashTables: payload.metadata.hashTables,
|
|
2146
|
-
dimensions: payload.metadata.dimensions,
|
|
2147
|
-
random: createSeededRandom(payload.metadata.randomSeed)
|
|
2148
|
-
}).loadState(payload.indexState);
|
|
2149
|
-
const approximateHits = index.query(vector, topK);
|
|
2150
|
-
if (approximateHits.length >= topK) {
|
|
2151
|
-
return approximateHits;
|
|
2152
|
-
}
|
|
2153
|
-
return exactDenseQuery(payload, vector, topK);
|
|
2154
2177
|
}
|
|
2155
2178
|
|
|
2156
2179
|
// src/vector/sparse.ts
|
|
@@ -2379,12 +2402,19 @@ function keywordFieldIndex() {
|
|
|
2379
2402
|
function createIndexMapping(extraFields = []) {
|
|
2380
2403
|
const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
|
|
2381
2404
|
const mapping = {
|
|
2405
|
+
_source: new StoredSourceIndex(),
|
|
2382
2406
|
text: lexical,
|
|
2383
2407
|
title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
|
|
2384
2408
|
uri: keywordFieldIndex(),
|
|
2385
2409
|
sourceId: keywordFieldIndex(),
|
|
2410
|
+
sourceName: keywordFieldIndex(),
|
|
2386
2411
|
tags: keywordFieldIndex(),
|
|
2387
|
-
sourceType: keywordFieldIndex()
|
|
2412
|
+
sourceType: keywordFieldIndex(),
|
|
2413
|
+
publicationDate: new DateFieldIndex(),
|
|
2414
|
+
firstSeenAt: new DateFieldIndex(),
|
|
2415
|
+
lastSeenAt: new DateFieldIndex(),
|
|
2416
|
+
lastChangedAt: new DateFieldIndex(),
|
|
2417
|
+
crawledAt: new DateFieldIndex()
|
|
2388
2418
|
};
|
|
2389
2419
|
for (const field of extraFields) {
|
|
2390
2420
|
mapping[field] = keywordFieldIndex();
|
|
@@ -2420,8 +2450,12 @@ async function buildIndex({
|
|
|
2420
2450
|
const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
|
|
2421
2451
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
2422
2452
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
2453
|
+
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
2454
|
+
const sourcesById = new Map(sources.map((source) => [source.id, source]));
|
|
2423
2455
|
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
2424
2456
|
for (const chunk of chunks) {
|
|
2457
|
+
const document = documentsById.get(chunk.documentId);
|
|
2458
|
+
const source = sourcesById.get(chunk.sourceId);
|
|
2425
2459
|
index.index({
|
|
2426
2460
|
id: chunk.id,
|
|
2427
2461
|
fields: {
|
|
@@ -2429,9 +2463,33 @@ async function buildIndex({
|
|
|
2429
2463
|
title: [chunk.title],
|
|
2430
2464
|
uri: [chunk.uri.toLowerCase()],
|
|
2431
2465
|
sourceId: [chunk.sourceId.toLowerCase()],
|
|
2466
|
+
sourceName: source ? [source.name.toLowerCase()] : [],
|
|
2432
2467
|
tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
|
|
2433
2468
|
sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
|
|
2469
|
+
publicationDate: document?.publicationDate ? [document.publicationDate] : [],
|
|
2470
|
+
firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
|
|
2471
|
+
lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
|
|
2472
|
+
lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
|
|
2473
|
+
crawledAt: document?.crawledAt ? [document.crawledAt] : [],
|
|
2434
2474
|
...flattenMetadata(chunk.metadata)
|
|
2475
|
+
},
|
|
2476
|
+
source: {
|
|
2477
|
+
chunkId: chunk.id,
|
|
2478
|
+
documentId: chunk.documentId,
|
|
2479
|
+
sourceId: chunk.sourceId,
|
|
2480
|
+
sourceType: document?.sourceType ?? "text",
|
|
2481
|
+
sourceName: source?.name,
|
|
2482
|
+
title: chunk.title,
|
|
2483
|
+
uri: chunk.uri,
|
|
2484
|
+
headingPath: chunk.headingPath,
|
|
2485
|
+
text: chunk.text,
|
|
2486
|
+
normalizedPath: document?.normalizedPath,
|
|
2487
|
+
publicationDate: document?.publicationDate ?? null,
|
|
2488
|
+
crawledAt: document?.crawledAt,
|
|
2489
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
2490
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
2491
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
2492
|
+
metadata: chunk.metadata
|
|
2435
2493
|
}
|
|
2436
2494
|
});
|
|
2437
2495
|
}
|
|
@@ -2440,7 +2498,7 @@ async function buildIndex({
|
|
|
2440
2498
|
const metadata = {
|
|
2441
2499
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
2442
2500
|
createdAt,
|
|
2443
|
-
querylightVersion: "0.
|
|
2501
|
+
querylightVersion: "0.11.0",
|
|
2444
2502
|
kbVersion: "0.1.0",
|
|
2445
2503
|
documentCount: documents.length,
|
|
2446
2504
|
chunkCount: chunks.length,
|
|
@@ -2469,7 +2527,7 @@ async function buildIndex({
|
|
|
2469
2527
|
|
|
2470
2528
|
// src/query/search-service.ts
|
|
2471
2529
|
import { readFile as readFile10 } from "fs/promises";
|
|
2472
|
-
import {
|
|
2530
|
+
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2473
2531
|
import path18 from "path";
|
|
2474
2532
|
async function loadHydratedIndex(workspacePath) {
|
|
2475
2533
|
let state;
|
|
@@ -2497,24 +2555,6 @@ function matchesPrefix(value, prefixes) {
|
|
|
2497
2555
|
const lower = value.toLowerCase();
|
|
2498
2556
|
return prefixes.some((prefix) => lower.startsWith(prefix));
|
|
2499
2557
|
}
|
|
2500
|
-
function buildSearchQuery(query, filters) {
|
|
2501
|
-
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2502
|
-
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2503
|
-
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2504
|
-
return new BoolQuery({
|
|
2505
|
-
should: [
|
|
2506
|
-
new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
|
|
2507
|
-
new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
|
|
2508
|
-
new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
|
|
2509
|
-
],
|
|
2510
|
-
filter: [
|
|
2511
|
-
...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
|
|
2512
|
-
...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
|
|
2513
|
-
...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
|
|
2514
|
-
...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
|
|
2515
|
-
]
|
|
2516
|
-
});
|
|
2517
|
-
}
|
|
2518
2558
|
function isValidDate(value) {
|
|
2519
2559
|
return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
|
|
2520
2560
|
}
|
|
@@ -2713,6 +2753,185 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2713
2753
|
}
|
|
2714
2754
|
return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
|
|
2715
2755
|
}
|
|
2756
|
+
/**
 * Build the JSON DSL request object used for lexical search.
 *
 * The text query becomes three `should` match clauses (title AND, text AND,
 * text OR, with boosts 6/4/2) of which at least one must match; every filter
 * is appended to the bool `filter` list in a fixed order: sourceId, sourceName,
 * sourceType, uri prefixes, tags, publication-date existence, metadata terms,
 * date ranges.
 *
 * @param {object} options
 * @param {string} options.query text to match
 * @param {number} options.topK value emitted as the request `size`
 * @param {object} [options.filters] singular/plural filter values (`sourceId`/`sourceIds`,
 *   `sourceName`/`sourceNames`, `sourceType`/`sourceTypes`, `uriPrefix`/`uriPrefixes`,
 *   `tag`/`tags`), `hasPublicationDate`, and `metadata` as `{ key, value }` pairs
 * @param {Array<{field: string, from?: string, to?: string}>} [options.dateRanges]
 *   range filters; `from` maps to `gte`, `to` maps to `lte`
 * @returns {{size: number, query: object}} request object
 */
function buildSearchDslRequest({
  query,
  topK,
  filters = {},
  dateRanges = []
}) {
  const filterClauses = [];
  // Merge the singular and plural form of a filter, drop empty entries, and
  // normalise through the shared helper.
  const collect = (single, many) => normalizeFilterValues([single, ...many ?? []].filter((value) => Boolean(value)));
  const pushTerms = (field, values) => {
    if (values.length > 0) {
      filterClauses.push({ terms: { [field]: values } });
    }
  };
  pushTerms("sourceId", collect(filters.sourceId, filters.sourceIds));
  pushTerms("sourceName", collect(filters.sourceName, filters.sourceNames));
  pushTerms("sourceType", collect(filters.sourceType, filters.sourceTypes));
  const uriPrefixes = collect(filters.uriPrefix, filters.uriPrefixes);
  if (uriPrefixes.length > 0) {
    // Any one of the prefixes is enough to keep a hit.
    filterClauses.push({
      bool: {
        should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
        minimum_should_match: 1
      }
    });
  }
  pushTerms("tags", collect(filters.tag, filters.tags));
  if (filters.hasPublicationDate) {
    filterClauses.push({ exists: { field: "publicationDate" } });
  }
  for (const { key, value } of filters.metadata ?? []) {
    // String() keeps non-string values (numbers, booleans) from throwing on toLowerCase.
    filterClauses.push({ term: { [`metadata.${key}`]: String(value).toLowerCase() } });
  }
  for (const { field, from, to } of dateRanges) {
    filterClauses.push({
      range: {
        [field]: {
          ...from ? { gte: from } : {},
          ...to ? { lte: to } : {}
        }
      }
    });
  }
  return {
    size: topK,
    query: {
      bool: {
        should: [
          { match: { title: { query, operator: "and", boost: 6 } } },
          { match: { text: { query, operator: "and", boost: 4 } } },
          { match: { text: { query, operator: "or", boost: 2 } } }
        ],
        filter: filterClauses,
        minimum_should_match: 1
      }
    }
  };
}
|
|
2819
|
+
/**
 * Project a stored `_source` object onto the chunk-record shape.
 *
 * `chunkId` is exposed as `id`; `contentHash` is always the empty string
 * (the value is not read from the stored source).
 *
 * @param {object} source stored source object of a search hit
 * @returns {object} chunk record
 */
function sourceToChunkRecord(source) {
  const {
    chunkId: id,
    documentId,
    sourceId,
    title,
    uri,
    headingPath,
    text,
    metadata,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  } = source;
  return {
    id,
    documentId,
    sourceId,
    title,
    uri,
    headingPath,
    text,
    contentHash: "",
    metadata,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  };
}
|
|
2835
|
+
/**
 * Project a stored `_source` object onto the document-record shape.
 *
 * `documentId` is exposed as `id`, `uri` is used for both `uri` and `sourceUri`,
 * `mimeType` is fixed to "text/plain", `contentHash` is the empty string,
 * a nullish `normalizedPath` becomes "" and a nullish `publicationDate` becomes null.
 *
 * @param {object} source stored source object of a search hit
 * @returns {object} document record
 */
function sourceToDocumentRecord(source) {
  const {
    documentId: id,
    sourceId,
    sourceType,
    title,
    uri,
    normalizedPath,
    metadata,
    publicationDate,
    crawledAt,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  } = source;
  return {
    id,
    sourceId,
    sourceType,
    title,
    uri,
    sourceUri: uri,
    mimeType: "text/plain",
    normalizedPath: normalizedPath ?? "",
    contentHash: "",
    metadata,
    publicationDate: publicationDate ?? null,
    crawledAt,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  };
}
|
|
2854
|
+
/**
 * Turn a raw search hit into a pair of an enriched hit and a flat result row.
 *
 * The snippet is produced by `buildSnippetWithAdjacentChunks` from chunk and
 * document records derived from the hit's `_source`; it is stored both on the
 * returned hit's `_source.snippet` and on the result row.
 *
 * @param {object} hit search hit carrying `_source` and `_score`
 * @param {string} query query text forwarded to the snippet builder
 * @param {object} config configuration forwarded to the snippet builder
 * @param {*} orderedChunkCache cache object forwarded to the snippet builder
 * @param {boolean} showChunks when truthy the result row carries the full chunk text
 * @returns {Promise<{hit: object, result: object}>} enriched hit and result row
 */
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
  const stored = hit._source;
  const chunkRecord = sourceToChunkRecord(stored);
  const documentRecord = sourceToDocumentRecord(stored);
  const snippet = await buildSnippetWithAdjacentChunks(chunkRecord, query, {
    document: documentRecord,
    config,
    orderedChunkCache
  });
  const result = {
    chunkId: stored.chunkId,
    documentId: stored.documentId,
    sourceId: stored.sourceId,
    sourceType: stored.sourceType,
    score: hit._score,
    title: chooseResultTitle(chunkRecord),
    uri: stored.uri,
    snippet,
    // Full text is only exposed on request.
    text: showChunks ? stored.text : void 0,
    publicationDate: stored.publicationDate ?? null,
    firstSeenAt: stored.firstSeenAt,
    lastSeenAt: stored.lastSeenAt,
    lastChangedAt: stored.lastChangedAt,
    metadata: stored.metadata
  };
  return {
    hit: { ...hit, _source: { ...stored, snippet } },
    result
  };
}
|
|
2887
|
+
/**
 * Wrap a list of hits in the search response envelope.
 *
 * @param {string} retrievalMode retrieval mode label copied into the response
 * @param {Array<{_score: number}>} hits hits to return; `total.value` is their count
 * @param {number} took elapsed time value copied into the response
 * @param {object} [aggregations] optional aggregations copied into the response
 * @returns {object} response with `retrievalMode`, `took`, `hits` and `aggregations`
 */
function createSearchResponse(retrievalMode, hits, took, aggregations) {
  // Fold instead of `Math.max(...scores)`: spreading a very large array into a
  // call can exceed the engine's argument limit and throw a RangeError.
  const maxScore = hits.length > 0 ? hits.reduce((best, hit) => Math.max(best, hit._score), -Infinity) : null;
  return {
    retrievalMode,
    took,
    hits: {
      total: {
        value: hits.length,
        relation: "eq"
      },
      max_score: maxScore,
      hits
    },
    aggregations
  };
}
|
|
2902
|
+
/**
 * Flatten a search response into plain result rows.
 *
 * The snippet is taken from `_source.snippet` when present, otherwise from the
 * hit's `highlight.text` fragments joined by a blank line, otherwise it is
 * built from the stored text and title via `buildSnippet`.
 *
 * @param {object} response search response whose `hits.hits` entries carry `_source` and `_score`
 * @param {boolean} [showChunks=false] when truthy each row carries the full chunk text
 * @returns {object[]} one result row per hit, in response order
 */
function searchResultsFromResponse(response, showChunks = false) {
  return response.hits.hits.map((hit) => {
    const stored = hit._source;
    const title = chooseResultTitle(sourceToChunkRecord(stored));
    let snippet = stored.snippet ?? hit.highlight?.text?.join("\n\n");
    if (snippet == null) {
      snippet = buildSnippet(stored.text, stored.title);
    }
    return {
      chunkId: stored.chunkId,
      documentId: stored.documentId,
      sourceId: stored.sourceId,
      sourceType: stored.sourceType,
      score: hit._score,
      title,
      uri: stored.uri,
      snippet,
      text: showChunks ? stored.text : void 0,
      publicationDate: stored.publicationDate ?? null,
      firstSeenAt: stored.firstSeenAt,
      lastSeenAt: stored.lastSeenAt,
      lastChangedAt: stored.lastChangedAt,
      metadata: stored.metadata
    };
  });
}
|
|
2920
|
+
/**
 * Run a JSON DSL request against an already loaded index by delegating to
 * `searchJsonDsl`.
 *
 * @param {object} options
 * @param {object} options.index index to search
 * @param {object} options.request JSON DSL request
 * @param {string} [options.indexName="querylight"] index name forwarded to `searchJsonDsl`
 * @returns {Promise<object>} whatever `searchJsonDsl` resolves to
 */
async function searchJsonRequest({ index, request, indexName = "querylight" }) {
  const dslArguments = { index, request, indexName };
  return searchJsonDsl(dslArguments);
}
|
|
2927
|
+
/**
 * Load the index of a workspace via `loadHydratedIndex` and run a JSON DSL
 * request against it through `searchJsonRequest`.
 *
 * @param {object} options
 * @param {string} options.workspacePath workspace whose index is loaded
 * @param {object} options.request JSON DSL request
 * @param {string} [options.indexName="querylight"] index name forwarded with the request
 * @returns {Promise<object>} whatever `searchJsonRequest` resolves to
 */
async function searchJsonIndex({ workspacePath, request, indexName = "querylight" }) {
  const hydrated = await loadHydratedIndex(workspacePath);
  const forwarded = { index: hydrated, request, indexName };
  return searchJsonRequest(forwarded);
}
|
|
2716
2935
|
function normalizeDisplayTitle(title) {
|
|
2717
2936
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
2718
2937
|
}
|
|
@@ -2850,6 +3069,7 @@ async function searchIndex({
|
|
|
2850
3069
|
retrievalMode,
|
|
2851
3070
|
showChunks = false
|
|
2852
3071
|
}) {
|
|
3072
|
+
const startedAt = Date.now();
|
|
2853
3073
|
const config = await loadConfig(workspacePath);
|
|
2854
3074
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
2855
3075
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
@@ -2906,12 +3126,48 @@ async function searchIndex({
|
|
|
2906
3126
|
};
|
|
2907
3127
|
})
|
|
2908
3128
|
);
|
|
2909
|
-
|
|
3129
|
+
const hits2 = latestResults.filter((result) => result != null).map((result) => {
|
|
3130
|
+
const chunk = chunks.get(result.chunkId);
|
|
3131
|
+
const document = documents.get(result.documentId);
|
|
3132
|
+
const source = sources.get(result.sourceId);
|
|
3133
|
+
return {
|
|
3134
|
+
_index: "querylight",
|
|
3135
|
+
_id: result.chunkId,
|
|
3136
|
+
_score: result.score,
|
|
3137
|
+
_source: {
|
|
3138
|
+
chunkId: result.chunkId,
|
|
3139
|
+
documentId: result.documentId,
|
|
3140
|
+
sourceId: result.sourceId,
|
|
3141
|
+
sourceType: result.sourceType,
|
|
3142
|
+
sourceName: source?.name,
|
|
3143
|
+
title: chunk.title,
|
|
3144
|
+
uri: result.uri,
|
|
3145
|
+
headingPath: chunk.headingPath,
|
|
3146
|
+
text: chunk.text,
|
|
3147
|
+
snippet: result.snippet,
|
|
3148
|
+
normalizedPath: document.normalizedPath,
|
|
3149
|
+
publicationDate: result.publicationDate ?? null,
|
|
3150
|
+
crawledAt: document.crawledAt,
|
|
3151
|
+
firstSeenAt: result.firstSeenAt,
|
|
3152
|
+
lastSeenAt: result.lastSeenAt,
|
|
3153
|
+
lastChangedAt: result.lastChangedAt,
|
|
3154
|
+
metadata: result.metadata
|
|
3155
|
+
}
|
|
3156
|
+
};
|
|
3157
|
+
});
|
|
3158
|
+
return createSearchResponse("lexical", hits2, Date.now() - startedAt);
|
|
2910
3159
|
}
|
|
2911
3160
|
const lexicalHits = async () => {
|
|
2912
|
-
const
|
|
2913
|
-
|
|
2914
|
-
|
|
3161
|
+
const response = await searchJsonIndex({
|
|
3162
|
+
workspacePath,
|
|
3163
|
+
request: buildSearchDslRequest({
|
|
3164
|
+
query: normalizedQuery,
|
|
3165
|
+
topK: candidateLimit,
|
|
3166
|
+
filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
|
|
3167
|
+
dateRanges
|
|
3168
|
+
})
|
|
3169
|
+
});
|
|
3170
|
+
return response.hits.hits;
|
|
2915
3171
|
};
|
|
2916
3172
|
const denseHits = async () => {
|
|
2917
3173
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
@@ -2925,15 +3181,18 @@ async function searchIndex({
|
|
|
2925
3181
|
}
|
|
2926
3182
|
return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
|
|
2927
3183
|
};
|
|
3184
|
+
let lexicalResponseHits = [];
|
|
2928
3185
|
let hits;
|
|
2929
3186
|
if (mode === "lexical") {
|
|
2930
|
-
|
|
3187
|
+
lexicalResponseHits = await lexicalHits();
|
|
3188
|
+
hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
|
|
2931
3189
|
} else if (mode === "dense") {
|
|
2932
3190
|
hits = await denseHits();
|
|
2933
3191
|
} else if (mode === "sparse") {
|
|
2934
3192
|
hits = await sparseHits();
|
|
2935
3193
|
} else {
|
|
2936
|
-
|
|
3194
|
+
lexicalResponseHits = await lexicalHits();
|
|
3195
|
+
const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
|
|
2937
3196
|
if (await fileExists(denseVectorPath(workspacePath))) {
|
|
2938
3197
|
rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
|
|
2939
3198
|
}
|
|
@@ -2942,38 +3201,242 @@ async function searchIndex({
|
|
|
2942
3201
|
}
|
|
2943
3202
|
hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
|
|
2944
3203
|
}
|
|
2945
|
-
const
|
|
3204
|
+
const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
|
|
2946
3205
|
const chunk = chunks.get(chunkId);
|
|
2947
3206
|
if (!chunk) {
|
|
2948
|
-
return
|
|
3207
|
+
return [];
|
|
2949
3208
|
}
|
|
3209
|
+
const document = documents.get(chunk.documentId);
|
|
3210
|
+
const source = sources.get(chunk.sourceId);
|
|
3211
|
+
return [{
|
|
3212
|
+
_index: "querylight",
|
|
3213
|
+
_id: chunkId,
|
|
3214
|
+
_score: score,
|
|
3215
|
+
_source: {
|
|
3216
|
+
chunkId,
|
|
3217
|
+
documentId: chunk.documentId,
|
|
3218
|
+
sourceId: chunk.sourceId,
|
|
3219
|
+
sourceType: document?.sourceType ?? "text",
|
|
3220
|
+
sourceName: source?.name,
|
|
3221
|
+
title: chunk.title,
|
|
3222
|
+
uri: chunk.uri,
|
|
3223
|
+
headingPath: chunk.headingPath,
|
|
3224
|
+
text: chunk.text,
|
|
3225
|
+
normalizedPath: document?.normalizedPath,
|
|
3226
|
+
publicationDate: document?.publicationDate ?? null,
|
|
3227
|
+
crawledAt: document?.crawledAt,
|
|
3228
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
3229
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
3230
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
3231
|
+
metadata: chunk.metadata
|
|
3232
|
+
}
|
|
3233
|
+
}];
|
|
3234
|
+
});
|
|
3235
|
+
const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
|
|
3236
|
+
if (showChunks) {
|
|
3237
|
+
const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
|
|
3238
|
+
return createSearchResponse(mode, topHits, Date.now() - startedAt);
|
|
3239
|
+
}
|
|
3240
|
+
const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
|
|
3241
|
+
const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
|
|
3242
|
+
const finalHits = reranked.map((result) => {
|
|
3243
|
+
const hit = byChunkId.get(result.chunkId);
|
|
3244
|
+
return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
|
|
3245
|
+
}).filter((hit) => hit != null);
|
|
3246
|
+
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
3247
|
+
}
|
|
3248
|
+
|
|
3249
|
+
// src/server/search-api.ts
|
|
3250
|
+
import { createServer } from "http";
|
|
3251
|
+
import { readdir, stat as stat4 } from "fs/promises";
|
|
3252
|
+
import path19 from "path";
|
|
3253
|
+
/**
 * Report whether a path refers to a directory.
 *
 * Any failure while inspecting the path (for example a missing path) is
 * treated as "not a directory" rather than surfaced to the caller.
 *
 * @param {string} candidatePath path to inspect
 * @returns {Promise<boolean>} true only when the stat call succeeds and reports a directory
 */
async function pathIsDirectory(candidatePath) {
  const probe = async () => {
    const stats = await stat4(candidatePath);
    return stats.isDirectory();
  };
  // Errors from the stat call or the check itself collapse to `false`.
  return probe().catch(() => false);
}
|
|
3260
|
+
/**
 * Determines which knowledge bases should be served for `workspacePath`.
 *
 * The path is first treated as a workspace itself ("single" mode). If that
 * attempt fails with a `WORKSPACE_ERROR` CliError, the path is treated as a
 * directory whose immediate subdirectories may each contain a `.kb`
 * workspace ("multi" mode). Subdirectories that fail with a
 * `WORKSPACE_ERROR` are skipped; any other error is rethrown.
 *
 * @param {string} workspacePath - Workspace path, or a directory of named subdirectories.
 * @returns {Promise<{mode: "single" | "multi", knowledgeBases: Array<object>}>}
 *   Each knowledge base carries `name`, `workspacePath`, `configuredIndexName` and `index`.
 * @throws {CliError} `WORKSPACE_ERROR` when the path is not a directory or no knowledge base is found.
 */
async function discoverKnowledgeBases(workspacePath) {
  const isWorkspaceError = (error) => error instanceof CliError && error.code === "WORKSPACE_ERROR";

  // Attempt 1: the path itself is a workspace.
  try {
    const singleWorkspace = await assertWorkspaceExists(workspacePath);
    const singleConfig = await loadConfig(singleWorkspace);
    const singleIndex = await loadHydratedIndex(singleWorkspace);
    const onlyKnowledgeBase = {
      name: singleConfig.index.name,
      workspacePath: singleWorkspace,
      configuredIndexName: singleConfig.index.name,
      index: singleIndex
    };
    return { mode: "single", knowledgeBases: [onlyKnowledgeBase] };
  } catch (error) {
    // Only a workspace error means "try the multi layout"; anything else is fatal.
    if (!isWorkspaceError(error)) {
      throw error;
    }
  }

  // Attempt 2: the path is a directory of named subdirectories, each holding `.kb`.
  const resolvedRoot = path19.resolve(workspacePath);
  const rootIsDirectory = await pathIsDirectory(resolvedRoot);
  if (!rootIsDirectory) {
    throw new CliError(`workspace path does not exist: ${resolvedRoot}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  }

  const loadCandidate = async (entry) => {
    const candidateWorkspace = path19.join(resolvedRoot, entry.name, ".kb");
    try {
      const workspace = await assertWorkspaceExists(candidateWorkspace);
      const config = await loadConfig(workspace);
      const index = await loadHydratedIndex(workspace);
      return {
        name: entry.name,
        workspacePath: workspace,
        configuredIndexName: config.index.name,
        index
      };
    } catch (error) {
      if (isWorkspaceError(error)) {
        return null;
      }
      throw error;
    }
  };

  const entries = await readdir(resolvedRoot, { withFileTypes: true });
  const directories = entries.filter((entry) => entry.isDirectory());
  const loaded = await Promise.all(directories.map((entry) => loadCandidate(entry)));
  const knowledgeBases = loaded.filter((knowledgeBase) => knowledgeBase != null);

  if (knowledgeBases.length === 0) {
    throw new CliError(
      `no knowledge bases found at ${resolvedRoot}; use a .kb workspace or a directory of named subdirectories that each contain .kb`,
      "WORKSPACE_ERROR",
      3 /* WorkspaceError */
    );
  }
  return { mode: "multi", knowledgeBases };
}
|
|
3312
|
+
/**
 * Writes `payload` to `response` as a JSON document and ends the response.
 *
 * @param {object} response - Response object exposing `statusCode`, `setHeader` and `end`.
 * @param {number} statusCode - HTTP status code to set on the response.
 * @param {*} payload - Value serialized with `JSON.stringify`.
 * @returns {void}
 */
function sendJson(response, statusCode, payload) {
  const contentType = "application/json; charset=utf-8";
  response.statusCode = statusCode;
  response.setHeader("content-type", contentType);
  // Serialize after the status and header are set, then finish the response.
  const serialized = JSON.stringify(payload);
  response.end(serialized);
}
|
|
3317
|
+
/**
 * Sends a JSON error document of the shape
 * `{ error: { type, reason }, status }` using `statusCode` as the HTTP status.
 *
 * @param {object} response - Response object forwarded to `sendJson`.
 * @param {number} statusCode - HTTP status code; also echoed in the `status` field.
 * @param {string} type - Error type identifier placed in `error.type`.
 * @param {string} reason - Error description placed in `error.reason`.
 * @returns {void}
 */
function sendError(response, statusCode, type, reason) {
  const error = { type, reason };
  const payload = { error, status: statusCode };
  sendJson(response, statusCode, payload);
}
|
|
3326
|
+
/**
 * Collects every chunk of an async-iterable request and returns the whole
 * body decoded as UTF-8.
 *
 * Chunks that are not already Buffers are converted with `Buffer.from`.
 * Decoding happens once on the concatenated bytes.
 *
 * @param {AsyncIterable<Buffer|string>} request - Request stream to drain.
 * @returns {Promise<string>} The full body text (empty string when no chunks arrive).
 */
async function readRequestBody(request) {
  const parts = [];
  for await (const piece of request) {
    const asBuffer = Buffer.isBuffer(piece) ? piece : Buffer.from(piece);
    parts.push(asBuffer);
  }
  const whole = Buffer.concat(parts);
  return whole.toString("utf8");
}
|
|
3333
|
+
/**
 * Parses the raw text of a search request into a plain object.
 *
 * Blank input yields an empty object. Otherwise the text must be JSON whose
 * top-level value is a non-null, non-array object.
 *
 * @param {string} raw - Raw request body text.
 * @returns {object} The parsed request object, or `{}` for blank input.
 * @throws {CliError} `INVALID_ARGUMENT` when the text is not valid JSON or is not a JSON object.
 */
function parseSearchRequest(raw) {
  const text = raw.trim();
  if (text.length === 0) {
    return {};
  }

  let failure;
  try {
    const candidate = JSON.parse(text);
    const isJsonObject = Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    if (isJsonObject) {
      return candidate;
    }
    failure = new Error("expected a JSON object");
  } catch (error) {
    failure = error;
  }

  // Syntax errors and shape errors are both reported the same way.
  const message = failure instanceof Error ? failure.message : String(failure);
  throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
}
|
|
3349
|
+
/**
 * Returns the search route for a knowledge base.
 *
 * @param {string} mode - `"single"` yields the fixed route `/_search`; any other value yields a per-name route.
 * @param {{name: string}} knowledgeBase - Knowledge base whose `name` is used in the per-name route.
 * @returns {string} `/_search` or `/<name>/_search`.
 */
function routeForKnowledgeBase(mode, knowledgeBase) {
  if (mode === "single") {
    return "/_search";
  }
  return `/${knowledgeBase.name}/_search`;
}
|
|
3352
|
+
/**
 * Maps a request pathname to the knowledge base it addresses.
 *
 * In `"single"` mode the first entry of `knowledgeBases` is served at
 * `/_search` and at `/<configuredIndexName>/_search`. In any other mode a
 * knowledge base is served at `/<name>/_search`, where `<name>` is its key in
 * `knowledgeBases`.
 *
 * The name segment is matched as received first, then percent-decoded. This
 * matters because a URL pathname carries spaces and non-ASCII characters in
 * percent-encoded form, so a raw comparison alone can never match names that
 * contain them.
 *
 * @param {string} pathname - Request path, e.g. `/docs/_search`.
 * @param {string} mode - `"single"` or a multi-knowledge-base mode.
 * @param {Map<string, object>} knowledgeBases - Knowledge bases keyed by name.
 * @returns {object|null} The matching knowledge base, or `null` when the path is not a search route.
 */
function resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases) {
  // Decode a single path segment; malformed escapes fall back to the raw text
  // instead of throwing URIError.
  const decodeSegment = (segment) => {
    try {
      return decodeURIComponent(segment);
    } catch {
      return segment;
    }
  };

  const segments = pathname.split("/").filter(Boolean);
  const isNamedSearch = segments.length === 2 && segments[1] === "_search";

  if (mode === "single") {
    const knowledgeBase = [...knowledgeBases.values()][0];
    if (!knowledgeBase) {
      return null;
    }
    if (segments.length === 1 && segments[0] === "_search") {
      return knowledgeBase;
    }
    if (isNamedSearch) {
      const expected = knowledgeBase.configuredIndexName;
      const rawName = segments[0];
      if (rawName === expected || decodeSegment(rawName) === expected) {
        return knowledgeBase;
      }
    }
    return null;
  }

  if (isNamedSearch) {
    const rawName = segments[0];
    // Raw lookup first keeps every previously working route working.
    return knowledgeBases.get(rawName) ?? knowledgeBases.get(decodeSegment(rawName)) ?? null;
  }
  return null;
}
|
|
3372
|
+
/**
 * Handles one HTTP request against the search API and writes a JSON response.
 *
 * Responses produced here:
 * - 405 `method_not_allowed` (with an `allow` header) for methods other than GET and POST;
 * - 404 `resource_not_found_exception` when the path maps to no knowledge base;
 * - 200 with the search result on success;
 * - 400 `parse_exception` when a CliError with code `INVALID_ARGUMENT` is raised;
 * - 500 `search_phase_execution_exception` for any other error.
 *
 * @param {object} request - Incoming request; its `method` is read and its body is consumed.
 * @param {object} response - Response the JSON result or error is written to.
 * @param {string} pathname - Path component of the request URL.
 * @param {string} mode - `"single"` or `"multi"`.
 * @param {Map<string, object>} knowledgeBases - Knowledge bases keyed by name.
 * @returns {Promise<void>}
 */
async function handleSearchRequest(request, response, pathname, mode, knowledgeBases) {
  const { method } = request;
  const methodAllowed = method === "GET" || method === "POST";
  if (!methodAllowed) {
    response.setHeader("allow", "GET, POST");
    sendError(response, 405, "method_not_allowed", `unsupported method for ${pathname}`);
    return;
  }

  const target = resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases);
  if (!target) {
    sendError(response, 404, "resource_not_found_exception", `unknown search route: ${pathname}`);
    return;
  }

  try {
    const rawBody = await readRequestBody(request);
    const requestBody = parseSearchRequest(rawBody);
    // Multi mode reports the knowledge base name; single mode reports the configured index name.
    const indexName = mode === "multi" ? target.name : target.configuredIndexName;
    const result = await searchJsonRequest({
      index: target.index,
      request: requestBody,
      indexName
    });
    sendJson(response, 200, result);
  } catch (error) {
    const isBadRequest = error instanceof CliError && error.code === "INVALID_ARGUMENT";
    if (isBadRequest) {
      sendError(response, 400, "parse_exception", error.message);
      return;
    }
    const reason = error instanceof Error ? error.message : String(error);
    sendError(response, 500, "search_phase_execution_exception", reason);
  }
}
|
|
3401
|
+
/**
 * Starts an HTTP server exposing the search API for the knowledge bases
 * discovered at `workspacePath`, and resolves once it is listening.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace path, or a directory of named subdirectories.
 * @param {string} [options.host="127.0.0.1"] - Host/interface to bind.
 * @param {number} [options.port=3000] - TCP port to bind.
 * @returns {Promise<{mode: string, url: string, knowledgeBases: Array<{name: string, workspacePath: string, route: string}>, close: () => Promise<void>}>}
 *   Server description plus a `close` function that stops the server.
 * @throws {CliError} `SERVER_ERROR` when the bound address is not a TCP address;
 *   errors from knowledge-base discovery and from `listen` also propagate.
 */
async function startSearchApiServer({
  workspacePath,
  host = "127.0.0.1",
  port = 3e3
}) {
  const { mode, knowledgeBases } = await discoverKnowledgeBases(workspacePath);
  const byName = new Map(knowledgeBases.map((knowledgeBase) => [knowledgeBase.name, knowledgeBase]));
  const server = createServer(async (request, response) => {
    // Nothing awaits this listener's promise, so every failure must be handled
    // here; an escaped rejection would surface as an unhandled rejection.
    let pathname;
    try {
      // The base is built from the client-supplied Host header, so `new URL`
      // can throw on malformed input.
      pathname = new URL(request.url ?? "/", `http://${request.headers.host ?? `${host}:${port}`}`).pathname;
    } catch {
      sendError(response, 400, "illegal_argument_exception", "malformed request URL or Host header");
      return;
    }
    try {
      await handleSearchRequest(request, response, pathname, mode, byName);
    } catch (error) {
      if (response.headersSent) {
        // Part of a response is already on the wire; abort it.
        response.destroy();
        return;
      }
      const message = error instanceof Error ? error.message : String(error);
      sendError(response, 500, "search_phase_execution_exception", message);
    }
  });
  await new Promise((resolve2, reject) => {
    server.once("error", reject);
    server.listen(port, host, () => {
      // Listening succeeded; startup errors can no longer reject this promise.
      server.off("error", reject);
      resolve2();
    });
  });
  const address = server.address();
  if (!address || typeof address === "string") {
    throw new CliError("server failed to bind to a TCP address", "SERVER_ERROR", 1 /* GeneralError */);
  }
  // Use the actually bound port (relevant when `port` is 0).
  const url = `http://${host}:${address.port}`;
  return {
    mode,
    url,
    knowledgeBases: knowledgeBases.map((knowledgeBase) => ({
      name: knowledgeBase.name,
      workspacePath: knowledgeBase.workspacePath,
      route: routeForKnowledgeBase(mode, knowledgeBase)
    })),
    close: async () => new Promise((resolve2, reject) => {
      server.close((error) => error ? reject(error) : resolve2());
    })
  };
}
|
|
2974
3437
|
|
|
2975
3438
|
// src/query/related-service.ts
|
|
2976
|
-
import
|
|
3439
|
+
import path20 from "path";
|
|
2977
3440
|
function cosineSimilarity2(left, right) {
|
|
2978
3441
|
let dot = 0;
|
|
2979
3442
|
let leftNorm = 0;
|
|
@@ -3049,7 +3512,7 @@ async function findRelatedDocuments({
|
|
|
3049
3512
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
3050
3513
|
throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
|
|
3051
3514
|
}
|
|
3052
|
-
const documents = await readJsonl(
|
|
3515
|
+
const documents = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
|
|
3053
3516
|
const selected = resolveDocumentSelector(documents, document);
|
|
3054
3517
|
const densePayload = await readDensePayload(workspacePath);
|
|
3055
3518
|
const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
|
|
@@ -3086,9 +3549,10 @@ async function createContext({
|
|
|
3086
3549
|
retrievalMode
|
|
3087
3550
|
}) {
|
|
3088
3551
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
3552
|
+
const results = searchResultsFromResponse(search, true);
|
|
3089
3553
|
const sources = [];
|
|
3090
3554
|
let total = 0;
|
|
3091
|
-
for (const result of
|
|
3555
|
+
for (const result of results) {
|
|
3092
3556
|
const text = result.text ?? "";
|
|
3093
3557
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
3094
3558
|
break;
|
|
@@ -3121,7 +3585,7 @@ async function createContext({
|
|
|
3121
3585
|
}
|
|
3122
3586
|
|
|
3123
3587
|
// src/report/diff-service.ts
|
|
3124
|
-
import
|
|
3588
|
+
import path21 from "path";
|
|
3125
3589
|
function chooseBaselineRun(runs, since) {
|
|
3126
3590
|
if (since === "last-run") {
|
|
3127
3591
|
return runs.at(-1);
|
|
@@ -3137,7 +3601,7 @@ async function diffWorkspace({
|
|
|
3137
3601
|
documentId,
|
|
3138
3602
|
since
|
|
3139
3603
|
}) {
|
|
3140
|
-
const current = await readJsonl(
|
|
3604
|
+
const current = await readJsonl(path21.join(workspacePath, "documents", "documents.jsonl"));
|
|
3141
3605
|
const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
|
|
3142
3606
|
const previous = new Map((baseline?.documentsSnapshot ?? []).map((document) => [document.id, document]));
|
|
3143
3607
|
const changedDocuments = current.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId)).filter((document) => {
|
|
@@ -3193,10 +3657,15 @@ export {
|
|
|
3193
3657
|
ingestSources,
|
|
3194
3658
|
listSources,
|
|
3195
3659
|
loadConfig,
|
|
3660
|
+
loadHydratedIndex,
|
|
3196
3661
|
removeSource,
|
|
3197
3662
|
renderChangeReport,
|
|
3198
3663
|
reprocessDocuments,
|
|
3199
3664
|
searchIndex,
|
|
3665
|
+
searchJsonIndex,
|
|
3666
|
+
searchJsonRequest,
|
|
3667
|
+
searchResultsFromResponse,
|
|
3668
|
+
startSearchApiServer,
|
|
3200
3669
|
updateSource,
|
|
3201
3670
|
writeDefaultConfig
|
|
3202
3671
|
};
|