@tryformation/querylight-cli 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -3
- package/dist/cli/format.d.ts +2 -2
- package/dist/cli/main.js +694 -135
- package/dist/core/constants.d.ts +1 -1
- package/dist/index/querylight-indexer.d.ts +2 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +592 -123
- package/dist/query/search-service.d.ts +14 -1
- package/dist/server/search-api.d.ts +15 -0
- package/dist/types/models.d.ts +36 -1
- package/dist/vector/dense.d.ts +6 -1
- package/package.json +2 -2
- package/scripts/sparse-encode.py +29 -8
package/dist/cli/main.js
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
4
|
import { Command, Option } from "commander";
|
|
5
|
-
import { stat as
|
|
6
|
-
import
|
|
5
|
+
import { readFile as readFile11, stat as stat5 } from "fs/promises";
|
|
6
|
+
import path22 from "path";
|
|
7
7
|
|
|
8
8
|
// src/chunk/chunker.ts
|
|
9
9
|
import { readFile as readFile3 } from "fs/promises";
|
|
@@ -16,7 +16,7 @@ import path from "path";
|
|
|
16
16
|
import YAML from "yaml";
|
|
17
17
|
|
|
18
18
|
// src/core/constants.ts
|
|
19
|
-
var PACKAGE_VERSION = "0.2.
|
|
19
|
+
var PACKAGE_VERSION = "0.2.3";
|
|
20
20
|
var DEFAULT_WORKSPACE = ".kb";
|
|
21
21
|
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
22
|
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
@@ -53,7 +53,7 @@ var defaultConfig = () => ({
|
|
|
53
53
|
defaultMode: "lexical",
|
|
54
54
|
dense: {
|
|
55
55
|
enabled: true,
|
|
56
|
-
modelId: "Xenova/
|
|
56
|
+
modelId: "Xenova/paraphrase-MiniLM-L3-v2",
|
|
57
57
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
58
58
|
indexHashTables: 8,
|
|
59
59
|
indexRandomSeed: 42,
|
|
@@ -61,7 +61,7 @@ var defaultConfig = () => ({
|
|
|
61
61
|
},
|
|
62
62
|
sparse: {
|
|
63
63
|
enabled: true,
|
|
64
|
-
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-
|
|
64
|
+
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
|
|
65
65
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
66
66
|
documentTopTokens: 128,
|
|
67
67
|
queryEncoding: "tokenizer-token-weights",
|
|
@@ -374,7 +374,7 @@ async function assertWorkspaceExists(workspacePath) {
|
|
|
374
374
|
}
|
|
375
375
|
|
|
376
376
|
// src/index/querylight-indexer.ts
|
|
377
|
-
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
377
|
+
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
378
378
|
import path11 from "path";
|
|
379
379
|
|
|
380
380
|
// src/vector/dense.ts
|
|
@@ -682,15 +682,26 @@ function createSparseChunkText(chunk) {
|
|
|
682
682
|
// src/vector/dense.ts
|
|
683
683
|
var denseEmbedderFactory = null;
|
|
684
684
|
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
685
|
+
function normalizeDenseEmbedder(embedder) {
|
|
686
|
+
if (typeof embedder === "function") {
|
|
687
|
+
return { embed: embedder };
|
|
688
|
+
}
|
|
689
|
+
return embedder;
|
|
690
|
+
}
|
|
685
691
|
async function createEmbedder(cacheDir, modelId) {
|
|
686
692
|
if (denseEmbedderFactory) {
|
|
687
|
-
return denseEmbedderFactory(cacheDir, modelId);
|
|
693
|
+
return normalizeDenseEmbedder(await denseEmbedderFactory(cacheDir, modelId));
|
|
688
694
|
}
|
|
689
695
|
const runtime = await getDenseTransformersRuntime(cacheDir);
|
|
690
696
|
const extractor = await runtime.pipeline("feature-extraction", modelId);
|
|
691
|
-
return
|
|
692
|
-
|
|
693
|
-
|
|
697
|
+
return {
|
|
698
|
+
async embed(text) {
|
|
699
|
+
const output = await extractor(text, { pooling: "mean", normalize: true });
|
|
700
|
+
return output.tolist()[0];
|
|
701
|
+
},
|
|
702
|
+
async dispose() {
|
|
703
|
+
await extractor.dispose();
|
|
704
|
+
}
|
|
694
705
|
};
|
|
695
706
|
}
|
|
696
707
|
function exactDenseQuery(payload, vector, topK) {
|
|
@@ -699,8 +710,12 @@ function exactDenseQuery(payload, vector, topK) {
|
|
|
699
710
|
async function pullDenseModel(workspacePath, config) {
|
|
700
711
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
701
712
|
await mkdir4(cacheDir, { recursive: true });
|
|
702
|
-
const
|
|
703
|
-
|
|
713
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
714
|
+
try {
|
|
715
|
+
await embedder.embed("warm dense model cache");
|
|
716
|
+
} finally {
|
|
717
|
+
await embedder.dispose?.();
|
|
718
|
+
}
|
|
704
719
|
}
|
|
705
720
|
async function buildDenseVectors({
|
|
706
721
|
workspacePath,
|
|
@@ -710,53 +725,57 @@ async function buildDenseVectors({
|
|
|
710
725
|
const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
711
726
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
712
727
|
await mkdir4(cacheDir, { recursive: true });
|
|
713
|
-
const
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
const
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
728
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
729
|
+
try {
|
|
730
|
+
const records = [];
|
|
731
|
+
let dimensions = 0;
|
|
732
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
733
|
+
for (const chunk of chunks) {
|
|
734
|
+
const embedding = await embedder.embed(createDenseChunkText(chunk));
|
|
735
|
+
dimensions ||= embedding.length;
|
|
736
|
+
records.push({
|
|
737
|
+
chunkId: chunk.id,
|
|
738
|
+
documentId: chunk.documentId,
|
|
739
|
+
sourceId: chunk.sourceId,
|
|
740
|
+
title: chunk.title,
|
|
741
|
+
uri: chunk.uri,
|
|
742
|
+
headingPath: chunk.headingPath,
|
|
743
|
+
text: chunk.text,
|
|
744
|
+
embedding
|
|
745
|
+
});
|
|
746
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
747
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
reportProgress(progress, "Building dense vector index");
|
|
751
|
+
const index = new VectorFieldIndex({
|
|
752
|
+
numHashTables: config.indexHashTables,
|
|
753
|
+
dimensions,
|
|
754
|
+
random: createSeededRandom(config.indexRandomSeed)
|
|
729
755
|
});
|
|
730
|
-
|
|
731
|
-
|
|
756
|
+
for (const record of records) {
|
|
757
|
+
index.insert(record.chunkId, [record.embedding]);
|
|
732
758
|
}
|
|
759
|
+
const metadata = {
|
|
760
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
761
|
+
modelId: config.modelId,
|
|
762
|
+
dimensions,
|
|
763
|
+
hashTables: config.indexHashTables,
|
|
764
|
+
randomSeed: config.indexRandomSeed,
|
|
765
|
+
chunkCount: records.length,
|
|
766
|
+
indexHash: sha256(JSON.stringify(index.indexState))
|
|
767
|
+
};
|
|
768
|
+
const payload = {
|
|
769
|
+
metadata,
|
|
770
|
+
indexState: index.indexState,
|
|
771
|
+
chunks: records
|
|
772
|
+
};
|
|
773
|
+
await writeDensePayload(workspacePath, payload);
|
|
774
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
775
|
+
return payload;
|
|
776
|
+
} finally {
|
|
777
|
+
await embedder.dispose?.();
|
|
733
778
|
}
|
|
734
|
-
reportProgress(progress, "Building dense vector index");
|
|
735
|
-
const index = new VectorFieldIndex({
|
|
736
|
-
numHashTables: config.indexHashTables,
|
|
737
|
-
dimensions,
|
|
738
|
-
random: createSeededRandom(config.indexRandomSeed)
|
|
739
|
-
});
|
|
740
|
-
for (const record of records) {
|
|
741
|
-
index.insert(record.chunkId, [record.embedding]);
|
|
742
|
-
}
|
|
743
|
-
const metadata = {
|
|
744
|
-
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
745
|
-
modelId: config.modelId,
|
|
746
|
-
dimensions,
|
|
747
|
-
hashTables: config.indexHashTables,
|
|
748
|
-
randomSeed: config.indexRandomSeed,
|
|
749
|
-
chunkCount: records.length,
|
|
750
|
-
indexHash: sha256(JSON.stringify(index.indexState))
|
|
751
|
-
};
|
|
752
|
-
const payload = {
|
|
753
|
-
metadata,
|
|
754
|
-
indexState: index.indexState,
|
|
755
|
-
chunks: records
|
|
756
|
-
};
|
|
757
|
-
await writeDensePayload(workspacePath, payload);
|
|
758
|
-
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
759
|
-
return payload;
|
|
760
779
|
}
|
|
761
780
|
async function denseQuery({
|
|
762
781
|
workspacePath,
|
|
@@ -766,21 +785,25 @@ async function denseQuery({
|
|
|
766
785
|
}) {
|
|
767
786
|
const payload = await readDensePayload(workspacePath);
|
|
768
787
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
769
|
-
const
|
|
770
|
-
|
|
771
|
-
|
|
788
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
789
|
+
try {
|
|
790
|
+
const vector = await embedder.embed(query);
|
|
791
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
792
|
+
return exactDenseQuery(payload, vector, topK);
|
|
793
|
+
}
|
|
794
|
+
const index = new VectorFieldIndex({
|
|
795
|
+
numHashTables: payload.metadata.hashTables,
|
|
796
|
+
dimensions: payload.metadata.dimensions,
|
|
797
|
+
random: createSeededRandom(payload.metadata.randomSeed)
|
|
798
|
+
}).loadState(payload.indexState);
|
|
799
|
+
const approximateHits = index.query(vector, topK);
|
|
800
|
+
if (approximateHits.length >= topK) {
|
|
801
|
+
return approximateHits;
|
|
802
|
+
}
|
|
772
803
|
return exactDenseQuery(payload, vector, topK);
|
|
804
|
+
} finally {
|
|
805
|
+
await embedder.dispose?.();
|
|
773
806
|
}
|
|
774
|
-
const index = new VectorFieldIndex({
|
|
775
|
-
numHashTables: payload.metadata.hashTables,
|
|
776
|
-
dimensions: payload.metadata.dimensions,
|
|
777
|
-
random: createSeededRandom(payload.metadata.randomSeed)
|
|
778
|
-
}).loadState(payload.indexState);
|
|
779
|
-
const approximateHits = index.query(vector, topK);
|
|
780
|
-
if (approximateHits.length >= topK) {
|
|
781
|
-
return approximateHits;
|
|
782
|
-
}
|
|
783
|
-
return exactDenseQuery(payload, vector, topK);
|
|
784
807
|
}
|
|
785
808
|
|
|
786
809
|
// src/vector/sparse.ts
|
|
@@ -1091,12 +1114,19 @@ function keywordFieldIndex() {
|
|
|
1091
1114
|
function createIndexMapping(extraFields = []) {
|
|
1092
1115
|
const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
|
|
1093
1116
|
const mapping = {
|
|
1117
|
+
_source: new StoredSourceIndex(),
|
|
1094
1118
|
text: lexical,
|
|
1095
1119
|
title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
|
|
1096
1120
|
uri: keywordFieldIndex(),
|
|
1097
1121
|
sourceId: keywordFieldIndex(),
|
|
1122
|
+
sourceName: keywordFieldIndex(),
|
|
1098
1123
|
tags: keywordFieldIndex(),
|
|
1099
|
-
sourceType: keywordFieldIndex()
|
|
1124
|
+
sourceType: keywordFieldIndex(),
|
|
1125
|
+
publicationDate: new DateFieldIndex(),
|
|
1126
|
+
firstSeenAt: new DateFieldIndex(),
|
|
1127
|
+
lastSeenAt: new DateFieldIndex(),
|
|
1128
|
+
lastChangedAt: new DateFieldIndex(),
|
|
1129
|
+
crawledAt: new DateFieldIndex()
|
|
1100
1130
|
};
|
|
1101
1131
|
for (const field of extraFields) {
|
|
1102
1132
|
mapping[field] = keywordFieldIndex();
|
|
@@ -1132,8 +1162,12 @@ async function buildIndex({
|
|
|
1132
1162
|
const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
|
|
1133
1163
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
1134
1164
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
1165
|
+
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
1166
|
+
const sourcesById = new Map(sources.map((source) => [source.id, source]));
|
|
1135
1167
|
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
1136
1168
|
for (const chunk of chunks) {
|
|
1169
|
+
const document = documentsById.get(chunk.documentId);
|
|
1170
|
+
const source = sourcesById.get(chunk.sourceId);
|
|
1137
1171
|
index.index({
|
|
1138
1172
|
id: chunk.id,
|
|
1139
1173
|
fields: {
|
|
@@ -1141,9 +1175,33 @@ async function buildIndex({
|
|
|
1141
1175
|
title: [chunk.title],
|
|
1142
1176
|
uri: [chunk.uri.toLowerCase()],
|
|
1143
1177
|
sourceId: [chunk.sourceId.toLowerCase()],
|
|
1178
|
+
sourceName: source ? [source.name.toLowerCase()] : [],
|
|
1144
1179
|
tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
|
|
1145
1180
|
sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
|
|
1181
|
+
publicationDate: document?.publicationDate ? [document.publicationDate] : [],
|
|
1182
|
+
firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
|
|
1183
|
+
lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
|
|
1184
|
+
lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
|
|
1185
|
+
crawledAt: document?.crawledAt ? [document.crawledAt] : [],
|
|
1146
1186
|
...flattenMetadata(chunk.metadata)
|
|
1187
|
+
},
|
|
1188
|
+
source: {
|
|
1189
|
+
chunkId: chunk.id,
|
|
1190
|
+
documentId: chunk.documentId,
|
|
1191
|
+
sourceId: chunk.sourceId,
|
|
1192
|
+
sourceType: document?.sourceType ?? "text",
|
|
1193
|
+
sourceName: source?.name,
|
|
1194
|
+
title: chunk.title,
|
|
1195
|
+
uri: chunk.uri,
|
|
1196
|
+
headingPath: chunk.headingPath,
|
|
1197
|
+
text: chunk.text,
|
|
1198
|
+
normalizedPath: document?.normalizedPath,
|
|
1199
|
+
publicationDate: document?.publicationDate ?? null,
|
|
1200
|
+
crawledAt: document?.crawledAt,
|
|
1201
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
1202
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
1203
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
1204
|
+
metadata: chunk.metadata
|
|
1147
1205
|
}
|
|
1148
1206
|
});
|
|
1149
1207
|
}
|
|
@@ -1152,7 +1210,7 @@ async function buildIndex({
|
|
|
1152
1210
|
const metadata = {
|
|
1153
1211
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
1154
1212
|
createdAt,
|
|
1155
|
-
querylightVersion: "0.
|
|
1213
|
+
querylightVersion: "0.11.0",
|
|
1156
1214
|
kbVersion: "0.1.0",
|
|
1157
1215
|
documentCount: documents.length,
|
|
1158
1216
|
chunkCount: chunks.length,
|
|
@@ -2138,13 +2196,17 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
2138
2196
|
if (url.search.length > 0) {
|
|
2139
2197
|
return false;
|
|
2140
2198
|
}
|
|
2141
|
-
|
|
2199
|
+
const pathname = url.pathname.toLowerCase();
|
|
2200
|
+
if (pathname.endsWith(".xml")) {
|
|
2142
2201
|
return false;
|
|
2143
2202
|
}
|
|
2144
|
-
if (
|
|
2203
|
+
if (pathname.endsWith(".pdf")) {
|
|
2145
2204
|
return false;
|
|
2146
2205
|
}
|
|
2147
|
-
if (
|
|
2206
|
+
if (pathname.includes("/cdn-cgi/")) {
|
|
2207
|
+
return false;
|
|
2208
|
+
}
|
|
2209
|
+
if (pathname === "/search" || pathname === "/search/" || pathname.endsWith("/search/")) {
|
|
2148
2210
|
return false;
|
|
2149
2211
|
}
|
|
2150
2212
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
@@ -2778,7 +2840,7 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
|
|
|
2778
2840
|
|
|
2779
2841
|
// src/query/search-service.ts
|
|
2780
2842
|
import { readFile as readFile10 } from "fs/promises";
|
|
2781
|
-
import {
|
|
2843
|
+
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2782
2844
|
import path18 from "path";
|
|
2783
2845
|
async function loadHydratedIndex(workspacePath) {
|
|
2784
2846
|
let state;
|
|
@@ -2806,24 +2868,6 @@ function matchesPrefix(value, prefixes) {
|
|
|
2806
2868
|
const lower = value.toLowerCase();
|
|
2807
2869
|
return prefixes.some((prefix) => lower.startsWith(prefix));
|
|
2808
2870
|
}
|
|
2809
|
-
function buildSearchQuery(query, filters) {
|
|
2810
|
-
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2811
|
-
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2812
|
-
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2813
|
-
return new BoolQuery({
|
|
2814
|
-
should: [
|
|
2815
|
-
new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
|
|
2816
|
-
new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
|
|
2817
|
-
new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
|
|
2818
|
-
],
|
|
2819
|
-
filter: [
|
|
2820
|
-
...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
|
|
2821
|
-
...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
|
|
2822
|
-
...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
|
|
2823
|
-
...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
|
|
2824
|
-
]
|
|
2825
|
-
});
|
|
2826
|
-
}
|
|
2827
2871
|
function isValidDate(value) {
|
|
2828
2872
|
return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
|
|
2829
2873
|
}
|
|
@@ -3022,6 +3066,185 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
3022
3066
|
}
|
|
3023
3067
|
return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
|
|
3024
3068
|
}
|
|
3069
|
+
function buildSearchDslRequest({
|
|
3070
|
+
query,
|
|
3071
|
+
topK,
|
|
3072
|
+
filters,
|
|
3073
|
+
dateRanges
|
|
3074
|
+
}) {
|
|
3075
|
+
const filterClauses = [];
|
|
3076
|
+
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
3077
|
+
const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
|
|
3078
|
+
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
3079
|
+
const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
|
|
3080
|
+
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
3081
|
+
if (sourceIds.length > 0) {
|
|
3082
|
+
filterClauses.push({ terms: { sourceId: sourceIds } });
|
|
3083
|
+
}
|
|
3084
|
+
if (sourceNames.length > 0) {
|
|
3085
|
+
filterClauses.push({ terms: { sourceName: sourceNames } });
|
|
3086
|
+
}
|
|
3087
|
+
if (sourceTypes.length > 0) {
|
|
3088
|
+
filterClauses.push({ terms: { sourceType: sourceTypes } });
|
|
3089
|
+
}
|
|
3090
|
+
if (uriPrefixes.length > 0) {
|
|
3091
|
+
filterClauses.push({
|
|
3092
|
+
bool: {
|
|
3093
|
+
should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
|
|
3094
|
+
minimum_should_match: 1
|
|
3095
|
+
}
|
|
3096
|
+
});
|
|
3097
|
+
}
|
|
3098
|
+
if (tags.length > 0) {
|
|
3099
|
+
filterClauses.push({ terms: { tags } });
|
|
3100
|
+
}
|
|
3101
|
+
if (filters.hasPublicationDate) {
|
|
3102
|
+
filterClauses.push({ exists: { field: "publicationDate" } });
|
|
3103
|
+
}
|
|
3104
|
+
for (const { key, value } of filters.metadata ?? []) {
|
|
3105
|
+
filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
|
|
3106
|
+
}
|
|
3107
|
+
for (const { field, from, to } of dateRanges) {
|
|
3108
|
+
filterClauses.push({
|
|
3109
|
+
range: {
|
|
3110
|
+
[field]: {
|
|
3111
|
+
...from ? { gte: from } : {},
|
|
3112
|
+
...to ? { lte: to } : {}
|
|
3113
|
+
}
|
|
3114
|
+
}
|
|
3115
|
+
});
|
|
3116
|
+
}
|
|
3117
|
+
return {
|
|
3118
|
+
size: topK,
|
|
3119
|
+
query: {
|
|
3120
|
+
bool: {
|
|
3121
|
+
should: [
|
|
3122
|
+
{ match: { title: { query, operator: "and", boost: 6 } } },
|
|
3123
|
+
{ match: { text: { query, operator: "and", boost: 4 } } },
|
|
3124
|
+
{ match: { text: { query, operator: "or", boost: 2 } } }
|
|
3125
|
+
],
|
|
3126
|
+
filter: filterClauses,
|
|
3127
|
+
minimum_should_match: 1
|
|
3128
|
+
}
|
|
3129
|
+
}
|
|
3130
|
+
};
|
|
3131
|
+
}
|
|
3132
|
+
function sourceToChunkRecord(source) {
|
|
3133
|
+
return {
|
|
3134
|
+
id: source.chunkId,
|
|
3135
|
+
documentId: source.documentId,
|
|
3136
|
+
sourceId: source.sourceId,
|
|
3137
|
+
title: source.title,
|
|
3138
|
+
uri: source.uri,
|
|
3139
|
+
headingPath: source.headingPath,
|
|
3140
|
+
text: source.text,
|
|
3141
|
+
contentHash: "",
|
|
3142
|
+
metadata: source.metadata,
|
|
3143
|
+
firstSeenAt: source.firstSeenAt,
|
|
3144
|
+
lastSeenAt: source.lastSeenAt,
|
|
3145
|
+
lastChangedAt: source.lastChangedAt
|
|
3146
|
+
};
|
|
3147
|
+
}
|
|
3148
|
+
function sourceToDocumentRecord(source) {
|
|
3149
|
+
return {
|
|
3150
|
+
id: source.documentId,
|
|
3151
|
+
sourceId: source.sourceId,
|
|
3152
|
+
sourceType: source.sourceType,
|
|
3153
|
+
title: source.title,
|
|
3154
|
+
uri: source.uri,
|
|
3155
|
+
sourceUri: source.uri,
|
|
3156
|
+
mimeType: "text/plain",
|
|
3157
|
+
normalizedPath: source.normalizedPath ?? "",
|
|
3158
|
+
contentHash: "",
|
|
3159
|
+
metadata: source.metadata,
|
|
3160
|
+
publicationDate: source.publicationDate ?? null,
|
|
3161
|
+
crawledAt: source.crawledAt,
|
|
3162
|
+
firstSeenAt: source.firstSeenAt,
|
|
3163
|
+
lastSeenAt: source.lastSeenAt,
|
|
3164
|
+
lastChangedAt: source.lastChangedAt
|
|
3165
|
+
};
|
|
3166
|
+
}
|
|
3167
|
+
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
|
|
3168
|
+
const source = hit._source;
|
|
3169
|
+
const chunk = sourceToChunkRecord(source);
|
|
3170
|
+
const document = sourceToDocumentRecord(source);
|
|
3171
|
+
const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
|
|
3172
|
+
const enrichedSource = {
|
|
3173
|
+
...source,
|
|
3174
|
+
snippet
|
|
3175
|
+
};
|
|
3176
|
+
const result = {
|
|
3177
|
+
chunkId: source.chunkId,
|
|
3178
|
+
documentId: source.documentId,
|
|
3179
|
+
sourceId: source.sourceId,
|
|
3180
|
+
sourceType: source.sourceType,
|
|
3181
|
+
score: hit._score,
|
|
3182
|
+
title: chooseResultTitle(chunk),
|
|
3183
|
+
uri: source.uri,
|
|
3184
|
+
snippet,
|
|
3185
|
+
text: showChunks ? source.text : void 0,
|
|
3186
|
+
publicationDate: source.publicationDate ?? null,
|
|
3187
|
+
firstSeenAt: source.firstSeenAt,
|
|
3188
|
+
lastSeenAt: source.lastSeenAt,
|
|
3189
|
+
lastChangedAt: source.lastChangedAt,
|
|
3190
|
+
metadata: source.metadata
|
|
3191
|
+
};
|
|
3192
|
+
return {
|
|
3193
|
+
hit: {
|
|
3194
|
+
...hit,
|
|
3195
|
+
_source: enrichedSource
|
|
3196
|
+
},
|
|
3197
|
+
result
|
|
3198
|
+
};
|
|
3199
|
+
}
|
|
3200
|
+
function createSearchResponse(retrievalMode, hits, took, aggregations) {
|
|
3201
|
+
return {
|
|
3202
|
+
retrievalMode,
|
|
3203
|
+
took,
|
|
3204
|
+
hits: {
|
|
3205
|
+
total: {
|
|
3206
|
+
value: hits.length,
|
|
3207
|
+
relation: "eq"
|
|
3208
|
+
},
|
|
3209
|
+
max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
|
|
3210
|
+
hits
|
|
3211
|
+
},
|
|
3212
|
+
aggregations
|
|
3213
|
+
};
|
|
3214
|
+
}
|
|
3215
|
+
function searchResultsFromResponse(response2, showChunks = false) {
|
|
3216
|
+
return response2.hits.hits.map((hit) => ({
|
|
3217
|
+
chunkId: hit._source.chunkId,
|
|
3218
|
+
documentId: hit._source.documentId,
|
|
3219
|
+
sourceId: hit._source.sourceId,
|
|
3220
|
+
sourceType: hit._source.sourceType,
|
|
3221
|
+
score: hit._score,
|
|
3222
|
+
title: chooseResultTitle(sourceToChunkRecord(hit._source)),
|
|
3223
|
+
uri: hit._source.uri,
|
|
3224
|
+
snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
|
|
3225
|
+
text: showChunks ? hit._source.text : void 0,
|
|
3226
|
+
publicationDate: hit._source.publicationDate ?? null,
|
|
3227
|
+
firstSeenAt: hit._source.firstSeenAt,
|
|
3228
|
+
lastSeenAt: hit._source.lastSeenAt,
|
|
3229
|
+
lastChangedAt: hit._source.lastChangedAt,
|
|
3230
|
+
metadata: hit._source.metadata
|
|
3231
|
+
}));
|
|
3232
|
+
}
|
|
3233
|
+
async function searchJsonRequest({
|
|
3234
|
+
index,
|
|
3235
|
+
request,
|
|
3236
|
+
indexName = "querylight"
|
|
3237
|
+
}) {
|
|
3238
|
+
return searchJsonDsl({ index, request, indexName });
|
|
3239
|
+
}
|
|
3240
|
+
async function searchJsonIndex({
|
|
3241
|
+
workspacePath,
|
|
3242
|
+
request,
|
|
3243
|
+
indexName = "querylight"
|
|
3244
|
+
}) {
|
|
3245
|
+
const index = await loadHydratedIndex(workspacePath);
|
|
3246
|
+
return searchJsonRequest({ index, request, indexName });
|
|
3247
|
+
}
|
|
3025
3248
|
function normalizeDisplayTitle(title) {
|
|
3026
3249
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
3027
3250
|
}
|
|
@@ -3159,6 +3382,7 @@ async function searchIndex({
|
|
|
3159
3382
|
retrievalMode,
|
|
3160
3383
|
showChunks = false
|
|
3161
3384
|
}) {
|
|
3385
|
+
const startedAt = Date.now();
|
|
3162
3386
|
const config = await loadConfig(workspacePath);
|
|
3163
3387
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
3164
3388
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
@@ -3215,12 +3439,48 @@ async function searchIndex({
|
|
|
3215
3439
|
};
|
|
3216
3440
|
})
|
|
3217
3441
|
);
|
|
3218
|
-
|
|
3442
|
+
const hits2 = latestResults.filter((result) => result != null).map((result) => {
|
|
3443
|
+
const chunk = chunks.get(result.chunkId);
|
|
3444
|
+
const document = documents.get(result.documentId);
|
|
3445
|
+
const source = sources.get(result.sourceId);
|
|
3446
|
+
return {
|
|
3447
|
+
_index: "querylight",
|
|
3448
|
+
_id: result.chunkId,
|
|
3449
|
+
_score: result.score,
|
|
3450
|
+
_source: {
|
|
3451
|
+
chunkId: result.chunkId,
|
|
3452
|
+
documentId: result.documentId,
|
|
3453
|
+
sourceId: result.sourceId,
|
|
3454
|
+
sourceType: result.sourceType,
|
|
3455
|
+
sourceName: source?.name,
|
|
3456
|
+
title: chunk.title,
|
|
3457
|
+
uri: result.uri,
|
|
3458
|
+
headingPath: chunk.headingPath,
|
|
3459
|
+
text: chunk.text,
|
|
3460
|
+
snippet: result.snippet,
|
|
3461
|
+
normalizedPath: document.normalizedPath,
|
|
3462
|
+
publicationDate: result.publicationDate ?? null,
|
|
3463
|
+
crawledAt: document.crawledAt,
|
|
3464
|
+
firstSeenAt: result.firstSeenAt,
|
|
3465
|
+
lastSeenAt: result.lastSeenAt,
|
|
3466
|
+
lastChangedAt: result.lastChangedAt,
|
|
3467
|
+
metadata: result.metadata
|
|
3468
|
+
}
|
|
3469
|
+
};
|
|
3470
|
+
});
|
|
3471
|
+
return createSearchResponse("lexical", hits2, Date.now() - startedAt);
|
|
3219
3472
|
}
|
|
3220
3473
|
const lexicalHits = async () => {
|
|
3221
|
-
const
|
|
3222
|
-
|
|
3223
|
-
|
|
3474
|
+
const response2 = await searchJsonIndex({
|
|
3475
|
+
workspacePath,
|
|
3476
|
+
request: buildSearchDslRequest({
|
|
3477
|
+
query: normalizedQuery,
|
|
3478
|
+
topK: candidateLimit,
|
|
3479
|
+
filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
|
|
3480
|
+
dateRanges
|
|
3481
|
+
})
|
|
3482
|
+
});
|
|
3483
|
+
return response2.hits.hits;
|
|
3224
3484
|
};
|
|
3225
3485
|
const denseHits = async () => {
|
|
3226
3486
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
@@ -3234,15 +3494,18 @@ async function searchIndex({
|
|
|
3234
3494
|
}
|
|
3235
3495
|
return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
|
|
3236
3496
|
};
|
|
3497
|
+
let lexicalResponseHits = [];
|
|
3237
3498
|
let hits;
|
|
3238
3499
|
if (mode === "lexical") {
|
|
3239
|
-
|
|
3500
|
+
lexicalResponseHits = await lexicalHits();
|
|
3501
|
+
hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
|
|
3240
3502
|
} else if (mode === "dense") {
|
|
3241
3503
|
hits = await denseHits();
|
|
3242
3504
|
} else if (mode === "sparse") {
|
|
3243
3505
|
hits = await sparseHits();
|
|
3244
3506
|
} else {
|
|
3245
|
-
|
|
3507
|
+
lexicalResponseHits = await lexicalHits();
|
|
3508
|
+
const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
|
|
3246
3509
|
if (await fileExists(denseVectorPath(workspacePath))) {
|
|
3247
3510
|
rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
|
|
3248
3511
|
}
|
|
@@ -3251,38 +3514,242 @@ async function searchIndex({
|
|
|
3251
3514
|
}
|
|
3252
3515
|
hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
|
|
3253
3516
|
}
|
|
3254
|
-
const
|
|
3517
|
+
const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
|
|
3255
3518
|
const chunk = chunks.get(chunkId);
|
|
3256
3519
|
if (!chunk) {
|
|
3257
|
-
return
|
|
3520
|
+
return [];
|
|
3258
3521
|
}
|
|
3522
|
+
const document = documents.get(chunk.documentId);
|
|
3523
|
+
const source = sources.get(chunk.sourceId);
|
|
3524
|
+
return [{
|
|
3525
|
+
_index: "querylight",
|
|
3526
|
+
_id: chunkId,
|
|
3527
|
+
_score: score,
|
|
3528
|
+
_source: {
|
|
3529
|
+
chunkId,
|
|
3530
|
+
documentId: chunk.documentId,
|
|
3531
|
+
sourceId: chunk.sourceId,
|
|
3532
|
+
sourceType: document?.sourceType ?? "text",
|
|
3533
|
+
sourceName: source?.name,
|
|
3534
|
+
title: chunk.title,
|
|
3535
|
+
uri: chunk.uri,
|
|
3536
|
+
headingPath: chunk.headingPath,
|
|
3537
|
+
text: chunk.text,
|
|
3538
|
+
normalizedPath: document?.normalizedPath,
|
|
3539
|
+
publicationDate: document?.publicationDate ?? null,
|
|
3540
|
+
crawledAt: document?.crawledAt,
|
|
3541
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
3542
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
3543
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
3544
|
+
metadata: chunk.metadata
|
|
3545
|
+
}
|
|
3546
|
+
}];
|
|
3547
|
+
});
|
|
3548
|
+
const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
|
|
3549
|
+
if (showChunks) {
|
|
3550
|
+
const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
|
|
3551
|
+
return createSearchResponse(mode, topHits, Date.now() - startedAt);
|
|
3552
|
+
}
|
|
3553
|
+
const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
|
|
3554
|
+
const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
|
|
3555
|
+
const finalHits = reranked.map((result) => {
|
|
3556
|
+
const hit = byChunkId.get(result.chunkId);
|
|
3557
|
+
return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
|
|
3558
|
+
}).filter((hit) => hit != null);
|
|
3559
|
+
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
3560
|
+
}
|
|
3561
|
+
|
|
3562
|
+
// src/server/search-api.ts
|
|
3563
|
+
import { createServer } from "http";
|
|
3564
|
+
import { readdir, stat as stat4 } from "fs/promises";
|
|
3565
|
+
import path19 from "path";
|
|
3566
|
+
/**
 * Report whether `candidatePath` exists and is a directory.
 *
 * Any failure while inspecting the path (missing entry, permission error, ...)
 * is treated as "not a directory" rather than surfaced to the caller.
 *
 * @param {string} candidatePath - Path to inspect.
 * @returns {Promise<boolean>} `true` only when the path could be stat'ed and is a directory.
 */
async function pathIsDirectory(candidatePath) {
  let isDirectory = false;
  try {
    const info = await stat4(candidatePath);
    isDirectory = info.isDirectory();
  } catch {
    // Deliberate best-effort: an unreadable or missing path simply is not a usable directory.
    isDirectory = false;
  }
  return isDirectory;
}
|
|
3573
|
+
/**
 * Locate the knowledge bases that the search API should expose.
 *
 * First `workspacePath` itself is tried as a workspace ("single" mode). If that
 * attempt fails with a `WORKSPACE_ERROR`, the path is treated as a root directory
 * and every child directory containing a `.kb` workspace becomes one knowledge
 * base ("multi" mode). Any other error is rethrown unchanged.
 *
 * @param {string} workspacePath - A workspace path, or a directory of named subdirectories that each contain `.kb`.
 * @returns {Promise<{mode: "single" | "multi", knowledgeBases: Array<{name: string, workspacePath: string, configuredIndexName: string, index: unknown}>}>}
 * @throws {CliError} `WORKSPACE_ERROR` when the root is not a directory or no knowledge base is found.
 */
async function discoverKnowledgeBases(workspacePath) {
  const isWorkspaceError = (error) => error instanceof CliError && error.code === "WORKSPACE_ERROR";

  // Shared loader: resolve the workspace, then read its config and hydrated index.
  const openKnowledgeBase = async (candidatePath, chooseName) => {
    const located = await assertWorkspaceExists(candidatePath);
    const settings = await loadConfig(located);
    const index = await loadHydratedIndex(located);
    return {
      name: chooseName(settings),
      workspacePath: located,
      configuredIndexName: settings.index.name,
      index
    };
  };

  try {
    // Single mode: the knowledge base is named after its configured index.
    const only = await openKnowledgeBase(workspacePath, (settings) => settings.index.name);
    return { mode: "single", knowledgeBases: [only] };
  } catch (error) {
    if (!isWorkspaceError(error)) {
      throw error;
    }
    // A workspace error here only means "not a single workspace"; fall through to multi mode.
  }

  const resolvedRoot = path19.resolve(workspacePath);
  const rootIsDirectory = await pathIsDirectory(resolvedRoot);
  if (!rootIsDirectory) {
    throw new CliError(`workspace path does not exist: ${resolvedRoot}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
  }

  const entries = await readdir(resolvedRoot, { withFileTypes: true });
  const childDirectories = entries.filter((entry) => entry.isDirectory());
  const attempts = childDirectories.map(async (entry) => {
    try {
      // Multi mode: the knowledge base is named after its directory.
      return await openKnowledgeBase(path19.join(resolvedRoot, entry.name, ".kb"), () => entry.name);
    } catch (error) {
      if (isWorkspaceError(error)) {
        return null;
      }
      throw error;
    }
  });
  const knowledgeBases = (await Promise.all(attempts)).filter((knowledgeBase) => knowledgeBase != null);

  if (knowledgeBases.length === 0) {
    throw new CliError(
      `no knowledge bases found at ${resolvedRoot}; use a .kb workspace or a directory of named subdirectories that each contain .kb`,
      "WORKSPACE_ERROR",
      3 /* WorkspaceError */
    );
  }
  return { mode: "multi", knowledgeBases };
}
|
|
3625
|
+
/**
 * Write `payload` to the HTTP response as a UTF-8 JSON document and end the response.
 *
 * @param {import("http").ServerResponse} response2 - Response to write to.
 * @param {number} statusCode - HTTP status code to send.
 * @param {unknown} payload - Value serialized with `JSON.stringify`.
 * @returns {void}
 */
function sendJson(response2, statusCode, payload) {
  response2.statusCode = statusCode;
  response2.setHeader("content-type", "application/json; charset=utf-8");
  const serialized = JSON.stringify(payload);
  response2.end(serialized);
}
|
|
3630
|
+
/**
 * Send a JSON error envelope of the form `{ error: { type, reason }, status }`.
 *
 * @param {import("http").ServerResponse} response2 - Response to write to.
 * @param {number} statusCode - HTTP status code; also echoed in the body as `status`.
 * @param {string} type - Machine-readable error type.
 * @param {string} reason - Human-readable explanation.
 * @returns {void}
 */
function sendError(response2, statusCode, type, reason) {
  const envelope = {
    error: { type, reason },
    status: statusCode
  };
  sendJson(response2, statusCode, envelope);
}
|
|
3639
|
+
/**
 * Drain an async-iterable request stream and return its content decoded as UTF-8.
 *
 * Chunks are concatenated as bytes before decoding, so a multi-byte character
 * split across two chunks is still decoded correctly.
 *
 * @param {AsyncIterable<Buffer | string | Uint8Array>} request - Incoming request (or any async iterable of chunks).
 * @returns {Promise<string>} The full body text; `""` when the stream yields nothing.
 */
async function readRequestBody(request) {
  const asBuffer = (piece) => (Buffer.isBuffer(piece) ? piece : Buffer.from(piece));
  const pieces = [];
  for await (const piece of request) {
    pieces.push(asBuffer(piece));
  }
  const whole = Buffer.concat(pieces);
  return whole.toString("utf8");
}
|
|
3646
|
+
/**
 * Parse the raw body of a search request.
 *
 * A blank body is accepted and yields an empty request object. Anything else
 * must be JSON whose top-level value is a plain object (not an array, `null`
 * or a primitive).
 *
 * @param {string} raw - Request body text.
 * @returns {object} The parsed request, or `{}` for a blank body.
 * @throws {CliError} `INVALID_ARGUMENT` when the body is not valid JSON or not a JSON object.
 */
function parseSearchRequest(raw) {
  const body = raw.trim();
  if (body.length === 0) {
    return {};
  }

  const invalid = (detail) => new CliError(`invalid JSON request: ${detail}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);

  let decoded;
  try {
    decoded = JSON.parse(body);
  } catch (cause) {
    throw invalid(cause instanceof Error ? cause.message : String(cause));
  }

  const isJsonObject = decoded !== null && typeof decoded === "object" && !Array.isArray(decoded);
  if (!isJsonObject) {
    throw invalid("expected a JSON object");
  }
  return decoded;
}
|
|
3662
|
+
/**
 * Build the search route advertised for a knowledge base.
 *
 * @param {"single" | "multi"} mode - Server mode.
 * @param {{name: string}} knowledgeBase - Knowledge base whose `name` is used in multi mode.
 * @returns {string} `/_search` in single mode, otherwise `/<name>/_search`.
 */
function routeForKnowledgeBase(mode, knowledgeBase) {
  if (mode === "single") {
    return "/_search";
  }
  return `/${knowledgeBase.name}/_search`;
}
|
|
3665
|
+
/**
 * Map a request pathname to the knowledge base that should serve it.
 *
 * Accepted routes:
 *  - single mode: `/_search` and `/<configured-index-name>/_search`
 *  - multi mode:  `/<knowledge-base-name>/_search`
 *
 * Path segments are percent-decoded before comparison. A URL pathname arrives
 * percent-encoded, so without decoding a name containing spaces or non-ASCII
 * characters (e.g. `/my%20kb/_search`) could never match its knowledge base.
 *
 * @param {string} pathname - Pathname of the request URL (still percent-encoded).
 * @param {"single" | "multi"} mode - Server mode.
 * @param {Map<string, {name: string, configuredIndexName: string}>} knowledgeBases - Knowledge bases keyed by name.
 * @returns {object | null} The matching knowledge base, or `null` when the path is not a known search route.
 */
function resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases) {
  let segments;
  try {
    // Split first, decode second, so an encoded "/" inside a segment cannot create extra segments.
    segments = pathname.split("/").filter(Boolean).map((segment) => decodeURIComponent(segment));
  } catch {
    // decodeURIComponent throws URIError on malformed escapes; such a path matches no route.
    return null;
  }
  if (segments.at(-1) !== "_search") {
    return null;
  }
  if (mode === "single") {
    const knowledgeBase = knowledgeBases.values().next().value;
    if (!knowledgeBase) {
      return null;
    }
    if (segments.length === 1) {
      return knowledgeBase;
    }
    if (segments.length === 2 && segments[0] === knowledgeBase.configuredIndexName) {
      return knowledgeBase;
    }
    return null;
  }
  if (segments.length === 2) {
    return knowledgeBases.get(segments[0]) ?? null;
  }
  return null;
}
|
|
3685
|
+
/**
 * Handle one HTTP request against a `_search` route and write the JSON response.
 *
 * Responses:
 *  - 405 when the method is neither GET nor POST (with an `allow` header)
 *  - 404 when the pathname does not resolve to a knowledge base
 *  - 400 when the request fails with an `INVALID_ARGUMENT` CliError
 *  - 500 for any other failure while reading, parsing or searching
 *  - 200 with the search result otherwise
 *
 * @param {import("http").IncomingMessage} request - Incoming request; its body is read as the search request.
 * @param {import("http").ServerResponse} response2 - Response to write to.
 * @param {string} pathname - Pathname of the request URL.
 * @param {"single" | "multi"} mode - Server mode.
 * @param {Map<string, object>} knowledgeBases - Knowledge bases keyed by name.
 * @returns {Promise<void>}
 */
async function handleSearchRequest(request, response2, pathname, mode, knowledgeBases) {
  const allowedMethods = ["GET", "POST"];
  if (!allowedMethods.includes(request.method)) {
    response2.setHeader("allow", "GET, POST");
    sendError(response2, 405, "method_not_allowed", `unsupported method for ${pathname}`);
    return;
  }

  const target = resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases);
  if (!target) {
    sendError(response2, 404, "resource_not_found_exception", `unknown search route: ${pathname}`);
    return;
  }

  try {
    const rawBody = await readRequestBody(request);
    const result = await searchJsonRequest({
      index: target.index,
      request: parseSearchRequest(rawBody),
      // Multi mode reports the directory name; single mode reports the configured index name.
      indexName: mode === "multi" ? target.name : target.configuredIndexName
    });
    sendJson(response2, 200, result);
  } catch (failure) {
    const isBadRequest = failure instanceof CliError && failure.code === "INVALID_ARGUMENT";
    if (isBadRequest) {
      sendError(response2, 400, "parse_exception", failure.message);
      return;
    }
    const reason = failure instanceof Error ? failure.message : String(failure);
    sendError(response2, 500, "search_phase_execution_exception", reason);
  }
}
|
|
3714
|
+
/**
 * Start the HTTP search API and resolve once the server is listening.
 *
 * Knowledge bases are discovered (and their indexes loaded) once, before the
 * server starts; every request is then routed through `handleSearchRequest`.
 *
 * @param {object} options
 * @param {string} options.workspacePath - Workspace, or root directory of knowledge bases, to serve.
 * @param {string} [options.host="127.0.0.1"] - Interface to bind.
 * @param {number} [options.port=3000] - Port to bind; `0` lets the OS choose one.
 * @returns {Promise<{mode: string, url: string, knowledgeBases: Array<{name: string, workspacePath: string, route: string}>, close: () => Promise<void>}>}
 * @throws {CliError} `SERVER_ERROR` when the server did not bind to a TCP address.
 */
async function startSearchApiServer({
  workspacePath,
  host = "127.0.0.1",
  port = 3e3
}) {
  const { mode, knowledgeBases } = await discoverKnowledgeBases(workspacePath);
  const byName = new Map(knowledgeBases.map((knowledgeBase) => [knowledgeBase.name, knowledgeBase]));
  const server = createServer((request, response2) => {
    let pathname;
    try {
      // Only the pathname is needed, so resolve against a fixed base instead of the
      // client-controlled Host header. new URL() still throws on targets such as "//",
      // which must become a 400 rather than an unhandled rejection.
      pathname = new URL(request.url ?? "/", "http://localhost").pathname;
    } catch {
      sendError(response2, 400, "illegal_argument_exception", "invalid request URL");
      return;
    }
    // The listener's return value is ignored by Node, so failures are handled here explicitly.
    handleSearchRequest(request, response2, pathname, mode, byName).catch((error) => {
      if (response2.headersSent) {
        response2.destroy();
        return;
      }
      const message = error instanceof Error ? error.message : String(error);
      sendError(response2, 500, "search_phase_execution_exception", message);
    });
  });
  await new Promise((resolve2, reject) => {
    server.once("error", reject);
    server.listen(port, host, () => {
      // Listening succeeded: drop the startup-only error handler.
      server.off("error", reject);
      resolve2();
    });
  });
  const address = server.address();
  if (!address || typeof address === "string") {
    throw new CliError("server failed to bind to a TCP address", "SERVER_ERROR", 1 /* GeneralError */);
  }
  // IPv6 literals must be bracketed inside a URL (e.g. http://[::1]:3000).
  const urlHost = host.includes(":") && !host.startsWith("[") ? `[${host}]` : host;
  const url = `http://${urlHost}:${address.port}`;
  return {
    mode,
    url,
    knowledgeBases: knowledgeBases.map((knowledgeBase) => ({
      name: knowledgeBase.name,
      workspacePath: knowledgeBase.workspacePath,
      route: routeForKnowledgeBase(mode, knowledgeBase)
    })),
    close: async () => new Promise((resolve2, reject) => {
      server.close((error) => error ? reject(error) : resolve2());
    })
  };
}
|
|
3283
3750
|
|
|
3284
3751
|
// src/query/related-service.ts
|
|
3285
|
-
import
|
|
3752
|
+
import path20 from "path";
|
|
3286
3753
|
function cosineSimilarity2(left, right) {
|
|
3287
3754
|
let dot = 0;
|
|
3288
3755
|
let leftNorm = 0;
|
|
@@ -3358,7 +3825,7 @@ async function findRelatedDocuments({
|
|
|
3358
3825
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
3359
3826
|
throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
|
|
3360
3827
|
}
|
|
3361
|
-
const documents = await readJsonl(
|
|
3828
|
+
const documents = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
|
|
3362
3829
|
const selected = resolveDocumentSelector(documents, document);
|
|
3363
3830
|
const densePayload = await readDensePayload(workspacePath);
|
|
3364
3831
|
const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
|
|
@@ -3395,9 +3862,10 @@ async function createContext({
|
|
|
3395
3862
|
retrievalMode
|
|
3396
3863
|
}) {
|
|
3397
3864
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
3865
|
+
const results = searchResultsFromResponse(search, true);
|
|
3398
3866
|
const sources = [];
|
|
3399
3867
|
let total = 0;
|
|
3400
|
-
for (const result of
|
|
3868
|
+
for (const result of results) {
|
|
3401
3869
|
const text = result.text ?? "";
|
|
3402
3870
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
3403
3871
|
break;
|
|
@@ -3430,7 +3898,7 @@ async function createContext({
|
|
|
3430
3898
|
}
|
|
3431
3899
|
|
|
3432
3900
|
// src/report/diff-service.ts
|
|
3433
|
-
import
|
|
3901
|
+
import path21 from "path";
|
|
3434
3902
|
function chooseBaselineRun(runs, since) {
|
|
3435
3903
|
if (since === "last-run") {
|
|
3436
3904
|
return runs.at(-1);
|
|
@@ -3446,7 +3914,7 @@ async function diffWorkspace({
|
|
|
3446
3914
|
documentId,
|
|
3447
3915
|
since
|
|
3448
3916
|
}) {
|
|
3449
|
-
const current = await readJsonl(
|
|
3917
|
+
const current = await readJsonl(path21.join(workspacePath, "documents", "documents.jsonl"));
|
|
3450
3918
|
const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
|
|
3451
3919
|
const previous = new Map((baseline?.documentsSnapshot ?? []).map((document) => [document.id, document]));
|
|
3452
3920
|
const changedDocuments = current.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId)).filter((document) => {
|
|
@@ -3498,7 +3966,8 @@ function formatSourcesTable(sources) {
|
|
|
3498
3966
|
}
|
|
3499
3967
|
return table.toString();
|
|
3500
3968
|
}
|
|
3501
|
-
function formatSearchResults(
|
|
3969
|
+
function formatSearchResults(response2) {
|
|
3970
|
+
const results = searchResultsFromResponse(response2);
|
|
3502
3971
|
return results.map((result, index) => [
|
|
3503
3972
|
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3504
3973
|
` URL: ${result.uri}`,
|
|
@@ -3803,6 +4272,19 @@ function parseDateValue(input, optionName) {
|
|
|
3803
4272
|
}
|
|
3804
4273
|
return parsed.toISOString();
|
|
3805
4274
|
}
|
|
4275
|
+
/**
 * Turn a CLI argument into a JSON request object.
 *
 * The argument is either inline JSON or `@<path>`, in which case the JSON text
 * is read from that file (resolved against the current working directory).
 * The top-level JSON value must be a plain object.
 *
 * @param {string} input - Inline JSON, or `@path/to/request.json`.
 * @returns {Promise<object>} The parsed request object.
 * @throws {CliError} `INVALID_ARGUMENT` when the text is not valid JSON or not a JSON object.
 */
async function parseJsonArgument(input) {
  let text = input;
  if (input.startsWith("@")) {
    const filePath = path22.resolve(input.slice(1));
    text = await readFile11(filePath, "utf8");
  }

  const invalid = (detail) => new CliError(`invalid JSON request: ${detail}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);

  let decoded;
  try {
    decoded = JSON.parse(text);
  } catch (cause) {
    throw invalid(cause instanceof Error ? cause.message : String(cause));
  }

  const isJsonObject = decoded !== null && typeof decoded === "object" && !Array.isArray(decoded);
  if (!isJsonObject) {
    throw invalid("expected a JSON object");
  }
  return decoded;
}
|
|
3806
4288
|
function searchDateRanges(options) {
|
|
3807
4289
|
const entries = [];
|
|
3808
4290
|
if (options.since || options.until) {
|
|
@@ -3835,14 +4317,14 @@ function searchDateRanges(options) {
|
|
|
3835
4317
|
return entries;
|
|
3836
4318
|
}
|
|
3837
4319
|
/**
 * Resolve the workspace directory for a command to an absolute path.
 *
 * @param {{workspace?: string | null}} options - Options carrying the optional `workspace` value.
 * @returns {Promise<string>} Absolute path of `options.workspace`, or of `DEFAULT_WORKSPACE` when it is null/undefined.
 */
async function resolveWorkspace(options) {
  const requested = options.workspace ?? DEFAULT_WORKSPACE;
  return path22.resolve(requested);
}
|
|
3840
4322
|
/**
 * Extract the workspace path from raw command-line arguments.
 *
 * Both spellings of the option are recognised: `--workspace <path>` and
 * `--workspace=<path>`. Only the first occurrence is considered; when it
 * carries no value, or the option is absent, `DEFAULT_WORKSPACE` is used.
 *
 * @param {string[]} argv - Raw argument vector.
 * @returns {string} Absolute workspace path.
 */
function workspaceFromArgv(argv) {
  const flag = "--workspace";
  const inlinePrefix = `${flag}=`;
  const index = argv.findIndex((arg) => arg === flag || arg.startsWith(inlinePrefix));
  if (index >= 0) {
    const matched = argv[index];
    // Separate form takes the following argument; inline form takes the text after "=".
    const value = matched === flag ? argv[index + 1] : matched.slice(inlinePrefix.length);
    if (value) {
      return path22.resolve(value);
    }
  }
  return path22.resolve(DEFAULT_WORKSPACE);
}
|
|
3847
4329
|
async function runCli(argv, io = {}) {
|
|
3848
4330
|
const capture = { stdout: [], stderr: [], ...io };
|
|
@@ -3936,7 +4418,7 @@ Notes:
|
|
|
3936
4418
|
}
|
|
3937
4419
|
const stored = await addSource(workspace, {
|
|
3938
4420
|
type,
|
|
3939
|
-
uri: ["file", "directory"].includes(type) ?
|
|
4421
|
+
uri: ["file", "directory"].includes(type) ? path22.resolve(uri) : uri,
|
|
3940
4422
|
name: options.name,
|
|
3941
4423
|
enabled: true,
|
|
3942
4424
|
tags: options.tag ?? [],
|
|
@@ -4147,7 +4629,7 @@ Examples:
|
|
|
4147
4629
|
progress?.("info", "Rebuild complete");
|
|
4148
4630
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
4149
4631
|
});
|
|
4150
|
-
program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4632
|
+
program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include 
documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4151
4633
|
Examples:
|
|
4152
4634
|
qli search "pricing api limits"
|
|
4153
4635
|
qli search "authentication" --top-k 20 --tag docs
|
|
@@ -4160,6 +4642,7 @@ Examples:
|
|
|
4160
4642
|
Notes:
|
|
4161
4643
|
lexical works without vector models.
|
|
4162
4644
|
dense, sparse, and hybrid require the relevant index artifacts to exist.
|
|
4645
|
+
Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
|
|
4163
4646
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
4164
4647
|
const global = this.optsWithGlobals();
|
|
4165
4648
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
@@ -4178,7 +4661,83 @@ Notes:
|
|
|
4178
4661
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
4179
4662
|
showChunks: Boolean(options.showChunks)
|
|
4180
4663
|
});
|
|
4181
|
-
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result
|
|
4664
|
+
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result));
|
|
4665
|
+
});
|
|
4666
|
+
program.command("search-json").description("Run a raw Querylight 0.11 JSON DSL search request against the lexical index.").argument("<request>", "Inline JSON request or @path/to/request.json.").addHelpText("after", `
|
|
4667
|
+
Examples:
|
|
4668
|
+
qli search-json '{"query":{"match":{"text":"authentication"}},"size":5}'
|
|
4669
|
+
qli search-json @./search-request.json
|
|
4670
|
+
qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"aggs":{"types":{"terms":{"field":"sourceType","size":5}}}}' --json
|
|
4671
|
+
|
|
4672
|
+
Notes:
|
|
4673
|
+
search-json uses the lexical index and Querylight 0.11 JSON DSL fields.
|
|
4674
|
+
Stored hit payloads are returned under _source.
|
|
4675
|
+
Use --json when another tool needs the full response envelope.`).action(async function command(requestInput) {
|
|
4676
|
+
const global = this.optsWithGlobals();
|
|
4677
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4678
|
+
const request = await parseJsonArgument(requestInput);
|
|
4679
|
+
const result = await searchJsonIndex({
|
|
4680
|
+
workspacePath: workspace,
|
|
4681
|
+
request
|
|
4682
|
+
});
|
|
4683
|
+
emit(global.json, capture, response("search-json", workspace, result), JSON.stringify(result, null, 2));
|
|
4684
|
+
});
|
|
4685
|
+
program.command("serve").description("Start a small HTTP API that exposes Querylight JSON DSL search through an OpenSearch-like _search endpoint.").option("--host <host>", "Host interface to bind. Defaults to 127.0.0.1.", "127.0.0.1").option("--port <n>", "Port to bind. Use 0 to let the OS choose a free port.", "3000").addHelpText("after", `
|
|
4686
|
+
Examples:
|
|
4687
|
+
qli serve
|
|
4688
|
+
qli serve --workspace ./docs/.kb --port 4000
|
|
4689
|
+
qli serve --workspace ./kbs --host 0.0.0.0 --port 4000
|
|
4690
|
+
|
|
4691
|
+
Routes:
|
|
4692
|
+
Single workspace: POST /_search
|
|
4693
|
+
Single workspace: POST /<configured-index-name>/_search
|
|
4694
|
+
Multi-KB root: POST /<directory-name>/_search
|
|
4695
|
+
|
|
4696
|
+
Notes:
|
|
4697
|
+
The request body must be a Querylight JSON DSL object.
|
|
4698
|
+
serve only exposes lexical _search for now.
|
|
4699
|
+
When --workspace points to a directory of knowledge bases, each child directory must contain its own .kb workspace.
|
|
4700
|
+
Index files are loaded once at startup and reused across requests.`).action(async function command(options) {
|
|
4701
|
+
const global = this.optsWithGlobals();
|
|
4702
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4703
|
+
const port = Number(options.port);
|
|
4704
|
+
if (!Number.isInteger(port) || port < 0 || port > 65535) {
|
|
4705
|
+
throw new CliError(`invalid port: ${options.port}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
4706
|
+
}
|
|
4707
|
+
const server = await startSearchApiServer({
|
|
4708
|
+
workspacePath: workspace,
|
|
4709
|
+
host: options.host,
|
|
4710
|
+
port
|
|
4711
|
+
});
|
|
4712
|
+
const data = {
|
|
4713
|
+
url: server.url,
|
|
4714
|
+
mode: server.mode,
|
|
4715
|
+
knowledgeBases: server.knowledgeBases
|
|
4716
|
+
};
|
|
4717
|
+
const human = [
|
|
4718
|
+
`Listening on ${server.url}`,
|
|
4719
|
+
...server.knowledgeBases.map((knowledgeBase) => `${knowledgeBase.route} -> ${knowledgeBase.workspacePath}`)
|
|
4720
|
+
].join("\n");
|
|
4721
|
+
emit(global.json, capture, response("serve", workspace, data), human);
|
|
4722
|
+
const shutdown = async () => {
|
|
4723
|
+
for (const signal of ["SIGINT", "SIGTERM"]) {
|
|
4724
|
+
process.off(signal, stop);
|
|
4725
|
+
}
|
|
4726
|
+
await server.close();
|
|
4727
|
+
};
|
|
4728
|
+
const stop = () => {
|
|
4729
|
+
void shutdown().then(() => resolveStop(), rejectStop);
|
|
4730
|
+
};
|
|
4731
|
+
let resolveStop;
|
|
4732
|
+
let rejectStop;
|
|
4733
|
+
const waitForStop = new Promise((resolve2, reject) => {
|
|
4734
|
+
resolveStop = resolve2;
|
|
4735
|
+
rejectStop = reject;
|
|
4736
|
+
});
|
|
4737
|
+
for (const signal of ["SIGINT", "SIGTERM"]) {
|
|
4738
|
+
process.once(signal, stop);
|
|
4739
|
+
}
|
|
4740
|
+
await waitForStop;
|
|
4182
4741
|
});
|
|
4183
4742
|
program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
|
|
4184
4743
|
Examples:
|
|
@@ -4303,7 +4862,7 @@ Examples:
|
|
|
4303
4862
|
try {
|
|
4304
4863
|
const meta = await readLatestIndexMetadata(workspace);
|
|
4305
4864
|
latestIndex = meta.createdAt;
|
|
4306
|
-
indexSize = (await
|
|
4865
|
+
indexSize = (await stat5(await resolveLatestIndexArtifactPath(workspace))).size;
|
|
4307
4866
|
} catch {
|
|
4308
4867
|
latestIndex = void 0;
|
|
4309
4868
|
}
|