@tryformation/querylight-cli 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -0
- package/dist/cli/main.js +397 -90
- package/dist/core/constants.d.ts +2 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +310 -76
- package/dist/query/search-service.d.ts +7 -1
- package/dist/server/search-api.d.ts +15 -0
- package/dist/types/models.d.ts +3 -0
- package/dist/vector/dense.d.ts +6 -1
- package/package.json +3 -2
- package/scripts/assert-release-version.mjs +48 -0
- package/scripts/sparse-encode.py +29 -8
package/dist/core/constants.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export declare const PACKAGE_NAME
|
|
2
|
-
export declare const PACKAGE_VERSION
|
|
1
|
+
export declare const PACKAGE_NAME: string;
|
|
2
|
+
export declare const PACKAGE_VERSION: string;
|
|
3
3
|
export declare const DEFAULT_WORKSPACE = ".kb";
|
|
4
4
|
export declare const DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
5
5
|
export declare const LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
package/dist/index.d.ts
CHANGED
|
@@ -6,6 +6,7 @@ export * from "./ingest/ingest-service.js";
|
|
|
6
6
|
export * from "./chunk/chunker.js";
|
|
7
7
|
export * from "./index/querylight-indexer.js";
|
|
8
8
|
export * from "./query/search-service.js";
|
|
9
|
+
export * from "./server/search-api.js";
|
|
9
10
|
export * from "./query/related-service.js";
|
|
10
11
|
export * from "./query/context-builder.js";
|
|
11
12
|
export * from "./report/diff-service.js";
|
package/dist/index.js
CHANGED
|
@@ -22,6 +22,11 @@ import path from "path";
|
|
|
22
22
|
import YAML from "yaml";
|
|
23
23
|
|
|
24
24
|
// src/core/constants.ts
|
|
25
|
+
import { createRequire } from "module";
|
|
26
|
+
var require2 = createRequire(import.meta.url);
|
|
27
|
+
var packageJson = require2("../../package.json");
|
|
28
|
+
var PACKAGE_NAME = packageJson.name;
|
|
29
|
+
var PACKAGE_VERSION = packageJson.version;
|
|
25
30
|
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
26
31
|
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
27
32
|
|
|
@@ -53,11 +58,14 @@ var defaultConfig = () => ({
|
|
|
53
58
|
maxContextChars: 12e3,
|
|
54
59
|
citationStyle: "markdown"
|
|
55
60
|
},
|
|
61
|
+
search: {
|
|
62
|
+
defaultTopK: 50
|
|
63
|
+
},
|
|
56
64
|
retrieval: {
|
|
57
65
|
defaultMode: "lexical",
|
|
58
66
|
dense: {
|
|
59
67
|
enabled: true,
|
|
60
|
-
modelId: "Xenova/
|
|
68
|
+
modelId: "Xenova/paraphrase-MiniLM-L3-v2",
|
|
61
69
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
62
70
|
indexHashTables: 8,
|
|
63
71
|
indexRandomSeed: 42,
|
|
@@ -65,7 +73,7 @@ var defaultConfig = () => ({
|
|
|
65
73
|
},
|
|
66
74
|
sparse: {
|
|
67
75
|
enabled: true,
|
|
68
|
-
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-
|
|
76
|
+
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
|
|
69
77
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
70
78
|
documentTopTokens: 128,
|
|
71
79
|
queryEncoding: "tokenizer-token-weights",
|
|
@@ -74,12 +82,12 @@ var defaultConfig = () => ({
|
|
|
74
82
|
}
|
|
75
83
|
},
|
|
76
84
|
crawler: {
|
|
77
|
-
defaultUserAgent: "querylight-cli
|
|
85
|
+
defaultUserAgent: "querylight-cli",
|
|
78
86
|
obeyRobotsTxt: true,
|
|
79
87
|
rateLimitMs: 1e3,
|
|
80
88
|
maxConcurrentRequests: 5,
|
|
81
89
|
renderJs: false,
|
|
82
|
-
retentionDays:
|
|
90
|
+
retentionDays: 30,
|
|
83
91
|
fetchArticles: true
|
|
84
92
|
},
|
|
85
93
|
limits: {
|
|
@@ -123,6 +131,10 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
123
131
|
...defaults.rag,
|
|
124
132
|
...parsed.rag ?? {}
|
|
125
133
|
},
|
|
134
|
+
search: {
|
|
135
|
+
...defaults.search,
|
|
136
|
+
...parsed.search ?? {}
|
|
137
|
+
},
|
|
126
138
|
retrieval: {
|
|
127
139
|
...defaults.retrieval,
|
|
128
140
|
...parsed.retrieval ?? {},
|
|
@@ -1069,7 +1081,7 @@ async function fetchUrlDocument({
|
|
|
1069
1081
|
publicationDate
|
|
1070
1082
|
}) {
|
|
1071
1083
|
const headers = {
|
|
1072
|
-
"user-agent": source.crawl?.userAgent ?? "querylight-cli
|
|
1084
|
+
"user-agent": source.crawl?.userAgent ?? "querylight-cli"
|
|
1073
1085
|
};
|
|
1074
1086
|
if (previous?.httpCache?.etag) {
|
|
1075
1087
|
headers["if-none-match"] = previous.httpCache.etag;
|
|
@@ -1213,13 +1225,17 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
1213
1225
|
if (url.search.length > 0) {
|
|
1214
1226
|
return false;
|
|
1215
1227
|
}
|
|
1216
|
-
|
|
1228
|
+
const pathname = url.pathname.toLowerCase();
|
|
1229
|
+
if (pathname.endsWith(".xml")) {
|
|
1217
1230
|
return false;
|
|
1218
1231
|
}
|
|
1219
|
-
if (
|
|
1232
|
+
if (pathname.endsWith(".pdf")) {
|
|
1220
1233
|
return false;
|
|
1221
1234
|
}
|
|
1222
|
-
if (
|
|
1235
|
+
if (pathname.includes("/cdn-cgi/")) {
|
|
1236
|
+
return false;
|
|
1237
|
+
}
|
|
1238
|
+
if (pathname === "/search" || pathname === "/search/" || pathname.endsWith("/search/")) {
|
|
1223
1239
|
return false;
|
|
1224
1240
|
}
|
|
1225
1241
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
@@ -1364,7 +1380,7 @@ async function purgeDocuments(workspacePath, documentIds, documents) {
|
|
|
1364
1380
|
async function fetchFeedText(source) {
|
|
1365
1381
|
const response = await fetch(source.uri, {
|
|
1366
1382
|
headers: {
|
|
1367
|
-
"user-agent": source.crawl?.userAgent ?? "querylight-cli
|
|
1383
|
+
"user-agent": source.crawl?.userAgent ?? "querylight-cli"
|
|
1368
1384
|
}
|
|
1369
1385
|
});
|
|
1370
1386
|
if (!response.ok) {
|
|
@@ -2058,15 +2074,26 @@ function createSparseChunkText(chunk) {
|
|
|
2058
2074
|
// src/vector/dense.ts
|
|
2059
2075
|
var denseEmbedderFactory = null;
|
|
2060
2076
|
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
2077
|
+
function normalizeDenseEmbedder(embedder) {
|
|
2078
|
+
if (typeof embedder === "function") {
|
|
2079
|
+
return { embed: embedder };
|
|
2080
|
+
}
|
|
2081
|
+
return embedder;
|
|
2082
|
+
}
|
|
2061
2083
|
async function createEmbedder(cacheDir, modelId) {
|
|
2062
2084
|
if (denseEmbedderFactory) {
|
|
2063
|
-
return denseEmbedderFactory(cacheDir, modelId);
|
|
2085
|
+
return normalizeDenseEmbedder(await denseEmbedderFactory(cacheDir, modelId));
|
|
2064
2086
|
}
|
|
2065
2087
|
const runtime = await getDenseTransformersRuntime(cacheDir);
|
|
2066
2088
|
const extractor = await runtime.pipeline("feature-extraction", modelId);
|
|
2067
|
-
return
|
|
2068
|
-
|
|
2069
|
-
|
|
2089
|
+
return {
|
|
2090
|
+
async embed(text) {
|
|
2091
|
+
const output = await extractor(text, { pooling: "mean", normalize: true });
|
|
2092
|
+
return output.tolist()[0];
|
|
2093
|
+
},
|
|
2094
|
+
async dispose() {
|
|
2095
|
+
await extractor.dispose();
|
|
2096
|
+
}
|
|
2070
2097
|
};
|
|
2071
2098
|
}
|
|
2072
2099
|
function exactDenseQuery(payload, vector, topK) {
|
|
@@ -2080,53 +2107,57 @@ async function buildDenseVectors({
|
|
|
2080
2107
|
const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
2081
2108
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
2082
2109
|
await mkdir7(cacheDir, { recursive: true });
|
|
2083
|
-
const
|
|
2084
|
-
|
|
2085
|
-
|
|
2086
|
-
|
|
2087
|
-
|
|
2088
|
-
const
|
|
2089
|
-
|
|
2090
|
-
|
|
2091
|
-
|
|
2092
|
-
|
|
2093
|
-
|
|
2094
|
-
|
|
2095
|
-
|
|
2096
|
-
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
|
|
2100
|
-
|
|
2101
|
-
|
|
2110
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
2111
|
+
try {
|
|
2112
|
+
const records = [];
|
|
2113
|
+
let dimensions = 0;
|
|
2114
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
2115
|
+
for (const chunk of chunks) {
|
|
2116
|
+
const embedding = await embedder.embed(createDenseChunkText(chunk));
|
|
2117
|
+
dimensions ||= embedding.length;
|
|
2118
|
+
records.push({
|
|
2119
|
+
chunkId: chunk.id,
|
|
2120
|
+
documentId: chunk.documentId,
|
|
2121
|
+
sourceId: chunk.sourceId,
|
|
2122
|
+
title: chunk.title,
|
|
2123
|
+
uri: chunk.uri,
|
|
2124
|
+
headingPath: chunk.headingPath,
|
|
2125
|
+
text: chunk.text,
|
|
2126
|
+
embedding
|
|
2127
|
+
});
|
|
2128
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
2129
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
2130
|
+
}
|
|
2102
2131
|
}
|
|
2132
|
+
reportProgress(progress, "Building dense vector index");
|
|
2133
|
+
const index = new VectorFieldIndex({
|
|
2134
|
+
numHashTables: config.indexHashTables,
|
|
2135
|
+
dimensions,
|
|
2136
|
+
random: createSeededRandom(config.indexRandomSeed)
|
|
2137
|
+
});
|
|
2138
|
+
for (const record of records) {
|
|
2139
|
+
index.insert(record.chunkId, [record.embedding]);
|
|
2140
|
+
}
|
|
2141
|
+
const metadata = {
|
|
2142
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2143
|
+
modelId: config.modelId,
|
|
2144
|
+
dimensions,
|
|
2145
|
+
hashTables: config.indexHashTables,
|
|
2146
|
+
randomSeed: config.indexRandomSeed,
|
|
2147
|
+
chunkCount: records.length,
|
|
2148
|
+
indexHash: sha256(JSON.stringify(index.indexState))
|
|
2149
|
+
};
|
|
2150
|
+
const payload = {
|
|
2151
|
+
metadata,
|
|
2152
|
+
indexState: index.indexState,
|
|
2153
|
+
chunks: records
|
|
2154
|
+
};
|
|
2155
|
+
await writeDensePayload(workspacePath, payload);
|
|
2156
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
2157
|
+
return payload;
|
|
2158
|
+
} finally {
|
|
2159
|
+
await embedder.dispose?.();
|
|
2103
2160
|
}
|
|
2104
|
-
reportProgress(progress, "Building dense vector index");
|
|
2105
|
-
const index = new VectorFieldIndex({
|
|
2106
|
-
numHashTables: config.indexHashTables,
|
|
2107
|
-
dimensions,
|
|
2108
|
-
random: createSeededRandom(config.indexRandomSeed)
|
|
2109
|
-
});
|
|
2110
|
-
for (const record of records) {
|
|
2111
|
-
index.insert(record.chunkId, [record.embedding]);
|
|
2112
|
-
}
|
|
2113
|
-
const metadata = {
|
|
2114
|
-
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2115
|
-
modelId: config.modelId,
|
|
2116
|
-
dimensions,
|
|
2117
|
-
hashTables: config.indexHashTables,
|
|
2118
|
-
randomSeed: config.indexRandomSeed,
|
|
2119
|
-
chunkCount: records.length,
|
|
2120
|
-
indexHash: sha256(JSON.stringify(index.indexState))
|
|
2121
|
-
};
|
|
2122
|
-
const payload = {
|
|
2123
|
-
metadata,
|
|
2124
|
-
indexState: index.indexState,
|
|
2125
|
-
chunks: records
|
|
2126
|
-
};
|
|
2127
|
-
await writeDensePayload(workspacePath, payload);
|
|
2128
|
-
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
2129
|
-
return payload;
|
|
2130
2161
|
}
|
|
2131
2162
|
async function denseQuery({
|
|
2132
2163
|
workspacePath,
|
|
@@ -2136,21 +2167,25 @@ async function denseQuery({
|
|
|
2136
2167
|
}) {
|
|
2137
2168
|
const payload = await readDensePayload(workspacePath);
|
|
2138
2169
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
2139
|
-
const
|
|
2140
|
-
|
|
2141
|
-
|
|
2170
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
2171
|
+
try {
|
|
2172
|
+
const vector = await embedder.embed(query);
|
|
2173
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
2174
|
+
return exactDenseQuery(payload, vector, topK);
|
|
2175
|
+
}
|
|
2176
|
+
const index = new VectorFieldIndex({
|
|
2177
|
+
numHashTables: payload.metadata.hashTables,
|
|
2178
|
+
dimensions: payload.metadata.dimensions,
|
|
2179
|
+
random: createSeededRandom(payload.metadata.randomSeed)
|
|
2180
|
+
}).loadState(payload.indexState);
|
|
2181
|
+
const approximateHits = index.query(vector, topK);
|
|
2182
|
+
if (approximateHits.length >= topK) {
|
|
2183
|
+
return approximateHits;
|
|
2184
|
+
}
|
|
2142
2185
|
return exactDenseQuery(payload, vector, topK);
|
|
2186
|
+
} finally {
|
|
2187
|
+
await embedder.dispose?.();
|
|
2143
2188
|
}
|
|
2144
|
-
const index = new VectorFieldIndex({
|
|
2145
|
-
numHashTables: payload.metadata.hashTables,
|
|
2146
|
-
dimensions: payload.metadata.dimensions,
|
|
2147
|
-
random: createSeededRandom(payload.metadata.randomSeed)
|
|
2148
|
-
}).loadState(payload.indexState);
|
|
2149
|
-
const approximateHits = index.query(vector, topK);
|
|
2150
|
-
if (approximateHits.length >= topK) {
|
|
2151
|
-
return approximateHits;
|
|
2152
|
-
}
|
|
2153
|
-
return exactDenseQuery(payload, vector, topK);
|
|
2154
2189
|
}
|
|
2155
2190
|
|
|
2156
2191
|
// src/vector/sparse.ts
|
|
@@ -2894,13 +2929,20 @@ function searchResultsFromResponse(response, showChunks = false) {
|
|
|
2894
2929
|
metadata: hit._source.metadata
|
|
2895
2930
|
}));
|
|
2896
2931
|
}
|
|
2932
|
+
async function searchJsonRequest({
|
|
2933
|
+
index,
|
|
2934
|
+
request,
|
|
2935
|
+
indexName = "querylight"
|
|
2936
|
+
}) {
|
|
2937
|
+
return searchJsonDsl({ index, request, indexName });
|
|
2938
|
+
}
|
|
2897
2939
|
async function searchJsonIndex({
|
|
2898
2940
|
workspacePath,
|
|
2899
2941
|
request,
|
|
2900
2942
|
indexName = "querylight"
|
|
2901
2943
|
}) {
|
|
2902
2944
|
const index = await loadHydratedIndex(workspacePath);
|
|
2903
|
-
return
|
|
2945
|
+
return searchJsonRequest({ index, request, indexName });
|
|
2904
2946
|
}
|
|
2905
2947
|
function normalizeDisplayTitle(title) {
|
|
2906
2948
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
@@ -3216,8 +3258,197 @@ async function searchIndex({
|
|
|
3216
3258
|
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
3217
3259
|
}
|
|
3218
3260
|
|
|
3219
|
-
// src/
|
|
3261
|
+
// src/server/search-api.ts
|
|
3262
|
+
import { createServer } from "http";
|
|
3263
|
+
import { readdir, stat as stat4 } from "fs/promises";
|
|
3220
3264
|
import path19 from "path";
|
|
3265
|
+
async function pathIsDirectory(candidatePath) {
|
|
3266
|
+
try {
|
|
3267
|
+
return (await stat4(candidatePath)).isDirectory();
|
|
3268
|
+
} catch {
|
|
3269
|
+
return false;
|
|
3270
|
+
}
|
|
3271
|
+
}
|
|
3272
|
+
async function discoverKnowledgeBases(workspacePath) {
|
|
3273
|
+
try {
|
|
3274
|
+
const singleWorkspace = await assertWorkspaceExists(workspacePath);
|
|
3275
|
+
const config = await loadConfig(singleWorkspace);
|
|
3276
|
+
const index = await loadHydratedIndex(singleWorkspace);
|
|
3277
|
+
return {
|
|
3278
|
+
mode: "single",
|
|
3279
|
+
knowledgeBases: [{
|
|
3280
|
+
name: config.index.name,
|
|
3281
|
+
workspacePath: singleWorkspace,
|
|
3282
|
+
configuredIndexName: config.index.name,
|
|
3283
|
+
index
|
|
3284
|
+
}]
|
|
3285
|
+
};
|
|
3286
|
+
} catch (error) {
|
|
3287
|
+
if (!(error instanceof CliError) || error.code !== "WORKSPACE_ERROR") {
|
|
3288
|
+
throw error;
|
|
3289
|
+
}
|
|
3290
|
+
}
|
|
3291
|
+
const resolvedRoot = path19.resolve(workspacePath);
|
|
3292
|
+
if (!await pathIsDirectory(resolvedRoot)) {
|
|
3293
|
+
throw new CliError(`workspace path does not exist: ${resolvedRoot}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
|
|
3294
|
+
}
|
|
3295
|
+
const entries = await readdir(resolvedRoot, { withFileTypes: true });
|
|
3296
|
+
const knowledgeBases = (await Promise.all(entries.filter((entry) => entry.isDirectory()).map(async (entry) => {
|
|
3297
|
+
const candidateWorkspace = path19.join(resolvedRoot, entry.name, ".kb");
|
|
3298
|
+
try {
|
|
3299
|
+
const workspace = await assertWorkspaceExists(candidateWorkspace);
|
|
3300
|
+
const config = await loadConfig(workspace);
|
|
3301
|
+
const index = await loadHydratedIndex(workspace);
|
|
3302
|
+
return {
|
|
3303
|
+
name: entry.name,
|
|
3304
|
+
workspacePath: workspace,
|
|
3305
|
+
configuredIndexName: config.index.name,
|
|
3306
|
+
index
|
|
3307
|
+
};
|
|
3308
|
+
} catch (error) {
|
|
3309
|
+
if (error instanceof CliError && error.code === "WORKSPACE_ERROR") {
|
|
3310
|
+
return null;
|
|
3311
|
+
}
|
|
3312
|
+
throw error;
|
|
3313
|
+
}
|
|
3314
|
+
}))).filter((knowledgeBase) => knowledgeBase != null);
|
|
3315
|
+
if (knowledgeBases.length === 0) {
|
|
3316
|
+
throw new CliError(
|
|
3317
|
+
`no knowledge bases found at ${resolvedRoot}; use a .kb workspace or a directory of named subdirectories that each contain .kb`,
|
|
3318
|
+
"WORKSPACE_ERROR",
|
|
3319
|
+
3 /* WorkspaceError */
|
|
3320
|
+
);
|
|
3321
|
+
}
|
|
3322
|
+
return { mode: "multi", knowledgeBases };
|
|
3323
|
+
}
|
|
3324
|
+
function sendJson(response, statusCode, payload) {
|
|
3325
|
+
response.statusCode = statusCode;
|
|
3326
|
+
response.setHeader("content-type", "application/json; charset=utf-8");
|
|
3327
|
+
response.end(JSON.stringify(payload));
|
|
3328
|
+
}
|
|
3329
|
+
function sendError(response, statusCode, type, reason) {
|
|
3330
|
+
sendJson(response, statusCode, {
|
|
3331
|
+
error: {
|
|
3332
|
+
type,
|
|
3333
|
+
reason
|
|
3334
|
+
},
|
|
3335
|
+
status: statusCode
|
|
3336
|
+
});
|
|
3337
|
+
}
|
|
3338
|
+
async function readRequestBody(request) {
|
|
3339
|
+
const chunks = [];
|
|
3340
|
+
for await (const chunk of request) {
|
|
3341
|
+
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
3342
|
+
}
|
|
3343
|
+
return Buffer.concat(chunks).toString("utf8");
|
|
3344
|
+
}
|
|
3345
|
+
function parseSearchRequest(raw) {
|
|
3346
|
+
const normalized = raw.trim();
|
|
3347
|
+
if (normalized.length === 0) {
|
|
3348
|
+
return {};
|
|
3349
|
+
}
|
|
3350
|
+
try {
|
|
3351
|
+
const parsed = JSON.parse(normalized);
|
|
3352
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
3353
|
+
throw new Error("expected a JSON object");
|
|
3354
|
+
}
|
|
3355
|
+
return parsed;
|
|
3356
|
+
} catch (error) {
|
|
3357
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3358
|
+
throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3359
|
+
}
|
|
3360
|
+
}
|
|
3361
|
+
function routeForKnowledgeBase(mode, knowledgeBase) {
|
|
3362
|
+
return mode === "single" ? "/_search" : `/${knowledgeBase.name}/_search`;
|
|
3363
|
+
}
|
|
3364
|
+
function resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases) {
|
|
3365
|
+
const segments = pathname.split("/").filter(Boolean);
|
|
3366
|
+
if (mode === "single") {
|
|
3367
|
+
const knowledgeBase = [...knowledgeBases.values()][0];
|
|
3368
|
+
if (!knowledgeBase) {
|
|
3369
|
+
return null;
|
|
3370
|
+
}
|
|
3371
|
+
if (segments.length === 1 && segments[0] === "_search") {
|
|
3372
|
+
return knowledgeBase;
|
|
3373
|
+
}
|
|
3374
|
+
if (segments.length === 2 && segments[1] === "_search" && segments[0] === knowledgeBase.configuredIndexName) {
|
|
3375
|
+
return knowledgeBase;
|
|
3376
|
+
}
|
|
3377
|
+
return null;
|
|
3378
|
+
}
|
|
3379
|
+
if (segments.length === 2 && segments[1] === "_search") {
|
|
3380
|
+
return knowledgeBases.get(segments[0]) ?? null;
|
|
3381
|
+
}
|
|
3382
|
+
return null;
|
|
3383
|
+
}
|
|
3384
|
+
async function handleSearchRequest(request, response, pathname, mode, knowledgeBases) {
|
|
3385
|
+
if (request.method !== "GET" && request.method !== "POST") {
|
|
3386
|
+
response.setHeader("allow", "GET, POST");
|
|
3387
|
+
sendError(response, 405, "method_not_allowed", `unsupported method for ${pathname}`);
|
|
3388
|
+
return;
|
|
3389
|
+
}
|
|
3390
|
+
const knowledgeBase = resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases);
|
|
3391
|
+
if (!knowledgeBase) {
|
|
3392
|
+
sendError(response, 404, "resource_not_found_exception", `unknown search route: ${pathname}`);
|
|
3393
|
+
return;
|
|
3394
|
+
}
|
|
3395
|
+
try {
|
|
3396
|
+
const requestBody = parseSearchRequest(await readRequestBody(request));
|
|
3397
|
+
const indexName = mode === "multi" ? knowledgeBase.name : knowledgeBase.configuredIndexName;
|
|
3398
|
+
const result = await searchJsonRequest({
|
|
3399
|
+
index: knowledgeBase.index,
|
|
3400
|
+
request: requestBody,
|
|
3401
|
+
indexName
|
|
3402
|
+
});
|
|
3403
|
+
sendJson(response, 200, result);
|
|
3404
|
+
} catch (error) {
|
|
3405
|
+
if (error instanceof CliError && error.code === "INVALID_ARGUMENT") {
|
|
3406
|
+
sendError(response, 400, "parse_exception", error.message);
|
|
3407
|
+
return;
|
|
3408
|
+
}
|
|
3409
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3410
|
+
sendError(response, 500, "search_phase_execution_exception", message);
|
|
3411
|
+
}
|
|
3412
|
+
}
|
|
3413
|
+
async function startSearchApiServer({
|
|
3414
|
+
workspacePath,
|
|
3415
|
+
host = "127.0.0.1",
|
|
3416
|
+
port = 3e3
|
|
3417
|
+
}) {
|
|
3418
|
+
const { mode, knowledgeBases } = await discoverKnowledgeBases(workspacePath);
|
|
3419
|
+
const byName = new Map(knowledgeBases.map((knowledgeBase) => [knowledgeBase.name, knowledgeBase]));
|
|
3420
|
+
const server = createServer(async (request, response) => {
|
|
3421
|
+
const url2 = new URL(request.url ?? "/", `http://${request.headers.host ?? `${host}:${port}`}`);
|
|
3422
|
+
await handleSearchRequest(request, response, url2.pathname, mode, byName);
|
|
3423
|
+
});
|
|
3424
|
+
await new Promise((resolve2, reject) => {
|
|
3425
|
+
server.once("error", reject);
|
|
3426
|
+
server.listen(port, host, () => {
|
|
3427
|
+
server.off("error", reject);
|
|
3428
|
+
resolve2();
|
|
3429
|
+
});
|
|
3430
|
+
});
|
|
3431
|
+
const address = server.address();
|
|
3432
|
+
if (!address || typeof address === "string") {
|
|
3433
|
+
throw new CliError("server failed to bind to a TCP address", "SERVER_ERROR", 1 /* GeneralError */);
|
|
3434
|
+
}
|
|
3435
|
+
const url = `http://${host}:${address.port}`;
|
|
3436
|
+
return {
|
|
3437
|
+
mode,
|
|
3438
|
+
url,
|
|
3439
|
+
knowledgeBases: knowledgeBases.map((knowledgeBase) => ({
|
|
3440
|
+
name: knowledgeBase.name,
|
|
3441
|
+
workspacePath: knowledgeBase.workspacePath,
|
|
3442
|
+
route: routeForKnowledgeBase(mode, knowledgeBase)
|
|
3443
|
+
})),
|
|
3444
|
+
close: async () => new Promise((resolve2, reject) => {
|
|
3445
|
+
server.close((error) => error ? reject(error) : resolve2());
|
|
3446
|
+
})
|
|
3447
|
+
};
|
|
3448
|
+
}
|
|
3449
|
+
|
|
3450
|
+
// src/query/related-service.ts
|
|
3451
|
+
import path20 from "path";
|
|
3221
3452
|
function cosineSimilarity2(left, right) {
|
|
3222
3453
|
let dot = 0;
|
|
3223
3454
|
let leftNorm = 0;
|
|
@@ -3293,7 +3524,7 @@ async function findRelatedDocuments({
|
|
|
3293
3524
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
3294
3525
|
throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
|
|
3295
3526
|
}
|
|
3296
|
-
const documents = await readJsonl(
|
|
3527
|
+
const documents = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
|
|
3297
3528
|
const selected = resolveDocumentSelector(documents, document);
|
|
3298
3529
|
const densePayload = await readDensePayload(workspacePath);
|
|
3299
3530
|
const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
|
|
@@ -3366,7 +3597,7 @@ async function createContext({
|
|
|
3366
3597
|
}
|
|
3367
3598
|
|
|
3368
3599
|
// src/report/diff-service.ts
|
|
3369
|
-
import
|
|
3600
|
+
import path21 from "path";
|
|
3370
3601
|
function chooseBaselineRun(runs, since) {
|
|
3371
3602
|
if (since === "last-run") {
|
|
3372
3603
|
return runs.at(-1);
|
|
@@ -3382,7 +3613,7 @@ async function diffWorkspace({
|
|
|
3382
3613
|
documentId,
|
|
3383
3614
|
since
|
|
3384
3615
|
}) {
|
|
3385
|
-
const current = await readJsonl(
|
|
3616
|
+
const current = await readJsonl(path21.join(workspacePath, "documents", "documents.jsonl"));
|
|
3386
3617
|
const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
|
|
3387
3618
|
const previous = new Map((baseline?.documentsSnapshot ?? []).map((document) => [document.id, document]));
|
|
3388
3619
|
const changedDocuments = current.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId)).filter((document) => {
|
|
@@ -3438,12 +3669,15 @@ export {
|
|
|
3438
3669
|
ingestSources,
|
|
3439
3670
|
listSources,
|
|
3440
3671
|
loadConfig,
|
|
3672
|
+
loadHydratedIndex,
|
|
3441
3673
|
removeSource,
|
|
3442
3674
|
renderChangeReport,
|
|
3443
3675
|
reprocessDocuments,
|
|
3444
3676
|
searchIndex,
|
|
3445
3677
|
searchJsonIndex,
|
|
3678
|
+
searchJsonRequest,
|
|
3446
3679
|
searchResultsFromResponse,
|
|
3680
|
+
startSearchApiServer,
|
|
3447
3681
|
updateSource,
|
|
3448
3682
|
writeDefaultConfig
|
|
3449
3683
|
};
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
|
|
1
|
+
import { type DocumentIndex, type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
|
|
2
2
|
import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
|
|
3
|
+
export declare function loadHydratedIndex(workspacePath: string): Promise<DocumentIndex>;
|
|
3
4
|
type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
|
|
4
5
|
type SearchDateRange = {
|
|
5
6
|
field: SearchDateField;
|
|
@@ -7,6 +8,11 @@ type SearchDateRange = {
|
|
|
7
8
|
to?: string;
|
|
8
9
|
};
|
|
9
10
|
export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
|
|
11
|
+
export declare function searchJsonRequest({ index, request, indexName }: {
|
|
12
|
+
index: DocumentIndex;
|
|
13
|
+
request: JsonDslRequest;
|
|
14
|
+
indexName?: string;
|
|
15
|
+
}): Promise<JsonDslResponse>;
|
|
10
16
|
export declare function searchJsonIndex({ workspacePath, request, indexName }: {
|
|
11
17
|
workspacePath: string;
|
|
12
18
|
request: JsonDslRequest;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export type SearchApiServerInfo = {
|
|
2
|
+
mode: "single" | "multi";
|
|
3
|
+
url: string;
|
|
4
|
+
knowledgeBases: Array<{
|
|
5
|
+
name: string;
|
|
6
|
+
workspacePath: string;
|
|
7
|
+
route: string;
|
|
8
|
+
}>;
|
|
9
|
+
close: () => Promise<void>;
|
|
10
|
+
};
|
|
11
|
+
export declare function startSearchApiServer({ workspacePath, host, port }: {
|
|
12
|
+
workspacePath: string;
|
|
13
|
+
host?: string;
|
|
14
|
+
port?: number;
|
|
15
|
+
}): Promise<SearchApiServerInfo>;
|
package/dist/types/models.d.ts
CHANGED
package/dist/vector/dense.d.ts
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
import { type ProgressHandler } from "../core/progress.js";
|
|
2
2
|
import type { DenseVectorPayload, WorkspaceConfig } from "../types/models.js";
|
|
3
|
-
|
|
3
|
+
type DenseEmbedder = {
|
|
4
|
+
embed(text: string): Promise<number[]>;
|
|
5
|
+
dispose?: () => Promise<void>;
|
|
6
|
+
};
|
|
7
|
+
export declare function setDenseEmbedderFactoryForTests(factory: ((cacheDir: string, modelId: string) => Promise<DenseEmbedder | ((text: string) => Promise<number[]>)>) | null): void;
|
|
4
8
|
export declare function pullDenseModel(workspacePath: string, config: WorkspaceConfig["retrieval"]["dense"]): Promise<void>;
|
|
5
9
|
export declare function buildDenseVectors({ workspacePath, config, progress }: {
|
|
6
10
|
workspacePath: string;
|
|
@@ -13,3 +17,4 @@ export declare function denseQuery({ workspacePath, config, query, topK }: {
|
|
|
13
17
|
query: string;
|
|
14
18
|
topK: number;
|
|
15
19
|
}): Promise<Array<[string, number]>>;
|
|
20
|
+
export {};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tryformation/querylight-cli",
|
|
3
|
-
"version": "0.2.
|
|
3
|
+
"version": "0.2.5",
|
|
4
4
|
"description": "Querylight CLI for building and querying local knowledge bases.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/formation-res/querylight-cli#readme",
|
|
@@ -36,7 +36,8 @@
|
|
|
36
36
|
"test:watch": "vitest",
|
|
37
37
|
"lint": "tsc --noEmit",
|
|
38
38
|
"check": "npm run lint && npm test",
|
|
39
|
-
"prepublishOnly": "npm run check && npm run build"
|
|
39
|
+
"prepublishOnly": "npm run check && npm run build && npm run verify:release-version",
|
|
40
|
+
"verify:release-version": "node scripts/assert-release-version.mjs"
|
|
40
41
|
},
|
|
41
42
|
"dependencies": {
|
|
42
43
|
"@huggingface/transformers": "^3.8.1",
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import assert from "node:assert/strict";
|
|
2
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
3
|
+
import os from "node:os";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import { spawn } from "node:child_process";
|
|
6
|
+
import packageJson from "../package.json" with { type: "json" };
|
|
7
|
+
|
|
8
|
+
function run(command, args, options = {}) {
|
|
9
|
+
return new Promise((resolve, reject) => {
|
|
10
|
+
const child = spawn(command, args, {
|
|
11
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
12
|
+
...options
|
|
13
|
+
});
|
|
14
|
+
let stdout = "";
|
|
15
|
+
let stderr = "";
|
|
16
|
+
|
|
17
|
+
child.stdout.on("data", (chunk) => {
|
|
18
|
+
stdout += String(chunk);
|
|
19
|
+
});
|
|
20
|
+
child.stderr.on("data", (chunk) => {
|
|
21
|
+
stderr += String(chunk);
|
|
22
|
+
});
|
|
23
|
+
child.on("error", reject);
|
|
24
|
+
child.on("close", (code) => {
|
|
25
|
+
if (code === 0) {
|
|
26
|
+
resolve({ stdout, stderr });
|
|
27
|
+
return;
|
|
28
|
+
}
|
|
29
|
+
reject(new Error(`${command} ${args.join(" ")} failed with exit code ${code}\n${stderr}`));
|
|
30
|
+
});
|
|
31
|
+
});
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
const workspaceRoot = await mkdtemp(path.join(os.tmpdir(), "qli-release-version-"));
|
|
35
|
+
const workspacePath = path.join(workspaceRoot, ".kb");
|
|
36
|
+
|
|
37
|
+
try {
|
|
38
|
+
const { stdout } = await run("node", ["dist/cli/main.js", "init", "--workspace", workspacePath, "--json"], {
|
|
39
|
+
cwd: new URL("..", import.meta.url)
|
|
40
|
+
});
|
|
41
|
+
const parsed = JSON.parse(stdout);
|
|
42
|
+
|
|
43
|
+
assert.equal(parsed.ok, true, "Expected qli init --json to succeed");
|
|
44
|
+
assert.equal(parsed.version, packageJson.version, `Built CLI reported version ${parsed.version}, expected ${packageJson.version}`);
|
|
45
|
+
process.stdout.write(`Verified built CLI version ${parsed.version}\n`);
|
|
46
|
+
} finally {
|
|
47
|
+
await rm(workspaceRoot, { recursive: true, force: true });
|
|
48
|
+
}
|