@tryformation/querylight-cli 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -0
- package/dist/cli/main.js +397 -90
- package/dist/core/constants.d.ts +2 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +310 -76
- package/dist/query/search-service.d.ts +7 -1
- package/dist/server/search-api.d.ts +15 -0
- package/dist/types/models.d.ts +3 -0
- package/dist/vector/dense.d.ts +6 -1
- package/package.json +3 -2
- package/scripts/assert-release-version.mjs +48 -0
- package/scripts/sparse-encode.py +29 -8
package/dist/cli/main.js
CHANGED
|
@@ -2,8 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
4
|
import { Command, Option } from "commander";
|
|
5
|
-
import { readFile as readFile11, stat as
|
|
6
|
-
import
|
|
5
|
+
import { readFile as readFile11, stat as stat5 } from "fs/promises";
|
|
6
|
+
import path22 from "path";
|
|
7
7
|
|
|
8
8
|
// src/chunk/chunker.ts
|
|
9
9
|
import { readFile as readFile3 } from "fs/promises";
|
|
@@ -16,7 +16,11 @@ import path from "path";
|
|
|
16
16
|
import YAML from "yaml";
|
|
17
17
|
|
|
18
18
|
// src/core/constants.ts
|
|
19
|
-
|
|
19
|
+
import { createRequire } from "module";
|
|
20
|
+
var require2 = createRequire(import.meta.url);
|
|
21
|
+
var packageJson = require2("../../package.json");
|
|
22
|
+
var PACKAGE_NAME = packageJson.name;
|
|
23
|
+
var PACKAGE_VERSION = packageJson.version;
|
|
20
24
|
var DEFAULT_WORKSPACE = ".kb";
|
|
21
25
|
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
26
|
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
@@ -49,11 +53,14 @@ var defaultConfig = () => ({
|
|
|
49
53
|
maxContextChars: 12e3,
|
|
50
54
|
citationStyle: "markdown"
|
|
51
55
|
},
|
|
56
|
+
search: {
|
|
57
|
+
defaultTopK: 50
|
|
58
|
+
},
|
|
52
59
|
retrieval: {
|
|
53
60
|
defaultMode: "lexical",
|
|
54
61
|
dense: {
|
|
55
62
|
enabled: true,
|
|
56
|
-
modelId: "Xenova/
|
|
63
|
+
modelId: "Xenova/paraphrase-MiniLM-L3-v2",
|
|
57
64
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
58
65
|
indexHashTables: 8,
|
|
59
66
|
indexRandomSeed: 42,
|
|
@@ -61,7 +68,7 @@ var defaultConfig = () => ({
|
|
|
61
68
|
},
|
|
62
69
|
sparse: {
|
|
63
70
|
enabled: true,
|
|
64
|
-
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-
|
|
71
|
+
modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
|
|
65
72
|
cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
|
|
66
73
|
documentTopTokens: 128,
|
|
67
74
|
queryEncoding: "tokenizer-token-weights",
|
|
@@ -70,12 +77,12 @@ var defaultConfig = () => ({
|
|
|
70
77
|
}
|
|
71
78
|
},
|
|
72
79
|
crawler: {
|
|
73
|
-
defaultUserAgent: "querylight-cli
|
|
80
|
+
defaultUserAgent: "querylight-cli",
|
|
74
81
|
obeyRobotsTxt: true,
|
|
75
82
|
rateLimitMs: 1e3,
|
|
76
83
|
maxConcurrentRequests: 5,
|
|
77
84
|
renderJs: false,
|
|
78
|
-
retentionDays:
|
|
85
|
+
retentionDays: 30,
|
|
79
86
|
fetchArticles: true
|
|
80
87
|
},
|
|
81
88
|
limits: {
|
|
@@ -119,6 +126,10 @@ async function loadConfig(workspacePath, configPath) {
|
|
|
119
126
|
...defaults.rag,
|
|
120
127
|
...parsed.rag ?? {}
|
|
121
128
|
},
|
|
129
|
+
search: {
|
|
130
|
+
...defaults.search,
|
|
131
|
+
...parsed.search ?? {}
|
|
132
|
+
},
|
|
122
133
|
retrieval: {
|
|
123
134
|
...defaults.retrieval,
|
|
124
135
|
...parsed.retrieval ?? {},
|
|
@@ -682,15 +693,26 @@ function createSparseChunkText(chunk) {
|
|
|
682
693
|
// src/vector/dense.ts
|
|
683
694
|
var denseEmbedderFactory = null;
|
|
684
695
|
var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
|
|
696
|
+
function normalizeDenseEmbedder(embedder) {
|
|
697
|
+
if (typeof embedder === "function") {
|
|
698
|
+
return { embed: embedder };
|
|
699
|
+
}
|
|
700
|
+
return embedder;
|
|
701
|
+
}
|
|
685
702
|
async function createEmbedder(cacheDir, modelId) {
|
|
686
703
|
if (denseEmbedderFactory) {
|
|
687
|
-
return denseEmbedderFactory(cacheDir, modelId);
|
|
704
|
+
return normalizeDenseEmbedder(await denseEmbedderFactory(cacheDir, modelId));
|
|
688
705
|
}
|
|
689
706
|
const runtime = await getDenseTransformersRuntime(cacheDir);
|
|
690
707
|
const extractor = await runtime.pipeline("feature-extraction", modelId);
|
|
691
|
-
return
|
|
692
|
-
|
|
693
|
-
|
|
708
|
+
return {
|
|
709
|
+
async embed(text) {
|
|
710
|
+
const output = await extractor(text, { pooling: "mean", normalize: true });
|
|
711
|
+
return output.tolist()[0];
|
|
712
|
+
},
|
|
713
|
+
async dispose() {
|
|
714
|
+
await extractor.dispose();
|
|
715
|
+
}
|
|
694
716
|
};
|
|
695
717
|
}
|
|
696
718
|
function exactDenseQuery(payload, vector, topK) {
|
|
@@ -699,8 +721,12 @@ function exactDenseQuery(payload, vector, topK) {
|
|
|
699
721
|
async function pullDenseModel(workspacePath, config) {
|
|
700
722
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
701
723
|
await mkdir4(cacheDir, { recursive: true });
|
|
702
|
-
const
|
|
703
|
-
|
|
724
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
725
|
+
try {
|
|
726
|
+
await embedder.embed("warm dense model cache");
|
|
727
|
+
} finally {
|
|
728
|
+
await embedder.dispose?.();
|
|
729
|
+
}
|
|
704
730
|
}
|
|
705
731
|
async function buildDenseVectors({
|
|
706
732
|
workspacePath,
|
|
@@ -710,53 +736,57 @@ async function buildDenseVectors({
|
|
|
710
736
|
const chunks = await readJsonl(path8.join(workspacePath, "chunks", "chunks.jsonl"));
|
|
711
737
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
712
738
|
await mkdir4(cacheDir, { recursive: true });
|
|
713
|
-
const
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
const
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
739
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
740
|
+
try {
|
|
741
|
+
const records = [];
|
|
742
|
+
let dimensions = 0;
|
|
743
|
+
reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
|
|
744
|
+
for (const chunk of chunks) {
|
|
745
|
+
const embedding = await embedder.embed(createDenseChunkText(chunk));
|
|
746
|
+
dimensions ||= embedding.length;
|
|
747
|
+
records.push({
|
|
748
|
+
chunkId: chunk.id,
|
|
749
|
+
documentId: chunk.documentId,
|
|
750
|
+
sourceId: chunk.sourceId,
|
|
751
|
+
title: chunk.title,
|
|
752
|
+
uri: chunk.uri,
|
|
753
|
+
headingPath: chunk.headingPath,
|
|
754
|
+
text: chunk.text,
|
|
755
|
+
embedding
|
|
756
|
+
});
|
|
757
|
+
if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
|
|
758
|
+
reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
reportProgress(progress, "Building dense vector index");
|
|
762
|
+
const index = new VectorFieldIndex({
|
|
763
|
+
numHashTables: config.indexHashTables,
|
|
764
|
+
dimensions,
|
|
765
|
+
random: createSeededRandom(config.indexRandomSeed)
|
|
729
766
|
});
|
|
730
|
-
|
|
731
|
-
|
|
767
|
+
for (const record of records) {
|
|
768
|
+
index.insert(record.chunkId, [record.embedding]);
|
|
732
769
|
}
|
|
770
|
+
const metadata = {
|
|
771
|
+
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
772
|
+
modelId: config.modelId,
|
|
773
|
+
dimensions,
|
|
774
|
+
hashTables: config.indexHashTables,
|
|
775
|
+
randomSeed: config.indexRandomSeed,
|
|
776
|
+
chunkCount: records.length,
|
|
777
|
+
indexHash: sha256(JSON.stringify(index.indexState))
|
|
778
|
+
};
|
|
779
|
+
const payload = {
|
|
780
|
+
metadata,
|
|
781
|
+
indexState: index.indexState,
|
|
782
|
+
chunks: records
|
|
783
|
+
};
|
|
784
|
+
await writeDensePayload(workspacePath, payload);
|
|
785
|
+
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
786
|
+
return payload;
|
|
787
|
+
} finally {
|
|
788
|
+
await embedder.dispose?.();
|
|
733
789
|
}
|
|
734
|
-
reportProgress(progress, "Building dense vector index");
|
|
735
|
-
const index = new VectorFieldIndex({
|
|
736
|
-
numHashTables: config.indexHashTables,
|
|
737
|
-
dimensions,
|
|
738
|
-
random: createSeededRandom(config.indexRandomSeed)
|
|
739
|
-
});
|
|
740
|
-
for (const record of records) {
|
|
741
|
-
index.insert(record.chunkId, [record.embedding]);
|
|
742
|
-
}
|
|
743
|
-
const metadata = {
|
|
744
|
-
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
745
|
-
modelId: config.modelId,
|
|
746
|
-
dimensions,
|
|
747
|
-
hashTables: config.indexHashTables,
|
|
748
|
-
randomSeed: config.indexRandomSeed,
|
|
749
|
-
chunkCount: records.length,
|
|
750
|
-
indexHash: sha256(JSON.stringify(index.indexState))
|
|
751
|
-
};
|
|
752
|
-
const payload = {
|
|
753
|
-
metadata,
|
|
754
|
-
indexState: index.indexState,
|
|
755
|
-
chunks: records
|
|
756
|
-
};
|
|
757
|
-
await writeDensePayload(workspacePath, payload);
|
|
758
|
-
reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
|
|
759
|
-
return payload;
|
|
760
790
|
}
|
|
761
791
|
async function denseQuery({
|
|
762
792
|
workspacePath,
|
|
@@ -766,21 +796,25 @@ async function denseQuery({
|
|
|
766
796
|
}) {
|
|
767
797
|
const payload = await readDensePayload(workspacePath);
|
|
768
798
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
769
|
-
const
|
|
770
|
-
|
|
771
|
-
|
|
799
|
+
const embedder = await createEmbedder(cacheDir, config.modelId);
|
|
800
|
+
try {
|
|
801
|
+
const vector = await embedder.embed(query);
|
|
802
|
+
if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
|
|
803
|
+
return exactDenseQuery(payload, vector, topK);
|
|
804
|
+
}
|
|
805
|
+
const index = new VectorFieldIndex({
|
|
806
|
+
numHashTables: payload.metadata.hashTables,
|
|
807
|
+
dimensions: payload.metadata.dimensions,
|
|
808
|
+
random: createSeededRandom(payload.metadata.randomSeed)
|
|
809
|
+
}).loadState(payload.indexState);
|
|
810
|
+
const approximateHits = index.query(vector, topK);
|
|
811
|
+
if (approximateHits.length >= topK) {
|
|
812
|
+
return approximateHits;
|
|
813
|
+
}
|
|
772
814
|
return exactDenseQuery(payload, vector, topK);
|
|
815
|
+
} finally {
|
|
816
|
+
await embedder.dispose?.();
|
|
773
817
|
}
|
|
774
|
-
const index = new VectorFieldIndex({
|
|
775
|
-
numHashTables: payload.metadata.hashTables,
|
|
776
|
-
dimensions: payload.metadata.dimensions,
|
|
777
|
-
random: createSeededRandom(payload.metadata.randomSeed)
|
|
778
|
-
}).loadState(payload.indexState);
|
|
779
|
-
const approximateHits = index.query(vector, topK);
|
|
780
|
-
if (approximateHits.length >= topK) {
|
|
781
|
-
return approximateHits;
|
|
782
|
-
}
|
|
783
|
-
return exactDenseQuery(payload, vector, topK);
|
|
784
818
|
}
|
|
785
819
|
|
|
786
820
|
// src/vector/sparse.ts
|
|
@@ -2029,7 +2063,7 @@ async function fetchUrlDocument({
|
|
|
2029
2063
|
publicationDate
|
|
2030
2064
|
}) {
|
|
2031
2065
|
const headers = {
|
|
2032
|
-
"user-agent": source.crawl?.userAgent ?? "querylight-cli
|
|
2066
|
+
"user-agent": source.crawl?.userAgent ?? "querylight-cli"
|
|
2033
2067
|
};
|
|
2034
2068
|
if (previous?.httpCache?.etag) {
|
|
2035
2069
|
headers["if-none-match"] = previous.httpCache.etag;
|
|
@@ -2173,13 +2207,17 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
|
|
|
2173
2207
|
if (url.search.length > 0) {
|
|
2174
2208
|
return false;
|
|
2175
2209
|
}
|
|
2176
|
-
|
|
2210
|
+
const pathname = url.pathname.toLowerCase();
|
|
2211
|
+
if (pathname.endsWith(".xml")) {
|
|
2177
2212
|
return false;
|
|
2178
2213
|
}
|
|
2179
|
-
if (
|
|
2214
|
+
if (pathname.endsWith(".pdf")) {
|
|
2180
2215
|
return false;
|
|
2181
2216
|
}
|
|
2182
|
-
if (
|
|
2217
|
+
if (pathname.includes("/cdn-cgi/")) {
|
|
2218
|
+
return false;
|
|
2219
|
+
}
|
|
2220
|
+
if (pathname === "/search" || pathname === "/search/" || pathname.endsWith("/search/")) {
|
|
2183
2221
|
return false;
|
|
2184
2222
|
}
|
|
2185
2223
|
if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
|
|
@@ -2324,7 +2362,7 @@ async function purgeDocuments(workspacePath, documentIds, documents) {
|
|
|
2324
2362
|
async function fetchFeedText(source) {
|
|
2325
2363
|
const response2 = await fetch(source.uri, {
|
|
2326
2364
|
headers: {
|
|
2327
|
-
"user-agent": source.crawl?.userAgent ?? "querylight-cli
|
|
2365
|
+
"user-agent": source.crawl?.userAgent ?? "querylight-cli"
|
|
2328
2366
|
}
|
|
2329
2367
|
});
|
|
2330
2368
|
if (!response2.ok) {
|
|
@@ -3203,13 +3241,20 @@ function searchResultsFromResponse(response2, showChunks = false) {
|
|
|
3203
3241
|
metadata: hit._source.metadata
|
|
3204
3242
|
}));
|
|
3205
3243
|
}
|
|
3244
|
+
async function searchJsonRequest({
|
|
3245
|
+
index,
|
|
3246
|
+
request,
|
|
3247
|
+
indexName = "querylight"
|
|
3248
|
+
}) {
|
|
3249
|
+
return searchJsonDsl({ index, request, indexName });
|
|
3250
|
+
}
|
|
3206
3251
|
async function searchJsonIndex({
|
|
3207
3252
|
workspacePath,
|
|
3208
3253
|
request,
|
|
3209
3254
|
indexName = "querylight"
|
|
3210
3255
|
}) {
|
|
3211
3256
|
const index = await loadHydratedIndex(workspacePath);
|
|
3212
|
-
return
|
|
3257
|
+
return searchJsonRequest({ index, request, indexName });
|
|
3213
3258
|
}
|
|
3214
3259
|
function normalizeDisplayTitle(title) {
|
|
3215
3260
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
@@ -3525,8 +3570,197 @@ async function searchIndex({
|
|
|
3525
3570
|
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
3526
3571
|
}
|
|
3527
3572
|
|
|
3528
|
-
// src/
|
|
3573
|
+
// src/server/search-api.ts
|
|
3574
|
+
import { createServer } from "http";
|
|
3575
|
+
import { readdir, stat as stat4 } from "fs/promises";
|
|
3529
3576
|
import path19 from "path";
|
|
3577
|
+
async function pathIsDirectory(candidatePath) {
|
|
3578
|
+
try {
|
|
3579
|
+
return (await stat4(candidatePath)).isDirectory();
|
|
3580
|
+
} catch {
|
|
3581
|
+
return false;
|
|
3582
|
+
}
|
|
3583
|
+
}
|
|
3584
|
+
async function discoverKnowledgeBases(workspacePath) {
|
|
3585
|
+
try {
|
|
3586
|
+
const singleWorkspace = await assertWorkspaceExists(workspacePath);
|
|
3587
|
+
const config = await loadConfig(singleWorkspace);
|
|
3588
|
+
const index = await loadHydratedIndex(singleWorkspace);
|
|
3589
|
+
return {
|
|
3590
|
+
mode: "single",
|
|
3591
|
+
knowledgeBases: [{
|
|
3592
|
+
name: config.index.name,
|
|
3593
|
+
workspacePath: singleWorkspace,
|
|
3594
|
+
configuredIndexName: config.index.name,
|
|
3595
|
+
index
|
|
3596
|
+
}]
|
|
3597
|
+
};
|
|
3598
|
+
} catch (error) {
|
|
3599
|
+
if (!(error instanceof CliError) || error.code !== "WORKSPACE_ERROR") {
|
|
3600
|
+
throw error;
|
|
3601
|
+
}
|
|
3602
|
+
}
|
|
3603
|
+
const resolvedRoot = path19.resolve(workspacePath);
|
|
3604
|
+
if (!await pathIsDirectory(resolvedRoot)) {
|
|
3605
|
+
throw new CliError(`workspace path does not exist: ${resolvedRoot}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
|
|
3606
|
+
}
|
|
3607
|
+
const entries = await readdir(resolvedRoot, { withFileTypes: true });
|
|
3608
|
+
const knowledgeBases = (await Promise.all(entries.filter((entry) => entry.isDirectory()).map(async (entry) => {
|
|
3609
|
+
const candidateWorkspace = path19.join(resolvedRoot, entry.name, ".kb");
|
|
3610
|
+
try {
|
|
3611
|
+
const workspace = await assertWorkspaceExists(candidateWorkspace);
|
|
3612
|
+
const config = await loadConfig(workspace);
|
|
3613
|
+
const index = await loadHydratedIndex(workspace);
|
|
3614
|
+
return {
|
|
3615
|
+
name: entry.name,
|
|
3616
|
+
workspacePath: workspace,
|
|
3617
|
+
configuredIndexName: config.index.name,
|
|
3618
|
+
index
|
|
3619
|
+
};
|
|
3620
|
+
} catch (error) {
|
|
3621
|
+
if (error instanceof CliError && error.code === "WORKSPACE_ERROR") {
|
|
3622
|
+
return null;
|
|
3623
|
+
}
|
|
3624
|
+
throw error;
|
|
3625
|
+
}
|
|
3626
|
+
}))).filter((knowledgeBase) => knowledgeBase != null);
|
|
3627
|
+
if (knowledgeBases.length === 0) {
|
|
3628
|
+
throw new CliError(
|
|
3629
|
+
`no knowledge bases found at ${resolvedRoot}; use a .kb workspace or a directory of named subdirectories that each contain .kb`,
|
|
3630
|
+
"WORKSPACE_ERROR",
|
|
3631
|
+
3 /* WorkspaceError */
|
|
3632
|
+
);
|
|
3633
|
+
}
|
|
3634
|
+
return { mode: "multi", knowledgeBases };
|
|
3635
|
+
}
|
|
3636
|
+
function sendJson(response2, statusCode, payload) {
|
|
3637
|
+
response2.statusCode = statusCode;
|
|
3638
|
+
response2.setHeader("content-type", "application/json; charset=utf-8");
|
|
3639
|
+
response2.end(JSON.stringify(payload));
|
|
3640
|
+
}
|
|
3641
|
+
function sendError(response2, statusCode, type, reason) {
|
|
3642
|
+
sendJson(response2, statusCode, {
|
|
3643
|
+
error: {
|
|
3644
|
+
type,
|
|
3645
|
+
reason
|
|
3646
|
+
},
|
|
3647
|
+
status: statusCode
|
|
3648
|
+
});
|
|
3649
|
+
}
|
|
3650
|
+
async function readRequestBody(request) {
|
|
3651
|
+
const chunks = [];
|
|
3652
|
+
for await (const chunk of request) {
|
|
3653
|
+
chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
|
|
3654
|
+
}
|
|
3655
|
+
return Buffer.concat(chunks).toString("utf8");
|
|
3656
|
+
}
|
|
3657
|
+
function parseSearchRequest(raw) {
|
|
3658
|
+
const normalized = raw.trim();
|
|
3659
|
+
if (normalized.length === 0) {
|
|
3660
|
+
return {};
|
|
3661
|
+
}
|
|
3662
|
+
try {
|
|
3663
|
+
const parsed = JSON.parse(normalized);
|
|
3664
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
3665
|
+
throw new Error("expected a JSON object");
|
|
3666
|
+
}
|
|
3667
|
+
return parsed;
|
|
3668
|
+
} catch (error) {
|
|
3669
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3670
|
+
throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
3671
|
+
}
|
|
3672
|
+
}
|
|
3673
|
+
function routeForKnowledgeBase(mode, knowledgeBase) {
|
|
3674
|
+
return mode === "single" ? "/_search" : `/${knowledgeBase.name}/_search`;
|
|
3675
|
+
}
|
|
3676
|
+
function resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases) {
|
|
3677
|
+
const segments = pathname.split("/").filter(Boolean);
|
|
3678
|
+
if (mode === "single") {
|
|
3679
|
+
const knowledgeBase = [...knowledgeBases.values()][0];
|
|
3680
|
+
if (!knowledgeBase) {
|
|
3681
|
+
return null;
|
|
3682
|
+
}
|
|
3683
|
+
if (segments.length === 1 && segments[0] === "_search") {
|
|
3684
|
+
return knowledgeBase;
|
|
3685
|
+
}
|
|
3686
|
+
if (segments.length === 2 && segments[1] === "_search" && segments[0] === knowledgeBase.configuredIndexName) {
|
|
3687
|
+
return knowledgeBase;
|
|
3688
|
+
}
|
|
3689
|
+
return null;
|
|
3690
|
+
}
|
|
3691
|
+
if (segments.length === 2 && segments[1] === "_search") {
|
|
3692
|
+
return knowledgeBases.get(segments[0]) ?? null;
|
|
3693
|
+
}
|
|
3694
|
+
return null;
|
|
3695
|
+
}
|
|
3696
|
+
async function handleSearchRequest(request, response2, pathname, mode, knowledgeBases) {
|
|
3697
|
+
if (request.method !== "GET" && request.method !== "POST") {
|
|
3698
|
+
response2.setHeader("allow", "GET, POST");
|
|
3699
|
+
sendError(response2, 405, "method_not_allowed", `unsupported method for ${pathname}`);
|
|
3700
|
+
return;
|
|
3701
|
+
}
|
|
3702
|
+
const knowledgeBase = resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases);
|
|
3703
|
+
if (!knowledgeBase) {
|
|
3704
|
+
sendError(response2, 404, "resource_not_found_exception", `unknown search route: ${pathname}`);
|
|
3705
|
+
return;
|
|
3706
|
+
}
|
|
3707
|
+
try {
|
|
3708
|
+
const requestBody = parseSearchRequest(await readRequestBody(request));
|
|
3709
|
+
const indexName = mode === "multi" ? knowledgeBase.name : knowledgeBase.configuredIndexName;
|
|
3710
|
+
const result = await searchJsonRequest({
|
|
3711
|
+
index: knowledgeBase.index,
|
|
3712
|
+
request: requestBody,
|
|
3713
|
+
indexName
|
|
3714
|
+
});
|
|
3715
|
+
sendJson(response2, 200, result);
|
|
3716
|
+
} catch (error) {
|
|
3717
|
+
if (error instanceof CliError && error.code === "INVALID_ARGUMENT") {
|
|
3718
|
+
sendError(response2, 400, "parse_exception", error.message);
|
|
3719
|
+
return;
|
|
3720
|
+
}
|
|
3721
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
3722
|
+
sendError(response2, 500, "search_phase_execution_exception", message);
|
|
3723
|
+
}
|
|
3724
|
+
}
|
|
3725
|
+
async function startSearchApiServer({
|
|
3726
|
+
workspacePath,
|
|
3727
|
+
host = "127.0.0.1",
|
|
3728
|
+
port = 3e3
|
|
3729
|
+
}) {
|
|
3730
|
+
const { mode, knowledgeBases } = await discoverKnowledgeBases(workspacePath);
|
|
3731
|
+
const byName = new Map(knowledgeBases.map((knowledgeBase) => [knowledgeBase.name, knowledgeBase]));
|
|
3732
|
+
const server = createServer(async (request, response2) => {
|
|
3733
|
+
const url2 = new URL(request.url ?? "/", `http://${request.headers.host ?? `${host}:${port}`}`);
|
|
3734
|
+
await handleSearchRequest(request, response2, url2.pathname, mode, byName);
|
|
3735
|
+
});
|
|
3736
|
+
await new Promise((resolve2, reject) => {
|
|
3737
|
+
server.once("error", reject);
|
|
3738
|
+
server.listen(port, host, () => {
|
|
3739
|
+
server.off("error", reject);
|
|
3740
|
+
resolve2();
|
|
3741
|
+
});
|
|
3742
|
+
});
|
|
3743
|
+
const address = server.address();
|
|
3744
|
+
if (!address || typeof address === "string") {
|
|
3745
|
+
throw new CliError("server failed to bind to a TCP address", "SERVER_ERROR", 1 /* GeneralError */);
|
|
3746
|
+
}
|
|
3747
|
+
const url = `http://${host}:${address.port}`;
|
|
3748
|
+
return {
|
|
3749
|
+
mode,
|
|
3750
|
+
url,
|
|
3751
|
+
knowledgeBases: knowledgeBases.map((knowledgeBase) => ({
|
|
3752
|
+
name: knowledgeBase.name,
|
|
3753
|
+
workspacePath: knowledgeBase.workspacePath,
|
|
3754
|
+
route: routeForKnowledgeBase(mode, knowledgeBase)
|
|
3755
|
+
})),
|
|
3756
|
+
close: async () => new Promise((resolve2, reject) => {
|
|
3757
|
+
server.close((error) => error ? reject(error) : resolve2());
|
|
3758
|
+
})
|
|
3759
|
+
};
|
|
3760
|
+
}
|
|
3761
|
+
|
|
3762
|
+
// src/query/related-service.ts
|
|
3763
|
+
import path20 from "path";
|
|
3530
3764
|
function cosineSimilarity2(left, right) {
|
|
3531
3765
|
let dot = 0;
|
|
3532
3766
|
let leftNorm = 0;
|
|
@@ -3602,7 +3836,7 @@ async function findRelatedDocuments({
|
|
|
3602
3836
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
3603
3837
|
throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
|
|
3604
3838
|
}
|
|
3605
|
-
const documents = await readJsonl(
|
|
3839
|
+
const documents = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
|
|
3606
3840
|
const selected = resolveDocumentSelector(documents, document);
|
|
3607
3841
|
const densePayload = await readDensePayload(workspacePath);
|
|
3608
3842
|
const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
|
|
@@ -3675,7 +3909,7 @@ async function createContext({
|
|
|
3675
3909
|
}
|
|
3676
3910
|
|
|
3677
3911
|
// src/report/diff-service.ts
|
|
3678
|
-
import
|
|
3912
|
+
import path21 from "path";
|
|
3679
3913
|
function chooseBaselineRun(runs, since) {
|
|
3680
3914
|
if (since === "last-run") {
|
|
3681
3915
|
return runs.at(-1);
|
|
@@ -3691,7 +3925,7 @@ async function diffWorkspace({
|
|
|
3691
3925
|
documentId,
|
|
3692
3926
|
since
|
|
3693
3927
|
}) {
|
|
3694
|
-
const current = await readJsonl(
|
|
3928
|
+
const current = await readJsonl(path21.join(workspacePath, "documents", "documents.jsonl"));
|
|
3695
3929
|
const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
|
|
3696
3930
|
const previous = new Map((baseline?.documentsSnapshot ?? []).map((document) => [document.id, document]));
|
|
3697
3931
|
const changedDocuments = current.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId)).filter((document) => {
|
|
@@ -4050,7 +4284,7 @@ function parseDateValue(input, optionName) {
|
|
|
4050
4284
|
return parsed.toISOString();
|
|
4051
4285
|
}
|
|
4052
4286
|
async function parseJsonArgument(input) {
|
|
4053
|
-
const raw = input.startsWith("@") ? await readFile11(
|
|
4287
|
+
const raw = input.startsWith("@") ? await readFile11(path22.resolve(input.slice(1)), "utf8") : input;
|
|
4054
4288
|
try {
|
|
4055
4289
|
const parsed = JSON.parse(raw);
|
|
4056
4290
|
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
@@ -4093,15 +4327,26 @@ function searchDateRanges(options) {
|
|
|
4093
4327
|
}
|
|
4094
4328
|
return entries;
|
|
4095
4329
|
}
|
|
4330
|
+
function resolveSearchTopK(optionsTopK, sourceTypes, dateRanges, defaultTopK) {
|
|
4331
|
+
const explicitTopK = parseOptionalPositiveInteger(optionsTopK, "--top-k");
|
|
4332
|
+
if (explicitTopK !== void 0) {
|
|
4333
|
+
return explicitTopK;
|
|
4334
|
+
}
|
|
4335
|
+
const includesRss = (sourceTypes ?? []).includes("rss");
|
|
4336
|
+
if (includesRss && dateRanges.length > 0) {
|
|
4337
|
+
return 500;
|
|
4338
|
+
}
|
|
4339
|
+
return defaultTopK;
|
|
4340
|
+
}
|
|
4096
4341
|
async function resolveWorkspace(options) {
|
|
4097
|
-
return
|
|
4342
|
+
return path22.resolve(options.workspace ?? DEFAULT_WORKSPACE);
|
|
4098
4343
|
}
|
|
4099
4344
|
function workspaceFromArgv(argv) {
|
|
4100
4345
|
const index = argv.findIndex((arg) => arg === "--workspace");
|
|
4101
4346
|
if (index >= 0 && argv[index + 1]) {
|
|
4102
|
-
return
|
|
4347
|
+
return path22.resolve(argv[index + 1]);
|
|
4103
4348
|
}
|
|
4104
|
-
return
|
|
4349
|
+
return path22.resolve(DEFAULT_WORKSPACE);
|
|
4105
4350
|
}
|
|
4106
4351
|
async function runCli(argv, io = {}) {
|
|
4107
4352
|
const capture = { stdout: [], stderr: [], ...io };
|
|
@@ -4195,7 +4440,7 @@ Notes:
|
|
|
4195
4440
|
}
|
|
4196
4441
|
const stored = await addSource(workspace, {
|
|
4197
4442
|
type,
|
|
4198
|
-
uri: ["file", "directory"].includes(type) ?
|
|
4443
|
+
uri: ["file", "directory"].includes(type) ? path22.resolve(uri) : uri,
|
|
4199
4444
|
name: options.name,
|
|
4200
4445
|
enabled: true,
|
|
4201
4446
|
tags: options.tag ?? [],
|
|
@@ -4406,7 +4651,7 @@ Examples:
|
|
|
4406
4651
|
progress?.("info", "Rebuild complete");
|
|
4407
4652
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
4408
4653
|
});
|
|
4409
|
-
program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.
|
|
4654
|
+
program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return. Defaults to search.defaultTopK in config.yaml. RSS searches with a time window use 500 when omitted.").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4410
4655
|
Examples:
|
|
4411
4656
|
qli search "pricing api limits"
|
|
4412
4657
|
qli search "authentication" --top-k 20 --tag docs
|
|
@@ -4419,22 +4664,27 @@ Examples:
|
|
|
4419
4664
|
Notes:
|
|
4420
4665
|
lexical works without vector models.
|
|
4421
4666
|
dense, sparse, and hybrid require the relevant index artifacts to exist.
|
|
4667
|
+
When you omit --top-k, qli uses search.defaultTopK from config.yaml. The default workspace value is 50.
|
|
4668
|
+
RSS searches with a time window default to 500 results when you omit --top-k.
|
|
4422
4669
|
Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
|
|
4423
4670
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
4424
4671
|
const global = this.optsWithGlobals();
|
|
4425
4672
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4673
|
+
const config = await loadConfig(workspace, global.config);
|
|
4674
|
+
const sourceTypes = parseSourceTypes(options.sourceType);
|
|
4675
|
+
const dateRanges = searchDateRanges(options);
|
|
4426
4676
|
const result = await searchIndex({
|
|
4427
4677
|
workspacePath: workspace,
|
|
4428
4678
|
query: query ?? "",
|
|
4429
|
-
topK:
|
|
4679
|
+
topK: resolveSearchTopK(options.topK, sourceTypes, dateRanges, config.search.defaultTopK),
|
|
4430
4680
|
sourceIds: parseCommaSeparatedList(options.source),
|
|
4431
4681
|
sourceNames: parseCommaSeparatedList(options.sourceName),
|
|
4432
|
-
sourceTypes
|
|
4682
|
+
sourceTypes,
|
|
4433
4683
|
uriPrefixes: parseCommaSeparatedList(options.uriPrefix),
|
|
4434
4684
|
hasPublicationDate: Boolean(options.hasPublicationDate),
|
|
4435
4685
|
tags: parseCommaSeparatedList(options.tag),
|
|
4436
4686
|
metadata: (options.metadata ?? []).map(parseKeyValue).map(([key, value]) => ({ key, value })),
|
|
4437
|
-
dateRanges
|
|
4687
|
+
dateRanges,
|
|
4438
4688
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
4439
4689
|
showChunks: Boolean(options.showChunks)
|
|
4440
4690
|
});
|
|
@@ -4459,6 +4709,63 @@ Notes:
|
|
|
4459
4709
|
});
|
|
4460
4710
|
emit(global.json, capture, response("search-json", workspace, result), JSON.stringify(result, null, 2));
|
|
4461
4711
|
});
|
|
4712
|
+
program.command("serve").description("Start a small HTTP API that exposes Querylight JSON DSL search through an OpenSearch-like _search endpoint.").option("--host <host>", "Host interface to bind. Defaults to 127.0.0.1.", "127.0.0.1").option("--port <n>", "Port to bind. Use 0 to let the OS choose a free port.", "3000").addHelpText("after", `
|
|
4713
|
+
Examples:
|
|
4714
|
+
qli serve
|
|
4715
|
+
qli serve --workspace ./docs/.kb --port 4000
|
|
4716
|
+
qli serve --workspace ./kbs --host 0.0.0.0 --port 4000
|
|
4717
|
+
|
|
4718
|
+
Routes:
|
|
4719
|
+
Single workspace: POST /_search
|
|
4720
|
+
Single workspace: POST /<configured-index-name>/_search
|
|
4721
|
+
Multi-KB root: POST /<directory-name>/_search
|
|
4722
|
+
|
|
4723
|
+
Notes:
|
|
4724
|
+
The request body must be a Querylight JSON DSL object.
|
|
4725
|
+
serve only exposes lexical _search for now.
|
|
4726
|
+
When --workspace points to a directory of knowledge bases, each child directory must contain its own .kb workspace.
|
|
4727
|
+
Index files are loaded once at startup and reused across requests.`).action(async function command(options) {
|
|
4728
|
+
const global = this.optsWithGlobals();
|
|
4729
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4730
|
+
const port = Number(options.port);
|
|
4731
|
+
if (!Number.isInteger(port) || port < 0 || port > 65535) {
|
|
4732
|
+
throw new CliError(`invalid port: ${options.port}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
4733
|
+
}
|
|
4734
|
+
const server = await startSearchApiServer({
|
|
4735
|
+
workspacePath: workspace,
|
|
4736
|
+
host: options.host,
|
|
4737
|
+
port
|
|
4738
|
+
});
|
|
4739
|
+
const data = {
|
|
4740
|
+
url: server.url,
|
|
4741
|
+
mode: server.mode,
|
|
4742
|
+
knowledgeBases: server.knowledgeBases
|
|
4743
|
+
};
|
|
4744
|
+
const human = [
|
|
4745
|
+
`Listening on ${server.url}`,
|
|
4746
|
+
...server.knowledgeBases.map((knowledgeBase) => `${knowledgeBase.route} -> ${knowledgeBase.workspacePath}`)
|
|
4747
|
+
].join("\n");
|
|
4748
|
+
emit(global.json, capture, response("serve", workspace, data), human);
|
|
4749
|
+
const shutdown = async () => {
|
|
4750
|
+
for (const signal of ["SIGINT", "SIGTERM"]) {
|
|
4751
|
+
process.off(signal, stop);
|
|
4752
|
+
}
|
|
4753
|
+
await server.close();
|
|
4754
|
+
};
|
|
4755
|
+
const stop = () => {
|
|
4756
|
+
void shutdown().then(() => resolveStop(), rejectStop);
|
|
4757
|
+
};
|
|
4758
|
+
let resolveStop;
|
|
4759
|
+
let rejectStop;
|
|
4760
|
+
const waitForStop = new Promise((resolve2, reject) => {
|
|
4761
|
+
resolveStop = resolve2;
|
|
4762
|
+
rejectStop = reject;
|
|
4763
|
+
});
|
|
4764
|
+
for (const signal of ["SIGINT", "SIGTERM"]) {
|
|
4765
|
+
process.once(signal, stop);
|
|
4766
|
+
}
|
|
4767
|
+
await waitForStop;
|
|
4768
|
+
});
|
|
4462
4769
|
program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
|
|
4463
4770
|
Examples:
|
|
4464
4771
|
qli related doc_123
|
|
@@ -4582,7 +4889,7 @@ Examples:
|
|
|
4582
4889
|
try {
|
|
4583
4890
|
const meta = await readLatestIndexMetadata(workspace);
|
|
4584
4891
|
latestIndex = meta.createdAt;
|
|
4585
|
-
indexSize = (await
|
|
4892
|
+
indexSize = (await stat5(await resolveLatestIndexArtifactPath(workspace))).size;
|
|
4586
4893
|
} catch {
|
|
4587
4894
|
latestIndex = void 0;
|
|
4588
4895
|
}
|