@tryformation/querylight-cli 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/cli/format.d.ts +2 -2
- package/dist/cli/main.js +391 -103
- package/dist/core/constants.d.ts +1 -1
- package/dist/index/querylight-indexer.d.ts +2 -2
- package/dist/index.js +344 -88
- package/dist/query/search-service.d.ts +8 -1
- package/dist/types/models.d.ts +36 -1
- package/dist/vector/runtime.d.ts +8 -0
- package/package.json +2 -2
- package/scripts/sparse-encode.py +5 -1
package/dist/cli/main.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
4
|
import { Command, Option } from "commander";
|
|
5
|
-
import { stat as stat4 } from "fs/promises";
|
|
5
|
+
import { readFile as readFile11, stat as stat4 } from "fs/promises";
|
|
6
6
|
import path21 from "path";
|
|
7
7
|
|
|
8
8
|
// src/chunk/chunker.ts
|
|
@@ -16,7 +16,7 @@ import path from "path";
|
|
|
16
16
|
import YAML from "yaml";
|
|
17
17
|
|
|
18
18
|
// src/core/constants.ts
|
|
19
|
-
var PACKAGE_VERSION = "0.2.1";
|
|
19
|
+
var PACKAGE_VERSION = "0.2.3";
|
|
20
20
|
var DEFAULT_WORKSPACE = ".kb";
|
|
21
21
|
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
22
|
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
@@ -374,7 +374,7 @@ async function assertWorkspaceExists(workspacePath) {
|
|
|
374
374
|
}
|
|
375
375
|
|
|
376
376
|
// src/index/querylight-indexer.ts
|
|
377
|
-
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
377
|
+
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
378
378
|
import path11 from "path";
|
|
379
379
|
|
|
380
380
|
// src/vector/dense.ts
|
|
@@ -387,6 +387,7 @@ import os from "os";
|
|
|
387
387
|
import path6 from "path";
|
|
388
388
|
import { fileURLToPath } from "url";
|
|
389
389
|
import { execFile, execFileSync } from "child_process";
|
|
390
|
+
import { mkdtemp, rm, writeFile as writeFile3 } from "fs/promises";
|
|
390
391
|
|
|
391
392
|
// src/core/files.ts
|
|
392
393
|
import { stat as stat2 } from "fs/promises";
|
|
@@ -400,6 +401,7 @@ async function fileExists(filePath) {
|
|
|
400
401
|
}
|
|
401
402
|
|
|
402
403
|
// src/vector/runtime.ts
|
|
404
|
+
var sparseExecFileSync = execFileSync;
|
|
403
405
|
function resolveQliHomeDir() {
|
|
404
406
|
return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
|
|
405
407
|
}
|
|
@@ -455,29 +457,36 @@ async function runSparsePython({
|
|
|
455
457
|
}) {
|
|
456
458
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
457
459
|
const scriptPath = await sparseScriptPath(importMetaUrl);
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
"
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
460
|
+
const payloadDir = await mkdtemp(path6.join(os.tmpdir(), "qli-sparse-"));
|
|
461
|
+
const payloadPath = path6.join(payloadDir, "payload.json");
|
|
462
|
+
await writeFile3(payloadPath, JSON.stringify(payload), "utf8");
|
|
463
|
+
try {
|
|
464
|
+
return sparseExecFileSync(
|
|
465
|
+
"uv",
|
|
466
|
+
[
|
|
467
|
+
"run",
|
|
468
|
+
"--with",
|
|
469
|
+
"torch",
|
|
470
|
+
"--with",
|
|
471
|
+
"transformers",
|
|
472
|
+
"--with",
|
|
473
|
+
"huggingface_hub",
|
|
474
|
+
"python",
|
|
475
|
+
scriptPath,
|
|
476
|
+
payloadPath
|
|
477
|
+
],
|
|
478
|
+
{
|
|
479
|
+
encoding: "utf8",
|
|
480
|
+
maxBuffer: 1024 * 1024 * 1024,
|
|
481
|
+
env: {
|
|
482
|
+
...process.env,
|
|
483
|
+
HF_HOME: cacheDir
|
|
484
|
+
}
|
|
478
485
|
}
|
|
479
|
-
|
|
480
|
-
|
|
486
|
+
);
|
|
487
|
+
} finally {
|
|
488
|
+
await rm(payloadDir, { recursive: true, force: true });
|
|
489
|
+
}
|
|
481
490
|
}
|
|
482
491
|
async function getDenseTransformersRuntime(cacheDir) {
|
|
483
492
|
const transformers = await import("@huggingface/transformers");
|
|
@@ -490,18 +499,18 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
490
499
|
}
|
|
491
500
|
|
|
492
501
|
// src/vector/store.ts
|
|
493
|
-
import { mkdir as mkdir3, rm, writeFile as
|
|
502
|
+
import { mkdir as mkdir3, rm as rm2, writeFile as writeFile5 } from "fs/promises";
|
|
494
503
|
import path7 from "path";
|
|
495
504
|
|
|
496
505
|
// src/core/gzip-json.ts
|
|
497
|
-
import { readFile as readFile4, writeFile as
|
|
506
|
+
import { readFile as readFile4, writeFile as writeFile4 } from "fs/promises";
|
|
498
507
|
import { promisify } from "util";
|
|
499
508
|
import { gunzip, gzip } from "zlib";
|
|
500
509
|
var gzipAsync = promisify(gzip);
|
|
501
510
|
var gunzipAsync = promisify(gunzip);
|
|
502
511
|
async function writeGzipJson(filePath, value) {
|
|
503
512
|
const payload = JSON.stringify(value, null, 2);
|
|
504
|
-
await
|
|
513
|
+
await writeFile4(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
|
|
505
514
|
}
|
|
506
515
|
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
|
|
507
516
|
if (await fileExists(gzipPath)) {
|
|
@@ -570,8 +579,8 @@ async function writeDensePayload(workspacePath, payload) {
|
|
|
570
579
|
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
571
580
|
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
572
581
|
await Promise.all([
|
|
573
|
-
|
|
574
|
-
|
|
582
|
+
rm2(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
583
|
+
rm2(legacyDenseMetaPath(workspacePath), { force: true })
|
|
575
584
|
]);
|
|
576
585
|
}
|
|
577
586
|
async function readDensePayload(workspacePath) {
|
|
@@ -582,8 +591,8 @@ async function writeSparsePayload(workspacePath, payload) {
|
|
|
582
591
|
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
583
592
|
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
584
593
|
await Promise.all([
|
|
585
|
-
|
|
586
|
-
|
|
594
|
+
rm2(legacySparseVectorPath(workspacePath), { force: true }),
|
|
595
|
+
rm2(legacySparseMetaPath(workspacePath), { force: true })
|
|
587
596
|
]);
|
|
588
597
|
}
|
|
589
598
|
async function readSparsePayload(workspacePath) {
|
|
@@ -592,12 +601,12 @@ async function readSparsePayload(workspacePath) {
|
|
|
592
601
|
async function writeDensePullMarker(workspacePath, model, value) {
|
|
593
602
|
const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
594
603
|
await mkdir3(path7.dirname(markerPath), { recursive: true });
|
|
595
|
-
await
|
|
604
|
+
await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
596
605
|
}
|
|
597
606
|
async function writeSparsePullMarker(workspacePath, model, value) {
|
|
598
607
|
const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
|
|
599
608
|
await mkdir3(path7.dirname(markerPath), { recursive: true });
|
|
600
|
-
await
|
|
609
|
+
await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
|
|
601
610
|
}
|
|
602
611
|
async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
|
|
603
612
|
const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
|
|
@@ -1015,7 +1024,7 @@ async function getModelStatus(workspacePath, config) {
|
|
|
1015
1024
|
}
|
|
1016
1025
|
|
|
1017
1026
|
// src/index/index-store.ts
|
|
1018
|
-
import { mkdir as mkdir6, rm as
|
|
1027
|
+
import { mkdir as mkdir6, rm as rm3 } from "fs/promises";
|
|
1019
1028
|
import path10 from "path";
|
|
1020
1029
|
function versionedIndexPath(workspacePath, stamp) {
|
|
1021
1030
|
return path10.join(workspacePath, "indexes", `${stamp}.json.gz`);
|
|
@@ -1057,10 +1066,10 @@ async function writeIndexArtifacts({
|
|
|
1057
1066
|
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
1058
1067
|
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
1059
1068
|
await Promise.all([
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1069
|
+
rm3(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
1070
|
+
rm3(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
1071
|
+
rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
1072
|
+
rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
1064
1073
|
]);
|
|
1065
1074
|
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
1066
1075
|
}
|
|
@@ -1082,12 +1091,19 @@ function keywordFieldIndex() {
|
|
|
1082
1091
|
function createIndexMapping(extraFields = []) {
|
|
1083
1092
|
const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
|
|
1084
1093
|
const mapping = {
|
|
1094
|
+
_source: new StoredSourceIndex(),
|
|
1085
1095
|
text: lexical,
|
|
1086
1096
|
title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
|
|
1087
1097
|
uri: keywordFieldIndex(),
|
|
1088
1098
|
sourceId: keywordFieldIndex(),
|
|
1099
|
+
sourceName: keywordFieldIndex(),
|
|
1089
1100
|
tags: keywordFieldIndex(),
|
|
1090
|
-
sourceType: keywordFieldIndex()
|
|
1101
|
+
sourceType: keywordFieldIndex(),
|
|
1102
|
+
publicationDate: new DateFieldIndex(),
|
|
1103
|
+
firstSeenAt: new DateFieldIndex(),
|
|
1104
|
+
lastSeenAt: new DateFieldIndex(),
|
|
1105
|
+
lastChangedAt: new DateFieldIndex(),
|
|
1106
|
+
crawledAt: new DateFieldIndex()
|
|
1091
1107
|
};
|
|
1092
1108
|
for (const field of extraFields) {
|
|
1093
1109
|
mapping[field] = keywordFieldIndex();
|
|
@@ -1123,8 +1139,12 @@ async function buildIndex({
|
|
|
1123
1139
|
const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
|
|
1124
1140
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
1125
1141
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
1142
|
+
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
1143
|
+
const sourcesById = new Map(sources.map((source) => [source.id, source]));
|
|
1126
1144
|
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
1127
1145
|
for (const chunk of chunks) {
|
|
1146
|
+
const document = documentsById.get(chunk.documentId);
|
|
1147
|
+
const source = sourcesById.get(chunk.sourceId);
|
|
1128
1148
|
index.index({
|
|
1129
1149
|
id: chunk.id,
|
|
1130
1150
|
fields: {
|
|
@@ -1132,9 +1152,33 @@ async function buildIndex({
|
|
|
1132
1152
|
title: [chunk.title],
|
|
1133
1153
|
uri: [chunk.uri.toLowerCase()],
|
|
1134
1154
|
sourceId: [chunk.sourceId.toLowerCase()],
|
|
1155
|
+
sourceName: source ? [source.name.toLowerCase()] : [],
|
|
1135
1156
|
tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
|
|
1136
1157
|
sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
|
|
1158
|
+
publicationDate: document?.publicationDate ? [document.publicationDate] : [],
|
|
1159
|
+
firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
|
|
1160
|
+
lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
|
|
1161
|
+
lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
|
|
1162
|
+
crawledAt: document?.crawledAt ? [document.crawledAt] : [],
|
|
1137
1163
|
...flattenMetadata(chunk.metadata)
|
|
1164
|
+
},
|
|
1165
|
+
source: {
|
|
1166
|
+
chunkId: chunk.id,
|
|
1167
|
+
documentId: chunk.documentId,
|
|
1168
|
+
sourceId: chunk.sourceId,
|
|
1169
|
+
sourceType: document?.sourceType ?? "text",
|
|
1170
|
+
sourceName: source?.name,
|
|
1171
|
+
title: chunk.title,
|
|
1172
|
+
uri: chunk.uri,
|
|
1173
|
+
headingPath: chunk.headingPath,
|
|
1174
|
+
text: chunk.text,
|
|
1175
|
+
normalizedPath: document?.normalizedPath,
|
|
1176
|
+
publicationDate: document?.publicationDate ?? null,
|
|
1177
|
+
crawledAt: document?.crawledAt,
|
|
1178
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
1179
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
1180
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
1181
|
+
metadata: chunk.metadata
|
|
1138
1182
|
}
|
|
1139
1183
|
});
|
|
1140
1184
|
}
|
|
@@ -1143,7 +1187,7 @@ async function buildIndex({
|
|
|
1143
1187
|
const metadata = {
|
|
1144
1188
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
1145
1189
|
createdAt,
|
|
1146
|
-
querylightVersion: "0.
|
|
1190
|
+
querylightVersion: "0.11.0",
|
|
1147
1191
|
kbVersion: "0.1.0",
|
|
1148
1192
|
documentCount: documents.length,
|
|
1149
1193
|
chunkCount: chunks.length,
|
|
@@ -1262,7 +1306,7 @@ async function removeSource(workspacePath, sourceId) {
|
|
|
1262
1306
|
}
|
|
1263
1307
|
|
|
1264
1308
|
// src/ingest/document-utils.ts
|
|
1265
|
-
import { mkdir as mkdir7, rm as
|
|
1309
|
+
import { mkdir as mkdir7, rm as rm4, writeFile as writeFile6 } from "fs/promises";
|
|
1266
1310
|
import path14 from "path";
|
|
1267
1311
|
|
|
1268
1312
|
// src/normalize/normalize-markdown.ts
|
|
@@ -1316,7 +1360,7 @@ async function writeNormalizedDocument({
|
|
|
1316
1360
|
markdown
|
|
1317
1361
|
}) {
|
|
1318
1362
|
await mkdir7(path14.dirname(normalizedPath), { recursive: true });
|
|
1319
|
-
await
|
|
1363
|
+
await writeFile6(
|
|
1320
1364
|
normalizedPath,
|
|
1321
1365
|
withFrontmatter(
|
|
1322
1366
|
{
|
|
@@ -1338,8 +1382,8 @@ async function writeNormalizedDocument({
|
|
|
1338
1382
|
}
|
|
1339
1383
|
async function deleteDocumentArtifacts(document) {
|
|
1340
1384
|
await Promise.all([
|
|
1341
|
-
document.rawPath ?
|
|
1342
|
-
|
|
1385
|
+
document.rawPath ? rm4(document.rawPath, { force: true }) : Promise.resolve(),
|
|
1386
|
+
rm4(document.normalizedPath, { force: true })
|
|
1343
1387
|
]);
|
|
1344
1388
|
}
|
|
1345
1389
|
|
|
@@ -1363,7 +1407,7 @@ async function listDirectoryFiles(source) {
|
|
|
1363
1407
|
|
|
1364
1408
|
// src/ingest/adapters/file-adapter.ts
|
|
1365
1409
|
import { basename, extname, resolve } from "path";
|
|
1366
|
-
import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as
|
|
1410
|
+
import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile7 } from "fs/promises";
|
|
1367
1411
|
|
|
1368
1412
|
// src/ingest/extractors/docx-extractor.ts
|
|
1369
1413
|
import mammoth from "mammoth";
|
|
@@ -1653,7 +1697,7 @@ async function ingestFile({
|
|
|
1653
1697
|
await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
|
|
1654
1698
|
await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1655
1699
|
if (extracted.raw) {
|
|
1656
|
-
await
|
|
1700
|
+
await writeFile7(rawPath, extracted.raw, "utf8");
|
|
1657
1701
|
}
|
|
1658
1702
|
await writeNormalizedDocument({
|
|
1659
1703
|
documentId,
|
|
@@ -1877,7 +1921,7 @@ async function parseRssFeedDocument(xml, source) {
|
|
|
1877
1921
|
}
|
|
1878
1922
|
|
|
1879
1923
|
// src/ingest/adapters/url-adapter.ts
|
|
1880
|
-
import { mkdir as mkdir9, readFile as readFile9, writeFile as
|
|
1924
|
+
import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile8 } from "fs/promises";
|
|
1881
1925
|
import path16 from "path";
|
|
1882
1926
|
|
|
1883
1927
|
// src/core/urls.ts
|
|
@@ -1930,7 +1974,7 @@ ${extracted.markdown}`;
|
|
|
1930
1974
|
const crawledAt = now;
|
|
1931
1975
|
const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
|
|
1932
1976
|
await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
|
|
1933
|
-
await
|
|
1977
|
+
await writeFile8(rawPath, body, "utf8");
|
|
1934
1978
|
await writeNormalizedDocument({
|
|
1935
1979
|
documentId,
|
|
1936
1980
|
sourceId: source.id,
|
|
@@ -2769,7 +2813,7 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
|
|
|
2769
2813
|
|
|
2770
2814
|
// src/query/search-service.ts
|
|
2771
2815
|
import { readFile as readFile10 } from "fs/promises";
|
|
2772
|
-
import {
|
|
2816
|
+
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2773
2817
|
import path18 from "path";
|
|
2774
2818
|
async function loadHydratedIndex(workspacePath) {
|
|
2775
2819
|
let state;
|
|
@@ -2797,24 +2841,6 @@ function matchesPrefix(value, prefixes) {
|
|
|
2797
2841
|
const lower = value.toLowerCase();
|
|
2798
2842
|
return prefixes.some((prefix) => lower.startsWith(prefix));
|
|
2799
2843
|
}
|
|
2800
|
-
function buildSearchQuery(query, filters) {
|
|
2801
|
-
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2802
|
-
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2803
|
-
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2804
|
-
return new BoolQuery({
|
|
2805
|
-
should: [
|
|
2806
|
-
new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
|
|
2807
|
-
new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
|
|
2808
|
-
new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
|
|
2809
|
-
],
|
|
2810
|
-
filter: [
|
|
2811
|
-
...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
|
|
2812
|
-
...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
|
|
2813
|
-
...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
|
|
2814
|
-
...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
|
|
2815
|
-
]
|
|
2816
|
-
});
|
|
2817
|
-
}
|
|
2818
2844
|
function isValidDate(value) {
|
|
2819
2845
|
return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
|
|
2820
2846
|
}
|
|
@@ -3013,6 +3039,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
3013
3039
|
}
|
|
3014
3040
|
return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
|
|
3015
3041
|
}
|
|
3042
|
+
function buildSearchDslRequest({
|
|
3043
|
+
query,
|
|
3044
|
+
topK,
|
|
3045
|
+
filters,
|
|
3046
|
+
dateRanges
|
|
3047
|
+
}) {
|
|
3048
|
+
const filterClauses = [];
|
|
3049
|
+
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
3050
|
+
const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
|
|
3051
|
+
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
3052
|
+
const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
|
|
3053
|
+
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
3054
|
+
if (sourceIds.length > 0) {
|
|
3055
|
+
filterClauses.push({ terms: { sourceId: sourceIds } });
|
|
3056
|
+
}
|
|
3057
|
+
if (sourceNames.length > 0) {
|
|
3058
|
+
filterClauses.push({ terms: { sourceName: sourceNames } });
|
|
3059
|
+
}
|
|
3060
|
+
if (sourceTypes.length > 0) {
|
|
3061
|
+
filterClauses.push({ terms: { sourceType: sourceTypes } });
|
|
3062
|
+
}
|
|
3063
|
+
if (uriPrefixes.length > 0) {
|
|
3064
|
+
filterClauses.push({
|
|
3065
|
+
bool: {
|
|
3066
|
+
should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
|
|
3067
|
+
minimum_should_match: 1
|
|
3068
|
+
}
|
|
3069
|
+
});
|
|
3070
|
+
}
|
|
3071
|
+
if (tags.length > 0) {
|
|
3072
|
+
filterClauses.push({ terms: { tags } });
|
|
3073
|
+
}
|
|
3074
|
+
if (filters.hasPublicationDate) {
|
|
3075
|
+
filterClauses.push({ exists: { field: "publicationDate" } });
|
|
3076
|
+
}
|
|
3077
|
+
for (const { key, value } of filters.metadata ?? []) {
|
|
3078
|
+
filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
|
|
3079
|
+
}
|
|
3080
|
+
for (const { field, from, to } of dateRanges) {
|
|
3081
|
+
filterClauses.push({
|
|
3082
|
+
range: {
|
|
3083
|
+
[field]: {
|
|
3084
|
+
...from ? { gte: from } : {},
|
|
3085
|
+
...to ? { lte: to } : {}
|
|
3086
|
+
}
|
|
3087
|
+
}
|
|
3088
|
+
});
|
|
3089
|
+
}
|
|
3090
|
+
return {
|
|
3091
|
+
size: topK,
|
|
3092
|
+
query: {
|
|
3093
|
+
bool: {
|
|
3094
|
+
should: [
|
|
3095
|
+
{ match: { title: { query, operator: "and", boost: 6 } } },
|
|
3096
|
+
{ match: { text: { query, operator: "and", boost: 4 } } },
|
|
3097
|
+
{ match: { text: { query, operator: "or", boost: 2 } } }
|
|
3098
|
+
],
|
|
3099
|
+
filter: filterClauses,
|
|
3100
|
+
minimum_should_match: 1
|
|
3101
|
+
}
|
|
3102
|
+
}
|
|
3103
|
+
};
|
|
3104
|
+
}
|
|
3105
|
+
function sourceToChunkRecord(source) {
|
|
3106
|
+
return {
|
|
3107
|
+
id: source.chunkId,
|
|
3108
|
+
documentId: source.documentId,
|
|
3109
|
+
sourceId: source.sourceId,
|
|
3110
|
+
title: source.title,
|
|
3111
|
+
uri: source.uri,
|
|
3112
|
+
headingPath: source.headingPath,
|
|
3113
|
+
text: source.text,
|
|
3114
|
+
contentHash: "",
|
|
3115
|
+
metadata: source.metadata,
|
|
3116
|
+
firstSeenAt: source.firstSeenAt,
|
|
3117
|
+
lastSeenAt: source.lastSeenAt,
|
|
3118
|
+
lastChangedAt: source.lastChangedAt
|
|
3119
|
+
};
|
|
3120
|
+
}
|
|
3121
|
+
function sourceToDocumentRecord(source) {
|
|
3122
|
+
return {
|
|
3123
|
+
id: source.documentId,
|
|
3124
|
+
sourceId: source.sourceId,
|
|
3125
|
+
sourceType: source.sourceType,
|
|
3126
|
+
title: source.title,
|
|
3127
|
+
uri: source.uri,
|
|
3128
|
+
sourceUri: source.uri,
|
|
3129
|
+
mimeType: "text/plain",
|
|
3130
|
+
normalizedPath: source.normalizedPath ?? "",
|
|
3131
|
+
contentHash: "",
|
|
3132
|
+
metadata: source.metadata,
|
|
3133
|
+
publicationDate: source.publicationDate ?? null,
|
|
3134
|
+
crawledAt: source.crawledAt,
|
|
3135
|
+
firstSeenAt: source.firstSeenAt,
|
|
3136
|
+
lastSeenAt: source.lastSeenAt,
|
|
3137
|
+
lastChangedAt: source.lastChangedAt
|
|
3138
|
+
};
|
|
3139
|
+
}
|
|
3140
|
+
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
|
|
3141
|
+
const source = hit._source;
|
|
3142
|
+
const chunk = sourceToChunkRecord(source);
|
|
3143
|
+
const document = sourceToDocumentRecord(source);
|
|
3144
|
+
const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
|
|
3145
|
+
const enrichedSource = {
|
|
3146
|
+
...source,
|
|
3147
|
+
snippet
|
|
3148
|
+
};
|
|
3149
|
+
const result = {
|
|
3150
|
+
chunkId: source.chunkId,
|
|
3151
|
+
documentId: source.documentId,
|
|
3152
|
+
sourceId: source.sourceId,
|
|
3153
|
+
sourceType: source.sourceType,
|
|
3154
|
+
score: hit._score,
|
|
3155
|
+
title: chooseResultTitle(chunk),
|
|
3156
|
+
uri: source.uri,
|
|
3157
|
+
snippet,
|
|
3158
|
+
text: showChunks ? source.text : void 0,
|
|
3159
|
+
publicationDate: source.publicationDate ?? null,
|
|
3160
|
+
firstSeenAt: source.firstSeenAt,
|
|
3161
|
+
lastSeenAt: source.lastSeenAt,
|
|
3162
|
+
lastChangedAt: source.lastChangedAt,
|
|
3163
|
+
metadata: source.metadata
|
|
3164
|
+
};
|
|
3165
|
+
return {
|
|
3166
|
+
hit: {
|
|
3167
|
+
...hit,
|
|
3168
|
+
_source: enrichedSource
|
|
3169
|
+
},
|
|
3170
|
+
result
|
|
3171
|
+
};
|
|
3172
|
+
}
|
|
3173
|
+
function createSearchResponse(retrievalMode, hits, took, aggregations) {
|
|
3174
|
+
return {
|
|
3175
|
+
retrievalMode,
|
|
3176
|
+
took,
|
|
3177
|
+
hits: {
|
|
3178
|
+
total: {
|
|
3179
|
+
value: hits.length,
|
|
3180
|
+
relation: "eq"
|
|
3181
|
+
},
|
|
3182
|
+
max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
|
|
3183
|
+
hits
|
|
3184
|
+
},
|
|
3185
|
+
aggregations
|
|
3186
|
+
};
|
|
3187
|
+
}
|
|
3188
|
+
function searchResultsFromResponse(response2, showChunks = false) {
|
|
3189
|
+
return response2.hits.hits.map((hit) => ({
|
|
3190
|
+
chunkId: hit._source.chunkId,
|
|
3191
|
+
documentId: hit._source.documentId,
|
|
3192
|
+
sourceId: hit._source.sourceId,
|
|
3193
|
+
sourceType: hit._source.sourceType,
|
|
3194
|
+
score: hit._score,
|
|
3195
|
+
title: chooseResultTitle(sourceToChunkRecord(hit._source)),
|
|
3196
|
+
uri: hit._source.uri,
|
|
3197
|
+
snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
|
|
3198
|
+
text: showChunks ? hit._source.text : void 0,
|
|
3199
|
+
publicationDate: hit._source.publicationDate ?? null,
|
|
3200
|
+
firstSeenAt: hit._source.firstSeenAt,
|
|
3201
|
+
lastSeenAt: hit._source.lastSeenAt,
|
|
3202
|
+
lastChangedAt: hit._source.lastChangedAt,
|
|
3203
|
+
metadata: hit._source.metadata
|
|
3204
|
+
}));
|
|
3205
|
+
}
|
|
3206
|
+
async function searchJsonIndex({
|
|
3207
|
+
workspacePath,
|
|
3208
|
+
request,
|
|
3209
|
+
indexName = "querylight"
|
|
3210
|
+
}) {
|
|
3211
|
+
const index = await loadHydratedIndex(workspacePath);
|
|
3212
|
+
return searchJsonDsl({ index, request, indexName });
|
|
3213
|
+
}
|
|
3016
3214
|
function normalizeDisplayTitle(title) {
|
|
3017
3215
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
3018
3216
|
}
|
|
@@ -3150,6 +3348,7 @@ async function searchIndex({
|
|
|
3150
3348
|
retrievalMode,
|
|
3151
3349
|
showChunks = false
|
|
3152
3350
|
}) {
|
|
3351
|
+
const startedAt = Date.now();
|
|
3153
3352
|
const config = await loadConfig(workspacePath);
|
|
3154
3353
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
3155
3354
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
@@ -3206,12 +3405,48 @@ async function searchIndex({
|
|
|
3206
3405
|
};
|
|
3207
3406
|
})
|
|
3208
3407
|
);
|
|
3209
|
-
|
|
3408
|
+
const hits2 = latestResults.filter((result) => result != null).map((result) => {
|
|
3409
|
+
const chunk = chunks.get(result.chunkId);
|
|
3410
|
+
const document = documents.get(result.documentId);
|
|
3411
|
+
const source = sources.get(result.sourceId);
|
|
3412
|
+
return {
|
|
3413
|
+
_index: "querylight",
|
|
3414
|
+
_id: result.chunkId,
|
|
3415
|
+
_score: result.score,
|
|
3416
|
+
_source: {
|
|
3417
|
+
chunkId: result.chunkId,
|
|
3418
|
+
documentId: result.documentId,
|
|
3419
|
+
sourceId: result.sourceId,
|
|
3420
|
+
sourceType: result.sourceType,
|
|
3421
|
+
sourceName: source?.name,
|
|
3422
|
+
title: chunk.title,
|
|
3423
|
+
uri: result.uri,
|
|
3424
|
+
headingPath: chunk.headingPath,
|
|
3425
|
+
text: chunk.text,
|
|
3426
|
+
snippet: result.snippet,
|
|
3427
|
+
normalizedPath: document.normalizedPath,
|
|
3428
|
+
publicationDate: result.publicationDate ?? null,
|
|
3429
|
+
crawledAt: document.crawledAt,
|
|
3430
|
+
firstSeenAt: result.firstSeenAt,
|
|
3431
|
+
lastSeenAt: result.lastSeenAt,
|
|
3432
|
+
lastChangedAt: result.lastChangedAt,
|
|
3433
|
+
metadata: result.metadata
|
|
3434
|
+
}
|
|
3435
|
+
};
|
|
3436
|
+
});
|
|
3437
|
+
return createSearchResponse("lexical", hits2, Date.now() - startedAt);
|
|
3210
3438
|
}
|
|
3211
3439
|
const lexicalHits = async () => {
|
|
3212
|
-
const
|
|
3213
|
-
|
|
3214
|
-
|
|
3440
|
+
const response2 = await searchJsonIndex({
|
|
3441
|
+
workspacePath,
|
|
3442
|
+
request: buildSearchDslRequest({
|
|
3443
|
+
query: normalizedQuery,
|
|
3444
|
+
topK: candidateLimit,
|
|
3445
|
+
filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
|
|
3446
|
+
dateRanges
|
|
3447
|
+
})
|
|
3448
|
+
});
|
|
3449
|
+
return response2.hits.hits;
|
|
3215
3450
|
};
|
|
3216
3451
|
const denseHits = async () => {
|
|
3217
3452
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
@@ -3225,15 +3460,18 @@ async function searchIndex({
|
|
|
3225
3460
|
}
|
|
3226
3461
|
return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
|
|
3227
3462
|
};
|
|
3463
|
+
let lexicalResponseHits = [];
|
|
3228
3464
|
let hits;
|
|
3229
3465
|
if (mode === "lexical") {
|
|
3230
|
-
|
|
3466
|
+
lexicalResponseHits = await lexicalHits();
|
|
3467
|
+
hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
|
|
3231
3468
|
} else if (mode === "dense") {
|
|
3232
3469
|
hits = await denseHits();
|
|
3233
3470
|
} else if (mode === "sparse") {
|
|
3234
3471
|
hits = await sparseHits();
|
|
3235
3472
|
} else {
|
|
3236
|
-
|
|
3473
|
+
lexicalResponseHits = await lexicalHits();
|
|
3474
|
+
const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
|
|
3237
3475
|
if (await fileExists(denseVectorPath(workspacePath))) {
|
|
3238
3476
|
rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
|
|
3239
3477
|
}
|
|
@@ -3242,34 +3480,49 @@ async function searchIndex({
|
|
|
3242
3480
|
}
|
|
3243
3481
|
hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
|
|
3244
3482
|
}
|
|
3245
|
-
const
|
|
3483
|
+
const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
|
|
3246
3484
|
const chunk = chunks.get(chunkId);
|
|
3247
3485
|
if (!chunk) {
|
|
3248
|
-
return
|
|
3486
|
+
return [];
|
|
3249
3487
|
}
|
|
3250
|
-
|
|
3251
|
-
|
|
3252
|
-
|
|
3253
|
-
|
|
3254
|
-
|
|
3255
|
-
score,
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3488
|
+
const document = documents.get(chunk.documentId);
|
|
3489
|
+
const source = sources.get(chunk.sourceId);
|
|
3490
|
+
return [{
|
|
3491
|
+
_index: "querylight",
|
|
3492
|
+
_id: chunkId,
|
|
3493
|
+
_score: score,
|
|
3494
|
+
_source: {
|
|
3495
|
+
chunkId,
|
|
3496
|
+
documentId: chunk.documentId,
|
|
3497
|
+
sourceId: chunk.sourceId,
|
|
3498
|
+
sourceType: document?.sourceType ?? "text",
|
|
3499
|
+
sourceName: source?.name,
|
|
3500
|
+
title: chunk.title,
|
|
3501
|
+
uri: chunk.uri,
|
|
3502
|
+
headingPath: chunk.headingPath,
|
|
3503
|
+
text: chunk.text,
|
|
3504
|
+
normalizedPath: document?.normalizedPath,
|
|
3505
|
+
publicationDate: document?.publicationDate ?? null,
|
|
3506
|
+
crawledAt: document?.crawledAt,
|
|
3507
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
3508
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
3509
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
3510
|
+
metadata: chunk.metadata
|
|
3511
|
+
}
|
|
3512
|
+
}];
|
|
3513
|
+
});
|
|
3514
|
+
const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
|
|
3515
|
+
if (showChunks) {
|
|
3516
|
+
const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
|
|
3517
|
+
return createSearchResponse(mode, topHits, Date.now() - startedAt);
|
|
3518
|
+
}
|
|
3519
|
+
const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
|
|
3520
|
+
const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
|
|
3521
|
+
const finalHits = reranked.map((result) => {
|
|
3522
|
+
const hit = byChunkId.get(result.chunkId);
|
|
3523
|
+
return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
|
|
3524
|
+
}).filter((hit) => hit != null);
|
|
3525
|
+
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
3273
3526
|
}
|
|
3274
3527
|
|
|
3275
3528
|
// src/query/related-service.ts
|
|
@@ -3386,9 +3639,10 @@ async function createContext({
|
|
|
3386
3639
|
retrievalMode
|
|
3387
3640
|
}) {
|
|
3388
3641
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
3642
|
+
const results = searchResultsFromResponse(search, true);
|
|
3389
3643
|
const sources = [];
|
|
3390
3644
|
let total = 0;
|
|
3391
|
-
for (const result of
|
|
3645
|
+
for (const result of results) {
|
|
3392
3646
|
const text = result.text ?? "";
|
|
3393
3647
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
3394
3648
|
break;
|
|
@@ -3489,7 +3743,8 @@ function formatSourcesTable(sources) {
|
|
|
3489
3743
|
}
|
|
3490
3744
|
return table.toString();
|
|
3491
3745
|
}
|
|
3492
|
-
function formatSearchResults(
|
|
3746
|
+
function formatSearchResults(response2) {
|
|
3747
|
+
const results = searchResultsFromResponse(response2);
|
|
3493
3748
|
return results.map((result, index) => [
|
|
3494
3749
|
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3495
3750
|
` URL: ${result.uri}`,
|
|
@@ -3794,6 +4049,19 @@ function parseDateValue(input, optionName) {
|
|
|
3794
4049
|
}
|
|
3795
4050
|
return parsed.toISOString();
|
|
3796
4051
|
}
|
|
4052
|
+
/**
 * Parse the JSON request passed to a CLI command.
 *
 * @param {string} input - Inline JSON text, or `@<path>` to load the JSON text
 *   from a file (the path goes through `path.resolve`, so relative paths are
 *   resolved against the current working directory).
 * @returns {Promise<object>} The parsed request; always a non-null, non-array object.
 * @throws {CliError} `INVALID_ARGUMENT` (exit code 2) when the request file cannot
 *   be read, the text is not valid JSON, or the parsed value is not a JSON object.
 */
async function parseJsonArgument(input) {
  let raw = input;
  if (input.startsWith("@")) {
    try {
      raw = await readFile11(path21.resolve(input.slice(1)), "utf8");
    } catch (error) {
      // A missing/unreadable request file is a user-input problem, so report it
      // through the same CliError channel as malformed JSON instead of leaking
      // a raw fs error to the caller.
      const message = error instanceof Error ? error.message : String(error);
      throw new CliError(`cannot read JSON request file: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
    }
  }
  let parsed;
  try {
    // readFile does not strip a UTF-8 byte order mark and JSON.parse rejects it,
    // so drop a leading BOM before parsing.
    parsed = JSON.parse(raw.replace(/^\uFEFF/, ""));
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  // Scalars, null and arrays are valid JSON but not a valid request object.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new CliError("invalid JSON request: expected a JSON object", "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  return parsed;
}
|
|
3797
4065
|
function searchDateRanges(options) {
|
|
3798
4066
|
const entries = [];
|
|
3799
4067
|
if (options.since || options.until) {
|
|
@@ -4138,7 +4406,7 @@ Examples:
|
|
|
4138
4406
|
progress?.("info", "Rebuild complete");
|
|
4139
4407
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
4140
4408
|
});
|
|
4141
|
-
program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4409
|
+
program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include 
documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4142
4410
|
Examples:
|
|
4143
4411
|
qli search "pricing api limits"
|
|
4144
4412
|
qli search "authentication" --top-k 20 --tag docs
|
|
@@ -4151,6 +4419,7 @@ Examples:
|
|
|
4151
4419
|
Notes:
|
|
4152
4420
|
lexical works without vector models.
|
|
4153
4421
|
dense, sparse, and hybrid require the relevant index artifacts to exist.
|
|
4422
|
+
Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
|
|
4154
4423
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
4155
4424
|
const global = this.optsWithGlobals();
|
|
4156
4425
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
@@ -4169,7 +4438,26 @@ Notes:
|
|
|
4169
4438
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
4170
4439
|
showChunks: Boolean(options.showChunks)
|
|
4171
4440
|
});
|
|
4172
|
-
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result
|
|
4441
|
+
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result));
|
|
4442
|
+
});
|
|
4443
|
+
// Register the `search-json` subcommand. It takes a single <request> argument
// (inline JSON or @path), parses it with parseJsonArgument, passes it to
// searchJsonIndex for the resolved workspace, and emits the result.
program
  .command("search-json")
  .description("Run a raw Querylight 0.11 JSON DSL search request against the lexical index.")
  .argument("<request>", "Inline JSON request or @path/to/request.json.")
  .addHelpText("after", `
Examples:
qli search-json '{"query":{"match":{"text":"authentication"}},"size":5}'
qli search-json @./search-request.json
qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"aggs":{"types":{"terms":{"field":"sourceType","size":5}}}}' --json

Notes:
search-json uses the lexical index and Querylight 0.11 JSON DSL fields.
Stored hit payloads are returned under _source.
Use --json when another tool needs the full response envelope.`)
  // Must stay a `function` expression: the handler reads the command via `this`.
  .action(async function command(requestInput) {
    const { json: wantsJson, workspace: workspaceOption } = this.optsWithGlobals();
    const workspace = await resolveWorkspace({ workspace: workspaceOption });
    const result = await searchJsonIndex({
      workspacePath: workspace,
      request: await parseJsonArgument(requestInput)
    });
    const envelope = response("search-json", workspace, result);
    // Pretty-printed result; presumably the text emit shows when --json is off (confirm in emit).
    const readable = JSON.stringify(result, null, 2);
    emit(wantsJson, capture, envelope, readable);
  });
|
|
4174
4462
|
program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
|
|
4175
4463
|
Examples:
|