@tryformation/querylight-cli 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/cli/format.d.ts +2 -2
- package/dist/cli/main.js +338 -59
- package/dist/core/constants.d.ts +1 -1
- package/dist/index/querylight-indexer.d.ts +2 -2
- package/dist/index.js +301 -54
- package/dist/query/search-service.d.ts +8 -1
- package/dist/types/models.d.ts +36 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -23,7 +23,7 @@ It is designed for local, inspectable workflows:
|
|
|
23
23
|
Run without installing globally:
|
|
24
24
|
|
|
25
25
|
```bash
|
|
26
|
-
bunx @tryformation/querylight-cli init
|
|
26
|
+
bunx --bun @tryformation/querylight-cli init
|
|
27
27
|
```
|
|
28
28
|
|
|
29
29
|
For agent and Python automation examples that use `bunx` and `uv`, see [`examples/skills/qli-bunx-uv/SKILL.md`](https://github.com/formation-res/querylight-cli/blob/main/examples/skills/qli-bunx-uv/SKILL.md).
|
|
@@ -43,9 +43,11 @@ npx qli --help
|
|
|
43
43
|
If you prefer to avoid a local install, use:
|
|
44
44
|
|
|
45
45
|
```bash
|
|
46
|
-
bunx @tryformation/querylight-cli --help
|
|
46
|
+
bunx --bun @tryformation/querylight-cli --help
|
|
47
47
|
```
|
|
48
48
|
|
|
49
|
+
Use `bunx --bun` for repeated or concurrent `bunx` calls. By default, `bunx` respects the CLI shebang and therefore starts `qli` through `node`; passing `--bun` runs it with the Bun runtime instead.
|
|
50
|
+
|
|
49
51
|
## Release
|
|
50
52
|
|
|
51
53
|
Publish releases from semantic version tags such as `0.1.1`.
|
|
@@ -105,6 +107,7 @@ Search it:
|
|
|
105
107
|
```bash
|
|
106
108
|
qli search "API authentication"
|
|
107
109
|
qli search --source-type rss --since 2026-05-01 --has-publication-date
|
|
110
|
+
qli search-json '{"query":{"match":{"text":"API authentication"}},"size":5}'
|
|
108
111
|
```
|
|
109
112
|
|
|
110
113
|
Find related documents for an existing one:
|
|
@@ -127,7 +130,7 @@ The repository includes an example skill for running `qli` without a global inst
|
|
|
127
130
|
|
|
128
131
|
It covers:
|
|
129
132
|
|
|
130
|
-
- running `qli` with `bunx @tryformation/querylight-cli`
|
|
133
|
+
- running `qli` with `bunx --bun @tryformation/querylight-cli`
|
|
131
134
|
- using `--json` for automation and agents
|
|
132
135
|
- calling `qli search` and `qli context` from Python with `subprocess`
|
|
133
136
|
|
|
@@ -361,6 +364,7 @@ qli search --source-type rss,page --since 2026-05-01 --has-publication-date --to
|
|
|
361
364
|
qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
|
|
362
365
|
qli search --source-type rss,page --top-k 25 --json
|
|
363
366
|
qli search "authentication" --json
|
|
367
|
+
qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"size":10}' --json
|
|
364
368
|
```
|
|
365
369
|
|
|
366
370
|
Build retrieval context:
|
package/dist/cli/format.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import type { RelatedDocumentResult,
|
|
1
|
+
import type { RelatedDocumentResult, SearchResponseData, Source } from "../types/models.js";
|
|
2
2
|
export declare function formatSourcesTable(sources: Source[]): string;
|
|
3
|
-
export declare function formatSearchResults(
|
|
3
|
+
export declare function formatSearchResults(response: SearchResponseData): string;
|
|
4
4
|
export declare function formatRelatedDocuments(results: RelatedDocumentResult[]): string;
|
package/dist/cli/main.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
// src/cli/run-cli.ts
|
|
4
4
|
import { Command, Option } from "commander";
|
|
5
|
-
import { stat as stat4 } from "fs/promises";
|
|
5
|
+
import { readFile as readFile11, stat as stat4 } from "fs/promises";
|
|
6
6
|
import path21 from "path";
|
|
7
7
|
|
|
8
8
|
// src/chunk/chunker.ts
|
|
@@ -16,7 +16,7 @@ import path from "path";
|
|
|
16
16
|
import YAML from "yaml";
|
|
17
17
|
|
|
18
18
|
// src/core/constants.ts
|
|
19
|
-
var PACKAGE_VERSION = "0.2.2";
|
|
19
|
+
var PACKAGE_VERSION = "0.2.3";
|
|
20
20
|
var DEFAULT_WORKSPACE = ".kb";
|
|
21
21
|
var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
22
22
|
var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
@@ -374,7 +374,7 @@ async function assertWorkspaceExists(workspacePath) {
|
|
|
374
374
|
}
|
|
375
375
|
|
|
376
376
|
// src/index/querylight-indexer.ts
|
|
377
|
-
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
377
|
+
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
378
378
|
import path11 from "path";
|
|
379
379
|
|
|
380
380
|
// src/vector/dense.ts
|
|
@@ -1091,12 +1091,19 @@ function keywordFieldIndex() {
|
|
|
1091
1091
|
function createIndexMapping(extraFields = []) {
|
|
1092
1092
|
const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
|
|
1093
1093
|
const mapping = {
|
|
1094
|
+
_source: new StoredSourceIndex(),
|
|
1094
1095
|
text: lexical,
|
|
1095
1096
|
title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
|
|
1096
1097
|
uri: keywordFieldIndex(),
|
|
1097
1098
|
sourceId: keywordFieldIndex(),
|
|
1099
|
+
sourceName: keywordFieldIndex(),
|
|
1098
1100
|
tags: keywordFieldIndex(),
|
|
1099
|
-
sourceType: keywordFieldIndex()
|
|
1101
|
+
sourceType: keywordFieldIndex(),
|
|
1102
|
+
publicationDate: new DateFieldIndex(),
|
|
1103
|
+
firstSeenAt: new DateFieldIndex(),
|
|
1104
|
+
lastSeenAt: new DateFieldIndex(),
|
|
1105
|
+
lastChangedAt: new DateFieldIndex(),
|
|
1106
|
+
crawledAt: new DateFieldIndex()
|
|
1100
1107
|
};
|
|
1101
1108
|
for (const field of extraFields) {
|
|
1102
1109
|
mapping[field] = keywordFieldIndex();
|
|
@@ -1132,8 +1139,12 @@ async function buildIndex({
|
|
|
1132
1139
|
const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
|
|
1133
1140
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
1134
1141
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
1142
|
+
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
1143
|
+
const sourcesById = new Map(sources.map((source) => [source.id, source]));
|
|
1135
1144
|
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
1136
1145
|
for (const chunk of chunks) {
|
|
1146
|
+
const document = documentsById.get(chunk.documentId);
|
|
1147
|
+
const source = sourcesById.get(chunk.sourceId);
|
|
1137
1148
|
index.index({
|
|
1138
1149
|
id: chunk.id,
|
|
1139
1150
|
fields: {
|
|
@@ -1141,9 +1152,33 @@ async function buildIndex({
|
|
|
1141
1152
|
title: [chunk.title],
|
|
1142
1153
|
uri: [chunk.uri.toLowerCase()],
|
|
1143
1154
|
sourceId: [chunk.sourceId.toLowerCase()],
|
|
1155
|
+
sourceName: source ? [source.name.toLowerCase()] : [],
|
|
1144
1156
|
tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
|
|
1145
1157
|
sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
|
|
1158
|
+
publicationDate: document?.publicationDate ? [document.publicationDate] : [],
|
|
1159
|
+
firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
|
|
1160
|
+
lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
|
|
1161
|
+
lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
|
|
1162
|
+
crawledAt: document?.crawledAt ? [document.crawledAt] : [],
|
|
1146
1163
|
...flattenMetadata(chunk.metadata)
|
|
1164
|
+
},
|
|
1165
|
+
source: {
|
|
1166
|
+
chunkId: chunk.id,
|
|
1167
|
+
documentId: chunk.documentId,
|
|
1168
|
+
sourceId: chunk.sourceId,
|
|
1169
|
+
sourceType: document?.sourceType ?? "text",
|
|
1170
|
+
sourceName: source?.name,
|
|
1171
|
+
title: chunk.title,
|
|
1172
|
+
uri: chunk.uri,
|
|
1173
|
+
headingPath: chunk.headingPath,
|
|
1174
|
+
text: chunk.text,
|
|
1175
|
+
normalizedPath: document?.normalizedPath,
|
|
1176
|
+
publicationDate: document?.publicationDate ?? null,
|
|
1177
|
+
crawledAt: document?.crawledAt,
|
|
1178
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
1179
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
1180
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
1181
|
+
metadata: chunk.metadata
|
|
1147
1182
|
}
|
|
1148
1183
|
});
|
|
1149
1184
|
}
|
|
@@ -1152,7 +1187,7 @@ async function buildIndex({
|
|
|
1152
1187
|
const metadata = {
|
|
1153
1188
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
1154
1189
|
createdAt,
|
|
1155
|
-
querylightVersion: "0.
|
|
1190
|
+
querylightVersion: "0.11.0",
|
|
1156
1191
|
kbVersion: "0.1.0",
|
|
1157
1192
|
documentCount: documents.length,
|
|
1158
1193
|
chunkCount: chunks.length,
|
|
@@ -2778,7 +2813,7 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
|
|
|
2778
2813
|
|
|
2779
2814
|
// src/query/search-service.ts
|
|
2780
2815
|
import { readFile as readFile10 } from "fs/promises";
|
|
2781
|
-
import {
|
|
2816
|
+
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2782
2817
|
import path18 from "path";
|
|
2783
2818
|
async function loadHydratedIndex(workspacePath) {
|
|
2784
2819
|
let state;
|
|
@@ -2806,24 +2841,6 @@ function matchesPrefix(value, prefixes) {
|
|
|
2806
2841
|
const lower = value.toLowerCase();
|
|
2807
2842
|
return prefixes.some((prefix) => lower.startsWith(prefix));
|
|
2808
2843
|
}
|
|
2809
|
-
function buildSearchQuery(query, filters) {
|
|
2810
|
-
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2811
|
-
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2812
|
-
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2813
|
-
return new BoolQuery({
|
|
2814
|
-
should: [
|
|
2815
|
-
new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
|
|
2816
|
-
new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
|
|
2817
|
-
new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
|
|
2818
|
-
],
|
|
2819
|
-
filter: [
|
|
2820
|
-
...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
|
|
2821
|
-
...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
|
|
2822
|
-
...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
|
|
2823
|
-
...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
|
|
2824
|
-
]
|
|
2825
|
-
});
|
|
2826
|
-
}
|
|
2827
2844
|
function isValidDate(value) {
|
|
2828
2845
|
return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
|
|
2829
2846
|
}
|
|
@@ -3022,6 +3039,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
3022
3039
|
}
|
|
3023
3040
|
return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
|
|
3024
3041
|
}
|
|
3042
|
+
function buildSearchDslRequest({
|
|
3043
|
+
query,
|
|
3044
|
+
topK,
|
|
3045
|
+
filters,
|
|
3046
|
+
dateRanges
|
|
3047
|
+
}) {
|
|
3048
|
+
const filterClauses = [];
|
|
3049
|
+
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
3050
|
+
const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
|
|
3051
|
+
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
3052
|
+
const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
|
|
3053
|
+
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
3054
|
+
if (sourceIds.length > 0) {
|
|
3055
|
+
filterClauses.push({ terms: { sourceId: sourceIds } });
|
|
3056
|
+
}
|
|
3057
|
+
if (sourceNames.length > 0) {
|
|
3058
|
+
filterClauses.push({ terms: { sourceName: sourceNames } });
|
|
3059
|
+
}
|
|
3060
|
+
if (sourceTypes.length > 0) {
|
|
3061
|
+
filterClauses.push({ terms: { sourceType: sourceTypes } });
|
|
3062
|
+
}
|
|
3063
|
+
if (uriPrefixes.length > 0) {
|
|
3064
|
+
filterClauses.push({
|
|
3065
|
+
bool: {
|
|
3066
|
+
should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
|
|
3067
|
+
minimum_should_match: 1
|
|
3068
|
+
}
|
|
3069
|
+
});
|
|
3070
|
+
}
|
|
3071
|
+
if (tags.length > 0) {
|
|
3072
|
+
filterClauses.push({ terms: { tags } });
|
|
3073
|
+
}
|
|
3074
|
+
if (filters.hasPublicationDate) {
|
|
3075
|
+
filterClauses.push({ exists: { field: "publicationDate" } });
|
|
3076
|
+
}
|
|
3077
|
+
for (const { key, value } of filters.metadata ?? []) {
|
|
3078
|
+
filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
|
|
3079
|
+
}
|
|
3080
|
+
for (const { field, from, to } of dateRanges) {
|
|
3081
|
+
filterClauses.push({
|
|
3082
|
+
range: {
|
|
3083
|
+
[field]: {
|
|
3084
|
+
...from ? { gte: from } : {},
|
|
3085
|
+
...to ? { lte: to } : {}
|
|
3086
|
+
}
|
|
3087
|
+
}
|
|
3088
|
+
});
|
|
3089
|
+
}
|
|
3090
|
+
return {
|
|
3091
|
+
size: topK,
|
|
3092
|
+
query: {
|
|
3093
|
+
bool: {
|
|
3094
|
+
should: [
|
|
3095
|
+
{ match: { title: { query, operator: "and", boost: 6 } } },
|
|
3096
|
+
{ match: { text: { query, operator: "and", boost: 4 } } },
|
|
3097
|
+
{ match: { text: { query, operator: "or", boost: 2 } } }
|
|
3098
|
+
],
|
|
3099
|
+
filter: filterClauses,
|
|
3100
|
+
minimum_should_match: 1
|
|
3101
|
+
}
|
|
3102
|
+
}
|
|
3103
|
+
};
|
|
3104
|
+
}
|
|
3105
|
+
function sourceToChunkRecord(source) {
|
|
3106
|
+
return {
|
|
3107
|
+
id: source.chunkId,
|
|
3108
|
+
documentId: source.documentId,
|
|
3109
|
+
sourceId: source.sourceId,
|
|
3110
|
+
title: source.title,
|
|
3111
|
+
uri: source.uri,
|
|
3112
|
+
headingPath: source.headingPath,
|
|
3113
|
+
text: source.text,
|
|
3114
|
+
contentHash: "",
|
|
3115
|
+
metadata: source.metadata,
|
|
3116
|
+
firstSeenAt: source.firstSeenAt,
|
|
3117
|
+
lastSeenAt: source.lastSeenAt,
|
|
3118
|
+
lastChangedAt: source.lastChangedAt
|
|
3119
|
+
};
|
|
3120
|
+
}
|
|
3121
|
+
function sourceToDocumentRecord(source) {
|
|
3122
|
+
return {
|
|
3123
|
+
id: source.documentId,
|
|
3124
|
+
sourceId: source.sourceId,
|
|
3125
|
+
sourceType: source.sourceType,
|
|
3126
|
+
title: source.title,
|
|
3127
|
+
uri: source.uri,
|
|
3128
|
+
sourceUri: source.uri,
|
|
3129
|
+
mimeType: "text/plain",
|
|
3130
|
+
normalizedPath: source.normalizedPath ?? "",
|
|
3131
|
+
contentHash: "",
|
|
3132
|
+
metadata: source.metadata,
|
|
3133
|
+
publicationDate: source.publicationDate ?? null,
|
|
3134
|
+
crawledAt: source.crawledAt,
|
|
3135
|
+
firstSeenAt: source.firstSeenAt,
|
|
3136
|
+
lastSeenAt: source.lastSeenAt,
|
|
3137
|
+
lastChangedAt: source.lastChangedAt
|
|
3138
|
+
};
|
|
3139
|
+
}
|
|
3140
|
+
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
|
|
3141
|
+
const source = hit._source;
|
|
3142
|
+
const chunk = sourceToChunkRecord(source);
|
|
3143
|
+
const document = sourceToDocumentRecord(source);
|
|
3144
|
+
const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
|
|
3145
|
+
const enrichedSource = {
|
|
3146
|
+
...source,
|
|
3147
|
+
snippet
|
|
3148
|
+
};
|
|
3149
|
+
const result = {
|
|
3150
|
+
chunkId: source.chunkId,
|
|
3151
|
+
documentId: source.documentId,
|
|
3152
|
+
sourceId: source.sourceId,
|
|
3153
|
+
sourceType: source.sourceType,
|
|
3154
|
+
score: hit._score,
|
|
3155
|
+
title: chooseResultTitle(chunk),
|
|
3156
|
+
uri: source.uri,
|
|
3157
|
+
snippet,
|
|
3158
|
+
text: showChunks ? source.text : void 0,
|
|
3159
|
+
publicationDate: source.publicationDate ?? null,
|
|
3160
|
+
firstSeenAt: source.firstSeenAt,
|
|
3161
|
+
lastSeenAt: source.lastSeenAt,
|
|
3162
|
+
lastChangedAt: source.lastChangedAt,
|
|
3163
|
+
metadata: source.metadata
|
|
3164
|
+
};
|
|
3165
|
+
return {
|
|
3166
|
+
hit: {
|
|
3167
|
+
...hit,
|
|
3168
|
+
_source: enrichedSource
|
|
3169
|
+
},
|
|
3170
|
+
result
|
|
3171
|
+
};
|
|
3172
|
+
}
|
|
3173
|
+
function createSearchResponse(retrievalMode, hits, took, aggregations) {
|
|
3174
|
+
return {
|
|
3175
|
+
retrievalMode,
|
|
3176
|
+
took,
|
|
3177
|
+
hits: {
|
|
3178
|
+
total: {
|
|
3179
|
+
value: hits.length,
|
|
3180
|
+
relation: "eq"
|
|
3181
|
+
},
|
|
3182
|
+
max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
|
|
3183
|
+
hits
|
|
3184
|
+
},
|
|
3185
|
+
aggregations
|
|
3186
|
+
};
|
|
3187
|
+
}
|
|
3188
|
+
function searchResultsFromResponse(response2, showChunks = false) {
|
|
3189
|
+
return response2.hits.hits.map((hit) => ({
|
|
3190
|
+
chunkId: hit._source.chunkId,
|
|
3191
|
+
documentId: hit._source.documentId,
|
|
3192
|
+
sourceId: hit._source.sourceId,
|
|
3193
|
+
sourceType: hit._source.sourceType,
|
|
3194
|
+
score: hit._score,
|
|
3195
|
+
title: chooseResultTitle(sourceToChunkRecord(hit._source)),
|
|
3196
|
+
uri: hit._source.uri,
|
|
3197
|
+
snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
|
|
3198
|
+
text: showChunks ? hit._source.text : void 0,
|
|
3199
|
+
publicationDate: hit._source.publicationDate ?? null,
|
|
3200
|
+
firstSeenAt: hit._source.firstSeenAt,
|
|
3201
|
+
lastSeenAt: hit._source.lastSeenAt,
|
|
3202
|
+
lastChangedAt: hit._source.lastChangedAt,
|
|
3203
|
+
metadata: hit._source.metadata
|
|
3204
|
+
}));
|
|
3205
|
+
}
|
|
3206
|
+
async function searchJsonIndex({
|
|
3207
|
+
workspacePath,
|
|
3208
|
+
request,
|
|
3209
|
+
indexName = "querylight"
|
|
3210
|
+
}) {
|
|
3211
|
+
const index = await loadHydratedIndex(workspacePath);
|
|
3212
|
+
return searchJsonDsl({ index, request, indexName });
|
|
3213
|
+
}
|
|
3025
3214
|
function normalizeDisplayTitle(title) {
|
|
3026
3215
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
3027
3216
|
}
|
|
@@ -3159,6 +3348,7 @@ async function searchIndex({
|
|
|
3159
3348
|
retrievalMode,
|
|
3160
3349
|
showChunks = false
|
|
3161
3350
|
}) {
|
|
3351
|
+
const startedAt = Date.now();
|
|
3162
3352
|
const config = await loadConfig(workspacePath);
|
|
3163
3353
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
3164
3354
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
@@ -3215,12 +3405,48 @@ async function searchIndex({
|
|
|
3215
3405
|
};
|
|
3216
3406
|
})
|
|
3217
3407
|
);
|
|
3218
|
-
|
|
3408
|
+
const hits2 = latestResults.filter((result) => result != null).map((result) => {
|
|
3409
|
+
const chunk = chunks.get(result.chunkId);
|
|
3410
|
+
const document = documents.get(result.documentId);
|
|
3411
|
+
const source = sources.get(result.sourceId);
|
|
3412
|
+
return {
|
|
3413
|
+
_index: "querylight",
|
|
3414
|
+
_id: result.chunkId,
|
|
3415
|
+
_score: result.score,
|
|
3416
|
+
_source: {
|
|
3417
|
+
chunkId: result.chunkId,
|
|
3418
|
+
documentId: result.documentId,
|
|
3419
|
+
sourceId: result.sourceId,
|
|
3420
|
+
sourceType: result.sourceType,
|
|
3421
|
+
sourceName: source?.name,
|
|
3422
|
+
title: chunk.title,
|
|
3423
|
+
uri: result.uri,
|
|
3424
|
+
headingPath: chunk.headingPath,
|
|
3425
|
+
text: chunk.text,
|
|
3426
|
+
snippet: result.snippet,
|
|
3427
|
+
normalizedPath: document.normalizedPath,
|
|
3428
|
+
publicationDate: result.publicationDate ?? null,
|
|
3429
|
+
crawledAt: document.crawledAt,
|
|
3430
|
+
firstSeenAt: result.firstSeenAt,
|
|
3431
|
+
lastSeenAt: result.lastSeenAt,
|
|
3432
|
+
lastChangedAt: result.lastChangedAt,
|
|
3433
|
+
metadata: result.metadata
|
|
3434
|
+
}
|
|
3435
|
+
};
|
|
3436
|
+
});
|
|
3437
|
+
return createSearchResponse("lexical", hits2, Date.now() - startedAt);
|
|
3219
3438
|
}
|
|
3220
3439
|
const lexicalHits = async () => {
|
|
3221
|
-
const
|
|
3222
|
-
|
|
3223
|
-
|
|
3440
|
+
const response2 = await searchJsonIndex({
|
|
3441
|
+
workspacePath,
|
|
3442
|
+
request: buildSearchDslRequest({
|
|
3443
|
+
query: normalizedQuery,
|
|
3444
|
+
topK: candidateLimit,
|
|
3445
|
+
filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
|
|
3446
|
+
dateRanges
|
|
3447
|
+
})
|
|
3448
|
+
});
|
|
3449
|
+
return response2.hits.hits;
|
|
3224
3450
|
};
|
|
3225
3451
|
const denseHits = async () => {
|
|
3226
3452
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
@@ -3234,15 +3460,18 @@ async function searchIndex({
|
|
|
3234
3460
|
}
|
|
3235
3461
|
return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
|
|
3236
3462
|
};
|
|
3463
|
+
let lexicalResponseHits = [];
|
|
3237
3464
|
let hits;
|
|
3238
3465
|
if (mode === "lexical") {
|
|
3239
|
-
|
|
3466
|
+
lexicalResponseHits = await lexicalHits();
|
|
3467
|
+
hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
|
|
3240
3468
|
} else if (mode === "dense") {
|
|
3241
3469
|
hits = await denseHits();
|
|
3242
3470
|
} else if (mode === "sparse") {
|
|
3243
3471
|
hits = await sparseHits();
|
|
3244
3472
|
} else {
|
|
3245
|
-
|
|
3473
|
+
lexicalResponseHits = await lexicalHits();
|
|
3474
|
+
const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
|
|
3246
3475
|
if (await fileExists(denseVectorPath(workspacePath))) {
|
|
3247
3476
|
rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
|
|
3248
3477
|
}
|
|
@@ -3251,34 +3480,49 @@ async function searchIndex({
|
|
|
3251
3480
|
}
|
|
3252
3481
|
hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
|
|
3253
3482
|
}
|
|
3254
|
-
const
|
|
3483
|
+
const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
|
|
3255
3484
|
const chunk = chunks.get(chunkId);
|
|
3256
3485
|
if (!chunk) {
|
|
3257
|
-
return
|
|
3486
|
+
return [];
|
|
3258
3487
|
}
|
|
3259
|
-
|
|
3260
|
-
|
|
3261
|
-
|
|
3262
|
-
|
|
3263
|
-
|
|
3264
|
-
score,
|
|
3265
|
-
|
|
3266
|
-
|
|
3267
|
-
|
|
3268
|
-
|
|
3269
|
-
|
|
3270
|
-
|
|
3271
|
-
|
|
3272
|
-
|
|
3273
|
-
|
|
3274
|
-
|
|
3275
|
-
|
|
3276
|
-
|
|
3277
|
-
|
|
3278
|
-
|
|
3279
|
-
|
|
3280
|
-
|
|
3281
|
-
|
|
3488
|
+
const document = documents.get(chunk.documentId);
|
|
3489
|
+
const source = sources.get(chunk.sourceId);
|
|
3490
|
+
return [{
|
|
3491
|
+
_index: "querylight",
|
|
3492
|
+
_id: chunkId,
|
|
3493
|
+
_score: score,
|
|
3494
|
+
_source: {
|
|
3495
|
+
chunkId,
|
|
3496
|
+
documentId: chunk.documentId,
|
|
3497
|
+
sourceId: chunk.sourceId,
|
|
3498
|
+
sourceType: document?.sourceType ?? "text",
|
|
3499
|
+
sourceName: source?.name,
|
|
3500
|
+
title: chunk.title,
|
|
3501
|
+
uri: chunk.uri,
|
|
3502
|
+
headingPath: chunk.headingPath,
|
|
3503
|
+
text: chunk.text,
|
|
3504
|
+
normalizedPath: document?.normalizedPath,
|
|
3505
|
+
publicationDate: document?.publicationDate ?? null,
|
|
3506
|
+
crawledAt: document?.crawledAt,
|
|
3507
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
3508
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
3509
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
3510
|
+
metadata: chunk.metadata
|
|
3511
|
+
}
|
|
3512
|
+
}];
|
|
3513
|
+
});
|
|
3514
|
+
const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
|
|
3515
|
+
if (showChunks) {
|
|
3516
|
+
const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
|
|
3517
|
+
return createSearchResponse(mode, topHits, Date.now() - startedAt);
|
|
3518
|
+
}
|
|
3519
|
+
const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
|
|
3520
|
+
const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
|
|
3521
|
+
const finalHits = reranked.map((result) => {
|
|
3522
|
+
const hit = byChunkId.get(result.chunkId);
|
|
3523
|
+
return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
|
|
3524
|
+
}).filter((hit) => hit != null);
|
|
3525
|
+
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
3282
3526
|
}
|
|
3283
3527
|
|
|
3284
3528
|
// src/query/related-service.ts
|
|
@@ -3395,9 +3639,10 @@ async function createContext({
|
|
|
3395
3639
|
retrievalMode
|
|
3396
3640
|
}) {
|
|
3397
3641
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
3642
|
+
const results = searchResultsFromResponse(search, true);
|
|
3398
3643
|
const sources = [];
|
|
3399
3644
|
let total = 0;
|
|
3400
|
-
for (const result of
|
|
3645
|
+
for (const result of results) {
|
|
3401
3646
|
const text = result.text ?? "";
|
|
3402
3647
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
3403
3648
|
break;
|
|
@@ -3498,7 +3743,8 @@ function formatSourcesTable(sources) {
|
|
|
3498
3743
|
}
|
|
3499
3744
|
return table.toString();
|
|
3500
3745
|
}
|
|
3501
|
-
function formatSearchResults(
|
|
3746
|
+
function formatSearchResults(response2) {
|
|
3747
|
+
const results = searchResultsFromResponse(response2);
|
|
3502
3748
|
return results.map((result, index) => [
|
|
3503
3749
|
`${index + 1}. ${colors.bold(result.title)}`,
|
|
3504
3750
|
` URL: ${result.uri}`,
|
|
@@ -3803,6 +4049,19 @@ function parseDateValue(input, optionName) {
|
|
|
3803
4049
|
}
|
|
3804
4050
|
return parsed.toISOString();
|
|
3805
4051
|
}
|
|
4052
|
+
async function parseJsonArgument(input) {
|
|
4053
|
+
const raw = input.startsWith("@") ? await readFile11(path21.resolve(input.slice(1)), "utf8") : input;
|
|
4054
|
+
try {
|
|
4055
|
+
const parsed = JSON.parse(raw);
|
|
4056
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
4057
|
+
throw new Error("expected a JSON object");
|
|
4058
|
+
}
|
|
4059
|
+
return parsed;
|
|
4060
|
+
} catch (error) {
|
|
4061
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
4062
|
+
throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
|
|
4063
|
+
}
|
|
4064
|
+
}
|
|
3806
4065
|
function searchDateRanges(options) {
|
|
3807
4066
|
const entries = [];
|
|
3808
4067
|
if (options.since || options.until) {
|
|
@@ -4147,7 +4406,7 @@ Examples:
|
|
|
4147
4406
|
progress?.("info", "Rebuild complete");
|
|
4148
4407
|
emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
|
|
4149
4408
|
});
|
|
4150
|
-
program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4409
|
+
program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include 
documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
|
|
4151
4410
|
Examples:
|
|
4152
4411
|
qli search "pricing api limits"
|
|
4153
4412
|
qli search "authentication" --top-k 20 --tag docs
|
|
@@ -4160,6 +4419,7 @@ Examples:
|
|
|
4160
4419
|
Notes:
|
|
4161
4420
|
lexical works without vector models.
|
|
4162
4421
|
dense, sparse, and hybrid require the relevant index artifacts to exist.
|
|
4422
|
+
Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
|
|
4163
4423
|
When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
|
|
4164
4424
|
const global = this.optsWithGlobals();
|
|
4165
4425
|
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
@@ -4178,7 +4438,26 @@ Notes:
|
|
|
4178
4438
|
retrievalMode: parseRetrievalMode(options.retrieval),
|
|
4179
4439
|
showChunks: Boolean(options.showChunks)
|
|
4180
4440
|
});
|
|
4181
|
-
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result
|
|
4441
|
+
emit(global.json, capture, response("search", workspace, result), formatSearchResults(result));
|
|
4442
|
+
});
|
|
4443
|
+
program.command("search-json").description("Run a raw Querylight 0.11 JSON DSL search request against the lexical index.").argument("<request>", "Inline JSON request or @path/to/request.json.").addHelpText("after", `
|
|
4444
|
+
Examples:
|
|
4445
|
+
qli search-json '{"query":{"match":{"text":"authentication"}},"size":5}'
|
|
4446
|
+
qli search-json @./search-request.json
|
|
4447
|
+
qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"aggs":{"types":{"terms":{"field":"sourceType","size":5}}}}' --json
|
|
4448
|
+
|
|
4449
|
+
Notes:
|
|
4450
|
+
search-json uses the lexical index and Querylight 0.11 JSON DSL fields.
|
|
4451
|
+
Stored hit payloads are returned under _source.
|
|
4452
|
+
Use --json when another tool needs the full response envelope.`).action(async function command(requestInput) {
|
|
4453
|
+
const global = this.optsWithGlobals();
|
|
4454
|
+
const workspace = await resolveWorkspace({ workspace: global.workspace });
|
|
4455
|
+
const request = await parseJsonArgument(requestInput);
|
|
4456
|
+
const result = await searchJsonIndex({
|
|
4457
|
+
workspacePath: workspace,
|
|
4458
|
+
request
|
|
4459
|
+
});
|
|
4460
|
+
emit(global.json, capture, response("search-json", workspace, result), JSON.stringify(result, null, 2));
|
|
4182
4461
|
});
|
|
4183
4462
|
program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
|
|
4184
4463
|
Examples:
|
package/dist/core/constants.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare const PACKAGE_NAME = "@tryformation/querylight-cli";
|
|
2
|
-
export declare const PACKAGE_VERSION = "0.2.2";
|
|
2
|
+
export declare const PACKAGE_VERSION = "0.2.3";
|
|
3
3
|
export declare const DEFAULT_WORKSPACE = ".kb";
|
|
4
4
|
export declare const DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
5
5
|
export declare const LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { type FieldIndex } from "@tryformation/querylight-ts";
|
|
2
2
|
import { type ProgressHandler } from "../core/progress.js";
|
|
3
3
|
import type { IndexMetadata } from "../types/models.js";
|
|
4
|
-
export declare function createIndexMapping(extraFields?: string[]): Record<string,
|
|
4
|
+
export declare function createIndexMapping(extraFields?: string[]): Record<string, FieldIndex>;
|
|
5
5
|
export declare function buildIndex({ workspacePath, denseOverride, sparseOverride, buildAvailableModels, progress }: {
|
|
6
6
|
workspacePath: string;
|
|
7
7
|
denseOverride?: boolean;
|
package/dist/index.js
CHANGED
|
@@ -1782,7 +1782,7 @@ async function chunkDocuments({
|
|
|
1782
1782
|
}
|
|
1783
1783
|
|
|
1784
1784
|
// src/index/querylight-indexer.ts
|
|
1785
|
-
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1785
|
+
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1786
1786
|
import path17 from "path";
|
|
1787
1787
|
|
|
1788
1788
|
// src/vector/dense.ts
|
|
@@ -2379,12 +2379,19 @@ function keywordFieldIndex() {
|
|
|
2379
2379
|
function createIndexMapping(extraFields = []) {
|
|
2380
2380
|
const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
|
|
2381
2381
|
const mapping = {
|
|
2382
|
+
_source: new StoredSourceIndex(),
|
|
2382
2383
|
text: lexical,
|
|
2383
2384
|
title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
|
|
2384
2385
|
uri: keywordFieldIndex(),
|
|
2385
2386
|
sourceId: keywordFieldIndex(),
|
|
2387
|
+
sourceName: keywordFieldIndex(),
|
|
2386
2388
|
tags: keywordFieldIndex(),
|
|
2387
|
-
sourceType: keywordFieldIndex()
|
|
2389
|
+
sourceType: keywordFieldIndex(),
|
|
2390
|
+
publicationDate: new DateFieldIndex(),
|
|
2391
|
+
firstSeenAt: new DateFieldIndex(),
|
|
2392
|
+
lastSeenAt: new DateFieldIndex(),
|
|
2393
|
+
lastChangedAt: new DateFieldIndex(),
|
|
2394
|
+
crawledAt: new DateFieldIndex()
|
|
2388
2395
|
};
|
|
2389
2396
|
for (const field of extraFields) {
|
|
2390
2397
|
mapping[field] = keywordFieldIndex();
|
|
@@ -2420,8 +2427,12 @@ async function buildIndex({
|
|
|
2420
2427
|
const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
|
|
2421
2428
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
2422
2429
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
2430
|
+
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
2431
|
+
const sourcesById = new Map(sources.map((source) => [source.id, source]));
|
|
2423
2432
|
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
2424
2433
|
for (const chunk of chunks) {
|
|
2434
|
+
const document = documentsById.get(chunk.documentId);
|
|
2435
|
+
const source = sourcesById.get(chunk.sourceId);
|
|
2425
2436
|
index.index({
|
|
2426
2437
|
id: chunk.id,
|
|
2427
2438
|
fields: {
|
|
@@ -2429,9 +2440,33 @@ async function buildIndex({
|
|
|
2429
2440
|
title: [chunk.title],
|
|
2430
2441
|
uri: [chunk.uri.toLowerCase()],
|
|
2431
2442
|
sourceId: [chunk.sourceId.toLowerCase()],
|
|
2443
|
+
sourceName: source ? [source.name.toLowerCase()] : [],
|
|
2432
2444
|
tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
|
|
2433
2445
|
sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
|
|
2446
|
+
publicationDate: document?.publicationDate ? [document.publicationDate] : [],
|
|
2447
|
+
firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
|
|
2448
|
+
lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
|
|
2449
|
+
lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
|
|
2450
|
+
crawledAt: document?.crawledAt ? [document.crawledAt] : [],
|
|
2434
2451
|
...flattenMetadata(chunk.metadata)
|
|
2452
|
+
},
|
|
2453
|
+
source: {
|
|
2454
|
+
chunkId: chunk.id,
|
|
2455
|
+
documentId: chunk.documentId,
|
|
2456
|
+
sourceId: chunk.sourceId,
|
|
2457
|
+
sourceType: document?.sourceType ?? "text",
|
|
2458
|
+
sourceName: source?.name,
|
|
2459
|
+
title: chunk.title,
|
|
2460
|
+
uri: chunk.uri,
|
|
2461
|
+
headingPath: chunk.headingPath,
|
|
2462
|
+
text: chunk.text,
|
|
2463
|
+
normalizedPath: document?.normalizedPath,
|
|
2464
|
+
publicationDate: document?.publicationDate ?? null,
|
|
2465
|
+
crawledAt: document?.crawledAt,
|
|
2466
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
2467
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
2468
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
2469
|
+
metadata: chunk.metadata
|
|
2435
2470
|
}
|
|
2436
2471
|
});
|
|
2437
2472
|
}
|
|
@@ -2440,7 +2475,7 @@ async function buildIndex({
|
|
|
2440
2475
|
const metadata = {
|
|
2441
2476
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
2442
2477
|
createdAt,
|
|
2443
|
-
querylightVersion: "0.
|
|
2478
|
+
querylightVersion: "0.11.0",
|
|
2444
2479
|
kbVersion: "0.1.0",
|
|
2445
2480
|
documentCount: documents.length,
|
|
2446
2481
|
chunkCount: chunks.length,
|
|
@@ -2469,7 +2504,7 @@ async function buildIndex({
|
|
|
2469
2504
|
|
|
2470
2505
|
// src/query/search-service.ts
|
|
2471
2506
|
import { readFile as readFile10 } from "fs/promises";
|
|
2472
|
-
import {
|
|
2507
|
+
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2473
2508
|
import path18 from "path";
|
|
2474
2509
|
async function loadHydratedIndex(workspacePath) {
|
|
2475
2510
|
let state;
|
|
@@ -2497,24 +2532,6 @@ function matchesPrefix(value, prefixes) {
|
|
|
2497
2532
|
const lower = value.toLowerCase();
|
|
2498
2533
|
return prefixes.some((prefix) => lower.startsWith(prefix));
|
|
2499
2534
|
}
|
|
2500
|
-
function buildSearchQuery(query, filters) {
|
|
2501
|
-
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2502
|
-
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2503
|
-
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2504
|
-
return new BoolQuery({
|
|
2505
|
-
should: [
|
|
2506
|
-
new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
|
|
2507
|
-
new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
|
|
2508
|
-
new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
|
|
2509
|
-
],
|
|
2510
|
-
filter: [
|
|
2511
|
-
...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
|
|
2512
|
-
...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
|
|
2513
|
-
...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
|
|
2514
|
-
...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
|
|
2515
|
-
]
|
|
2516
|
-
});
|
|
2517
|
-
}
|
|
2518
2535
|
function isValidDate(value) {
|
|
2519
2536
|
return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
|
|
2520
2537
|
}
|
|
@@ -2713,6 +2730,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2713
2730
|
}
|
|
2714
2731
|
return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
|
|
2715
2732
|
}
|
|
2733
|
+
function buildSearchDslRequest({
|
|
2734
|
+
query,
|
|
2735
|
+
topK,
|
|
2736
|
+
filters,
|
|
2737
|
+
dateRanges
|
|
2738
|
+
}) {
|
|
2739
|
+
const filterClauses = [];
|
|
2740
|
+
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2741
|
+
const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
|
|
2742
|
+
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2743
|
+
const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
|
|
2744
|
+
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2745
|
+
if (sourceIds.length > 0) {
|
|
2746
|
+
filterClauses.push({ terms: { sourceId: sourceIds } });
|
|
2747
|
+
}
|
|
2748
|
+
if (sourceNames.length > 0) {
|
|
2749
|
+
filterClauses.push({ terms: { sourceName: sourceNames } });
|
|
2750
|
+
}
|
|
2751
|
+
if (sourceTypes.length > 0) {
|
|
2752
|
+
filterClauses.push({ terms: { sourceType: sourceTypes } });
|
|
2753
|
+
}
|
|
2754
|
+
if (uriPrefixes.length > 0) {
|
|
2755
|
+
filterClauses.push({
|
|
2756
|
+
bool: {
|
|
2757
|
+
should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
|
|
2758
|
+
minimum_should_match: 1
|
|
2759
|
+
}
|
|
2760
|
+
});
|
|
2761
|
+
}
|
|
2762
|
+
if (tags.length > 0) {
|
|
2763
|
+
filterClauses.push({ terms: { tags } });
|
|
2764
|
+
}
|
|
2765
|
+
if (filters.hasPublicationDate) {
|
|
2766
|
+
filterClauses.push({ exists: { field: "publicationDate" } });
|
|
2767
|
+
}
|
|
2768
|
+
for (const { key, value } of filters.metadata ?? []) {
|
|
2769
|
+
filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
|
|
2770
|
+
}
|
|
2771
|
+
for (const { field, from, to } of dateRanges) {
|
|
2772
|
+
filterClauses.push({
|
|
2773
|
+
range: {
|
|
2774
|
+
[field]: {
|
|
2775
|
+
...from ? { gte: from } : {},
|
|
2776
|
+
...to ? { lte: to } : {}
|
|
2777
|
+
}
|
|
2778
|
+
}
|
|
2779
|
+
});
|
|
2780
|
+
}
|
|
2781
|
+
return {
|
|
2782
|
+
size: topK,
|
|
2783
|
+
query: {
|
|
2784
|
+
bool: {
|
|
2785
|
+
should: [
|
|
2786
|
+
{ match: { title: { query, operator: "and", boost: 6 } } },
|
|
2787
|
+
{ match: { text: { query, operator: "and", boost: 4 } } },
|
|
2788
|
+
{ match: { text: { query, operator: "or", boost: 2 } } }
|
|
2789
|
+
],
|
|
2790
|
+
filter: filterClauses,
|
|
2791
|
+
minimum_should_match: 1
|
|
2792
|
+
}
|
|
2793
|
+
}
|
|
2794
|
+
};
|
|
2795
|
+
}
|
|
2796
|
+
function sourceToChunkRecord(source) {
|
|
2797
|
+
return {
|
|
2798
|
+
id: source.chunkId,
|
|
2799
|
+
documentId: source.documentId,
|
|
2800
|
+
sourceId: source.sourceId,
|
|
2801
|
+
title: source.title,
|
|
2802
|
+
uri: source.uri,
|
|
2803
|
+
headingPath: source.headingPath,
|
|
2804
|
+
text: source.text,
|
|
2805
|
+
contentHash: "",
|
|
2806
|
+
metadata: source.metadata,
|
|
2807
|
+
firstSeenAt: source.firstSeenAt,
|
|
2808
|
+
lastSeenAt: source.lastSeenAt,
|
|
2809
|
+
lastChangedAt: source.lastChangedAt
|
|
2810
|
+
};
|
|
2811
|
+
}
|
|
2812
|
+
function sourceToDocumentRecord(source) {
|
|
2813
|
+
return {
|
|
2814
|
+
id: source.documentId,
|
|
2815
|
+
sourceId: source.sourceId,
|
|
2816
|
+
sourceType: source.sourceType,
|
|
2817
|
+
title: source.title,
|
|
2818
|
+
uri: source.uri,
|
|
2819
|
+
sourceUri: source.uri,
|
|
2820
|
+
mimeType: "text/plain",
|
|
2821
|
+
normalizedPath: source.normalizedPath ?? "",
|
|
2822
|
+
contentHash: "",
|
|
2823
|
+
metadata: source.metadata,
|
|
2824
|
+
publicationDate: source.publicationDate ?? null,
|
|
2825
|
+
crawledAt: source.crawledAt,
|
|
2826
|
+
firstSeenAt: source.firstSeenAt,
|
|
2827
|
+
lastSeenAt: source.lastSeenAt,
|
|
2828
|
+
lastChangedAt: source.lastChangedAt
|
|
2829
|
+
};
|
|
2830
|
+
}
|
|
2831
|
+
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
|
|
2832
|
+
const source = hit._source;
|
|
2833
|
+
const chunk = sourceToChunkRecord(source);
|
|
2834
|
+
const document = sourceToDocumentRecord(source);
|
|
2835
|
+
const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
|
|
2836
|
+
const enrichedSource = {
|
|
2837
|
+
...source,
|
|
2838
|
+
snippet
|
|
2839
|
+
};
|
|
2840
|
+
const result = {
|
|
2841
|
+
chunkId: source.chunkId,
|
|
2842
|
+
documentId: source.documentId,
|
|
2843
|
+
sourceId: source.sourceId,
|
|
2844
|
+
sourceType: source.sourceType,
|
|
2845
|
+
score: hit._score,
|
|
2846
|
+
title: chooseResultTitle(chunk),
|
|
2847
|
+
uri: source.uri,
|
|
2848
|
+
snippet,
|
|
2849
|
+
text: showChunks ? source.text : void 0,
|
|
2850
|
+
publicationDate: source.publicationDate ?? null,
|
|
2851
|
+
firstSeenAt: source.firstSeenAt,
|
|
2852
|
+
lastSeenAt: source.lastSeenAt,
|
|
2853
|
+
lastChangedAt: source.lastChangedAt,
|
|
2854
|
+
metadata: source.metadata
|
|
2855
|
+
};
|
|
2856
|
+
return {
|
|
2857
|
+
hit: {
|
|
2858
|
+
...hit,
|
|
2859
|
+
_source: enrichedSource
|
|
2860
|
+
},
|
|
2861
|
+
result
|
|
2862
|
+
};
|
|
2863
|
+
}
|
|
2864
|
+
function createSearchResponse(retrievalMode, hits, took, aggregations) {
|
|
2865
|
+
return {
|
|
2866
|
+
retrievalMode,
|
|
2867
|
+
took,
|
|
2868
|
+
hits: {
|
|
2869
|
+
total: {
|
|
2870
|
+
value: hits.length,
|
|
2871
|
+
relation: "eq"
|
|
2872
|
+
},
|
|
2873
|
+
max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
|
|
2874
|
+
hits
|
|
2875
|
+
},
|
|
2876
|
+
aggregations
|
|
2877
|
+
};
|
|
2878
|
+
}
|
|
2879
|
+
function searchResultsFromResponse(response, showChunks = false) {
|
|
2880
|
+
return response.hits.hits.map((hit) => ({
|
|
2881
|
+
chunkId: hit._source.chunkId,
|
|
2882
|
+
documentId: hit._source.documentId,
|
|
2883
|
+
sourceId: hit._source.sourceId,
|
|
2884
|
+
sourceType: hit._source.sourceType,
|
|
2885
|
+
score: hit._score,
|
|
2886
|
+
title: chooseResultTitle(sourceToChunkRecord(hit._source)),
|
|
2887
|
+
uri: hit._source.uri,
|
|
2888
|
+
snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
|
|
2889
|
+
text: showChunks ? hit._source.text : void 0,
|
|
2890
|
+
publicationDate: hit._source.publicationDate ?? null,
|
|
2891
|
+
firstSeenAt: hit._source.firstSeenAt,
|
|
2892
|
+
lastSeenAt: hit._source.lastSeenAt,
|
|
2893
|
+
lastChangedAt: hit._source.lastChangedAt,
|
|
2894
|
+
metadata: hit._source.metadata
|
|
2895
|
+
}));
|
|
2896
|
+
}
|
|
2897
|
+
async function searchJsonIndex({
|
|
2898
|
+
workspacePath,
|
|
2899
|
+
request,
|
|
2900
|
+
indexName = "querylight"
|
|
2901
|
+
}) {
|
|
2902
|
+
const index = await loadHydratedIndex(workspacePath);
|
|
2903
|
+
return searchJsonDsl({ index, request, indexName });
|
|
2904
|
+
}
|
|
2716
2905
|
function normalizeDisplayTitle(title) {
|
|
2717
2906
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
2718
2907
|
}
|
|
@@ -2850,6 +3039,7 @@ async function searchIndex({
|
|
|
2850
3039
|
retrievalMode,
|
|
2851
3040
|
showChunks = false
|
|
2852
3041
|
}) {
|
|
3042
|
+
const startedAt = Date.now();
|
|
2853
3043
|
const config = await loadConfig(workspacePath);
|
|
2854
3044
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
2855
3045
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
@@ -2906,12 +3096,48 @@ async function searchIndex({
|
|
|
2906
3096
|
};
|
|
2907
3097
|
})
|
|
2908
3098
|
);
|
|
2909
|
-
|
|
3099
|
+
const hits2 = latestResults.filter((result) => result != null).map((result) => {
|
|
3100
|
+
const chunk = chunks.get(result.chunkId);
|
|
3101
|
+
const document = documents.get(result.documentId);
|
|
3102
|
+
const source = sources.get(result.sourceId);
|
|
3103
|
+
return {
|
|
3104
|
+
_index: "querylight",
|
|
3105
|
+
_id: result.chunkId,
|
|
3106
|
+
_score: result.score,
|
|
3107
|
+
_source: {
|
|
3108
|
+
chunkId: result.chunkId,
|
|
3109
|
+
documentId: result.documentId,
|
|
3110
|
+
sourceId: result.sourceId,
|
|
3111
|
+
sourceType: result.sourceType,
|
|
3112
|
+
sourceName: source?.name,
|
|
3113
|
+
title: chunk.title,
|
|
3114
|
+
uri: result.uri,
|
|
3115
|
+
headingPath: chunk.headingPath,
|
|
3116
|
+
text: chunk.text,
|
|
3117
|
+
snippet: result.snippet,
|
|
3118
|
+
normalizedPath: document.normalizedPath,
|
|
3119
|
+
publicationDate: result.publicationDate ?? null,
|
|
3120
|
+
crawledAt: document.crawledAt,
|
|
3121
|
+
firstSeenAt: result.firstSeenAt,
|
|
3122
|
+
lastSeenAt: result.lastSeenAt,
|
|
3123
|
+
lastChangedAt: result.lastChangedAt,
|
|
3124
|
+
metadata: result.metadata
|
|
3125
|
+
}
|
|
3126
|
+
};
|
|
3127
|
+
});
|
|
3128
|
+
return createSearchResponse("lexical", hits2, Date.now() - startedAt);
|
|
2910
3129
|
}
|
|
2911
3130
|
const lexicalHits = async () => {
|
|
2912
|
-
const
|
|
2913
|
-
|
|
2914
|
-
|
|
3131
|
+
const response = await searchJsonIndex({
|
|
3132
|
+
workspacePath,
|
|
3133
|
+
request: buildSearchDslRequest({
|
|
3134
|
+
query: normalizedQuery,
|
|
3135
|
+
topK: candidateLimit,
|
|
3136
|
+
filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
|
|
3137
|
+
dateRanges
|
|
3138
|
+
})
|
|
3139
|
+
});
|
|
3140
|
+
return response.hits.hits;
|
|
2915
3141
|
};
|
|
2916
3142
|
const denseHits = async () => {
|
|
2917
3143
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
@@ -2925,15 +3151,18 @@ async function searchIndex({
|
|
|
2925
3151
|
}
|
|
2926
3152
|
return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
|
|
2927
3153
|
};
|
|
3154
|
+
let lexicalResponseHits = [];
|
|
2928
3155
|
let hits;
|
|
2929
3156
|
if (mode === "lexical") {
|
|
2930
|
-
|
|
3157
|
+
lexicalResponseHits = await lexicalHits();
|
|
3158
|
+
hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
|
|
2931
3159
|
} else if (mode === "dense") {
|
|
2932
3160
|
hits = await denseHits();
|
|
2933
3161
|
} else if (mode === "sparse") {
|
|
2934
3162
|
hits = await sparseHits();
|
|
2935
3163
|
} else {
|
|
2936
|
-
|
|
3164
|
+
lexicalResponseHits = await lexicalHits();
|
|
3165
|
+
const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
|
|
2937
3166
|
if (await fileExists(denseVectorPath(workspacePath))) {
|
|
2938
3167
|
rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
|
|
2939
3168
|
}
|
|
@@ -2942,34 +3171,49 @@ async function searchIndex({
|
|
|
2942
3171
|
}
|
|
2943
3172
|
hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
|
|
2944
3173
|
}
|
|
2945
|
-
const
|
|
3174
|
+
const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
|
|
2946
3175
|
const chunk = chunks.get(chunkId);
|
|
2947
3176
|
if (!chunk) {
|
|
2948
|
-
return
|
|
3177
|
+
return [];
|
|
2949
3178
|
}
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
score,
|
|
2956
|
-
|
|
2957
|
-
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
|
|
2971
|
-
|
|
2972
|
-
|
|
3179
|
+
const document = documents.get(chunk.documentId);
|
|
3180
|
+
const source = sources.get(chunk.sourceId);
|
|
3181
|
+
return [{
|
|
3182
|
+
_index: "querylight",
|
|
3183
|
+
_id: chunkId,
|
|
3184
|
+
_score: score,
|
|
3185
|
+
_source: {
|
|
3186
|
+
chunkId,
|
|
3187
|
+
documentId: chunk.documentId,
|
|
3188
|
+
sourceId: chunk.sourceId,
|
|
3189
|
+
sourceType: document?.sourceType ?? "text",
|
|
3190
|
+
sourceName: source?.name,
|
|
3191
|
+
title: chunk.title,
|
|
3192
|
+
uri: chunk.uri,
|
|
3193
|
+
headingPath: chunk.headingPath,
|
|
3194
|
+
text: chunk.text,
|
|
3195
|
+
normalizedPath: document?.normalizedPath,
|
|
3196
|
+
publicationDate: document?.publicationDate ?? null,
|
|
3197
|
+
crawledAt: document?.crawledAt,
|
|
3198
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
3199
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
3200
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
3201
|
+
metadata: chunk.metadata
|
|
3202
|
+
}
|
|
3203
|
+
}];
|
|
3204
|
+
});
|
|
3205
|
+
const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
|
|
3206
|
+
if (showChunks) {
|
|
3207
|
+
const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
|
|
3208
|
+
return createSearchResponse(mode, topHits, Date.now() - startedAt);
|
|
3209
|
+
}
|
|
3210
|
+
const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
|
|
3211
|
+
const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
|
|
3212
|
+
const finalHits = reranked.map((result) => {
|
|
3213
|
+
const hit = byChunkId.get(result.chunkId);
|
|
3214
|
+
return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
|
|
3215
|
+
}).filter((hit) => hit != null);
|
|
3216
|
+
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
2973
3217
|
}
|
|
2974
3218
|
|
|
2975
3219
|
// src/query/related-service.ts
|
|
@@ -3086,9 +3330,10 @@ async function createContext({
|
|
|
3086
3330
|
retrievalMode
|
|
3087
3331
|
}) {
|
|
3088
3332
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
3333
|
+
const results = searchResultsFromResponse(search, true);
|
|
3089
3334
|
const sources = [];
|
|
3090
3335
|
let total = 0;
|
|
3091
|
-
for (const result of
|
|
3336
|
+
for (const result of results) {
|
|
3092
3337
|
const text = result.text ?? "";
|
|
3093
3338
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
3094
3339
|
break;
|
|
@@ -3197,6 +3442,8 @@ export {
|
|
|
3197
3442
|
renderChangeReport,
|
|
3198
3443
|
reprocessDocuments,
|
|
3199
3444
|
searchIndex,
|
|
3445
|
+
searchJsonIndex,
|
|
3446
|
+
searchResultsFromResponse,
|
|
3200
3447
|
updateSource,
|
|
3201
3448
|
writeDefaultConfig
|
|
3202
3449
|
};
|
|
@@ -1,10 +1,17 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
|
|
2
|
+
import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
|
|
2
3
|
type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
|
|
3
4
|
type SearchDateRange = {
|
|
4
5
|
field: SearchDateField;
|
|
5
6
|
from?: string;
|
|
6
7
|
to?: string;
|
|
7
8
|
};
|
|
9
|
+
export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
|
|
10
|
+
export declare function searchJsonIndex({ workspacePath, request, indexName }: {
|
|
11
|
+
workspacePath: string;
|
|
12
|
+
request: JsonDslRequest;
|
|
13
|
+
indexName?: string;
|
|
14
|
+
}): Promise<JsonDslResponse>;
|
|
8
15
|
export declare function searchIndex({ workspacePath, query, topK, sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges, retrievalMode, showChunks }: {
|
|
9
16
|
workspacePath: string;
|
|
10
17
|
query: string;
|
package/dist/types/models.d.ts
CHANGED
|
@@ -222,9 +222,44 @@ export type SearchResult = {
|
|
|
222
222
|
lastChangedAt: string;
|
|
223
223
|
metadata: Record<string, unknown>;
|
|
224
224
|
};
|
|
225
|
+
export type SearchHitSource = {
|
|
226
|
+
chunkId: string;
|
|
227
|
+
documentId: string;
|
|
228
|
+
sourceId: string;
|
|
229
|
+
sourceType: SourceType;
|
|
230
|
+
sourceName?: string;
|
|
231
|
+
title: string;
|
|
232
|
+
uri: string;
|
|
233
|
+
headingPath: string[];
|
|
234
|
+
text: string;
|
|
235
|
+
snippet?: string;
|
|
236
|
+
normalizedPath?: string;
|
|
237
|
+
publicationDate?: string | null;
|
|
238
|
+
crawledAt?: string;
|
|
239
|
+
firstSeenAt: string;
|
|
240
|
+
lastSeenAt: string;
|
|
241
|
+
lastChangedAt: string;
|
|
242
|
+
metadata: Record<string, unknown>;
|
|
243
|
+
};
|
|
244
|
+
export type SearchHit = {
|
|
245
|
+
_index: string;
|
|
246
|
+
_id: string;
|
|
247
|
+
_score: number;
|
|
248
|
+
_source: SearchHitSource;
|
|
249
|
+
highlight?: Record<string, string[]>;
|
|
250
|
+
};
|
|
225
251
|
export type SearchResponseData = {
|
|
226
252
|
retrievalMode?: RetrievalMode;
|
|
227
|
-
|
|
253
|
+
took: number;
|
|
254
|
+
hits: {
|
|
255
|
+
total: {
|
|
256
|
+
value: number;
|
|
257
|
+
relation: "eq";
|
|
258
|
+
};
|
|
259
|
+
max_score: number | null;
|
|
260
|
+
hits: SearchHit[];
|
|
261
|
+
};
|
|
262
|
+
aggregations?: Record<string, unknown>;
|
|
228
263
|
};
|
|
229
264
|
export type RelatedDocumentResult = {
|
|
230
265
|
documentId: string;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tryformation/querylight-cli",
|
|
3
|
-
"version": "0.2.2",
|
|
3
|
+
"version": "0.2.3",
|
|
4
4
|
"description": "Querylight CLI for building and querying local knowledge bases.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/formation-res/querylight-cli#readme",
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
},
|
|
41
41
|
"dependencies": {
|
|
42
42
|
"@huggingface/transformers": "^3.8.1",
|
|
43
|
-
"@tryformation/querylight-ts": "^0.
|
|
43
|
+
"@tryformation/querylight-ts": "^0.11.0",
|
|
44
44
|
"cheerio": "^1.2.0",
|
|
45
45
|
"cli-table3": "^0.6.5",
|
|
46
46
|
"commander": "^14.0.3",
|