@tryformation/querylight-cli 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -23,7 +23,7 @@ It is designed for local, inspectable workflows:
23
23
  Run without installing globally:
24
24
 
25
25
  ```bash
26
- bunx @tryformation/querylight-cli init
26
+ bunx --bun @tryformation/querylight-cli init
27
27
  ```
28
28
 
29
29
  For agent and Python automation examples that use `bunx` and `uv`, see [`examples/skills/qli-bunx-uv/SKILL.md`](https://github.com/formation-res/querylight-cli/blob/main/examples/skills/qli-bunx-uv/SKILL.md).
@@ -43,9 +43,11 @@ npx qli --help
43
43
  If you prefer to avoid a local install, use:
44
44
 
45
45
  ```bash
46
- bunx @tryformation/querylight-cli --help
46
+ bunx --bun @tryformation/querylight-cli --help
47
47
  ```
48
48
 
49
+ Use `bunx --bun` for repeated or concurrent `bunx` calls. Without `--bun`, `bunx` respects the CLI shebang and starts `qli` through `node`.
50
+
49
51
  ## Release
50
52
 
51
53
  Publish releases from semantic version tags such as `0.1.1`.
@@ -105,6 +107,7 @@ Search it:
105
107
  ```bash
106
108
  qli search "API authentication"
107
109
  qli search --source-type rss --since 2026-05-01 --has-publication-date
110
+ qli search-json '{"query":{"match":{"text":"API authentication"}},"size":5}'
108
111
  ```
109
112
 
110
113
  Find related documents for an existing one:
@@ -127,7 +130,7 @@ The repository includes an example skill for running `qli` without a global inst
127
130
 
128
131
  It covers:
129
132
 
130
- - running `qli` with `bunx @tryformation/querylight-cli`
133
+ - running `qli` with `bunx --bun @tryformation/querylight-cli`
131
134
  - using `--json` for automation and agents
132
135
  - calling `qli search` and `qli context` from Python with `subprocess`
133
136
 
@@ -361,6 +364,7 @@ qli search --source-type rss,page --since 2026-05-01 --has-publication-date --to
361
364
  qli search --source-name "Release Feed,Company Blog" --uri-prefix https://example.com/news,https://example.com/blog
362
365
  qli search --source-type rss,page --top-k 25 --json
363
366
  qli search "authentication" --json
367
+ qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"size":10}' --json
364
368
  ```
365
369
 
366
370
  Build retrieval context:
@@ -1,4 +1,4 @@
1
- import type { RelatedDocumentResult, SearchResult, Source } from "../types/models.js";
1
+ import type { RelatedDocumentResult, SearchResponseData, Source } from "../types/models.js";
2
2
  export declare function formatSourcesTable(sources: Source[]): string;
3
- export declare function formatSearchResults(results: SearchResult[]): string;
3
+ export declare function formatSearchResults(response: SearchResponseData): string;
4
4
  export declare function formatRelatedDocuments(results: RelatedDocumentResult[]): string;
package/dist/cli/main.js CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  // src/cli/run-cli.ts
4
4
  import { Command, Option } from "commander";
5
- import { stat as stat4 } from "fs/promises";
5
+ import { readFile as readFile11, stat as stat4 } from "fs/promises";
6
6
  import path21 from "path";
7
7
 
8
8
  // src/chunk/chunker.ts
@@ -16,7 +16,7 @@ import path from "path";
16
16
  import YAML from "yaml";
17
17
 
18
18
  // src/core/constants.ts
19
- var PACKAGE_VERSION = "0.2.1";
19
+ var PACKAGE_VERSION = "0.2.3";
20
20
  var DEFAULT_WORKSPACE = ".kb";
21
21
  var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
22
22
  var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
@@ -374,7 +374,7 @@ async function assertWorkspaceExists(workspacePath) {
374
374
  }
375
375
 
376
376
  // src/index/querylight-indexer.ts
377
- import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
377
+ import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
378
378
  import path11 from "path";
379
379
 
380
380
  // src/vector/dense.ts
@@ -1091,12 +1091,19 @@ function keywordFieldIndex() {
1091
1091
  function createIndexMapping(extraFields = []) {
1092
1092
  const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
1093
1093
  const mapping = {
1094
+ _source: new StoredSourceIndex(),
1094
1095
  text: lexical,
1095
1096
  title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
1096
1097
  uri: keywordFieldIndex(),
1097
1098
  sourceId: keywordFieldIndex(),
1099
+ sourceName: keywordFieldIndex(),
1098
1100
  tags: keywordFieldIndex(),
1099
- sourceType: keywordFieldIndex()
1101
+ sourceType: keywordFieldIndex(),
1102
+ publicationDate: new DateFieldIndex(),
1103
+ firstSeenAt: new DateFieldIndex(),
1104
+ lastSeenAt: new DateFieldIndex(),
1105
+ lastChangedAt: new DateFieldIndex(),
1106
+ crawledAt: new DateFieldIndex()
1100
1107
  };
1101
1108
  for (const field of extraFields) {
1102
1109
  mapping[field] = keywordFieldIndex();
@@ -1132,8 +1139,12 @@ async function buildIndex({
1132
1139
  const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
1133
1140
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
1134
1141
  const index = new DocumentIndex(createIndexMapping(metadataFields));
1142
+ const documentsById = new Map(documents.map((document) => [document.id, document]));
1143
+ const sourcesById = new Map(sources.map((source) => [source.id, source]));
1135
1144
  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
1136
1145
  for (const chunk of chunks) {
1146
+ const document = documentsById.get(chunk.documentId);
1147
+ const source = sourcesById.get(chunk.sourceId);
1137
1148
  index.index({
1138
1149
  id: chunk.id,
1139
1150
  fields: {
@@ -1141,9 +1152,33 @@ async function buildIndex({
1141
1152
  title: [chunk.title],
1142
1153
  uri: [chunk.uri.toLowerCase()],
1143
1154
  sourceId: [chunk.sourceId.toLowerCase()],
1155
+ sourceName: source ? [source.name.toLowerCase()] : [],
1144
1156
  tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
1145
1157
  sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
1158
+ publicationDate: document?.publicationDate ? [document.publicationDate] : [],
1159
+ firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
1160
+ lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
1161
+ lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
1162
+ crawledAt: document?.crawledAt ? [document.crawledAt] : [],
1146
1163
  ...flattenMetadata(chunk.metadata)
1164
+ },
1165
+ source: {
1166
+ chunkId: chunk.id,
1167
+ documentId: chunk.documentId,
1168
+ sourceId: chunk.sourceId,
1169
+ sourceType: document?.sourceType ?? "text",
1170
+ sourceName: source?.name,
1171
+ title: chunk.title,
1172
+ uri: chunk.uri,
1173
+ headingPath: chunk.headingPath,
1174
+ text: chunk.text,
1175
+ normalizedPath: document?.normalizedPath,
1176
+ publicationDate: document?.publicationDate ?? null,
1177
+ crawledAt: document?.crawledAt,
1178
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
1179
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
1180
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
1181
+ metadata: chunk.metadata
1147
1182
  }
1148
1183
  });
1149
1184
  }
@@ -1152,7 +1187,7 @@ async function buildIndex({
1152
1187
  const metadata = {
1153
1188
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
1154
1189
  createdAt,
1155
- querylightVersion: "0.10.0",
1190
+ querylightVersion: "0.11.0",
1156
1191
  kbVersion: "0.1.0",
1157
1192
  documentCount: documents.length,
1158
1193
  chunkCount: chunks.length,
@@ -2778,7 +2813,7 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
2778
2813
 
2779
2814
  // src/query/search-service.ts
2780
2815
  import { readFile as readFile10 } from "fs/promises";
2781
- import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2816
+ import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
2782
2817
  import path18 from "path";
2783
2818
  async function loadHydratedIndex(workspacePath) {
2784
2819
  let state;
@@ -2806,24 +2841,6 @@ function matchesPrefix(value, prefixes) {
2806
2841
  const lower = value.toLowerCase();
2807
2842
  return prefixes.some((prefix) => lower.startsWith(prefix));
2808
2843
  }
2809
- function buildSearchQuery(query, filters) {
2810
- const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2811
- const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2812
- const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2813
- return new BoolQuery({
2814
- should: [
2815
- new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
2816
- new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
2817
- new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
2818
- ],
2819
- filter: [
2820
- ...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
2821
- ...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
2822
- ...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
2823
- ...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
2824
- ]
2825
- });
2826
- }
2827
2844
  function isValidDate(value) {
2828
2845
  return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
2829
2846
  }
@@ -3022,6 +3039,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
3022
3039
  }
3023
3040
  return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
3024
3041
  }
3042
+ function buildSearchDslRequest({
3043
+ query,
3044
+ topK,
3045
+ filters,
3046
+ dateRanges
3047
+ }) {
3048
+ const filterClauses = [];
3049
+ const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
3050
+ const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
3051
+ const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
3052
+ const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
3053
+ const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
3054
+ if (sourceIds.length > 0) {
3055
+ filterClauses.push({ terms: { sourceId: sourceIds } });
3056
+ }
3057
+ if (sourceNames.length > 0) {
3058
+ filterClauses.push({ terms: { sourceName: sourceNames } });
3059
+ }
3060
+ if (sourceTypes.length > 0) {
3061
+ filterClauses.push({ terms: { sourceType: sourceTypes } });
3062
+ }
3063
+ if (uriPrefixes.length > 0) {
3064
+ filterClauses.push({
3065
+ bool: {
3066
+ should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
3067
+ minimum_should_match: 1
3068
+ }
3069
+ });
3070
+ }
3071
+ if (tags.length > 0) {
3072
+ filterClauses.push({ terms: { tags } });
3073
+ }
3074
+ if (filters.hasPublicationDate) {
3075
+ filterClauses.push({ exists: { field: "publicationDate" } });
3076
+ }
3077
+ for (const { key, value } of filters.metadata ?? []) {
3078
+ filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
3079
+ }
3080
+ for (const { field, from, to } of dateRanges) {
3081
+ filterClauses.push({
3082
+ range: {
3083
+ [field]: {
3084
+ ...from ? { gte: from } : {},
3085
+ ...to ? { lte: to } : {}
3086
+ }
3087
+ }
3088
+ });
3089
+ }
3090
+ return {
3091
+ size: topK,
3092
+ query: {
3093
+ bool: {
3094
+ should: [
3095
+ { match: { title: { query, operator: "and", boost: 6 } } },
3096
+ { match: { text: { query, operator: "and", boost: 4 } } },
3097
+ { match: { text: { query, operator: "or", boost: 2 } } }
3098
+ ],
3099
+ filter: filterClauses,
3100
+ minimum_should_match: 1
3101
+ }
3102
+ }
3103
+ };
3104
+ }
3105
+ function sourceToChunkRecord(source) {
3106
+ return {
3107
+ id: source.chunkId,
3108
+ documentId: source.documentId,
3109
+ sourceId: source.sourceId,
3110
+ title: source.title,
3111
+ uri: source.uri,
3112
+ headingPath: source.headingPath,
3113
+ text: source.text,
3114
+ contentHash: "",
3115
+ metadata: source.metadata,
3116
+ firstSeenAt: source.firstSeenAt,
3117
+ lastSeenAt: source.lastSeenAt,
3118
+ lastChangedAt: source.lastChangedAt
3119
+ };
3120
+ }
3121
+ function sourceToDocumentRecord(source) {
3122
+ return {
3123
+ id: source.documentId,
3124
+ sourceId: source.sourceId,
3125
+ sourceType: source.sourceType,
3126
+ title: source.title,
3127
+ uri: source.uri,
3128
+ sourceUri: source.uri,
3129
+ mimeType: "text/plain",
3130
+ normalizedPath: source.normalizedPath ?? "",
3131
+ contentHash: "",
3132
+ metadata: source.metadata,
3133
+ publicationDate: source.publicationDate ?? null,
3134
+ crawledAt: source.crawledAt,
3135
+ firstSeenAt: source.firstSeenAt,
3136
+ lastSeenAt: source.lastSeenAt,
3137
+ lastChangedAt: source.lastChangedAt
3138
+ };
3139
+ }
3140
+ async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
3141
+ const source = hit._source;
3142
+ const chunk = sourceToChunkRecord(source);
3143
+ const document = sourceToDocumentRecord(source);
3144
+ const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
3145
+ const enrichedSource = {
3146
+ ...source,
3147
+ snippet
3148
+ };
3149
+ const result = {
3150
+ chunkId: source.chunkId,
3151
+ documentId: source.documentId,
3152
+ sourceId: source.sourceId,
3153
+ sourceType: source.sourceType,
3154
+ score: hit._score,
3155
+ title: chooseResultTitle(chunk),
3156
+ uri: source.uri,
3157
+ snippet,
3158
+ text: showChunks ? source.text : void 0,
3159
+ publicationDate: source.publicationDate ?? null,
3160
+ firstSeenAt: source.firstSeenAt,
3161
+ lastSeenAt: source.lastSeenAt,
3162
+ lastChangedAt: source.lastChangedAt,
3163
+ metadata: source.metadata
3164
+ };
3165
+ return {
3166
+ hit: {
3167
+ ...hit,
3168
+ _source: enrichedSource
3169
+ },
3170
+ result
3171
+ };
3172
+ }
3173
+ function createSearchResponse(retrievalMode, hits, took, aggregations) {
3174
+ return {
3175
+ retrievalMode,
3176
+ took,
3177
+ hits: {
3178
+ total: {
3179
+ value: hits.length,
3180
+ relation: "eq"
3181
+ },
3182
+ max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
3183
+ hits
3184
+ },
3185
+ aggregations
3186
+ };
3187
+ }
3188
+ function searchResultsFromResponse(response2, showChunks = false) {
3189
+ return response2.hits.hits.map((hit) => ({
3190
+ chunkId: hit._source.chunkId,
3191
+ documentId: hit._source.documentId,
3192
+ sourceId: hit._source.sourceId,
3193
+ sourceType: hit._source.sourceType,
3194
+ score: hit._score,
3195
+ title: chooseResultTitle(sourceToChunkRecord(hit._source)),
3196
+ uri: hit._source.uri,
3197
+ snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
3198
+ text: showChunks ? hit._source.text : void 0,
3199
+ publicationDate: hit._source.publicationDate ?? null,
3200
+ firstSeenAt: hit._source.firstSeenAt,
3201
+ lastSeenAt: hit._source.lastSeenAt,
3202
+ lastChangedAt: hit._source.lastChangedAt,
3203
+ metadata: hit._source.metadata
3204
+ }));
3205
+ }
3206
+ async function searchJsonIndex({
3207
+ workspacePath,
3208
+ request,
3209
+ indexName = "querylight"
3210
+ }) {
3211
+ const index = await loadHydratedIndex(workspacePath);
3212
+ return searchJsonDsl({ index, request, indexName });
3213
+ }
3025
3214
  function normalizeDisplayTitle(title) {
3026
3215
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
3027
3216
  }
@@ -3159,6 +3348,7 @@ async function searchIndex({
3159
3348
  retrievalMode,
3160
3349
  showChunks = false
3161
3350
  }) {
3351
+ const startedAt = Date.now();
3162
3352
  const config = await loadConfig(workspacePath);
3163
3353
  const mode = retrievalMode ?? config.retrieval.defaultMode;
3164
3354
  const candidateLimit = Math.max(topK * 5, 50);
@@ -3215,12 +3405,48 @@ async function searchIndex({
3215
3405
  };
3216
3406
  })
3217
3407
  );
3218
- return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
3408
+ const hits2 = latestResults.filter((result) => result != null).map((result) => {
3409
+ const chunk = chunks.get(result.chunkId);
3410
+ const document = documents.get(result.documentId);
3411
+ const source = sources.get(result.sourceId);
3412
+ return {
3413
+ _index: "querylight",
3414
+ _id: result.chunkId,
3415
+ _score: result.score,
3416
+ _source: {
3417
+ chunkId: result.chunkId,
3418
+ documentId: result.documentId,
3419
+ sourceId: result.sourceId,
3420
+ sourceType: result.sourceType,
3421
+ sourceName: source?.name,
3422
+ title: chunk.title,
3423
+ uri: result.uri,
3424
+ headingPath: chunk.headingPath,
3425
+ text: chunk.text,
3426
+ snippet: result.snippet,
3427
+ normalizedPath: document.normalizedPath,
3428
+ publicationDate: result.publicationDate ?? null,
3429
+ crawledAt: document.crawledAt,
3430
+ firstSeenAt: result.firstSeenAt,
3431
+ lastSeenAt: result.lastSeenAt,
3432
+ lastChangedAt: result.lastChangedAt,
3433
+ metadata: result.metadata
3434
+ }
3435
+ };
3436
+ });
3437
+ return createSearchResponse("lexical", hits2, Date.now() - startedAt);
3219
3438
  }
3220
3439
  const lexicalHits = async () => {
3221
- const index = await loadHydratedIndex(workspacePath);
3222
- const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
3223
- return all.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit);
3440
+ const response2 = await searchJsonIndex({
3441
+ workspacePath,
3442
+ request: buildSearchDslRequest({
3443
+ query: normalizedQuery,
3444
+ topK: candidateLimit,
3445
+ filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
3446
+ dateRanges
3447
+ })
3448
+ });
3449
+ return response2.hits.hits;
3224
3450
  };
3225
3451
  const denseHits = async () => {
3226
3452
  if (!await fileExists(denseVectorPath(workspacePath))) {
@@ -3234,15 +3460,18 @@ async function searchIndex({
3234
3460
  }
3235
3461
  return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
3236
3462
  };
3463
+ let lexicalResponseHits = [];
3237
3464
  let hits;
3238
3465
  if (mode === "lexical") {
3239
- hits = await lexicalHits();
3466
+ lexicalResponseHits = await lexicalHits();
3467
+ hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
3240
3468
  } else if (mode === "dense") {
3241
3469
  hits = await denseHits();
3242
3470
  } else if (mode === "sparse") {
3243
3471
  hits = await sparseHits();
3244
3472
  } else {
3245
- const rankings = [await lexicalHits()];
3473
+ lexicalResponseHits = await lexicalHits();
3474
+ const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
3246
3475
  if (await fileExists(denseVectorPath(workspacePath))) {
3247
3476
  rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
3248
3477
  }
@@ -3251,34 +3480,49 @@ async function searchIndex({
3251
3480
  }
3252
3481
  hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
3253
3482
  }
3254
- const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
3483
+ const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
3255
3484
  const chunk = chunks.get(chunkId);
3256
3485
  if (!chunk) {
3257
- return null;
3486
+ return [];
3258
3487
  }
3259
- return {
3260
- chunkId,
3261
- documentId: chunk.documentId,
3262
- sourceId: chunk.sourceId,
3263
- sourceType: documents.get(chunk.documentId)?.sourceType ?? "text",
3264
- score,
3265
- title: chooseResultTitle(chunk),
3266
- uri: chunk.uri,
3267
- snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
3268
- document: documents.get(chunk.documentId),
3269
- config,
3270
- orderedChunkCache
3271
- }),
3272
- text: showChunks ? chunk.text : void 0,
3273
- publicationDate: documents.get(chunk.documentId)?.publicationDate ?? null,
3274
- firstSeenAt: documents.get(chunk.documentId)?.firstSeenAt ?? chunk.firstSeenAt,
3275
- lastSeenAt: documents.get(chunk.documentId)?.lastSeenAt ?? chunk.lastSeenAt,
3276
- lastChangedAt: documents.get(chunk.documentId)?.lastChangedAt ?? chunk.lastChangedAt,
3277
- metadata: chunk.metadata
3278
- };
3279
- }));
3280
- const results = rawResults.filter((result) => result != null);
3281
- return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
3488
+ const document = documents.get(chunk.documentId);
3489
+ const source = sources.get(chunk.sourceId);
3490
+ return [{
3491
+ _index: "querylight",
3492
+ _id: chunkId,
3493
+ _score: score,
3494
+ _source: {
3495
+ chunkId,
3496
+ documentId: chunk.documentId,
3497
+ sourceId: chunk.sourceId,
3498
+ sourceType: document?.sourceType ?? "text",
3499
+ sourceName: source?.name,
3500
+ title: chunk.title,
3501
+ uri: chunk.uri,
3502
+ headingPath: chunk.headingPath,
3503
+ text: chunk.text,
3504
+ normalizedPath: document?.normalizedPath,
3505
+ publicationDate: document?.publicationDate ?? null,
3506
+ crawledAt: document?.crawledAt,
3507
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
3508
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
3509
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
3510
+ metadata: chunk.metadata
3511
+ }
3512
+ }];
3513
+ });
3514
+ const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
3515
+ if (showChunks) {
3516
+ const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
3517
+ return createSearchResponse(mode, topHits, Date.now() - startedAt);
3518
+ }
3519
+ const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
3520
+ const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
3521
+ const finalHits = reranked.map((result) => {
3522
+ const hit = byChunkId.get(result.chunkId);
3523
+ return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
3524
+ }).filter((hit) => hit != null);
3525
+ return createSearchResponse(mode, finalHits, Date.now() - startedAt);
3282
3526
  }
3283
3527
 
3284
3528
  // src/query/related-service.ts
@@ -3395,9 +3639,10 @@ async function createContext({
3395
3639
  retrievalMode
3396
3640
  }) {
3397
3641
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
3642
+ const results = searchResultsFromResponse(search, true);
3398
3643
  const sources = [];
3399
3644
  let total = 0;
3400
- for (const result of search.results) {
3645
+ for (const result of results) {
3401
3646
  const text = result.text ?? "";
3402
3647
  if (total + text.length > maxChars && sources.length > 0) {
3403
3648
  break;
@@ -3498,7 +3743,8 @@ function formatSourcesTable(sources) {
3498
3743
  }
3499
3744
  return table.toString();
3500
3745
  }
3501
- function formatSearchResults(results) {
3746
+ function formatSearchResults(response2) {
3747
+ const results = searchResultsFromResponse(response2);
3502
3748
  return results.map((result, index) => [
3503
3749
  `${index + 1}. ${colors.bold(result.title)}`,
3504
3750
  ` URL: ${result.uri}`,
@@ -3803,6 +4049,19 @@ function parseDateValue(input, optionName) {
3803
4049
  }
3804
4050
  return parsed.toISOString();
3805
4051
  }
4052
+ async function parseJsonArgument(input) {
4053
+ const raw = input.startsWith("@") ? await readFile11(path21.resolve(input.slice(1)), "utf8") : input;
4054
+ try {
4055
+ const parsed = JSON.parse(raw);
4056
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
4057
+ throw new Error("expected a JSON object");
4058
+ }
4059
+ return parsed;
4060
+ } catch (error) {
4061
+ const message = error instanceof Error ? error.message : String(error);
4062
+ throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
4063
+ }
4064
+ }
3806
4065
  function searchDateRanges(options) {
3807
4066
  const entries = [];
3808
4067
  if (options.since || options.until) {
@@ -4147,7 +4406,7 @@ Examples:
4147
4406
  progress?.("info", "Rebuild complete");
4148
4407
  emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
4149
4408
  });
4150
- program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this 
date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
4409
+ program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include 
documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
4151
4410
  Examples:
4152
4411
  qli search "pricing api limits"
4153
4412
  qli search "authentication" --top-k 20 --tag docs
@@ -4160,6 +4419,7 @@ Examples:
4160
4419
  Notes:
4161
4420
  lexical works without vector models.
4162
4421
  dense, sparse, and hybrid require the relevant index artifacts to exist.
4422
+ Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
4163
4423
  When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
4164
4424
  const global = this.optsWithGlobals();
4165
4425
  const workspace = await resolveWorkspace({ workspace: global.workspace });
@@ -4178,7 +4438,26 @@ Notes:
4178
4438
  retrievalMode: parseRetrievalMode(options.retrieval),
4179
4439
  showChunks: Boolean(options.showChunks)
4180
4440
  });
4181
- emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
4441
+ emit(global.json, capture, response("search", workspace, result), formatSearchResults(result));
4442
+ });
4443
+ program.command("search-json").description("Run a raw Querylight 0.11 JSON DSL search request against the lexical index.").argument("<request>", "Inline JSON request or @path/to/request.json.").addHelpText("after", `
4444
+ Examples:
4445
+ qli search-json '{"query":{"match":{"text":"authentication"}},"size":5}'
4446
+ qli search-json @./search-request.json
4447
+ qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"aggs":{"types":{"terms":{"field":"sourceType","size":5}}}}' --json
4448
+
4449
+ Notes:
4450
+ search-json uses the lexical index and Querylight 0.11 JSON DSL fields.
4451
+ Stored hit payloads are returned under _source.
4452
+ Use --json when another tool needs the full response envelope.`).action(async function command(requestInput) {
4453
+ const global = this.optsWithGlobals();
4454
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
4455
+ const request = await parseJsonArgument(requestInput);
4456
+ const result = await searchJsonIndex({
4457
+ workspacePath: workspace,
4458
+ request
4459
+ });
4460
+ emit(global.json, capture, response("search-json", workspace, result), JSON.stringify(result, null, 2));
4182
4461
  });
4183
4462
  program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
4184
4463
  Examples:
@@ -1,5 +1,5 @@
1
1
  export declare const PACKAGE_NAME = "@tryformation/querylight-cli";
2
- export declare const PACKAGE_VERSION = "0.2.1";
2
+ export declare const PACKAGE_VERSION = "0.2.3";
3
3
  export declare const DEFAULT_WORKSPACE = ".kb";
4
4
  export declare const DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
5
5
  export declare const LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
@@ -1,7 +1,7 @@
1
- import { TextFieldIndex } from "@tryformation/querylight-ts";
1
+ import { type FieldIndex } from "@tryformation/querylight-ts";
2
2
  import { type ProgressHandler } from "../core/progress.js";
3
3
  import type { IndexMetadata } from "../types/models.js";
4
- export declare function createIndexMapping(extraFields?: string[]): Record<string, TextFieldIndex>;
4
+ export declare function createIndexMapping(extraFields?: string[]): Record<string, FieldIndex>;
5
5
  export declare function buildIndex({ workspacePath, denseOverride, sparseOverride, buildAvailableModels, progress }: {
6
6
  workspacePath: string;
7
7
  denseOverride?: boolean;
package/dist/index.js CHANGED
@@ -1782,7 +1782,7 @@ async function chunkDocuments({
1782
1782
  }
1783
1783
 
1784
1784
  // src/index/querylight-indexer.ts
1785
- import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
1785
+ import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
1786
1786
  import path17 from "path";
1787
1787
 
1788
1788
  // src/vector/dense.ts
@@ -2379,12 +2379,19 @@ function keywordFieldIndex() {
2379
2379
  function createIndexMapping(extraFields = []) {
2380
2380
  const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
2381
2381
  const mapping = {
2382
+ _source: new StoredSourceIndex(),
2382
2383
  text: lexical,
2383
2384
  title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
2384
2385
  uri: keywordFieldIndex(),
2385
2386
  sourceId: keywordFieldIndex(),
2387
+ sourceName: keywordFieldIndex(),
2386
2388
  tags: keywordFieldIndex(),
2387
- sourceType: keywordFieldIndex()
2389
+ sourceType: keywordFieldIndex(),
2390
+ publicationDate: new DateFieldIndex(),
2391
+ firstSeenAt: new DateFieldIndex(),
2392
+ lastSeenAt: new DateFieldIndex(),
2393
+ lastChangedAt: new DateFieldIndex(),
2394
+ crawledAt: new DateFieldIndex()
2388
2395
  };
2389
2396
  for (const field of extraFields) {
2390
2397
  mapping[field] = keywordFieldIndex();
@@ -2420,8 +2427,12 @@ async function buildIndex({
2420
2427
  const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
2421
2428
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
2422
2429
  const index = new DocumentIndex(createIndexMapping(metadataFields));
2430
+ const documentsById = new Map(documents.map((document) => [document.id, document]));
2431
+ const sourcesById = new Map(sources.map((source) => [source.id, source]));
2423
2432
  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
2424
2433
  for (const chunk of chunks) {
2434
+ const document = documentsById.get(chunk.documentId);
2435
+ const source = sourcesById.get(chunk.sourceId);
2425
2436
  index.index({
2426
2437
  id: chunk.id,
2427
2438
  fields: {
@@ -2429,9 +2440,33 @@ async function buildIndex({
2429
2440
  title: [chunk.title],
2430
2441
  uri: [chunk.uri.toLowerCase()],
2431
2442
  sourceId: [chunk.sourceId.toLowerCase()],
2443
+ sourceName: source ? [source.name.toLowerCase()] : [],
2432
2444
  tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
2433
2445
  sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
2446
+ publicationDate: document?.publicationDate ? [document.publicationDate] : [],
2447
+ firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
2448
+ lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
2449
+ lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
2450
+ crawledAt: document?.crawledAt ? [document.crawledAt] : [],
2434
2451
  ...flattenMetadata(chunk.metadata)
2452
+ },
2453
+ source: {
2454
+ chunkId: chunk.id,
2455
+ documentId: chunk.documentId,
2456
+ sourceId: chunk.sourceId,
2457
+ sourceType: document?.sourceType ?? "text",
2458
+ sourceName: source?.name,
2459
+ title: chunk.title,
2460
+ uri: chunk.uri,
2461
+ headingPath: chunk.headingPath,
2462
+ text: chunk.text,
2463
+ normalizedPath: document?.normalizedPath,
2464
+ publicationDate: document?.publicationDate ?? null,
2465
+ crawledAt: document?.crawledAt,
2466
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
2467
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
2468
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
2469
+ metadata: chunk.metadata
2435
2470
  }
2436
2471
  });
2437
2472
  }
@@ -2440,7 +2475,7 @@ async function buildIndex({
2440
2475
  const metadata = {
2441
2476
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
2442
2477
  createdAt,
2443
- querylightVersion: "0.10.0",
2478
+ querylightVersion: "0.11.0",
2444
2479
  kbVersion: "0.1.0",
2445
2480
  documentCount: documents.length,
2446
2481
  chunkCount: chunks.length,
@@ -2469,7 +2504,7 @@ async function buildIndex({
2469
2504
 
2470
2505
  // src/query/search-service.ts
2471
2506
  import { readFile as readFile10 } from "fs/promises";
2472
- import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2507
+ import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
2473
2508
  import path18 from "path";
2474
2509
  async function loadHydratedIndex(workspacePath) {
2475
2510
  let state;
@@ -2497,24 +2532,6 @@ function matchesPrefix(value, prefixes) {
2497
2532
  const lower = value.toLowerCase();
2498
2533
  return prefixes.some((prefix) => lower.startsWith(prefix));
2499
2534
  }
2500
- function buildSearchQuery(query, filters) {
2501
- const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2502
- const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2503
- const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2504
- return new BoolQuery({
2505
- should: [
2506
- new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
2507
- new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
2508
- new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
2509
- ],
2510
- filter: [
2511
- ...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
2512
- ...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
2513
- ...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
2514
- ...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
2515
- ]
2516
- });
2517
- }
2518
2535
  function isValidDate(value) {
2519
2536
  return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
2520
2537
  }
@@ -2713,6 +2730,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2713
2730
  }
2714
2731
  return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
2715
2732
  }
2733
/**
 * Translate the search options into a JSON DSL request object.
 *
 * The text query becomes three weighted `should` match clauses (title AND,
 * text AND, text OR); every restriction becomes one entry in `bool.filter`.
 *
 * @param options.query Text matched against `title` and `text`.
 * @param options.topK Copied verbatim into the request `size`.
 * @param options.filters Source id/name/type, URI prefix, tag, metadata and
 *   publication-date restrictions; singular and plural forms are merged.
 * @param options.dateRanges Entries of `{ field, from, to }`; `from` maps to
 *   `gte` and `to` maps to `lte`, each only when truthy.
 * @returns The request object with `size` and a `bool` query.
 */
function buildSearchDslRequest({ query, topK, filters, dateRanges }) {
  // Merge the singular and plural variants of an option, dropping empty entries.
  const collect = (single, many) => normalizeFilterValues([single, ...many ?? []].filter((value) => Boolean(value)));
  // A `terms` clause is only emitted when at least one value survived normalization.
  const termsClause = (field, values) => values.length > 0 ? [{ terms: { [field]: values } }] : [];

  const sourceIds = collect(filters.sourceId, filters.sourceIds);
  const sourceNames = collect(filters.sourceName, filters.sourceNames);
  const sourceTypes = collect(filters.sourceType, filters.sourceTypes);
  const uriPrefixes = collect(filters.uriPrefix, filters.uriPrefixes);
  const tags = collect(filters.tag, filters.tags);

  // Any one of the requested URI prefixes is enough for a chunk to pass.
  const prefixClause = uriPrefixes.length > 0
    ? [{
        bool: {
          should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
          minimum_should_match: 1
        }
      }]
    : [];

  const publicationClause = filters.hasPublicationDate
    ? [{ exists: { field: "publicationDate" } }]
    : [];

  const metadataClauses = [];
  for (const { key, value } of filters.metadata ?? []) {
    metadataClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
  }

  const rangeClauses = [];
  for (const { field, from, to } of dateRanges) {
    const bounds = {};
    if (from) {
      bounds.gte = from;
    }
    if (to) {
      bounds.lte = to;
    }
    rangeClauses.push({ range: { [field]: bounds } });
  }

  const weightedMatch = (field, operator, boost) => ({ match: { [field]: { query, operator, boost } } });

  return {
    size: topK,
    query: {
      bool: {
        should: [
          weightedMatch("title", "and", 6),
          weightedMatch("text", "and", 4),
          weightedMatch("text", "or", 2)
        ],
        filter: [
          ...termsClause("sourceId", sourceIds),
          ...termsClause("sourceName", sourceNames),
          ...termsClause("sourceType", sourceTypes),
          ...prefixClause,
          ...termsClause("tags", tags),
          ...publicationClause,
          ...metadataClauses,
          ...rangeClauses
        ],
        minimum_should_match: 1
      }
    }
  };
}
2796
/**
 * Rebuild a chunk record from the payload stored with a search hit.
 *
 * The stored `chunkId` becomes the record `id`; `contentHash` is not part of
 * the stored payload and is set to an empty string.
 *
 * @param source Stored hit payload (the hit's `_source`).
 * @returns A chunk-shaped object carrying the stored fields.
 */
function sourceToChunkRecord(source) {
  const {
    chunkId,
    documentId,
    sourceId,
    title,
    uri,
    headingPath,
    text,
    metadata,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  } = source;
  return {
    id: chunkId,
    documentId,
    sourceId,
    title,
    uri,
    headingPath,
    text,
    contentHash: "",
    metadata,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  };
}
2812
/**
 * Rebuild a document record from the payload stored with a search hit.
 *
 * Fields the stored payload does not carry are filled with fixed values:
 * `mimeType` is "text/plain", `contentHash` is empty, and `sourceUri` mirrors
 * `uri`. A missing `normalizedPath` becomes "" and a missing
 * `publicationDate` becomes null.
 *
 * @param source Stored hit payload (the hit's `_source`).
 * @returns A document-shaped object carrying the stored fields.
 */
function sourceToDocumentRecord(source) {
  const { documentId, sourceId, sourceType, title, uri, metadata } = source;
  const normalizedPath = source.normalizedPath ?? "";
  const publicationDate = source.publicationDate ?? null;
  const { crawledAt, firstSeenAt, lastSeenAt, lastChangedAt } = source;
  return {
    id: documentId,
    sourceId,
    sourceType,
    title,
    uri,
    sourceUri: uri,
    mimeType: "text/plain",
    normalizedPath,
    contentHash: "",
    metadata,
    publicationDate,
    crawledAt,
    firstSeenAt,
    lastSeenAt,
    lastChangedAt
  };
}
2831
/**
 * Expand one search hit into the pair used by the rest of the search
 * pipeline: the hit itself with a `snippet` added to its `_source`, and the
 * flat result record derived from the same stored payload.
 *
 * @param hit Search hit carrying `_id`, `_score` and a stored `_source`.
 * @param query Query text used to build the snippet.
 * @param config Workspace configuration forwarded to the snippet builder.
 * @param orderedChunkCache Cache forwarded to the snippet builder.
 * @param showChunks When truthy, the result also carries the full chunk text.
 * @returns `{ hit, result }` where `hit._source.snippet` and `result.snippet`
 *   hold the same snippet.
 * @throws Error when the hit has no stored `_source` object.
 */
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
  const source = hit._source;
  // Everything below is derived from the stored payload. Fail with an
  // actionable message instead of an opaque "cannot read properties of
  // undefined". NOTE(review): presumably this happens for indexes written
  // before `_source` was stored alongside the fields — confirm.
  if (source == null || typeof source !== "object") {
    throw new Error(
      `Search hit ${String(hit._id)} has no stored _source payload; rebuild the index so stored sources are available.`
    );
  }
  const chunk = sourceToChunkRecord(source);
  const document = sourceToDocumentRecord(source);
  const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
  const enrichedSource = {
    ...source,
    snippet
  };
  const result = {
    chunkId: source.chunkId,
    documentId: source.documentId,
    sourceId: source.sourceId,
    sourceType: source.sourceType,
    score: hit._score,
    title: chooseResultTitle(chunk),
    uri: source.uri,
    snippet,
    // Full chunk text is only exposed when the caller asked for chunk output.
    text: showChunks ? source.text : void 0,
    publicationDate: source.publicationDate ?? null,
    firstSeenAt: source.firstSeenAt,
    lastSeenAt: source.lastSeenAt,
    lastChangedAt: source.lastChangedAt,
    metadata: source.metadata
  };
  return {
    hit: {
      ...hit,
      _source: enrichedSource
    },
    result
  };
}
2864
/**
 * Wrap a list of hits in the search response envelope.
 *
 * @param retrievalMode Retrieval mode echoed back in the response.
 * @param hits Hits to return; each must expose a numeric `_score`.
 * @param took Elapsed time value, reported as given.
 * @param aggregations Optional aggregation results, passed through untouched.
 * @returns Envelope whose `hits.total.value` is the number of hits supplied
 *   and whose `hits.max_score` is the highest `_score`, or null when there
 *   are no hits.
 */
function createSearchResponse(retrievalMode, hits, took, aggregations) {
  // Fold instead of `Math.max(...scores)`: spreading passes one argument per
  // hit and throws a RangeError once the list exceeds the engine's argument
  // limit. The fold yields the same value, including NaN propagation.
  const maxScore = hits.length > 0
    ? hits.reduce((best, hit) => Math.max(best, hit._score), -Infinity)
    : null;
  return {
    retrievalMode,
    took,
    hits: {
      total: {
        value: hits.length,
        relation: "eq"
      },
      max_score: maxScore,
      hits
    },
    aggregations
  };
}
2879
/**
 * Flatten a search response envelope into the list of result records.
 *
 * The snippet is taken from the stored payload when present, otherwise from
 * the hit's `text` highlights joined by blank lines, otherwise built from the
 * stored text and title.
 *
 * @param response Search response whose `hits.hits` entries carry `_source`.
 * @param showChunks When truthy, each result also carries the chunk text.
 * @returns One result record per hit, in response order.
 */
function searchResultsFromResponse(response, showChunks = false) {
  const toResult = (hit) => {
    const stored = hit._source;
    return {
      chunkId: stored.chunkId,
      documentId: stored.documentId,
      sourceId: stored.sourceId,
      sourceType: stored.sourceType,
      score: hit._score,
      title: chooseResultTitle(sourceToChunkRecord(stored)),
      uri: stored.uri,
      snippet: stored.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(stored.text, stored.title),
      text: showChunks ? stored.text : void 0,
      publicationDate: stored.publicationDate ?? null,
      firstSeenAt: stored.firstSeenAt,
      lastSeenAt: stored.lastSeenAt,
      lastChangedAt: stored.lastChangedAt,
      metadata: stored.metadata
    };
  };
  const { hits } = response.hits;
  return hits.map((hit) => toResult(hit));
}
2897
/**
 * Run a raw JSON DSL request against the workspace's hydrated index.
 *
 * @param options.workspacePath Workspace whose index is loaded.
 * @param options.request JSON DSL request, forwarded unchanged.
 * @param options.indexName Index name forwarded with the request; defaults
 *   to "querylight".
 * @returns Whatever `searchJsonDsl` returns for the request.
 */
async function searchJsonIndex({ workspacePath, request, indexName = "querylight" }) {
  const hydratedIndex = await loadHydratedIndex(workspacePath);
  const dslResponse = searchJsonDsl({ index: hydratedIndex, request, indexName });
  return dslResponse;
}
2716
2905
  function normalizeDisplayTitle(title) {
2717
2906
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2718
2907
  }
@@ -2850,6 +3039,7 @@ async function searchIndex({
2850
3039
  retrievalMode,
2851
3040
  showChunks = false
2852
3041
  }) {
3042
+ const startedAt = Date.now();
2853
3043
  const config = await loadConfig(workspacePath);
2854
3044
  const mode = retrievalMode ?? config.retrieval.defaultMode;
2855
3045
  const candidateLimit = Math.max(topK * 5, 50);
@@ -2906,12 +3096,48 @@ async function searchIndex({
2906
3096
  };
2907
3097
  })
2908
3098
  );
2909
- return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
3099
+ const hits2 = latestResults.filter((result) => result != null).map((result) => {
3100
+ const chunk = chunks.get(result.chunkId);
3101
+ const document = documents.get(result.documentId);
3102
+ const source = sources.get(result.sourceId);
3103
+ return {
3104
+ _index: "querylight",
3105
+ _id: result.chunkId,
3106
+ _score: result.score,
3107
+ _source: {
3108
+ chunkId: result.chunkId,
3109
+ documentId: result.documentId,
3110
+ sourceId: result.sourceId,
3111
+ sourceType: result.sourceType,
3112
+ sourceName: source?.name,
3113
+ title: chunk.title,
3114
+ uri: result.uri,
3115
+ headingPath: chunk.headingPath,
3116
+ text: chunk.text,
3117
+ snippet: result.snippet,
3118
+ normalizedPath: document.normalizedPath,
3119
+ publicationDate: result.publicationDate ?? null,
3120
+ crawledAt: document.crawledAt,
3121
+ firstSeenAt: result.firstSeenAt,
3122
+ lastSeenAt: result.lastSeenAt,
3123
+ lastChangedAt: result.lastChangedAt,
3124
+ metadata: result.metadata
3125
+ }
3126
+ };
3127
+ });
3128
+ return createSearchResponse("lexical", hits2, Date.now() - startedAt);
2910
3129
  }
2911
3130
  const lexicalHits = async () => {
2912
- const index = await loadHydratedIndex(workspacePath);
2913
- const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
2914
- return all.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit);
3131
+ const response = await searchJsonIndex({
3132
+ workspacePath,
3133
+ request: buildSearchDslRequest({
3134
+ query: normalizedQuery,
3135
+ topK: candidateLimit,
3136
+ filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
3137
+ dateRanges
3138
+ })
3139
+ });
3140
+ return response.hits.hits;
2915
3141
  };
2916
3142
  const denseHits = async () => {
2917
3143
  if (!await fileExists(denseVectorPath(workspacePath))) {
@@ -2925,15 +3151,18 @@ async function searchIndex({
2925
3151
  }
2926
3152
  return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
2927
3153
  };
3154
+ let lexicalResponseHits = [];
2928
3155
  let hits;
2929
3156
  if (mode === "lexical") {
2930
- hits = await lexicalHits();
3157
+ lexicalResponseHits = await lexicalHits();
3158
+ hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
2931
3159
  } else if (mode === "dense") {
2932
3160
  hits = await denseHits();
2933
3161
  } else if (mode === "sparse") {
2934
3162
  hits = await sparseHits();
2935
3163
  } else {
2936
- const rankings = [await lexicalHits()];
3164
+ lexicalResponseHits = await lexicalHits();
3165
+ const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
2937
3166
  if (await fileExists(denseVectorPath(workspacePath))) {
2938
3167
  rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
2939
3168
  }
@@ -2942,34 +3171,49 @@ async function searchIndex({
2942
3171
  }
2943
3172
  hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
2944
3173
  }
2945
- const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
3174
+ const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
2946
3175
  const chunk = chunks.get(chunkId);
2947
3176
  if (!chunk) {
2948
- return null;
3177
+ return [];
2949
3178
  }
2950
- return {
2951
- chunkId,
2952
- documentId: chunk.documentId,
2953
- sourceId: chunk.sourceId,
2954
- sourceType: documents.get(chunk.documentId)?.sourceType ?? "text",
2955
- score,
2956
- title: chooseResultTitle(chunk),
2957
- uri: chunk.uri,
2958
- snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2959
- document: documents.get(chunk.documentId),
2960
- config,
2961
- orderedChunkCache
2962
- }),
2963
- text: showChunks ? chunk.text : void 0,
2964
- publicationDate: documents.get(chunk.documentId)?.publicationDate ?? null,
2965
- firstSeenAt: documents.get(chunk.documentId)?.firstSeenAt ?? chunk.firstSeenAt,
2966
- lastSeenAt: documents.get(chunk.documentId)?.lastSeenAt ?? chunk.lastSeenAt,
2967
- lastChangedAt: documents.get(chunk.documentId)?.lastChangedAt ?? chunk.lastChangedAt,
2968
- metadata: chunk.metadata
2969
- };
2970
- }));
2971
- const results = rawResults.filter((result) => result != null);
2972
- return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
3179
+ const document = documents.get(chunk.documentId);
3180
+ const source = sources.get(chunk.sourceId);
3181
+ return [{
3182
+ _index: "querylight",
3183
+ _id: chunkId,
3184
+ _score: score,
3185
+ _source: {
3186
+ chunkId,
3187
+ documentId: chunk.documentId,
3188
+ sourceId: chunk.sourceId,
3189
+ sourceType: document?.sourceType ?? "text",
3190
+ sourceName: source?.name,
3191
+ title: chunk.title,
3192
+ uri: chunk.uri,
3193
+ headingPath: chunk.headingPath,
3194
+ text: chunk.text,
3195
+ normalizedPath: document?.normalizedPath,
3196
+ publicationDate: document?.publicationDate ?? null,
3197
+ crawledAt: document?.crawledAt,
3198
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
3199
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
3200
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
3201
+ metadata: chunk.metadata
3202
+ }
3203
+ }];
3204
+ });
3205
+ const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
3206
+ if (showChunks) {
3207
+ const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
3208
+ return createSearchResponse(mode, topHits, Date.now() - startedAt);
3209
+ }
3210
+ const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
3211
+ const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
3212
+ const finalHits = reranked.map((result) => {
3213
+ const hit = byChunkId.get(result.chunkId);
3214
+ return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
3215
+ }).filter((hit) => hit != null);
3216
+ return createSearchResponse(mode, finalHits, Date.now() - startedAt);
2973
3217
  }
2974
3218
 
2975
3219
  // src/query/related-service.ts
@@ -3086,9 +3330,10 @@ async function createContext({
3086
3330
  retrievalMode
3087
3331
  }) {
3088
3332
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
3333
+ const results = searchResultsFromResponse(search, true);
3089
3334
  const sources = [];
3090
3335
  let total = 0;
3091
- for (const result of search.results) {
3336
+ for (const result of results) {
3092
3337
  const text = result.text ?? "";
3093
3338
  if (total + text.length > maxChars && sources.length > 0) {
3094
3339
  break;
@@ -3197,6 +3442,8 @@ export {
3197
3442
  renderChangeReport,
3198
3443
  reprocessDocuments,
3199
3444
  searchIndex,
3445
+ searchJsonIndex,
3446
+ searchResultsFromResponse,
3200
3447
  updateSource,
3201
3448
  writeDefaultConfig
3202
3449
  };
@@ -1,10 +1,17 @@
1
- import type { RetrievalMode, SearchResponseData } from "../types/models.js";
1
+ import { type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
2
+ import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
2
3
  type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
3
4
  type SearchDateRange = {
4
5
  field: SearchDateField;
5
6
  from?: string;
6
7
  to?: string;
7
8
  };
9
+ export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
10
+ export declare function searchJsonIndex({ workspacePath, request, indexName }: {
11
+ workspacePath: string;
12
+ request: JsonDslRequest;
13
+ indexName?: string;
14
+ }): Promise<JsonDslResponse>;
8
15
  export declare function searchIndex({ workspacePath, query, topK, sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges, retrievalMode, showChunks }: {
9
16
  workspacePath: string;
10
17
  query: string;
@@ -222,9 +222,44 @@ export type SearchResult = {
222
222
  lastChangedAt: string;
223
223
  metadata: Record<string, unknown>;
224
224
  };
225
+ export type SearchHitSource = {
226
+ chunkId: string;
227
+ documentId: string;
228
+ sourceId: string;
229
+ sourceType: SourceType;
230
+ sourceName?: string;
231
+ title: string;
232
+ uri: string;
233
+ headingPath: string[];
234
+ text: string;
235
+ snippet?: string;
236
+ normalizedPath?: string;
237
+ publicationDate?: string | null;
238
+ crawledAt?: string;
239
+ firstSeenAt: string;
240
+ lastSeenAt: string;
241
+ lastChangedAt: string;
242
+ metadata: Record<string, unknown>;
243
+ };
244
+ export type SearchHit = {
245
+ _index: string;
246
+ _id: string;
247
+ _score: number;
248
+ _source: SearchHitSource;
249
+ highlight?: Record<string, string[]>;
250
+ };
225
251
  export type SearchResponseData = {
226
252
  retrievalMode?: RetrievalMode;
227
- results: SearchResult[];
253
+ took: number;
254
+ hits: {
255
+ total: {
256
+ value: number;
257
+ relation: "eq";
258
+ };
259
+ max_score: number | null;
260
+ hits: SearchHit[];
261
+ };
262
+ aggregations?: Record<string, unknown>;
228
263
  };
229
264
  export type RelatedDocumentResult = {
230
265
  documentId: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tryformation/querylight-cli",
3
- "version": "0.2.2",
3
+ "version": "0.2.3",
4
4
  "description": "Querylight CLI for building and querying local knowledge bases.",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/formation-res/querylight-cli#readme",
@@ -40,7 +40,7 @@
40
40
  },
41
41
  "dependencies": {
42
42
  "@huggingface/transformers": "^3.8.1",
43
- "@tryformation/querylight-ts": "^0.10.0",
43
+ "@tryformation/querylight-ts": "^0.11.0",
44
44
  "cheerio": "^1.2.0",
45
45
  "cli-table3": "^0.6.5",
46
46
  "commander": "^14.0.3",