@tryformation/querylight-cli 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli/main.js CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  // src/cli/run-cli.ts
4
4
  import { Command, Option } from "commander";
5
- import { stat as stat4 } from "fs/promises";
5
+ import { readFile as readFile11, stat as stat4 } from "fs/promises";
6
6
  import path21 from "path";
7
7
 
8
8
  // src/chunk/chunker.ts
@@ -16,7 +16,7 @@ import path from "path";
16
16
  import YAML from "yaml";
17
17
 
18
18
  // src/core/constants.ts
19
- var PACKAGE_VERSION = "0.2.1";
19
+ var PACKAGE_VERSION = "0.2.3";
20
20
  var DEFAULT_WORKSPACE = ".kb";
21
21
  var DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
22
22
  var LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
@@ -374,7 +374,7 @@ async function assertWorkspaceExists(workspacePath) {
374
374
  }
375
375
 
376
376
  // src/index/querylight-indexer.ts
377
- import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
377
+ import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
378
378
  import path11 from "path";
379
379
 
380
380
  // src/vector/dense.ts
@@ -387,6 +387,7 @@ import os from "os";
387
387
  import path6 from "path";
388
388
  import { fileURLToPath } from "url";
389
389
  import { execFile, execFileSync } from "child_process";
390
+ import { mkdtemp, rm, writeFile as writeFile3 } from "fs/promises";
390
391
 
391
392
  // src/core/files.ts
392
393
  import { stat as stat2 } from "fs/promises";
@@ -400,6 +401,7 @@ async function fileExists(filePath) {
400
401
  }
401
402
 
402
403
  // src/vector/runtime.ts
404
+ var sparseExecFileSync = execFileSync;
403
405
  function resolveQliHomeDir() {
404
406
  return path6.resolve(process.env.QLI_HOME ?? path6.join(os.homedir(), ".qli"));
405
407
  }
@@ -455,29 +457,36 @@ async function runSparsePython({
455
457
  }) {
456
458
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
457
459
  const scriptPath = await sparseScriptPath(importMetaUrl);
458
- return execFileSync(
459
- "uv",
460
- [
461
- "run",
462
- "--with",
463
- "torch",
464
- "--with",
465
- "transformers",
466
- "--with",
467
- "huggingface_hub",
468
- "python",
469
- scriptPath
470
- ],
471
- {
472
- encoding: "utf8",
473
- maxBuffer: 1024 * 1024 * 1024,
474
- input: JSON.stringify(payload),
475
- env: {
476
- ...process.env,
477
- HF_HOME: cacheDir
460
+ const payloadDir = await mkdtemp(path6.join(os.tmpdir(), "qli-sparse-"));
461
+ const payloadPath = path6.join(payloadDir, "payload.json");
462
+ await writeFile3(payloadPath, JSON.stringify(payload), "utf8");
463
+ try {
464
+ return sparseExecFileSync(
465
+ "uv",
466
+ [
467
+ "run",
468
+ "--with",
469
+ "torch",
470
+ "--with",
471
+ "transformers",
472
+ "--with",
473
+ "huggingface_hub",
474
+ "python",
475
+ scriptPath,
476
+ payloadPath
477
+ ],
478
+ {
479
+ encoding: "utf8",
480
+ maxBuffer: 1024 * 1024 * 1024,
481
+ env: {
482
+ ...process.env,
483
+ HF_HOME: cacheDir
484
+ }
478
485
  }
479
- }
480
- );
486
+ );
487
+ } finally {
488
+ await rm(payloadDir, { recursive: true, force: true });
489
+ }
481
490
  }
482
491
  async function getDenseTransformersRuntime(cacheDir) {
483
492
  const transformers = await import("@huggingface/transformers");
@@ -490,18 +499,18 @@ async function getDenseTransformersRuntime(cacheDir) {
490
499
  }
491
500
 
492
501
  // src/vector/store.ts
493
- import { mkdir as mkdir3, rm, writeFile as writeFile4 } from "fs/promises";
502
+ import { mkdir as mkdir3, rm as rm2, writeFile as writeFile5 } from "fs/promises";
494
503
  import path7 from "path";
495
504
 
496
505
  // src/core/gzip-json.ts
497
- import { readFile as readFile4, writeFile as writeFile3 } from "fs/promises";
506
+ import { readFile as readFile4, writeFile as writeFile4 } from "fs/promises";
498
507
  import { promisify } from "util";
499
508
  import { gunzip, gzip } from "zlib";
500
509
  var gzipAsync = promisify(gzip);
501
510
  var gunzipAsync = promisify(gunzip);
502
511
  async function writeGzipJson(filePath, value) {
503
512
  const payload = JSON.stringify(value, null, 2);
504
- await writeFile3(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
513
+ await writeFile4(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
505
514
  }
506
515
  async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
507
516
  if (await fileExists(gzipPath)) {
@@ -570,8 +579,8 @@ async function writeDensePayload(workspacePath, payload) {
570
579
  await writeGzipJson(denseVectorPath(workspacePath), payload);
571
580
  await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
572
581
  await Promise.all([
573
- rm(legacyDenseVectorPath(workspacePath), { force: true }),
574
- rm(legacyDenseMetaPath(workspacePath), { force: true })
582
+ rm2(legacyDenseVectorPath(workspacePath), { force: true }),
583
+ rm2(legacyDenseMetaPath(workspacePath), { force: true })
575
584
  ]);
576
585
  }
577
586
  async function readDensePayload(workspacePath) {
@@ -582,8 +591,8 @@ async function writeSparsePayload(workspacePath, payload) {
582
591
  await writeGzipJson(sparseVectorPath(workspacePath), payload);
583
592
  await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
584
593
  await Promise.all([
585
- rm(legacySparseVectorPath(workspacePath), { force: true }),
586
- rm(legacySparseMetaPath(workspacePath), { force: true })
594
+ rm2(legacySparseVectorPath(workspacePath), { force: true }),
595
+ rm2(legacySparseMetaPath(workspacePath), { force: true })
587
596
  ]);
588
597
  }
589
598
  async function readSparsePayload(workspacePath) {
@@ -592,12 +601,12 @@ async function readSparsePayload(workspacePath) {
592
601
  async function writeDensePullMarker(workspacePath, model, value) {
593
602
  const markerPath = densePullMarker(workspacePath, model.modelId, model.cacheDir);
594
603
  await mkdir3(path7.dirname(markerPath), { recursive: true });
595
- await writeFile4(markerPath, JSON.stringify(value, null, 2), "utf8");
604
+ await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
596
605
  }
597
606
  async function writeSparsePullMarker(workspacePath, model, value) {
598
607
  const markerPath = sparsePullMarker(workspacePath, model.modelId, model.cacheDir);
599
608
  await mkdir3(path7.dirname(markerPath), { recursive: true });
600
- await writeFile4(markerPath, JSON.stringify(value, null, 2), "utf8");
609
+ await writeFile5(markerPath, JSON.stringify(value, null, 2), "utf8");
601
610
  }
602
611
  async function buildModelStatus(workspacePath, dense, sparse, uvAvailable) {
603
612
  const denseCacheDir = resolveCacheDir(workspacePath, dense.cacheDir);
@@ -1015,7 +1024,7 @@ async function getModelStatus(workspacePath, config) {
1015
1024
  }
1016
1025
 
1017
1026
  // src/index/index-store.ts
1018
- import { mkdir as mkdir6, rm as rm2 } from "fs/promises";
1027
+ import { mkdir as mkdir6, rm as rm3 } from "fs/promises";
1019
1028
  import path10 from "path";
1020
1029
  function versionedIndexPath(workspacePath, stamp) {
1021
1030
  return path10.join(workspacePath, "indexes", `${stamp}.json.gz`);
@@ -1057,10 +1066,10 @@ async function writeIndexArtifacts({
1057
1066
  await writeGzipJson(latestIndexArtifactPath, indexState);
1058
1067
  await writeGzipJson(latestMetadataArtifactPath, metadata);
1059
1068
  await Promise.all([
1060
- rm2(legacyLatestIndexPath(workspacePath), { force: true }),
1061
- rm2(legacyLatestMetaPath(workspacePath), { force: true }),
1062
- rm2(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
1063
- rm2(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
1069
+ rm3(legacyLatestIndexPath(workspacePath), { force: true }),
1070
+ rm3(legacyLatestMetaPath(workspacePath), { force: true }),
1071
+ rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
1072
+ rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
1064
1073
  ]);
1065
1074
  return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
1066
1075
  }
@@ -1082,12 +1091,19 @@ function keywordFieldIndex() {
1082
1091
  function createIndexMapping(extraFields = []) {
1083
1092
  const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
1084
1093
  const mapping = {
1094
+ _source: new StoredSourceIndex(),
1085
1095
  text: lexical,
1086
1096
  title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
1087
1097
  uri: keywordFieldIndex(),
1088
1098
  sourceId: keywordFieldIndex(),
1099
+ sourceName: keywordFieldIndex(),
1089
1100
  tags: keywordFieldIndex(),
1090
- sourceType: keywordFieldIndex()
1101
+ sourceType: keywordFieldIndex(),
1102
+ publicationDate: new DateFieldIndex(),
1103
+ firstSeenAt: new DateFieldIndex(),
1104
+ lastSeenAt: new DateFieldIndex(),
1105
+ lastChangedAt: new DateFieldIndex(),
1106
+ crawledAt: new DateFieldIndex()
1091
1107
  };
1092
1108
  for (const field of extraFields) {
1093
1109
  mapping[field] = keywordFieldIndex();
@@ -1123,8 +1139,12 @@ async function buildIndex({
1123
1139
  const sources = await readJsonl(path11.join(workspacePath, "sources", "sources.jsonl"));
1124
1140
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
1125
1141
  const index = new DocumentIndex(createIndexMapping(metadataFields));
1142
+ const documentsById = new Map(documents.map((document) => [document.id, document]));
1143
+ const sourcesById = new Map(sources.map((source) => [source.id, source]));
1126
1144
  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
1127
1145
  for (const chunk of chunks) {
1146
+ const document = documentsById.get(chunk.documentId);
1147
+ const source = sourcesById.get(chunk.sourceId);
1128
1148
  index.index({
1129
1149
  id: chunk.id,
1130
1150
  fields: {
@@ -1132,9 +1152,33 @@ async function buildIndex({
1132
1152
  title: [chunk.title],
1133
1153
  uri: [chunk.uri.toLowerCase()],
1134
1154
  sourceId: [chunk.sourceId.toLowerCase()],
1155
+ sourceName: source ? [source.name.toLowerCase()] : [],
1135
1156
  tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
1136
1157
  sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
1158
+ publicationDate: document?.publicationDate ? [document.publicationDate] : [],
1159
+ firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
1160
+ lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
1161
+ lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
1162
+ crawledAt: document?.crawledAt ? [document.crawledAt] : [],
1137
1163
  ...flattenMetadata(chunk.metadata)
1164
+ },
1165
+ source: {
1166
+ chunkId: chunk.id,
1167
+ documentId: chunk.documentId,
1168
+ sourceId: chunk.sourceId,
1169
+ sourceType: document?.sourceType ?? "text",
1170
+ sourceName: source?.name,
1171
+ title: chunk.title,
1172
+ uri: chunk.uri,
1173
+ headingPath: chunk.headingPath,
1174
+ text: chunk.text,
1175
+ normalizedPath: document?.normalizedPath,
1176
+ publicationDate: document?.publicationDate ?? null,
1177
+ crawledAt: document?.crawledAt,
1178
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
1179
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
1180
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
1181
+ metadata: chunk.metadata
1138
1182
  }
1139
1183
  });
1140
1184
  }
@@ -1143,7 +1187,7 @@ async function buildIndex({
1143
1187
  const metadata = {
1144
1188
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
1145
1189
  createdAt,
1146
- querylightVersion: "0.10.0",
1190
+ querylightVersion: "0.11.0",
1147
1191
  kbVersion: "0.1.0",
1148
1192
  documentCount: documents.length,
1149
1193
  chunkCount: chunks.length,
@@ -1262,7 +1306,7 @@ async function removeSource(workspacePath, sourceId) {
1262
1306
  }
1263
1307
 
1264
1308
  // src/ingest/document-utils.ts
1265
- import { mkdir as mkdir7, rm as rm3, writeFile as writeFile5 } from "fs/promises";
1309
+ import { mkdir as mkdir7, rm as rm4, writeFile as writeFile6 } from "fs/promises";
1266
1310
  import path14 from "path";
1267
1311
 
1268
1312
  // src/normalize/normalize-markdown.ts
@@ -1316,7 +1360,7 @@ async function writeNormalizedDocument({
1316
1360
  markdown
1317
1361
  }) {
1318
1362
  await mkdir7(path14.dirname(normalizedPath), { recursive: true });
1319
- await writeFile5(
1363
+ await writeFile6(
1320
1364
  normalizedPath,
1321
1365
  withFrontmatter(
1322
1366
  {
@@ -1338,8 +1382,8 @@ async function writeNormalizedDocument({
1338
1382
  }
1339
1383
  async function deleteDocumentArtifacts(document) {
1340
1384
  await Promise.all([
1341
- document.rawPath ? rm3(document.rawPath, { force: true }) : Promise.resolve(),
1342
- rm3(document.normalizedPath, { force: true })
1385
+ document.rawPath ? rm4(document.rawPath, { force: true }) : Promise.resolve(),
1386
+ rm4(document.normalizedPath, { force: true })
1343
1387
  ]);
1344
1388
  }
1345
1389
 
@@ -1363,7 +1407,7 @@ async function listDirectoryFiles(source) {
1363
1407
 
1364
1408
  // src/ingest/adapters/file-adapter.ts
1365
1409
  import { basename, extname, resolve } from "path";
1366
- import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile6 } from "fs/promises";
1410
+ import { mkdir as mkdir8, readFile as readFile8, stat as stat3, writeFile as writeFile7 } from "fs/promises";
1367
1411
 
1368
1412
  // src/ingest/extractors/docx-extractor.ts
1369
1413
  import mammoth from "mammoth";
@@ -1653,7 +1697,7 @@ async function ingestFile({
1653
1697
  await mkdir8(resolve(workspacePath, "normalized"), { recursive: true });
1654
1698
  await mkdir8(resolve(workspacePath, "raw", source.id), { recursive: true });
1655
1699
  if (extracted.raw) {
1656
- await writeFile6(rawPath, extracted.raw, "utf8");
1700
+ await writeFile7(rawPath, extracted.raw, "utf8");
1657
1701
  }
1658
1702
  await writeNormalizedDocument({
1659
1703
  documentId,
@@ -1877,7 +1921,7 @@ async function parseRssFeedDocument(xml, source) {
1877
1921
  }
1878
1922
 
1879
1923
  // src/ingest/adapters/url-adapter.ts
1880
- import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
1924
+ import { mkdir as mkdir9, readFile as readFile9, writeFile as writeFile8 } from "fs/promises";
1881
1925
  import path16 from "path";
1882
1926
 
1883
1927
  // src/core/urls.ts
@@ -1930,7 +1974,7 @@ ${extracted.markdown}`;
1930
1974
  const crawledAt = now;
1931
1975
  const resolvedPublicationDate = choosePublicationDate(publicationDate, extractPublicationDateFromHtml(body), previous?.publicationDate);
1932
1976
  await mkdir9(path16.resolve(workspacePath, "raw", source.id), { recursive: true });
1933
- await writeFile7(rawPath, body, "utf8");
1977
+ await writeFile8(rawPath, body, "utf8");
1934
1978
  await writeNormalizedDocument({
1935
1979
  documentId,
1936
1980
  sourceId: source.id,
@@ -2769,7 +2813,7 @@ async function discoverWebsiteFeed(websiteUrl, userAgent) {
2769
2813
 
2770
2814
  // src/query/search-service.ts
2771
2815
  import { readFile as readFile10 } from "fs/promises";
2772
- import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2816
+ import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
2773
2817
  import path18 from "path";
2774
2818
  async function loadHydratedIndex(workspacePath) {
2775
2819
  let state;
@@ -2797,24 +2841,6 @@ function matchesPrefix(value, prefixes) {
2797
2841
  const lower = value.toLowerCase();
2798
2842
  return prefixes.some((prefix) => lower.startsWith(prefix));
2799
2843
  }
2800
- function buildSearchQuery(query, filters) {
2801
- const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2802
- const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2803
- const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2804
- return new BoolQuery({
2805
- should: [
2806
- new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
2807
- new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
2808
- new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
2809
- ],
2810
- filter: [
2811
- ...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
2812
- ...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
2813
- ...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
2814
- ...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
2815
- ]
2816
- });
2817
- }
2818
2844
  function isValidDate(value) {
2819
2845
  return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
2820
2846
  }
@@ -3013,6 +3039,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
3013
3039
  }
3014
3040
  return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
3015
3041
  }
3042
+ function buildSearchDslRequest({
3043
+ query,
3044
+ topK,
3045
+ filters,
3046
+ dateRanges
3047
+ }) {
3048
+ const filterClauses = [];
3049
+ const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
3050
+ const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
3051
+ const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
3052
+ const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
3053
+ const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
3054
+ if (sourceIds.length > 0) {
3055
+ filterClauses.push({ terms: { sourceId: sourceIds } });
3056
+ }
3057
+ if (sourceNames.length > 0) {
3058
+ filterClauses.push({ terms: { sourceName: sourceNames } });
3059
+ }
3060
+ if (sourceTypes.length > 0) {
3061
+ filterClauses.push({ terms: { sourceType: sourceTypes } });
3062
+ }
3063
+ if (uriPrefixes.length > 0) {
3064
+ filterClauses.push({
3065
+ bool: {
3066
+ should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
3067
+ minimum_should_match: 1
3068
+ }
3069
+ });
3070
+ }
3071
+ if (tags.length > 0) {
3072
+ filterClauses.push({ terms: { tags } });
3073
+ }
3074
+ if (filters.hasPublicationDate) {
3075
+ filterClauses.push({ exists: { field: "publicationDate" } });
3076
+ }
3077
+ for (const { key, value } of filters.metadata ?? []) {
3078
+ filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
3079
+ }
3080
+ for (const { field, from, to } of dateRanges) {
3081
+ filterClauses.push({
3082
+ range: {
3083
+ [field]: {
3084
+ ...from ? { gte: from } : {},
3085
+ ...to ? { lte: to } : {}
3086
+ }
3087
+ }
3088
+ });
3089
+ }
3090
+ return {
3091
+ size: topK,
3092
+ query: {
3093
+ bool: {
3094
+ should: [
3095
+ { match: { title: { query, operator: "and", boost: 6 } } },
3096
+ { match: { text: { query, operator: "and", boost: 4 } } },
3097
+ { match: { text: { query, operator: "or", boost: 2 } } }
3098
+ ],
3099
+ filter: filterClauses,
3100
+ minimum_should_match: 1
3101
+ }
3102
+ }
3103
+ };
3104
+ }
3105
+ function sourceToChunkRecord(source) {
3106
+ return {
3107
+ id: source.chunkId,
3108
+ documentId: source.documentId,
3109
+ sourceId: source.sourceId,
3110
+ title: source.title,
3111
+ uri: source.uri,
3112
+ headingPath: source.headingPath,
3113
+ text: source.text,
3114
+ contentHash: "",
3115
+ metadata: source.metadata,
3116
+ firstSeenAt: source.firstSeenAt,
3117
+ lastSeenAt: source.lastSeenAt,
3118
+ lastChangedAt: source.lastChangedAt
3119
+ };
3120
+ }
3121
+ function sourceToDocumentRecord(source) {
3122
+ return {
3123
+ id: source.documentId,
3124
+ sourceId: source.sourceId,
3125
+ sourceType: source.sourceType,
3126
+ title: source.title,
3127
+ uri: source.uri,
3128
+ sourceUri: source.uri,
3129
+ mimeType: "text/plain",
3130
+ normalizedPath: source.normalizedPath ?? "",
3131
+ contentHash: "",
3132
+ metadata: source.metadata,
3133
+ publicationDate: source.publicationDate ?? null,
3134
+ crawledAt: source.crawledAt,
3135
+ firstSeenAt: source.firstSeenAt,
3136
+ lastSeenAt: source.lastSeenAt,
3137
+ lastChangedAt: source.lastChangedAt
3138
+ };
3139
+ }
3140
+ async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
3141
+ const source = hit._source;
3142
+ const chunk = sourceToChunkRecord(source);
3143
+ const document = sourceToDocumentRecord(source);
3144
+ const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
3145
+ const enrichedSource = {
3146
+ ...source,
3147
+ snippet
3148
+ };
3149
+ const result = {
3150
+ chunkId: source.chunkId,
3151
+ documentId: source.documentId,
3152
+ sourceId: source.sourceId,
3153
+ sourceType: source.sourceType,
3154
+ score: hit._score,
3155
+ title: chooseResultTitle(chunk),
3156
+ uri: source.uri,
3157
+ snippet,
3158
+ text: showChunks ? source.text : void 0,
3159
+ publicationDate: source.publicationDate ?? null,
3160
+ firstSeenAt: source.firstSeenAt,
3161
+ lastSeenAt: source.lastSeenAt,
3162
+ lastChangedAt: source.lastChangedAt,
3163
+ metadata: source.metadata
3164
+ };
3165
+ return {
3166
+ hit: {
3167
+ ...hit,
3168
+ _source: enrichedSource
3169
+ },
3170
+ result
3171
+ };
3172
+ }
3173
+ function createSearchResponse(retrievalMode, hits, took, aggregations) {
3174
+ return {
3175
+ retrievalMode,
3176
+ took,
3177
+ hits: {
3178
+ total: {
3179
+ value: hits.length,
3180
+ relation: "eq"
3181
+ },
3182
+ max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
3183
+ hits
3184
+ },
3185
+ aggregations
3186
+ };
3187
+ }
3188
+ function searchResultsFromResponse(response2, showChunks = false) {
3189
+ return response2.hits.hits.map((hit) => ({
3190
+ chunkId: hit._source.chunkId,
3191
+ documentId: hit._source.documentId,
3192
+ sourceId: hit._source.sourceId,
3193
+ sourceType: hit._source.sourceType,
3194
+ score: hit._score,
3195
+ title: chooseResultTitle(sourceToChunkRecord(hit._source)),
3196
+ uri: hit._source.uri,
3197
+ snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
3198
+ text: showChunks ? hit._source.text : void 0,
3199
+ publicationDate: hit._source.publicationDate ?? null,
3200
+ firstSeenAt: hit._source.firstSeenAt,
3201
+ lastSeenAt: hit._source.lastSeenAt,
3202
+ lastChangedAt: hit._source.lastChangedAt,
3203
+ metadata: hit._source.metadata
3204
+ }));
3205
+ }
3206
+ async function searchJsonIndex({
3207
+ workspacePath,
3208
+ request,
3209
+ indexName = "querylight"
3210
+ }) {
3211
+ const index = await loadHydratedIndex(workspacePath);
3212
+ return searchJsonDsl({ index, request, indexName });
3213
+ }
3016
3214
  function normalizeDisplayTitle(title) {
3017
3215
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
3018
3216
  }
@@ -3150,6 +3348,7 @@ async function searchIndex({
3150
3348
  retrievalMode,
3151
3349
  showChunks = false
3152
3350
  }) {
3351
+ const startedAt = Date.now();
3153
3352
  const config = await loadConfig(workspacePath);
3154
3353
  const mode = retrievalMode ?? config.retrieval.defaultMode;
3155
3354
  const candidateLimit = Math.max(topK * 5, 50);
@@ -3206,12 +3405,48 @@ async function searchIndex({
3206
3405
  };
3207
3406
  })
3208
3407
  );
3209
- return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
3408
+ const hits2 = latestResults.filter((result) => result != null).map((result) => {
3409
+ const chunk = chunks.get(result.chunkId);
3410
+ const document = documents.get(result.documentId);
3411
+ const source = sources.get(result.sourceId);
3412
+ return {
3413
+ _index: "querylight",
3414
+ _id: result.chunkId,
3415
+ _score: result.score,
3416
+ _source: {
3417
+ chunkId: result.chunkId,
3418
+ documentId: result.documentId,
3419
+ sourceId: result.sourceId,
3420
+ sourceType: result.sourceType,
3421
+ sourceName: source?.name,
3422
+ title: chunk.title,
3423
+ uri: result.uri,
3424
+ headingPath: chunk.headingPath,
3425
+ text: chunk.text,
3426
+ snippet: result.snippet,
3427
+ normalizedPath: document.normalizedPath,
3428
+ publicationDate: result.publicationDate ?? null,
3429
+ crawledAt: document.crawledAt,
3430
+ firstSeenAt: result.firstSeenAt,
3431
+ lastSeenAt: result.lastSeenAt,
3432
+ lastChangedAt: result.lastChangedAt,
3433
+ metadata: result.metadata
3434
+ }
3435
+ };
3436
+ });
3437
+ return createSearchResponse("lexical", hits2, Date.now() - startedAt);
3210
3438
  }
3211
3439
  const lexicalHits = async () => {
3212
- const index = await loadHydratedIndex(workspacePath);
3213
- const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
3214
- return all.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit);
3440
+ const response2 = await searchJsonIndex({
3441
+ workspacePath,
3442
+ request: buildSearchDslRequest({
3443
+ query: normalizedQuery,
3444
+ topK: candidateLimit,
3445
+ filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
3446
+ dateRanges
3447
+ })
3448
+ });
3449
+ return response2.hits.hits;
3215
3450
  };
3216
3451
  const denseHits = async () => {
3217
3452
  if (!await fileExists(denseVectorPath(workspacePath))) {
@@ -3225,15 +3460,18 @@ async function searchIndex({
3225
3460
  }
3226
3461
  return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
3227
3462
  };
3463
+ let lexicalResponseHits = [];
3228
3464
  let hits;
3229
3465
  if (mode === "lexical") {
3230
- hits = await lexicalHits();
3466
+ lexicalResponseHits = await lexicalHits();
3467
+ hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
3231
3468
  } else if (mode === "dense") {
3232
3469
  hits = await denseHits();
3233
3470
  } else if (mode === "sparse") {
3234
3471
  hits = await sparseHits();
3235
3472
  } else {
3236
- const rankings = [await lexicalHits()];
3473
+ lexicalResponseHits = await lexicalHits();
3474
+ const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
3237
3475
  if (await fileExists(denseVectorPath(workspacePath))) {
3238
3476
  rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
3239
3477
  }
@@ -3242,34 +3480,49 @@ async function searchIndex({
3242
3480
  }
3243
3481
  hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
3244
3482
  }
3245
- const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
3483
+ const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
3246
3484
  const chunk = chunks.get(chunkId);
3247
3485
  if (!chunk) {
3248
- return null;
3486
+ return [];
3249
3487
  }
3250
- return {
3251
- chunkId,
3252
- documentId: chunk.documentId,
3253
- sourceId: chunk.sourceId,
3254
- sourceType: documents.get(chunk.documentId)?.sourceType ?? "text",
3255
- score,
3256
- title: chooseResultTitle(chunk),
3257
- uri: chunk.uri,
3258
- snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
3259
- document: documents.get(chunk.documentId),
3260
- config,
3261
- orderedChunkCache
3262
- }),
3263
- text: showChunks ? chunk.text : void 0,
3264
- publicationDate: documents.get(chunk.documentId)?.publicationDate ?? null,
3265
- firstSeenAt: documents.get(chunk.documentId)?.firstSeenAt ?? chunk.firstSeenAt,
3266
- lastSeenAt: documents.get(chunk.documentId)?.lastSeenAt ?? chunk.lastSeenAt,
3267
- lastChangedAt: documents.get(chunk.documentId)?.lastChangedAt ?? chunk.lastChangedAt,
3268
- metadata: chunk.metadata
3269
- };
3270
- }));
3271
- const results = rawResults.filter((result) => result != null);
3272
- return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
3488
+ const document = documents.get(chunk.documentId);
3489
+ const source = sources.get(chunk.sourceId);
3490
+ return [{
3491
+ _index: "querylight",
3492
+ _id: chunkId,
3493
+ _score: score,
3494
+ _source: {
3495
+ chunkId,
3496
+ documentId: chunk.documentId,
3497
+ sourceId: chunk.sourceId,
3498
+ sourceType: document?.sourceType ?? "text",
3499
+ sourceName: source?.name,
3500
+ title: chunk.title,
3501
+ uri: chunk.uri,
3502
+ headingPath: chunk.headingPath,
3503
+ text: chunk.text,
3504
+ normalizedPath: document?.normalizedPath,
3505
+ publicationDate: document?.publicationDate ?? null,
3506
+ crawledAt: document?.crawledAt,
3507
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
3508
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
3509
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
3510
+ metadata: chunk.metadata
3511
+ }
3512
+ }];
3513
+ });
3514
+ const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
3515
+ if (showChunks) {
3516
+ const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
3517
+ return createSearchResponse(mode, topHits, Date.now() - startedAt);
3518
+ }
3519
+ const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
3520
+ const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
3521
+ const finalHits = reranked.map((result) => {
3522
+ const hit = byChunkId.get(result.chunkId);
3523
+ return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
3524
+ }).filter((hit) => hit != null);
3525
+ return createSearchResponse(mode, finalHits, Date.now() - startedAt);
3273
3526
  }
3274
3527
 
3275
3528
  // src/query/related-service.ts
@@ -3386,9 +3639,10 @@ async function createContext({
3386
3639
  retrievalMode
3387
3640
  }) {
3388
3641
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
3642
+ const results = searchResultsFromResponse(search, true);
3389
3643
  const sources = [];
3390
3644
  let total = 0;
3391
- for (const result of search.results) {
3645
+ for (const result of results) {
3392
3646
  const text = result.text ?? "";
3393
3647
  if (total + text.length > maxChars && sources.length > 0) {
3394
3648
  break;
@@ -3489,7 +3743,8 @@ function formatSourcesTable(sources) {
3489
3743
  }
3490
3744
  return table.toString();
3491
3745
  }
3492
- function formatSearchResults(results) {
3746
+ function formatSearchResults(response2) {
3747
+ const results = searchResultsFromResponse(response2);
3493
3748
  return results.map((result, index) => [
3494
3749
  `${index + 1}. ${colors.bold(result.title)}`,
3495
3750
  ` URL: ${result.uri}`,
@@ -3794,6 +4049,19 @@ function parseDateValue(input, optionName) {
3794
4049
  }
3795
4050
  return parsed.toISOString();
3796
4051
  }
4052
+ async function parseJsonArgument(input) {
4053
+ const raw = input.startsWith("@") ? await readFile11(path21.resolve(input.slice(1)), "utf8") : input;
4054
+ try {
4055
+ const parsed = JSON.parse(raw);
4056
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
4057
+ throw new Error("expected a JSON object");
4058
+ }
4059
+ return parsed;
4060
+ } catch (error) {
4061
+ const message = error instanceof Error ? error.message : String(error);
4062
+ throw new CliError(`invalid JSON request: ${message}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
4063
+ }
4064
+ }
3797
4065
  function searchDateRanges(options) {
3798
4066
  const entries = [];
3799
4067
  if (options.since || options.until) {
@@ -4138,7 +4406,7 @@ Examples:
4138
4406
  progress?.("info", "Rebuild complete");
4139
4407
  emit(global.json, capture, response("rebuild", workspace, data), `Processed ${ingest.processedSources} sources, wrote ${chunk.chunksWritten} chunks`);
4140
4408
  });
4141
- program.command("search").description("Search the built index and return ranked matching documents or chunks.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
4409
+ program.command("search").description("Search the built index and return ranked matching documents or chunks. Use search-json for raw JSON DSL queries.").argument("[query]", "Text query. Omit it to list the latest matching documents.").option("--top-k <n>", "Maximum number of results to return.", "12").option("--source <sourceIds>", "Restrict results to one or more source ids. Use comma-separated values.").option("--source-name <names>", "Restrict results to one or more source names. Use comma-separated values.").option("--source-type <types>", `Restrict results to one or more source types. Use comma-separated values: ${SOURCE_TYPE_LIST.join(", ")}`).option("--uri-prefix <prefixes>", "Restrict results to one or more URI prefixes. Use comma-separated values.").option("--tag <tags>", "Restrict results to one or more source tags. Use comma-separated values.").option("--metadata <key=value...>", "Restrict results to sources with matching metadata.").option("--since <date>", "Shortcut for --publication-date-from.").option("--until <date>", "Shortcut for --publication-date-to.").option("--changed-since <date>", "Only include documents changed on or after this date.").option("--has-publication-date", "Only include documents with a publication date.").option("--publication-date-from <date>", "Only include documents published on or after this date.").option("--publication-date-to <date>", "Only include documents published on or before this date.").option("--first-seen-at-from <date>", "Only include documents first seen on or after this date.").option("--first-seen-at-to <date>", "Only include documents first seen on or before this date.").option("--last-seen-at-from <date>", "Only include documents last seen on or after this date.").option("--last-seen-at-to <date>", "Only include documents last seen on or before this date.").option("--last-changed-at-from <date>", "Only include documents changed on or after this date.").option("--last-changed-at-to <date>", "Only include documents changed on or before this date.").option("--crawled-at-from <date>", "Only include documents crawled on or after this date.").option("--crawled-at-to <date>", "Only include documents crawled on or before this date.").option("--retrieval <mode>", `Retrieval mode: ${RETRIEVAL_MODE_LIST.join(", ")}`).option("--show-chunks", "Return chunk-level matches when available.").addHelpText("after", `
4142
4410
  Examples:
4143
4411
  qli search "pricing api limits"
4144
4412
  qli search "authentication" --top-k 20 --tag docs
@@ -4151,6 +4419,7 @@ Examples:
4151
4419
  Notes:
4152
4420
  lexical works without vector models.
4153
4421
  dense, sparse, and hybrid require the relevant index artifacts to exist.
4422
+ Use search-json when you want the raw Querylight 0.11 JSON DSL and hit format.
4154
4423
  When you omit the query, qli returns the latest matching documents sorted by publication date.`).action(async function command(query, options) {
4155
4424
  const global = this.optsWithGlobals();
4156
4425
  const workspace = await resolveWorkspace({ workspace: global.workspace });
@@ -4169,7 +4438,26 @@ Notes:
4169
4438
  retrievalMode: parseRetrievalMode(options.retrieval),
4170
4439
  showChunks: Boolean(options.showChunks)
4171
4440
  });
4172
- emit(global.json, capture, response("search", workspace, result), formatSearchResults(result.results));
4441
+ emit(global.json, capture, response("search", workspace, result), formatSearchResults(result));
4442
+ });
4443
+ program.command("search-json").description("Run a raw Querylight 0.11 JSON DSL search request against the lexical index.").argument("<request>", "Inline JSON request or @path/to/request.json.").addHelpText("after", `
4444
+ Examples:
4445
+ qli search-json '{"query":{"match":{"text":"authentication"}},"size":5}'
4446
+ qli search-json @./search-request.json
4447
+ qli search-json '{"query":{"bool":{"filter":[{"term":{"sourceType":"rss"}}]}},"aggs":{"types":{"terms":{"field":"sourceType","size":5}}}}' --json
4448
+
4449
+ Notes:
4450
+ search-json uses the lexical index and Querylight 0.11 JSON DSL fields.
4451
+ Stored hit payloads are returned under _source.
4452
+ Use --json when another tool needs the full response envelope.`).action(async function command(requestInput) {
4453
+ const global = this.optsWithGlobals();
4454
+ const workspace = await resolveWorkspace({ workspace: global.workspace });
4455
+ const request = await parseJsonArgument(requestInput);
4456
+ const result = await searchJsonIndex({
4457
+ workspacePath: workspace,
4458
+ request
4459
+ });
4460
+ emit(global.json, capture, response("search-json", workspace, result), JSON.stringify(result, null, 2));
4173
4461
  });
4174
4462
  program.command("related").description("Find documents similar to an existing document by id or URI.").argument("<document>", "Document id, uri, or canonical uri").option("--top-k <n>", "Maximum number of related documents to return.", "12").addHelpText("after", `
4175
4463
  Examples: