@tryformation/querylight-cli 0.2.1 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  export declare const PACKAGE_NAME = "@tryformation/querylight-cli";
2
- export declare const PACKAGE_VERSION = "0.2.1";
2
+ export declare const PACKAGE_VERSION = "0.2.3";
3
3
  export declare const DEFAULT_WORKSPACE = ".kb";
4
4
  export declare const DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
5
5
  export declare const LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
@@ -1,7 +1,7 @@
1
- import { TextFieldIndex } from "@tryformation/querylight-ts";
1
+ import { type FieldIndex } from "@tryformation/querylight-ts";
2
2
  import { type ProgressHandler } from "../core/progress.js";
3
3
  import type { IndexMetadata } from "../types/models.js";
4
- export declare function createIndexMapping(extraFields?: string[]): Record<string, TextFieldIndex>;
4
+ export declare function createIndexMapping(extraFields?: string[]): Record<string, FieldIndex>;
5
5
  export declare function buildIndex({ workspacePath, denseOverride, sparseOverride, buildAvailableModels, progress }: {
6
6
  workspacePath: string;
7
7
  denseOverride?: boolean;
package/dist/index.js CHANGED
@@ -1782,7 +1782,7 @@ async function chunkDocuments({
1782
1782
  }
1783
1783
 
1784
1784
  // src/index/querylight-indexer.ts
1785
- import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
1785
+ import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
1786
1786
  import path17 from "path";
1787
1787
 
1788
1788
  // src/vector/dense.ts
@@ -1795,6 +1795,8 @@ import os from "os";
1795
1795
  import path12 from "path";
1796
1796
  import { fileURLToPath } from "url";
1797
1797
  import { execFile, execFileSync } from "child_process";
1798
+ import { mkdtemp, rm as rm2, writeFile as writeFile6 } from "fs/promises";
1799
+ var sparseExecFileSync = execFileSync;
1798
1800
  function resolveQliHomeDir() {
1799
1801
  return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
1800
1802
  }
@@ -1850,29 +1852,36 @@ async function runSparsePython({
1850
1852
  }) {
1851
1853
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
1852
1854
  const scriptPath = await sparseScriptPath(importMetaUrl);
1853
- return execFileSync(
1854
- "uv",
1855
- [
1856
- "run",
1857
- "--with",
1858
- "torch",
1859
- "--with",
1860
- "transformers",
1861
- "--with",
1862
- "huggingface_hub",
1863
- "python",
1864
- scriptPath
1865
- ],
1866
- {
1867
- encoding: "utf8",
1868
- maxBuffer: 1024 * 1024 * 1024,
1869
- input: JSON.stringify(payload),
1870
- env: {
1871
- ...process.env,
1872
- HF_HOME: cacheDir
1855
+ const payloadDir = await mkdtemp(path12.join(os.tmpdir(), "qli-sparse-"));
1856
+ const payloadPath = path12.join(payloadDir, "payload.json");
1857
+ await writeFile6(payloadPath, JSON.stringify(payload), "utf8");
1858
+ try {
1859
+ return sparseExecFileSync(
1860
+ "uv",
1861
+ [
1862
+ "run",
1863
+ "--with",
1864
+ "torch",
1865
+ "--with",
1866
+ "transformers",
1867
+ "--with",
1868
+ "huggingface_hub",
1869
+ "python",
1870
+ scriptPath,
1871
+ payloadPath
1872
+ ],
1873
+ {
1874
+ encoding: "utf8",
1875
+ maxBuffer: 1024 * 1024 * 1024,
1876
+ env: {
1877
+ ...process.env,
1878
+ HF_HOME: cacheDir
1879
+ }
1873
1880
  }
1874
- }
1875
- );
1881
+ );
1882
+ } finally {
1883
+ await rm2(payloadDir, { recursive: true, force: true });
1884
+ }
1876
1885
  }
1877
1886
  async function getDenseTransformersRuntime(cacheDir) {
1878
1887
  const transformers = await import("@huggingface/transformers");
@@ -1885,18 +1894,18 @@ async function getDenseTransformersRuntime(cacheDir) {
1885
1894
  }
1886
1895
 
1887
1896
  // src/vector/store.ts
1888
- import { mkdir as mkdir6, rm as rm2, writeFile as writeFile7 } from "fs/promises";
1897
+ import { mkdir as mkdir6, rm as rm3, writeFile as writeFile8 } from "fs/promises";
1889
1898
  import path13 from "path";
1890
1899
 
1891
1900
  // src/core/gzip-json.ts
1892
- import { readFile as readFile9, writeFile as writeFile6 } from "fs/promises";
1901
+ import { readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
1893
1902
  import { promisify } from "util";
1894
1903
  import { gunzip, gzip } from "zlib";
1895
1904
  var gzipAsync = promisify(gzip);
1896
1905
  var gunzipAsync = promisify(gunzip);
1897
1906
  async function writeGzipJson(filePath, value) {
1898
1907
  const payload = JSON.stringify(value, null, 2);
1899
- await writeFile6(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
1908
+ await writeFile7(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
1900
1909
  }
1901
1910
  async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
1902
1911
  if (await fileExists(gzipPath)) {
@@ -1956,8 +1965,8 @@ async function writeDensePayload(workspacePath, payload) {
1956
1965
  await writeGzipJson(denseVectorPath(workspacePath), payload);
1957
1966
  await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
1958
1967
  await Promise.all([
1959
- rm2(legacyDenseVectorPath(workspacePath), { force: true }),
1960
- rm2(legacyDenseMetaPath(workspacePath), { force: true })
1968
+ rm3(legacyDenseVectorPath(workspacePath), { force: true }),
1969
+ rm3(legacyDenseMetaPath(workspacePath), { force: true })
1961
1970
  ]);
1962
1971
  }
1963
1972
  async function readDensePayload(workspacePath) {
@@ -1968,8 +1977,8 @@ async function writeSparsePayload(workspacePath, payload) {
1968
1977
  await writeGzipJson(sparseVectorPath(workspacePath), payload);
1969
1978
  await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
1970
1979
  await Promise.all([
1971
- rm2(legacySparseVectorPath(workspacePath), { force: true }),
1972
- rm2(legacySparseMetaPath(workspacePath), { force: true })
1980
+ rm3(legacySparseVectorPath(workspacePath), { force: true }),
1981
+ rm3(legacySparseMetaPath(workspacePath), { force: true })
1973
1982
  ]);
1974
1983
  }
1975
1984
  async function readSparsePayload(workspacePath) {
@@ -2309,7 +2318,7 @@ async function buildVectorArtifacts({
2309
2318
  }
2310
2319
 
2311
2320
  // src/index/index-store.ts
2312
- import { mkdir as mkdir9, rm as rm3 } from "fs/promises";
2321
+ import { mkdir as mkdir9, rm as rm4 } from "fs/promises";
2313
2322
  import path16 from "path";
2314
2323
  function versionedIndexPath(workspacePath, stamp) {
2315
2324
  return path16.join(workspacePath, "indexes", `${stamp}.json.gz`);
@@ -2351,10 +2360,10 @@ async function writeIndexArtifacts({
2351
2360
  await writeGzipJson(latestIndexArtifactPath, indexState);
2352
2361
  await writeGzipJson(latestMetadataArtifactPath, metadata);
2353
2362
  await Promise.all([
2354
- rm3(legacyLatestIndexPath(workspacePath), { force: true }),
2355
- rm3(legacyLatestMetaPath(workspacePath), { force: true }),
2356
- rm3(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
2357
- rm3(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
2363
+ rm4(legacyLatestIndexPath(workspacePath), { force: true }),
2364
+ rm4(legacyLatestMetaPath(workspacePath), { force: true }),
2365
+ rm4(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
2366
+ rm4(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
2358
2367
  ]);
2359
2368
  return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
2360
2369
  }
@@ -2370,12 +2379,19 @@ function keywordFieldIndex() {
2370
2379
  function createIndexMapping(extraFields = []) {
2371
2380
  const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
2372
2381
  const mapping = {
2382
+ _source: new StoredSourceIndex(),
2373
2383
  text: lexical,
2374
2384
  title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
2375
2385
  uri: keywordFieldIndex(),
2376
2386
  sourceId: keywordFieldIndex(),
2387
+ sourceName: keywordFieldIndex(),
2377
2388
  tags: keywordFieldIndex(),
2378
- sourceType: keywordFieldIndex()
2389
+ sourceType: keywordFieldIndex(),
2390
+ publicationDate: new DateFieldIndex(),
2391
+ firstSeenAt: new DateFieldIndex(),
2392
+ lastSeenAt: new DateFieldIndex(),
2393
+ lastChangedAt: new DateFieldIndex(),
2394
+ crawledAt: new DateFieldIndex()
2379
2395
  };
2380
2396
  for (const field of extraFields) {
2381
2397
  mapping[field] = keywordFieldIndex();
@@ -2411,8 +2427,12 @@ async function buildIndex({
2411
2427
  const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
2412
2428
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
2413
2429
  const index = new DocumentIndex(createIndexMapping(metadataFields));
2430
+ const documentsById = new Map(documents.map((document) => [document.id, document]));
2431
+ const sourcesById = new Map(sources.map((source) => [source.id, source]));
2414
2432
  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
2415
2433
  for (const chunk of chunks) {
2434
+ const document = documentsById.get(chunk.documentId);
2435
+ const source = sourcesById.get(chunk.sourceId);
2416
2436
  index.index({
2417
2437
  id: chunk.id,
2418
2438
  fields: {
@@ -2420,9 +2440,33 @@ async function buildIndex({
2420
2440
  title: [chunk.title],
2421
2441
  uri: [chunk.uri.toLowerCase()],
2422
2442
  sourceId: [chunk.sourceId.toLowerCase()],
2443
+ sourceName: source ? [source.name.toLowerCase()] : [],
2423
2444
  tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
2424
2445
  sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
2446
+ publicationDate: document?.publicationDate ? [document.publicationDate] : [],
2447
+ firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
2448
+ lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
2449
+ lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
2450
+ crawledAt: document?.crawledAt ? [document.crawledAt] : [],
2425
2451
  ...flattenMetadata(chunk.metadata)
2452
+ },
2453
+ source: {
2454
+ chunkId: chunk.id,
2455
+ documentId: chunk.documentId,
2456
+ sourceId: chunk.sourceId,
2457
+ sourceType: document?.sourceType ?? "text",
2458
+ sourceName: source?.name,
2459
+ title: chunk.title,
2460
+ uri: chunk.uri,
2461
+ headingPath: chunk.headingPath,
2462
+ text: chunk.text,
2463
+ normalizedPath: document?.normalizedPath,
2464
+ publicationDate: document?.publicationDate ?? null,
2465
+ crawledAt: document?.crawledAt,
2466
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
2467
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
2468
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
2469
+ metadata: chunk.metadata
2426
2470
  }
2427
2471
  });
2428
2472
  }
@@ -2431,7 +2475,7 @@ async function buildIndex({
2431
2475
  const metadata = {
2432
2476
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
2433
2477
  createdAt,
2434
- querylightVersion: "0.10.0",
2478
+ querylightVersion: "0.11.0",
2435
2479
  kbVersion: "0.1.0",
2436
2480
  documentCount: documents.length,
2437
2481
  chunkCount: chunks.length,
@@ -2460,7 +2504,7 @@ async function buildIndex({
2460
2504
 
2461
2505
  // src/query/search-service.ts
2462
2506
  import { readFile as readFile10 } from "fs/promises";
2463
- import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2507
+ import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
2464
2508
  import path18 from "path";
2465
2509
  async function loadHydratedIndex(workspacePath) {
2466
2510
  let state;
@@ -2488,24 +2532,6 @@ function matchesPrefix(value, prefixes) {
2488
2532
  const lower = value.toLowerCase();
2489
2533
  return prefixes.some((prefix) => lower.startsWith(prefix));
2490
2534
  }
2491
- function buildSearchQuery(query, filters) {
2492
- const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2493
- const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2494
- const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2495
- return new BoolQuery({
2496
- should: [
2497
- new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
2498
- new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
2499
- new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
2500
- ],
2501
- filter: [
2502
- ...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
2503
- ...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
2504
- ...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
2505
- ...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
2506
- ]
2507
- });
2508
- }
2509
2535
  function isValidDate(value) {
2510
2536
  return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
2511
2537
  }
@@ -2704,6 +2730,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2704
2730
  }
2705
2731
  return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
2706
2732
  }
2733
+ function buildSearchDslRequest({
2734
+ query,
2735
+ topK,
2736
+ filters,
2737
+ dateRanges
2738
+ }) {
2739
+ const filterClauses = [];
2740
+ const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2741
+ const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
2742
+ const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2743
+ const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
2744
+ const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2745
+ if (sourceIds.length > 0) {
2746
+ filterClauses.push({ terms: { sourceId: sourceIds } });
2747
+ }
2748
+ if (sourceNames.length > 0) {
2749
+ filterClauses.push({ terms: { sourceName: sourceNames } });
2750
+ }
2751
+ if (sourceTypes.length > 0) {
2752
+ filterClauses.push({ terms: { sourceType: sourceTypes } });
2753
+ }
2754
+ if (uriPrefixes.length > 0) {
2755
+ filterClauses.push({
2756
+ bool: {
2757
+ should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
2758
+ minimum_should_match: 1
2759
+ }
2760
+ });
2761
+ }
2762
+ if (tags.length > 0) {
2763
+ filterClauses.push({ terms: { tags } });
2764
+ }
2765
+ if (filters.hasPublicationDate) {
2766
+ filterClauses.push({ exists: { field: "publicationDate" } });
2767
+ }
2768
+ for (const { key, value } of filters.metadata ?? []) {
2769
+ filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
2770
+ }
2771
+ for (const { field, from, to } of dateRanges) {
2772
+ filterClauses.push({
2773
+ range: {
2774
+ [field]: {
2775
+ ...from ? { gte: from } : {},
2776
+ ...to ? { lte: to } : {}
2777
+ }
2778
+ }
2779
+ });
2780
+ }
2781
+ return {
2782
+ size: topK,
2783
+ query: {
2784
+ bool: {
2785
+ should: [
2786
+ { match: { title: { query, operator: "and", boost: 6 } } },
2787
+ { match: { text: { query, operator: "and", boost: 4 } } },
2788
+ { match: { text: { query, operator: "or", boost: 2 } } }
2789
+ ],
2790
+ filter: filterClauses,
2791
+ minimum_should_match: 1
2792
+ }
2793
+ }
2794
+ };
2795
+ }
2796
+ function sourceToChunkRecord(source) {
2797
+ return {
2798
+ id: source.chunkId,
2799
+ documentId: source.documentId,
2800
+ sourceId: source.sourceId,
2801
+ title: source.title,
2802
+ uri: source.uri,
2803
+ headingPath: source.headingPath,
2804
+ text: source.text,
2805
+ contentHash: "",
2806
+ metadata: source.metadata,
2807
+ firstSeenAt: source.firstSeenAt,
2808
+ lastSeenAt: source.lastSeenAt,
2809
+ lastChangedAt: source.lastChangedAt
2810
+ };
2811
+ }
2812
+ function sourceToDocumentRecord(source) {
2813
+ return {
2814
+ id: source.documentId,
2815
+ sourceId: source.sourceId,
2816
+ sourceType: source.sourceType,
2817
+ title: source.title,
2818
+ uri: source.uri,
2819
+ sourceUri: source.uri,
2820
+ mimeType: "text/plain",
2821
+ normalizedPath: source.normalizedPath ?? "",
2822
+ contentHash: "",
2823
+ metadata: source.metadata,
2824
+ publicationDate: source.publicationDate ?? null,
2825
+ crawledAt: source.crawledAt,
2826
+ firstSeenAt: source.firstSeenAt,
2827
+ lastSeenAt: source.lastSeenAt,
2828
+ lastChangedAt: source.lastChangedAt
2829
+ };
2830
+ }
2831
+ async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
2832
+ const source = hit._source;
2833
+ const chunk = sourceToChunkRecord(source);
2834
+ const document = sourceToDocumentRecord(source);
2835
+ const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
2836
+ const enrichedSource = {
2837
+ ...source,
2838
+ snippet
2839
+ };
2840
+ const result = {
2841
+ chunkId: source.chunkId,
2842
+ documentId: source.documentId,
2843
+ sourceId: source.sourceId,
2844
+ sourceType: source.sourceType,
2845
+ score: hit._score,
2846
+ title: chooseResultTitle(chunk),
2847
+ uri: source.uri,
2848
+ snippet,
2849
+ text: showChunks ? source.text : void 0,
2850
+ publicationDate: source.publicationDate ?? null,
2851
+ firstSeenAt: source.firstSeenAt,
2852
+ lastSeenAt: source.lastSeenAt,
2853
+ lastChangedAt: source.lastChangedAt,
2854
+ metadata: source.metadata
2855
+ };
2856
+ return {
2857
+ hit: {
2858
+ ...hit,
2859
+ _source: enrichedSource
2860
+ },
2861
+ result
2862
+ };
2863
+ }
2864
+ function createSearchResponse(retrievalMode, hits, took, aggregations) {
2865
+ return {
2866
+ retrievalMode,
2867
+ took,
2868
+ hits: {
2869
+ total: {
2870
+ value: hits.length,
2871
+ relation: "eq"
2872
+ },
2873
+ max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
2874
+ hits
2875
+ },
2876
+ aggregations
2877
+ };
2878
+ }
2879
+ function searchResultsFromResponse(response, showChunks = false) {
2880
+ return response.hits.hits.map((hit) => ({
2881
+ chunkId: hit._source.chunkId,
2882
+ documentId: hit._source.documentId,
2883
+ sourceId: hit._source.sourceId,
2884
+ sourceType: hit._source.sourceType,
2885
+ score: hit._score,
2886
+ title: chooseResultTitle(sourceToChunkRecord(hit._source)),
2887
+ uri: hit._source.uri,
2888
+ snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
2889
+ text: showChunks ? hit._source.text : void 0,
2890
+ publicationDate: hit._source.publicationDate ?? null,
2891
+ firstSeenAt: hit._source.firstSeenAt,
2892
+ lastSeenAt: hit._source.lastSeenAt,
2893
+ lastChangedAt: hit._source.lastChangedAt,
2894
+ metadata: hit._source.metadata
2895
+ }));
2896
+ }
2897
+ async function searchJsonIndex({
2898
+ workspacePath,
2899
+ request,
2900
+ indexName = "querylight"
2901
+ }) {
2902
+ const index = await loadHydratedIndex(workspacePath);
2903
+ return searchJsonDsl({ index, request, indexName });
2904
+ }
2707
2905
  function normalizeDisplayTitle(title) {
2708
2906
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2709
2907
  }
@@ -2841,6 +3039,7 @@ async function searchIndex({
2841
3039
  retrievalMode,
2842
3040
  showChunks = false
2843
3041
  }) {
3042
+ const startedAt = Date.now();
2844
3043
  const config = await loadConfig(workspacePath);
2845
3044
  const mode = retrievalMode ?? config.retrieval.defaultMode;
2846
3045
  const candidateLimit = Math.max(topK * 5, 50);
@@ -2897,12 +3096,48 @@ async function searchIndex({
2897
3096
  };
2898
3097
  })
2899
3098
  );
2900
- return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
3099
+ const hits2 = latestResults.filter((result) => result != null).map((result) => {
3100
+ const chunk = chunks.get(result.chunkId);
3101
+ const document = documents.get(result.documentId);
3102
+ const source = sources.get(result.sourceId);
3103
+ return {
3104
+ _index: "querylight",
3105
+ _id: result.chunkId,
3106
+ _score: result.score,
3107
+ _source: {
3108
+ chunkId: result.chunkId,
3109
+ documentId: result.documentId,
3110
+ sourceId: result.sourceId,
3111
+ sourceType: result.sourceType,
3112
+ sourceName: source?.name,
3113
+ title: chunk.title,
3114
+ uri: result.uri,
3115
+ headingPath: chunk.headingPath,
3116
+ text: chunk.text,
3117
+ snippet: result.snippet,
3118
+ normalizedPath: document.normalizedPath,
3119
+ publicationDate: result.publicationDate ?? null,
3120
+ crawledAt: document.crawledAt,
3121
+ firstSeenAt: result.firstSeenAt,
3122
+ lastSeenAt: result.lastSeenAt,
3123
+ lastChangedAt: result.lastChangedAt,
3124
+ metadata: result.metadata
3125
+ }
3126
+ };
3127
+ });
3128
+ return createSearchResponse("lexical", hits2, Date.now() - startedAt);
2901
3129
  }
2902
3130
  const lexicalHits = async () => {
2903
- const index = await loadHydratedIndex(workspacePath);
2904
- const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
2905
- return all.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit);
3131
+ const response = await searchJsonIndex({
3132
+ workspacePath,
3133
+ request: buildSearchDslRequest({
3134
+ query: normalizedQuery,
3135
+ topK: candidateLimit,
3136
+ filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
3137
+ dateRanges
3138
+ })
3139
+ });
3140
+ return response.hits.hits;
2906
3141
  };
2907
3142
  const denseHits = async () => {
2908
3143
  if (!await fileExists(denseVectorPath(workspacePath))) {
@@ -2916,15 +3151,18 @@ async function searchIndex({
2916
3151
  }
2917
3152
  return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
2918
3153
  };
3154
+ let lexicalResponseHits = [];
2919
3155
  let hits;
2920
3156
  if (mode === "lexical") {
2921
- hits = await lexicalHits();
3157
+ lexicalResponseHits = await lexicalHits();
3158
+ hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
2922
3159
  } else if (mode === "dense") {
2923
3160
  hits = await denseHits();
2924
3161
  } else if (mode === "sparse") {
2925
3162
  hits = await sparseHits();
2926
3163
  } else {
2927
- const rankings = [await lexicalHits()];
3164
+ lexicalResponseHits = await lexicalHits();
3165
+ const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
2928
3166
  if (await fileExists(denseVectorPath(workspacePath))) {
2929
3167
  rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
2930
3168
  }
@@ -2933,34 +3171,49 @@ async function searchIndex({
2933
3171
  }
2934
3172
  hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
2935
3173
  }
2936
- const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
3174
+ const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
2937
3175
  const chunk = chunks.get(chunkId);
2938
3176
  if (!chunk) {
2939
- return null;
3177
+ return [];
2940
3178
  }
2941
- return {
2942
- chunkId,
2943
- documentId: chunk.documentId,
2944
- sourceId: chunk.sourceId,
2945
- sourceType: documents.get(chunk.documentId)?.sourceType ?? "text",
2946
- score,
2947
- title: chooseResultTitle(chunk),
2948
- uri: chunk.uri,
2949
- snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2950
- document: documents.get(chunk.documentId),
2951
- config,
2952
- orderedChunkCache
2953
- }),
2954
- text: showChunks ? chunk.text : void 0,
2955
- publicationDate: documents.get(chunk.documentId)?.publicationDate ?? null,
2956
- firstSeenAt: documents.get(chunk.documentId)?.firstSeenAt ?? chunk.firstSeenAt,
2957
- lastSeenAt: documents.get(chunk.documentId)?.lastSeenAt ?? chunk.lastSeenAt,
2958
- lastChangedAt: documents.get(chunk.documentId)?.lastChangedAt ?? chunk.lastChangedAt,
2959
- metadata: chunk.metadata
2960
- };
2961
- }));
2962
- const results = rawResults.filter((result) => result != null);
2963
- return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
3179
+ const document = documents.get(chunk.documentId);
3180
+ const source = sources.get(chunk.sourceId);
3181
+ return [{
3182
+ _index: "querylight",
3183
+ _id: chunkId,
3184
+ _score: score,
3185
+ _source: {
3186
+ chunkId,
3187
+ documentId: chunk.documentId,
3188
+ sourceId: chunk.sourceId,
3189
+ sourceType: document?.sourceType ?? "text",
3190
+ sourceName: source?.name,
3191
+ title: chunk.title,
3192
+ uri: chunk.uri,
3193
+ headingPath: chunk.headingPath,
3194
+ text: chunk.text,
3195
+ normalizedPath: document?.normalizedPath,
3196
+ publicationDate: document?.publicationDate ?? null,
3197
+ crawledAt: document?.crawledAt,
3198
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
3199
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
3200
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
3201
+ metadata: chunk.metadata
3202
+ }
3203
+ }];
3204
+ });
3205
+ const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
3206
+ if (showChunks) {
3207
+ const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
3208
+ return createSearchResponse(mode, topHits, Date.now() - startedAt);
3209
+ }
3210
+ const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
3211
+ const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
3212
+ const finalHits = reranked.map((result) => {
3213
+ const hit = byChunkId.get(result.chunkId);
3214
+ return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
3215
+ }).filter((hit) => hit != null);
3216
+ return createSearchResponse(mode, finalHits, Date.now() - startedAt);
2964
3217
  }
2965
3218
 
2966
3219
  // src/query/related-service.ts
@@ -3077,9 +3330,10 @@ async function createContext({
3077
3330
  retrievalMode
3078
3331
  }) {
3079
3332
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
3333
+ const results = searchResultsFromResponse(search, true);
3080
3334
  const sources = [];
3081
3335
  let total = 0;
3082
- for (const result of search.results) {
3336
+ for (const result of results) {
3083
3337
  const text = result.text ?? "";
3084
3338
  if (total + text.length > maxChars && sources.length > 0) {
3085
3339
  break;
@@ -3188,6 +3442,8 @@ export {
3188
3442
  renderChangeReport,
3189
3443
  reprocessDocuments,
3190
3444
  searchIndex,
3445
+ searchJsonIndex,
3446
+ searchResultsFromResponse,
3191
3447
  updateSource,
3192
3448
  writeDefaultConfig
3193
3449
  };
@@ -1,10 +1,17 @@
1
- import type { RetrievalMode, SearchResponseData } from "../types/models.js";
1
+ import { type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
2
+ import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
2
3
  type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
3
4
  type SearchDateRange = {
4
5
  field: SearchDateField;
5
6
  from?: string;
6
7
  to?: string;
7
8
  };
9
+ export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
10
+ export declare function searchJsonIndex({ workspacePath, request, indexName }: {
11
+ workspacePath: string;
12
+ request: JsonDslRequest;
13
+ indexName?: string;
14
+ }): Promise<JsonDslResponse>;
8
15
  export declare function searchIndex({ workspacePath, query, topK, sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges, retrievalMode, showChunks }: {
9
16
  workspacePath: string;
10
17
  query: string;