@tryformation/querylight-cli 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -57,7 +57,7 @@ var defaultConfig = () => ({
57
57
  defaultMode: "lexical",
58
58
  dense: {
59
59
  enabled: true,
60
- modelId: "Xenova/all-MiniLM-L6-v2",
60
+ modelId: "Xenova/paraphrase-MiniLM-L3-v2",
61
61
  cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
62
62
  indexHashTables: 8,
63
63
  indexRandomSeed: 42,
@@ -65,7 +65,7 @@ var defaultConfig = () => ({
65
65
  },
66
66
  sparse: {
67
67
  enabled: true,
68
- modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v3-distill",
68
+ modelId: "opensearch-project/opensearch-neural-sparse-encoding-doc-v2-mini",
69
69
  cacheDir: DEFAULT_SHARED_MODEL_CACHE_DIR,
70
70
  documentTopTokens: 128,
71
71
  queryEncoding: "tokenizer-token-weights",
@@ -1213,13 +1213,17 @@ function isAllowed(url, baseUrl, includePatterns, excludePatterns, disallowRules
1213
1213
  if (url.search.length > 0) {
1214
1214
  return false;
1215
1215
  }
1216
- if (url.pathname.endsWith(".xml")) {
1216
+ const pathname = url.pathname.toLowerCase();
1217
+ if (pathname.endsWith(".xml")) {
1217
1218
  return false;
1218
1219
  }
1219
- if (url.pathname.includes("/cdn-cgi/")) {
1220
+ if (pathname.endsWith(".pdf")) {
1220
1221
  return false;
1221
1222
  }
1222
- if (url.pathname === "/search" || url.pathname === "/search/" || url.pathname.endsWith("/search/")) {
1223
+ if (pathname.includes("/cdn-cgi/")) {
1224
+ return false;
1225
+ }
1226
+ if (pathname === "/search" || pathname === "/search/" || pathname.endsWith("/search/")) {
1223
1227
  return false;
1224
1228
  }
1225
1229
  if (disallowRules.some((rule) => rule !== "/" && url.pathname.startsWith(rule))) {
@@ -1782,7 +1786,7 @@ async function chunkDocuments({
1782
1786
  }
1783
1787
 
1784
1788
  // src/index/querylight-indexer.ts
1785
- import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
1789
+ import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
1786
1790
  import path17 from "path";
1787
1791
 
1788
1792
  // src/vector/dense.ts
@@ -2058,15 +2062,26 @@ function createSparseChunkText(chunk) {
2058
2062
  // src/vector/dense.ts
2059
2063
  var denseEmbedderFactory = null;
2060
2064
  var EXACT_DENSE_RERANK_THRESHOLD = 5e3;
2065
+ function normalizeDenseEmbedder(embedder) {
2066
+ if (typeof embedder === "function") {
2067
+ return { embed: embedder };
2068
+ }
2069
+ return embedder;
2070
+ }
2061
2071
  async function createEmbedder(cacheDir, modelId) {
2062
2072
  if (denseEmbedderFactory) {
2063
- return denseEmbedderFactory(cacheDir, modelId);
2073
+ return normalizeDenseEmbedder(await denseEmbedderFactory(cacheDir, modelId));
2064
2074
  }
2065
2075
  const runtime = await getDenseTransformersRuntime(cacheDir);
2066
2076
  const extractor = await runtime.pipeline("feature-extraction", modelId);
2067
- return async (text) => {
2068
- const output = await extractor(text, { pooling: "mean", normalize: true });
2069
- return output.tolist()[0];
2077
+ return {
2078
+ async embed(text) {
2079
+ const output = await extractor(text, { pooling: "mean", normalize: true });
2080
+ return output.tolist()[0];
2081
+ },
2082
+ async dispose() {
2083
+ await extractor.dispose();
2084
+ }
2070
2085
  };
2071
2086
  }
2072
2087
  function exactDenseQuery(payload, vector, topK) {
@@ -2080,53 +2095,57 @@ async function buildDenseVectors({
2080
2095
  const chunks = await readJsonl(path14.join(workspacePath, "chunks", "chunks.jsonl"));
2081
2096
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
2082
2097
  await mkdir7(cacheDir, { recursive: true });
2083
- const embed = await createEmbedder(cacheDir, config.modelId);
2084
- const records = [];
2085
- let dimensions = 0;
2086
- reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
2087
- for (const chunk of chunks) {
2088
- const embedding = await embed(createDenseChunkText(chunk));
2089
- dimensions ||= embedding.length;
2090
- records.push({
2091
- chunkId: chunk.id,
2092
- documentId: chunk.documentId,
2093
- sourceId: chunk.sourceId,
2094
- title: chunk.title,
2095
- uri: chunk.uri,
2096
- headingPath: chunk.headingPath,
2097
- text: chunk.text,
2098
- embedding
2099
- });
2100
- if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
2101
- reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
2098
+ const embedder = await createEmbedder(cacheDir, config.modelId);
2099
+ try {
2100
+ const records = [];
2101
+ let dimensions = 0;
2102
+ reportProgress(progress, `Encoding ${chunks.length} chunk${chunks.length === 1 ? "" : "s"} for dense retrieval`);
2103
+ for (const chunk of chunks) {
2104
+ const embedding = await embedder.embed(createDenseChunkText(chunk));
2105
+ dimensions ||= embedding.length;
2106
+ records.push({
2107
+ chunkId: chunk.id,
2108
+ documentId: chunk.documentId,
2109
+ sourceId: chunk.sourceId,
2110
+ title: chunk.title,
2111
+ uri: chunk.uri,
2112
+ headingPath: chunk.headingPath,
2113
+ text: chunk.text,
2114
+ embedding
2115
+ });
2116
+ if (records.length === 1 || records.length % 100 === 0 || records.length === chunks.length) {
2117
+ reportProgressDetail(progress, `Encoded ${records.length}/${chunks.length} chunks for dense retrieval`);
2118
+ }
2102
2119
  }
2120
+ reportProgress(progress, "Building dense vector index");
2121
+ const index = new VectorFieldIndex({
2122
+ numHashTables: config.indexHashTables,
2123
+ dimensions,
2124
+ random: createSeededRandom(config.indexRandomSeed)
2125
+ });
2126
+ for (const record of records) {
2127
+ index.insert(record.chunkId, [record.embedding]);
2128
+ }
2129
+ const metadata = {
2130
+ createdAt: (/* @__PURE__ */ new Date()).toISOString(),
2131
+ modelId: config.modelId,
2132
+ dimensions,
2133
+ hashTables: config.indexHashTables,
2134
+ randomSeed: config.indexRandomSeed,
2135
+ chunkCount: records.length,
2136
+ indexHash: sha256(JSON.stringify(index.indexState))
2137
+ };
2138
+ const payload = {
2139
+ metadata,
2140
+ indexState: index.indexState,
2141
+ chunks: records
2142
+ };
2143
+ await writeDensePayload(workspacePath, payload);
2144
+ reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
2145
+ return payload;
2146
+ } finally {
2147
+ await embedder.dispose?.();
2103
2148
  }
2104
- reportProgress(progress, "Building dense vector index");
2105
- const index = new VectorFieldIndex({
2106
- numHashTables: config.indexHashTables,
2107
- dimensions,
2108
- random: createSeededRandom(config.indexRandomSeed)
2109
- });
2110
- for (const record of records) {
2111
- index.insert(record.chunkId, [record.embedding]);
2112
- }
2113
- const metadata = {
2114
- createdAt: (/* @__PURE__ */ new Date()).toISOString(),
2115
- modelId: config.modelId,
2116
- dimensions,
2117
- hashTables: config.indexHashTables,
2118
- randomSeed: config.indexRandomSeed,
2119
- chunkCount: records.length,
2120
- indexHash: sha256(JSON.stringify(index.indexState))
2121
- };
2122
- const payload = {
2123
- metadata,
2124
- indexState: index.indexState,
2125
- chunks: records
2126
- };
2127
- await writeDensePayload(workspacePath, payload);
2128
- reportProgress(progress, `Dense vectors written for ${records.length} chunk${records.length === 1 ? "" : "s"}`);
2129
- return payload;
2130
2149
  }
2131
2150
  async function denseQuery({
2132
2151
  workspacePath,
@@ -2136,21 +2155,25 @@ async function denseQuery({
2136
2155
  }) {
2137
2156
  const payload = await readDensePayload(workspacePath);
2138
2157
  const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
2139
- const embed = await createEmbedder(cacheDir, config.modelId);
2140
- const vector = await embed(query);
2141
- if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
2158
+ const embedder = await createEmbedder(cacheDir, config.modelId);
2159
+ try {
2160
+ const vector = await embedder.embed(query);
2161
+ if (payload.chunks.length <= EXACT_DENSE_RERANK_THRESHOLD) {
2162
+ return exactDenseQuery(payload, vector, topK);
2163
+ }
2164
+ const index = new VectorFieldIndex({
2165
+ numHashTables: payload.metadata.hashTables,
2166
+ dimensions: payload.metadata.dimensions,
2167
+ random: createSeededRandom(payload.metadata.randomSeed)
2168
+ }).loadState(payload.indexState);
2169
+ const approximateHits = index.query(vector, topK);
2170
+ if (approximateHits.length >= topK) {
2171
+ return approximateHits;
2172
+ }
2142
2173
  return exactDenseQuery(payload, vector, topK);
2174
+ } finally {
2175
+ await embedder.dispose?.();
2143
2176
  }
2144
- const index = new VectorFieldIndex({
2145
- numHashTables: payload.metadata.hashTables,
2146
- dimensions: payload.metadata.dimensions,
2147
- random: createSeededRandom(payload.metadata.randomSeed)
2148
- }).loadState(payload.indexState);
2149
- const approximateHits = index.query(vector, topK);
2150
- if (approximateHits.length >= topK) {
2151
- return approximateHits;
2152
- }
2153
- return exactDenseQuery(payload, vector, topK);
2154
2177
  }
2155
2178
 
2156
2179
  // src/vector/sparse.ts
@@ -2379,12 +2402,19 @@ function keywordFieldIndex() {
2379
2402
  function createIndexMapping(extraFields = []) {
2380
2403
  const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
2381
2404
  const mapping = {
2405
+ _source: new StoredSourceIndex(),
2382
2406
  text: lexical,
2383
2407
  title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
2384
2408
  uri: keywordFieldIndex(),
2385
2409
  sourceId: keywordFieldIndex(),
2410
+ sourceName: keywordFieldIndex(),
2386
2411
  tags: keywordFieldIndex(),
2387
- sourceType: keywordFieldIndex()
2412
+ sourceType: keywordFieldIndex(),
2413
+ publicationDate: new DateFieldIndex(),
2414
+ firstSeenAt: new DateFieldIndex(),
2415
+ lastSeenAt: new DateFieldIndex(),
2416
+ lastChangedAt: new DateFieldIndex(),
2417
+ crawledAt: new DateFieldIndex()
2388
2418
  };
2389
2419
  for (const field of extraFields) {
2390
2420
  mapping[field] = keywordFieldIndex();
@@ -2420,8 +2450,12 @@ async function buildIndex({
2420
2450
  const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
2421
2451
  const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
2422
2452
  const index = new DocumentIndex(createIndexMapping(metadataFields));
2453
+ const documentsById = new Map(documents.map((document) => [document.id, document]));
2454
+ const sourcesById = new Map(sources.map((source) => [source.id, source]));
2423
2455
  reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
2424
2456
  for (const chunk of chunks) {
2457
+ const document = documentsById.get(chunk.documentId);
2458
+ const source = sourcesById.get(chunk.sourceId);
2425
2459
  index.index({
2426
2460
  id: chunk.id,
2427
2461
  fields: {
@@ -2429,9 +2463,33 @@ async function buildIndex({
2429
2463
  title: [chunk.title],
2430
2464
  uri: [chunk.uri.toLowerCase()],
2431
2465
  sourceId: [chunk.sourceId.toLowerCase()],
2466
+ sourceName: source ? [source.name.toLowerCase()] : [],
2432
2467
  tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
2433
2468
  sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
2469
+ publicationDate: document?.publicationDate ? [document.publicationDate] : [],
2470
+ firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
2471
+ lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
2472
+ lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
2473
+ crawledAt: document?.crawledAt ? [document.crawledAt] : [],
2434
2474
  ...flattenMetadata(chunk.metadata)
2475
+ },
2476
+ source: {
2477
+ chunkId: chunk.id,
2478
+ documentId: chunk.documentId,
2479
+ sourceId: chunk.sourceId,
2480
+ sourceType: document?.sourceType ?? "text",
2481
+ sourceName: source?.name,
2482
+ title: chunk.title,
2483
+ uri: chunk.uri,
2484
+ headingPath: chunk.headingPath,
2485
+ text: chunk.text,
2486
+ normalizedPath: document?.normalizedPath,
2487
+ publicationDate: document?.publicationDate ?? null,
2488
+ crawledAt: document?.crawledAt,
2489
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
2490
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
2491
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
2492
+ metadata: chunk.metadata
2435
2493
  }
2436
2494
  });
2437
2495
  }
@@ -2440,7 +2498,7 @@ async function buildIndex({
2440
2498
  const metadata = {
2441
2499
  id: `index_${createdAt.replace(/[:.]/g, "-")}`,
2442
2500
  createdAt,
2443
- querylightVersion: "0.10.0",
2501
+ querylightVersion: "0.11.0",
2444
2502
  kbVersion: "0.1.0",
2445
2503
  documentCount: documents.length,
2446
2504
  chunkCount: chunks.length,
@@ -2469,7 +2527,7 @@ async function buildIndex({
2469
2527
 
2470
2528
  // src/query/search-service.ts
2471
2529
  import { readFile as readFile10 } from "fs/promises";
2472
- import { BoolQuery, MatchQuery, OP, TermQuery, reciprocalRankFusion } from "@tryformation/querylight-ts";
2530
+ import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
2473
2531
  import path18 from "path";
2474
2532
  async function loadHydratedIndex(workspacePath) {
2475
2533
  let state;
@@ -2497,24 +2555,6 @@ function matchesPrefix(value, prefixes) {
2497
2555
  const lower = value.toLowerCase();
2498
2556
  return prefixes.some((prefix) => lower.startsWith(prefix));
2499
2557
  }
2500
- function buildSearchQuery(query, filters) {
2501
- const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2502
- const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2503
- const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2504
- return new BoolQuery({
2505
- should: [
2506
- new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
2507
- new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
2508
- new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
2509
- ],
2510
- filter: [
2511
- ...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
2512
- ...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
2513
- ...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
2514
- ...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
2515
- ]
2516
- });
2517
- }
2518
2558
  function isValidDate(value) {
2519
2559
  return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
2520
2560
  }
@@ -2713,6 +2753,185 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
2713
2753
  }
2714
2754
  return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
2715
2755
  }
2756
+ function buildSearchDslRequest({
2757
+ query,
2758
+ topK,
2759
+ filters,
2760
+ dateRanges
2761
+ }) {
2762
+ const filterClauses = [];
2763
+ const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
2764
+ const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
2765
+ const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
2766
+ const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
2767
+ const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
2768
+ if (sourceIds.length > 0) {
2769
+ filterClauses.push({ terms: { sourceId: sourceIds } });
2770
+ }
2771
+ if (sourceNames.length > 0) {
2772
+ filterClauses.push({ terms: { sourceName: sourceNames } });
2773
+ }
2774
+ if (sourceTypes.length > 0) {
2775
+ filterClauses.push({ terms: { sourceType: sourceTypes } });
2776
+ }
2777
+ if (uriPrefixes.length > 0) {
2778
+ filterClauses.push({
2779
+ bool: {
2780
+ should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
2781
+ minimum_should_match: 1
2782
+ }
2783
+ });
2784
+ }
2785
+ if (tags.length > 0) {
2786
+ filterClauses.push({ terms: { tags } });
2787
+ }
2788
+ if (filters.hasPublicationDate) {
2789
+ filterClauses.push({ exists: { field: "publicationDate" } });
2790
+ }
2791
+ for (const { key, value } of filters.metadata ?? []) {
2792
+ filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
2793
+ }
2794
+ for (const { field, from, to } of dateRanges) {
2795
+ filterClauses.push({
2796
+ range: {
2797
+ [field]: {
2798
+ ...from ? { gte: from } : {},
2799
+ ...to ? { lte: to } : {}
2800
+ }
2801
+ }
2802
+ });
2803
+ }
2804
+ return {
2805
+ size: topK,
2806
+ query: {
2807
+ bool: {
2808
+ should: [
2809
+ { match: { title: { query, operator: "and", boost: 6 } } },
2810
+ { match: { text: { query, operator: "and", boost: 4 } } },
2811
+ { match: { text: { query, operator: "or", boost: 2 } } }
2812
+ ],
2813
+ filter: filterClauses,
2814
+ minimum_should_match: 1
2815
+ }
2816
+ }
2817
+ };
2818
+ }
2819
+ function sourceToChunkRecord(source) {
2820
+ return {
2821
+ id: source.chunkId,
2822
+ documentId: source.documentId,
2823
+ sourceId: source.sourceId,
2824
+ title: source.title,
2825
+ uri: source.uri,
2826
+ headingPath: source.headingPath,
2827
+ text: source.text,
2828
+ contentHash: "",
2829
+ metadata: source.metadata,
2830
+ firstSeenAt: source.firstSeenAt,
2831
+ lastSeenAt: source.lastSeenAt,
2832
+ lastChangedAt: source.lastChangedAt
2833
+ };
2834
+ }
2835
+ function sourceToDocumentRecord(source) {
2836
+ return {
2837
+ id: source.documentId,
2838
+ sourceId: source.sourceId,
2839
+ sourceType: source.sourceType,
2840
+ title: source.title,
2841
+ uri: source.uri,
2842
+ sourceUri: source.uri,
2843
+ mimeType: "text/plain",
2844
+ normalizedPath: source.normalizedPath ?? "",
2845
+ contentHash: "",
2846
+ metadata: source.metadata,
2847
+ publicationDate: source.publicationDate ?? null,
2848
+ crawledAt: source.crawledAt,
2849
+ firstSeenAt: source.firstSeenAt,
2850
+ lastSeenAt: source.lastSeenAt,
2851
+ lastChangedAt: source.lastChangedAt
2852
+ };
2853
+ }
2854
+ async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
2855
+ const source = hit._source;
2856
+ const chunk = sourceToChunkRecord(source);
2857
+ const document = sourceToDocumentRecord(source);
2858
+ const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
2859
+ const enrichedSource = {
2860
+ ...source,
2861
+ snippet
2862
+ };
2863
+ const result = {
2864
+ chunkId: source.chunkId,
2865
+ documentId: source.documentId,
2866
+ sourceId: source.sourceId,
2867
+ sourceType: source.sourceType,
2868
+ score: hit._score,
2869
+ title: chooseResultTitle(chunk),
2870
+ uri: source.uri,
2871
+ snippet,
2872
+ text: showChunks ? source.text : void 0,
2873
+ publicationDate: source.publicationDate ?? null,
2874
+ firstSeenAt: source.firstSeenAt,
2875
+ lastSeenAt: source.lastSeenAt,
2876
+ lastChangedAt: source.lastChangedAt,
2877
+ metadata: source.metadata
2878
+ };
2879
+ return {
2880
+ hit: {
2881
+ ...hit,
2882
+ _source: enrichedSource
2883
+ },
2884
+ result
2885
+ };
2886
+ }
2887
+ function createSearchResponse(retrievalMode, hits, took, aggregations) {
2888
+ return {
2889
+ retrievalMode,
2890
+ took,
2891
+ hits: {
2892
+ total: {
2893
+ value: hits.length,
2894
+ relation: "eq"
2895
+ },
2896
+ max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
2897
+ hits
2898
+ },
2899
+ aggregations
2900
+ };
2901
+ }
2902
+ function searchResultsFromResponse(response, showChunks = false) {
2903
+ return response.hits.hits.map((hit) => ({
2904
+ chunkId: hit._source.chunkId,
2905
+ documentId: hit._source.documentId,
2906
+ sourceId: hit._source.sourceId,
2907
+ sourceType: hit._source.sourceType,
2908
+ score: hit._score,
2909
+ title: chooseResultTitle(sourceToChunkRecord(hit._source)),
2910
+ uri: hit._source.uri,
2911
+ snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
2912
+ text: showChunks ? hit._source.text : void 0,
2913
+ publicationDate: hit._source.publicationDate ?? null,
2914
+ firstSeenAt: hit._source.firstSeenAt,
2915
+ lastSeenAt: hit._source.lastSeenAt,
2916
+ lastChangedAt: hit._source.lastChangedAt,
2917
+ metadata: hit._source.metadata
2918
+ }));
2919
+ }
2920
+ async function searchJsonRequest({
2921
+ index,
2922
+ request,
2923
+ indexName = "querylight"
2924
+ }) {
2925
+ return searchJsonDsl({ index, request, indexName });
2926
+ }
2927
+ async function searchJsonIndex({
2928
+ workspacePath,
2929
+ request,
2930
+ indexName = "querylight"
2931
+ }) {
2932
+ const index = await loadHydratedIndex(workspacePath);
2933
+ return searchJsonRequest({ index, request, indexName });
2934
+ }
2716
2935
  function normalizeDisplayTitle(title) {
2717
2936
  return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
2718
2937
  }
@@ -2850,6 +3069,7 @@ async function searchIndex({
2850
3069
  retrievalMode,
2851
3070
  showChunks = false
2852
3071
  }) {
3072
+ const startedAt = Date.now();
2853
3073
  const config = await loadConfig(workspacePath);
2854
3074
  const mode = retrievalMode ?? config.retrieval.defaultMode;
2855
3075
  const candidateLimit = Math.max(topK * 5, 50);
@@ -2906,12 +3126,48 @@ async function searchIndex({
2906
3126
  };
2907
3127
  })
2908
3128
  );
2909
- return { retrievalMode: "lexical", results: latestResults.filter((result) => result != null) };
3129
+ const hits2 = latestResults.filter((result) => result != null).map((result) => {
3130
+ const chunk = chunks.get(result.chunkId);
3131
+ const document = documents.get(result.documentId);
3132
+ const source = sources.get(result.sourceId);
3133
+ return {
3134
+ _index: "querylight",
3135
+ _id: result.chunkId,
3136
+ _score: result.score,
3137
+ _source: {
3138
+ chunkId: result.chunkId,
3139
+ documentId: result.documentId,
3140
+ sourceId: result.sourceId,
3141
+ sourceType: result.sourceType,
3142
+ sourceName: source?.name,
3143
+ title: chunk.title,
3144
+ uri: result.uri,
3145
+ headingPath: chunk.headingPath,
3146
+ text: chunk.text,
3147
+ snippet: result.snippet,
3148
+ normalizedPath: document.normalizedPath,
3149
+ publicationDate: result.publicationDate ?? null,
3150
+ crawledAt: document.crawledAt,
3151
+ firstSeenAt: result.firstSeenAt,
3152
+ lastSeenAt: result.lastSeenAt,
3153
+ lastChangedAt: result.lastChangedAt,
3154
+ metadata: result.metadata
3155
+ }
3156
+ };
3157
+ });
3158
+ return createSearchResponse("lexical", hits2, Date.now() - startedAt);
2910
3159
  }
2911
3160
  const lexicalHits = async () => {
2912
- const index = await loadHydratedIndex(workspacePath);
2913
- const all = await index.searchRequest({ query: buildSearchQuery(normalizedQuery, { sourceId, sourceIds, sourceType, sourceTypes, tag, tags, metadata }), limit: candidateLimit });
2914
- return all.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit);
3161
+ const response = await searchJsonIndex({
3162
+ workspacePath,
3163
+ request: buildSearchDslRequest({
3164
+ query: normalizedQuery,
3165
+ topK: candidateLimit,
3166
+ filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
3167
+ dateRanges
3168
+ })
3169
+ });
3170
+ return response.hits.hits;
2915
3171
  };
2916
3172
  const denseHits = async () => {
2917
3173
  if (!await fileExists(denseVectorPath(workspacePath))) {
@@ -2925,15 +3181,18 @@ async function searchIndex({
2925
3181
  }
2926
3182
  return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
2927
3183
  };
3184
+ let lexicalResponseHits = [];
2928
3185
  let hits;
2929
3186
  if (mode === "lexical") {
2930
- hits = await lexicalHits();
3187
+ lexicalResponseHits = await lexicalHits();
3188
+ hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
2931
3189
  } else if (mode === "dense") {
2932
3190
  hits = await denseHits();
2933
3191
  } else if (mode === "sparse") {
2934
3192
  hits = await sparseHits();
2935
3193
  } else {
2936
- const rankings = [await lexicalHits()];
3194
+ lexicalResponseHits = await lexicalHits();
3195
+ const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
2937
3196
  if (await fileExists(denseVectorPath(workspacePath))) {
2938
3197
  rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
2939
3198
  }
@@ -2942,38 +3201,242 @@ async function searchIndex({
2942
3201
  }
2943
3202
  hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
2944
3203
  }
2945
- const rawResults = await Promise.all(hits.map(async ([chunkId, score]) => {
3204
+ const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
2946
3205
  const chunk = chunks.get(chunkId);
2947
3206
  if (!chunk) {
2948
- return null;
3207
+ return [];
2949
3208
  }
3209
+ const document = documents.get(chunk.documentId);
3210
+ const source = sources.get(chunk.sourceId);
3211
+ return [{
3212
+ _index: "querylight",
3213
+ _id: chunkId,
3214
+ _score: score,
3215
+ _source: {
3216
+ chunkId,
3217
+ documentId: chunk.documentId,
3218
+ sourceId: chunk.sourceId,
3219
+ sourceType: document?.sourceType ?? "text",
3220
+ sourceName: source?.name,
3221
+ title: chunk.title,
3222
+ uri: chunk.uri,
3223
+ headingPath: chunk.headingPath,
3224
+ text: chunk.text,
3225
+ normalizedPath: document?.normalizedPath,
3226
+ publicationDate: document?.publicationDate ?? null,
3227
+ crawledAt: document?.crawledAt,
3228
+ firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
3229
+ lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
3230
+ lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
3231
+ metadata: chunk.metadata
3232
+ }
3233
+ }];
3234
+ });
3235
+ const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
3236
+ if (showChunks) {
3237
+ const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
3238
+ return createSearchResponse(mode, topHits, Date.now() - startedAt);
3239
+ }
3240
+ const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
3241
+ const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
3242
+ const finalHits = reranked.map((result) => {
3243
+ const hit = byChunkId.get(result.chunkId);
3244
+ return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
3245
+ }).filter((hit) => hit != null);
3246
+ return createSearchResponse(mode, finalHits, Date.now() - startedAt);
3247
+ }
3248
+
3249
+ // src/server/search-api.ts
3250
+ import { createServer } from "http";
3251
+ import { readdir, stat as stat4 } from "fs/promises";
3252
+ import path19 from "path";
3253
+ async function pathIsDirectory(candidatePath) {
3254
+ try {
3255
+ return (await stat4(candidatePath)).isDirectory();
3256
+ } catch {
3257
+ return false;
3258
+ }
3259
+ }
3260
+ async function discoverKnowledgeBases(workspacePath) {
3261
+ try {
3262
+ const singleWorkspace = await assertWorkspaceExists(workspacePath);
3263
+ const config = await loadConfig(singleWorkspace);
3264
+ const index = await loadHydratedIndex(singleWorkspace);
2950
3265
  return {
2951
- chunkId,
2952
- documentId: chunk.documentId,
2953
- sourceId: chunk.sourceId,
2954
- sourceType: documents.get(chunk.documentId)?.sourceType ?? "text",
2955
- score,
2956
- title: chooseResultTitle(chunk),
2957
- uri: chunk.uri,
2958
- snippet: await buildSnippetWithAdjacentChunks(chunk, normalizedQuery, {
2959
- document: documents.get(chunk.documentId),
2960
- config,
2961
- orderedChunkCache
2962
- }),
2963
- text: showChunks ? chunk.text : void 0,
2964
- publicationDate: documents.get(chunk.documentId)?.publicationDate ?? null,
2965
- firstSeenAt: documents.get(chunk.documentId)?.firstSeenAt ?? chunk.firstSeenAt,
2966
- lastSeenAt: documents.get(chunk.documentId)?.lastSeenAt ?? chunk.lastSeenAt,
2967
- lastChangedAt: documents.get(chunk.documentId)?.lastChangedAt ?? chunk.lastChangedAt,
2968
- metadata: chunk.metadata
3266
+ mode: "single",
3267
+ knowledgeBases: [{
3268
+ name: config.index.name,
3269
+ workspacePath: singleWorkspace,
3270
+ configuredIndexName: config.index.name,
3271
+ index
3272
+ }]
2969
3273
  };
2970
- }));
2971
- const results = rawResults.filter((result) => result != null);
2972
- return { retrievalMode: mode, results: rerankResultsByDocument(results, topK) };
3274
+ } catch (error) {
3275
+ if (!(error instanceof CliError) || error.code !== "WORKSPACE_ERROR") {
3276
+ throw error;
3277
+ }
3278
+ }
3279
+ const resolvedRoot = path19.resolve(workspacePath);
3280
+ if (!await pathIsDirectory(resolvedRoot)) {
3281
+ throw new CliError(`workspace path does not exist: ${resolvedRoot}`, "WORKSPACE_ERROR", 3 /* WorkspaceError */);
3282
+ }
3283
+ const entries = await readdir(resolvedRoot, { withFileTypes: true });
3284
+ const knowledgeBases = (await Promise.all(entries.filter((entry) => entry.isDirectory()).map(async (entry) => {
3285
+ const candidateWorkspace = path19.join(resolvedRoot, entry.name, ".kb");
3286
+ try {
3287
+ const workspace = await assertWorkspaceExists(candidateWorkspace);
3288
+ const config = await loadConfig(workspace);
3289
+ const index = await loadHydratedIndex(workspace);
3290
+ return {
3291
+ name: entry.name,
3292
+ workspacePath: workspace,
3293
+ configuredIndexName: config.index.name,
3294
+ index
3295
+ };
3296
+ } catch (error) {
3297
+ if (error instanceof CliError && error.code === "WORKSPACE_ERROR") {
3298
+ return null;
3299
+ }
3300
+ throw error;
3301
+ }
3302
+ }))).filter((knowledgeBase) => knowledgeBase != null);
3303
+ if (knowledgeBases.length === 0) {
3304
+ throw new CliError(
3305
+ `no knowledge bases found at ${resolvedRoot}; use a .kb workspace or a directory of named subdirectories that each contain .kb`,
3306
+ "WORKSPACE_ERROR",
3307
+ 3 /* WorkspaceError */
3308
+ );
3309
+ }
3310
+ return { mode: "multi", knowledgeBases };
3311
+ }
3312
+ function sendJson(response, statusCode, payload) {
3313
+ response.statusCode = statusCode;
3314
+ response.setHeader("content-type", "application/json; charset=utf-8");
3315
+ response.end(JSON.stringify(payload));
3316
+ }
3317
/**
 * Sends an error envelope of the form `{ error: { type, reason }, status }`.
 *
 * @param response - Response object forwarded to `sendJson`.
 * @param statusCode - HTTP status code; also echoed in the body as `status`.
 * @param type - Error type identifier placed in `error.type`.
 * @param reason - Human-readable explanation placed in `error.reason`.
 */
function sendError(response, statusCode, type, reason) {
  const envelope = {
    error: { type, reason },
    status: statusCode
  };
  sendJson(response, statusCode, envelope);
}
3326
/**
 * Buffers a request body and decodes it as UTF-8.
 *
 * The body comes from an untrusted HTTP client, so the number of bytes kept in
 * memory is capped. Chunks are concatenated before decoding so that a
 * multi-byte character split across two chunks is decoded correctly.
 *
 * @param request - Async iterable of `Buffer` or string chunks (an HTTP request).
 * @param maxBytes - Largest accepted body size in bytes (default 1 MiB). Pass
 *   `Infinity` to disable the limit.
 * @returns The decoded body; an empty string when no chunks were received.
 * @throws CliError with code `INVALID_ARGUMENT` when the body exceeds `maxBytes`.
 */
async function readRequestBody(request, maxBytes = 1024 * 1024) {
  const chunks = [];
  let receivedBytes = 0;
  for await (const chunk of request) {
    const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
    receivedBytes += buffer.length;
    if (receivedBytes <= maxBytes) {
      chunks.push(buffer);
    } else {
      // Over the limit: keep draining the stream so the caller can still send
      // an error response, but stop retaining any data.
      chunks.length = 0;
    }
  }
  if (receivedBytes > maxBytes) {
    throw new CliError(`request body exceeds ${maxBytes} bytes`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  }
  return Buffer.concat(chunks).toString("utf8");
}
3333
/**
 * Parses the raw body of a search request.
 *
 * @param raw - Request body text.
 * @returns `{}` when the body is empty or whitespace only, otherwise the parsed
 *   JSON object.
 * @throws CliError with code `INVALID_ARGUMENT` when the text is not valid JSON
 *   or the parsed value is not a non-array object.
 */
function parseSearchRequest(raw) {
  const text = raw.trim();
  if (text.length === 0) {
    return {};
  }
  // Both failure modes share one error shape; only the detail text differs.
  const invalid = (detail) => new CliError(`invalid JSON request: ${detail}`, "INVALID_ARGUMENT", 2 /* InvalidArguments */);
  let value;
  try {
    value = JSON.parse(text);
  } catch (cause) {
    throw invalid(cause instanceof Error ? cause.message : String(cause));
  }
  const isJsonObject = value !== null && typeof value === "object" && !Array.isArray(value);
  if (!isJsonObject) {
    throw invalid("expected a JSON object");
  }
  return value;
}
3349
/**
 * Returns the search route advertised for a knowledge base.
 *
 * @param mode - `"single"` yields the root route; any other value yields a
 *   route prefixed with the knowledge base name.
 * @param knowledgeBase - Object whose `name` is used in the prefixed route.
 * @returns `"/_search"` or `"/<name>/_search"`.
 */
function routeForKnowledgeBase(mode, knowledgeBase) {
  if (mode === "single") {
    return "/_search";
  }
  const { name } = knowledgeBase;
  return `/${name}/_search`;
}
3352
/**
 * Maps a request path to the knowledge base it addresses.
 *
 * Path segments are percent-decoded before comparison. URL pathnames arrive
 * percent-encoded, so a knowledge base whose name contains a space or a
 * non-ASCII character would otherwise never match its own route.
 *
 * Single mode accepts `/_search` and `/<configuredIndexName>/_search` for the
 * first knowledge base; any other mode accepts `/<name>/_search`.
 *
 * @param pathname - URL pathname of the request.
 * @param mode - `"single"` or another value (treated as multi).
 * @param knowledgeBases - Map from knowledge base name to knowledge base.
 * @returns The matching knowledge base, or `null` when the path matches no
 *   route or contains a malformed percent-escape.
 */
function resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases) {
  let segments;
  try {
    segments = pathname.split("/").filter(Boolean).map((segment) => decodeURIComponent(segment));
  } catch {
    // decodeURIComponent throws URIError on a malformed escape; treat it as an unknown route.
    return null;
  }
  if (mode === "single") {
    const knowledgeBase = knowledgeBases.values().next().value;
    if (!knowledgeBase) {
      return null;
    }
    if (segments.length === 1 && segments[0] === "_search") {
      return knowledgeBase;
    }
    if (segments.length === 2 && segments[1] === "_search" && segments[0] === knowledgeBase.configuredIndexName) {
      return knowledgeBase;
    }
    return null;
  }
  if (segments.length === 2 && segments[1] === "_search") {
    return knowledgeBases.get(segments[0]) ?? null;
  }
  return null;
}
3372
/**
 * Serves one search request and always answers through `sendJson`/`sendError`.
 *
 * Responses:
 * - 405 with an `allow` header for methods other than GET and POST;
 * - 404 when the path does not resolve to a knowledge base;
 * - 400 when a `CliError` with code `INVALID_ARGUMENT` is raised;
 * - 500 for any other failure;
 * - 200 with the search result otherwise.
 *
 * @param request - Incoming HTTP request; its body is read as the search request.
 * @param response - HTTP response to write to.
 * @param pathname - URL pathname used for routing and in error messages.
 * @param mode - `"multi"` reports the knowledge base name as the index name;
 *   any other value reports its configured index name.
 * @param knowledgeBases - Collection passed to `resolveKnowledgeBaseForPath`.
 */
async function handleSearchRequest(request, response, pathname, mode, knowledgeBases) {
  const methodAllowed = request.method === "GET" || request.method === "POST";
  if (!methodAllowed) {
    response.setHeader("allow", "GET, POST");
    sendError(response, 405, "method_not_allowed", `unsupported method for ${pathname}`);
    return;
  }
  const target = resolveKnowledgeBaseForPath(pathname, mode, knowledgeBases);
  if (!target) {
    sendError(response, 404, "resource_not_found_exception", `unknown search route: ${pathname}`);
    return;
  }
  try {
    const rawBody = await readRequestBody(request);
    const searchRequest = parseSearchRequest(rawBody);
    const result = await searchJsonRequest({
      index: target.index,
      request: searchRequest,
      indexName: mode === "multi" ? target.name : target.configuredIndexName
    });
    sendJson(response, 200, result);
  } catch (failure) {
    const isBadRequest = failure instanceof CliError && failure.code === "INVALID_ARGUMENT";
    if (isBadRequest) {
      sendError(response, 400, "parse_exception", failure.message);
    } else {
      const reason = failure instanceof Error ? failure.message : String(failure);
      sendError(response, 500, "search_phase_execution_exception", reason);
    }
  }
}
3401
/**
 * Starts an HTTP server that exposes the discovered knowledge bases for search.
 *
 * @param options.workspacePath - Path handed to `discoverKnowledgeBases`.
 * @param options.host - Interface to bind to (default `"127.0.0.1"`).
 * @param options.port - TCP port to bind to (default `3000`).
 * @returns An object with the discovery `mode`, the base `url` of the server,
 *   one `{ name, workspacePath, route }` entry per knowledge base, and an async
 *   `close()` that stops the server.
 * @throws CliError with code `SERVER_ERROR` when no TCP address is bound;
 *   rejects with the listen error when binding fails.
 */
async function startSearchApiServer({
  workspacePath,
  host = "127.0.0.1",
  port = 3000
}) {
  const { mode, knowledgeBases } = await discoverKnowledgeBases(workspacePath);
  const byName = new Map(knowledgeBases.map((knowledgeBase) => [knowledgeBase.name, knowledgeBase]));
  const server = createServer(async (request, response) => {
    let pathname;
    try {
      // Only the path is needed for routing, so resolve against a fixed base
      // instead of the client-controlled Host header, which can be malformed.
      ({ pathname } = new URL(request.url ?? "/", "http://localhost"));
    } catch {
      sendError(response, 400, "parse_exception", `invalid request target: ${request.url}`);
      return;
    }
    try {
      await handleSearchRequest(request, response, pathname, mode, byName);
    } catch (error) {
      // Nothing awaits this listener, so a rejection escaping it would be an
      // unhandled rejection; answer here instead.
      if (response.headersSent) {
        response.destroy();
        return;
      }
      const message = error instanceof Error ? error.message : String(error);
      sendError(response, 500, "search_phase_execution_exception", message);
    }
  });
  await new Promise((resolve2, reject) => {
    server.once("error", reject);
    server.listen(port, host, () => {
      server.off("error", reject);
      resolve2();
    });
  });
  const address = server.address();
  if (!address || typeof address === "string") {
    // Do not leave a listening server behind when reporting the failure.
    server.close();
    throw new CliError("server failed to bind to a TCP address", "SERVER_ERROR", 1 /* GeneralError */);
  }
  // IPv6 literals must be bracketed to form a valid URL authority.
  const urlHost = host.includes(":") && !host.startsWith("[") ? `[${host}]` : host;
  const url = `http://${urlHost}:${address.port}`;
  return {
    mode,
    url,
    knowledgeBases: knowledgeBases.map((knowledgeBase) => ({
      name: knowledgeBase.name,
      workspacePath: knowledgeBase.workspacePath,
      route: routeForKnowledgeBase(mode, knowledgeBase)
    })),
    close: async () => new Promise((resolve2, reject) => {
      server.close((error) => error ? reject(error) : resolve2());
    })
  };
}
2974
3437
 
2975
3438
  // src/query/related-service.ts
2976
- import path19 from "path";
3439
+ import path20 from "path";
2977
3440
  function cosineSimilarity2(left, right) {
2978
3441
  let dot = 0;
2979
3442
  let leftNorm = 0;
@@ -3049,7 +3512,7 @@ async function findRelatedDocuments({
3049
3512
  if (!await fileExists(denseVectorPath(workspacePath))) {
3050
3513
  throw new CliError("dense vector index is not built; run `qli models pull --dense` and `qli rebuild`", "DENSE_INDEX_MISSING", 7 /* QueryError */);
3051
3514
  }
3052
- const documents = await readJsonl(path19.join(workspacePath, "documents", "documents.jsonl"));
3515
+ const documents = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
3053
3516
  const selected = resolveDocumentSelector(documents, document);
3054
3517
  const densePayload = await readDensePayload(workspacePath);
3055
3518
  const vectors = buildDocumentVectors(documents, densePayload.chunks, densePayload.metadata.dimensions);
@@ -3086,9 +3549,10 @@ async function createContext({
3086
3549
  retrievalMode
3087
3550
  }) {
3088
3551
  const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
3552
+ const results = searchResultsFromResponse(search, true);
3089
3553
  const sources = [];
3090
3554
  let total = 0;
3091
- for (const result of search.results) {
3555
+ for (const result of results) {
3092
3556
  const text = result.text ?? "";
3093
3557
  if (total + text.length > maxChars && sources.length > 0) {
3094
3558
  break;
@@ -3121,7 +3585,7 @@ async function createContext({
3121
3585
  }
3122
3586
 
3123
3587
  // src/report/diff-service.ts
3124
- import path20 from "path";
3588
+ import path21 from "path";
3125
3589
  function chooseBaselineRun(runs, since) {
3126
3590
  if (since === "last-run") {
3127
3591
  return runs.at(-1);
@@ -3137,7 +3601,7 @@ async function diffWorkspace({
3137
3601
  documentId,
3138
3602
  since
3139
3603
  }) {
3140
- const current = await readJsonl(path20.join(workspacePath, "documents", "documents.jsonl"));
3604
+ const current = await readJsonl(path21.join(workspacePath, "documents", "documents.jsonl"));
3141
3605
  const baseline = chooseBaselineRun(await listRuns(workspacePath), since);
3142
3606
  const previous = new Map((baseline?.documentsSnapshot ?? []).map((document) => [document.id, document]));
3143
3607
  const changedDocuments = current.filter((document) => (!sourceId || document.sourceId === sourceId) && (!documentId || document.id === documentId)).filter((document) => {
@@ -3193,10 +3657,15 @@ export {
3193
3657
  ingestSources,
3194
3658
  listSources,
3195
3659
  loadConfig,
3660
+ loadHydratedIndex,
3196
3661
  removeSource,
3197
3662
  renderChangeReport,
3198
3663
  reprocessDocuments,
3199
3664
  searchIndex,
3665
+ searchJsonIndex,
3666
+ searchJsonRequest,
3667
+ searchResultsFromResponse,
3668
+ startSearchApiServer,
3200
3669
  updateSource,
3201
3670
  writeDefaultConfig
3202
3671
  };