@tryformation/querylight-cli 0.2.1 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +7 -3
- package/dist/cli/format.d.ts +2 -2
- package/dist/cli/main.js +391 -103
- package/dist/core/constants.d.ts +1 -1
- package/dist/index/querylight-indexer.d.ts +2 -2
- package/dist/index.js +344 -88
- package/dist/query/search-service.d.ts +8 -1
- package/dist/types/models.d.ts +36 -1
- package/dist/vector/runtime.d.ts +8 -0
- package/package.json +2 -2
- package/scripts/sparse-encode.py +5 -1
package/dist/core/constants.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare const PACKAGE_NAME = "@tryformation/querylight-cli";
|
|
2
|
-
export declare const PACKAGE_VERSION = "0.2.1";
|
|
2
|
+
export declare const PACKAGE_VERSION = "0.2.3";
|
|
3
3
|
export declare const DEFAULT_WORKSPACE = ".kb";
|
|
4
4
|
export declare const DEFAULT_SHARED_MODEL_CACHE_DIR = "~/.qli/models/huggingface";
|
|
5
5
|
export declare const LEGACY_WORKSPACE_MODEL_CACHE_DIR = ".kb/models/huggingface";
|
package/dist/index/querylight-indexer.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { type FieldIndex } from "@tryformation/querylight-ts";
|
|
2
2
|
import { type ProgressHandler } from "../core/progress.js";
|
|
3
3
|
import type { IndexMetadata } from "../types/models.js";
|
|
4
|
-
export declare function createIndexMapping(extraFields?: string[]): Record<string,
|
|
4
|
+
export declare function createIndexMapping(extraFields?: string[]): Record<string, FieldIndex>;
|
|
5
5
|
export declare function buildIndex({ workspacePath, denseOverride, sparseOverride, buildAvailableModels, progress }: {
|
|
6
6
|
workspacePath: string;
|
|
7
7
|
denseOverride?: boolean;
|
package/dist/index.js
CHANGED
|
@@ -1782,7 +1782,7 @@ async function chunkDocuments({
|
|
|
1782
1782
|
}
|
|
1783
1783
|
|
|
1784
1784
|
// src/index/querylight-indexer.ts
|
|
1785
|
-
import { Analyzer, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1785
|
+
import { Analyzer, DateFieldIndex, DocumentIndex, KeywordTokenizer, LowerCaseTextFilter, RankingAlgorithm, StoredSourceIndex, TextFieldIndex } from "@tryformation/querylight-ts";
|
|
1786
1786
|
import path17 from "path";
|
|
1787
1787
|
|
|
1788
1788
|
// src/vector/dense.ts
|
|
@@ -1795,6 +1795,8 @@ import os from "os";
|
|
|
1795
1795
|
import path12 from "path";
|
|
1796
1796
|
import { fileURLToPath } from "url";
|
|
1797
1797
|
import { execFile, execFileSync } from "child_process";
|
|
1798
|
+
import { mkdtemp, rm as rm2, writeFile as writeFile6 } from "fs/promises";
|
|
1799
|
+
var sparseExecFileSync = execFileSync;
|
|
1798
1800
|
function resolveQliHomeDir() {
|
|
1799
1801
|
return path12.resolve(process.env.QLI_HOME ?? path12.join(os.homedir(), ".qli"));
|
|
1800
1802
|
}
|
|
@@ -1850,29 +1852,36 @@ async function runSparsePython({
|
|
|
1850
1852
|
}) {
|
|
1851
1853
|
const cacheDir = resolveCacheDir(workspacePath, config.cacheDir);
|
|
1852
1854
|
const scriptPath = await sparseScriptPath(importMetaUrl);
|
|
1853
|
-
|
|
1854
|
-
|
|
1855
|
-
|
|
1856
|
-
|
|
1857
|
-
|
|
1858
|
-
"
|
|
1859
|
-
|
|
1860
|
-
|
|
1861
|
-
|
|
1862
|
-
|
|
1863
|
-
|
|
1864
|
-
|
|
1865
|
-
|
|
1866
|
-
|
|
1867
|
-
|
|
1868
|
-
|
|
1869
|
-
|
|
1870
|
-
|
|
1871
|
-
|
|
1872
|
-
|
|
1855
|
+
const payloadDir = await mkdtemp(path12.join(os.tmpdir(), "qli-sparse-"));
|
|
1856
|
+
const payloadPath = path12.join(payloadDir, "payload.json");
|
|
1857
|
+
await writeFile6(payloadPath, JSON.stringify(payload), "utf8");
|
|
1858
|
+
try {
|
|
1859
|
+
return sparseExecFileSync(
|
|
1860
|
+
"uv",
|
|
1861
|
+
[
|
|
1862
|
+
"run",
|
|
1863
|
+
"--with",
|
|
1864
|
+
"torch",
|
|
1865
|
+
"--with",
|
|
1866
|
+
"transformers",
|
|
1867
|
+
"--with",
|
|
1868
|
+
"huggingface_hub",
|
|
1869
|
+
"python",
|
|
1870
|
+
scriptPath,
|
|
1871
|
+
payloadPath
|
|
1872
|
+
],
|
|
1873
|
+
{
|
|
1874
|
+
encoding: "utf8",
|
|
1875
|
+
maxBuffer: 1024 * 1024 * 1024,
|
|
1876
|
+
env: {
|
|
1877
|
+
...process.env,
|
|
1878
|
+
HF_HOME: cacheDir
|
|
1879
|
+
}
|
|
1873
1880
|
}
|
|
1874
|
-
|
|
1875
|
-
|
|
1881
|
+
);
|
|
1882
|
+
} finally {
|
|
1883
|
+
await rm2(payloadDir, { recursive: true, force: true });
|
|
1884
|
+
}
|
|
1876
1885
|
}
|
|
1877
1886
|
async function getDenseTransformersRuntime(cacheDir) {
|
|
1878
1887
|
const transformers = await import("@huggingface/transformers");
|
|
@@ -1885,18 +1894,18 @@ async function getDenseTransformersRuntime(cacheDir) {
|
|
|
1885
1894
|
}
|
|
1886
1895
|
|
|
1887
1896
|
// src/vector/store.ts
|
|
1888
|
-
import { mkdir as mkdir6, rm as
|
|
1897
|
+
import { mkdir as mkdir6, rm as rm3, writeFile as writeFile8 } from "fs/promises";
|
|
1889
1898
|
import path13 from "path";
|
|
1890
1899
|
|
|
1891
1900
|
// src/core/gzip-json.ts
|
|
1892
|
-
import { readFile as readFile9, writeFile as
|
|
1901
|
+
import { readFile as readFile9, writeFile as writeFile7 } from "fs/promises";
|
|
1893
1902
|
import { promisify } from "util";
|
|
1894
1903
|
import { gunzip, gzip } from "zlib";
|
|
1895
1904
|
var gzipAsync = promisify(gzip);
|
|
1896
1905
|
var gunzipAsync = promisify(gunzip);
|
|
1897
1906
|
async function writeGzipJson(filePath, value) {
|
|
1898
1907
|
const payload = JSON.stringify(value, null, 2);
|
|
1899
|
-
await
|
|
1908
|
+
await writeFile7(filePath, await gzipAsync(Buffer.from(payload, "utf8")));
|
|
1900
1909
|
}
|
|
1901
1910
|
async function readJsonFromGzipOrFile(gzipPath, legacyPath) {
|
|
1902
1911
|
if (await fileExists(gzipPath)) {
|
|
@@ -1956,8 +1965,8 @@ async function writeDensePayload(workspacePath, payload) {
|
|
|
1956
1965
|
await writeGzipJson(denseVectorPath(workspacePath), payload);
|
|
1957
1966
|
await writeGzipJson(denseMetaPath(workspacePath), payload.metadata);
|
|
1958
1967
|
await Promise.all([
|
|
1959
|
-
|
|
1960
|
-
|
|
1968
|
+
rm3(legacyDenseVectorPath(workspacePath), { force: true }),
|
|
1969
|
+
rm3(legacyDenseMetaPath(workspacePath), { force: true })
|
|
1961
1970
|
]);
|
|
1962
1971
|
}
|
|
1963
1972
|
async function readDensePayload(workspacePath) {
|
|
@@ -1968,8 +1977,8 @@ async function writeSparsePayload(workspacePath, payload) {
|
|
|
1968
1977
|
await writeGzipJson(sparseVectorPath(workspacePath), payload);
|
|
1969
1978
|
await writeGzipJson(sparseMetaPath(workspacePath), payload.metadata);
|
|
1970
1979
|
await Promise.all([
|
|
1971
|
-
|
|
1972
|
-
|
|
1980
|
+
rm3(legacySparseVectorPath(workspacePath), { force: true }),
|
|
1981
|
+
rm3(legacySparseMetaPath(workspacePath), { force: true })
|
|
1973
1982
|
]);
|
|
1974
1983
|
}
|
|
1975
1984
|
async function readSparsePayload(workspacePath) {
|
|
@@ -2309,7 +2318,7 @@ async function buildVectorArtifacts({
|
|
|
2309
2318
|
}
|
|
2310
2319
|
|
|
2311
2320
|
// src/index/index-store.ts
|
|
2312
|
-
import { mkdir as mkdir9, rm as
|
|
2321
|
+
import { mkdir as mkdir9, rm as rm4 } from "fs/promises";
|
|
2313
2322
|
import path16 from "path";
|
|
2314
2323
|
function versionedIndexPath(workspacePath, stamp) {
|
|
2315
2324
|
return path16.join(workspacePath, "indexes", `${stamp}.json.gz`);
|
|
@@ -2351,10 +2360,10 @@ async function writeIndexArtifacts({
|
|
|
2351
2360
|
await writeGzipJson(latestIndexArtifactPath, indexState);
|
|
2352
2361
|
await writeGzipJson(latestMetadataArtifactPath, metadata);
|
|
2353
2362
|
await Promise.all([
|
|
2354
|
-
|
|
2355
|
-
|
|
2356
|
-
|
|
2357
|
-
|
|
2363
|
+
rm4(legacyLatestIndexPath(workspacePath), { force: true }),
|
|
2364
|
+
rm4(legacyLatestMetaPath(workspacePath), { force: true }),
|
|
2365
|
+
rm4(versionedLegacyIndexPath(workspacePath, stamp), { force: true }),
|
|
2366
|
+
rm4(versionedLegacyMetaPath(workspacePath, stamp), { force: true })
|
|
2358
2367
|
]);
|
|
2359
2368
|
return { indexPath: latestIndexArtifactPath, metadataPath: latestMetadataArtifactPath };
|
|
2360
2369
|
}
|
|
@@ -2370,12 +2379,19 @@ function keywordFieldIndex() {
|
|
|
2370
2379
|
function createIndexMapping(extraFields = []) {
|
|
2371
2380
|
const lexical = new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25);
|
|
2372
2381
|
const mapping = {
|
|
2382
|
+
_source: new StoredSourceIndex(),
|
|
2373
2383
|
text: lexical,
|
|
2374
2384
|
title: new TextFieldIndex(void 0, void 0, RankingAlgorithm.BM25),
|
|
2375
2385
|
uri: keywordFieldIndex(),
|
|
2376
2386
|
sourceId: keywordFieldIndex(),
|
|
2387
|
+
sourceName: keywordFieldIndex(),
|
|
2377
2388
|
tags: keywordFieldIndex(),
|
|
2378
|
-
sourceType: keywordFieldIndex()
|
|
2389
|
+
sourceType: keywordFieldIndex(),
|
|
2390
|
+
publicationDate: new DateFieldIndex(),
|
|
2391
|
+
firstSeenAt: new DateFieldIndex(),
|
|
2392
|
+
lastSeenAt: new DateFieldIndex(),
|
|
2393
|
+
lastChangedAt: new DateFieldIndex(),
|
|
2394
|
+
crawledAt: new DateFieldIndex()
|
|
2379
2395
|
};
|
|
2380
2396
|
for (const field of extraFields) {
|
|
2381
2397
|
mapping[field] = keywordFieldIndex();
|
|
@@ -2411,8 +2427,12 @@ async function buildIndex({
|
|
|
2411
2427
|
const sources = await readJsonl(path17.join(workspacePath, "sources", "sources.jsonl"));
|
|
2412
2428
|
const metadataFields = [...new Set(chunks.flatMap((chunk) => Object.keys(chunk.metadata).map((key) => `metadata.${key}`)))];
|
|
2413
2429
|
const index = new DocumentIndex(createIndexMapping(metadataFields));
|
|
2430
|
+
const documentsById = new Map(documents.map((document) => [document.id, document]));
|
|
2431
|
+
const sourcesById = new Map(sources.map((source) => [source.id, source]));
|
|
2414
2432
|
reportProgress(progress, `Building lexical index from ${chunks.length} chunk${chunks.length === 1 ? "" : "s"}`);
|
|
2415
2433
|
for (const chunk of chunks) {
|
|
2434
|
+
const document = documentsById.get(chunk.documentId);
|
|
2435
|
+
const source = sourcesById.get(chunk.sourceId);
|
|
2416
2436
|
index.index({
|
|
2417
2437
|
id: chunk.id,
|
|
2418
2438
|
fields: {
|
|
@@ -2420,9 +2440,33 @@ async function buildIndex({
|
|
|
2420
2440
|
title: [chunk.title],
|
|
2421
2441
|
uri: [chunk.uri.toLowerCase()],
|
|
2422
2442
|
sourceId: [chunk.sourceId.toLowerCase()],
|
|
2443
|
+
sourceName: source ? [source.name.toLowerCase()] : [],
|
|
2423
2444
|
tags: Array.isArray(chunk.metadata.tags) ? chunk.metadata.tags.map((tag) => String(tag).toLowerCase()) : [],
|
|
2424
2445
|
sourceType: [String(chunk.metadata.sourceType ?? "").toLowerCase()],
|
|
2446
|
+
publicationDate: document?.publicationDate ? [document.publicationDate] : [],
|
|
2447
|
+
firstSeenAt: [document?.firstSeenAt ?? chunk.firstSeenAt],
|
|
2448
|
+
lastSeenAt: [document?.lastSeenAt ?? chunk.lastSeenAt],
|
|
2449
|
+
lastChangedAt: [document?.lastChangedAt ?? chunk.lastChangedAt],
|
|
2450
|
+
crawledAt: document?.crawledAt ? [document.crawledAt] : [],
|
|
2425
2451
|
...flattenMetadata(chunk.metadata)
|
|
2452
|
+
},
|
|
2453
|
+
source: {
|
|
2454
|
+
chunkId: chunk.id,
|
|
2455
|
+
documentId: chunk.documentId,
|
|
2456
|
+
sourceId: chunk.sourceId,
|
|
2457
|
+
sourceType: document?.sourceType ?? "text",
|
|
2458
|
+
sourceName: source?.name,
|
|
2459
|
+
title: chunk.title,
|
|
2460
|
+
uri: chunk.uri,
|
|
2461
|
+
headingPath: chunk.headingPath,
|
|
2462
|
+
text: chunk.text,
|
|
2463
|
+
normalizedPath: document?.normalizedPath,
|
|
2464
|
+
publicationDate: document?.publicationDate ?? null,
|
|
2465
|
+
crawledAt: document?.crawledAt,
|
|
2466
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
2467
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
2468
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
2469
|
+
metadata: chunk.metadata
|
|
2426
2470
|
}
|
|
2427
2471
|
});
|
|
2428
2472
|
}
|
|
@@ -2431,7 +2475,7 @@ async function buildIndex({
|
|
|
2431
2475
|
const metadata = {
|
|
2432
2476
|
id: `index_${createdAt.replace(/[:.]/g, "-")}`,
|
|
2433
2477
|
createdAt,
|
|
2434
|
-
querylightVersion: "0.
|
|
2478
|
+
querylightVersion: "0.11.0",
|
|
2435
2479
|
kbVersion: "0.1.0",
|
|
2436
2480
|
documentCount: documents.length,
|
|
2437
2481
|
chunkCount: chunks.length,
|
|
@@ -2460,7 +2504,7 @@ async function buildIndex({
|
|
|
2460
2504
|
|
|
2461
2505
|
// src/query/search-service.ts
|
|
2462
2506
|
import { readFile as readFile10 } from "fs/promises";
|
|
2463
|
-
import {
|
|
2507
|
+
import { reciprocalRankFusion, searchJsonDsl } from "@tryformation/querylight-ts";
|
|
2464
2508
|
import path18 from "path";
|
|
2465
2509
|
async function loadHydratedIndex(workspacePath) {
|
|
2466
2510
|
let state;
|
|
@@ -2488,24 +2532,6 @@ function matchesPrefix(value, prefixes) {
|
|
|
2488
2532
|
const lower = value.toLowerCase();
|
|
2489
2533
|
return prefixes.some((prefix) => lower.startsWith(prefix));
|
|
2490
2534
|
}
|
|
2491
|
-
function buildSearchQuery(query, filters) {
|
|
2492
|
-
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2493
|
-
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2494
|
-
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2495
|
-
return new BoolQuery({
|
|
2496
|
-
should: [
|
|
2497
|
-
new MatchQuery({ field: "title", text: query, operation: OP.AND, boost: 6 }),
|
|
2498
|
-
new MatchQuery({ field: "text", text: query, operation: OP.AND, boost: 4 }),
|
|
2499
|
-
new MatchQuery({ field: "text", text: query, operation: OP.OR, boost: 2 })
|
|
2500
|
-
],
|
|
2501
|
-
filter: [
|
|
2502
|
-
...sourceIds.length === 1 ? [new TermQuery({ field: "sourceId", text: sourceIds[0] })] : [],
|
|
2503
|
-
...sourceTypes.length === 1 ? [new TermQuery({ field: "sourceType", text: sourceTypes[0] })] : [],
|
|
2504
|
-
...tags.length === 1 ? [new TermQuery({ field: "tags", text: tags[0] })] : [],
|
|
2505
|
-
...(filters.metadata ?? []).map(({ key, value }) => new TermQuery({ field: `metadata.${key}`, text: value.toLowerCase() }))
|
|
2506
|
-
]
|
|
2507
|
-
});
|
|
2508
|
-
}
|
|
2509
2535
|
function isValidDate(value) {
|
|
2510
2536
|
return typeof value === "string" && !Number.isNaN(new Date(value).getTime());
|
|
2511
2537
|
}
|
|
@@ -2704,6 +2730,178 @@ async function buildSnippetWithAdjacentChunks(chunk, query, {
|
|
|
2704
2730
|
}
|
|
2705
2731
|
return buildExpandedParagraphSnippet(paragraphs, currentIndex, query);
|
|
2706
2732
|
}
|
|
2733
|
+
function buildSearchDslRequest({
|
|
2734
|
+
query,
|
|
2735
|
+
topK,
|
|
2736
|
+
filters,
|
|
2737
|
+
dateRanges
|
|
2738
|
+
}) {
|
|
2739
|
+
const filterClauses = [];
|
|
2740
|
+
const sourceIds = normalizeFilterValues([filters.sourceId, ...filters.sourceIds ?? []].filter((value) => Boolean(value)));
|
|
2741
|
+
const sourceNames = normalizeFilterValues([filters.sourceName, ...filters.sourceNames ?? []].filter((value) => Boolean(value)));
|
|
2742
|
+
const sourceTypes = normalizeFilterValues([filters.sourceType, ...filters.sourceTypes ?? []].filter((value) => Boolean(value)));
|
|
2743
|
+
const uriPrefixes = normalizeFilterValues([filters.uriPrefix, ...filters.uriPrefixes ?? []].filter((value) => Boolean(value)));
|
|
2744
|
+
const tags = normalizeFilterValues([filters.tag, ...filters.tags ?? []].filter((value) => Boolean(value)));
|
|
2745
|
+
if (sourceIds.length > 0) {
|
|
2746
|
+
filterClauses.push({ terms: { sourceId: sourceIds } });
|
|
2747
|
+
}
|
|
2748
|
+
if (sourceNames.length > 0) {
|
|
2749
|
+
filterClauses.push({ terms: { sourceName: sourceNames } });
|
|
2750
|
+
}
|
|
2751
|
+
if (sourceTypes.length > 0) {
|
|
2752
|
+
filterClauses.push({ terms: { sourceType: sourceTypes } });
|
|
2753
|
+
}
|
|
2754
|
+
if (uriPrefixes.length > 0) {
|
|
2755
|
+
filterClauses.push({
|
|
2756
|
+
bool: {
|
|
2757
|
+
should: uriPrefixes.map((prefix) => ({ prefix: { uri: prefix } })),
|
|
2758
|
+
minimum_should_match: 1
|
|
2759
|
+
}
|
|
2760
|
+
});
|
|
2761
|
+
}
|
|
2762
|
+
if (tags.length > 0) {
|
|
2763
|
+
filterClauses.push({ terms: { tags } });
|
|
2764
|
+
}
|
|
2765
|
+
if (filters.hasPublicationDate) {
|
|
2766
|
+
filterClauses.push({ exists: { field: "publicationDate" } });
|
|
2767
|
+
}
|
|
2768
|
+
for (const { key, value } of filters.metadata ?? []) {
|
|
2769
|
+
filterClauses.push({ term: { [`metadata.${key}`]: value.toLowerCase() } });
|
|
2770
|
+
}
|
|
2771
|
+
for (const { field, from, to } of dateRanges) {
|
|
2772
|
+
filterClauses.push({
|
|
2773
|
+
range: {
|
|
2774
|
+
[field]: {
|
|
2775
|
+
...from ? { gte: from } : {},
|
|
2776
|
+
...to ? { lte: to } : {}
|
|
2777
|
+
}
|
|
2778
|
+
}
|
|
2779
|
+
});
|
|
2780
|
+
}
|
|
2781
|
+
return {
|
|
2782
|
+
size: topK,
|
|
2783
|
+
query: {
|
|
2784
|
+
bool: {
|
|
2785
|
+
should: [
|
|
2786
|
+
{ match: { title: { query, operator: "and", boost: 6 } } },
|
|
2787
|
+
{ match: { text: { query, operator: "and", boost: 4 } } },
|
|
2788
|
+
{ match: { text: { query, operator: "or", boost: 2 } } }
|
|
2789
|
+
],
|
|
2790
|
+
filter: filterClauses,
|
|
2791
|
+
minimum_should_match: 1
|
|
2792
|
+
}
|
|
2793
|
+
}
|
|
2794
|
+
};
|
|
2795
|
+
}
|
|
2796
|
+
function sourceToChunkRecord(source) {
|
|
2797
|
+
return {
|
|
2798
|
+
id: source.chunkId,
|
|
2799
|
+
documentId: source.documentId,
|
|
2800
|
+
sourceId: source.sourceId,
|
|
2801
|
+
title: source.title,
|
|
2802
|
+
uri: source.uri,
|
|
2803
|
+
headingPath: source.headingPath,
|
|
2804
|
+
text: source.text,
|
|
2805
|
+
contentHash: "",
|
|
2806
|
+
metadata: source.metadata,
|
|
2807
|
+
firstSeenAt: source.firstSeenAt,
|
|
2808
|
+
lastSeenAt: source.lastSeenAt,
|
|
2809
|
+
lastChangedAt: source.lastChangedAt
|
|
2810
|
+
};
|
|
2811
|
+
}
|
|
2812
|
+
function sourceToDocumentRecord(source) {
|
|
2813
|
+
return {
|
|
2814
|
+
id: source.documentId,
|
|
2815
|
+
sourceId: source.sourceId,
|
|
2816
|
+
sourceType: source.sourceType,
|
|
2817
|
+
title: source.title,
|
|
2818
|
+
uri: source.uri,
|
|
2819
|
+
sourceUri: source.uri,
|
|
2820
|
+
mimeType: "text/plain",
|
|
2821
|
+
normalizedPath: source.normalizedPath ?? "",
|
|
2822
|
+
contentHash: "",
|
|
2823
|
+
metadata: source.metadata,
|
|
2824
|
+
publicationDate: source.publicationDate ?? null,
|
|
2825
|
+
crawledAt: source.crawledAt,
|
|
2826
|
+
firstSeenAt: source.firstSeenAt,
|
|
2827
|
+
lastSeenAt: source.lastSeenAt,
|
|
2828
|
+
lastChangedAt: source.lastChangedAt
|
|
2829
|
+
};
|
|
2830
|
+
}
|
|
2831
|
+
async function materializeSearchHit(hit, query, config, orderedChunkCache, showChunks) {
|
|
2832
|
+
const source = hit._source;
|
|
2833
|
+
const chunk = sourceToChunkRecord(source);
|
|
2834
|
+
const document = sourceToDocumentRecord(source);
|
|
2835
|
+
const snippet = await buildSnippetWithAdjacentChunks(chunk, query, { document, config, orderedChunkCache });
|
|
2836
|
+
const enrichedSource = {
|
|
2837
|
+
...source,
|
|
2838
|
+
snippet
|
|
2839
|
+
};
|
|
2840
|
+
const result = {
|
|
2841
|
+
chunkId: source.chunkId,
|
|
2842
|
+
documentId: source.documentId,
|
|
2843
|
+
sourceId: source.sourceId,
|
|
2844
|
+
sourceType: source.sourceType,
|
|
2845
|
+
score: hit._score,
|
|
2846
|
+
title: chooseResultTitle(chunk),
|
|
2847
|
+
uri: source.uri,
|
|
2848
|
+
snippet,
|
|
2849
|
+
text: showChunks ? source.text : void 0,
|
|
2850
|
+
publicationDate: source.publicationDate ?? null,
|
|
2851
|
+
firstSeenAt: source.firstSeenAt,
|
|
2852
|
+
lastSeenAt: source.lastSeenAt,
|
|
2853
|
+
lastChangedAt: source.lastChangedAt,
|
|
2854
|
+
metadata: source.metadata
|
|
2855
|
+
};
|
|
2856
|
+
return {
|
|
2857
|
+
hit: {
|
|
2858
|
+
...hit,
|
|
2859
|
+
_source: enrichedSource
|
|
2860
|
+
},
|
|
2861
|
+
result
|
|
2862
|
+
};
|
|
2863
|
+
}
|
|
2864
|
+
function createSearchResponse(retrievalMode, hits, took, aggregations) {
|
|
2865
|
+
return {
|
|
2866
|
+
retrievalMode,
|
|
2867
|
+
took,
|
|
2868
|
+
hits: {
|
|
2869
|
+
total: {
|
|
2870
|
+
value: hits.length,
|
|
2871
|
+
relation: "eq"
|
|
2872
|
+
},
|
|
2873
|
+
max_score: hits.length > 0 ? Math.max(...hits.map((hit) => hit._score)) : null,
|
|
2874
|
+
hits
|
|
2875
|
+
},
|
|
2876
|
+
aggregations
|
|
2877
|
+
};
|
|
2878
|
+
}
|
|
2879
|
+
function searchResultsFromResponse(response, showChunks = false) {
|
|
2880
|
+
return response.hits.hits.map((hit) => ({
|
|
2881
|
+
chunkId: hit._source.chunkId,
|
|
2882
|
+
documentId: hit._source.documentId,
|
|
2883
|
+
sourceId: hit._source.sourceId,
|
|
2884
|
+
sourceType: hit._source.sourceType,
|
|
2885
|
+
score: hit._score,
|
|
2886
|
+
title: chooseResultTitle(sourceToChunkRecord(hit._source)),
|
|
2887
|
+
uri: hit._source.uri,
|
|
2888
|
+
snippet: hit._source.snippet ?? hit.highlight?.text?.join("\n\n") ?? buildSnippet(hit._source.text, hit._source.title),
|
|
2889
|
+
text: showChunks ? hit._source.text : void 0,
|
|
2890
|
+
publicationDate: hit._source.publicationDate ?? null,
|
|
2891
|
+
firstSeenAt: hit._source.firstSeenAt,
|
|
2892
|
+
lastSeenAt: hit._source.lastSeenAt,
|
|
2893
|
+
lastChangedAt: hit._source.lastChangedAt,
|
|
2894
|
+
metadata: hit._source.metadata
|
|
2895
|
+
}));
|
|
2896
|
+
}
|
|
2897
|
+
async function searchJsonIndex({
|
|
2898
|
+
workspacePath,
|
|
2899
|
+
request,
|
|
2900
|
+
indexName = "querylight"
|
|
2901
|
+
}) {
|
|
2902
|
+
const index = await loadHydratedIndex(workspacePath);
|
|
2903
|
+
return searchJsonDsl({ index, request, indexName });
|
|
2904
|
+
}
|
|
2707
2905
|
function normalizeDisplayTitle(title) {
|
|
2708
2906
|
return title.replace(/\s*\|\s*Querylight TS Demo\s*$/i, "").replace(/\s+/g, " ").trim();
|
|
2709
2907
|
}
|
|
@@ -2841,6 +3039,7 @@ async function searchIndex({
|
|
|
2841
3039
|
retrievalMode,
|
|
2842
3040
|
showChunks = false
|
|
2843
3041
|
}) {
|
|
3042
|
+
const startedAt = Date.now();
|
|
2844
3043
|
const config = await loadConfig(workspacePath);
|
|
2845
3044
|
const mode = retrievalMode ?? config.retrieval.defaultMode;
|
|
2846
3045
|
const candidateLimit = Math.max(topK * 5, 50);
|
|
@@ -2897,12 +3096,48 @@ async function searchIndex({
|
|
|
2897
3096
|
};
|
|
2898
3097
|
})
|
|
2899
3098
|
);
|
|
2900
|
-
|
|
3099
|
+
const hits2 = latestResults.filter((result) => result != null).map((result) => {
|
|
3100
|
+
const chunk = chunks.get(result.chunkId);
|
|
3101
|
+
const document = documents.get(result.documentId);
|
|
3102
|
+
const source = sources.get(result.sourceId);
|
|
3103
|
+
return {
|
|
3104
|
+
_index: "querylight",
|
|
3105
|
+
_id: result.chunkId,
|
|
3106
|
+
_score: result.score,
|
|
3107
|
+
_source: {
|
|
3108
|
+
chunkId: result.chunkId,
|
|
3109
|
+
documentId: result.documentId,
|
|
3110
|
+
sourceId: result.sourceId,
|
|
3111
|
+
sourceType: result.sourceType,
|
|
3112
|
+
sourceName: source?.name,
|
|
3113
|
+
title: chunk.title,
|
|
3114
|
+
uri: result.uri,
|
|
3115
|
+
headingPath: chunk.headingPath,
|
|
3116
|
+
text: chunk.text,
|
|
3117
|
+
snippet: result.snippet,
|
|
3118
|
+
normalizedPath: document.normalizedPath,
|
|
3119
|
+
publicationDate: result.publicationDate ?? null,
|
|
3120
|
+
crawledAt: document.crawledAt,
|
|
3121
|
+
firstSeenAt: result.firstSeenAt,
|
|
3122
|
+
lastSeenAt: result.lastSeenAt,
|
|
3123
|
+
lastChangedAt: result.lastChangedAt,
|
|
3124
|
+
metadata: result.metadata
|
|
3125
|
+
}
|
|
3126
|
+
};
|
|
3127
|
+
});
|
|
3128
|
+
return createSearchResponse("lexical", hits2, Date.now() - startedAt);
|
|
2901
3129
|
}
|
|
2902
3130
|
const lexicalHits = async () => {
|
|
2903
|
-
const
|
|
2904
|
-
|
|
2905
|
-
|
|
3131
|
+
const response = await searchJsonIndex({
|
|
3132
|
+
workspacePath,
|
|
3133
|
+
request: buildSearchDslRequest({
|
|
3134
|
+
query: normalizedQuery,
|
|
3135
|
+
topK: candidateLimit,
|
|
3136
|
+
filters: { sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata },
|
|
3137
|
+
dateRanges
|
|
3138
|
+
})
|
|
3139
|
+
});
|
|
3140
|
+
return response.hits.hits;
|
|
2906
3141
|
};
|
|
2907
3142
|
const denseHits = async () => {
|
|
2908
3143
|
if (!await fileExists(denseVectorPath(workspacePath))) {
|
|
@@ -2916,15 +3151,18 @@ async function searchIndex({
|
|
|
2916
3151
|
}
|
|
2917
3152
|
return sparseQuery({ workspacePath, config: config.retrieval.sparse, query: normalizedQuery, topK: candidateLimit }).then((hits2) => hits2.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit));
|
|
2918
3153
|
};
|
|
3154
|
+
let lexicalResponseHits = [];
|
|
2919
3155
|
let hits;
|
|
2920
3156
|
if (mode === "lexical") {
|
|
2921
|
-
|
|
3157
|
+
lexicalResponseHits = await lexicalHits();
|
|
3158
|
+
hits = lexicalResponseHits.map((hit) => [hit._id, hit._score]);
|
|
2922
3159
|
} else if (mode === "dense") {
|
|
2923
3160
|
hits = await denseHits();
|
|
2924
3161
|
} else if (mode === "sparse") {
|
|
2925
3162
|
hits = await sparseHits();
|
|
2926
3163
|
} else {
|
|
2927
|
-
|
|
3164
|
+
lexicalResponseHits = await lexicalHits();
|
|
3165
|
+
const rankings = [lexicalResponseHits.map((hit) => [hit._id, hit._score])];
|
|
2928
3166
|
if (await fileExists(denseVectorPath(workspacePath))) {
|
|
2929
3167
|
rankings.push(await denseQuery({ workspacePath, config: config.retrieval.dense, query: normalizedQuery, topK: candidateLimit }).then((dense) => dense.filter(([chunkId]) => filterIds.includes(chunkId)).slice(0, candidateLimit)));
|
|
2930
3168
|
}
|
|
@@ -2933,34 +3171,49 @@ async function searchIndex({
|
|
|
2933
3171
|
}
|
|
2934
3172
|
hits = reciprocalRankFusion(rankings, { rankConstant: 20, weights: rankings.map((_, index) => index === 0 ? 3 : 1) }).slice(0, candidateLimit);
|
|
2935
3173
|
}
|
|
2936
|
-
const
|
|
3174
|
+
const baseHits = mode === "lexical" ? lexicalResponseHits : hits.flatMap(([chunkId, score]) => {
|
|
2937
3175
|
const chunk = chunks.get(chunkId);
|
|
2938
3176
|
if (!chunk) {
|
|
2939
|
-
return
|
|
3177
|
+
return [];
|
|
2940
3178
|
}
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
2946
|
-
score,
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
2956
|
-
|
|
2957
|
-
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
|
|
3179
|
+
const document = documents.get(chunk.documentId);
|
|
3180
|
+
const source = sources.get(chunk.sourceId);
|
|
3181
|
+
return [{
|
|
3182
|
+
_index: "querylight",
|
|
3183
|
+
_id: chunkId,
|
|
3184
|
+
_score: score,
|
|
3185
|
+
_source: {
|
|
3186
|
+
chunkId,
|
|
3187
|
+
documentId: chunk.documentId,
|
|
3188
|
+
sourceId: chunk.sourceId,
|
|
3189
|
+
sourceType: document?.sourceType ?? "text",
|
|
3190
|
+
sourceName: source?.name,
|
|
3191
|
+
title: chunk.title,
|
|
3192
|
+
uri: chunk.uri,
|
|
3193
|
+
headingPath: chunk.headingPath,
|
|
3194
|
+
text: chunk.text,
|
|
3195
|
+
normalizedPath: document?.normalizedPath,
|
|
3196
|
+
publicationDate: document?.publicationDate ?? null,
|
|
3197
|
+
crawledAt: document?.crawledAt,
|
|
3198
|
+
firstSeenAt: document?.firstSeenAt ?? chunk.firstSeenAt,
|
|
3199
|
+
lastSeenAt: document?.lastSeenAt ?? chunk.lastSeenAt,
|
|
3200
|
+
lastChangedAt: document?.lastChangedAt ?? chunk.lastChangedAt,
|
|
3201
|
+
metadata: chunk.metadata
|
|
3202
|
+
}
|
|
3203
|
+
}];
|
|
3204
|
+
});
|
|
3205
|
+
const materialized = await Promise.all(baseHits.map((hit) => materializeSearchHit(hit, normalizedQuery, config, orderedChunkCache, showChunks)));
|
|
3206
|
+
if (showChunks) {
|
|
3207
|
+
const topHits = materialized.sort((left, right) => right.result.score - left.result.score).slice(0, topK).map(({ hit, result }) => ({ ...hit, _score: result.score }));
|
|
3208
|
+
return createSearchResponse(mode, topHits, Date.now() - startedAt);
|
|
3209
|
+
}
|
|
3210
|
+
const reranked = rerankResultsByDocument(materialized.map(({ result }) => result), topK);
|
|
3211
|
+
const byChunkId = new Map(materialized.map(({ hit }) => [hit._id, hit]));
|
|
3212
|
+
const finalHits = reranked.map((result) => {
|
|
3213
|
+
const hit = byChunkId.get(result.chunkId);
|
|
3214
|
+
return hit ? { ...hit, _score: result.score, _source: { ...hit._source, snippet: result.snippet } } : null;
|
|
3215
|
+
}).filter((hit) => hit != null);
|
|
3216
|
+
return createSearchResponse(mode, finalHits, Date.now() - startedAt);
|
|
2964
3217
|
}
|
|
2965
3218
|
|
|
2966
3219
|
// src/query/related-service.ts
|
|
@@ -3077,9 +3330,10 @@ async function createContext({
|
|
|
3077
3330
|
retrievalMode
|
|
3078
3331
|
}) {
|
|
3079
3332
|
const search = await searchIndex({ workspacePath, query, topK, showChunks: true, retrievalMode });
|
|
3333
|
+
const results = searchResultsFromResponse(search, true);
|
|
3080
3334
|
const sources = [];
|
|
3081
3335
|
let total = 0;
|
|
3082
|
-
for (const result of
|
|
3336
|
+
for (const result of results) {
|
|
3083
3337
|
const text = result.text ?? "";
|
|
3084
3338
|
if (total + text.length > maxChars && sources.length > 0) {
|
|
3085
3339
|
break;
|
|
@@ -3188,6 +3442,8 @@ export {
|
|
|
3188
3442
|
renderChangeReport,
|
|
3189
3443
|
reprocessDocuments,
|
|
3190
3444
|
searchIndex,
|
|
3445
|
+
searchJsonIndex,
|
|
3446
|
+
searchResultsFromResponse,
|
|
3191
3447
|
updateSource,
|
|
3192
3448
|
writeDefaultConfig
|
|
3193
3449
|
};
|
package/dist/query/search-service.d.ts
CHANGED
|
@@ -1,10 +1,17 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
|
|
2
|
+
import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
|
|
2
3
|
type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
|
|
3
4
|
type SearchDateRange = {
|
|
4
5
|
field: SearchDateField;
|
|
5
6
|
from?: string;
|
|
6
7
|
to?: string;
|
|
7
8
|
};
|
|
9
|
+
export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
|
|
10
|
+
export declare function searchJsonIndex({ workspacePath, request, indexName }: {
|
|
11
|
+
workspacePath: string;
|
|
12
|
+
request: JsonDslRequest;
|
|
13
|
+
indexName?: string;
|
|
14
|
+
}): Promise<JsonDslResponse>;
|
|
8
15
|
export declare function searchIndex({ workspacePath, query, topK, sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges, retrievalMode, showChunks }: {
|
|
9
16
|
workspacePath: string;
|
|
10
17
|
query: string;
|