@tryformation/querylight-cli 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -3
- package/dist/cli/format.d.ts +2 -2
- package/dist/cli/main.js +694 -135
- package/dist/core/constants.d.ts +1 -1
- package/dist/index/querylight-indexer.d.ts +2 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +592 -123
- package/dist/query/search-service.d.ts +14 -1
- package/dist/server/search-api.d.ts +15 -0
- package/dist/types/models.d.ts +36 -1
- package/dist/vector/dense.d.ts +6 -1
- package/package.json +2 -2
- package/scripts/sparse-encode.py +29 -8
|
@@ -1,10 +1,23 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import { type DocumentIndex, type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
|
|
2
|
+
import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
|
|
3
|
+
export declare function loadHydratedIndex(workspacePath: string): Promise<DocumentIndex>;
|
|
2
4
|
type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
|
|
3
5
|
type SearchDateRange = {
|
|
4
6
|
field: SearchDateField;
|
|
5
7
|
from?: string;
|
|
6
8
|
to?: string;
|
|
7
9
|
};
|
|
10
|
+
export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
|
|
11
|
+
export declare function searchJsonRequest({ index, request, indexName }: {
|
|
12
|
+
index: DocumentIndex;
|
|
13
|
+
request: JsonDslRequest;
|
|
14
|
+
indexName?: string;
|
|
15
|
+
}): Promise<JsonDslResponse>;
|
|
16
|
+
export declare function searchJsonIndex({ workspacePath, request, indexName }: {
|
|
17
|
+
workspacePath: string;
|
|
18
|
+
request: JsonDslRequest;
|
|
19
|
+
indexName?: string;
|
|
20
|
+
}): Promise<JsonDslResponse>;
|
|
8
21
|
export declare function searchIndex({ workspacePath, query, topK, sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges, retrievalMode, showChunks }: {
|
|
9
22
|
workspacePath: string;
|
|
10
23
|
query: string;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
export type SearchApiServerInfo = {
|
|
2
|
+
mode: "single" | "multi";
|
|
3
|
+
url: string;
|
|
4
|
+
knowledgeBases: Array<{
|
|
5
|
+
name: string;
|
|
6
|
+
workspacePath: string;
|
|
7
|
+
route: string;
|
|
8
|
+
}>;
|
|
9
|
+
close: () => Promise<void>;
|
|
10
|
+
};
|
|
11
|
+
export declare function startSearchApiServer({ workspacePath, host, port }: {
|
|
12
|
+
workspacePath: string;
|
|
13
|
+
host?: string;
|
|
14
|
+
port?: number;
|
|
15
|
+
}): Promise<SearchApiServerInfo>;
|
package/dist/types/models.d.ts
CHANGED
|
@@ -222,9 +222,44 @@ export type SearchResult = {
|
|
|
222
222
|
lastChangedAt: string;
|
|
223
223
|
metadata: Record<string, unknown>;
|
|
224
224
|
};
|
|
225
|
+
export type SearchHitSource = {
|
|
226
|
+
chunkId: string;
|
|
227
|
+
documentId: string;
|
|
228
|
+
sourceId: string;
|
|
229
|
+
sourceType: SourceType;
|
|
230
|
+
sourceName?: string;
|
|
231
|
+
title: string;
|
|
232
|
+
uri: string;
|
|
233
|
+
headingPath: string[];
|
|
234
|
+
text: string;
|
|
235
|
+
snippet?: string;
|
|
236
|
+
normalizedPath?: string;
|
|
237
|
+
publicationDate?: string | null;
|
|
238
|
+
crawledAt?: string;
|
|
239
|
+
firstSeenAt: string;
|
|
240
|
+
lastSeenAt: string;
|
|
241
|
+
lastChangedAt: string;
|
|
242
|
+
metadata: Record<string, unknown>;
|
|
243
|
+
};
|
|
244
|
+
export type SearchHit = {
|
|
245
|
+
_index: string;
|
|
246
|
+
_id: string;
|
|
247
|
+
_score: number;
|
|
248
|
+
_source: SearchHitSource;
|
|
249
|
+
highlight?: Record<string, string[]>;
|
|
250
|
+
};
|
|
225
251
|
export type SearchResponseData = {
|
|
226
252
|
retrievalMode?: RetrievalMode;
|
|
227
|
-
|
|
253
|
+
took: number;
|
|
254
|
+
hits: {
|
|
255
|
+
total: {
|
|
256
|
+
value: number;
|
|
257
|
+
relation: "eq";
|
|
258
|
+
};
|
|
259
|
+
max_score: number | null;
|
|
260
|
+
hits: SearchHit[];
|
|
261
|
+
};
|
|
262
|
+
aggregations?: Record<string, unknown>;
|
|
228
263
|
};
|
|
229
264
|
export type RelatedDocumentResult = {
|
|
230
265
|
documentId: string;
|
package/dist/vector/dense.d.ts
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
import { type ProgressHandler } from "../core/progress.js";
|
|
2
2
|
import type { DenseVectorPayload, WorkspaceConfig } from "../types/models.js";
|
|
3
|
-
|
|
3
|
+
type DenseEmbedder = {
|
|
4
|
+
embed(text: string): Promise<number[]>;
|
|
5
|
+
dispose?: () => Promise<void>;
|
|
6
|
+
};
|
|
7
|
+
export declare function setDenseEmbedderFactoryForTests(factory: ((cacheDir: string, modelId: string) => Promise<DenseEmbedder | ((text: string) => Promise<number[]>)>) | null): void;
|
|
4
8
|
export declare function pullDenseModel(workspacePath: string, config: WorkspaceConfig["retrieval"]["dense"]): Promise<void>;
|
|
5
9
|
export declare function buildDenseVectors({ workspacePath, config, progress }: {
|
|
6
10
|
workspacePath: string;
|
|
@@ -13,3 +17,4 @@ export declare function denseQuery({ workspacePath, config, query, topK }: {
|
|
|
13
17
|
query: string;
|
|
14
18
|
topK: number;
|
|
15
19
|
}): Promise<Array<[string, number]>>;
|
|
20
|
+
export {};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tryformation/querylight-cli",
|
|
3
|
-
"version": "0.2.2",
|
|
3
|
+
"version": "0.2.4",
|
|
4
4
|
"description": "Querylight CLI for building and querying local knowledge bases.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"homepage": "https://github.com/formation-res/querylight-cli#readme",
|
|
@@ -40,7 +40,7 @@
|
|
|
40
40
|
},
|
|
41
41
|
"dependencies": {
|
|
42
42
|
"@huggingface/transformers": "^3.8.1",
|
|
43
|
-
"@tryformation/querylight-ts": "^0.
|
|
43
|
+
"@tryformation/querylight-ts": "^0.11.0",
|
|
44
44
|
"cheerio": "^1.2.0",
|
|
45
45
|
"cli-table3": "^0.6.5",
|
|
46
46
|
"commander": "^14.0.3",
|
package/scripts/sparse-encode.py
CHANGED
|
@@ -7,19 +7,40 @@ from huggingface_hub import hf_hub_download
|
|
|
7
7
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
def _load_query_weights_file(model_id: str, filename: str):
|
|
11
|
+
try:
|
|
12
|
+
return hf_hub_download(repo_id=model_id, filename=filename)
|
|
13
|
+
except Exception:
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
|
|
10
17
|
def build_query_token_weight_vector(tokenizer, model_id: str):
|
|
11
|
-
local_cached_path = hf_hub_download(repo_id=model_id, filename="query_token_weights.txt")
|
|
12
18
|
vector = [0.0] * tokenizer.vocab_size
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
19
|
+
local_cached_path = _load_query_weights_file(model_id, "query_token_weights.txt")
|
|
20
|
+
|
|
21
|
+
if local_cached_path is not None:
|
|
22
|
+
with open(local_cached_path, encoding="utf-8") as handle:
|
|
23
|
+
for line in handle:
|
|
24
|
+
line = line.rstrip("\n")
|
|
25
|
+
if not line:
|
|
26
|
+
continue
|
|
27
|
+
token, weight = line.split("\t", 1)
|
|
28
|
+
token_id = tokenizer._convert_token_to_id_with_added_voc(token)
|
|
29
|
+
if token_id is not None and token_id >= 0:
|
|
30
|
+
vector[token_id] = float(weight)
|
|
31
|
+
return vector
|
|
32
|
+
|
|
33
|
+
local_cached_path = _load_query_weights_file(model_id, "idf.json")
|
|
34
|
+
if local_cached_path is not None:
|
|
35
|
+
with open(local_cached_path, encoding="utf-8") as handle:
|
|
36
|
+
idf = json.load(handle)
|
|
37
|
+
for token, weight in idf.items():
|
|
20
38
|
token_id = tokenizer._convert_token_to_id_with_added_voc(token)
|
|
21
39
|
if token_id is not None and token_id >= 0:
|
|
22
40
|
vector[token_id] = float(weight)
|
|
41
|
+
return vector
|
|
42
|
+
|
|
43
|
+
raise FileNotFoundError(f"missing query token weights for {model_id}: expected query_token_weights.txt or idf.json")
|
|
23
44
|
|
|
24
45
|
return vector
|
|
25
46
|
|