@tryformation/querylight-cli 0.2.2 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,23 @@
1
- import type { RetrievalMode, SearchResponseData } from "../types/models.js";
1
+ import { type DocumentIndex, type JsonDslRequest, type JsonDslResponse } from "@tryformation/querylight-ts";
2
+ import type { RetrievalMode, SearchResponseData, SearchResult } from "../types/models.js";
3
+ export declare function loadHydratedIndex(workspacePath: string): Promise<DocumentIndex>;
2
4
  type SearchDateField = "publicationDate" | "firstSeenAt" | "lastSeenAt" | "lastChangedAt" | "crawledAt";
3
5
  type SearchDateRange = {
4
6
  field: SearchDateField;
5
7
  from?: string;
6
8
  to?: string;
7
9
  };
10
+ export declare function searchResultsFromResponse(response: SearchResponseData, showChunks?: boolean): SearchResult[];
11
+ export declare function searchJsonRequest({ index, request, indexName }: {
12
+ index: DocumentIndex;
13
+ request: JsonDslRequest;
14
+ indexName?: string;
15
+ }): Promise<JsonDslResponse>;
16
+ export declare function searchJsonIndex({ workspacePath, request, indexName }: {
17
+ workspacePath: string;
18
+ request: JsonDslRequest;
19
+ indexName?: string;
20
+ }): Promise<JsonDslResponse>;
8
21
  export declare function searchIndex({ workspacePath, query, topK, sourceId, sourceIds, sourceName, sourceNames, sourceType, sourceTypes, uriPrefix, uriPrefixes, hasPublicationDate, tag, tags, metadata, dateRanges, retrievalMode, showChunks }: {
9
22
  workspacePath: string;
10
23
  query: string;
@@ -0,0 +1,15 @@
1
+ export type SearchApiServerInfo = {
2
+ mode: "single" | "multi";
3
+ url: string;
4
+ knowledgeBases: Array<{
5
+ name: string;
6
+ workspacePath: string;
7
+ route: string;
8
+ }>;
9
+ close: () => Promise<void>;
10
+ };
11
+ export declare function startSearchApiServer({ workspacePath, host, port }: {
12
+ workspacePath: string;
13
+ host?: string;
14
+ port?: number;
15
+ }): Promise<SearchApiServerInfo>;
@@ -222,9 +222,44 @@ export type SearchResult = {
222
222
  lastChangedAt: string;
223
223
  metadata: Record<string, unknown>;
224
224
  };
225
+ export type SearchHitSource = {
226
+ chunkId: string;
227
+ documentId: string;
228
+ sourceId: string;
229
+ sourceType: SourceType;
230
+ sourceName?: string;
231
+ title: string;
232
+ uri: string;
233
+ headingPath: string[];
234
+ text: string;
235
+ snippet?: string;
236
+ normalizedPath?: string;
237
+ publicationDate?: string | null;
238
+ crawledAt?: string;
239
+ firstSeenAt: string;
240
+ lastSeenAt: string;
241
+ lastChangedAt: string;
242
+ metadata: Record<string, unknown>;
243
+ };
244
+ export type SearchHit = {
245
+ _index: string;
246
+ _id: string;
247
+ _score: number;
248
+ _source: SearchHitSource;
249
+ highlight?: Record<string, string[]>;
250
+ };
225
251
  export type SearchResponseData = {
226
252
  retrievalMode?: RetrievalMode;
227
- results: SearchResult[];
253
+ took: number;
254
+ hits: {
255
+ total: {
256
+ value: number;
257
+ relation: "eq";
258
+ };
259
+ max_score: number | null;
260
+ hits: SearchHit[];
261
+ };
262
+ aggregations?: Record<string, unknown>;
228
263
  };
229
264
  export type RelatedDocumentResult = {
230
265
  documentId: string;
@@ -1,6 +1,10 @@
1
1
  import { type ProgressHandler } from "../core/progress.js";
2
2
  import type { DenseVectorPayload, WorkspaceConfig } from "../types/models.js";
3
- export declare function setDenseEmbedderFactoryForTests(factory: ((cacheDir: string, modelId: string) => Promise<(text: string) => Promise<number[]>>) | null): void;
3
+ type DenseEmbedder = {
4
+ embed(text: string): Promise<number[]>;
5
+ dispose?: () => Promise<void>;
6
+ };
7
+ export declare function setDenseEmbedderFactoryForTests(factory: ((cacheDir: string, modelId: string) => Promise<DenseEmbedder | ((text: string) => Promise<number[]>)>) | null): void;
4
8
  export declare function pullDenseModel(workspacePath: string, config: WorkspaceConfig["retrieval"]["dense"]): Promise<void>;
5
9
  export declare function buildDenseVectors({ workspacePath, config, progress }: {
6
10
  workspacePath: string;
@@ -13,3 +17,4 @@ export declare function denseQuery({ workspacePath, config, query, topK }: {
13
17
  query: string;
14
18
  topK: number;
15
19
  }): Promise<Array<[string, number]>>;
20
+ export {};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tryformation/querylight-cli",
3
- "version": "0.2.2",
3
+ "version": "0.2.4",
4
4
  "description": "Querylight CLI for building and querying local knowledge bases.",
5
5
  "license": "MIT",
6
6
  "homepage": "https://github.com/formation-res/querylight-cli#readme",
@@ -40,7 +40,7 @@
40
40
  },
41
41
  "dependencies": {
42
42
  "@huggingface/transformers": "^3.8.1",
43
- "@tryformation/querylight-ts": "^0.10.0",
43
+ "@tryformation/querylight-ts": "^0.11.0",
44
44
  "cheerio": "^1.2.0",
45
45
  "cli-table3": "^0.6.5",
46
46
  "commander": "^14.0.3",
@@ -7,19 +7,40 @@ from huggingface_hub import hf_hub_download
7
7
  from transformers import AutoModelForMaskedLM, AutoTokenizer
8
8
 
9
9
 
10
+ def _load_query_weights_file(model_id: str, filename: str):
11
+ try:
12
+ return hf_hub_download(repo_id=model_id, filename=filename)
13
+ except Exception:
14
+ return None
15
+
16
+
10
17
  def build_query_token_weight_vector(tokenizer, model_id: str):
11
- local_cached_path = hf_hub_download(repo_id=model_id, filename="query_token_weights.txt")
12
18
  vector = [0.0] * tokenizer.vocab_size
13
-
14
- with open(local_cached_path, encoding="utf-8") as handle:
15
- for line in handle:
16
- line = line.rstrip("\n")
17
- if not line:
18
- continue
19
- token, weight = line.split("\t", 1)
19
+ local_cached_path = _load_query_weights_file(model_id, "query_token_weights.txt")
20
+
21
+ if local_cached_path is not None:
22
+ with open(local_cached_path, encoding="utf-8") as handle:
23
+ for line in handle:
24
+ line = line.rstrip("\n")
25
+ if not line:
26
+ continue
27
+ token, weight = line.split("\t", 1)
28
+ token_id = tokenizer._convert_token_to_id_with_added_voc(token)
29
+ if token_id is not None and token_id >= 0:
30
+ vector[token_id] = float(weight)
31
+ return vector
32
+
33
+ local_cached_path = _load_query_weights_file(model_id, "idf.json")
34
+ if local_cached_path is not None:
35
+ with open(local_cached_path, encoding="utf-8") as handle:
36
+ idf = json.load(handle)
37
+ for token, weight in idf.items():
20
38
  token_id = tokenizer._convert_token_to_id_with_added_voc(token)
21
39
  if token_id is not None and token_id >= 0:
22
40
  vector[token_id] = float(weight)
41
+ return vector
42
+
43
+ raise FileNotFoundError(f"missing query token weights for {model_id}: expected query_token_weights.txt or idf.json")
23
44
 
24
45
  return vector
25
46