@tryformation/querylight-cli 0.2.3 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -0
- package/dist/cli/main.js +397 -90
- package/dist/core/constants.d.ts +2 -2
- package/dist/index.d.ts +1 -0
- package/dist/index.js +310 -76
- package/dist/query/search-service.d.ts +7 -1
- package/dist/server/search-api.d.ts +15 -0
- package/dist/types/models.d.ts +3 -0
- package/dist/vector/dense.d.ts +6 -1
- package/package.json +3 -2
- package/scripts/assert-release-version.mjs +48 -0
- package/scripts/sparse-encode.py +29 -8
package/scripts/sparse-encode.py
CHANGED
|
@@ -7,19 +7,40 @@ from huggingface_hub import hf_hub_download
|
|
|
7
7
|
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|
8
8
|
|
|
9
9
|
|
|
10
|
+
def _load_query_weights_file(model_id: str, filename: str):
|
|
11
|
+
try:
|
|
12
|
+
return hf_hub_download(repo_id=model_id, filename=filename)
|
|
13
|
+
except Exception:
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
|
|
10
17
|
def build_query_token_weight_vector(tokenizer, model_id: str):
    """Build a per-token query weight vector for ``model_id``.

    Two weight files are tried in order, and the first one that can be
    fetched is the only one used:

    1. ``query_token_weights.txt`` - one ``token<TAB>weight`` pair per line;
       empty lines are skipped.
    2. ``idf.json`` - a JSON object mapping token to weight.

    Args:
        tokenizer: Object exposing ``vocab_size`` and
            ``_convert_token_to_id_with_added_voc(token)``.
        model_id: Repository id forwarded to ``_load_query_weights_file``.

    Returns:
        A list of ``tokenizer.vocab_size`` floats. Positions for which no
        weight was found stay at ``0.0``.

    Raises:
        FileNotFoundError: If neither weights file could be obtained.
        ValueError: If a non-empty line of the text file contains no tab, or
            a weight for a known token cannot be converted to ``float``.
    """
    vector = [0.0] * tokenizer.vocab_size

    def assign(token, weight):
        # Shared by both file formats: resolve the token and store its weight.
        token_id = tokenizer._convert_token_to_id_with_added_voc(token)
        # A negative id would silently wrap around to the end of the list, and
        # an id at or past len(vector) would raise IndexError. The latter can
        # presumably happen for added tokens, whose ids may lie beyond the
        # base vocab_size. Skip both cases.
        if token_id is not None and 0 <= token_id < len(vector):
            vector[token_id] = float(weight)

    local_cached_path = _load_query_weights_file(model_id, "query_token_weights.txt")
    if local_cached_path is not None:
        with open(local_cached_path, encoding="utf-8") as handle:
            for line in handle:
                line = line.rstrip("\n")
                if not line:
                    continue
                # Split on the first tab only, so the weight field keeps any
                # further tabs and float() rejects it loudly.
                token, weight = line.split("\t", 1)
                assign(token, weight)
        return vector

    local_cached_path = _load_query_weights_file(model_id, "idf.json")
    if local_cached_path is not None:
        with open(local_cached_path, encoding="utf-8") as handle:
            idf = json.load(handle)
        for token, weight in idf.items():
            assign(token, weight)
        return vector

    raise FileNotFoundError(f"missing query token weights for {model_id}: expected query_token_weights.txt or idf.json")
|
|
25
46
|
|