@tryformation/querylight-cli 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Dockerfile +7 -0
- package/LICENSE +21 -0
- package/README.md +391 -0
- package/dist/chunk/chunk-store.d.ts +4 -0
- package/dist/chunk/chunker.d.ts +9 -0
- package/dist/cli/format.d.ts +4 -0
- package/dist/cli/main.d.ts +2 -0
- package/dist/cli/main.js +3523 -0
- package/dist/cli/run-cli.d.ts +5 -0
- package/dist/core/config.d.ts +4 -0
- package/dist/core/constants.d.ts +3 -0
- package/dist/core/errors.d.ts +17 -0
- package/dist/core/files.d.ts +1 -0
- package/dist/core/hashing.d.ts +1 -0
- package/dist/core/ids.d.ts +1 -0
- package/dist/core/jsonl.d.ts +2 -0
- package/dist/core/runs.d.ts +3 -0
- package/dist/core/workspace.d.ts +7 -0
- package/dist/index/index-store.d.ts +11 -0
- package/dist/index/querylight-indexer.d.ts +14 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.js +2794 -0
- package/dist/ingest/adapters/crawl4ai-adapter.d.ts +1 -0
- package/dist/ingest/adapters/directory-adapter.d.ts +2 -0
- package/dist/ingest/adapters/file-adapter.d.ts +16 -0
- package/dist/ingest/adapters/rss-adapter.d.ts +7 -0
- package/dist/ingest/adapters/url-adapter.d.ts +11 -0
- package/dist/ingest/adapters/website-adapter.d.ts +2 -0
- package/dist/ingest/document-utils.d.ts +24 -0
- package/dist/ingest/extractors/docx-extractor.d.ts +1 -0
- package/dist/ingest/extractors/html-extractor.d.ts +5 -0
- package/dist/ingest/extractors/markdown-extractor.d.ts +1 -0
- package/dist/ingest/extractors/pdf-extractor.d.ts +1 -0
- package/dist/ingest/extractors/text-extractor.d.ts +1 -0
- package/dist/ingest/ingest-service.d.ts +23 -0
- package/dist/normalize/boilerplate.d.ts +1 -0
- package/dist/normalize/normalize-markdown.d.ts +2 -0
- package/dist/query/context-builder.d.ts +8 -0
- package/dist/query/related-service.d.ts +6 -0
- package/dist/query/search-service.d.ts +31 -0
- package/dist/report/diff-service.d.ts +23 -0
- package/dist/sources/source-model.d.ts +1 -0
- package/dist/sources/source-store.d.ts +7 -0
- package/dist/types/models.d.ts +309 -0
- package/dist/vector/dense.d.ts +13 -0
- package/dist/vector/runtime.d.ts +18 -0
- package/dist/vector/service.d.ts +26 -0
- package/dist/vector/sparse.d.ts +19 -0
- package/dist/vector/store.d.ts +20 -0
- package/dist/vector/text.d.ts +3 -0
- package/package.json +66 -0
- package/scripts/sparse-encode.py +104 -0
package/package.json
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@tryformation/querylight-cli",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Querylight CLI for building and querying local knowledge bases.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"homepage": "https://github.com/formation-res/querylight-cli#readme",
|
|
7
|
+
"repository": {
|
|
8
|
+
"type": "git",
|
|
9
|
+
"url": "git+https://github.com/formation-res/querylight-cli.git"
|
|
10
|
+
},
|
|
11
|
+
"bugs": {
|
|
12
|
+
"url": "https://github.com/formation-res/querylight-cli/issues"
|
|
13
|
+
},
|
|
14
|
+
"type": "module",
|
|
15
|
+
"bin": {
|
|
16
|
+
"qli": "dist/cli/main.js"
|
|
17
|
+
},
|
|
18
|
+
"main": "./dist/index.js",
|
|
19
|
+
"types": "./dist/index.d.ts",
|
|
20
|
+
"files": [
|
|
21
|
+
"dist",
|
|
22
|
+
"scripts",
|
|
23
|
+
"Dockerfile",
|
|
24
|
+
"README.md",
|
|
25
|
+
"LICENSE"
|
|
26
|
+
],
|
|
27
|
+
"engines": {
|
|
28
|
+
"node": ">=22"
|
|
29
|
+
},
|
|
30
|
+
"publishConfig": {
|
|
31
|
+
"access": "public"
|
|
32
|
+
},
|
|
33
|
+
"scripts": {
|
|
34
|
+
"build": "tsup src/index.ts src/cli/main.ts --format esm --splitting false --clean && tsc -p tsconfig.build.json",
|
|
35
|
+
"test": "vitest run",
|
|
36
|
+
"test:watch": "vitest",
|
|
37
|
+
"lint": "tsc --noEmit",
|
|
38
|
+
"check": "npm run lint && npm test",
|
|
39
|
+
"prepublishOnly": "npm run check && npm run build"
|
|
40
|
+
},
|
|
41
|
+
"dependencies": {
|
|
42
|
+
"@huggingface/transformers": "^3.8.1",
|
|
43
|
+
"@tryformation/querylight-ts": "^0.10.0",
|
|
44
|
+
"cheerio": "^1.2.0",
|
|
45
|
+
"cli-table3": "^0.6.5",
|
|
46
|
+
"commander": "^14.0.3",
|
|
47
|
+
"fast-glob": "^3.3.3",
|
|
48
|
+
"feedparser": "^2.2.10",
|
|
49
|
+
"feedsmith": "^2.9.4",
|
|
50
|
+
"gray-matter": "^4.0.3",
|
|
51
|
+
"mammoth": "^1.12.0",
|
|
52
|
+
"pdf-parse": "^2.4.5",
|
|
53
|
+
"picocolors": "^1.1.1",
|
|
54
|
+
"turndown": "^7.2.4",
|
|
55
|
+
"yaml": "^2.9.0"
|
|
56
|
+
},
|
|
57
|
+
"devDependencies": {
|
|
58
|
+
"@types/feedparser": "^2.2.8",
|
|
59
|
+
"@types/node": "^25.8.0",
|
|
60
|
+
"@types/pdf-parse": "^1.1.5",
|
|
61
|
+
"@types/turndown": "^5.0.6",
|
|
62
|
+
"tsup": "^8.5.1",
|
|
63
|
+
"typescript": "^6.0.3",
|
|
64
|
+
"vitest": "^4.1.6"
|
|
65
|
+
}
|
|
66
|
+
}
|
|
package/scripts/sparse-encode.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import torch
|
|
6
|
+
from huggingface_hub import hf_hub_download
|
|
7
|
+
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def build_query_token_weight_vector(tokenizer, model_id: str):
    """Build a dense list of per-token query weights for ``model_id``.

    Fetches ``query_token_weights.txt`` from the ``model_id`` repository with
    ``hf_hub_download`` and reads it as UTF-8 text, one ``token<TAB>weight``
    entry per line. Empty lines are skipped.

    Args:
        tokenizer: Tokenizer exposing ``vocab_size`` and
            ``_convert_token_to_id_with_added_voc``.
        model_id: Repository id that hosts the weights file.

    Returns:
        A list of ``tokenizer.vocab_size`` floats indexed by token id. Ids
        that have no usable entry in the file keep the value ``0.0``.

    Raises:
        ValueError: If a non-empty line has no tab separator, or the weight
            of a token that resolves to a valid id is not a valid float.
    """
    local_cached_path = hf_hub_download(repo_id=model_id, filename="query_token_weights.txt")
    vector = [0.0] * tokenizer.vocab_size
    size = len(vector)

    with open(local_cached_path, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if not line:
                continue
            token, weight = line.split("\t", 1)
            # NOTE(review): this is a private tokenizer method; confirm it is
            # available on the tokenizer class used for the target model.
            token_id = tokenizer._convert_token_to_id_with_added_voc(token)
            # The lookup includes added vocabulary, so it can presumably
            # return ids at or beyond vocab_size. Those have no slot in the
            # vector; skip them instead of failing with IndexError.
            if token_id is not None and 0 <= token_id < size:
                vector[token_id] = float(weight)

    return vector
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def encode_document(model, tokenizer, text: str, top_tokens: int, special_token_ids: list[int]):
    """Encode ``text`` into a sparse ``{token_id: weight}`` mapping.

    The text is tokenized as a single-item batch and passed through ``model``.
    The logits are multiplied by the attention mask, reduced with a max over
    ``dim=1``, and transformed with ``log1p(relu(x))``. Entries listed in
    ``special_token_ids`` are zeroed and only strictly positive weights are
    kept.

    Args:
        model: Callable returning an object with a ``logits`` tensor.
        tokenizer: Callable returning a mapping that contains
            ``"attention_mask"`` and can be splatted into ``model``.
        text: Document text to encode.
        top_tokens: Maximum number of entries to keep (largest weights win).
            A value ``<= 0`` disables the cap.
        special_token_ids: Vocabulary ids whose weight is forced to zero.

    Returns:
        A dict mapping the token id (as a string) to its float weight. The
        dict is empty when no weight is positive.
    """
    features = tokenizer([text], padding=True, truncation=True, return_tensors="pt", return_token_type_ids=False)

    # Inference only: disabling autograd avoids recording a graph for the
    # forward pass. The computed values are unchanged.
    with torch.no_grad():
        logits = model(**features).logits
        # assumes logits are (batch, sequence, vocab) — TODO confirm.
        # Masked positions are multiplied by zero before the max over dim=1.
        pooled, _ = torch.max(logits * features["attention_mask"].unsqueeze(-1), dim=1)
        pooled = torch.log1p(torch.relu(pooled))
        pooled[:, special_token_ids] = 0

    # Only one text was submitted, so only the first row is relevant.
    row = pooled[0].detach()
    token_ids = torch.nonzero(row > 0, as_tuple=True)[0]
    if token_ids.numel() == 0:
        return {}

    weights = row[token_ids]
    if top_tokens > 0 and token_ids.numel() > top_tokens:
        top = torch.topk(weights, k=top_tokens)
        token_ids = token_ids[top.indices]
        weights = top.values

    # Every remaining weight is already > 0, so no further filtering is needed.
    return {
        str(token_id): weight
        for token_id, weight in zip(token_ids.tolist(), weights.tolist())
    }
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def load_runtime(model_id: str):
    """Load the tokenizer and masked-LM model for ``model_id``.

    Also collects the vocabulary ids of the tokenizer's special tokens, using
    every entry of ``tokenizer.special_tokens_map`` (entries may be a single
    token or a list of tokens). Tokens missing from the vocabulary are
    ignored.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model, special_token_ids)`` tuple, where
        ``special_token_ids`` is a list of vocabulary ids.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForMaskedLM.from_pretrained(model_id)

    # Read the vocabulary once instead of twice per special token.
    # NOTE(review): on fast tokenizers ``vocab`` appears to be a property that
    # rebuilds the mapping on each access — confirm against transformers docs.
    vocab = tokenizer.vocab
    special_token_ids = []
    for value in tokenizer.special_tokens_map.values():
        tokens = value if isinstance(value, list) else [value]
        special_token_ids.extend(vocab[token] for token in tokens if token in vocab)

    return tokenizer, model, special_token_ids
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def download_only(model_id: str):
    """Load the runtime and query weights for ``model_id`` and report status.

    The model and special token ids returned by ``load_runtime`` are
    discarded; only the tokenizer is used. A JSON object with ``ok``,
    ``vocabularySize`` and ``queryTokenWeightsLength`` is written to stdout.

    Args:
        model_id: Identifier forwarded to ``load_runtime`` and
            ``build_query_token_weight_vector``.
    """
    tokenizer = load_runtime(model_id)[0]
    weights = build_query_token_weight_vector(tokenizer, model_id)
    summary = {
        "ok": True,
        "vocabularySize": tokenizer.vocab_size,
        "queryTokenWeightsLength": len(weights),
    }
    json.dump(summary, sys.stdout)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def encode_documents(model_id: str, top_tokens: int, documents):
    """Encode every document and write the combined result to stdout as JSON.

    Each input document must provide ``"chunkId"`` and ``"text"``. The output
    object contains ``query_token_weights`` (from
    ``build_query_token_weight_vector``), ``documents`` (one
    ``{"chunkId", "vector"}`` entry per input, in input order) and
    ``vocabularySize``.

    Args:
        model_id: Identifier forwarded to ``load_runtime`` and
            ``build_query_token_weight_vector``.
        top_tokens: Per-document cap forwarded to ``encode_document``.
        documents: Iterable of mappings with ``"chunkId"`` and ``"text"``.
    """
    tokenizer, model, special_ids = load_runtime(model_id)

    # Documents are encoded before the query weights are built.
    encoded = [
        {
            "chunkId": doc["chunkId"],
            "vector": encode_document(model, tokenizer, doc["text"], top_tokens, special_ids),
        }
        for doc in documents
    ]

    result = {
        "query_token_weights": build_query_token_weight_vector(tokenizer, model_id),
        "documents": encoded,
        "vocabularySize": tokenizer.vocab_size,
    }
    json.dump(result, sys.stdout)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def main():
    """Read a JSON request from stdin and run the requested action.

    The request must contain ``"action"`` and ``"model_id"``. Supported
    actions are ``"download_only"`` and ``"encode_documents"``; the latter
    also reads ``"top_tokens"`` (converted with ``int``) and ``"documents"``.

    Raises:
        SystemExit: With an ``unsupported action`` message for any other
            action value.
    """
    request = json.load(sys.stdin)
    action, model_id = request["action"], request["model_id"]

    if action == "download_only":
        download_only(model_id)
    elif action == "encode_documents":
        encode_documents(model_id, int(request["top_tokens"]), request["documents"])
    else:
        raise SystemExit(f"unsupported action: {action}")
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
if __name__ == "__main__":
|
|
104
|
+
main()
|