@tryformation/querylight-cli 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. package/Dockerfile +7 -0
  2. package/LICENSE +21 -0
  3. package/README.md +391 -0
  4. package/dist/chunk/chunk-store.d.ts +4 -0
  5. package/dist/chunk/chunker.d.ts +9 -0
  6. package/dist/cli/format.d.ts +4 -0
  7. package/dist/cli/main.d.ts +2 -0
  8. package/dist/cli/main.js +3523 -0
  9. package/dist/cli/run-cli.d.ts +5 -0
  10. package/dist/core/config.d.ts +4 -0
  11. package/dist/core/constants.d.ts +3 -0
  12. package/dist/core/errors.d.ts +17 -0
  13. package/dist/core/files.d.ts +1 -0
  14. package/dist/core/hashing.d.ts +1 -0
  15. package/dist/core/ids.d.ts +1 -0
  16. package/dist/core/jsonl.d.ts +2 -0
  17. package/dist/core/runs.d.ts +3 -0
  18. package/dist/core/workspace.d.ts +7 -0
  19. package/dist/index/index-store.d.ts +11 -0
  20. package/dist/index/querylight-indexer.d.ts +14 -0
  21. package/dist/index.d.ts +11 -0
  22. package/dist/index.js +2794 -0
  23. package/dist/ingest/adapters/crawl4ai-adapter.d.ts +1 -0
  24. package/dist/ingest/adapters/directory-adapter.d.ts +2 -0
  25. package/dist/ingest/adapters/file-adapter.d.ts +16 -0
  26. package/dist/ingest/adapters/rss-adapter.d.ts +7 -0
  27. package/dist/ingest/adapters/url-adapter.d.ts +11 -0
  28. package/dist/ingest/adapters/website-adapter.d.ts +2 -0
  29. package/dist/ingest/document-utils.d.ts +24 -0
  30. package/dist/ingest/extractors/docx-extractor.d.ts +1 -0
  31. package/dist/ingest/extractors/html-extractor.d.ts +5 -0
  32. package/dist/ingest/extractors/markdown-extractor.d.ts +1 -0
  33. package/dist/ingest/extractors/pdf-extractor.d.ts +1 -0
  34. package/dist/ingest/extractors/text-extractor.d.ts +1 -0
  35. package/dist/ingest/ingest-service.d.ts +23 -0
  36. package/dist/normalize/boilerplate.d.ts +1 -0
  37. package/dist/normalize/normalize-markdown.d.ts +2 -0
  38. package/dist/query/context-builder.d.ts +8 -0
  39. package/dist/query/related-service.d.ts +6 -0
  40. package/dist/query/search-service.d.ts +31 -0
  41. package/dist/report/diff-service.d.ts +23 -0
  42. package/dist/sources/source-model.d.ts +1 -0
  43. package/dist/sources/source-store.d.ts +7 -0
  44. package/dist/types/models.d.ts +309 -0
  45. package/dist/vector/dense.d.ts +13 -0
  46. package/dist/vector/runtime.d.ts +18 -0
  47. package/dist/vector/service.d.ts +26 -0
  48. package/dist/vector/sparse.d.ts +19 -0
  49. package/dist/vector/store.d.ts +20 -0
  50. package/dist/vector/text.d.ts +3 -0
  51. package/package.json +66 -0
  52. package/scripts/sparse-encode.py +104 -0
package/package.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "name": "@tryformation/querylight-cli",
3
+ "version": "0.1.1",
4
+ "description": "Querylight CLI for building and querying local knowledge bases.",
5
+ "license": "MIT",
6
+ "homepage": "https://github.com/formation-res/querylight-cli#readme",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "git+https://github.com/formation-res/querylight-cli.git"
10
+ },
11
+ "bugs": {
12
+ "url": "https://github.com/formation-res/querylight-cli/issues"
13
+ },
14
+ "type": "module",
15
+ "bin": {
16
+ "qli": "dist/cli/main.js"
17
+ },
18
+ "main": "./dist/index.js",
19
+ "types": "./dist/index.d.ts",
20
+ "files": [
21
+ "dist",
22
+ "scripts",
23
+ "Dockerfile",
24
+ "README.md",
25
+ "LICENSE"
26
+ ],
27
+ "engines": {
28
+ "node": ">=22"
29
+ },
30
+ "publishConfig": {
31
+ "access": "public"
32
+ },
33
+ "scripts": {
34
+ "build": "tsup src/index.ts src/cli/main.ts --format esm --splitting false --clean && tsc -p tsconfig.build.json",
35
+ "test": "vitest run",
36
+ "test:watch": "vitest",
37
+ "lint": "tsc --noEmit",
38
+ "check": "npm run lint && npm test",
39
+ "prepublishOnly": "npm run check && npm run build"
40
+ },
41
+ "dependencies": {
42
+ "@huggingface/transformers": "^3.8.1",
43
+ "@tryformation/querylight-ts": "^0.10.0",
44
+ "cheerio": "^1.2.0",
45
+ "cli-table3": "^0.6.5",
46
+ "commander": "^14.0.3",
47
+ "fast-glob": "^3.3.3",
48
+ "feedparser": "^2.2.10",
49
+ "feedsmith": "^2.9.4",
50
+ "gray-matter": "^4.0.3",
51
+ "mammoth": "^1.12.0",
52
+ "pdf-parse": "^2.4.5",
53
+ "picocolors": "^1.1.1",
54
+ "turndown": "^7.2.4",
55
+ "yaml": "^2.9.0"
56
+ },
57
+ "devDependencies": {
58
+ "@types/feedparser": "^2.2.8",
59
+ "@types/node": "^25.8.0",
60
+ "@types/pdf-parse": "^1.1.5",
61
+ "@types/turndown": "^5.0.6",
62
+ "tsup": "^8.5.1",
63
+ "typescript": "^6.0.3",
64
+ "vitest": "^4.1.6"
65
+ }
66
+ }
package/scripts/sparse-encode.py ADDED
@@ -0,0 +1,104 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ import torch
6
+ from huggingface_hub import hf_hub_download
7
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
8
+
9
+
10
def build_query_token_weight_vector(tokenizer, model_id: str):
    """Build a dense list of per-token query weights for ``model_id``.

    Fetches ``query_token_weights.txt`` from the model repository through
    ``hf_hub_download`` and reads it as UTF-8 text, one tab-separated
    ``token`` / ``weight`` pair per line. Empty lines are skipped.

    Args:
        tokenizer: Tokenizer providing ``vocab_size`` and
            ``_convert_token_to_id_with_added_voc``.
        model_id: Repository id handed to ``hf_hub_download``.

    Returns:
        A list of ``tokenizer.vocab_size`` floats. Position ``i`` holds the
        weight read for token id ``i``; positions without an entry stay 0.0.

    Raises:
        ValueError: If a non-empty line has no tab separator, or the weight
            of an accepted token cannot be parsed as a float.
    """
    weights_path = hf_hub_download(repo_id=model_id, filename="query_token_weights.txt")
    vector = [0.0] * tokenizer.vocab_size
    size = len(vector)

    with open(weights_path, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            if not line:
                continue
            token, weight = line.split("\t", 1)
            token_id = tokenizer._convert_token_to_id_with_added_voc(token)
            # The lookup includes added vocabulary, whose ids may lie at or
            # beyond ``vocab_size`` (NOTE(review): confirm for the tokenizers
            # in use). Such ids have no slot in ``vector``, so skip them
            # instead of failing with IndexError.
            if token_id is not None and 0 <= token_id < size:
                vector[token_id] = float(weight)

    return vector
25
+
26
+
27
def encode_document(model, tokenizer, text: str, top_tokens: int, special_token_ids: list[int]):
    """Encode one text into a sparse ``{token_id: weight}`` mapping.

    The text is tokenized as a single-item batch, the model's ``logits`` are
    multiplied by the attention mask, and the maximum over dimension 1 is
    taken. The result is passed through ``log1p(relu(x))`` and the columns
    listed in ``special_token_ids`` are zeroed. Only strictly positive
    entries of the first row are returned.

    Args:
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` tensor
            (assumed to be (batch, sequence, vocabulary) — TODO confirm).
        tokenizer: Callable returning a mapping that contains at least
            ``"attention_mask"``.
        text: Document text to encode.
        top_tokens: When greater than zero, keep at most this many entries,
            chosen by largest weight. Zero or negative keeps all of them.
        special_token_ids: Column indices whose weights are forced to zero.

    Returns:
        A dict mapping the stringified token id to its positive float weight;
        empty when no weight is positive.
    """
    # Inference only: disabling autograd avoids building a graph for every
    # document and does not change the computed values.
    with torch.no_grad():
        features = tokenizer(
            [text],
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
        )
        logits = model(**features).logits
        masked = logits * features["attention_mask"].unsqueeze(-1)
        values, _ = torch.max(masked, dim=1)
        values = torch.log1p(torch.relu(values))
        values[:, special_token_ids] = 0

    row = values[0].detach()
    token_ids = torch.nonzero(row > 0, as_tuple=True)[0]
    if len(token_ids) == 0:
        return {}

    if top_tokens > 0 and len(token_ids) > top_tokens:
        # topk indexes into the candidate subset, so map back to token ids.
        top_indices = torch.topk(row[token_ids], k=top_tokens).indices
        token_ids = token_ids[top_indices]

    # Every selected entry is already > 0, so no second filter is required.
    weights = row[token_ids].tolist()
    return {str(token_id): weight for token_id, weight in zip(token_ids.tolist(), weights)}
50
+
51
+
52
def load_runtime(model_id: str):
    """Load the tokenizer and masked-LM model and collect special-token ids.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForMaskedLM.from_pretrained``.

    Returns:
        A ``(tokenizer, model, special_token_ids)`` tuple. The id list holds
        the vocabulary id of every entry of ``tokenizer.special_tokens_map``
        that is present in the tokenizer vocabulary; map values that are
        lists contribute each of their items.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForMaskedLM.from_pretrained(model_id)

    # Read the vocabulary once. Some tokenizer implementations appear to
    # rebuild the mapping on every ``vocab`` access (NOTE(review): confirm),
    # so looking it up per token would repeat that work.
    vocab = tokenizer.vocab

    special_token_ids = []
    for value in tokenizer.special_tokens_map.values():
        tokens = value if isinstance(value, list) else [value]
        special_token_ids.extend(vocab[token] for token in tokens if token in vocab)

    return tokenizer, model, special_token_ids
62
+
63
+
64
def download_only(model_id: str):
    """Load the model assets for ``model_id`` and report a summary as JSON.

    Loads the runtime and the query token weights, then writes one JSON
    object to standard output with the keys ``ok``, ``vocabularySize`` and
    ``queryTokenWeightsLength``.

    Args:
        model_id: Model identifier forwarded to the loading helpers.
    """
    tokenizer = load_runtime(model_id)[0]
    weights = build_query_token_weight_vector(tokenizer, model_id)
    summary = {
        "ok": True,
        "vocabularySize": tokenizer.vocab_size,
        "queryTokenWeightsLength": len(weights),
    }
    json.dump(summary, sys.stdout)
72
+
73
+
74
def encode_documents(model_id: str, top_tokens: int, documents):
    """Encode every document and write the result to standard output as JSON.

    Args:
        model_id: Model identifier forwarded to the loading helpers.
        top_tokens: Per-document entry limit passed to ``encode_document``.
        documents: Iterable of mappings that each provide ``"chunkId"`` and
            ``"text"``.

    The emitted JSON object has the keys ``query_token_weights``,
    ``documents`` (one ``{"chunkId", "vector"}`` entry per input document,
    in input order) and ``vocabularySize``.
    """
    tokenizer, model, special_token_ids = load_runtime(model_id)

    encoded = [
        {
            "chunkId": doc["chunkId"],
            "vector": encode_document(model, tokenizer, doc["text"], top_tokens, special_token_ids),
        }
        for doc in documents
    ]

    # The query weights are built only after all documents are encoded.
    result = {
        "query_token_weights": build_query_token_weight_vector(tokenizer, model_id),
        "documents": encoded,
        "vocabularySize": tokenizer.vocab_size,
    }
    json.dump(result, sys.stdout)
88
+
89
+
90
def main():
    """Read a JSON request from standard input and run the requested action.

    The request must contain ``"action"`` and ``"model_id"``. The action
    ``"download_only"`` calls ``download_only``; ``"encode_documents"``
    additionally reads ``"top_tokens"`` (converted with ``int``) and
    ``"documents"`` and calls ``encode_documents``.

    Raises:
        SystemExit: With the message ``unsupported action: <action>`` for any
            other action value.
    """
    request = json.load(sys.stdin)
    action = request["action"]
    model_id = request["model_id"]

    if action == "download_only":
        download_only(model_id)
    elif action == "encode_documents":
        encode_documents(model_id, int(request["top_tokens"]), request["documents"])
    else:
        raise SystemExit(f"unsupported action: {action}")


if __name__ == "__main__":
    main()