grepmax 0.7.44 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/commands/verify.js
CHANGED
|
@@ -52,7 +52,7 @@ const MODEL_PATH = path.join(MODEL_DIR, "model.onnx");
|
|
|
52
52
|
const SKIPLIST_PATH = path.join(MODEL_DIR, "skiplist.json");
|
|
53
53
|
function main() {
|
|
54
54
|
return __awaiter(this, void 0, void 0, function* () {
|
|
55
|
-
var _a, _b;
|
|
55
|
+
var _a, _b, _c;
|
|
56
56
|
console.log("🔍 Starting ColBERT Integrity Check...\n");
|
|
57
57
|
// --- CHECK 1: FILES EXIST ---
|
|
58
58
|
if (!fs.existsSync(MODEL_PATH))
|
|
@@ -68,19 +68,19 @@ function main() {
|
|
|
68
68
|
// Note: We use the ID we know works from your export: 50368
|
|
69
69
|
// But let's see if the tokenizer resolves "[Q] " correctly.
|
|
70
70
|
const encoded = yield tokenizer(queryText, { add_special_tokens: false });
|
|
71
|
-
const inputIds = encoded.input_ids
|
|
71
|
+
const inputIds = (_a = encoded.input_ids.data) !== null && _a !== void 0 ? _a : encoded.input_ids;
|
|
72
72
|
// Convert to standard array for inspection
|
|
73
73
|
const ids = Array.from(inputIds).map(Number);
|
|
74
74
|
// Mixedbread expects: [CLS] [Q] ...tokens... [SEP]
|
|
75
75
|
// Let's verify we can construct that.
|
|
76
76
|
const Q_ID = 50368;
|
|
77
|
-
const CLS_ID = (
|
|
77
|
+
const CLS_ID = (_b = tokenizer.convert_tokens_to_ids("[CLS]")) !== null && _b !== void 0 ? _b : 50281; // Fallback to standard if null
|
|
78
78
|
console.log(`\n--- Tokenizer Check ---`);
|
|
79
79
|
console.log(`Query: "${queryText}"`);
|
|
80
80
|
console.log(`Raw IDs:`, ids);
|
|
81
81
|
// Check if tokenizer recognizes the special tokens by text
|
|
82
|
-
const qCheck = tokenizer.
|
|
83
|
-
const dCheck = tokenizer.
|
|
82
|
+
const qCheck = tokenizer.convert_tokens_to_ids("[Q] ");
|
|
83
|
+
const dCheck = tokenizer.convert_tokens_to_ids("[D] ");
|
|
84
84
|
if (qCheck === 50368 && dCheck === 50369) {
|
|
85
85
|
console.log(`✅ Tokenizer Map Correct: [Q] -> ${qCheck}, [D] -> ${dCheck}`);
|
|
86
86
|
}
|
|
@@ -93,8 +93,8 @@ function main() {
|
|
|
93
93
|
console.log(`\n--- Skiplist Check ---`);
|
|
94
94
|
console.log(`Skiplist size: ${skiplist.size}`);
|
|
95
95
|
// Check common punctuation
|
|
96
|
-
const commaId = tokenizer.
|
|
97
|
-
const dotId = tokenizer.
|
|
96
|
+
const commaId = tokenizer.convert_tokens_to_ids(",");
|
|
97
|
+
const dotId = tokenizer.convert_tokens_to_ids(".");
|
|
98
98
|
if (skiplist.has(commaId) && skiplist.has(dotId)) {
|
|
99
99
|
console.log(`✅ Skiplist contains punctuation ('.'=${dotId}, ','=${commaId})`);
|
|
100
100
|
}
|
|
@@ -110,7 +110,7 @@ function main() {
|
|
|
110
110
|
BigInt(CLS_ID),
|
|
111
111
|
BigInt(Q_ID),
|
|
112
112
|
BigInt(1234),
|
|
113
|
-
BigInt((
|
|
113
|
+
BigInt((_c = tokenizer.sep_token_id) !== null && _c !== void 0 ? _c : 50282),
|
|
114
114
|
];
|
|
115
115
|
const tensorIds = new ort.Tensor("int64", new BigInt64Array(batchIds), [1, 4]);
|
|
116
116
|
const tensorMask = new ort.Tensor("int64", new BigInt64Array([BigInt(1), BigInt(1), BigInt(1), BigInt(1)]), [1, 4]);
|
|
@@ -23,20 +23,22 @@ class ColBERTTokenizer {
|
|
|
23
23
|
}
|
|
24
24
|
init(modelPath) {
|
|
25
25
|
return __awaiter(this, void 0, void 0, function* () {
|
|
26
|
-
var _a, _b, _c, _d, _e, _f
|
|
26
|
+
var _a, _b, _c, _d, _e, _f;
|
|
27
27
|
this.tokenizer = yield transformers_1.AutoTokenizer.from_pretrained(modelPath);
|
|
28
28
|
// Get special token IDs with fallbacks
|
|
29
29
|
// We use the IDs we discovered in validation: [Q]=50368, [D]=50369
|
|
30
30
|
// But we still try to look them up dynamically first.
|
|
31
31
|
const tokenizer = this.tokenizer;
|
|
32
|
-
const get = (token) =>
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
const
|
|
37
|
-
const
|
|
38
|
-
const
|
|
39
|
-
const
|
|
32
|
+
const get = (token) => {
|
|
33
|
+
const id = tokenizer === null || tokenizer === void 0 ? void 0 : tokenizer.convert_tokens_to_ids(token);
|
|
34
|
+
return typeof id === "number" && id >= 0 ? id : undefined;
|
|
35
|
+
};
|
|
36
|
+
const clsId = (_a = get("[CLS]")) !== null && _a !== void 0 ? _a : 50281;
|
|
37
|
+
const sepId = (_b = get("[SEP]")) !== null && _b !== void 0 ? _b : 50282;
|
|
38
|
+
const padId = (_c = get("[PAD]")) !== null && _c !== void 0 ? _c : 50283;
|
|
39
|
+
const maskId = (_d = get(MASK_TOKEN)) !== null && _d !== void 0 ? _d : 50284;
|
|
40
|
+
const queryMarkerId = (_e = get(QUERY_MARKER_TOKEN)) !== null && _e !== void 0 ? _e : 50368;
|
|
41
|
+
const docMarkerId = (_f = get(DOC_MARKER_TOKEN)) !== null && _f !== void 0 ? _f : 50369;
|
|
40
42
|
this.specialTokenIds = {
|
|
41
43
|
cls: clsId,
|
|
42
44
|
sep: sepId,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "grepmax",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.8.0",
|
|
4
4
|
"author": "Robert Owens <robowens@me.com>",
|
|
5
5
|
"homepage": "https://github.com/reowens/grepmax",
|
|
6
6
|
"bugs": {
|
|
@@ -33,9 +33,9 @@
|
|
|
33
33
|
"description": "Semantic code search for coding agents. Local embeddings, LLM summaries, call graph tracing.",
|
|
34
34
|
"dependencies": {
|
|
35
35
|
"@clack/prompts": "^1.1.0",
|
|
36
|
-
"@huggingface/transformers": "^
|
|
37
|
-
"@lancedb/lancedb": "^0.
|
|
38
|
-
"@modelcontextprotocol/sdk": "^1.
|
|
36
|
+
"@huggingface/transformers": "^4.0.0",
|
|
37
|
+
"@lancedb/lancedb": "^0.27.1",
|
|
38
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
39
39
|
"apache-arrow": "^18.1.0",
|
|
40
40
|
"chalk": "^5.6.2",
|
|
41
41
|
"chokidar": "^5.0.0",
|
|
@@ -44,23 +44,24 @@
|
|
|
44
44
|
"dotenv": "^17.2.3",
|
|
45
45
|
"fast-glob": "^3.3.3",
|
|
46
46
|
"ignore": "^7.0.5",
|
|
47
|
-
"lmdb": "^3.
|
|
47
|
+
"lmdb": "^3.5.2",
|
|
48
48
|
"onnxruntime-node": "1.24.3",
|
|
49
|
-
"ora": "^
|
|
49
|
+
"ora": "^9.3.0",
|
|
50
50
|
"piscina": "^5.1.4",
|
|
51
51
|
"simsimd": "^6.5.5",
|
|
52
52
|
"uuid": "^13.0.0",
|
|
53
|
-
"web-tree-sitter": "^0.26.
|
|
53
|
+
"web-tree-sitter": "^0.26.7",
|
|
54
54
|
"zod": "^4.1.12"
|
|
55
55
|
},
|
|
56
56
|
"devDependencies": {
|
|
57
|
-
"@anthropic-ai/claude-agent-sdk": "^0.2.
|
|
58
|
-
"@biomejs/biome": "2.4.
|
|
57
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.87",
|
|
58
|
+
"@biomejs/biome": "2.4.10",
|
|
59
59
|
"@types/node": "^25.5.0",
|
|
60
60
|
"node-gyp": "^12.1.0",
|
|
61
61
|
"ts-node": "^10.9.2",
|
|
62
|
-
"typescript": "^
|
|
63
|
-
"
|
|
62
|
+
"typescript": "^6.0.2",
|
|
63
|
+
"vite": "^8.0.3",
|
|
64
|
+
"vitest": "^4.1.2"
|
|
64
65
|
},
|
|
65
66
|
"scripts": {
|
|
66
67
|
"postinstall": "node scripts/postinstall.js",
|
|
@@ -80,6 +81,6 @@
|
|
|
80
81
|
"typecheck": "tsc --noEmit",
|
|
81
82
|
"preversion": "pnpm test && pnpm typecheck",
|
|
82
83
|
"version": "bash scripts/sync-versions.sh && git add -A",
|
|
83
|
-
"postversion": "git push origin main
|
|
84
|
+
"postversion": "git push origin main && git push origin v$npm_package_version && gh release create v$npm_package_version --generate-notes --title v$npm_package_version && sleep 5 && gh run watch $(gh run list --workflow=release.yml --branch v$npm_package_version --limit 1 --json databaseId --jq '.[0].databaseId') --exit-status && sleep 30 && npm cache clean --force && npm install -g grepmax@$npm_package_version"
|
|
84
85
|
}
|
|
85
86
|
}
|