grepmax 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of grepmax might be problematic. Click here for more details.
- package/LICENSE +202 -0
- package/NOTICE +33 -0
- package/README.md +375 -0
- package/dist/commands/claude-code.js +60 -0
- package/dist/commands/codex.js +98 -0
- package/dist/commands/doctor.js +92 -0
- package/dist/commands/droid.js +189 -0
- package/dist/commands/index.js +125 -0
- package/dist/commands/list.js +120 -0
- package/dist/commands/mcp.js +572 -0
- package/dist/commands/opencode.js +199 -0
- package/dist/commands/search.js +539 -0
- package/dist/commands/serve.js +512 -0
- package/dist/commands/setup.js +162 -0
- package/dist/commands/skeleton.js +288 -0
- package/dist/commands/symbols.js +129 -0
- package/dist/commands/trace.js +50 -0
- package/dist/commands/verify.js +174 -0
- package/dist/config.js +120 -0
- package/dist/eval.js +618 -0
- package/dist/index.js +82 -0
- package/dist/lib/core/languages.js +237 -0
- package/dist/lib/graph/graph-builder.js +105 -0
- package/dist/lib/index/chunker.js +663 -0
- package/dist/lib/index/grammar-loader.js +110 -0
- package/dist/lib/index/ignore-patterns.js +63 -0
- package/dist/lib/index/index-config.js +86 -0
- package/dist/lib/index/sync-helpers.js +97 -0
- package/dist/lib/index/syncer.js +396 -0
- package/dist/lib/index/walker.js +164 -0
- package/dist/lib/index/watcher.js +245 -0
- package/dist/lib/output/formatter.js +161 -0
- package/dist/lib/output/json-formatter.js +6 -0
- package/dist/lib/search/intent.js +23 -0
- package/dist/lib/search/searcher.js +475 -0
- package/dist/lib/setup/model-loader.js +107 -0
- package/dist/lib/setup/setup-helpers.js +106 -0
- package/dist/lib/skeleton/body-fields.js +175 -0
- package/dist/lib/skeleton/index.js +24 -0
- package/dist/lib/skeleton/retriever.js +36 -0
- package/dist/lib/skeleton/skeletonizer.js +483 -0
- package/dist/lib/skeleton/summary-formatter.js +90 -0
- package/dist/lib/store/meta-cache.js +143 -0
- package/dist/lib/store/types.js +2 -0
- package/dist/lib/store/vector-db.js +340 -0
- package/dist/lib/utils/cleanup.js +33 -0
- package/dist/lib/utils/exit.js +38 -0
- package/dist/lib/utils/file-utils.js +131 -0
- package/dist/lib/utils/filter-builder.js +17 -0
- package/dist/lib/utils/formatter.js +230 -0
- package/dist/lib/utils/git.js +83 -0
- package/dist/lib/utils/lock.js +157 -0
- package/dist/lib/utils/project-root.js +107 -0
- package/dist/lib/utils/server-registry.js +97 -0
- package/dist/lib/workers/colbert-math.js +107 -0
- package/dist/lib/workers/colbert-tokenizer.js +113 -0
- package/dist/lib/workers/download-worker.js +169 -0
- package/dist/lib/workers/embeddings/colbert.js +213 -0
- package/dist/lib/workers/embeddings/granite.js +180 -0
- package/dist/lib/workers/embeddings/mlx-client.js +144 -0
- package/dist/lib/workers/orchestrator.js +350 -0
- package/dist/lib/workers/pool.js +373 -0
- package/dist/lib/workers/process-child.js +92 -0
- package/dist/lib/workers/worker.js +31 -0
- package/package.json +80 -0
- package/plugins/osgrep/.claude-plugin/plugin.json +20 -0
- package/plugins/osgrep/hooks/start.js +92 -0
- package/plugins/osgrep/hooks/stop.js +3 -0
- package/plugins/osgrep/hooks.json +26 -0
- package/plugins/osgrep/skills/osgrep/SKILL.md +82 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.findProjectRoot = findProjectRoot;
|
|
37
|
+
exports.ensureProjectPaths = ensureProjectPaths;
|
|
38
|
+
const fs = __importStar(require("node:fs"));
|
|
39
|
+
const path = __importStar(require("node:path"));
|
|
40
|
+
const config_1 = require("../../config");
|
|
41
|
+
function findProjectRoot(startDir = process.cwd()) {
    // Determine the project root for `startDir`.
    //
    // NOTE(review): the original guarded on a `.osgrep`/`.git` marker and on
    // `start` differing from the global root, but BOTH branches returned
    // `start`, so the probe had no observable effect. The dead check is
    // removed; behavior is unchanged: the caller's directory is always the
    // root (per-subdirectory isolation — we never climb above the cwd).
    return path.resolve(startDir);
}
|
|
53
|
+
function ensureProjectPaths(startDir = process.cwd(), options) {
    // Resolve the `.osgrep` directory layout for the project rooted at
    // `startDir`; unless `options.dryRun` is set, create the directories on
    // disk and make sure git ignores them.
    const detected = findProjectRoot(startDir);
    const root = detected !== null && detected !== undefined ? detected : path.resolve(startDir);
    const osgrepDir = path.join(root, ".osgrep");
    const cacheDir = path.join(osgrepDir, "cache");
    const paths = {
        root,
        osgrepDir,
        lancedbDir: path.join(osgrepDir, "lancedb"),
        cacheDir,
        lmdbPath: path.join(cacheDir, "meta.lmdb"),
        configPath: path.join(osgrepDir, "config.json"),
    };
    const dryRun = options && options.dryRun;
    if (!dryRun) {
        for (const dir of [paths.osgrepDir, paths.lancedbDir, paths.cacheDir]) {
            fs.mkdirSync(dir, { recursive: true });
        }
        ensureGitignoreEntry(root);
    }
    return paths;
}
|
|
69
|
+
function fileContainsEntry(filePath, entry) {
    // True when some line of the file, after trimming surrounding whitespace,
    // equals `entry` exactly. A missing or unreadable file is treated as not
    // containing the entry.
    try {
        const lines = fs.readFileSync(filePath, "utf-8").split(/\r?\n/);
        return lines.some((line) => line.trim() === entry);
    }
    catch (_err) {
        return false;
    }
}
|
|
81
|
+
function ensureGitignoreEntry(root) {
    // Ensure ".osgrep" is ignored by git. No-op outside a git repo, or when
    // the entry already appears in .git/info/exclude or .gitignore; otherwise
    // append it to .gitignore (creating the file if it does not exist).
    if (!fs.existsSync(path.join(root, ".git"))) {
        return;
    }
    const entry = ".osgrep";
    const excludePath = path.join(root, ".git", "info", "exclude");
    const gitignorePath = path.join(root, ".gitignore");
    if (fileContainsEntry(excludePath, entry) || fileContainsEntry(gitignorePath, entry)) {
        return;
    }
    let existing = "";
    try {
        existing = fs.readFileSync(gitignorePath, "utf-8");
    }
    catch (_err) {
        // .gitignore does not exist yet; it will be created below.
    }
    const separator = existing.length > 0 && !existing.endsWith("\n") ? "\n" : "";
    fs.writeFileSync(gitignorePath, `${existing}${separator}${entry}\n`, {
        encoding: "utf-8",
    });
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.isProcessRunning = isProcessRunning;
|
|
37
|
+
exports.registerServer = registerServer;
|
|
38
|
+
exports.unregisterServer = unregisterServer;
|
|
39
|
+
exports.listServers = listServers;
|
|
40
|
+
exports.getServerForProject = getServerForProject;
|
|
41
|
+
const fs = __importStar(require("node:fs"));
|
|
42
|
+
const path = __importStar(require("node:path"));
|
|
43
|
+
const config_1 = require("../../config");
|
|
44
|
+
const REGISTRY_PATH = path.join(config_1.PATHS.globalRoot, "servers.json");
|
|
45
|
+
function loadRegistry() {
    // Read the persisted server list. Any failure — missing file, unreadable
    // file, corrupt JSON — degrades to an empty registry rather than throwing.
    try {
        if (!fs.existsSync(REGISTRY_PATH)) {
            return [];
        }
        return JSON.parse(fs.readFileSync(REGISTRY_PATH, "utf-8"));
    }
    catch (_err) {
        return [];
    }
}
|
|
56
|
+
function saveRegistry(servers) {
    // Persist the server list as pretty-printed JSON. Failures are logged but
    // never thrown — the registry is best-effort bookkeeping.
    try {
        fs.mkdirSync(path.dirname(REGISTRY_PATH), { recursive: true });
        fs.writeFileSync(REGISTRY_PATH, JSON.stringify(servers, null, 2));
    }
    catch (err) {
        console.error("Failed to save server registry:", err);
    }
}
|
|
65
|
+
function isProcessRunning(pid) {
    // Probe a PID with signal 0 (no signal is actually delivered).
    //
    // FIX: `process.kill(pid, 0)` throws EPERM when the process exists but is
    // owned by another user; the original treated every throw as "not
    // running", so such live processes were wrongly pruned from the registry.
    // EPERM now counts as running; any other error (e.g. ESRCH) does not.
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (err) {
        return Boolean(err) && err.code === "EPERM";
    }
}
|
|
74
|
+
function registerServer(info) {
    // Record a live server: drop entries whose process has died, drop any
    // stale entry for the same project root, then append `info` and persist.
    const live = loadRegistry().filter((s) => isProcessRunning(s.pid));
    const withoutProject = live.filter((s) => s.projectRoot !== info.projectRoot);
    saveRegistry([...withoutProject, info]);
}
|
|
81
|
+
function unregisterServer(pid) {
    // Remove every registry entry whose pid matches and persist the result.
    saveRegistry(loadRegistry().filter((s) => s.pid !== pid));
}
|
|
86
|
+
function listServers() {
    // Return only servers whose process is still alive, rewriting the
    // registry whenever stale entries were pruned.
    const all = loadRegistry();
    const alive = all.filter((s) => isProcessRunning(s.pid));
    if (alive.length !== all.length) {
        saveRegistry(alive);
    }
    return alive;
}
|
|
95
|
+
function getServerForProject(projectRoot) {
    // First live server registered for the given project root, or undefined.
    for (const server of listServers()) {
        if (server.projectRoot === projectRoot) {
            return server;
        }
    }
    return undefined;
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.maxSim = maxSim;
|
|
37
|
+
exports.cosineSim = cosineSim;
|
|
38
|
+
const fs = __importStar(require("node:fs"));
|
|
39
|
+
const path = __importStar(require("node:path"));
|
|
40
|
+
const simsimd_1 = require("simsimd");
|
|
41
|
+
const config_1 = require("../../config");
|
|
42
|
+
let SKIP_IDS = null;
|
|
43
|
+
function loadSkipIds() {
    // Lazily load the ColBERT token skiplist (numeric token ids excluded from
    // maxSim scoring), memoizing the result in module-level SKIP_IDS.
    // Resolution order mirrors the orchestrator: a project-local models/ copy
    // wins over the global model cache. A missing or unparsable file yields
    // an empty set.
    if (SKIP_IDS) {
        return SKIP_IDS;
    }
    const projectRoot = process.env.OSGREP_PROJECT_ROOT
        ? path.resolve(process.env.OSGREP_PROJECT_ROOT)
        : process.cwd();
    const modelSegments = config_1.MODEL_IDS.colbert.split("/");
    const localSkipPath = path.join(projectRoot, "models", ...modelSegments, "skiplist.json");
    const globalSkipPath = path.join(config_1.PATHS.models, ...modelSegments, "skiplist.json");
    const skipPath = fs.existsSync(localSkipPath) ? localSkipPath : globalSkipPath;
    if (fs.existsSync(skipPath)) {
        try {
            const ids = JSON.parse(fs.readFileSync(skipPath, "utf8"));
            SKIP_IDS = new Set(ids.map((n) => Number(n)));
            return SKIP_IDS;
        }
        catch (_err) {
            // Corrupt skiplist: fall through to the empty set below.
        }
    }
    SKIP_IDS = new Set();
    return SKIP_IDS;
}
|
|
72
|
+
function maxSim(queryEmbeddings, docEmbeddings, docTokenIds) {
    // ColBERT late-interaction (MaxSim) score: for each query token vector,
    // take the maximum inner product over all document token vectors — minus
    // skiplisted tokens — and sum those maxima across the query.
    if (queryEmbeddings.length === 0 || docEmbeddings.length === 0) {
        return 0;
    }
    const toF32 = (v) => (v instanceof Float32Array ? v : new Float32Array(v));
    const qVecs = queryEmbeddings.map(toF32);
    const dVecs = docEmbeddings.map(toF32);
    // Token ids are only trusted when they line up 1:1 with the doc vectors.
    const tokenIds = docTokenIds && docTokenIds.length === dVecs.length ? docTokenIds : null;
    const skipIds = loadSkipIds();
    let total = 0;
    for (const qVec of qVecs) {
        let best = -Infinity;
        for (let i = 0; i < dVecs.length; i++) {
            const tokenId = tokenIds ? tokenIds[i] : null;
            if (tokenId !== null && skipIds.has(Number(tokenId))) {
                continue;
            }
            const dVec = dVecs[i];
            // Guard against mismatched dims by truncating to the common prefix.
            const dim = Math.min(qVec.length, dVec.length);
            const dot = (0, simsimd_1.inner)(qVec.subarray(0, dim), dVec.subarray(0, dim));
            if (dot > best) {
                best = dot;
            }
        }
        // If every doc token was skipped, this query token contributes nothing.
        total += best === -Infinity ? 0 : best;
    }
    return total;
}
|
|
99
|
+
function cosineSim(a, b) {
    // Inner product of two vectors, truncating the longer one to the shorter
    // length when they differ.
    // NOTE(review): despite the name this is a raw inner product — it equals
    // cosine similarity only if both inputs are already L2-normalized, which
    // this function does not verify.
    const aVec = a instanceof Float32Array ? a : new Float32Array(a);
    const bVec = b instanceof Float32Array ? b : new Float32Array(b);
    if (aVec.length === bVec.length) {
        return (0, simsimd_1.inner)(aVec, bVec);
    }
    const dim = Math.min(aVec.length, bVec.length);
    return (0, simsimd_1.inner)(aVec.subarray(0, dim), bVec.subarray(0, dim));
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
3
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
4
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
5
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
6
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
7
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
8
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.ColBERTTokenizer = void 0;
|
|
13
|
+
const transformers_1 = require("@huggingface/transformers");
|
|
14
|
+
const QUERY_MARKER_TOKEN = "[Q] ";
|
|
15
|
+
const DOC_MARKER_TOKEN = "[D] ";
|
|
16
|
+
const MASK_TOKEN = "[MASK]";
|
|
17
|
+
const QUERY_MAXLEN = 32; // Standard ColBERT query length
|
|
18
|
+
const DOC_MAXLEN = 512; // Standard ColBERT document length
|
|
19
|
+
class ColBERTTokenizer {
    constructor() {
        // Both are populated by init(); every encode call guards on them.
        this.tokenizer = null;
        this.specialTokenIds = null;
    }
    /**
     * Load the HF tokenizer from `modelPath` and resolve the special token
     * ids. Each id is looked up dynamically in the vocabulary first, then
     * falls back to the ids observed during validation ([Q]=50368, [D]=50369,
     * [CLS]=50281, [SEP]=50282, [PAD]=50283, [MASK]=50284).
     */
    async init(modelPath) {
        this.tokenizer = await transformers_1.AutoTokenizer.from_pretrained(modelPath);
        const tokenizer = this.tokenizer;
        const idOf = (token, fallback) => {
            const found = tokenizer.model.tokens_to_ids.get(token);
            return found !== null && found !== undefined ? found : fallback;
        };
        this.specialTokenIds = {
            cls: idOf(tokenizer.cls_token != null ? tokenizer.cls_token : "[CLS]", 50281),
            sep: idOf(tokenizer.sep_token != null ? tokenizer.sep_token : "[SEP]", 50282),
            pad: idOf(tokenizer.pad_token != null ? tokenizer.pad_token : "[PAD]", 50283),
            mask: idOf(MASK_TOKEN, 50284),
            queryMarker: idOf(QUERY_MARKER_TOKEN, 50368),
            docMarker: idOf(DOC_MARKER_TOKEN, 50369),
        };
    }
    /**
     * Encode a query as [CLS] [Q] tokens… [SEP], padded with [MASK] up to
     * QUERY_MAXLEN (ColBERT query expansion). All positions, including the
     * [MASK] padding, receive attention.
     * @returns {{input_ids: bigint[], attention_mask: bigint[]}}
     */
    async encodeQuery(text) {
        if (!this.tokenizer || !this.specialTokenIds) {
            throw new Error("Tokenizer not initialized. Call init() first.");
        }
        const encoded = await this.tokenizer(text, {
            add_special_tokens: false,
            truncation: true,
            // FIX: reserve 3 slots for [CLS], [Q] AND [SEP]. The original
            // reserved only 2, so a max-length query overflowed to 33 tokens
            // and the safety truncation below silently dropped the trailing
            // [SEP] — inconsistent with encodeDoc, which reserves 3.
            max_length: QUERY_MAXLEN - 3,
        });
        const { input_ids } = encoded;
        const finalIds = [
            this.specialTokenIds.cls,
            this.specialTokenIds.queryMarker,
            ...Array.from(input_ids.data).map(Number),
            this.specialTokenIds.sep,
        ];
        // Query expansion: pad with [MASK] tokens up to QUERY_MAXLEN.
        while (finalIds.length < QUERY_MAXLEN) {
            finalIds.push(this.specialTokenIds.mask);
        }
        // Safety: never exceed the model's query window.
        if (finalIds.length > QUERY_MAXLEN) {
            finalIds.length = QUERY_MAXLEN;
        }
        return {
            input_ids: finalIds.map((id) => BigInt(id)),
            attention_mask: finalIds.map(() => BigInt(1)),
        };
    }
    /**
     * Encode a document as [CLS] [D] tokens… [SEP] (no padding — documents
     * are variable length up to DOC_MAXLEN).
     * @returns {{input_ids: bigint[], attention_mask: bigint[]}}
     */
    async encodeDoc(text) {
        if (!this.tokenizer || !this.specialTokenIds) {
            throw new Error("Tokenizer not initialized. Call init() first.");
        }
        const encoded = await this.tokenizer(text, {
            add_special_tokens: false,
            truncation: true,
            max_length: DOC_MAXLEN - 3, // reserve [CLS], [D] and [SEP]
        });
        const { input_ids } = encoded;
        const finalIds = [
            this.specialTokenIds.cls,
            this.specialTokenIds.docMarker,
            ...Array.from(input_ids.data).map(Number),
            this.specialTokenIds.sep,
        ];
        return {
            input_ids: finalIds.map((id) => BigInt(id)),
            attention_mask: finalIds.map(() => BigInt(1)),
        };
    }
}
exports.ColBERTTokenizer = ColBERTTokenizer;
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
36
|
+
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
37
|
+
return new (P || (P = Promise))(function (resolve, reject) {
|
|
38
|
+
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
39
|
+
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
40
|
+
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
41
|
+
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
42
|
+
});
|
|
43
|
+
};
|
|
44
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
45
|
+
const fs = __importStar(require("node:fs"));
|
|
46
|
+
const os = __importStar(require("node:os"));
|
|
47
|
+
const path = __importStar(require("node:path"));
|
|
48
|
+
const node_worker_threads_1 = require("node:worker_threads");
|
|
49
|
+
const transformers_1 = require("@huggingface/transformers");
|
|
50
|
+
const config_1 = require("../../config");
|
|
51
|
+
// Configuration
|
|
52
|
+
const HOMEDIR = os.homedir();
|
|
53
|
+
const CACHE_DIR = path.join(HOMEDIR, ".osgrep", "models");
|
|
54
|
+
transformers_1.env.cacheDir = CACHE_DIR;
|
|
55
|
+
transformers_1.env.allowLocalModels = true;
|
|
56
|
+
transformers_1.env.allowRemoteModels = true;
|
|
57
|
+
// Suppress noisy warnings from transformers.js/onnxruntime
|
|
58
|
+
const originalWarn = console.warn;
|
|
59
|
+
console.warn = (...args) => {
|
|
60
|
+
if (args[0] &&
|
|
61
|
+
typeof args[0] === "string" &&
|
|
62
|
+
args[0].includes("Unable to determine content-length")) {
|
|
63
|
+
return;
|
|
64
|
+
}
|
|
65
|
+
originalWarn(...args);
|
|
66
|
+
};
|
|
67
|
+
// Helper to download with timeout
|
|
68
|
+
async function downloadModelWithTimeout(modelId, dtype) {
    // Create (and thereby download, if not cached) a feature-extraction
    // pipeline for `modelId`, racing it against a 5-minute timeout. Progress
    // events are forwarded to the parent thread when one exists.
    //
    // FIXES vs. original:
    //  * the timeout's setTimeout was never cleared, so a pending 5-minute
    //    timer kept the worker's event loop alive after a successful
    //    download — it is now cleared in `finally`;
    //  * `return Promise.race(...)` inside `try` let race rejections bypass
    //    the catch, so download failures were never logged here — the race
    //    is now awaited.
    const TIMEOUT_MS = 5 * 60 * 1000; // 5 minutes
    let timer;
    try {
        const downloadPromise = (0, transformers_1.pipeline)("feature-extraction", modelId, {
            dtype,
            progress_callback: (progress) => {
                if (node_worker_threads_1.parentPort)
                    node_worker_threads_1.parentPort.postMessage({ type: "progress", progress });
            },
        });
        const timeoutPromise = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error(`Download timed out after ${TIMEOUT_MS} ms`)), TIMEOUT_MS);
        });
        return await Promise.race([downloadPromise, timeoutPromise]);
    }
    catch (err) {
        console.error(`Worker: pipeline creation failed for ${modelId}: `, err);
        throw err;
    }
    finally {
        if (timer !== undefined) {
            clearTimeout(timer);
        }
    }
}
|
|
90
|
+
// Helper to manually download extra files like skiplist.json
|
|
91
|
+
async function downloadExtraFile(modelId, filename) {
    // Fetch an auxiliary model file (e.g. skiplist.json) that the pipeline
    // download does not cover, storing it under the model's cache directory
    // (~/.osgrep/models/<modelId>/<filename>). Failures are reported as
    // warnings, never thrown: the consumer (colbert-math) falls back to an
    // empty skip set, and a "warning" message lets setup retry.
    //
    // FIX: the URL and the warning below contained the literal "$(unknown)"
    // instead of interpolating `filename`, so the request always hit a
    // non-existent path and the file was never downloaded.
    const url = `https://huggingface.co/${modelId}/resolve/main/${filename}`;
    const destDir = path.join(CACHE_DIR, ...modelId.split("/"));
    const destPath = path.join(destDir, filename);
    if (!fs.existsSync(destDir)) {
        fs.mkdirSync(destDir, { recursive: true });
    }
    // Already present and non-empty: nothing to do.
    if (fs.existsSync(destPath) && fs.statSync(destPath).size > 0) {
        return;
    }
    if (node_worker_threads_1.parentPort) {
        node_worker_threads_1.parentPort.postMessage({
            type: "progress",
            progress: { status: "downloading", file: filename },
        });
    }
    try {
        const res = await fetch(url);
        if (!res.ok) {
            throw new Error(`HTTP ${res.status}: ${res.statusText}`);
        }
        const buffer = await res.arrayBuffer();
        fs.writeFileSync(destPath, Buffer.from(buffer));
        if (node_worker_threads_1.parentPort) {
            node_worker_threads_1.parentPort.postMessage({
                type: "progress",
                progress: { status: "downloaded", file: filename },
            });
        }
    }
    catch (e) {
        const errorMsg = e instanceof Error ? e.message : String(e);
        console.warn(`⚠️ Failed to download ${filename} from ${url}:`, errorMsg);
        // Don't crash, just warn — but report so setup can retry.
        if (node_worker_threads_1.parentPort) {
            node_worker_threads_1.parentPort.postMessage({
                type: "warning",
                file: filename,
                error: errorMsg,
            });
        }
    }
}
|
|
139
|
+
async function download() {
    // Worker entry point: fetch the dense embedding model, the ColBERT model,
    // and the custom skiplist, then report the outcome to the parent thread —
    // or via the process exit code when run standalone.
    try {
        const embedPipeline = await downloadModelWithTimeout(config_1.MODEL_IDS.embed, "q4");
        await embedPipeline.dispose();
        const colbertPipeline = await downloadModelWithTimeout(config_1.MODEL_IDS.colbert, "int8");
        await colbertPipeline.dispose();
        await downloadExtraFile(config_1.MODEL_IDS.colbert, "skiplist.json");
        if (node_worker_threads_1.parentPort) {
            node_worker_threads_1.parentPort.postMessage({ status: "success" });
        }
        else {
            process.exit(0);
        }
    }
    catch (error) {
        console.error("Worker failed to download models:", error);
        if (node_worker_threads_1.parentPort) {
            const errorMsg = error instanceof Error ? error.message : String(error);
            node_worker_threads_1.parentPort.postMessage({ status: "error", error: errorMsg });
        }
        else {
            process.exit(1);
        }
    }
}
download();
|