grepmax 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/LICENSE +202 -0
  2. package/NOTICE +33 -0
  3. package/README.md +375 -0
  4. package/dist/commands/claude-code.js +60 -0
  5. package/dist/commands/codex.js +98 -0
  6. package/dist/commands/doctor.js +92 -0
  7. package/dist/commands/droid.js +189 -0
  8. package/dist/commands/index.js +125 -0
  9. package/dist/commands/list.js +120 -0
  10. package/dist/commands/mcp.js +567 -0
  11. package/dist/commands/opencode.js +199 -0
  12. package/dist/commands/search.js +539 -0
  13. package/dist/commands/serve.js +502 -0
  14. package/dist/commands/setup.js +160 -0
  15. package/dist/commands/skeleton.js +288 -0
  16. package/dist/commands/symbols.js +129 -0
  17. package/dist/commands/trace.js +50 -0
  18. package/dist/commands/verify.js +174 -0
  19. package/dist/config.js +120 -0
  20. package/dist/eval.js +618 -0
  21. package/dist/index.js +82 -0
  22. package/dist/lib/core/languages.js +237 -0
  23. package/dist/lib/graph/graph-builder.js +105 -0
  24. package/dist/lib/index/chunker.js +663 -0
  25. package/dist/lib/index/grammar-loader.js +110 -0
  26. package/dist/lib/index/ignore-patterns.js +63 -0
  27. package/dist/lib/index/index-config.js +86 -0
  28. package/dist/lib/index/sync-helpers.js +97 -0
  29. package/dist/lib/index/syncer.js +396 -0
  30. package/dist/lib/index/walker.js +164 -0
  31. package/dist/lib/index/watcher.js +245 -0
  32. package/dist/lib/output/formatter.js +161 -0
  33. package/dist/lib/output/json-formatter.js +6 -0
  34. package/dist/lib/search/intent.js +23 -0
  35. package/dist/lib/search/searcher.js +475 -0
  36. package/dist/lib/setup/model-loader.js +107 -0
  37. package/dist/lib/setup/setup-helpers.js +106 -0
  38. package/dist/lib/skeleton/body-fields.js +175 -0
  39. package/dist/lib/skeleton/index.js +24 -0
  40. package/dist/lib/skeleton/retriever.js +36 -0
  41. package/dist/lib/skeleton/skeletonizer.js +483 -0
  42. package/dist/lib/skeleton/summary-formatter.js +92 -0
  43. package/dist/lib/store/meta-cache.js +143 -0
  44. package/dist/lib/store/types.js +2 -0
  45. package/dist/lib/store/vector-db.js +340 -0
  46. package/dist/lib/utils/cleanup.js +33 -0
  47. package/dist/lib/utils/exit.js +38 -0
  48. package/dist/lib/utils/file-utils.js +131 -0
  49. package/dist/lib/utils/filter-builder.js +17 -0
  50. package/dist/lib/utils/formatter.js +230 -0
  51. package/dist/lib/utils/git.js +83 -0
  52. package/dist/lib/utils/lock.js +157 -0
  53. package/dist/lib/utils/project-root.js +107 -0
  54. package/dist/lib/utils/server-registry.js +97 -0
  55. package/dist/lib/workers/colbert-math.js +107 -0
  56. package/dist/lib/workers/colbert-tokenizer.js +113 -0
  57. package/dist/lib/workers/download-worker.js +169 -0
  58. package/dist/lib/workers/embeddings/colbert.js +213 -0
  59. package/dist/lib/workers/embeddings/granite.js +180 -0
  60. package/dist/lib/workers/embeddings/mlx-client.js +144 -0
  61. package/dist/lib/workers/orchestrator.js +350 -0
  62. package/dist/lib/workers/pool.js +373 -0
  63. package/dist/lib/workers/process-child.js +92 -0
  64. package/dist/lib/workers/worker.js +31 -0
  65. package/package.json +80 -0
  66. package/plugins/osgrep/.claude-plugin/plugin.json +20 -0
  67. package/plugins/osgrep/hooks/start.js +90 -0
  68. package/plugins/osgrep/hooks/stop.js +3 -0
  69. package/plugins/osgrep/hooks.json +26 -0
  70. package/plugins/osgrep/skills/osgrep/SKILL.md +82 -0
@@ -0,0 +1,107 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.findProjectRoot = findProjectRoot;
37
+ exports.ensureProjectPaths = ensureProjectPaths;
38
+ const fs = __importStar(require("node:fs"));
39
+ const path = __importStar(require("node:path"));
40
+ const config_1 = require("../../config");
41
/**
 * Resolve the directory that acts as the project root for `startDir`.
 *
 * Roots are isolated per directory: the search never climbs above
 * `startDir`, so the result is always `startDir` itself as an absolute path.
 *
 * @param {string} [startDir=process.cwd()] Directory to resolve; a relative
 *   path is resolved against the current working directory.
 * @returns {string} Absolute, normalized path of the project root.
 */
function findProjectRoot(startDir = process.cwd()) {
    // The earlier version probed for ".osgrep"/".git" and compared against the
    // global root, but both outcomes returned this same value. Those
    // synchronous filesystem checks were dead work and have been dropped.
    return path.resolve(startDir);
}
53
/**
 * Compute the `.osgrep` directory layout for a project and, unless this is a
 * dry run, create it on disk.
 *
 * @param {string} [startDir=process.cwd()] Directory used to locate the project root.
 * @param {{ dryRun?: boolean }} [options] When `dryRun` is truthy nothing is
 *   created and no gitignore entry is written.
 * @returns {{ root: string, osgrepDir: string, lancedbDir: string, cacheDir: string, lmdbPath: string, configPath: string }}
 *   Paths of the project root and of each storage location below `.osgrep`.
 */
function ensureProjectPaths(startDir = process.cwd(), options) {
    const located = findProjectRoot(startDir);
    const root = located !== null && located !== undefined
        ? located
        : path.resolve(startDir);
    const osgrepDir = path.join(root, ".osgrep");
    const cacheDir = path.join(osgrepDir, "cache");
    const layout = {
        root,
        osgrepDir,
        lancedbDir: path.join(osgrepDir, "lancedb"),
        cacheDir,
        lmdbPath: path.join(cacheDir, "meta.lmdb"),
        configPath: path.join(osgrepDir, "config.json"),
    };
    const isDryRun = options === null || options === undefined
        ? undefined
        : options.dryRun;
    if (isDryRun) {
        return layout;
    }
    // Directories are created parent-first; `recursive` makes this a no-op
    // when they already exist.
    for (const dir of [layout.osgrepDir, layout.lancedbDir, layout.cacheDir]) {
        fs.mkdirSync(dir, { recursive: true });
    }
    ensureGitignoreEntry(root);
    return layout;
}
69
/**
 * Report whether a text file has a line that equals `entry` once surrounding
 * whitespace is trimmed.
 *
 * @param {string} filePath File to inspect.
 * @param {string} entry Exact line content to look for.
 * @returns {boolean} `true` when a matching line exists; `false` when there
 *   is none or the file cannot be read.
 */
function fileContainsEntry(filePath, entry) {
    let text;
    try {
        text = fs.readFileSync(filePath, "utf-8");
    }
    catch (_err) {
        // An unreadable or missing file simply does not contain the entry.
        return false;
    }
    for (const rawLine of text.split(/\r?\n/)) {
        if (rawLine.trim() === entry) {
            return true;
        }
    }
    return false;
}
81
/**
 * Make sure git ignores the `.osgrep` directory of a project.
 *
 * Does nothing when `root` has no `.git` entry, or when `.osgrep` is already
 * listed in `.git/info/exclude` or `.gitignore`. Otherwise `.osgrep` is
 * appended to `.gitignore`, creating the file if needed.
 *
 * @param {string} root Project root directory.
 * @returns {void}
 */
function ensureGitignoreEntry(root) {
    const entry = ".osgrep";
    if (!fs.existsSync(path.join(root, ".git"))) {
        return;
    }
    const gitignorePath = path.join(root, ".gitignore");
    // The repository-local exclude file is consulted first, then .gitignore.
    const ignoreFiles = [
        path.join(root, ".git", "info", "exclude"),
        gitignorePath,
    ];
    if (ignoreFiles.some((file) => fileContainsEntry(file, entry))) {
        return;
    }
    let existing;
    try {
        existing = fs.readFileSync(gitignorePath, "utf-8");
    }
    catch (_err) {
        // No .gitignore yet: start from empty content and create the file below.
        existing = "";
    }
    // Keep the new entry on its own line when the file lacks a trailing newline.
    const separator = existing.length === 0 || existing.endsWith("\n") ? "" : "\n";
    fs.writeFileSync(gitignorePath, existing + separator + entry + "\n", {
        encoding: "utf-8",
    });
}
@@ -0,0 +1,97 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.isProcessRunning = isProcessRunning;
37
+ exports.registerServer = registerServer;
38
+ exports.unregisterServer = unregisterServer;
39
+ exports.listServers = listServers;
40
+ exports.getServerForProject = getServerForProject;
41
+ const fs = __importStar(require("node:fs"));
42
+ const path = __importStar(require("node:path"));
43
+ const config_1 = require("../../config");
44
+ const REGISTRY_PATH = path.join(config_1.PATHS.globalRoot, "servers.json");
45
/**
 * Read the list of registered servers from the registry file.
 *
 * The file can be edited or written by other processes, so its content is
 * validated rather than trusted: anything that is not a JSON array of
 * objects is discarded.
 *
 * @returns {Array<object>} The stored server records, or an empty array when
 *   the file is missing, unreadable, not valid JSON, or not an array.
 */
function loadRegistry() {
    try {
        if (!fs.existsSync(REGISTRY_PATH))
            return [];
        const parsed = JSON.parse(fs.readFileSync(REGISTRY_PATH, "utf-8"));
        // Valid JSON such as `{}` or `null` would otherwise reach callers that
        // immediately call `.filter()` on the result and crash.
        if (!Array.isArray(parsed))
            return [];
        // Callers read `.pid` / `.projectRoot` from every element, so drop
        // entries (null, numbers, strings) that cannot carry those fields.
        return parsed.filter((entry) => entry !== null && typeof entry === "object");
    }
    catch (_err) {
        // Best effort: a corrupt registry behaves like an empty one.
        return [];
    }
}
56
/**
 * Persist the server list to the registry file as pretty-printed JSON,
 * creating the containing directory when necessary.
 *
 * Failures are reported on stderr and otherwise ignored, so callers never
 * see an exception from this function.
 *
 * @param {Array<object>} servers Records to store.
 * @returns {void}
 */
function saveRegistry(servers) {
    try {
        const registryDir = path.dirname(REGISTRY_PATH);
        fs.mkdirSync(registryDir, { recursive: true });
        const serialized = JSON.stringify(servers, null, 2);
        fs.writeFileSync(REGISTRY_PATH, serialized);
    }
    catch (err) {
        console.error("Failed to save server registry:", err);
    }
}
65
/**
 * Check whether a process with the given id currently exists.
 *
 * Uses signal `0`, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param {number} pid Process id to probe.
 * @returns {boolean} `true` when the process exists (even if this process is
 *   not allowed to signal it); `false` when it does not exist or `pid` is not
 *   a positive integer.
 */
function isProcessRunning(pid) {
    // pid 0 and negative pids address process groups rather than a single
    // process, so probing them would report "running" for any stored garbage.
    if (!Number.isInteger(pid) || pid <= 0)
        return false;
    try {
        process.kill(pid, 0);
        return true;
    }
    catch (err) {
        // EPERM means the process exists but belongs to someone else; only
        // the remaining errors (e.g. ESRCH) mean there is no such process.
        return Boolean(err) && err.code === "EPERM";
    }
}
74
/**
 * Add a server record to the registry.
 *
 * Records whose process is no longer running are dropped, and any earlier
 * record for the same `projectRoot` is replaced so that each project appears
 * at most once.
 *
 * @param {{ pid: number, projectRoot: string }} info Record to store; it is
 *   appended after the surviving records.
 * @returns {void}
 */
function registerServer(info) {
    const kept = [];
    for (const server of loadRegistry()) {
        const alive = isProcessRunning(server.pid);
        if (alive && server.projectRoot !== info.projectRoot) {
            kept.push(server);
        }
    }
    kept.push(info);
    saveRegistry(kept);
}
81
/**
 * Remove every registry record that belongs to the given process id and
 * write the remaining records back.
 *
 * @param {number} pid Process id whose records should be removed.
 * @returns {void}
 */
function unregisterServer(pid) {
    const remaining = [];
    for (const server of loadRegistry()) {
        if (server.pid !== pid) {
            remaining.push(server);
        }
    }
    saveRegistry(remaining);
}
86
/**
 * Return the registered servers whose process is still running.
 *
 * Stale records found along the way are pruned: the registry file is
 * rewritten only when at least one record was dropped.
 *
 * @returns {Array<object>} Records of the live servers, in registry order.
 */
function listServers() {
    const live = [];
    let foundStale = false;
    for (const server of loadRegistry()) {
        if (isProcessRunning(server.pid)) {
            live.push(server);
        }
        else {
            foundStale = true;
        }
    }
    if (foundStale) {
        saveRegistry(live);
    }
    return live;
}
95
/**
 * Look up the live server registered for a project root.
 *
 * @param {string} projectRoot Project root to match exactly.
 * @returns {object|undefined} The first live record with that `projectRoot`,
 *   or `undefined` when there is none.
 */
function getServerForProject(projectRoot) {
    for (const server of listServers()) {
        if (server.projectRoot === projectRoot) {
            return server;
        }
    }
    return undefined;
}
@@ -0,0 +1,107 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.maxSim = maxSim;
37
+ exports.cosineSim = cosineSim;
38
+ const fs = __importStar(require("node:fs"));
39
+ const path = __importStar(require("node:path"));
40
+ const simsimd_1 = require("simsimd");
41
+ const config_1 = require("../../config");
42
// Cached result of loadSkipIds(); null until the first call.
let SKIP_IDS = null;
/**
 * Load, once per module instance, the set of token ids listed in the ColBERT
 * model's `skiplist.json`.
 *
 * The project-local copy under `<project root>/models/<model id>/` wins over
 * the copy in the global models directory. The project root comes from
 * `OSGREP_PROJECT_ROOT` when set, otherwise from the working directory.
 *
 * @returns {Set<number>} Token ids from the skip list; an empty set when no
 *   skip list exists or it cannot be parsed.
 */
function loadSkipIds() {
    if (SKIP_IDS) {
        return SKIP_IDS;
    }
    const envRoot = process.env.OSGREP_PROJECT_ROOT;
    const projectRoot = envRoot ? path.resolve(envRoot) : process.cwd();
    const localPath = path.join(projectRoot, "models", ...config_1.MODEL_IDS.colbert.split("/"), "skiplist.json");
    const globalPath = path.join(config_1.PATHS.models, ...config_1.MODEL_IDS.colbert.split("/"), "skiplist.json");
    const skipPath = fs.existsSync(localPath) ? localPath : globalPath;
    let ids = new Set();
    if (fs.existsSync(skipPath)) {
        try {
            const raw = JSON.parse(fs.readFileSync(skipPath, "utf8"));
            ids = new Set(raw.map((n) => Number(n)));
        }
        catch (_e) {
            // Unreadable or malformed skip list: behave as if it were empty.
            ids = new Set();
        }
    }
    SKIP_IDS = ids;
    return SKIP_IDS;
}
72
/**
 * Late-interaction score between a query and a document: for every query
 * vector take the largest inner product with any document vector, then sum
 * those maxima.
 *
 * Document vectors whose token id is in the skip list are ignored. Token ids
 * are only honoured when one id is supplied per document vector. Vectors of
 * different lengths are compared over their common prefix.
 *
 * @param {Array<Float32Array|ArrayLike<number>>} queryEmbeddings One vector per query token.
 * @param {Array<Float32Array|ArrayLike<number>>} docEmbeddings One vector per document token.
 * @param {ArrayLike<number>} [docTokenIds] Token id of each document vector.
 * @returns {number} The summed score; `0` when either side has no vectors. A
 *   query vector contributes `0` when every document vector was skipped.
 */
function maxSim(queryEmbeddings, docEmbeddings, docTokenIds) {
    if (queryEmbeddings.length === 0 || docEmbeddings.length === 0) {
        return 0;
    }
    const asFloat32 = (v) => (v instanceof Float32Array ? v : new Float32Array(v));
    const queryVectors = queryEmbeddings.map((v) => asFloat32(v));
    const docVectors = docEmbeddings.map((v) => asFloat32(v));
    const tokenIds = docTokenIds && docTokenIds.length === docVectors.length ? docTokenIds : null;
    const skipIds = loadSkipIds();
    const isSkipped = (idx) => {
        if (tokenIds === null) {
            return false;
        }
        const tokenId = tokenIds[idx];
        return tokenId !== null && skipIds.has(Number(tokenId));
    };
    let totalScore = 0;
    for (const qVec of queryVectors) {
        let best = -Infinity;
        for (let idx = 0; idx < docVectors.length; idx += 1) {
            if (isSkipped(idx)) {
                continue;
            }
            const dVec = docVectors[idx];
            const shared = Math.min(qVec.length, dVec.length);
            const dot = (0, simsimd_1.inner)(qVec.subarray(0, shared), dVec.subarray(0, shared));
            best = dot > best ? dot : best;
        }
        totalScore += best === -Infinity ? 0 : best;
    }
    return totalScore;
}
99
/**
 * Similarity of two vectors, computed as their inner product.
 *
 * No normalization happens here, so the result equals the cosine similarity
 * only when both inputs already have unit length. Vectors of different
 * lengths are compared over their common prefix.
 *
 * @param {Float32Array|ArrayLike<number>} a First vector.
 * @param {Float32Array|ArrayLike<number>} b Second vector.
 * @returns {number} Inner product of `a` and `b`.
 */
function cosineSim(a, b) {
    const left = a instanceof Float32Array ? a : new Float32Array(a);
    const right = b instanceof Float32Array ? b : new Float32Array(b);
    const shared = Math.min(left.length, right.length);
    if (left.length === right.length) {
        return (0, simsimd_1.inner)(left, right);
    }
    const leftPrefix = left.subarray(0, shared);
    const rightPrefix = right.subarray(0, shared);
    return (0, simsimd_1.inner)(leftPrefix, rightPrefix);
}
@@ -0,0 +1,113 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.ColBERTTokenizer = void 0;
13
+ const transformers_1 = require("@huggingface/transformers");
14
const QUERY_MARKER_TOKEN = "[Q] ";
const DOC_MARKER_TOKEN = "[D] ";
const MASK_TOKEN = "[MASK]";
const QUERY_MAXLEN = 32; // Standard ColBERT query length
const DOC_MAXLEN = 512; // Standard ColBERT document length
// Every sequence is framed by three special tokens: [CLS], the [Q]/[D]
// marker and [SEP]. Both encoders must reserve room for all of them.
const FRAME_TOKEN_COUNT = 3;
/**
 * Convert a list of numeric token ids into the BigInt arrays returned by the
 * encoders. Every position is attended to, so the mask is all ones.
 *
 * @param {number[]} ids Final token ids, special tokens included.
 * @returns {{ input_ids: bigint[], attention_mask: bigint[] }}
 */
function toModelInputs(ids) {
    return {
        input_ids: ids.map((id) => BigInt(id)),
        attention_mask: ids.map(() => BigInt(1)),
    };
}
/**
 * Tokenizer wrapper that builds ColBERT-style inputs:
 * `[CLS] [Q] tokens… [SEP] [MASK]…` for queries and
 * `[CLS] [D] tokens… [SEP]` for documents.
 *
 * Call `init()` before using either encoder.
 */
class ColBERTTokenizer {
    constructor() {
        this.tokenizer = null;
        this.specialTokenIds = null;
    }
    /**
     * Load the tokenizer and resolve the ids of the special tokens.
     *
     * @param {string} modelPath Model id or path handed to `AutoTokenizer.from_pretrained`.
     * @returns {Promise<void>}
     */
    init(modelPath) {
        return __awaiter(this, void 0, void 0, function* () {
            this.tokenizer = yield transformers_1.AutoTokenizer.from_pretrained(modelPath);
            const tokenizer = this.tokenizer;
            // Ids are looked up in the vocabulary first. The numeric fallbacks
            // are the ids found during validation ([Q]=50368, [D]=50369) and
            // apply only when the vocabulary lacks the token.
            const idOf = (token, fallbackId) => {
                const id = tokenizer === null || tokenizer === undefined
                    ? undefined
                    : tokenizer.model.tokens_to_ids.get(token);
                return id !== null && id !== undefined ? id : fallbackId;
            };
            const nameOr = (name, defaultName) => name !== null && name !== undefined ? name : defaultName;
            this.specialTokenIds = {
                cls: idOf(nameOr(tokenizer.cls_token, "[CLS]"), 50281),
                sep: idOf(nameOr(tokenizer.sep_token, "[SEP]"), 50282),
                pad: idOf(nameOr(tokenizer.pad_token, "[PAD]"), 50283),
                mask: idOf(MASK_TOKEN, 50284),
                queryMarker: idOf(QUERY_MARKER_TOKEN, 50368),
                docMarker: idOf(DOC_MARKER_TOKEN, 50369),
            };
        });
    }
    /**
     * Encode a query as exactly `QUERY_MAXLEN` ids:
     * `[CLS] [Q] tokens… [SEP]` padded with `[MASK]` (query expansion).
     *
     * @param {string} text Query text.
     * @returns {Promise<{ input_ids: bigint[], attention_mask: bigint[] }>}
     * @throws {Error} When `init()` has not completed.
     */
    encodeQuery(text) {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.tokenizer || !this.specialTokenIds) {
                throw new Error("Tokenizer not initialized. Call init() first.");
            }
            // Reserve room for [CLS], [Q] and [SEP]. Reserving only two slots
            // let long queries grow to QUERY_MAXLEN + 1 ids, after which the
            // final truncation cut off the closing [SEP].
            const budget = QUERY_MAXLEN - FRAME_TOKEN_COUNT;
            const encoded = yield this.tokenizer(text, {
                add_special_tokens: false,
                truncation: true,
                max_length: budget,
            });
            // Clamp again locally so the frame survives even if the tokenizer
            // returns more ids than requested.
            const textIds = Array.from(encoded.input_ids.data).map(Number).slice(0, budget);
            const finalIds = [
                this.specialTokenIds.cls,
                this.specialTokenIds.queryMarker,
                ...textIds,
                this.specialTokenIds.sep,
            ];
            // Query expansion: pad with [MASK] tokens up to QUERY_MAXLEN.
            // [MASK] positions are attended to like any other token.
            while (finalIds.length < QUERY_MAXLEN) {
                finalIds.push(this.specialTokenIds.mask);
            }
            return toModelInputs(finalIds);
        });
    }
    /**
     * Encode a document as `[CLS] [D] tokens… [SEP]`, at most `DOC_MAXLEN`
     * ids and without padding.
     *
     * @param {string} text Document text.
     * @returns {Promise<{ input_ids: bigint[], attention_mask: bigint[] }>}
     * @throws {Error} When `init()` has not completed.
     */
    encodeDoc(text) {
        return __awaiter(this, void 0, void 0, function* () {
            if (!this.tokenizer || !this.specialTokenIds) {
                throw new Error("Tokenizer not initialized. Call init() first.");
            }
            // Reserve room for [CLS], [D] and [SEP].
            const budget = DOC_MAXLEN - FRAME_TOKEN_COUNT;
            const encoded = yield this.tokenizer(text, {
                add_special_tokens: false,
                truncation: true,
                max_length: budget,
            });
            const textIds = Array.from(encoded.input_ids.data).map(Number).slice(0, budget);
            return toModelInputs([
                this.specialTokenIds.cls,
                this.specialTokenIds.docMarker,
                ...textIds,
                this.specialTokenIds.sep,
            ]);
        });
    }
}
113
+ exports.ColBERTTokenizer = ColBERTTokenizer;
@@ -0,0 +1,169 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
36
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
37
+ return new (P || (P = Promise))(function (resolve, reject) {
38
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
39
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
40
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
41
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
42
+ });
43
+ };
44
+ Object.defineProperty(exports, "__esModule", { value: true });
45
+ const fs = __importStar(require("node:fs"));
46
+ const os = __importStar(require("node:os"));
47
+ const path = __importStar(require("node:path"));
48
+ const node_worker_threads_1 = require("node:worker_threads");
49
+ const transformers_1 = require("@huggingface/transformers");
50
+ const config_1 = require("../../config");
51
// Configuration: models are cached below ~/.osgrep/models, and both local and
// remote model sources are enabled for transformers.js.
const HOMEDIR = os.homedir();
const CACHE_DIR = path.join(HOMEDIR, ".osgrep", "models");
Object.assign(transformers_1.env, {
    cacheDir: CACHE_DIR,
    allowLocalModels: true,
    allowRemoteModels: true,
});
// Suppress noisy warnings from transformers.js/onnxruntime: the
// content-length warning is dropped, everything else is forwarded unchanged
// to the original console.warn.
const originalWarn = console.warn;
console.warn = (...args) => {
    const first = args[0];
    const isContentLengthNoise = typeof first === "string" &&
        first.includes("Unable to determine content-length");
    if (!isContentLengthNoise) {
        originalWarn(...args);
    }
};
67
/**
 * Create a feature-extraction pipeline for `modelId` (which downloads the
 * model when it is not cached), giving up after five minutes.
 *
 * Progress events from the pipeline are forwarded to the parent thread when
 * this code runs inside a worker.
 *
 * @param {string} modelId Model identifier passed to `pipeline`.
 * @param {string} dtype Quantization/dtype option passed to `pipeline`.
 * @returns {Promise<object>} The created pipeline.
 * @throws {Error} When pipeline creation fails or the timeout elapses; the
 *   failure is logged before being rethrown.
 */
function downloadModelWithTimeout(modelId, dtype) {
    return __awaiter(this, void 0, void 0, function* () {
        const TIMEOUT_MS = 5 * 60 * 1000; // 5 minutes
        let timer;
        const timeoutPromise = new Promise((_, reject) => {
            timer = setTimeout(() => reject(new Error(`Download timed out after ${TIMEOUT_MS} ms`)), TIMEOUT_MS);
        });
        try {
            const downloadPromise = (0, transformers_1.pipeline)("feature-extraction", modelId, {
                dtype,
                progress_callback: (progress) => {
                    if (node_worker_threads_1.parentPort)
                        node_worker_threads_1.parentPort.postMessage({ type: "progress", progress });
                },
            });
            // The race must be awaited inside the try block: returning the
            // bare promise would let asynchronous failures skip the catch.
            return yield Promise.race([downloadPromise, timeoutPromise]);
        }
        catch (err) {
            console.error(`Worker: pipeline creation failed for ${modelId}: `, err);
            throw err;
        }
        finally {
            // Without this the pending timer keeps the event loop (and the
            // worker) alive for the full five minutes after a fast download.
            clearTimeout(timer);
        }
    });
}
90
/**
 * Download an extra file of a Hugging Face model repository (for example
 * `skiplist.json`) into the model's directory below `CACHE_DIR`.
 *
 * A file that already exists with non-zero size is left alone. Download
 * failures do not throw: they are logged and reported to the parent thread
 * as a `warning` message. Progress messages are likewise only sent when a
 * parent port exists.
 *
 * @param {string} modelId Repository id such as `owner/name`.
 * @param {string} filename File to fetch from the repository's `main` revision.
 * @returns {Promise<void>}
 */
function downloadExtraFile(modelId, filename) {
    return __awaiter(this, void 0, void 0, function* () {
        const notifyParent = (message) => {
            if (node_worker_threads_1.parentPort) {
                node_worker_threads_1.parentPort.postMessage(message);
            }
        };
        const url = `https://huggingface.co/${modelId}/resolve/main/${filename}`;
        // Destination: <CACHE_DIR>/<owner>/<name>/<filename>
        const destDir = path.join(CACHE_DIR, ...modelId.split("/"));
        const destPath = path.join(destDir, filename);
        if (!fs.existsSync(destDir)) {
            fs.mkdirSync(destDir, { recursive: true });
        }
        // A non-empty file counts as downloaded; its content is not verified.
        const alreadyCached = fs.existsSync(destPath) && fs.statSync(destPath).size > 0;
        if (alreadyCached) {
            return;
        }
        notifyParent({
            type: "progress",
            progress: { status: "downloading", file: filename },
        });
        try {
            const response = yield fetch(url);
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}: ${response.statusText}`);
            }
            const payload = Buffer.from(yield response.arrayBuffer());
            fs.writeFileSync(destPath, payload);
            notifyParent({
                type: "progress",
                progress: { status: "downloaded", file: filename },
            });
        }
        catch (e) {
            const errorMsg = e instanceof Error ? e.message : String(e);
            console.warn(`⚠️ Failed to download ${filename} from ${url}:`, errorMsg);
            // Don't crash, just warn: the math worker falls back to an empty
            // skip list. The failure is still reported so setup can retry.
            notifyParent({
                type: "warning",
                file: filename,
                error: errorMsg,
            });
        }
    });
}
139
/**
 * Worker entry point: fetch the dense embedding model, the ColBERT model and
 * the ColBERT skip list, in that order, then report the outcome.
 *
 * Inside a worker thread the outcome is posted to the parent as
 * `{ status: "success" }` or `{ status: "error", error }`. Without a parent
 * port the process exits with code 0 or 1 instead.
 *
 * @returns {Promise<void>} Never rejects; failures are reported as above.
 */
function download() {
    return __awaiter(this, void 0, void 0, function* () {
        // Model key in config_1.MODEL_IDS paired with the dtype to download.
        const modelPlan = [
            ["embed", "q4"],
            ["colbert", "int8"],
        ];
        try {
            for (const [modelKey, dtype] of modelPlan) {
                const loaded = yield downloadModelWithTimeout(config_1.MODEL_IDS[modelKey], dtype);
                // The pipeline is created only to populate the cache.
                yield loaded.dispose();
            }
            yield downloadExtraFile(config_1.MODEL_IDS.colbert, "skiplist.json");
            if (!node_worker_threads_1.parentPort) {
                process.exit(0);
            }
            else {
                node_worker_threads_1.parentPort.postMessage({ status: "success" });
            }
        }
        catch (error) {
            console.error("Worker failed to download models:", error);
            if (!node_worker_threads_1.parentPort) {
                process.exit(1);
            }
            else {
                const errorMsg = error instanceof Error ? error.message : String(error);
                node_worker_threads_1.parentPort.postMessage({ status: "error", error: errorMsg });
            }
        }
    });
}
download();