codebase-contextualizer-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,189 @@
1
+ #!/usr/bin/env node
2
+
3
+ const path = require("path");
4
+ const { performance } = require("perf_hooks");
5
+ const Database = require("better-sqlite3");
6
+ const { pipeline } = require("@xenova/transformers");
7
+
8
/** Number of timed search passes recorded by the benchmark loop. */
const ITERATIONS = 1000;

/** Query text used when no query is supplied as the second CLI argument. */
const DEFAULT_QUERY = "authentication logic";

/** Model identifier passed to the feature-extraction pipeline. */
const MODEL_ID = "Xenova/all-MiniLM-L6-v2";
11
+
12
// Memoized result of pipeline(); stays undefined until getEmbedder() is first called.
let embedderPromise;

/**
 * Returns the shared feature-extraction pipeline, creating it on first use.
 *
 * The value returned by `pipeline(...)` is cached as-is, so every caller
 * receives the same object and the pipeline is only requested once.
 *
 * @returns {*} The cached return value of `pipeline("feature-extraction", MODEL_ID)`.
 */
function getEmbedder() {
  if (embedderPromise) {
    return embedderPromise;
  }

  embedderPromise = pipeline("feature-extraction", MODEL_ID);
  return embedderPromise;
}
21
+
22
/**
 * Produces a fresh Float32Array from an embedding-like value.
 *
 * @param {*} value - A Float32Array, any other ArrayBuffer view, or a plain array.
 * @returns {Float32Array} A new array; a Float32Array input is copied, never returned as-is.
 * @throws {Error} When the value is none of the supported shapes.
 */
function toFloat32Array(value) {
  const isFloat32 = value instanceof Float32Array;
  const isConvertible = ArrayBuffer.isView(value) || Array.isArray(value);

  if (!isFloat32 && !isConvertible) {
    throw new Error("Embedding output could not be converted to Float32Array");
  }

  return isFloat32 ? new Float32Array(value) : Float32Array.from(value);
}
37
+
38
/**
 * Embeds a query string with the shared feature-extraction pipeline.
 *
 * The pipeline is invoked with mean pooling and normalization enabled, and the
 * `data` field of its output is converted through toFloat32Array.
 *
 * @param {string} query - Text to embed.
 * @returns {Promise<Float32Array>} The query embedding.
 */
async function embedQuery(query) {
  const extract = await getEmbedder();
  const options = { pooling: "mean", normalize: true };
  const output = await extract(query, options);

  return toFloat32Array(output.data);
}
47
+
48
/**
 * Reinterprets the bytes of a BLOB (a Buffer / Uint8Array view) as 32-bit floats.
 *
 * A Float32Array view can only start at a byte offset that is a multiple of 4.
 * Buffers may sit at arbitrary offsets inside their backing ArrayBuffer, so an
 * unaligned blob is copied into a fresh buffer instead of throwing RangeError.
 * Aligned blobs are still viewed in place without copying.
 *
 * @param {Uint8Array} blob - Raw bytes holding consecutive float32 values.
 * @returns {Float32Array} The decoded values; trailing bytes that do not fill a
 *   whole float are ignored.
 */
function blobToFloat32Array(blob) {
  const bytesPerElement = Float32Array.BYTES_PER_ELEMENT;
  const length = Math.floor(blob.byteLength / bytesPerElement);

  if (blob.byteOffset % bytesPerElement === 0) {
    return new Float32Array(blob.buffer, blob.byteOffset, length);
  }

  // ArrayBuffer.prototype.slice copies the range into a new buffer that starts at offset 0.
  const start = blob.byteOffset;
  const end = start + length * bytesPerElement;
  return new Float32Array(blob.buffer.slice(start, end));
}
55
+
56
/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param {ArrayLike<number>} left - First vector.
 * @param {ArrayLike<number>} right - Second vector.
 * @returns {number} The similarity; Number.NEGATIVE_INFINITY when the lengths
 *   differ, and 0 when either vector has zero magnitude.
 */
function cosineSimilarity(left, right) {
  const size = left.length;

  if (size !== right.length) {
    return Number.NEGATIVE_INFINITY;
  }

  let dot = 0;
  let squaredLeft = 0;
  let squaredRight = 0;

  for (let i = 0; i < size; i += 1) {
    const a = left[i];
    const b = right[i];

    dot += a * b;
    squaredLeft += a * a;
    squaredRight += b * b;
  }

  const hasZeroVector = squaredLeft === 0 || squaredRight === 0;

  return hasZeroVector ? 0 : dot / (Math.sqrt(squaredLeft) * Math.sqrt(squaredRight));
}
77
+
78
/**
 * Reads every chunk row (joined with its file path) from the vector database.
 *
 * The database is opened read-only and must already exist; it is closed again
 * before this function returns, whether or not the query succeeds.
 *
 * @param {string} databasePath - Path to the SQLite database file.
 * @returns {Array<object>} One object per chunk with `id`, `file`, `name`,
 *   `startLine`, `endLine`, and `embedding` decoded via blobToFloat32Array.
 */
function loadChunks(databasePath) {
  const connection = new Database(databasePath, { readonly: true, fileMustExist: true });

  try {
    const statement = connection.prepare(`
      SELECT
        chunks.id,
        files.path AS file,
        chunks.name,
        chunks.startLine,
        chunks.endLine,
        chunks.embedding
      FROM chunks
      INNER JOIN files ON files.id = chunks.file_id
    `);
    const rows = statement.all();
    const decodeRow = (row) => ({ ...row, embedding: blobToFloat32Array(row.embedding) });

    return rows.map(decodeRow);
  } finally {
    connection.close();
  }
}
100
+
101
/**
 * Scans all chunks and returns the one most similar to the query embedding.
 *
 * On equal scores the earlier chunk wins, because a later chunk only replaces
 * the current best when its score is strictly greater.
 *
 * @param {ArrayLike<number>} queryEmbedding - Embedding of the search query.
 * @param {Iterable<object>} chunks - Chunks, each carrying an `embedding`.
 * @returns {{chunk: object, score: number}|null} The best match, or null when
 *   there are no chunks.
 */
function searchChunks(queryEmbedding, chunks) {
  let best = null;

  for (const chunk of chunks) {
    const score = cosineSimilarity(queryEmbedding, chunk.embedding);
    const improves = best === null || score > best.score;

    if (improves) {
      best = { chunk, score };
    }
  }

  return best;
}
117
+
118
/**
 * Picks a percentile from an ascending-sorted list using the nearest-rank rule.
 *
 * @param {number[]} sortedValues - Values already sorted in ascending order.
 * @param {number} percentileValue - Percentile on a 0-100 scale.
 * @returns {number} The selected value, or 0 for an empty list.
 */
function percentile(sortedValues, percentileValue) {
  const count = sortedValues.length;

  if (count === 0) {
    return 0;
  }

  const rank = Math.ceil((percentileValue / 100) * count) - 1;
  // Clamp so percentiles of 0 and 100 stay inside the array bounds.
  const upperBounded = Math.min(rank, count - 1);
  const position = Math.max(0, upperBounded);

  return sortedValues[position];
}
126
+
127
/**
 * Summarizes a list of latency samples.
 *
 * The input is not mutated; percentiles are taken from a sorted copy.
 * An empty list yields 0 for every statistic, matching the value percentile()
 * reports for empty input (the average would otherwise be NaN from 0 / 0).
 *
 * @param {number[]} latencies - Samples in milliseconds.
 * @returns {{average: number, p50: number, p90: number, p95: number}} The summary.
 */
function summarizeLatencies(latencies) {
  const sorted = [...latencies].sort((left, right) => left - right);
  const total = latencies.reduce((sum, value) => sum + value, 0);
  const count = latencies.length;

  return {
    average: count === 0 ? 0 : total / count,
    p50: percentile(sorted, 50),
    p90: percentile(sorted, 90),
    p95: percentile(sorted, 95),
  };
}
138
+
139
/**
 * Benchmark entry point.
 *
 * Reads the target directory (first CLI argument, default ".") and the query
 * (second CLI argument, default DEFAULT_QUERY), loads all chunks from
 * `<target>/.contextualizer/vector.db`, embeds the query once, then times
 * ITERATIONS in-memory searches and prints latency statistics plus the best match.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the database contains no chunks.
 */
async function main() {
  const [, , targetArg, queryArg] = process.argv;
  const target = path.resolve(targetArg || ".");
  const query = queryArg || DEFAULT_QUERY;
  const databasePath = path.join(target, ".contextualizer", "vector.db");
  const chunks = loadChunks(databasePath);

  if (chunks.length === 0) {
    throw new Error(`No chunks found in ${databasePath}. Run "node index.js index ${target}" first.`);
  }

  const header = [
    `Database: ${databasePath}`,
    `Chunks: ${chunks.length}`,
    `Query: ${query}`,
    `Loading model: ${MODEL_ID}`,
  ];

  for (const line of header) {
    console.log(line);
  }

  const queryEmbedding = await embedQuery(query);

  // Untimed warm-up pass so first-call overhead does not skew the recorded samples.
  searchChunks(queryEmbedding, chunks);

  const latencies = [];

  for (let run = 0; run < ITERATIONS; run += 1) {
    const startedAt = performance.now();
    searchChunks(queryEmbedding, chunks);
    latencies.push(performance.now() - startedAt);
  }

  const { average, p50, p90, p95 } = summarizeLatencies(latencies);
  const best = searchChunks(queryEmbedding, chunks);

  console.log("");
  console.log(`Iterations: ${ITERATIONS}`);
  console.log(`Average: ${average.toFixed(4)} ms`);
  console.log(`p50: ${p50.toFixed(4)} ms`);
  console.log(`p90: ${p90.toFixed(4)} ms`);
  console.log(`p95: ${p95.toFixed(4)} ms`);

  if (!best) {
    return;
  }

  const { chunk, score } = best;

  console.log("");
  console.log("Best match:");
  console.log(` File: ${chunk.file}`);
  console.log(` Line: ${chunk.startLine}-${chunk.endLine}`);
  console.log(` Symbol: ${chunk.name}`);
  console.log(` Score: ${score.toFixed(4)}`);
}
185
+
186
// Run the benchmark; on failure print the message and signal a non-zero exit
// through process.exitCode rather than exiting immediately.
main().catch((failure) => {
  console.error(`Benchmark failed: ${failure.message}`);
  process.exitCode = 1;
});
package/src/abort.js ADDED
@@ -0,0 +1,27 @@
1
/**
 * Builds the error that represents an aborted operation.
 *
 * @param {{reason?: *}|null|undefined} signal - Abort signal, if any.
 * @returns {Error} `signal.reason` itself when it is an Error; otherwise a new
 *   Error named "AbortError" whose message is the stringified reason, or
 *   "Operation aborted" when there is no truthy reason.
 */
function createAbortError(signal) {
  const reason = signal ? signal.reason : undefined;

  if (reason instanceof Error) {
    return reason;
  }

  const message = reason ? String(reason) : "Operation aborted";

  return Object.assign(new Error(message), { name: "AbortError" });
}
12
+
13
/**
 * Throws the abort error for a signal that has already been aborted.
 *
 * @param {{aborted?: boolean}|null|undefined} signal - Abort signal, if any.
 * @returns {void} Returns normally when there is no signal or it is not aborted.
 * @throws {Error} The error produced by createAbortError(signal).
 */
function throwIfAborted(signal) {
  const isAborted = Boolean(signal && signal.aborted);

  if (!isAborted) {
    return;
  }

  throw createAbortError(signal);
}
18
+
19
/**
 * Reports whether an error looks like an abort or interruption.
 *
 * @param {*} error - Value to inspect.
 * @returns {boolean} True when the error is named "AbortError" or its message
 *   contains "aborted" or "interrupted" (case-insensitive); false for falsy input.
 */
function isAbortError(error) {
  if (!error) {
    return false;
  }

  if (error.name === "AbortError") {
    return true;
  }

  return /aborted|interrupted/i.test(error.message || "");
}
22
+
23
+ module.exports = {
24
+ createAbortError,
25
+ isAbortError,
26
+ throwIfAborted,
27
+ };
package/src/cache.js ADDED
@@ -0,0 +1,100 @@
1
+ const fs = require("fs/promises");
2
+ const path = require("path");
3
+
4
/** Directory, relative to the indexed root, that holds the cache file. */
const CACHE_DIR = ".contextualizer";

/** File name of the JSON cache inside CACHE_DIR. */
const CACHE_FILE = "cache.json";

/** Version number stamped into every cache object written by this module. */
const CACHE_VERSION = 1;
7
+
8
/**
 * Resolves where the cache file lives for a given root directory.
 *
 * @param {string} root - Root directory of the indexed project.
 * @returns {string} `<root>/<CACHE_DIR>/<CACHE_FILE>` joined for the current platform.
 */
function getCachePath(root) {
  const segments = [root, CACHE_DIR, CACHE_FILE];

  return path.join(...segments);
}
11
+
12
/**
 * Creates a cache object that has never been saved and tracks no files.
 *
 * @param {string} root - Root directory the cache belongs to.
 * @returns {{version: number, root: string, updatedAt: null, files: object}}
 *   A new cache with the current CACHE_VERSION and an empty `files` map.
 */
function createEmptyCache(root) {
  const files = {};
  const updatedAt = null;

  return { version: CACHE_VERSION, root, updatedAt, files };
}
20
+
21
/**
 * Coerces a parsed cache value into the expected cache shape.
 *
 * @param {*} cache - Value parsed from the cache file.
 * @param {string} root - Root used when the cache carries none.
 * @returns {{version: number, root: string, updatedAt: (string|null), files: object}}
 *   An empty cache when `cache` is not an object; otherwise a new object whose
 *   missing or falsy fields are replaced with defaults. A valid `files` object
 *   is reused by reference, not copied.
 */
function normalizeCache(cache, root) {
  const isObject = Boolean(cache) && typeof cache === "object";

  if (!isObject) {
    return createEmptyCache(root);
  }

  const { version, root: cachedRoot, updatedAt, files } = cache;
  const hasFileMap = Boolean(files) && typeof files === "object";

  return {
    version: version || CACHE_VERSION,
    root: cachedRoot || root,
    updatedAt: updatedAt || null,
    files: hasFileMap ? files : {},
  };
}
33
+
34
/**
 * Returns a shallow copy of an object whose own enumerable keys are inserted
 * in sorted order (default string sort).
 *
 * The copy is built with Object.fromEntries so that every key, including
 * "__proto__", becomes an own data property. Plain assignment
 * (`target[key] = value`) would instead invoke the `__proto__` setter, dropping
 * that key and replacing the result's prototype.
 *
 * @param {object} value - Object to copy.
 * @returns {object} A new plain object with the same entries, keys sorted.
 */
function sortObjectByKey(value) {
  const entries = Object.keys(value)
    .sort()
    .map((key) => [key, value[key]]);

  return Object.fromEntries(entries);
}
42
+
43
/**
 * Loads and normalizes the cache file for a root directory.
 *
 * @param {string} root - Root directory of the indexed project.
 * @returns {Promise<{cache: object, exists: boolean, cachePath: string}>}
 *   The normalized cache and `exists: true` when the file was read, or an empty
 *   cache and `exists: false` when the file does not exist (ENOENT).
 * @throws {Error} "Cache file is not valid JSON: <path>" when parsing fails; the
 *   underlying SyntaxError is attached as `cause`. Any other read error is
 *   rethrown unchanged.
 */
async function loadCache(root) {
  const cachePath = getCachePath(root);
  let raw;

  try {
    raw = await fs.readFile(cachePath, "utf8");
  } catch (error) {
    if (error.code === "ENOENT") {
      return {
        cache: createEmptyCache(root),
        exists: false,
        cachePath,
      };
    }

    throw error;
  }

  let parsed;

  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    // Keep the parser's message (position, unexpected token) reachable via `cause`.
    throw new Error(`Cache file is not valid JSON: ${cachePath}`, { cause: error });
  }

  return {
    cache: normalizeCache(parsed, root),
    exists: true,
    cachePath,
  };
}
71
+
72
/**
 * Writes the cache for a root directory atomically.
 *
 * The JSON is written to a uniquely named temporary file next to the target and
 * then renamed over it, so readers never observe a partially written cache. If
 * writing or renaming fails, the temporary file is removed before the error is
 * rethrown so failed saves do not accumulate `.tmp` files.
 *
 * @param {string} root - Root directory of the indexed project.
 * @param {object} files - Map of file entries; stored with keys sorted.
 * @returns {Promise<{cache: object, cachePath: string}>} The cache object that
 *   was written and the path it was written to.
 * @throws {Error} Any error raised while creating the directory, writing, or renaming.
 */
async function saveCache(root, files) {
  const cachePath = getCachePath(root);
  const cacheDir = path.dirname(cachePath);
  const cache = {
    version: CACHE_VERSION,
    root,
    updatedAt: new Date().toISOString(),
    files: sortObjectByKey(files),
  };
  const tmpPath = `${cachePath}.${process.pid}.${Date.now()}.tmp`;

  await fs.mkdir(cacheDir, { recursive: true });

  try {
    await fs.writeFile(tmpPath, `${JSON.stringify(cache, null, 2)}\n`, "utf8");
    await fs.rename(tmpPath, cachePath);
  } catch (error) {
    // Best-effort cleanup: the temp file may not exist, and the original failure
    // is the error callers need to see, so a failed unlink is deliberately ignored.
    await fs.unlink(tmpPath).catch(() => {});
    throw error;
  }

  return {
    cache,
    cachePath,
  };
}
92
+
93
+ module.exports = {
94
+ CACHE_DIR,
95
+ CACHE_FILE,
96
+ CACHE_VERSION,
97
+ getCachePath,
98
+ loadCache,
99
+ saveCache,
100
+ };
@@ -0,0 +1,20 @@
1
/**
 * Maps items through an async mapper with bounded concurrency, preserving order.
 *
 * @param {Array} items - Items to process.
 * @param {number} limit - Maximum number of mapper calls in flight. Values below
 *   1 are treated as 1; a missing or non-numeric limit falls back to 1 instead
 *   of silently starting no workers.
 * @param {function(*, number): *} mapper - Called with `(item, index)`; may return a promise.
 * @returns {Promise<Array>} Results in the same order as `items`.
 * @throws Rejects with the first mapper error. Once a mapper has failed, no
 *   further items are started (calls already in flight are not cancelled).
 */
async function mapLimit(items, limit, mapper) {
  const results = new Array(items.length);
  const requested = Math.floor(Number(limit));
  // NaN would propagate through Math.min/Math.max and produce zero workers.
  const safeLimit = Number.isNaN(requested) ? 1 : requested;
  const workerCount = Math.max(1, Math.min(safeLimit, items.length || 1));
  let nextIndex = 0;
  let failed = false;

  async function runWorker() {
    while (!failed && nextIndex < items.length) {
      const currentIndex = nextIndex;
      nextIndex += 1;

      try {
        results[currentIndex] = await mapper(items[currentIndex], currentIndex);
      } catch (error) {
        // The overall promise is about to reject; stop sibling workers from taking more items.
        failed = true;
        throw error;
      }
    }
  }

  await Promise.all(Array.from({ length: workerCount }, runWorker));
  return results;
}
17
+
18
+ module.exports = {
19
+ mapLimit,
20
+ };
@@ -0,0 +1,172 @@
1
+ const fs = require("fs");
2
+ const path = require("path");
3
+ const Database = require("better-sqlite3");
4
+ const { CACHE_DIR } = require("./cache");
5
+ const { registerCleanup } = require("./shutdown");
6
+
7
/** File name of the SQLite vector database. */
const DATABASE_FILE = "vector.db";

/** Database handles that have been opened through this module and not yet closed. */
const openDatabases = new Set();

/**
 * Closes every tracked database handle on a best-effort basis.
 *
 * Iterates over a snapshot because closing a handle can remove it from the set.
 * A handle whose close() throws is dropped from the set so it is not retried.
 *
 * @returns {void}
 */
function closeOpenDatabases() {
  const snapshot = [...openDatabases];

  snapshot.forEach((db) => {
    try {
      db.close();
    } catch {
      openDatabases.delete(db);
    }
  });
}
19
+
20
+ registerCleanup(closeOpenDatabases);
21
+
22
/**
 * Resolves where the vector database lives for a given root directory.
 *
 * @param {string} root - Root directory of the indexed project.
 * @returns {string} `<root>/<CACHE_DIR>/<DATABASE_FILE>` joined for the current platform.
 */
function getDatabasePath(root) {
  const segments = [root, CACHE_DIR, DATABASE_FILE];

  return path.join(...segments);
}
25
+
26
/**
 * Checks whether the vector database file is present on disk.
 *
 * @param {string} root - Root directory of the indexed project.
 * @returns {boolean} True when the path from getDatabasePath(root) exists.
 */
function databaseExists(root) {
  const databasePath = getDatabasePath(root);

  return fs.existsSync(databasePath);
}
29
+
30
/**
 * Opens (creating if needed) the vector database for a root directory and
 * makes sure its schema exists.
 *
 * The returned handle is tracked in `openDatabases` until it is closed. Its
 * `close` method is wrapped so repeated calls are harmless and the handle is
 * removed from the tracking set on first close.
 *
 * If applying the pragmas or creating the schema fails, the handle is closed
 * before the error is rethrown, so a failed open does not leave a connection
 * (and its file lock) behind.
 *
 * @param {string} root - Root directory of the indexed project.
 * @returns {object} The open database handle.
 * @throws {Error} Any error raised while creating the directory, opening the
 *   database, or initializing it.
 */
function openDatabase(root) {
  const databasePath = getDatabasePath(root);

  fs.mkdirSync(path.dirname(databasePath), { recursive: true });

  const db = new Database(databasePath);
  const originalClose = db.close.bind(db);
  let closed = false;

  db.close = () => {
    if (closed) {
      return undefined;
    }

    closed = true;
    openDatabases.delete(db);
    return originalClose();
  };

  openDatabases.add(db);

  try {
    db.pragma("journal_mode = WAL");
    db.pragma("foreign_keys = ON");
    db.exec(`
      CREATE TABLE IF NOT EXISTS files (
        id INTEGER PRIMARY KEY,
        path TEXT NOT NULL UNIQUE,
        hash TEXT NOT NULL
      );

      CREATE TABLE IF NOT EXISTS chunks (
        id INTEGER PRIMARY KEY,
        file_id INTEGER NOT NULL,
        name TEXT NOT NULL,
        type TEXT NOT NULL,
        startLine INTEGER NOT NULL,
        endLine INTEGER NOT NULL,
        code TEXT NOT NULL,
        embedding BLOB NOT NULL,
        FOREIGN KEY (file_id) REFERENCES files(id) ON DELETE CASCADE
      );

      CREATE INDEX IF NOT EXISTS idx_chunks_file_id ON chunks(file_id);
    `);
  } catch (error) {
    try {
      db.close();
    } catch {
      // The initialization failure is the error worth reporting; ignore a failed close.
    }

    throw error;
  }

  return db;
}
77
+
78
/**
 * Inserts a file row, or updates the hash of the existing row for that path.
 *
 * @param {{selectFile: object, insertFile: object, updateFile: object}} statements
 *   Prepared statements used to look up, insert, and update file rows.
 * @param {{path: string, hash: string}} file - File path and content hash.
 * @returns {number} The id of the existing or newly inserted row.
 */
function upsertFile(statements, file) {
  const row = statements.selectFile.get(file.path);

  if (!row) {
    const { lastInsertRowid } = statements.insertFile.run(file.path, file.hash);
    return Number(lastInsertRowid);
  }

  statements.updateFile.run(file.hash, row.id);
  return row.id;
}
89
+
90
/**
 * Serializes an embedding to the bytes stored in the `embedding` BLOB column.
 *
 * Only the bytes covered by the typed-array view are used. A Float32Array that
 * is a subarray of a larger buffer therefore stores just its own elements, not
 * the whole backing buffer. Values that are not a Float32Array are converted first.
 *
 * @param {Float32Array|ArrayLike<number>} embedding - Embedding values.
 * @returns {Buffer} A Buffer over exactly the embedding's float32 bytes.
 */
function embeddingToBlob(embedding) {
  const view = embedding instanceof Float32Array ? embedding : Float32Array.from(embedding);

  return Buffer.from(view.buffer, view.byteOffset, view.byteLength);
}

/**
 * Persists embedding results to the vector database in a single transaction.
 *
 * Rows for removed paths are deleted first. Then, for every successful file
 * result that has a matching entry in `filesByRelativePath`, the file row is
 * upserted, its existing chunks are deleted, and its new chunks are inserted.
 * The database is always closed before returning.
 *
 * @param {string} root - Root directory of the indexed project.
 * @param {Object<string, {hash: string}>} filesByRelativePath - File metadata keyed by relative path.
 * @param {{files: Array<object>}} embeddingSummary - Per-file results; each has
 *   `ok`, `relativePath`, and (when ok) `chunks`.
 * @param {string[]} [removedPaths=[]] - Relative paths whose rows should be deleted.
 * @returns {{databasePath: string, filesRemoved: number, filesWritten: number, chunksWritten: number}}
 *   Counts derived from the inputs: `filesWritten` and `chunksWritten` count all
 *   results with `ok` set, including any skipped for lack of a metadata entry.
 */
function writeEmbeddingResults(root, filesByRelativePath, embeddingSummary, removedPaths = []) {
  const db = openDatabase(root);

  try {
    const statements = {
      selectFile: db.prepare("SELECT id FROM files WHERE path = ?"),
      insertFile: db.prepare("INSERT INTO files (path, hash) VALUES (?, ?)"),
      updateFile: db.prepare("UPDATE files SET hash = ? WHERE id = ?"),
      deleteFile: db.prepare("DELETE FROM files WHERE path = ?"),
      deleteChunks: db.prepare("DELETE FROM chunks WHERE file_id = ?"),
      insertChunk: db.prepare(`
        INSERT INTO chunks (
          file_id,
          name,
          type,
          startLine,
          endLine,
          code,
          embedding
        ) VALUES (?, ?, ?, ?, ?, ?, ?)
      `),
    };
    const persist = db.transaction((fileResults, deletedPaths) => {
      for (const removedPath of deletedPaths) {
        statements.deleteFile.run(removedPath);
      }

      for (const fileResult of fileResults) {
        if (!fileResult.ok) {
          continue;
        }

        const file = filesByRelativePath[fileResult.relativePath];

        if (!file) {
          continue;
        }

        const fileId = upsertFile(statements, {
          path: fileResult.relativePath,
          hash: file.hash,
        });

        // Replace, rather than merge, the chunks previously stored for this file.
        statements.deleteChunks.run(fileId);

        for (const chunk of fileResult.chunks) {
          statements.insertChunk.run(
            fileId,
            chunk.name,
            chunk.type,
            chunk.startLine,
            chunk.endLine,
            chunk.code,
            embeddingToBlob(chunk.embedding),
          );
        }
      }
    });

    persist(embeddingSummary.files, removedPaths);

    return {
      databasePath: getDatabasePath(root),
      filesRemoved: removedPaths.length,
      filesWritten: embeddingSummary.files.filter((fileResult) => fileResult.ok).length,
      chunksWritten: embeddingSummary.files.reduce(
        (total, fileResult) => total + (fileResult.ok ? fileResult.chunks.length : 0),
        0,
      ),
    };
  } finally {
    db.close();
  }
}
164
+
165
+ module.exports = {
166
+ DATABASE_FILE,
167
+ closeOpenDatabases,
168
+ databaseExists,
169
+ getDatabasePath,
170
+ openDatabase,
171
+ writeEmbeddingResults,
172
+ };
@@ -0,0 +1,113 @@
1
+ const path = require("path");
2
+
3
/** Largest file size, in bytes, accepted by getFileSizeSkipReason (1 MiB). */
const MAX_FILE_SIZE = 1024 * 1024;

/** Longest single line, in characters, accepted by getContentSkipReason. */
const MAX_LINE_LENGTH = 20_000;

/** Matches names such as `min.js`, `app.min.js`, or `app-min.mjs` (case-insensitive). */
const MINIFIED_SOURCE_PATTERN = /(?:^|[.-])min\.(?:cjs|js|jsx|mjs|ts|tsx)$/i;

/** Lower-case file extensions (with leading dot) that are treated as binary. */
const BINARY_EXTENSIONS = new Set([
  ".7z", ".a", ".app", ".avi", ".bmp", ".class", ".dll",
  ".dmg", ".doc", ".docx", ".dylib", ".eot", ".exe", ".gif",
  ".gz", ".ico", ".jar", ".jpeg", ".jpg", ".mov", ".mp3",
  ".mp4", ".o", ".obj", ".otf", ".pdf", ".png", ".ppt",
  ".pptx", ".rar", ".so", ".sqlite", ".tar", ".ttf", ".wasm",
  ".webm", ".webp", ".woff", ".woff2", ".xls", ".xlsx", ".zip",
]);
50
+
51
/**
 * Reports whether a file name ends in a known binary extension.
 *
 * @param {string} fileName - File name or path.
 * @returns {boolean} True when the lower-cased extension is in BINARY_EXTENSIONS.
 */
function isBinaryExtension(fileName) {
  const extension = path.extname(fileName).toLowerCase();

  return BINARY_EXTENSIONS.has(extension);
}
54
+
55
/**
 * Reports whether a file name follows the minified-source naming convention.
 *
 * @param {string} fileName - File name to test against MINIFIED_SOURCE_PATTERN.
 * @returns {boolean} True when the pattern matches.
 */
function isMinifiedSourceName(fileName) {
  const matches = MINIFIED_SOURCE_PATTERN.test(fileName);

  return matches;
}
58
+
59
/**
 * Explains why a file should be skipped because of its size, if at all.
 *
 * @param {number} size - File size in bytes.
 * @returns {string|null} A reason when `size` exceeds MAX_FILE_SIZE, otherwise null.
 */
function getFileSizeSkipReason(size) {
  const isTooLarge = size > MAX_FILE_SIZE;

  return isTooLarge ? `file is larger than ${MAX_FILE_SIZE} bytes` : null;
}
66
+
67
/**
 * Explains why a file should be skipped because of its name, if at all.
 *
 * @param {string} fileName - File name to inspect.
 * @returns {string|null} "minified source file" when isMinifiedSourceName
 *   matches, otherwise null.
 */
function getSourcePathSkipReason(fileName) {
  return isMinifiedSourceName(fileName) ? "minified source file" : null;
}
74
+
75
/**
 * Explains why file content should be skipped, if at all.
 *
 * Content containing a NUL character is reported as binary. Otherwise the
 * content is split at every carriage return or line feed and the longest
 * segment is compared against MAX_LINE_LENGTH.
 *
 * @param {string} code - Full text content of the file.
 * @returns {string|null} A reason string, or null when the content is acceptable.
 */
function getContentSkipReason(code) {
  if (code.includes("\0")) {
    return "binary content";
  }

  const longestLine = code
    .split(/[\r\n]/)
    .reduce((longest, line) => Math.max(longest, line.length), 0);

  return longestLine > MAX_LINE_LENGTH
    ? `line is longer than ${MAX_LINE_LENGTH} characters`
    : null;
}
103
+
104
+ module.exports = {
105
+ BINARY_EXTENSIONS,
106
+ MAX_FILE_SIZE,
107
+ MAX_LINE_LENGTH,
108
+ getContentSkipReason,
109
+ getFileSizeSkipReason,
110
+ getSourcePathSkipReason,
111
+ isBinaryExtension,
112
+ isMinifiedSourceName,
113
+ };
@@ -0,0 +1,17 @@
1
+ const { createHash } = require("crypto");
2
+ const { createReadStream } = require("fs");
3
+
4
/**
 * Computes the SHA-256 digest of a file by streaming its contents.
 *
 * @param {string} filePath - Path of the file to hash.
 * @returns {Promise<string>} The digest as a lower-case hex string; rejects with
 *   the stream's error when the file cannot be read.
 */
async function hashFile(filePath) {
  const digest = createHash("sha256");

  // Async iteration ends when the stream ends and throws if it errors.
  for await (const chunk of createReadStream(filePath)) {
    digest.update(chunk);
  }

  return digest.digest("hex");
}
14
+
15
+ module.exports = {
16
+ hashFile,
17
+ };