codebase-contextualizer-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/walker.js ADDED
@@ -0,0 +1,214 @@
1
+ const fs = require("fs/promises");
2
+ const path = require("path");
3
+ const ignore = require("ignore");
4
+ const { throwIfAborted } = require("./abort");
5
+ const { CACHE_DIR } = require("./cache");
6
+ const {
7
+ MAX_FILE_SIZE,
8
+ getFileSizeSkipReason,
9
+ getSourcePathSkipReason,
10
+ isBinaryExtension,
11
+ } = require("./file-filter");
12
+ const { toRelativePosixPath, toPosixPath } = require("./paths");
13
+
14
// Directory names the walker never descends into; this check runs before any
// .gitignore rule is consulted.
const HARD_IGNORED_DIRS = new Set([
  ".git",
  "node_modules",
  CACHE_DIR,
]);
15
// Lower-case file extensions (leading dot included) that count as source code.
// Entries are kept in alphabetical order; the Set preserves that order when iterated.
const SOURCE_EXTENSIONS = new Set([
  ".c", ".cc", ".cpp", ".cs", ".cxx",
  ".go",
  ".java", ".js", ".jsx",
  ".kt",
  ".mjs",
  ".php", ".py",
  ".rb", ".rs",
  ".swift",
  ".ts", ".tsx",
]);
35
+
36
/**
 * Loads the `.gitignore` that sits directly inside `directory`, if there is one.
 *
 * @param {string} directory - Directory whose `.gitignore` should be read.
 * @param {Array<{path: string, message: string}>} errors - Collector that
 *   receives an entry for any failure other than the file being absent.
 * @returns {Promise<{base: string, matcher: object} | null>} The directory
 *   paired with an `ignore` matcher built from the file's lines, or `null`
 *   when the file is missing or could not be loaded.
 */
async function readGitignore(directory, errors) {
  const gitignorePath = path.join(directory, ".gitignore");

  try {
    const text = await fs.readFile(gitignorePath, "utf8");
    // Split on both LF and CRLF so Windows-authored files yield clean patterns.
    const patterns = text.split(/\r?\n/);

    return { base: directory, matcher: ignore().add(patterns) };
  } catch (error) {
    // A missing .gitignore is the normal case and is not worth reporting.
    const isMissing = error.code === "ENOENT";

    if (!isMissing) {
      errors.push({ path: gitignorePath, message: error.message });
    }

    return null;
  }
}
58
+
59
/**
 * Tells whether `target` lies strictly below `base` in the directory tree.
 *
 * @param {string} base - Candidate ancestor directory.
 * @param {string} target - Path to test.
 * @returns {boolean} `true` only when `target` is a descendant of `base`;
 *   `false` when the two are the same path, when `target` is outside `base`,
 *   or when no relative path exists (e.g. different drives on Windows).
 */
function isInside(base, target) {
  const relative = path.relative(base, target);

  // An empty relative path means base and target are the same location.
  // Return a real boolean instead of leaking the empty string to callers.
  if (relative === "") {
    return false;
  }

  // Compare against ".." plus the separator so that names which merely start
  // with two dots (e.g. "..cache") are still treated as being inside.
  const escapesBase = relative === ".." || relative.startsWith(`..${path.sep}`);

  return !escapesBase && !path.isAbsolute(relative);
}
68
+
69
/**
 * Decides whether `target` is ignored by the stacked `.gitignore` contexts.
 *
 * Contexts are expected in shallow-to-deep order (the order in which the
 * walker appends them while descending). Following git's rule that patterns
 * in a deeper `.gitignore` override those from parent directories, the last
 * context that has an opinion about the path wins: a deeper negation
 * (`!keep.js`) re-includes a file that a parent rule ignored, and a deeper
 * positive rule ignores a file the parent left alone.
 *
 * @param {Array<{base: string, matcher: object}>} contexts - Active contexts;
 *   each matcher must provide the `ignore` package's `test(path)` method,
 *   which returns `{ ignored, unignored }`.
 * @param {string} target - Path of the entry being checked.
 * @param {boolean} isDirectory - Whether the entry is a directory; directories
 *   are matched with a trailing slash so directory-only patterns apply.
 * @returns {boolean} `true` when the entry should be skipped.
 */
function isIgnoredByContexts(contexts, target, isDirectory) {
  let ignored = false;

  for (const context of contexts) {
    // A .gitignore only governs entries located below its own directory.
    if (!isInside(context.base, target)) {
      continue;
    }

    const relativePath = toPosixPath(path.relative(context.base, target));
    const candidate = isDirectory ? `${relativePath}/` : relativePath;
    const verdict = context.matcher.test(candidate);

    if (verdict.ignored) {
      ignored = true;
    } else if (verdict.unignored) {
      // Explicit negation in this (deeper) file overrides earlier verdicts.
      ignored = false;
    }
  }

  return ignored;
}
85
+
86
/**
 * Checks a file name against the known source-code extensions.
 *
 * @param {string} fileName - File name or path.
 * @returns {boolean} Whether its extension, compared case-insensitively, is
 *   listed in `SOURCE_EXTENSIONS`.
 */
function isSourceFile(fileName) {
  const extension = path.extname(fileName);
  const normalized = extension.toLowerCase();

  return SOURCE_EXTENSIONS.has(normalized);
}
89
+
90
/**
 * Recursively collects source files below `root`.
 *
 * Directories are visited depth-first with entries sorted by name. Symbolic
 * links are never followed, hard-ignored directory names are skipped, and
 * each directory's `.gitignore` is applied to everything beneath it. Files
 * are kept only when they have a source extension and pass the path and size
 * filters.
 *
 * @param {string} root - Directory to start from.
 * @param {{signal?: object}} [options] - `signal` is handed to
 *   `throwIfAborted` before each directory and each entry.
 * @returns {Promise<{
 *   files: Array<{absolutePath: string, relativePath: string, size: number, mtimeMs: number}>,
 *   errors: Array<{path: string, message: string}>
 * }>} Accepted files plus every problem encountered; files that were
 *   deliberately skipped appear in `errors` with a `Skipped: ` prefix.
 */
async function walkSourceFiles(root, options = {}) {
  const files = [];
  const errors = [];
  const visitedDirectories = new Set();

  const recordError = (targetPath, message) => {
    errors.push({ path: targetPath, message });
  };

  const recordSkip = (absolutePath, reason) => {
    recordError(absolutePath, `Skipped: ${reason}`);
  };

  // Identity of a directory: device + inode + resolved path (lower-cased on
  // Windows), so the same directory is never walked twice.
  async function getDirectoryKey(directory) {
    const resolved = await fs.realpath(directory);
    const { dev, ino } = await fs.stat(resolved);
    const comparablePath = process.platform === "win32" ? resolved.toLowerCase() : resolved;

    return `${dev}:${ino}:${comparablePath}`;
  }

  // Applies the extension, path and size filters to one regular file.
  async function collectFile(fileName, absolutePath) {
    if (isBinaryExtension(fileName) || !isSourceFile(fileName)) {
      return;
    }

    const pathSkipReason = getSourcePathSkipReason(fileName);

    if (pathSkipReason) {
      recordSkip(absolutePath, pathSkipReason);
      return;
    }

    try {
      const stat = await fs.stat(absolutePath);
      const sizeSkipReason = getFileSizeSkipReason(stat.size);

      if (sizeSkipReason) {
        recordSkip(absolutePath, sizeSkipReason);
        return;
      }

      files.push({
        absolutePath,
        relativePath: toRelativePosixPath(root, absolutePath),
        size: stat.size,
        mtimeMs: stat.mtimeMs,
      });
    } catch (error) {
      recordError(absolutePath, error.message);
    }
  }

  async function walkDirectory(directory, contexts) {
    throwIfAborted(options.signal);

    let directoryKey;

    try {
      directoryKey = await getDirectoryKey(directory);
    } catch (error) {
      recordError(directory, error.message);
      return;
    }

    if (visitedDirectories.has(directoryKey)) {
      return;
    }

    visitedDirectories.add(directoryKey);

    // Rules from this directory's .gitignore stack on top of inherited ones.
    const localContext = await readGitignore(directory, errors);
    const activeContexts = localContext ? contexts.concat([localContext]) : contexts;
    let entries;

    try {
      entries = await fs.readdir(directory, { withFileTypes: true });
    } catch (error) {
      recordError(directory, error.message);
      return;
    }

    entries.sort((left, right) => left.name.localeCompare(right.name));

    for (const entry of entries) {
      throwIfAborted(options.signal);

      if (entry.isSymbolicLink()) {
        continue;
      }

      const absolutePath = path.join(directory, entry.name);

      if (entry.isDirectory()) {
        const skipDirectory =
          HARD_IGNORED_DIRS.has(entry.name) ||
          isIgnoredByContexts(activeContexts, absolutePath, true);

        if (!skipDirectory) {
          await walkDirectory(absolutePath, activeContexts);
        }

        continue;
      }

      if (entry.isFile() && !isIgnoredByContexts(activeContexts, absolutePath, false)) {
        await collectFile(entry.name, absolutePath);
      }
    }
  }

  await walkDirectory(root, []);

  return { files, errors };
}
209
+
210
+ module.exports = {
211
+ SOURCE_EXTENSIONS,
212
+ MAX_FILE_SIZE,
213
+ walkSourceFiles,
214
+ };
@@ -0,0 +1,178 @@
1
+ const os = require("os");
2
+ const path = require("path");
3
+ const { Worker } = require("worker_threads");
4
+ const { registerCleanup } = require("./shutdown");
5
+
6
// Upper bound on the number of worker threads a pool may run.
const MAX_WORKER_COUNT = 4;

/**
 * Clamps a requested worker count to a whole number in
 * `[1, MAX_WORKER_COUNT]`.
 *
 * Fractional requests are rounded down so the pool's reported size matches
 * the number of workers actually spawned, and values that are not numbers
 * (`undefined`, `NaN`, non-numeric strings) fall back to a single worker
 * instead of propagating `NaN`.
 *
 * @param {number} count - Requested number of workers.
 * @returns {number} Integer between 1 and `MAX_WORKER_COUNT`, inclusive.
 */
function normalizeWorkerCount(count) {
  const requested = Math.floor(Number(count));

  if (Number.isNaN(requested)) {
    return 1;
  }

  return Math.min(MAX_WORKER_COUNT, Math.max(1, requested));
}
11
+
12
/**
 * Picks a default pool size: one worker fewer than the number of CPUs
 * reported by the OS, passed through `normalizeWorkerCount`.
 *
 * @returns {number} Default number of workers.
 */
function getDefaultWorkerCount() {
  const cpuCount = os.cpus().length;
  const spareCpus = cpuCount - 1;

  return normalizeWorkerCount(spareCpus);
}
15
+
16
/**
 * Fixed-size pool of worker threads. Tasks are queued and handed to idle
 * workers one at a time; each worker is expected to answer every task with a
 * single message of the form `{ ok, result }` or `{ ok: false, error }`.
 */
class WorkerPool {
  /**
   * @param {object} [options]
   * @param {string|URL} [options.workerPath] - Script given to `new Worker()`;
   *   defaults to `worker.js` next to this module.
   * @param {number} [options.size] - Requested number of workers; normalized
   *   by `normalizeWorkerCount`. Falsy values select the default count.
   */
  constructor(options = {}) {
    this.workerPath = options.workerPath || path.join(__dirname, "worker.js");
    this.size = normalizeWorkerCount(options.size || getDefaultWorkerCount());
    this.queue = [];
    this.workers = new Set();
    this.closed = false;
    this.closePromise = null;
    // Make sure the threads are torn down when the process shuts down.
    this.unregisterCleanup = registerCleanup(() => this.close());

    for (let index = 0; index < this.size; index += 1) {
      this.spawnWorker();
    }
  }

  /**
   * Queues a task for execution on the next idle worker.
   *
   * @param {*} task - Value posted to the worker; it must be structured-cloneable.
   * @returns {Promise<*>} Resolves with the worker's `result`; rejects when
   *   the worker reports failure, the task cannot be posted, the worker dies,
   *   or the pool is closed.
   */
  run(task) {
    if (this.closed) {
      return Promise.reject(new Error("Worker pool is closed"));
    }

    return new Promise((resolve, reject) => {
      this.queue.push({
        task,
        resolve,
        reject,
      });
      this.dispatch();
    });
  }

  /**
   * Shuts the pool down: rejects queued and in-flight tasks and terminates
   * every worker. Safe to call repeatedly; later calls wait for the same
   * shutdown.
   *
   * @returns {Promise<void>}
   */
  async close() {
    if (this.closePromise) {
      return this.closePromise;
    }

    this.closed = true;

    this.closePromise = (async () => {
      if (this.unregisterCleanup) {
        this.unregisterCleanup();
        this.unregisterCleanup = null;
      }

      for (const queuedTask of this.queue.splice(0)) {
        queuedTask.reject(new Error("Worker pool closed before task started"));
      }

      // Snapshot first: each worker's "exit" listener removes its record from
      // `this.workers` while termination is in progress.
      const records = Array.from(this.workers);

      // Reject in-flight tasks before terminating. Otherwise the "exit"
      // listener gets there first and reports a misleading "Worker exited"
      // error instead of the pool-closed one.
      for (const record of records) {
        if (record.currentTask) {
          record.currentTask.reject(new Error("Worker pool closed before task completed"));
          record.currentTask = null;
        }
      }

      await Promise.allSettled(records.map((record) => record.worker.terminate()));

      this.workers.clear();
    })();

    return this.closePromise;
  }

  /**
   * Starts one worker thread and wires up its event handlers.
   *
   * @returns {object|null} The bookkeeping record for the new worker, or
   *   `null` when the pool is already closed.
   */
  spawnWorker() {
    if (this.closed) {
      return null;
    }

    const worker = new Worker(this.workerPath);
    const record = {
      worker,
      idle: true,
      currentTask: null,
      replaced: false,
    };

    this.workers.add(record);

    worker.on("message", (message) => {
      this.finishTask(record, null, message);
    });

    worker.on("error", (error) => {
      this.failWorker(record, error);
    });

    worker.on("exit", (code) => {
      this.workers.delete(record);

      if (record.currentTask) {
        record.currentTask.reject(new Error(`Worker exited before completing task with code ${code}`));
        record.currentTask = null;
      }

      // `replaced` is set by failWorker, which has already spawned a
      // successor; do not spawn a second one for the same failure.
      if (!this.closed && code !== 0 && !record.replaced) {
        this.spawnWorker();
        this.dispatch();
      }
    });

    return record;
  }

  /**
   * Handles a worker "error" event: fails its current task and replaces the
   * worker while the pool is still open.
   *
   * @param {object} record - Bookkeeping record of the failed worker.
   * @param {Error} error - Error emitted by the worker.
   */
  failWorker(record, error) {
    this.workers.delete(record);
    record.idle = false;
    record.replaced = true;

    if (record.currentTask) {
      record.currentTask.reject(error);
      record.currentTask = null;
    }

    if (!this.closed) {
      this.spawnWorker();
      this.dispatch();
    }
  }

  /**
   * Settles the task a worker was running and frees the worker for more work.
   *
   * @param {object} record - Bookkeeping record of the worker.
   * @param {Error|null} error - Failure to report, if any.
   * @param {{ok?: boolean, result?: *, error?: string}} [message] - Reply
   *   received from the worker.
   */
  finishTask(record, error, message) {
    if (!record.currentTask) {
      return;
    }

    const completedTask = record.currentTask;
    record.currentTask = null;
    record.idle = true;

    if (error) {
      completedTask.reject(error);
    } else if (message && message.ok === false) {
      completedTask.reject(new Error(message.error || "Worker task failed"));
    } else {
      // Tolerate an empty reply instead of throwing inside the event handler.
      completedTask.resolve(message ? message.result : undefined);
    }

    this.dispatch();
  }

  /**
   * Hands queued tasks to idle workers until either runs out.
   */
  dispatch() {
    if (this.closed || this.queue.length === 0) {
      return;
    }

    for (const record of this.workers) {
      while (record.idle && this.queue.length > 0) {
        const queuedTask = this.queue.shift();

        try {
          record.worker.postMessage(queuedTask.task);
        } catch (error) {
          // The task could not be posted (e.g. it is not cloneable). Fail only
          // that task and keep the worker available for the next one.
          queuedTask.reject(error);
          continue;
        }

        record.idle = false;
        record.currentTask = queuedTask;
      }

      if (this.queue.length === 0) {
        break;
      }
    }
  }
}
173
+
174
+ module.exports = {
175
+ MAX_WORKER_COUNT,
176
+ WorkerPool,
177
+ getDefaultWorkerCount,
178
+ };
package/src/worker.js ADDED
@@ -0,0 +1,124 @@
1
+ const fs = require("fs/promises");
2
+ const { parentPort } = require("worker_threads");
3
+ const { pipeline } = require("@xenova/transformers");
4
+ const { getContentSkipReason } = require("./file-filter");
5
+ const { getParser, parseSourceFile } = require("./parser");
6
+
7
// Identifier of the embedding model requested from the transformers pipeline.
const MODEL_ID = "Xenova/all-MiniLM-L6-v2";

// Memoized promise for the feature-extraction pipeline; unset until first use
// and reset after a failed load.
let embedderPromise;

/**
 * Returns the shared feature-extraction pipeline, creating it on first use.
 *
 * The in-progress promise is cached so concurrent callers share one load. If
 * the load fails, the cache is cleared so the next call retries, rather than
 * every later task in this worker receiving the same stale rejection.
 *
 * @returns {Promise<Function>} Promise for the embedder.
 */
function getEmbedder() {
  if (!embedderPromise) {
    const pending = pipeline("feature-extraction", MODEL_ID);

    embedderPromise = pending;

    // This handler only resets the cache; callers still observe the rejection
    // through the promise returned below.
    Promise.resolve(pending).catch(() => {
      if (embedderPromise === pending) {
        embedderPromise = undefined;
      }
    });
  }

  return embedderPromise;
}
18
+
19
/**
 * Produces a fresh `Float32Array` from embedding output.
 *
 * The result is always a new array, even when the input already is a
 * `Float32Array`.
 *
 * @param {ArrayBufferView|Array<number>} value - Typed array or plain array.
 * @returns {Float32Array} Newly allocated copy of the values.
 * @throws {Error} When `value` is neither an ArrayBuffer view nor an array.
 */
function toFloat32Array(value) {
  const isConvertible = ArrayBuffer.isView(value) || Array.isArray(value);

  if (!isConvertible) {
    throw new Error("Embedding output could not be converted to Float32Array");
  }

  return value instanceof Float32Array ? new Float32Array(value) : Float32Array.from(value);
}
34
+
35
/**
 * Embeds one chunk's `semanticText` and returns the chunk with the vector
 * attached.
 *
 * @param {Function} embedder - Called as `embedder(text, options)`; its
 *   result must expose the vector on `.data`.
 * @param {{semanticText: string}} chunk - Chunk to embed; it is not mutated.
 * @returns {Promise<object>} Copy of `chunk` with an added `embedding`
 *   (`Float32Array`) property.
 */
async function embedChunk(embedder, chunk) {
  const embeddingOptions = { pooling: "mean", normalize: true };
  const { data } = await embedder(chunk.semanticText, embeddingOptions);
  const embedding = toFloat32Array(data);

  return { ...chunk, embedding };
}
46
+
47
/**
 * Reads, parses and embeds one source file.
 *
 * @param {{filePath: string, relativePath: string}} task - File to process.
 * @returns {Promise<object>} `{ filePath, relativePath, skipped, chunks }`;
 *   when `skipped` is true the result also carries a `reason` and `chunks`
 *   is empty. Otherwise `chunks` holds the parsed chunks, each with an
 *   `embedding` added (empty when the parser found nothing).
 */
async function processFile(task) {
  const { filePath, relativePath } = task;

  // Result builders keep the shape of every return path in one place.
  const skippedResult = (reason) => ({
    filePath,
    relativePath,
    skipped: true,
    reason,
    chunks: [],
  });
  const processedResult = (chunks) => ({
    filePath,
    relativePath,
    skipped: false,
    chunks,
  });

  if (!getParser(filePath)) {
    return skippedResult("Unsupported source extension");
  }

  const code = await fs.readFile(filePath, "utf8");
  const contentSkipReason = getContentSkipReason(code);

  if (contentSkipReason) {
    return skippedResult(contentSkipReason);
  }

  const { chunks } = parseSourceFile({ filePath, relativePath, code });

  if (chunks.length === 0) {
    // Nothing to embed, so the embedding model is never requested.
    return processedResult([]);
  }

  const embedder = await getEmbedder();
  const embeddedChunks = [];

  // Chunks are embedded one after another, in parser order.
  for (const chunk of chunks) {
    embeddedChunks.push(await embedChunk(embedder, chunk));
  }

  return processedResult(embeddedChunks);
}
101
+
102
// Entry point of the worker thread: process each incoming task and reply with
// exactly one message, `{ ok: true, result }` on success or
// `{ ok: false, error }` on failure.
parentPort.on("message", async (task) => {
  try {
    const result = await processFile(task);
    // Pass each embedding's backing buffer in the transfer list so the
    // vectors are handed to the parent rather than copied.
    const embeddingBuffers = result.chunks.map((chunk) => chunk.embedding.buffer);

    parentPort.postMessage({ ok: true, result }, embeddingBuffers);
  } catch (error) {
    const description = error.stack || error.message;

    parentPort.postMessage({ ok: false, error: description });
  }
});