@softerist/heuristic-mcp 2.1.47 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent/workflows/code-review.md +60 -0
- package/.prettierrc +7 -0
- package/ARCHITECTURE.md +105 -170
- package/CONTRIBUTING.md +32 -113
- package/GEMINI.md +73 -0
- package/LICENSE +21 -21
- package/README.md +161 -54
- package/config.json +876 -75
- package/debug-pids.js +27 -0
- package/eslint.config.js +36 -0
- package/features/ann-config.js +37 -26
- package/features/clear-cache.js +28 -19
- package/features/find-similar-code.js +142 -66
- package/features/hybrid-search.js +253 -93
- package/features/index-codebase.js +1455 -394
- package/features/lifecycle.js +813 -180
- package/features/register.js +58 -52
- package/index.js +450 -306
- package/lib/cache-ops.js +22 -0
- package/lib/cache-utils.js +68 -0
- package/lib/cache.js +1392 -587
- package/lib/call-graph.js +165 -50
- package/lib/cli.js +154 -0
- package/lib/config.js +462 -121
- package/lib/embedding-process.js +77 -0
- package/lib/embedding-worker.js +545 -30
- package/lib/ignore-patterns.js +61 -59
- package/lib/json-worker.js +14 -0
- package/lib/json-writer.js +344 -0
- package/lib/logging.js +88 -0
- package/lib/memory-logger.js +13 -0
- package/lib/project-detector.js +13 -17
- package/lib/server-lifecycle.js +38 -0
- package/lib/settings-editor.js +645 -0
- package/lib/tokenizer.js +207 -104
- package/lib/utils.js +273 -198
- package/lib/vector-store-binary.js +592 -0
- package/mcp_config.example.json +13 -0
- package/package.json +13 -2
- package/scripts/clear-cache.js +6 -17
- package/scripts/download-model.js +14 -9
- package/scripts/postinstall.js +5 -5
- package/search-configs.js +36 -0
- package/test/ann-config.test.js +179 -0
- package/test/ann-fallback.test.js +6 -6
- package/test/binary-store.test.js +69 -0
- package/test/cache-branches.test.js +120 -0
- package/test/cache-errors.test.js +264 -0
- package/test/cache-extra.test.js +300 -0
- package/test/cache-helpers.test.js +205 -0
- package/test/cache-hnsw-failure.test.js +40 -0
- package/test/cache-json-worker.test.js +190 -0
- package/test/cache-worker.test.js +102 -0
- package/test/cache.test.js +443 -0
- package/test/call-graph.test.js +103 -4
- package/test/clear-cache.test.js +69 -68
- package/test/code-review-workflow.test.js +50 -0
- package/test/config.test.js +418 -0
- package/test/coverage-gap.test.js +497 -0
- package/test/coverage-maximizer.test.js +236 -0
- package/test/debug-analysis.js +107 -0
- package/test/embedding-model.test.js +173 -103
- package/test/embedding-worker-extra.test.js +272 -0
- package/test/embedding-worker.test.js +158 -0
- package/test/features.test.js +139 -0
- package/test/final-boost.test.js +271 -0
- package/test/final-polish.test.js +183 -0
- package/test/final.test.js +95 -0
- package/test/find-similar-code.test.js +191 -0
- package/test/helpers.js +92 -11
- package/test/helpers.test.js +46 -0
- package/test/hybrid-search-basic.test.js +62 -0
- package/test/hybrid-search-branch.test.js +202 -0
- package/test/hybrid-search-callgraph.test.js +229 -0
- package/test/hybrid-search-extra.test.js +81 -0
- package/test/hybrid-search.test.js +484 -71
- package/test/index-cli.test.js +520 -0
- package/test/index-codebase-batch.test.js +119 -0
- package/test/index-codebase-branches.test.js +585 -0
- package/test/index-codebase-core.test.js +1032 -0
- package/test/index-codebase-edge-cases.test.js +254 -0
- package/test/index-codebase-errors.test.js +132 -0
- package/test/index-codebase-gap.test.js +239 -0
- package/test/index-codebase-lines.test.js +151 -0
- package/test/index-codebase-watcher.test.js +259 -0
- package/test/index-codebase-zone.test.js +259 -0
- package/test/index-codebase.test.js +371 -69
- package/test/index-memory.test.js +220 -0
- package/test/indexer-detailed.test.js +176 -0
- package/test/integration.test.js +148 -92
- package/test/json-worker.test.js +50 -0
- package/test/lifecycle.test.js +541 -0
- package/test/master.test.js +198 -0
- package/test/perfection.test.js +349 -0
- package/test/project-detector.test.js +65 -0
- package/test/register.test.js +262 -0
- package/test/tokenizer.test.js +55 -93
- package/test/ultra-maximizer.test.js +116 -0
- package/test/utils-branches.test.js +161 -0
- package/test/utils-extra.test.js +116 -0
- package/test/utils.test.js +131 -0
- package/test/verify_fixes.js +76 -0
- package/test/worker-errors.test.js +96 -0
- package/test/worker-init.test.js +102 -0
- package/test/worker_throttling.test.js +93 -0
- package/tools/scripts/benchmark-search.js +95 -0
- package/tools/scripts/cache-stats.js +71 -0
- package/tools/scripts/manual-search.js +34 -0
- package/vitest.config.js +19 -9
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { pipeline, env } from '@xenova/transformers';
|
|
2
|
+
|
|
3
|
+
// Diagnostics are opt-in and always go to stderr, keeping stdout clean for IPC.
/**
 * Writes the given values to stderr via `console.error`, but only when the
 * EMBEDDING_PROCESS_VERBOSE environment variable is exactly the string 'true'.
 *
 * @param {...*} messages - Values forwarded unchanged to `console.error`.
 */
const log = (...messages) => {
  const verbose = process.env.EMBEDDING_PROCESS_VERBOSE === 'true';
  if (!verbose) {
    return;
  }
  console.error(...messages);
};
|
|
9
|
+
|
|
10
|
+
/**
 * Collects everything written to this process's stdin as one UTF-8 string.
 *
 * @returns {Promise<string>} Resolves with the concatenated input once stdin
 *   emits 'end'; rejects with the stream error if stdin emits 'error'.
 */
function readStdin() {
  return new Promise((resolve, reject) => {
    const { stdin } = process;
    const pieces = [];

    // With an encoding set, 'data' events deliver strings rather than Buffers.
    stdin.setEncoding('utf8');
    stdin.on('data', (piece) => {
      pieces.push(piece);
    });
    stdin.on('end', () => {
      resolve(pieces.join(''));
    });
    stdin.on('error', reject);
  });
}
|
|
21
|
+
|
|
22
|
+
/**
 * Returns `vector` as a Float32Array.
 *
 * A value that already is a Float32Array is returned as-is (no copy); anything
 * else is converted with `Float32Array.from`.
 *
 * @param {Float32Array|ArrayLike<number>|Iterable<number>} vector - Embedding values.
 * @returns {Float32Array} The same instance, or a newly allocated conversion.
 */
function toFloat32Array(vector) {
  return vector instanceof Float32Array ? vector : Float32Array.from(vector);
}
|
|
26
|
+
|
|
27
|
+
/**
 * Entry point of the one-shot embedding process.
 *
 * Reads a JSON request from stdin (`{ embeddingModel, chunks, numThreads }`),
 * loads a quantized feature-extraction pipeline for `embeddingModel`, embeds
 * each chunk's `text` one at a time and writes `{ results }` as JSON to
 * stdout. Empty stdin is a no-op.
 *
 * A chunk that fails to embed does not abort the run: it is reported as an
 * entry with `success: false` and the error message.
 *
 * @returns {Promise<void>} Rejects if the request is not valid JSON or the
 *   model cannot be loaded.
 */
async function main() {
  const input = await readStdin();
  if (!input) {
    return;
  }

  const request = JSON.parse(input);
  const {
    embeddingModel: modelName,
    chunks: chunkList,
    numThreads: threadCount = 1,
  } = request;

  env.backends.onnx.wasm.numThreads = threadCount;
  env.backends.onnx.numThreads = threadCount;

  log(`[Child] Loading model ${modelName}...`);
  const embedder = await pipeline('feature-extraction', modelName, {
    quantized: true,
  });
  log('[Child] Model ready');

  const results = [];
  for (const chunk of chunkList || []) {
    let entry;
    try {
      const output = await embedder(chunk.text, { pooling: 'mean', normalize: true });
      // Convert to a plain array so the vector survives JSON serialization.
      const values = Array.from(toFloat32Array(output.data));
      entry = {
        file: chunk.file,
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        content: chunk.text,
        vector: values,
        success: true,
      };
    } catch (error) {
      entry = {
        file: chunk.file,
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        error: error.message,
        success: false,
      };
    }
    results.push(entry);
  }

  process.stdout.write(JSON.stringify({ results }));
}
|
|
73
|
+
|
|
74
|
+
// Any failure escaping main() is reported on stderr and turned into exit code 1.
main().catch((failure) => {
  const text = String(failure?.message || failure);
  process.stderr.write(text);
  process.exit(1);
});
|
package/lib/embedding-worker.js
CHANGED
|
@@ -1,67 +1,582 @@
|
|
|
1
|
-
import { parentPort, workerData } from
|
|
2
|
-
import
|
|
1
|
+
import { parentPort, workerData } from 'worker_threads';
|
|
2
|
+
import fs from 'fs/promises';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import os from 'os';
|
|
5
|
+
import { pipeline, env } from '@xenova/transformers';
|
|
6
|
+
import { smartChunk, hashContent } from './utils.js';
|
|
7
|
+
import { extractCallData } from './call-graph.js';
|
|
3
8
|
|
|
4
|
-
|
|
9
|
+
// Helper to get global cache dir (duplicated from config.js to avoid full config load in worker)
/**
 * Resolves the per-user cache root for the current platform.
 *
 * - win32:  %LOCALAPPDATA%, falling back to <home>/AppData/Local
 * - darwin: <home>/Library/Caches
 * - other:  $XDG_CACHE_HOME, falling back to <home>/.cache
 *
 * @returns {string} Path of the cache root directory.
 */
function getGlobalCacheDir() {
  switch (process.platform) {
    case 'win32':
      return process.env.LOCALAPPDATA || path.join(os.homedir(), 'AppData', 'Local');
    case 'darwin':
      return path.join(os.homedir(), 'Library', 'Caches');
    default:
      return process.env.XDG_CACHE_HOME || path.join(os.homedir(), '.cache');
  }
}
|
|
19
|
+
|
|
20
|
+
// Route console.info and console.warn through console.error (stderr) so that
// nothing from these calls lands on stdout and breaks the MCP JSON-RPC protocol.
for (const level of ['info', 'warn']) {
  console[level] = (...args) => console.error(...args);
}
|
|
23
|
+
|
|
24
|
+
// Thread count is supplied by the main thread through workerData (default 1)
// and applied to both ONNX backend settings, so CPU usage can be tuned per
// system without saturating it.
const numThreads = workerData.numThreads || 1;
env.backends.onnx.wasm.numThreads = numThreads;
env.backends.onnx.numThreads = numThreads;

// Number of legacy-protocol results accumulated before an intermediate flush.
const RESULT_BATCH_SIZE = 25;

// Log prefix: includes the worker id only when an integer id was supplied.
const workerId = Number.isInteger(workerData.workerId) ? workerData.workerId : null;
const workerLabel = workerId !== null ? `[Worker ${workerId}]` : '[Worker]';

/**
 * Informational logging helper; forwards its arguments to `console.info`.
 *
 * @param {...*} args - Values to log.
 */
function logInfo(...args) {
  console.info(...args);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Builds a new Float32Array from `vector`.
 *
 * For typed-array or array input the constructor allocates a fresh buffer, so
 * the result can be put on a postMessage transfer list without detaching
 * shared WASM memory or a buffer the producer may reuse.
 *
 * @param {ArrayLike<number>|Iterable<number>} vector - Embedding values.
 * @returns {Float32Array} Newly constructed array holding the same values.
 */
function toFloat32Array(vector) {
  const copy = new Float32Array(vector);
  return copy;
}
|
|
5
42
|
|
|
6
43
|
// The embedding model is loaded at most once per worker. The in-flight (or
// settled) load is cached as a promise so concurrent callers share one load.
let embedderPromise = null;

/**
 * Returns the shared feature-extraction pipeline, starting the load on first use.
 *
 * The first call points the transformers cache at `<global cache dir>/xenova`,
 * logs the start of the load and stores the pending promise. If the load
 * fails, the cached promise is cleared so a later call can try again.
 *
 * @returns {Promise<Function>} Resolves with the loaded pipeline.
 */
async function initializeEmbedder() {
  if (embedderPromise) {
    return embedderPromise;
  }

  const startedAt = Date.now();

  // Ensure we use the global cache directory
  env.cacheDir = path.join(getGlobalCacheDir(), 'xenova');

  logInfo(`${workerLabel} Embedding model load started: ${workerData.embeddingModel}`);

  const loadModel = async () => {
    try {
      const model = await pipeline('feature-extraction', workerData.embeddingModel, {
        quantized: true,
      });
      const loadSeconds = ((Date.now() - startedAt) / 1000).toFixed(1);
      logInfo(`${workerLabel} Embedding model ready: ${workerData.embeddingModel} (${loadSeconds}s)`);
      return model;
    } catch (err) {
      // Drop the cached promise so the next caller starts a new load.
      embedderPromise = null;
      throw err;
    }
  };

  embedderPromise = loadModel();
  return embedderPromise;
}
|
|
13
72
|
|
|
14
73
|
/**
 * Legacy Protocol: embeds chunks that were prepared by the main thread.
 *
 * Chunks are embedded one at a time. Results are streamed to the parent as
 * `{ type: 'results', results, batchId, done }` messages: an intermediate
 * message (`done: false`) whenever RESULT_BATCH_SIZE results have accumulated,
 * and always one final message (`done: true`), which may carry an empty
 * `results` array. Vector buffers are passed on the transfer list.
 *
 * A chunk that fails to embed is reported as `{ success: false, error }`
 * rather than aborting the batch.
 *
 * @param {Array<object>} chunks - Items with `file`, `startLine`, `endLine`, `text`.
 * @param {*} batchId - Opaque identifier echoed back in every message.
 * @returns {Promise<void>}
 */
async function processChunks(chunks, batchId) {
  const embedder = await initializeEmbedder();
  let pending = [];
  let pendingBuffers = [];

  // Posts whatever has accumulated and starts a fresh accumulation window.
  const send = (done) => {
    const payload = {
      type: 'results',
      results: pending,
      batchId,
      done,
    };
    if (pendingBuffers.length > 0) {
      parentPort.postMessage(payload, pendingBuffers);
    } else {
      parentPort.postMessage(payload);
    }
    pending = [];
    pendingBuffers = [];
  };

  for (const chunk of chunks) {
    try {
      const output = await embedder(chunk.text, {
        pooling: 'mean',
        normalize: true,
      });
      const vector = toFloat32Array(output.data);
      pending.push({
        file: chunk.file,
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        content: chunk.text,
        vector,
        success: true,
      });
      pendingBuffers.push(vector.buffer);
    } catch (error) {
      pending.push({
        file: chunk.file,
        startLine: chunk.startLine,
        endLine: chunk.endLine,
        error: error.message,
        success: false,
      });
    }

    // Only flush intermediate results when we have enough for a batch.
    if (pending.length >= RESULT_BATCH_SIZE) {
      send(false);
    }
  }

  // The final message is sent even when empty, to signal completion.
  send(true);

  // Force GC if available to free tensor buffers immediately.
  if (typeof global.gc === 'function') {
    global.gc();
  }
}
|
|
47
139
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
140
|
+
/**
 * New Protocol: Process entire file (read, chunk, embed) in worker.
 * Returns results once processing is complete.
 *
 * Content handling: when `message.content` is a string (including the empty
 * string) it is used as the file content and the disk is not touched;
 * otherwise the file is stat'ed and read from disk.
 *
 * @param {object} message - Task sent by the main thread.
 * @param {string} message.file - Path of the file to index.
 * @param {string} [message.content] - Inline content to index instead of reading `file`.
 * @param {boolean} [message.force] - Re-index even if the hash equals `expectedHash`.
 * @param {string} [message.expectedHash] - Hash from a previous run, for the unchanged short-circuit.
 * @returns {Promise<object>} One of:
 *   `{ status: 'skipped', reason, mtimeMs?, size? }`,
 *   `{ status: 'unchanged', hash, mtimeMs, size }`,
 *   `{ status: 'indexed', hash, mtimeMs, size, callData, results, transferList }`
 *   where each result is `{ startLine, endLine, text, vectorBuffer }` and
 *   `transferList` holds the same ArrayBuffers for postMessage transfer.
 */
async function processFileTask(message) {
  const embedder = await initializeEmbedder();

  const file = message.file;
  const force = !!message.force;
  const expectedHash = message.expectedHash || null;
  // Single source of truth for "content was supplied by the caller".
  const hasInlineContent = typeof message.content === 'string';

  // workerData.maxFileSize might not be set if using old config, default to Infinity
  const maxFileSize = Number.isFinite(workerData.maxFileSize) ? workerData.maxFileSize : Infinity;
  const callGraphEnabled = !!workerData.callGraphEnabled;

  let mtimeMs = null;
  let size = null;
  let content;

  if (hasInlineContent) {
    // 1a) Inline content: size is measured on the string itself; no stat is performed.
    content = message.content;
    size = Buffer.byteLength(content, 'utf8');
    if (size > maxFileSize) {
      return { status: 'skipped', reason: 'too_large', mtimeMs, size };
    }
  } else {
    // 1b) Stat the file so directories and oversized files are skipped before reading.
    try {
      const st = await fs.stat(file);
      if (st.isDirectory()) {
        return { status: 'skipped', reason: 'is_directory', mtimeMs: st.mtimeMs, size: st.size };
      }
      if (st.size > maxFileSize) {
        return { status: 'skipped', reason: 'too_large', mtimeMs: st.mtimeMs, size: st.size };
      }
      mtimeMs = st.mtimeMs;
      size = st.size;
    } catch (err) {
      return { status: 'skipped', reason: `stat_failed: ${err.message}` };
    }

    // 2) Read content from disk
    try {
      content = await fs.readFile(file, 'utf-8');
    } catch (err) {
      return { status: 'skipped', reason: `read_failed: ${err.message}`, mtimeMs, size };
    }
  }

  // 3) Hash and unchanged short-circuit
  const hash = hashContent(content);
  if (!force && expectedHash && expectedHash === hash) {
    return { status: 'unchanged', hash, mtimeMs, size };
  }

  // 4) Call graph extraction (optional, best-effort: a failure only drops callData)
  let callData = null;
  if (callGraphEnabled) {
    try {
      callData = extractCallData(content, file);
    } catch {
      callData = null;
    }
  }

  // 5) Chunking in worker
  // Work on a copy so the shared workerData configuration object is not mutated;
  // fall back to the worker-level model when the chunk config names none.
  const baseChunkConfig = workerData.chunkConfig || workerData.config || {};
  const chunkConfig = baseChunkConfig.embeddingModel
    ? baseChunkConfig
    : { ...baseChunkConfig, embeddingModel: workerData.embeddingModel };

  const chunks = smartChunk(content, file, chunkConfig);

  // 6) Embed chunks in batches for performance
  const results = [];
  const transferList = [];

  // Batch size for inference (balance between speed and memory)
  const INFERENCE_BATCH_SIZE = 16;

  for (let i = 0; i < chunks.length; i += INFERENCE_BATCH_SIZE) {
    const batchChunks = chunks.slice(i, i + INFERENCE_BATCH_SIZE);
    const batchTexts = batchChunks.map((c) => c.text);

    try {
      const output = await embedder(batchTexts, { pooling: 'mean', normalize: true });

      // The last dimension is the per-vector length; output.data is read as
      // consecutive vectors of that length, one per input text.
      const hiddenSize = output.dims[output.dims.length - 1];

      for (let j = 0; j < batchChunks.length; j++) {
        const c = batchChunks[j];
        const start = j * hiddenSize;
        const vectorView = output.data.subarray(start, start + hiddenSize);

        // Copy out of the shared output so each vector owns a transferable buffer.
        const vector = new Float32Array(vectorView);

        results.push({
          startLine: c.startLine,
          endLine: c.endLine,
          text: c.text,
          vectorBuffer: vector.buffer,
        });
        transferList.push(vector.buffer);
      }
    } catch (err) {
      // Fallback: if batch fails (e.g. OOM), try one by one for this batch
      console.warn(`${workerLabel} Batch inference failed, retrying individually: ${err.message}`);

      for (const c of batchChunks) {
        try {
          const output = await embedder(c.text, { pooling: 'mean', normalize: true });
          const vector = toFloat32Array(output.data);
          results.push({
            startLine: c.startLine,
            endLine: c.endLine,
            text: c.text,
            vectorBuffer: vector.buffer,
          });
          transferList.push(vector.buffer);
        } catch (innerErr) {
          console.warn(`${workerLabel} Chunk embedding failed: ${innerErr.message}`);
          // NOTE(review): the chunk is omitted and the file is still reported as
          // 'indexed'; confirm the caller tolerates partial results here.
        }
      }
    }

    // Yield to event loop briefly between batches
    if (chunks.length > INFERENCE_BATCH_SIZE) {
      await new Promise((resolve) => setTimeout(resolve, 0));
    }
  }

  return { status: 'indexed', hash, mtimeMs, size, callData, results, transferList };
}
|
|
288
|
+
|
|
289
|
+
/**
 * Prepares one entry of a `processFiles` batch: stat/size checks, read, hash
 * comparison, optional call-graph extraction and chunking. No embedding.
 *
 * @param {object} fileMsg - `{ file, content?, force?, expectedHash? }`.
 * @param {object} [batchChunkConfig] - Chunk config from the batch message;
 *   falls back to workerData.chunkConfig, workerData.config, then `{}`.
 * @returns {Promise<{task: object, chunks: Array}>} The per-file task record
 *   (status 'skipped', 'unchanged' or 'indexed') and the chunks still to embed.
 */
async function prepareBatchFile(fileMsg, batchChunkConfig) {
  const file = fileMsg.file;
  const force = !!fileMsg.force;
  const expectedHash = fileMsg.expectedHash || null;
  const maxFileSize = Number.isFinite(workerData.maxFileSize) ? workerData.maxFileSize : Infinity;
  const callGraphEnabled = !!workerData.callGraphEnabled;

  const task = {
    file,
    status: 'skipped',
    reason: null,
    hash: null,
    mtimeMs: null,
    size: null,
    callData: null,
    expectedChunks: 0,
    results: [], // filled after embedding
  };
  const finished = () => ({ task, chunks: [] });
  let content = null;

  // A. Stat & checks (or size check on provided content)
  if (!fileMsg.content) {
    try {
      const st = await fs.stat(file);
      if (st.isDirectory()) {
        task.reason = 'is_directory';
      } else if (st.size > maxFileSize) {
        task.reason = 'too_large';
      }
      task.mtimeMs = st.mtimeMs;
      task.size = st.size;
    } catch (err) {
      task.reason = `stat_failed: ${err.message}`;
    }
    if (task.reason !== null) return finished();
  } else {
    content = fileMsg.content;
    task.size = Buffer.byteLength(content, 'utf-8');
    if (task.size > maxFileSize) {
      task.reason = 'too_large';
      return finished();
    }
  }

  // B. Read content when none was provided
  if (content === null) {
    try {
      content = await fs.readFile(file, 'utf-8');
    } catch (err) {
      task.reason = `read_failed: ${err.message}`;
      return finished();
    }
  }

  // C. Hash check
  task.hash = hashContent(content);
  if (!force && expectedHash && expectedHash === task.hash) {
    task.status = 'unchanged';
    return finished();
  }

  // D. Call graph (best-effort: a failure only drops callData)
  if (callGraphEnabled) {
    try {
      task.callData = extractCallData(content, file);
    } catch {
      task.callData = null;
    }
  }

  // E. Chunking
  const chunkConfig = batchChunkConfig || workerData.chunkConfig || workerData.config || {};
  if (!chunkConfig.embeddingModel) chunkConfig.embeddingModel = workerData.embeddingModel;

  const chunks = smartChunk(content, file, chunkConfig);

  // A file with no chunks still counts as 'indexed'; otherwise the status is
  // provisional until every chunk has been embedded.
  task.status = 'indexed';
  task.expectedChunks = chunks.length;
  return { task, chunks };
}

/**
 * Embeds all pending chunks in inference batches, storing each vector's
 * ArrayBuffer on the chunk (`vectorBuffer`) and on `transferList`. A failed
 * batch is retried chunk by chunk; chunks that still fail keep
 * `vectorBuffer === null`.
 *
 * @param {Array<object>} pendingChunks - Items with `text` and `vectorBuffer`.
 * @param {ArrayBuffer[]} transferList - Receives every produced buffer.
 * @returns {Promise<void>}
 */
async function embedPendingChunks(pendingChunks, transferList) {
  const embedder = await initializeEmbedder();
  const INFERENCE_BATCH_SIZE = 16;
  // Hand control back to the event loop after roughly this many chunks.
  const YIELD_EVERY_CHUNKS = 50;
  let chunksSinceYield = 0;

  for (let i = 0; i < pendingChunks.length; i += INFERENCE_BATCH_SIZE) {
    const batchSlice = pendingChunks.slice(i, i + INFERENCE_BATCH_SIZE);
    const batchTexts = batchSlice.map((c) => c.text);

    try {
      const output = await embedder(batchTexts, { pooling: 'mean', normalize: true });
      const hiddenSize = output.dims[output.dims.length - 1];

      for (let j = 0; j < batchSlice.length; j++) {
        const start = j * hiddenSize;
        // Copy so every vector owns an independent, transferable buffer.
        const vector = new Float32Array(output.data.subarray(start, start + hiddenSize));
        batchSlice[j].vectorBuffer = vector.buffer;
        transferList.push(vector.buffer);
      }
    } catch (err) {
      console.warn(`${workerLabel} Cross-file batch inference failed, retrying individually: ${err.message}`);
      // Fallback: individual embedding for this failed batch
      for (const item of batchSlice) {
        try {
          const output = await embedder(item.text, { pooling: 'mean', normalize: true });
          const vector = toFloat32Array(output.data);
          item.vectorBuffer = vector.buffer;
          transferList.push(vector.buffer);
        } catch (innerErr) {
          console.warn(`${workerLabel} Chunk embedding failed: ${innerErr.message}`);
        }
      }
    }

    // Counter-based yield: the loop index advances by the batch size, so a
    // modulo test on it would rarely line up with the intended interval.
    chunksSinceYield += batchSlice.length;
    if (chunksSinceYield >= YIELD_EVERY_CHUNKS) {
      chunksSinceYield = 0;
      await new Promise((resolve) => setTimeout(resolve, 0));
    }
  }
}

/**
 * Handles a `processFiles` message: prepares every file, embeds all chunks
 * across files in shared batches, reassembles per-file results and posts one
 * `{ type: 'results', results, batchId, done: true }` message.
 *
 * @param {object} message - `{ files, batchId, chunkConfig? }`.
 * @returns {Promise<void>}
 */
async function handleProcessFiles(message) {
  const { files, batchId } = message;
  const batchTransfer = [];
  const fileTasks = [];
  const allPendingChunks = []; // { text, fileIndex, startLine, endLine, vectorBuffer }

  // 1. Pre-process all files so their chunks can be embedded together.
  for (let i = 0; i < files.length; i++) {
    const fileMsg = files[i];
    try {
      const { task, chunks } = await prepareBatchFile(fileMsg, message.chunkConfig);
      for (const c of chunks) {
        allPendingChunks.push({
          fileIndex: i,
          text: c.text,
          startLine: c.startLine,
          endLine: c.endLine,
          vectorBuffer: null, // to be filled
        });
      }
      fileTasks.push(task);
    } catch (error) {
      // Exactly one task per file keeps fileTasks aligned with fileIndex.
      fileTasks.push({
        file: fileMsg.file,
        status: 'error',
        error: error.message,
        expectedChunks: 0,
        results: [],
      });
    }
  }

  // 2. Run batched inference on all accumulated chunks
  if (allPendingChunks.length > 0) {
    await embedPendingChunks(allPendingChunks, batchTransfer);
  }

  // 3. Reassemble results per file
  for (const chunkItem of allPendingChunks) {
    if (!chunkItem.vectorBuffer) continue;
    fileTasks[chunkItem.fileIndex].results.push({
      startLine: chunkItem.startLine,
      endLine: chunkItem.endLine,
      text: chunkItem.text,
      vectorBuffer: chunkItem.vectorBuffer,
    });
  }

  // Validation pass: mark files as failed if they miss chunks
  for (const task of fileTasks) {
    const incomplete =
      task.status === 'indexed' &&
      task.expectedChunks > 0 &&
      task.results.length !== task.expectedChunks;
    if (incomplete) {
      task.status = 'error';
      task.error = `Embedding incomplete: ${task.results.length}/${task.expectedChunks} chunks`;
    }
  }

  // 4. Send response
  parentPort.postMessage(
    {
      type: 'results',
      results: fileTasks,
      batchId,
      done: true,
    },
    batchTransfer
  );

  // Explicitly clear references and trigger GC
  batchTransfer.length = 0;
  if (global.gc) global.gc();
}

// Listen for messages from main thread.
// Supported types: 'shutdown', 'processFile' (RPC style, replies carry `id`),
// 'processFiles' (cross-file batch), 'process' (legacy pre-chunked batch).
// Messages without a string `type` are ignored.
parentPort.on('message', async (message) => {
  try {
    if (!message || typeof message.type !== 'string') return;

    if (message.type === 'shutdown') {
      process.exit(0);
      return;
    }

    // ---- New protocol: file-level processing (chunking + embedding in worker) ----
    if (message.type === 'processFile') {
      const { id } = message;
      if (!id) {
        parentPort.postMessage({ type: 'error', error: 'processFile missing id' });
        return;
      }

      const res = await processFileTask(message);

      // Transfer vectors if present; transferList itself is not part of the reply then.
      if (res && res.transferList && res.transferList.length > 0) {
        const { transferList, ...payload } = res;
        parentPort.postMessage({ id, ...payload }, transferList);
      } else {
        parentPort.postMessage({ id, ...res });
      }
      return;
    }

    // ---- Batch file processing ----
    if (message.type === 'processFiles') {
      await handleProcessFiles(message);
      return;
    }

    // ---- Legacy protocol: batch of chunks prepared by main thread ----
    if (message.type === 'process') {
      try {
        await processChunks(message.chunks || [], message.batchId);
      } catch (error) {
        parentPort.postMessage({
          type: 'error',
          error: error.message,
          batchId: message.batchId,
        });
      }
      return;
    }

    // Unknown type
    parentPort.postMessage({ type: 'error', error: `Unknown message type: ${message.type}` });
  } catch (error) {
    // If message had an id, respond via RPC style; otherwise legacy error
    if (message?.id) {
      parentPort.postMessage({ id: message.id, error: error.message });
    } else {
      parentPort.postMessage({ type: 'error', error: error.message, batchId: message?.batchId });
    }
  }
});
|
|
61
574
|
|
|
62
575
|
// Load the model eagerly at startup and tell the parent how it went:
// { type: 'ready' } on success, { type: 'error', error } otherwise.
(async () => {
  try {
    await initializeEmbedder();
    parentPort.postMessage({ type: 'ready' });
  } catch (error) {
    parentPort.postMessage({ type: 'error', error: error.message });
  }
})();
|