@softerist/heuristic-mcp 3.0.17 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config.jsonc +23 -6
- package/features/ann-config.js +7 -14
- package/features/clear-cache.js +3 -3
- package/features/find-similar-code.js +17 -22
- package/features/hybrid-search.js +59 -67
- package/features/index-codebase.js +305 -268
- package/features/lifecycle.js +370 -176
- package/features/package-version.js +15 -26
- package/features/register.js +75 -57
- package/features/resources.js +21 -47
- package/features/set-workspace.js +31 -43
- package/index.js +818 -172
- package/lib/cache-utils.js +95 -99
- package/lib/cache.js +121 -166
- package/lib/cli.js +246 -238
- package/lib/config.js +232 -62
- package/lib/constants.js +22 -2
- package/lib/embed-query-process.js +13 -29
- package/lib/embedding-process.js +29 -19
- package/lib/embedding-worker.js +166 -149
- package/lib/ignore-patterns.js +39 -39
- package/lib/json-writer.js +7 -34
- package/lib/logging.js +11 -42
- package/lib/onnx-backend.js +4 -4
- package/lib/path-utils.js +4 -21
- package/lib/project-detector.js +3 -3
- package/lib/server-lifecycle.js +109 -15
- package/lib/settings-editor.js +25 -18
- package/lib/slice-normalize.js +6 -16
- package/lib/tokenizer.js +56 -109
- package/lib/utils.js +62 -81
- package/lib/vector-store-binary.js +7 -7
- package/lib/vector-store-sqlite.js +35 -67
- package/lib/workspace-cache-key.js +36 -0
- package/lib/workspace-env.js +55 -14
- package/package.json +86 -86
package/lib/embedding-worker.js
CHANGED
|
@@ -18,8 +18,8 @@ function getGlobalCacheDir() {
|
|
|
18
18
|
return process.env.XDG_CACHE_HOME || path.join(os.homedir(), '.cache');
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
|
|
22
|
+
|
|
23
23
|
console.info = (...args) => console.error('[INFO]', ...args);
|
|
24
24
|
console.warn = (...args) => console.error('[WARN]', ...args);
|
|
25
25
|
|
|
@@ -143,10 +143,10 @@ async function isPathInsideWorkspace(targetPath) {
|
|
|
143
143
|
}
|
|
144
144
|
}
|
|
145
145
|
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
|
|
150
150
|
function sliceAndNormalize(vector, targetDim) {
|
|
151
151
|
if (!targetDim || targetDim >= vector.length) {
|
|
152
152
|
return vector;
|
|
@@ -165,11 +165,11 @@ function sliceAndNormalize(vector, targetDim) {
|
|
|
165
165
|
return sliced;
|
|
166
166
|
}
|
|
167
167
|
|
|
168
|
-
|
|
168
|
+
|
|
169
169
|
const embeddingDimension = workerData.embeddingDimension || null;
|
|
170
170
|
|
|
171
|
-
|
|
172
|
-
|
|
171
|
+
|
|
172
|
+
|
|
173
173
|
let embedderPromise = null;
|
|
174
174
|
|
|
175
175
|
async function initializeEmbedder() {
|
|
@@ -199,7 +199,7 @@ async function initializeEmbedder() {
|
|
|
199
199
|
);
|
|
200
200
|
return model;
|
|
201
201
|
} catch (err) {
|
|
202
|
-
embedderPromise = null;
|
|
202
|
+
embedderPromise = null;
|
|
203
203
|
throw err;
|
|
204
204
|
}
|
|
205
205
|
})();
|
|
@@ -237,10 +237,7 @@ function getEmbeddingTensor(output, { requireDimsForBatch = false, batchSize = n
|
|
|
237
237
|
return { data, hiddenSize };
|
|
238
238
|
}
|
|
239
239
|
|
|
240
|
-
|
|
241
|
-
* Legacy Protocol: Process chunks with optimized single-text embedding
|
|
242
|
-
* Streams results in batches.
|
|
243
|
-
*/
|
|
240
|
+
|
|
244
241
|
async function processChunks(chunks, batchId) {
|
|
245
242
|
const embedder = await initializeEmbedder();
|
|
246
243
|
let results = [];
|
|
@@ -248,11 +245,11 @@ async function processChunks(chunks, batchId) {
|
|
|
248
245
|
const failFastState = createFailFastState('legacy chunk embedding');
|
|
249
246
|
|
|
250
247
|
const flush = (done = false) => {
|
|
251
|
-
|
|
248
|
+
|
|
252
249
|
if (!done && results.length < RESULT_BATCH_SIZE) return;
|
|
253
250
|
|
|
254
|
-
|
|
255
|
-
|
|
251
|
+
|
|
252
|
+
|
|
256
253
|
|
|
257
254
|
const payload = {
|
|
258
255
|
type: 'results',
|
|
@@ -275,12 +272,12 @@ async function processChunks(chunks, batchId) {
|
|
|
275
272
|
pooling: 'mean',
|
|
276
273
|
normalize: true,
|
|
277
274
|
});
|
|
278
|
-
|
|
275
|
+
|
|
279
276
|
const { data } = getEmbeddingTensor(output);
|
|
280
277
|
let vector = new Float32Array(data);
|
|
281
|
-
|
|
278
|
+
|
|
282
279
|
vector = sliceAndNormalize(vector, embeddingDimension);
|
|
283
|
-
|
|
280
|
+
|
|
284
281
|
if (typeof output.dispose === 'function')
|
|
285
282
|
try {
|
|
286
283
|
output.dispose();
|
|
@@ -317,27 +314,21 @@ async function processChunks(chunks, batchId) {
|
|
|
317
314
|
|
|
318
315
|
flush(true);
|
|
319
316
|
|
|
320
|
-
|
|
317
|
+
|
|
321
318
|
maybeRunGc();
|
|
322
319
|
}
|
|
323
320
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
* Prepares file content for processing: stat, size check, read content.
|
|
330
|
-
* @param {string} file - File path
|
|
331
|
-
* @param {string|null} providedContent - Pre-provided content (optional)
|
|
332
|
-
* @param {number} maxFileSize - Maximum allowed file size
|
|
333
|
-
* @returns {Promise<{status: string, reason?: string, content?: string, mtimeMs?: number, size?: number}>}
|
|
334
|
-
*/
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
|
|
335
326
|
async function prepareFileContent(file, providedContent, maxFileSize) {
|
|
336
327
|
let mtimeMs = null;
|
|
337
328
|
let size = null;
|
|
338
329
|
let content = null;
|
|
339
330
|
|
|
340
|
-
|
|
331
|
+
|
|
341
332
|
if (typeof providedContent === 'string') {
|
|
342
333
|
content = providedContent;
|
|
343
334
|
const byteSize = Buffer.byteLength(content, 'utf-8');
|
|
@@ -348,16 +339,16 @@ async function prepareFileContent(file, providedContent, maxFileSize) {
|
|
|
348
339
|
return { status: 'ok', content, mtimeMs, size };
|
|
349
340
|
}
|
|
350
341
|
|
|
351
|
-
|
|
342
|
+
|
|
352
343
|
try {
|
|
353
344
|
const st = await fs.stat(file);
|
|
354
345
|
if (st.isDirectory()) {
|
|
355
346
|
return { status: 'skipped', reason: 'is_directory', mtimeMs: st.mtimeMs, size: st.size };
|
|
356
347
|
}
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
|
|
361
352
|
if (st.size > maxFileSize) {
|
|
362
353
|
return { status: 'skipped', reason: 'too_large', mtimeMs: st.mtimeMs, size: st.size };
|
|
363
354
|
}
|
|
@@ -367,18 +358,18 @@ async function prepareFileContent(file, providedContent, maxFileSize) {
|
|
|
367
358
|
return { status: 'skipped', reason: `stat_failed: ${err.message}` };
|
|
368
359
|
}
|
|
369
360
|
|
|
370
|
-
|
|
361
|
+
|
|
371
362
|
try {
|
|
372
363
|
const handle = await fs.open(file, 'r');
|
|
373
364
|
try {
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
|
|
377
368
|
const bufferSize = Math.min(maxFileSize + 1, Math.max(size + 1024, 64 * 1024));
|
|
378
369
|
const buffer = Buffer.alloc(bufferSize);
|
|
379
370
|
const { bytesRead } = await handle.read(buffer, 0, bufferSize, 0);
|
|
380
371
|
|
|
381
|
-
|
|
372
|
+
|
|
382
373
|
if (bytesRead > maxFileSize) {
|
|
383
374
|
return { status: 'skipped', reason: 'too_large_after_read', mtimeMs, size: bytesRead };
|
|
384
375
|
}
|
|
@@ -395,23 +386,17 @@ async function prepareFileContent(file, providedContent, maxFileSize) {
|
|
|
395
386
|
return { status: 'ok', content, mtimeMs, size };
|
|
396
387
|
}
|
|
397
388
|
|
|
398
|
-
|
|
399
|
-
* Processes file metadata: hash check, call graph, chunking.
|
|
400
|
-
* @param {string} file - File path
|
|
401
|
-
* @param {string} content - File content
|
|
402
|
-
* @param {object} options - { force, expectedHash, callGraphEnabled, chunkConfig, workerLabel }
|
|
403
|
-
* @returns {{status: string, hash: string, callData: object|null, chunks: Array}}
|
|
404
|
-
*/
|
|
389
|
+
|
|
405
390
|
function processFileMetadata(file, content, options) {
|
|
406
391
|
const { force, expectedHash, callGraphEnabled, chunkConfig, workerLabel } = options;
|
|
407
392
|
|
|
408
|
-
|
|
393
|
+
|
|
409
394
|
const hash = hashContent(content);
|
|
410
395
|
if (!force && expectedHash && expectedHash === hash) {
|
|
411
396
|
return { status: 'unchanged', hash, callData: null, chunks: [] };
|
|
412
397
|
}
|
|
413
398
|
|
|
414
|
-
|
|
399
|
+
|
|
415
400
|
let callData = null;
|
|
416
401
|
if (callGraphEnabled) {
|
|
417
402
|
try {
|
|
@@ -424,16 +409,13 @@ function processFileMetadata(file, content, options) {
|
|
|
424
409
|
}
|
|
425
410
|
}
|
|
426
411
|
|
|
427
|
-
|
|
412
|
+
|
|
428
413
|
const chunks = smartChunk(content, file, chunkConfig);
|
|
429
414
|
|
|
430
415
|
return { status: 'processing', hash, callData, chunks };
|
|
431
416
|
}
|
|
432
417
|
|
|
433
|
-
|
|
434
|
-
* New Protocol: Process entire file (read, chunk, embed) in worker.
|
|
435
|
-
* Returns results once processing is complete.
|
|
436
|
-
*/
|
|
418
|
+
|
|
437
419
|
async function processFileTask(message) {
|
|
438
420
|
const embedder = await initializeEmbedder();
|
|
439
421
|
const failFastState = createFailFastState(`file-task ${path.basename(message.file || '')}`);
|
|
@@ -442,7 +424,7 @@ async function processFileTask(message) {
|
|
|
442
424
|
const force = !!message.force;
|
|
443
425
|
const expectedHash = message.expectedHash || null;
|
|
444
426
|
|
|
445
|
-
|
|
427
|
+
|
|
446
428
|
if (!(await isPathInsideWorkspace(file))) {
|
|
447
429
|
if (workerData.verbose) {
|
|
448
430
|
console.warn(`[Worker ${workerData.workerId}] Skipping file outside workspace: ${file}`);
|
|
@@ -453,14 +435,14 @@ async function processFileTask(message) {
|
|
|
453
435
|
const maxFileSize = Number.isFinite(workerData.maxFileSize) ? workerData.maxFileSize : Infinity;
|
|
454
436
|
const callGraphEnabled = !!workerData.callGraphEnabled;
|
|
455
437
|
|
|
456
|
-
|
|
438
|
+
|
|
457
439
|
const prep = await prepareFileContent(file, message.content, maxFileSize);
|
|
458
440
|
if (prep.status !== 'ok') {
|
|
459
441
|
return { status: prep.status, reason: prep.reason, mtimeMs: prep.mtimeMs, size: prep.size };
|
|
460
442
|
}
|
|
461
443
|
const { content, mtimeMs, size } = prep;
|
|
462
444
|
|
|
463
|
-
|
|
445
|
+
|
|
464
446
|
const chunkConfig = {
|
|
465
447
|
...(workerData.config || {}),
|
|
466
448
|
...(workerData.chunkConfig || {}),
|
|
@@ -482,12 +464,12 @@ async function processFileTask(message) {
|
|
|
482
464
|
|
|
483
465
|
const { hash, callData, chunks } = meta;
|
|
484
466
|
|
|
485
|
-
|
|
467
|
+
|
|
486
468
|
const results = [];
|
|
487
469
|
const transferList = [];
|
|
488
470
|
|
|
489
|
-
|
|
490
|
-
|
|
471
|
+
|
|
472
|
+
|
|
491
473
|
const INFERENCE_BATCH_SIZE = Number.isInteger(workerData.inferenceBatchSize)
|
|
492
474
|
? workerData.inferenceBatchSize
|
|
493
475
|
: DEFAULT_INFERENCE_BATCH_SIZE;
|
|
@@ -498,14 +480,14 @@ async function processFileTask(message) {
|
|
|
498
480
|
const batchTexts = batchChunks.map((c) => c.text);
|
|
499
481
|
|
|
500
482
|
try {
|
|
501
|
-
|
|
483
|
+
|
|
502
484
|
const output = await embedder(batchTexts, {
|
|
503
485
|
pooling: 'mean',
|
|
504
486
|
normalize: true,
|
|
505
487
|
});
|
|
506
488
|
|
|
507
|
-
|
|
508
|
-
|
|
489
|
+
|
|
490
|
+
|
|
509
491
|
const { data, hiddenSize } = getEmbeddingTensor(output, {
|
|
510
492
|
requireDimsForBatch: true,
|
|
511
493
|
batchSize: batchChunks.length,
|
|
@@ -514,16 +496,16 @@ async function processFileTask(message) {
|
|
|
514
496
|
for (let j = 0; j < batchChunks.length; j++) {
|
|
515
497
|
const c = batchChunks[j];
|
|
516
498
|
|
|
517
|
-
|
|
518
|
-
|
|
499
|
+
|
|
500
|
+
|
|
519
501
|
const start = j * hiddenSize;
|
|
520
502
|
const end = start + hiddenSize;
|
|
521
503
|
const vectorView =
|
|
522
504
|
typeof data.subarray === 'function' ? data.subarray(start, end) : data.slice(start, end);
|
|
523
505
|
|
|
524
|
-
|
|
506
|
+
|
|
525
507
|
let vector = new Float32Array(vectorView);
|
|
526
|
-
|
|
508
|
+
|
|
527
509
|
vector = sliceAndNormalize(vector, embeddingDimension);
|
|
528
510
|
|
|
529
511
|
results.push({
|
|
@@ -534,7 +516,7 @@ async function processFileTask(message) {
|
|
|
534
516
|
});
|
|
535
517
|
transferList.push(vector.buffer);
|
|
536
518
|
}
|
|
537
|
-
|
|
519
|
+
|
|
538
520
|
if (typeof output.dispose === 'function')
|
|
539
521
|
try {
|
|
540
522
|
output.dispose();
|
|
@@ -549,7 +531,7 @@ async function processFileTask(message) {
|
|
|
549
531
|
noteEmbeddingFailure(failFastState, err);
|
|
550
532
|
throw err;
|
|
551
533
|
}
|
|
552
|
-
|
|
534
|
+
|
|
553
535
|
console.warn(`${workerLabel} Batch inference failed (${err.name}), retrying individually: ${err.message}`);
|
|
554
536
|
noteEmbeddingFailure(failFastState, err);
|
|
555
537
|
|
|
@@ -558,9 +540,9 @@ async function processFileTask(message) {
|
|
|
558
540
|
const output = await embedder(c.text, { pooling: 'mean', normalize: true });
|
|
559
541
|
const { data } = getEmbeddingTensor(output);
|
|
560
542
|
let vector = new Float32Array(data);
|
|
561
|
-
|
|
543
|
+
|
|
562
544
|
vector = sliceAndNormalize(vector, embeddingDimension);
|
|
563
|
-
|
|
545
|
+
|
|
564
546
|
if (typeof output.dispose === 'function')
|
|
565
547
|
try {
|
|
566
548
|
output.dispose();
|
|
@@ -578,9 +560,9 @@ async function processFileTask(message) {
|
|
|
578
560
|
transferList.push(vector.buffer);
|
|
579
561
|
noteEmbeddingSuccess(failFastState);
|
|
580
562
|
} catch (innerErr) {
|
|
581
|
-
|
|
563
|
+
|
|
582
564
|
console.warn(`${workerLabel} Chunk embedding failed: ${innerErr.message}`);
|
|
583
|
-
|
|
565
|
+
|
|
584
566
|
noteEmbeddingFailure(failFastState, innerErr);
|
|
585
567
|
if (isFatalRuntimeEmbeddingError(innerErr)) {
|
|
586
568
|
throw innerErr;
|
|
@@ -589,7 +571,7 @@ async function processFileTask(message) {
|
|
|
589
571
|
}
|
|
590
572
|
}
|
|
591
573
|
|
|
592
|
-
|
|
574
|
+
|
|
593
575
|
processedSinceGc += batchChunks.length;
|
|
594
576
|
if (chunks.length > INFERENCE_BATCH_SIZE) {
|
|
595
577
|
if (processedSinceGc >= 100) {
|
|
@@ -603,7 +585,7 @@ async function processFileTask(message) {
|
|
|
603
585
|
return { status: 'indexed', hash, mtimeMs, size, callData, results, transferList };
|
|
604
586
|
}
|
|
605
587
|
|
|
606
|
-
|
|
588
|
+
|
|
607
589
|
parentPort.on('message', async (message) => {
|
|
608
590
|
try {
|
|
609
591
|
if (!message || typeof message.type !== 'string') return;
|
|
@@ -613,14 +595,14 @@ parentPort.on('message', async (message) => {
|
|
|
613
595
|
return;
|
|
614
596
|
}
|
|
615
597
|
|
|
616
|
-
|
|
598
|
+
|
|
617
599
|
if (message.type === 'unload') {
|
|
618
600
|
const wasLoaded = embedderPromise !== null;
|
|
619
601
|
|
|
620
602
|
if (embedderPromise) {
|
|
621
603
|
try {
|
|
622
604
|
const embedder = await embedderPromise;
|
|
623
|
-
|
|
605
|
+
|
|
624
606
|
if (embedder && typeof embedder.dispose === 'function') {
|
|
625
607
|
try {
|
|
626
608
|
await embedder.dispose();
|
|
@@ -633,11 +615,11 @@ parentPort.on('message', async (message) => {
|
|
|
633
615
|
logInfo(`${workerLabel} Error during model unload: ${err.message}`);
|
|
634
616
|
}
|
|
635
617
|
|
|
636
|
-
|
|
618
|
+
|
|
637
619
|
embedderPromise = null;
|
|
638
620
|
}
|
|
639
621
|
|
|
640
|
-
|
|
622
|
+
|
|
641
623
|
if (explicitGcEnabled && typeof global.gc === 'function') {
|
|
642
624
|
const before = process.memoryUsage();
|
|
643
625
|
global.gc();
|
|
@@ -651,7 +633,7 @@ parentPort.on('message', async (message) => {
|
|
|
651
633
|
return;
|
|
652
634
|
}
|
|
653
635
|
|
|
654
|
-
|
|
636
|
+
|
|
655
637
|
if (message.type === 'processFile') {
|
|
656
638
|
const { id } = message;
|
|
657
639
|
if (!id) {
|
|
@@ -661,7 +643,7 @@ parentPort.on('message', async (message) => {
|
|
|
661
643
|
|
|
662
644
|
const res = await processFileTask(message);
|
|
663
645
|
|
|
664
|
-
|
|
646
|
+
|
|
665
647
|
if (res && res.transferList && res.transferList.length > 0) {
|
|
666
648
|
const { transferList, ...payload } = res;
|
|
667
649
|
parentPort.postMessage({ id, ...payload }, transferList);
|
|
@@ -671,19 +653,42 @@ parentPort.on('message', async (message) => {
|
|
|
671
653
|
return;
|
|
672
654
|
}
|
|
673
655
|
|
|
674
|
-
|
|
675
|
-
if (message.type === 'processFiles') {
|
|
676
|
-
const { files, batchId } = message;
|
|
677
|
-
const batchTransfer = [];
|
|
678
|
-
const failFastState = createFailFastState('cross-file batch embedding');
|
|
656
|
+
|
|
657
|
+
if (message.type === 'processFiles') {
|
|
658
|
+
const { files, batchId } = message;
|
|
659
|
+
const batchTransfer = [];
|
|
660
|
+
const failFastState = createFailFastState('cross-file batch embedding');
|
|
661
|
+
const PROGRESS_HEARTBEAT_MS = 15_000;
|
|
662
|
+
let lastProgressAt = 0;
|
|
663
|
+
const sendProgress = (payload = {}, { force = false } = {}) => {
|
|
664
|
+
const now = Date.now();
|
|
665
|
+
if (!force && now - lastProgressAt < PROGRESS_HEARTBEAT_MS) {
|
|
666
|
+
return;
|
|
667
|
+
}
|
|
668
|
+
lastProgressAt = now;
|
|
669
|
+
parentPort.postMessage({
|
|
670
|
+
type: 'progress',
|
|
671
|
+
batchId,
|
|
672
|
+
...payload,
|
|
673
|
+
});
|
|
674
|
+
};
|
|
675
|
+
|
|
676
|
+
sendProgress(
|
|
677
|
+
{
|
|
678
|
+
stage: 'prepare',
|
|
679
|
+
filesCompleted: 0,
|
|
680
|
+
filesTotal: Array.isArray(files) ? files.length : 0,
|
|
681
|
+
},
|
|
682
|
+
{ force: true }
|
|
683
|
+
);
|
|
679
684
|
|
|
680
|
-
|
|
681
|
-
|
|
685
|
+
|
|
686
|
+
|
|
682
687
|
const fileTasks = [];
|
|
683
|
-
const allPendingChunks = [];
|
|
688
|
+
const allPendingChunks = [];
|
|
684
689
|
|
|
685
|
-
for (let i = 0; i < files.length; i++) {
|
|
686
|
-
const fileMsg = files[i];
|
|
690
|
+
for (let i = 0; i < files.length; i++) {
|
|
691
|
+
const fileMsg = files[i];
|
|
687
692
|
|
|
688
693
|
try {
|
|
689
694
|
const file = fileMsg.file;
|
|
@@ -694,7 +699,7 @@ parentPort.on('message', async (message) => {
|
|
|
694
699
|
: Infinity;
|
|
695
700
|
const callGraphEnabled = !!workerData.callGraphEnabled;
|
|
696
701
|
|
|
697
|
-
|
|
702
|
+
|
|
698
703
|
if (!(await isPathInsideWorkspace(file))) {
|
|
699
704
|
fileTasks.push({
|
|
700
705
|
file: fileMsg.file,
|
|
@@ -710,7 +715,7 @@ parentPort.on('message', async (message) => {
|
|
|
710
715
|
continue;
|
|
711
716
|
}
|
|
712
717
|
|
|
713
|
-
|
|
718
|
+
|
|
714
719
|
const prep = await prepareFileContent(file, fileMsg.content, maxFileSize);
|
|
715
720
|
if (prep.status !== 'ok') {
|
|
716
721
|
fileTasks.push({
|
|
@@ -729,7 +734,7 @@ parentPort.on('message', async (message) => {
|
|
|
729
734
|
|
|
730
735
|
const { content, mtimeMs, size } = prep;
|
|
731
736
|
|
|
732
|
-
|
|
737
|
+
|
|
733
738
|
const chunkConfig = {
|
|
734
739
|
...(workerData.config || {}),
|
|
735
740
|
...(workerData.chunkConfig || {}),
|
|
@@ -763,12 +768,12 @@ parentPort.on('message', async (message) => {
|
|
|
763
768
|
const { hash, callData, chunks } = meta;
|
|
764
769
|
const chunkCount = chunks.length;
|
|
765
770
|
|
|
766
|
-
|
|
771
|
+
|
|
767
772
|
if ((i + 1) % 100 === 0) {
|
|
768
773
|
maybeRunGc();
|
|
769
774
|
}
|
|
770
775
|
|
|
771
|
-
|
|
776
|
+
|
|
772
777
|
if (chunks.length > 0) {
|
|
773
778
|
for (const c of chunks) {
|
|
774
779
|
allPendingChunks.push({
|
|
@@ -776,43 +781,49 @@ parentPort.on('message', async (message) => {
|
|
|
776
781
|
text: c.text,
|
|
777
782
|
startLine: c.startLine,
|
|
778
783
|
endLine: c.endLine,
|
|
779
|
-
vectorBuffer: null,
|
|
784
|
+
vectorBuffer: null,
|
|
780
785
|
});
|
|
781
786
|
}
|
|
782
787
|
}
|
|
783
788
|
|
|
784
|
-
fileTasks.push({
|
|
785
|
-
file: fileMsg.file,
|
|
786
|
-
status: 'indexed',
|
|
787
|
-
reason: null,
|
|
788
|
-
hash,
|
|
789
|
+
fileTasks.push({
|
|
790
|
+
file: fileMsg.file,
|
|
791
|
+
status: 'indexed',
|
|
792
|
+
reason: null,
|
|
793
|
+
hash,
|
|
789
794
|
mtimeMs,
|
|
790
795
|
size,
|
|
791
796
|
callData,
|
|
792
797
|
expectedChunks: chunkCount,
|
|
793
|
-
results: [],
|
|
798
|
+
results: [],
|
|
794
799
|
});
|
|
795
800
|
} catch (error) {
|
|
796
|
-
fileTasks.push({
|
|
797
|
-
file: fileMsg.file,
|
|
798
|
-
status: 'error',
|
|
799
|
-
error: error.message,
|
|
800
|
-
expectedChunks: 0,
|
|
801
|
-
results: [],
|
|
802
|
-
});
|
|
803
|
-
}
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
801
|
+
fileTasks.push({
|
|
802
|
+
file: fileMsg.file,
|
|
803
|
+
status: 'error',
|
|
804
|
+
error: error.message,
|
|
805
|
+
expectedChunks: 0,
|
|
806
|
+
results: [],
|
|
807
|
+
});
|
|
808
|
+
} finally {
|
|
809
|
+
sendProgress({
|
|
810
|
+
stage: 'prepare',
|
|
811
|
+
filesCompleted: i + 1,
|
|
812
|
+
filesTotal: files.length,
|
|
813
|
+
});
|
|
814
|
+
}
|
|
815
|
+
}
|
|
816
|
+
|
|
817
|
+
|
|
818
|
+
if (allPendingChunks.length > 0) {
|
|
808
819
|
const embedder = await initializeEmbedder();
|
|
809
820
|
const INFERENCE_BATCH_SIZE = Number.isInteger(workerData.inferenceBatchSize)
|
|
810
821
|
? workerData.inferenceBatchSize
|
|
811
822
|
: DEFAULT_INFERENCE_BATCH_SIZE;
|
|
812
823
|
|
|
813
|
-
for (let i = 0; i < allPendingChunks.length; i += INFERENCE_BATCH_SIZE) {
|
|
814
|
-
const batchSlice = allPendingChunks.slice(i, i + INFERENCE_BATCH_SIZE);
|
|
815
|
-
const batchTexts = batchSlice.map((c) => c.text);
|
|
824
|
+
for (let i = 0; i < allPendingChunks.length; i += INFERENCE_BATCH_SIZE) {
|
|
825
|
+
const batchSlice = allPendingChunks.slice(i, i + INFERENCE_BATCH_SIZE);
|
|
826
|
+
const batchTexts = batchSlice.map((c) => c.text);
|
|
816
827
|
|
|
817
828
|
try {
|
|
818
829
|
const output = await embedder(batchTexts, { pooling: 'mean', normalize: true });
|
|
@@ -828,13 +839,13 @@ parentPort.on('message', async (message) => {
|
|
|
828
839
|
typeof data.subarray === 'function'
|
|
829
840
|
? data.subarray(start, end)
|
|
830
841
|
: data.slice(start, end);
|
|
831
|
-
|
|
842
|
+
|
|
832
843
|
const vector = sliceAndNormalize(new Float32Array(vectorView), embeddingDimension);
|
|
833
844
|
|
|
834
845
|
batchSlice[j].vectorBuffer = vector.buffer;
|
|
835
846
|
batchTransfer.push(vector.buffer);
|
|
836
847
|
}
|
|
837
|
-
|
|
848
|
+
|
|
838
849
|
if (typeof output.dispose === 'function')
|
|
839
850
|
try {
|
|
840
851
|
output.dispose();
|
|
@@ -853,14 +864,14 @@ parentPort.on('message', async (message) => {
|
|
|
853
864
|
`${workerLabel} Cross-file batch inference failed, retrying individually: ${err.message}`
|
|
854
865
|
);
|
|
855
866
|
noteEmbeddingFailure(failFastState, err);
|
|
856
|
-
|
|
857
|
-
for (const item of batchSlice) {
|
|
858
|
-
try {
|
|
859
|
-
const output = await embedder(item.text, { pooling: 'mean', normalize: true });
|
|
867
|
+
|
|
868
|
+
for (const item of batchSlice) {
|
|
869
|
+
try {
|
|
870
|
+
const output = await embedder(item.text, { pooling: 'mean', normalize: true });
|
|
860
871
|
const { data } = getEmbeddingTensor(output);
|
|
861
|
-
|
|
872
|
+
|
|
862
873
|
const vector = sliceAndNormalize(new Float32Array(data), embeddingDimension);
|
|
863
|
-
|
|
874
|
+
|
|
864
875
|
if (typeof output.dispose === 'function')
|
|
865
876
|
try {
|
|
866
877
|
output.dispose();
|
|
@@ -878,18 +889,24 @@ parentPort.on('message', async (message) => {
|
|
|
878
889
|
if (isFatalRuntimeEmbeddingError(innerErr)) {
|
|
879
890
|
throw innerErr;
|
|
880
891
|
}
|
|
881
|
-
}
|
|
882
|
-
}
|
|
883
|
-
}
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
892
|
+
}
|
|
893
|
+
}
|
|
894
|
+
}
|
|
895
|
+
|
|
896
|
+
sendProgress({
|
|
897
|
+
stage: 'embed',
|
|
898
|
+
chunksCompleted: Math.min(i + batchSlice.length, allPendingChunks.length),
|
|
899
|
+
chunksTotal: allPendingChunks.length,
|
|
900
|
+
});
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
if (allPendingChunks.length > 50 && i % 50 === 0) {
|
|
904
|
+
await new Promise((resolve) => setTimeout(resolve, 0));
|
|
905
|
+
}
|
|
889
906
|
}
|
|
890
907
|
}
|
|
891
908
|
|
|
892
|
-
|
|
909
|
+
|
|
893
910
|
for (const chunkItem of allPendingChunks) {
|
|
894
911
|
if (chunkItem.vectorBuffer) {
|
|
895
912
|
const task = fileTasks[chunkItem.fileIndex];
|
|
@@ -913,21 +930,21 @@ parentPort.on('message', async (message) => {
|
|
|
913
930
|
}
|
|
914
931
|
}
|
|
915
932
|
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
|
|
936
|
+
|
|
920
937
|
const resultsForTransfer = fileTasks.map((task) => ({
|
|
921
938
|
...task,
|
|
922
939
|
results: task.results.map((r) => ({
|
|
923
940
|
startLine: r.startLine,
|
|
924
941
|
endLine: r.endLine,
|
|
925
942
|
text: r.text,
|
|
926
|
-
vectorBuffer: r.vectorBuffer,
|
|
943
|
+
vectorBuffer: r.vectorBuffer,
|
|
927
944
|
})),
|
|
928
945
|
}));
|
|
929
946
|
|
|
930
|
-
|
|
947
|
+
|
|
931
948
|
for (const task of fileTasks) {
|
|
932
949
|
for (const r of task.results) {
|
|
933
950
|
r.vectorBuffer = null;
|
|
@@ -944,13 +961,13 @@ parentPort.on('message', async (message) => {
|
|
|
944
961
|
batchTransfer
|
|
945
962
|
);
|
|
946
963
|
|
|
947
|
-
|
|
964
|
+
|
|
948
965
|
batchTransfer.length = 0;
|
|
949
966
|
maybeRunGc();
|
|
950
967
|
return;
|
|
951
968
|
}
|
|
952
969
|
|
|
953
|
-
|
|
970
|
+
|
|
954
971
|
if (message.type === 'process') {
|
|
955
972
|
try {
|
|
956
973
|
await processChunks(message.chunks || [], message.batchId);
|
|
@@ -964,10 +981,10 @@ parentPort.on('message', async (message) => {
|
|
|
964
981
|
return;
|
|
965
982
|
}
|
|
966
983
|
|
|
967
|
-
|
|
984
|
+
|
|
968
985
|
parentPort.postMessage({ type: 'error', error: `Unknown message type: ${message.type}` });
|
|
969
986
|
} catch (error) {
|
|
970
|
-
|
|
987
|
+
|
|
971
988
|
if (message && typeof message === 'object' && message.id) {
|
|
972
989
|
parentPort.postMessage({ id: message.id, error: error.message });
|
|
973
990
|
} else {
|
|
@@ -976,7 +993,7 @@ parentPort.on('message', async (message) => {
|
|
|
976
993
|
}
|
|
977
994
|
});
|
|
978
995
|
|
|
979
|
-
|
|
996
|
+
|
|
980
997
|
initializeEmbedder()
|
|
981
998
|
.then(() => {
|
|
982
999
|
parentPort.postMessage({ type: 'ready' });
|