@softerist/heuristic-mcp 3.0.15 → 3.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -104
- package/config.jsonc +173 -173
- package/features/ann-config.js +131 -0
- package/features/clear-cache.js +84 -0
- package/features/find-similar-code.js +291 -0
- package/features/hybrid-search.js +544 -0
- package/features/index-codebase.js +3268 -0
- package/features/lifecycle.js +1189 -0
- package/features/package-version.js +302 -0
- package/features/register.js +408 -0
- package/features/resources.js +156 -0
- package/features/set-workspace.js +265 -0
- package/index.js +96 -96
- package/lib/cache-ops.js +22 -22
- package/lib/cache-utils.js +565 -565
- package/lib/cache.js +1870 -1870
- package/lib/call-graph.js +396 -396
- package/lib/cli.js +1 -1
- package/lib/config.js +517 -517
- package/lib/constants.js +39 -39
- package/lib/embed-query-process.js +7 -7
- package/lib/embedding-process.js +7 -7
- package/lib/embedding-worker.js +299 -299
- package/lib/ignore-patterns.js +316 -316
- package/lib/json-worker.js +14 -14
- package/lib/json-writer.js +337 -337
- package/lib/logging.js +164 -164
- package/lib/memory-logger.js +13 -13
- package/lib/onnx-backend.js +193 -193
- package/lib/project-detector.js +84 -84
- package/lib/server-lifecycle.js +165 -165
- package/lib/settings-editor.js +754 -754
- package/lib/tokenizer.js +256 -256
- package/lib/utils.js +428 -428
- package/lib/vector-store-binary.js +627 -627
- package/lib/vector-store-sqlite.js +95 -95
- package/lib/workspace-env.js +28 -28
- package/mcp_config.json +9 -9
- package/package.json +86 -75
- package/scripts/clear-cache.js +20 -0
- package/scripts/download-model.js +43 -0
- package/scripts/mcp-launcher.js +49 -0
- package/scripts/postinstall.js +12 -0
- package/search-configs.js +36 -36
- package/.prettierrc +0 -7
- package/debug-pids.js +0 -30
- package/eslint.config.js +0 -36
- package/specs/plan.md +0 -23
- package/vitest.config.js +0 -39
|
@@ -0,0 +1,3268 @@
|
|
|
1
|
+
import { fdir } from 'fdir';
|
|
2
|
+
import fs from 'fs/promises';
|
|
3
|
+
import chokidar from 'chokidar';
|
|
4
|
+
import path from 'path';
|
|
5
|
+
import os from 'os';
|
|
6
|
+
import { Worker } from 'worker_threads';
|
|
7
|
+
import { spawn } from 'child_process';
|
|
8
|
+
import { setTimeout as delay } from 'timers/promises';
|
|
9
|
+
import { fileURLToPath } from 'url';
|
|
10
|
+
import { smartChunk, hashContent } from '../lib/utils.js';
|
|
11
|
+
import { extractCallData } from '../lib/call-graph.js';
|
|
12
|
+
import { forceShutdownEmbeddingPool, isEmbeddingPoolActive } from '../lib/embed-query-process.js';
|
|
13
|
+
|
|
14
|
+
import ignore from 'ignore';
|
|
15
|
+
|
|
16
|
+
import { sliceAndNormalize, toFloat32Array } from '../lib/slice-normalize.js';
|
|
17
|
+
import {
|
|
18
|
+
EMBEDDING_PROCESS_DEFAULT_GC_MAX_REQUESTS_WITHOUT_COLLECTION,
|
|
19
|
+
EMBEDDING_PROCESS_DEFAULT_GC_MIN_INTERVAL_MS,
|
|
20
|
+
EMBEDDING_PROCESS_DEFAULT_GC_RSS_THRESHOLD_MB,
|
|
21
|
+
MAX_PENDING_WATCH_EVENTS,
|
|
22
|
+
PENDING_WATCH_EVENTS_TRIM_SIZE,
|
|
23
|
+
} from '../lib/constants.js';
|
|
24
|
+
|
|
25
|
+
/**
 * Report whether the process is running under a test harness.
 *
 * @returns {boolean} true when VITEST is the string 'true' or NODE_ENV is 'test'.
 */
function isTestEnv() {
  const { VITEST, NODE_ENV } = process.env;
  if (VITEST === 'true') return true;
  return NODE_ENV === 'test';
}
|
|
28
|
+
|
|
29
|
+
/**
 * Convert a platform path to forward-slash form.
 *
 * @param {*} value - Candidate path; anything that is not a string yields ''.
 * @returns {string} The path with every platform separator replaced by '/'.
 */
function normalizePath(value) {
  return typeof value === 'string' ? value.replaceAll(path.sep, '/') : '';
}
|
|
33
|
+
|
|
34
|
+
/**
 * Translate a glob pattern into an anchored regular expression.
 *
 * Supported syntax:
 *   - `**` followed by `/` matches zero or more leading directories,
 *   - any other `**` matches any characters, including `/`,
 *   - `*`  matches any run of characters except `/`,
 *   - `?`  matches exactly one character except `/`.
 * Regex metacharacters are escaped and matched literally; everything else is
 * copied through unchanged.
 *
 * @param {string} pattern - Glob pattern using forward slashes.
 * @returns {RegExp} Expression anchored at both ends.
 */
function globToRegExp(pattern) {
  const LITERAL_ESCAPES = '\\.[]{}()+-^$|';
  const pieces = ['^'];
  let pos = 0;

  while (pos < pattern.length) {
    const ch = pattern[pos];

    if (ch === '*') {
      if (pattern[pos + 1] !== '*') {
        // Single star: stay within one path segment.
        pieces.push('[^/]*');
        pos += 1;
      } else if (pattern[pos + 2] === '/') {
        // "**/" : optional directory prefix; consume all three characters.
        pieces.push('(?:.*/)?');
        pos += 3;
      } else {
        // Bare "**": cross segment boundaries freely.
        pieces.push('.*');
        pos += 2;
      }
      continue;
    }

    if (ch === '?') {
      pieces.push('[^/]');
    } else if (LITERAL_ESCAPES.includes(ch)) {
      pieces.push(`\\${ch}`);
    } else {
      pieces.push(ch);
    }
    pos += 1;
  }

  pieces.push('$');
  return new RegExp(pieces.join(''));
}
|
|
61
|
+
|
|
62
|
+
/**
 * Compile exclude glob patterns into matcher records.
 *
 * Non-string and empty entries are skipped. Backslashes in a pattern are
 * converted to forward slashes before compilation. A pattern that contains no
 * '/' is flagged `matchBase`, which callers use to test it against a file's
 * basename rather than its full path.
 *
 * @param {*} patterns - Expected to be an array of glob strings; anything else yields [].
 * @returns {Array<{pattern: string, matchBase: boolean, regex: RegExp}>} Compiled matchers.
 */
function buildExcludeMatchers(patterns) {
  if (!Array.isArray(patterns)) return [];

  const compiled = [];
  for (const raw of patterns) {
    if (typeof raw !== 'string' || raw.length === 0) continue;
    const pattern = raw.replace(/\\/g, '/');
    compiled.push({
      pattern,
      matchBase: !pattern.includes('/'),
      regex: globToRegExp(pattern),
    });
  }
  return compiled;
}
|
|
76
|
+
|
|
77
|
+
/**
 * Test a file path against compiled exclude matchers.
 *
 * Matchers flagged `matchBase` are tested against the basename only; all
 * others are tested against the full forward-slash path.
 *
 * @param {string} filePath - Path to test; a falsy value never matches.
 * @param {Array<{matchBase: boolean, regex: RegExp}>} matchers - Compiled matchers.
 * @returns {boolean} true as soon as any matcher accepts the path.
 */
function matchesExcludePatterns(filePath, matchers) {
  if (!filePath || matchers.length === 0) return false;

  const fullPath = normalizePath(filePath);
  const fileName = path.posix.basename(fullPath);

  for (const { matchBase, regex } of matchers) {
    const subject = matchBase ? fileName : fullPath;
    if (regex.test(subject)) return true;
  }
  return false;
}
|
|
87
|
+
|
|
88
|
+
export class CodebaseIndexer {
|
|
89
|
+
constructor(embedder, cache, config, server = null) {
|
|
90
|
+
this.embedder = embedder;
|
|
91
|
+
this.cache = cache;
|
|
92
|
+
this.config = config;
|
|
93
|
+
this.server = server;
|
|
94
|
+
this.watcher = null;
|
|
95
|
+
this.workers = [];
|
|
96
|
+
this.workerReady = [];
|
|
97
|
+
this.isIndexing = false;
|
|
98
|
+
this.processingWatchEvents = false;
|
|
99
|
+
this.pendingWatchEvents = new Map();
|
|
100
|
+
this.rebuildExcludeMatchers();
|
|
101
|
+
this.gitignore = ignore();
|
|
102
|
+
this.workerFailureCount = 0;
|
|
103
|
+
this.workersDisabledUntil = 0;
|
|
104
|
+
this.workerCircuitOpen = false;
|
|
105
|
+
this._retryTimer = null;
|
|
106
|
+
this._lastProgress = null;
|
|
107
|
+
this.currentIndexMode = null;
|
|
108
|
+
this.workspaceRoot = this.config.searchDirectory
|
|
109
|
+
? path.resolve(this.config.searchDirectory)
|
|
110
|
+
: null;
|
|
111
|
+
this.workspaceRootReal = null;
|
|
112
|
+
this._lastIncrementalGcAt = 0;
|
|
113
|
+
this._autoEmbeddingProcessLogged = false;
|
|
114
|
+
this._heavyWorkerSafetyLogged = false;
|
|
115
|
+
// Debounce timers for watcher events (path -> timeout ID)
|
|
116
|
+
this._watcherDebounceTimers = new Map();
|
|
117
|
+
// Files currently being indexed via watcher (path -> Promise)
|
|
118
|
+
this._watcherInProgress = new Map();
|
|
119
|
+
// Files that need a follow-up reindex after current watcher indexing finishes
|
|
120
|
+
this._watcherPendingReindex = new Map();
|
|
121
|
+
// Debounce delay in ms (consolidates rapid add/change events)
|
|
122
|
+
this._watcherDebounceMs = Number.isInteger(this.config.watchDebounceMs)
|
|
123
|
+
? this.config.watchDebounceMs
|
|
124
|
+
: 300;
|
|
125
|
+
// Wait-for-stable writes (chokidar awaitWriteFinish) to reduce add+change churn
|
|
126
|
+
this._watcherWriteStabilityMs = Number.isInteger(this.config.watchWriteStabilityMs)
|
|
127
|
+
? this.config.watchWriteStabilityMs
|
|
128
|
+
: 1500;
|
|
129
|
+
// Persistent embedding child process (used to avoid per-batch model reloads)
|
|
130
|
+
this._embeddingProcessSessionActive = false;
|
|
131
|
+
this._embeddingChild = null;
|
|
132
|
+
this._embeddingChildBuffer = '';
|
|
133
|
+
this._embeddingChildQueue = [];
|
|
134
|
+
this._embeddingSessionStats = null;
|
|
135
|
+
this._embeddingRequestId = 0;
|
|
136
|
+
this._embeddingChildNeedsRestart = false;
|
|
137
|
+
this._embeddingChildRestartThresholdMb = this.getEmbeddingChildRestartThresholdMb();
|
|
138
|
+
this._embeddingChildStopping = false;
|
|
139
|
+
this._lastExplicitGcAt = 0;
|
|
140
|
+
this._lastHighRssRecycleAt = 0;
|
|
141
|
+
this._pendingHighRssRecycleTimer = null;
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
rebuildExcludeMatchers() {
|
|
145
|
+
const cacheRelative = this.getCacheRelativePath();
|
|
146
|
+
const autoExclude = ['.smart-coding-cache'];
|
|
147
|
+
if (cacheRelative) {
|
|
148
|
+
autoExclude.push(cacheRelative, `${cacheRelative}/**`);
|
|
149
|
+
}
|
|
150
|
+
this.excludeMatchers = buildExcludeMatchers([
|
|
151
|
+
...autoExclude,
|
|
152
|
+
...(this.config.excludePatterns || []),
|
|
153
|
+
]);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
async updateWorkspaceState({ restartWatcher = false } = {}) {
|
|
157
|
+
this.workspaceRoot = this.config.searchDirectory
|
|
158
|
+
? path.resolve(this.config.searchDirectory)
|
|
159
|
+
: null;
|
|
160
|
+
this.workspaceRootReal = null;
|
|
161
|
+
this.rebuildExcludeMatchers();
|
|
162
|
+
this.gitignore = ignore();
|
|
163
|
+
if (this.pendingWatchEvents) {
|
|
164
|
+
this.pendingWatchEvents.clear();
|
|
165
|
+
}
|
|
166
|
+
if (this._watcherDebounceTimers) {
|
|
167
|
+
for (const timer of this._watcherDebounceTimers.values()) {
|
|
168
|
+
clearTimeout(timer);
|
|
169
|
+
}
|
|
170
|
+
this._watcherDebounceTimers.clear();
|
|
171
|
+
}
|
|
172
|
+
if (this._watcherInProgress) {
|
|
173
|
+
this._watcherInProgress.clear();
|
|
174
|
+
}
|
|
175
|
+
if (this._watcherPendingReindex) {
|
|
176
|
+
this._watcherPendingReindex.clear();
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
if (restartWatcher && this.config.watchFiles) {
|
|
180
|
+
await this.setupFileWatcher();
|
|
181
|
+
} else if (this.config.watchFiles) {
|
|
182
|
+
await this.loadGitignore();
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
getEmbeddingChildRestartThresholdMb() {
|
|
187
|
+
const totalMb = typeof os.totalmem === 'function' ? os.totalmem() / 1024 / 1024 : 8192;
|
|
188
|
+
if (this.isHeavyEmbeddingModel()) {
|
|
189
|
+
return Math.min(8000, Math.max(6000, totalMb * 0.3));
|
|
190
|
+
}
|
|
191
|
+
return Math.min(5000, Math.max(2500, totalMb * 0.3));
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
getEmbeddingProcessConfig() {
|
|
195
|
+
const threads = Number.isInteger(this.config.embeddingProcessNumThreads)
|
|
196
|
+
? this.config.embeddingProcessNumThreads
|
|
197
|
+
: 8;
|
|
198
|
+
const batchSize =
|
|
199
|
+
Number.isInteger(this.config.embeddingBatchSize) && this.config.embeddingBatchSize > 0
|
|
200
|
+
? this.config.embeddingBatchSize
|
|
201
|
+
: null;
|
|
202
|
+
return { threads, batchSize };
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
getEmbeddingProcessGcConfig() {
|
|
206
|
+
const thresholdRaw = Number(this.config.embeddingProcessGcRssThresholdMb);
|
|
207
|
+
const minIntervalRaw = Number(this.config.embeddingProcessGcMinIntervalMs);
|
|
208
|
+
const maxRequestsRaw = Number(this.config.embeddingProcessGcMaxRequestsWithoutCollection);
|
|
209
|
+
const gcRssThresholdMb =
|
|
210
|
+
Number.isFinite(thresholdRaw) && thresholdRaw > 0
|
|
211
|
+
? thresholdRaw
|
|
212
|
+
: EMBEDDING_PROCESS_DEFAULT_GC_RSS_THRESHOLD_MB;
|
|
213
|
+
const gcMinIntervalMs =
|
|
214
|
+
Number.isFinite(minIntervalRaw) && minIntervalRaw >= 0
|
|
215
|
+
? Math.floor(minIntervalRaw)
|
|
216
|
+
: EMBEDDING_PROCESS_DEFAULT_GC_MIN_INTERVAL_MS;
|
|
217
|
+
const gcMaxRequestsWithoutCollection =
|
|
218
|
+
Number.isFinite(maxRequestsRaw) && maxRequestsRaw > 0
|
|
219
|
+
? Math.floor(maxRequestsRaw)
|
|
220
|
+
: EMBEDDING_PROCESS_DEFAULT_GC_MAX_REQUESTS_WITHOUT_COLLECTION;
|
|
221
|
+
return { gcRssThresholdMb, gcMinIntervalMs, gcMaxRequestsWithoutCollection };
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
shouldPreferDiskCacheLoad() {
|
|
225
|
+
if (!this.config.clearCacheAfterIndex) return false;
|
|
226
|
+
return this.config.vectorStoreFormat === 'binary' || this.config.vectorStoreFormat === 'sqlite';
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
isExplicitGcEnabled() {
|
|
230
|
+
return this.config.enableExplicitGc !== false && typeof global.gc === 'function';
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
runExplicitGc({ minIntervalMs = 0, force = false } = {}) {
|
|
234
|
+
if (!this.isExplicitGcEnabled()) return false;
|
|
235
|
+
const now = Date.now();
|
|
236
|
+
if (
|
|
237
|
+
!force &&
|
|
238
|
+
minIntervalMs > 0 &&
|
|
239
|
+
this._lastExplicitGcAt &&
|
|
240
|
+
now - this._lastExplicitGcAt < minIntervalMs
|
|
241
|
+
) {
|
|
242
|
+
return false;
|
|
243
|
+
}
|
|
244
|
+
this._lastExplicitGcAt = now;
|
|
245
|
+
global.gc();
|
|
246
|
+
return true;
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
shouldTraceIncrementalMemory() {
|
|
250
|
+
return this.config.incrementalMemoryProfile === true;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
formatMemoryUsage(usage = process.memoryUsage()) {
|
|
254
|
+
const toMb = (value) => `${(value / 1024 / 1024).toFixed(1)}MB`;
|
|
255
|
+
return (
|
|
256
|
+
`rss=${toMb(usage.rss)} ` +
|
|
257
|
+
`heap=${toMb(usage.heapUsed)}/${toMb(usage.heapTotal)} ` +
|
|
258
|
+
`ext=${toMb(usage.external)} arr=${toMb(usage.arrayBuffers)}`
|
|
259
|
+
);
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
async traceIncrementalMemoryPhase(phase, fn) {
|
|
263
|
+
if (!this.shouldTraceIncrementalMemory()) {
|
|
264
|
+
return await fn();
|
|
265
|
+
}
|
|
266
|
+
const startedAt = Date.now();
|
|
267
|
+
const startUsage = process.memoryUsage();
|
|
268
|
+
console.info(`[Indexer][MemTrace] ${phase} start: ${this.formatMemoryUsage(startUsage)}`);
|
|
269
|
+
try {
|
|
270
|
+
return await fn();
|
|
271
|
+
} finally {
|
|
272
|
+
const endUsage = process.memoryUsage();
|
|
273
|
+
const deltaRssMb = (endUsage.rss - startUsage.rss) / 1024 / 1024;
|
|
274
|
+
const deltaHeapMb = (endUsage.heapUsed - startUsage.heapUsed) / 1024 / 1024;
|
|
275
|
+
const elapsedSec = ((Date.now() - startedAt) / 1000).toFixed(2);
|
|
276
|
+
console.info(
|
|
277
|
+
`[Indexer][MemTrace] ${phase} end: ${this.formatMemoryUsage(endUsage)} ` +
|
|
278
|
+
`deltaRss=${deltaRssMb.toFixed(1)}MB deltaHeap=${deltaHeapMb.toFixed(1)}MB elapsed=${elapsedSec}s`
|
|
279
|
+
);
|
|
280
|
+
}
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
isPathInsideWorkspace(filePath) {
|
|
284
|
+
if (!filePath || !this.workspaceRoot) return true;
|
|
285
|
+
const target = path.resolve(filePath);
|
|
286
|
+
const normalizedBase =
|
|
287
|
+
process.platform === 'win32' ? this.workspaceRoot.toLowerCase() : this.workspaceRoot;
|
|
288
|
+
const normalizedTarget = process.platform === 'win32' ? target.toLowerCase() : target;
|
|
289
|
+
const rel = path.relative(normalizedBase, normalizedTarget);
|
|
290
|
+
return rel === '' || (!rel.startsWith('..') && !path.isAbsolute(rel));
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
async resolveWorkspaceRealpath() {
|
|
294
|
+
if (!this.workspaceRoot) return null;
|
|
295
|
+
if (this.workspaceRootReal) return this.workspaceRootReal;
|
|
296
|
+
try {
|
|
297
|
+
this.workspaceRootReal = await fs.realpath(this.workspaceRoot);
|
|
298
|
+
} catch {
|
|
299
|
+
this.workspaceRootReal = this.workspaceRoot;
|
|
300
|
+
}
|
|
301
|
+
return this.workspaceRootReal;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
async isPathInsideWorkspaceReal(filePath) {
|
|
305
|
+
if (!filePath || !this.workspaceRoot) return true;
|
|
306
|
+
const baseReal = await this.resolveWorkspaceRealpath();
|
|
307
|
+
try {
|
|
308
|
+
const targetReal = await fs.realpath(filePath);
|
|
309
|
+
const normalizedBase = process.platform === 'win32' ? baseReal.toLowerCase() : baseReal;
|
|
310
|
+
const normalizedTarget = process.platform === 'win32' ? targetReal.toLowerCase() : targetReal;
|
|
311
|
+
const rel = path.relative(normalizedBase, normalizedTarget);
|
|
312
|
+
return rel === '' || (!rel.startsWith('..') && !path.isAbsolute(rel));
|
|
313
|
+
} catch {
|
|
314
|
+
// Fall back to lexical check when realpath fails (e.g., deleted files).
|
|
315
|
+
return this.isPathInsideWorkspace(filePath);
|
|
316
|
+
}
|
|
317
|
+
}
|
|
318
|
+
|
|
319
|
+
maybeResetWorkerCircuit() {
|
|
320
|
+
if (
|
|
321
|
+
this.workerCircuitOpen &&
|
|
322
|
+
this.workersDisabledUntil &&
|
|
323
|
+
Date.now() >= this.workersDisabledUntil
|
|
324
|
+
) {
|
|
325
|
+
this.workerCircuitOpen = false;
|
|
326
|
+
this.workersDisabledUntil = 0;
|
|
327
|
+
this.workerFailureCount = 0;
|
|
328
|
+
if (this.config.verbose) {
|
|
329
|
+
console.info('[Indexer] Worker circuit closed; resuming worker use');
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
shouldUseWorkers() {
|
|
335
|
+
this.maybeResetWorkerCircuit();
|
|
336
|
+
if (this.workersDisabledUntil && Date.now() < this.workersDisabledUntil) {
|
|
337
|
+
return false;
|
|
338
|
+
}
|
|
339
|
+
if (isTestEnv()) return false;
|
|
340
|
+
return (
|
|
341
|
+
os.cpus().length > 1 &&
|
|
342
|
+
this.config.workerThreads !== 0 &&
|
|
343
|
+
!this.config.embeddingProcessPerBatch
|
|
344
|
+
);
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
isHeavyEmbeddingModel() {
|
|
348
|
+
const model = String(this.config.embeddingModel || '').toLowerCase();
|
|
349
|
+
return model.includes('jina');
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
getWorkerInferenceBatchSize({ numWorkers = null } = {}) {
|
|
353
|
+
const configured =
|
|
354
|
+
Number.isInteger(this.config.embeddingBatchSize) && this.config.embeddingBatchSize > 0
|
|
355
|
+
? this.config.embeddingBatchSize
|
|
356
|
+
: null;
|
|
357
|
+
if (configured) return Math.min(configured, 256);
|
|
358
|
+
// Heavy models are more stable with batch=1 in multi-worker mode on some runtimes.
|
|
359
|
+
if (this.isHeavyEmbeddingModel() && Number.isInteger(numWorkers) && numWorkers > 1) return 1;
|
|
360
|
+
return null;
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
shouldUseEmbeddingProcessPerBatch(useWorkers = null) {
|
|
364
|
+
if (this.config.embeddingProcessPerBatch) return true;
|
|
365
|
+
if (isTestEnv()) return false;
|
|
366
|
+
if (this.config.autoEmbeddingProcessPerBatch === false) return false;
|
|
367
|
+
const workersActive = typeof useWorkers === 'boolean' ? useWorkers : this.shouldUseWorkers();
|
|
368
|
+
if (workersActive) return false;
|
|
369
|
+
if (!this.isHeavyEmbeddingModel()) return false;
|
|
370
|
+
if (!this._autoEmbeddingProcessLogged) {
|
|
371
|
+
console.info(
|
|
372
|
+
'[Indexer] Auto-enabling embeddingProcessPerBatch for memory isolation (set autoEmbeddingProcessPerBatch=false to disable)'
|
|
373
|
+
);
|
|
374
|
+
this._autoEmbeddingProcessLogged = true;
|
|
375
|
+
}
|
|
376
|
+
return true;
|
|
377
|
+
}
|
|
378
|
+
|
|
379
|
+
scheduleRetry() {
|
|
380
|
+
if (this._retryTimer || isTestEnv()) return;
|
|
381
|
+
const delayMs = Math.max(1000, this.workersDisabledUntil - Date.now());
|
|
382
|
+
if (!Number.isFinite(delayMs) || delayMs <= 0) return;
|
|
383
|
+
this._retryTimer = setTimeout(() => {
|
|
384
|
+
this._retryTimer = null;
|
|
385
|
+
if (!this.isIndexing && !this.processingWatchEvents) {
|
|
386
|
+
this.indexAll().catch(() => null);
|
|
387
|
+
}
|
|
388
|
+
}, delayMs);
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
maybeRunIncrementalGc(reason) {
|
|
392
|
+
if (!this.config.enableExplicitGc || typeof global.gc !== 'function') return;
|
|
393
|
+
const now = Date.now();
|
|
394
|
+
const minIntervalMs = 60_000;
|
|
395
|
+
if (this._lastIncrementalGcAt && now - this._lastIncrementalGcAt < minIntervalMs) return;
|
|
396
|
+
const thresholdMb = Number.isFinite(this.config.incrementalGcThresholdMb)
|
|
397
|
+
? this.config.incrementalGcThresholdMb
|
|
398
|
+
: 2048;
|
|
399
|
+
if (thresholdMb <= 0) return;
|
|
400
|
+
const { rss } = process.memoryUsage();
|
|
401
|
+
if (rss < thresholdMb * 1024 * 1024) return;
|
|
402
|
+
if (this.config.verbose) {
|
|
403
|
+
const rssMb = (rss / 1024 / 1024).toFixed(1);
|
|
404
|
+
console.info(
|
|
405
|
+
`[Indexer] Incremental GC (${reason}) rss=${rssMb}MB threshold=${thresholdMb}MB`
|
|
406
|
+
);
|
|
407
|
+
}
|
|
408
|
+
this._lastIncrementalGcAt = now;
|
|
409
|
+
global.gc();
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
maybeShutdownQueryEmbeddingPool(reason = 'indexing') {
|
|
413
|
+
if (this.config.shutdownQueryEmbeddingPoolAfterIndex === false) return;
|
|
414
|
+
if (!isEmbeddingPoolActive()) return;
|
|
415
|
+
if (this.config.verbose) {
|
|
416
|
+
console.info(`[Indexer] Shutting down search embedding pool after ${reason}`);
|
|
417
|
+
}
|
|
418
|
+
forceShutdownEmbeddingPool();
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
maybeRecycleServerAfterIncremental(reason = 'watch update') {
|
|
422
|
+
if (this.config.recycleServerOnHighRssAfterIncremental !== true) return false;
|
|
423
|
+
|
|
424
|
+
const thresholdRaw = Number(this.config.recycleServerOnHighRssThresholdMb);
|
|
425
|
+
const thresholdMb = Number.isFinite(thresholdRaw) && thresholdRaw > 0 ? thresholdRaw : 4096;
|
|
426
|
+
const cooldownRaw = Number(this.config.recycleServerOnHighRssCooldownMs);
|
|
427
|
+
const cooldownMs = Number.isFinite(cooldownRaw) && cooldownRaw >= 0 ? cooldownRaw : 300000;
|
|
428
|
+
const delayRaw = Number(this.config.recycleServerOnHighRssDelayMs);
|
|
429
|
+
const delayMs = Number.isFinite(delayRaw) && delayRaw >= 0 ? delayRaw : 2000;
|
|
430
|
+
|
|
431
|
+
const now = Date.now();
|
|
432
|
+
if (this._lastHighRssRecycleAt && now - this._lastHighRssRecycleAt < cooldownMs) {
|
|
433
|
+
return false;
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
const rssMb = process.memoryUsage().rss / 1024 / 1024;
|
|
437
|
+
if (rssMb < thresholdMb) return false;
|
|
438
|
+
|
|
439
|
+
this._lastHighRssRecycleAt = now;
|
|
440
|
+
if (this._pendingHighRssRecycleTimer) {
|
|
441
|
+
clearTimeout(this._pendingHighRssRecycleTimer);
|
|
442
|
+
this._pendingHighRssRecycleTimer = null;
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
console.warn(
|
|
446
|
+
`[Indexer] High RSS after ${reason} cleanup (${rssMb.toFixed(1)}MB >= ${thresholdMb}MB); recycling server in ${(delayMs / 1000).toFixed(1)}s`
|
|
447
|
+
);
|
|
448
|
+
|
|
449
|
+
this._pendingHighRssRecycleTimer = setTimeout(() => {
|
|
450
|
+
this._pendingHighRssRecycleTimer = null;
|
|
451
|
+
this.runExplicitGc({ force: true });
|
|
452
|
+
const currentRssMb = process.memoryUsage().rss / 1024 / 1024;
|
|
453
|
+
if (currentRssMb < thresholdMb) {
|
|
454
|
+
if (this.config.verbose || this.shouldTraceIncrementalMemory()) {
|
|
455
|
+
console.info(
|
|
456
|
+
`[Indexer] High-RSS recycle canceled after ${reason}; rss dropped to ${currentRssMb.toFixed(1)}MB`
|
|
457
|
+
);
|
|
458
|
+
}
|
|
459
|
+
return;
|
|
460
|
+
}
|
|
461
|
+
console.warn(
|
|
462
|
+
`[Indexer] Recycling server process due to persistent high RSS after ${reason} (${currentRssMb.toFixed(1)}MB)`
|
|
463
|
+
);
|
|
464
|
+
process.exit(0);
|
|
465
|
+
}, delayMs);
|
|
466
|
+
|
|
467
|
+
if (typeof this._pendingHighRssRecycleTimer?.unref === 'function') {
|
|
468
|
+
this._pendingHighRssRecycleTimer.unref();
|
|
469
|
+
}
|
|
470
|
+
return true;
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
async runPostIncrementalCleanup(reason = 'watch update') {
|
|
474
|
+
if (this.config.clearCacheAfterIndex) {
|
|
475
|
+
await this.traceIncrementalMemoryPhase(`incremental.dropInMemoryVectors (${reason})`, async () => {
|
|
476
|
+
await this.cache.dropInMemoryVectors();
|
|
477
|
+
});
|
|
478
|
+
if (this.config.verbose) {
|
|
479
|
+
console.info(`[Cache] Cleared in-memory vectors after ${reason}`);
|
|
480
|
+
}
|
|
481
|
+
// Keep server RSS low after single-file updates where vector arrays can remain in old-gen.
|
|
482
|
+
await this.traceIncrementalMemoryPhase(`incremental.explicitGc (${reason})`, async () => {
|
|
483
|
+
this.runExplicitGc({ force: true });
|
|
484
|
+
});
|
|
485
|
+
} else {
|
|
486
|
+
await this.traceIncrementalMemoryPhase(`incremental.maybeRunGc (${reason})`, async () => {
|
|
487
|
+
this.maybeRunIncrementalGc(reason);
|
|
488
|
+
});
|
|
489
|
+
}
|
|
490
|
+
if (this.config.unloadModelAfterIndex) {
|
|
491
|
+
await this.traceIncrementalMemoryPhase(
|
|
492
|
+
`incremental.unloadEmbeddingModels (${reason})`,
|
|
493
|
+
async () => {
|
|
494
|
+
await this.unloadEmbeddingModels();
|
|
495
|
+
}
|
|
496
|
+
);
|
|
497
|
+
}
|
|
498
|
+
await this.traceIncrementalMemoryPhase(
|
|
499
|
+
`incremental.shutdownQueryPool (${reason})`,
|
|
500
|
+
async () => {
|
|
501
|
+
this.maybeShutdownQueryEmbeddingPool(reason);
|
|
502
|
+
}
|
|
503
|
+
);
|
|
504
|
+
if (this.config.verbose) {
|
|
505
|
+
const { rss, heapUsed, heapTotal } = process.memoryUsage();
|
|
506
|
+
const toMb = (value) => `${(value / 1024 / 1024).toFixed(1)}MB`;
|
|
507
|
+
console.info(
|
|
508
|
+
`[Indexer] Memory after ${reason} cleanup: rss=${toMb(rss)} heap=${toMb(heapUsed)}/${toMb(heapTotal)}`
|
|
509
|
+
);
|
|
510
|
+
}
|
|
511
|
+
this.maybeRecycleServerAfterIncremental(reason);
|
|
512
|
+
}
|
|
513
|
+
|
|
514
|
+
recordWorkerFailure(reason) {
|
|
515
|
+
const threshold = Number.isInteger(this.config.workerFailureThreshold)
|
|
516
|
+
? this.config.workerFailureThreshold
|
|
517
|
+
: 1;
|
|
518
|
+
const cooldownMs = Number.isInteger(this.config.workerFailureCooldownMs)
|
|
519
|
+
? this.config.workerFailureCooldownMs
|
|
520
|
+
: 10 * 60 * 1000;
|
|
521
|
+
|
|
522
|
+
this.workerFailureCount += 1;
|
|
523
|
+
console.warn(`[Indexer] Worker failure: ${reason} (${this.workerFailureCount}/${threshold})`);
|
|
524
|
+
|
|
525
|
+
if (this.workerFailureCount >= threshold) {
|
|
526
|
+
this.workersDisabledUntil = Date.now() + cooldownMs;
|
|
527
|
+
this.workerCircuitOpen = true;
|
|
528
|
+
console.warn(
|
|
529
|
+
`[Indexer] Worker circuit open; pausing worker use for ${Math.round(cooldownMs / 1000)}s`
|
|
530
|
+
);
|
|
531
|
+
this.scheduleRetry();
|
|
532
|
+
}
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
/**
|
|
536
|
+
* Initialize worker thread pool for parallel embedding
|
|
537
|
+
*/
|
|
538
|
+
async initializeWorkers() {
|
|
539
|
+
// Check if we have any active workers
|
|
540
|
+
const activeWorkers = this.workers.filter((w) => w !== null);
|
|
541
|
+
if (activeWorkers.length > 0) return;
|
|
542
|
+
|
|
543
|
+
// If we have workers array but they are all null, reset it
|
|
544
|
+
if (this.workers.length > 0) {
|
|
545
|
+
this.workers = [];
|
|
546
|
+
this.workerReady = [];
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
if (this.initWorkerPromise) return this.initWorkerPromise;
|
|
550
|
+
|
|
551
|
+
this.initWorkerPromise = (async () => {
|
|
552
|
+
try {
|
|
553
|
+
let numWorkers =
|
|
554
|
+
this.config.workerThreads === 'auto'
|
|
555
|
+
? Math.min(2, Math.max(1, os.cpus().length - 1)) // Cap 'auto' at 2 workers
|
|
556
|
+
: typeof this.config.workerThreads === 'number'
|
|
557
|
+
? this.config.workerThreads
|
|
558
|
+
: 1;
|
|
559
|
+
|
|
560
|
+
// Heavy models can consume multiple GB per worker. Keep auto mode bounded by
|
|
561
|
+
// existing memory guards below; do not hard-pin to 1 worker as it can hurt throughput.
|
|
562
|
+
if (process.platform === 'win32' && this.isHeavyEmbeddingModel() && numWorkers > 1) {
|
|
563
|
+
if (!this._heavyWorkerSafetyLogged) {
|
|
564
|
+
console.warn(
|
|
565
|
+
'[Indexer] Heavy model worker safety mode: forcing workers=1 on Windows to avoid native multi-worker crashes'
|
|
566
|
+
);
|
|
567
|
+
this._heavyWorkerSafetyLogged = true;
|
|
568
|
+
}
|
|
569
|
+
numWorkers = 1;
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
// Resource-aware scaling: check available RAM (skip in test env to avoid mocking issues)
|
|
573
|
+
// We apply this if we have > 1 worker, regardless of whether it was 'auto' or explicit
|
|
574
|
+
if (numWorkers > 1 && !isTestEnv() && typeof os.freemem === 'function') {
|
|
575
|
+
const freeMemGb = os.freemem() / 1024 / 1024 / 1024;
|
|
576
|
+
const isHeavyModel = this.isHeavyEmbeddingModel();
|
|
577
|
+
const memPerWorker = isHeavyModel ? 8.0 : 0.8;
|
|
578
|
+
|
|
579
|
+
const memCappedWorkers = Math.max(1, Math.floor(freeMemGb / memPerWorker));
|
|
580
|
+
if (memCappedWorkers < numWorkers) {
|
|
581
|
+
if (this.config.verbose) {
|
|
582
|
+
console.info(
|
|
583
|
+
`[Indexer] Throttling workers from ${numWorkers} to ${memCappedWorkers} due to available RAM (${freeMemGb.toFixed(1)}GB)`
|
|
584
|
+
);
|
|
585
|
+
}
|
|
586
|
+
numWorkers = memCappedWorkers;
|
|
587
|
+
}
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
// Hard memory ceiling: disable workers if projected RSS risks OOM
|
|
591
|
+
if (!isTestEnv() && typeof os.totalmem === 'function') {
|
|
592
|
+
const totalMemGb = os.totalmem() / 1024 / 1024 / 1024;
|
|
593
|
+
const rssGb = process.memoryUsage().rss / 1024 / 1024 / 1024;
|
|
594
|
+
const isHeavyModel = this.isHeavyEmbeddingModel();
|
|
595
|
+
const memPerWorker = isHeavyModel ? 8.0 : 0.8;
|
|
596
|
+
const projectedGb = rssGb + numWorkers * memPerWorker + 0.5; // 0.5GB headroom
|
|
597
|
+
const ceilingGb = totalMemGb * 0.85;
|
|
598
|
+
if (numWorkers > 0 && projectedGb > ceilingGb) {
|
|
599
|
+
if (this.config.verbose) {
|
|
600
|
+
console.info(
|
|
601
|
+
`[Indexer] Disabling workers to avoid OOM: projected=${projectedGb.toFixed(1)}GB ceiling=${ceilingGb.toFixed(1)}GB rss=${rssGb.toFixed(1)}GB total=${totalMemGb.toFixed(1)}GB`
|
|
602
|
+
);
|
|
603
|
+
}
|
|
604
|
+
numWorkers = 0;
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
|
|
608
|
+
// Use workers even for single worker to benefit from --expose-gc and separate heap
|
|
609
|
+
if (numWorkers < 1) {
|
|
610
|
+
console.info(
|
|
611
|
+
'[Indexer] No workers configured, using main thread (warning: higher RAM usage)'
|
|
612
|
+
);
|
|
613
|
+
return;
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
if (this.config.verbose) {
|
|
617
|
+
console.info(
|
|
618
|
+
`[Indexer] Worker config: workerThreads=${this.config.workerThreads}, resolved to ${numWorkers}`
|
|
619
|
+
);
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
// Force 1 thread per worker to prevent CPU saturation (ONNX is very aggressive)
|
|
623
|
+
const threadsPerWorker = 1;
|
|
624
|
+
|
|
625
|
+
console.info(
|
|
626
|
+
`[Indexer] Initializing ${numWorkers} worker threads (${threadsPerWorker} threads per worker)...`
|
|
627
|
+
);
|
|
628
|
+
|
|
629
|
+
const workerInferenceBatchSize = this.getWorkerInferenceBatchSize({ numWorkers });
|
|
630
|
+
if (this.config.verbose && Number.isInteger(workerInferenceBatchSize)) {
|
|
631
|
+
console.info(
|
|
632
|
+
`[Indexer] Worker inference batch size: ${workerInferenceBatchSize}`
|
|
633
|
+
);
|
|
634
|
+
}
|
|
635
|
+
|
|
636
|
+
for (let i = 0; i < numWorkers; i++) {
|
|
637
|
+
try {
|
|
638
|
+
const worker = new Worker(new URL('../lib/embedding-worker.js', import.meta.url), {
|
|
639
|
+
workerData: {
|
|
640
|
+
workerId: i,
|
|
641
|
+
embeddingModel: this.config.embeddingModel,
|
|
642
|
+
embeddingDimension: this.config.embeddingDimension || null,
|
|
643
|
+
verbose: this.config.verbose,
|
|
644
|
+
numThreads: threadsPerWorker,
|
|
645
|
+
searchDirectory: this.config.searchDirectory,
|
|
646
|
+
maxFileSize: this.config.maxFileSize,
|
|
647
|
+
callGraphEnabled: this.config.callGraphEnabled,
|
|
648
|
+
enableExplicitGc: this.config.enableExplicitGc,
|
|
649
|
+
failFastEmbeddingErrors: this.config.failFastEmbeddingErrors === true,
|
|
650
|
+
inferenceBatchSize: workerInferenceBatchSize,
|
|
651
|
+
},
|
|
652
|
+
});
|
|
653
|
+
|
|
654
|
+
const readyPromise = new Promise((resolve, reject) => {
|
|
655
|
+
const readyTimeoutMs = isTestEnv() ? 1000 : 120000;
|
|
656
|
+
const timeout = setTimeout(
|
|
657
|
+
() => reject(new Error('Worker init timeout')),
|
|
658
|
+
readyTimeoutMs
|
|
659
|
+
);
|
|
660
|
+
|
|
661
|
+
worker.once('message', (msg) => {
|
|
662
|
+
clearTimeout(timeout);
|
|
663
|
+
if (msg.type === 'ready') {
|
|
664
|
+
resolve(worker);
|
|
665
|
+
} else if (msg.type === 'error') {
|
|
666
|
+
console.warn(`[Indexer] Worker initialization failed: ${msg.error}`);
|
|
667
|
+
reject(new Error(msg.error));
|
|
668
|
+
}
|
|
669
|
+
});
|
|
670
|
+
|
|
671
|
+
worker.once('error', (err) => {
|
|
672
|
+
clearTimeout(timeout);
|
|
673
|
+
console.warn(`[Indexer] Worker initialization failed: ${err.message}`);
|
|
674
|
+
reject(err);
|
|
675
|
+
});
|
|
676
|
+
});
|
|
677
|
+
|
|
678
|
+
this.workers.push(worker);
|
|
679
|
+
this.workerReady.push(readyPromise);
|
|
680
|
+
} catch (err) {
|
|
681
|
+
console.warn(`[Indexer] Failed to create worker ${i}: ${err.message}`);
|
|
682
|
+
}
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
// Wait for all workers to be ready
|
|
686
|
+
try {
|
|
687
|
+
await Promise.all(this.workerReady);
|
|
688
|
+
console.info(`[Indexer] ${this.workers.length} workers ready`);
|
|
689
|
+
if (this.config.verbose) {
|
|
690
|
+
console.info(`[Indexer] Each worker loaded model: ${this.config.embeddingModel}`);
|
|
691
|
+
}
|
|
692
|
+
} catch (err) {
|
|
693
|
+
console.warn(
|
|
694
|
+
`[Indexer] Worker initialization failed: ${err.message}, falling back to single-threaded`
|
|
695
|
+
);
|
|
696
|
+
await this.terminateWorkers();
|
|
697
|
+
}
|
|
698
|
+
} finally {
|
|
699
|
+
this.initWorkerPromise = null;
|
|
700
|
+
}
|
|
701
|
+
})();
|
|
702
|
+
return this.initWorkerPromise;
|
|
703
|
+
}
|
|
704
|
+
|
|
705
|
+
/**
|
|
706
|
+
* Terminate all worker threads
|
|
707
|
+
*/
|
|
708
|
+
async terminateWorkers() {
|
|
709
|
+
const WORKER_SHUTDOWN_TIMEOUT = isTestEnv() ? 50 : 5000;
|
|
710
|
+
const terminations = this.workers.filter(Boolean).map((worker) => {
|
|
711
|
+
try {
|
|
712
|
+
worker.postMessage({ type: 'shutdown' });
|
|
713
|
+
} catch {
|
|
714
|
+
/* ignore */
|
|
715
|
+
}
|
|
716
|
+
|
|
717
|
+
let exited = false;
|
|
718
|
+
const exitPromise = new Promise((resolve) => {
|
|
719
|
+
worker.once('exit', () => {
|
|
720
|
+
exited = true;
|
|
721
|
+
resolve();
|
|
722
|
+
});
|
|
723
|
+
});
|
|
724
|
+
const timeoutPromise = delay(WORKER_SHUTDOWN_TIMEOUT);
|
|
725
|
+
|
|
726
|
+
return Promise.race([exitPromise, timeoutPromise]).then(() => {
|
|
727
|
+
if (!exited) {
|
|
728
|
+
const termination = worker.terminate?.();
|
|
729
|
+
return Promise.resolve(termination).catch(() => null);
|
|
730
|
+
}
|
|
731
|
+
return null;
|
|
732
|
+
});
|
|
733
|
+
});
|
|
734
|
+
await Promise.all(terminations);
|
|
735
|
+
this.workers = [];
|
|
736
|
+
this.workerReady = [];
|
|
737
|
+
}
|
|
738
|
+
|
|
739
|
+
/**
|
|
740
|
+
* Send unload message to all workers to free their model memory.
|
|
741
|
+
* This keeps workers alive but releases the embedding model from RAM.
|
|
742
|
+
*/
|
|
743
|
+
async unloadWorkersModels() {
|
|
744
|
+
if (this.workers.length === 0) return { unloaded: 0 };
|
|
745
|
+
|
|
746
|
+
const UNLOAD_TIMEOUT = 10000;
|
|
747
|
+
let unloadedCount = 0;
|
|
748
|
+
|
|
749
|
+
const unloadPromises = this.workers.filter(Boolean).map((worker, idx) => {
|
|
750
|
+
return new Promise((resolve) => {
|
|
751
|
+
const timeout = setTimeout(() => {
|
|
752
|
+
if (this.config.verbose) {
|
|
753
|
+
console.warn(`[Indexer] Worker ${idx} unload timed out`);
|
|
754
|
+
}
|
|
755
|
+
resolve(false);
|
|
756
|
+
}, UNLOAD_TIMEOUT);
|
|
757
|
+
|
|
758
|
+
const handler = (msg) => {
|
|
759
|
+
if (msg?.type === 'unload-complete') {
|
|
760
|
+
clearTimeout(timeout);
|
|
761
|
+
worker.off('message', handler);
|
|
762
|
+
if (msg.wasLoaded) unloadedCount++;
|
|
763
|
+
resolve(true);
|
|
764
|
+
}
|
|
765
|
+
};
|
|
766
|
+
|
|
767
|
+
worker.on('message', handler);
|
|
768
|
+
try {
|
|
769
|
+
worker.postMessage({ type: 'unload' });
|
|
770
|
+
} catch (err) {
|
|
771
|
+
clearTimeout(timeout);
|
|
772
|
+
worker.off('message', handler);
|
|
773
|
+
if (this.config.verbose) {
|
|
774
|
+
console.warn(`[Indexer] Failed to send unload to worker ${idx}: ${err.message}`);
|
|
775
|
+
}
|
|
776
|
+
resolve(false);
|
|
777
|
+
}
|
|
778
|
+
});
|
|
779
|
+
});
|
|
780
|
+
|
|
781
|
+
await Promise.all(unloadPromises);
|
|
782
|
+
|
|
783
|
+
if (this.config.verbose) {
|
|
784
|
+
console.info(`[Indexer] Unloaded models from ${unloadedCount} workers`);
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
return { unloaded: unloadedCount };
|
|
788
|
+
}
|
|
789
|
+
|
|
790
|
+
/**
|
|
791
|
+
* Send unload message to the embedding child process.
|
|
792
|
+
* This frees the embedding model from RAM in the child process.
|
|
793
|
+
*/
|
|
794
|
+
async unloadEmbeddingChildModel() {
|
|
795
|
+
const child = this._embeddingChild;
|
|
796
|
+
if (!child) return { success: true, wasLoaded: false };
|
|
797
|
+
|
|
798
|
+
return new Promise((resolve) => {
|
|
799
|
+
const timeout = setTimeout(() => {
|
|
800
|
+
if (this.config.verbose) {
|
|
801
|
+
console.warn('[Indexer] Embedding child unload timed out');
|
|
802
|
+
}
|
|
803
|
+
resolve({ success: false, timeout: true });
|
|
804
|
+
}, 10000);
|
|
805
|
+
|
|
806
|
+
const onData = (data) => {
|
|
807
|
+
try {
|
|
808
|
+
const lines = data.toString().split('\n').filter(Boolean);
|
|
809
|
+
for (const line of lines) {
|
|
810
|
+
const parsed = JSON.parse(line);
|
|
811
|
+
if (parsed?.success !== undefined) {
|
|
812
|
+
clearTimeout(timeout);
|
|
813
|
+
child.stdout.off('data', onData);
|
|
814
|
+
resolve(parsed);
|
|
815
|
+
return;
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
} catch {
|
|
819
|
+
// Not JSON or incomplete, keep waiting
|
|
820
|
+
}
|
|
821
|
+
};
|
|
822
|
+
|
|
823
|
+
child.stdout.on('data', onData);
|
|
824
|
+
|
|
825
|
+
try {
|
|
826
|
+
child.stdin.write(`${JSON.stringify({ type: 'unload' })}\n`);
|
|
827
|
+
} catch (err) {
|
|
828
|
+
clearTimeout(timeout);
|
|
829
|
+
child.stdout.off('data', onData);
|
|
830
|
+
if (this.config.verbose) {
|
|
831
|
+
console.warn(`[Indexer] Failed to send unload to child: ${err.message}`);
|
|
832
|
+
}
|
|
833
|
+
resolve({ success: false, error: err.message });
|
|
834
|
+
}
|
|
835
|
+
});
|
|
836
|
+
}
|
|
837
|
+
|
|
838
|
+
/**
|
|
839
|
+
* Unload embedding models from all sources (workers and child process) to free RAM.
|
|
840
|
+
* This is called after indexing when unloadModelAfterIndex is enabled.
|
|
841
|
+
*/
|
|
842
|
+
async unloadEmbeddingModels() {
|
|
843
|
+
const results = { workers: 0, childUnloaded: false };
|
|
844
|
+
|
|
845
|
+
// Unload from workers (or terminate them - termination also frees memory)
|
|
846
|
+
if (this.workers.length > 0) {
|
|
847
|
+
// Terminating workers is more reliable than unloading in-place
|
|
848
|
+
// since it fully releases the ONNX runtime memory
|
|
849
|
+
if (this.config.verbose) {
|
|
850
|
+
console.info(`[Indexer] Terminating ${this.workers.length} workers to free model memory`);
|
|
851
|
+
}
|
|
852
|
+
await this.terminateWorkers();
|
|
853
|
+
results.workers = this.workers.length;
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
// Unload from persistent embedding child process
|
|
857
|
+
if (this._embeddingChild) {
|
|
858
|
+
const childResult = await this.unloadEmbeddingChildModel();
|
|
859
|
+
results.childUnloaded = childResult?.wasLoaded || false;
|
|
860
|
+
if (this.config.verbose) {
|
|
861
|
+
console.info(`[Indexer] Embedding child model unloaded: ${results.childUnloaded}`);
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
|
|
865
|
+
// Trigger GC in main process if configured
|
|
866
|
+
if (this.isExplicitGcEnabled()) {
|
|
867
|
+
const before = process.memoryUsage();
|
|
868
|
+
this.runExplicitGc({ force: true });
|
|
869
|
+
const after = process.memoryUsage();
|
|
870
|
+
if (this.config.verbose) {
|
|
871
|
+
console.info(
|
|
872
|
+
`[Indexer] Post-unload GC: rss ${(before.rss / 1024 / 1024).toFixed(1)}MB -> ${(after.rss / 1024 / 1024).toFixed(1)}MB`
|
|
873
|
+
);
|
|
874
|
+
}
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
return results;
|
|
878
|
+
}
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
async loadGitignore() {
|
|
882
|
+
if (!this.config.searchDirectory) {
|
|
883
|
+
this.gitignore = ignore();
|
|
884
|
+
return;
|
|
885
|
+
}
|
|
886
|
+
try {
|
|
887
|
+
const gitignorePath = path.join(this.config.searchDirectory, '.gitignore');
|
|
888
|
+
const content = await fs.readFile(gitignorePath, 'utf8');
|
|
889
|
+
this.gitignore = ignore().add(content);
|
|
890
|
+
if (this.config.verbose) console.info('[Indexer] Loaded .gitignore rules');
|
|
891
|
+
} catch (_e) {
|
|
892
|
+
// No .gitignore or error reading it
|
|
893
|
+
this.gitignore = ignore();
|
|
894
|
+
}
|
|
895
|
+
}
|
|
896
|
+
|
|
897
|
+
getCacheRelativePath() {
|
|
898
|
+
if (!this.config.cacheDirectory || !this.config.searchDirectory) return null;
|
|
899
|
+
const relative = path.relative(this.config.searchDirectory, this.config.cacheDirectory);
|
|
900
|
+
if (!relative || relative.startsWith('..') || path.isAbsolute(relative)) return null;
|
|
901
|
+
return normalizePath(relative);
|
|
902
|
+
}
|
|
903
|
+
|
|
904
|
+
isExcluded(filePath) {
|
|
905
|
+
if (!filePath || typeof filePath !== 'string') {
|
|
906
|
+
return false;
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
let relative = filePath;
|
|
910
|
+
if (path.isAbsolute(filePath)) {
|
|
911
|
+
if (this.config.searchDirectory) {
|
|
912
|
+
relative = path.relative(this.config.searchDirectory, filePath);
|
|
913
|
+
if (!relative || relative.startsWith('..') || path.isAbsolute(relative)) {
|
|
914
|
+
return false;
|
|
915
|
+
}
|
|
916
|
+
} else {
|
|
917
|
+
const root = path.parse(filePath).root;
|
|
918
|
+
relative = filePath.slice(root.length);
|
|
919
|
+
}
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
relative = normalizePath(relative);
|
|
923
|
+
|
|
924
|
+
if (matchesExcludePatterns(relative, this.excludeMatchers)) return true;
|
|
925
|
+
|
|
926
|
+
if (this.gitignore.ignores(relative)) return true;
|
|
927
|
+
|
|
928
|
+
return false;
|
|
929
|
+
}
|
|
930
|
+
|
|
931
|
+
async replaceDeadWorker(index) {
|
|
932
|
+
if (this.config.verbose) console.info(`[Indexer] Replacing dead worker at index ${index}...`);
|
|
933
|
+
|
|
934
|
+
// Use 1 thread per worker to match initializeWorkers and prevent CPU saturation
|
|
935
|
+
const threadsPerWorker = 1;
|
|
936
|
+
const activeWorkerCount = this.workers.filter(Boolean).length || 1;
|
|
937
|
+
const workerInferenceBatchSize = this.getWorkerInferenceBatchSize({
|
|
938
|
+
numWorkers: activeWorkerCount,
|
|
939
|
+
});
|
|
940
|
+
const newWorker = new Worker(new URL('../lib/embedding-worker.js', import.meta.url), {
|
|
941
|
+
workerData: {
|
|
942
|
+
workerId: index,
|
|
943
|
+
embeddingModel: this.config.embeddingModel,
|
|
944
|
+
embeddingDimension: this.config.embeddingDimension || null,
|
|
945
|
+
verbose: this.config.verbose,
|
|
946
|
+
numThreads: threadsPerWorker,
|
|
947
|
+
searchDirectory: this.config.searchDirectory,
|
|
948
|
+
maxFileSize: this.config.maxFileSize,
|
|
949
|
+
callGraphEnabled: this.config.callGraphEnabled,
|
|
950
|
+
enableExplicitGc: this.config.enableExplicitGc,
|
|
951
|
+
failFastEmbeddingErrors: this.config.failFastEmbeddingErrors === true,
|
|
952
|
+
inferenceBatchSize: workerInferenceBatchSize,
|
|
953
|
+
},
|
|
954
|
+
});
|
|
955
|
+
|
|
956
|
+
// Wait for ready
|
|
957
|
+
await new Promise((resolve, reject) => {
|
|
958
|
+
const timeout = setTimeout(() => reject(new Error('Timeout')), 30000);
|
|
959
|
+
newWorker.once('message', (msg) => {
|
|
960
|
+
if (msg.type === 'ready') {
|
|
961
|
+
clearTimeout(timeout);
|
|
962
|
+
resolve();
|
|
963
|
+
}
|
|
964
|
+
});
|
|
965
|
+
newWorker.once('error', (err) => {
|
|
966
|
+
clearTimeout(timeout);
|
|
967
|
+
reject(err);
|
|
968
|
+
});
|
|
969
|
+
});
|
|
970
|
+
|
|
971
|
+
this.workers[index] = newWorker;
|
|
972
|
+
if (this.config.verbose) console.info(`[Indexer] Worker ${index} respawned successfully`);
|
|
973
|
+
}
|
|
974
|
+
|
|
975
|
+
/**
|
|
976
|
+
* Send MCP progress notification to connected clients
|
|
977
|
+
*/
|
|
978
|
+
sendProgress(progress, total, message) {
|
|
979
|
+
if (this.server) {
|
|
980
|
+
try {
|
|
981
|
+
this.server.sendNotification('notifications/progress', {
|
|
982
|
+
progressToken: 'indexing',
|
|
983
|
+
progress,
|
|
984
|
+
total,
|
|
985
|
+
message,
|
|
986
|
+
});
|
|
987
|
+
} catch (_err) {
|
|
988
|
+
// Silently ignore if client doesn't support progress notifications
|
|
989
|
+
}
|
|
990
|
+
}
|
|
991
|
+
this.writeProgressFile(progress, total, message).catch(() => null);
|
|
992
|
+
}
|
|
993
|
+
|
|
994
|
+
async writeProgressFile(progress, total, message) {
|
|
995
|
+
if (!this.config.enableCache) return;
|
|
996
|
+
|
|
997
|
+
const payload = {
|
|
998
|
+
progress,
|
|
999
|
+
total,
|
|
1000
|
+
message,
|
|
1001
|
+
updatedAt: new Date().toISOString(),
|
|
1002
|
+
indexMode: this.currentIndexMode || null,
|
|
1003
|
+
workerCircuitOpen: !!this.workerCircuitOpen,
|
|
1004
|
+
workersDisabledUntil: Number.isFinite(this.workersDisabledUntil)
|
|
1005
|
+
? this.workersDisabledUntil
|
|
1006
|
+
: null,
|
|
1007
|
+
};
|
|
1008
|
+
|
|
1009
|
+
const prev = this._lastProgress;
|
|
1010
|
+
if (
|
|
1011
|
+
prev &&
|
|
1012
|
+
prev.progress === payload.progress &&
|
|
1013
|
+
prev.total === payload.total &&
|
|
1014
|
+
prev.message === payload.message
|
|
1015
|
+
) {
|
|
1016
|
+
return;
|
|
1017
|
+
}
|
|
1018
|
+
|
|
1019
|
+
this._lastProgress = payload;
|
|
1020
|
+
try {
|
|
1021
|
+
await fs.mkdir(this.config.cacheDirectory, { recursive: true });
|
|
1022
|
+
const progressPath = path.join(this.config.cacheDirectory, 'progress.json');
|
|
1023
|
+
await fs.writeFile(progressPath, JSON.stringify(payload), 'utf-8');
|
|
1024
|
+
} catch {
|
|
1025
|
+
// ignore progress write errors
|
|
1026
|
+
}
|
|
1027
|
+
}
|
|
1028
|
+
|
|
1029
|
+
async processFilesWithWorkers(allFiles) {
|
|
1030
|
+
const allowedFiles = [];
|
|
1031
|
+
for (const entry of allFiles) {
|
|
1032
|
+
if (await this.isPathInsideWorkspaceReal(entry.file)) {
|
|
1033
|
+
allowedFiles.push(entry);
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
1036
|
+
if (allowedFiles.length !== allFiles.length) {
|
|
1037
|
+
console.warn(
|
|
1038
|
+
`[Indexer] Skipping ${allFiles.length - allowedFiles.length} file(s) outside workspace`
|
|
1039
|
+
);
|
|
1040
|
+
}
|
|
1041
|
+
if (allowedFiles.length === 0) {
|
|
1042
|
+
return [];
|
|
1043
|
+
}
|
|
1044
|
+
|
|
1045
|
+
// Wait for any pending worker replacements to complete before distributing work
|
|
1046
|
+
if (this._workerReplacementPromises && this._workerReplacementPromises.size > 0) {
|
|
1047
|
+
await Promise.all(this._workerReplacementPromises.values());
|
|
1048
|
+
}
|
|
1049
|
+
|
|
1050
|
+
const activeWorkers = this.workers
|
|
1051
|
+
.map((worker, index) => ({ worker, index }))
|
|
1052
|
+
.filter((entry) => entry.worker);
|
|
1053
|
+
|
|
1054
|
+
if (activeWorkers.length === 0) {
|
|
1055
|
+
// Fallback: This method shouldn't be called if workers aren't available,
|
|
1056
|
+
// but if it is, we return empty and let the caller handle legacy fallback.
|
|
1057
|
+
return [];
|
|
1058
|
+
}
|
|
1059
|
+
|
|
1060
|
+
const results = [];
|
|
1061
|
+
const chunkSize = Math.ceil(allowedFiles.length / activeWorkers.length);
|
|
1062
|
+
const workerPromises = [];
|
|
1063
|
+
const configuredTimeout = Number.isInteger(this.config.workerBatchTimeoutMs)
|
|
1064
|
+
? this.config.workerBatchTimeoutMs
|
|
1065
|
+
: 300000;
|
|
1066
|
+
const WORKER_TIMEOUT = isTestEnv() ? 1000 : configuredTimeout;
|
|
1067
|
+
|
|
1068
|
+
for (let i = 0; i < activeWorkers.length; i++) {
|
|
1069
|
+
const { worker, index: workerIndex } = activeWorkers[i];
|
|
1070
|
+
const workerFiles = allowedFiles.slice(i * chunkSize, (i + 1) * chunkSize);
|
|
1071
|
+
if (workerFiles.length === 0) continue;
|
|
1072
|
+
|
|
1073
|
+
if (this.config.verbose) {
|
|
1074
|
+
console.info(`[Indexer] Worker ${workerIndex}: processing ${workerFiles.length} files`);
|
|
1075
|
+
}
|
|
1076
|
+
|
|
1077
|
+
const promise = new Promise((resolve) => {
|
|
1078
|
+
const batchId = `file-batch-${i}-${Date.now()}`;
|
|
1079
|
+
const batchResults = [];
|
|
1080
|
+
let workerKilled = false; // Atomic guard against duplicate kills
|
|
1081
|
+
|
|
1082
|
+
const killWorker = async () => {
|
|
1083
|
+
// Atomic guard: prevent concurrent killWorker calls for same worker
|
|
1084
|
+
if (workerKilled || this.workers[workerIndex] === null) return;
|
|
1085
|
+
workerKilled = true;
|
|
1086
|
+
this.workers[workerIndex] = null; // Mark as dead immediately before async work
|
|
1087
|
+
try {
|
|
1088
|
+
await worker.terminate?.();
|
|
1089
|
+
} catch (_err) {
|
|
1090
|
+
// ignore termination errors
|
|
1091
|
+
}
|
|
1092
|
+
// Track worker replacement to prevent concurrent replacements for the same slot
|
|
1093
|
+
if (!this._workerReplacementPromises) {
|
|
1094
|
+
this._workerReplacementPromises = new Map();
|
|
1095
|
+
}
|
|
1096
|
+
if (!this._workerReplacementPromises.has(workerIndex)) {
|
|
1097
|
+
// Use IIFE to ensure cleanup happens in finally block even on sync errors
|
|
1098
|
+
const replacement = (async () => {
|
|
1099
|
+
try {
|
|
1100
|
+
await this.replaceDeadWorker(workerIndex);
|
|
1101
|
+
} catch (err) {
|
|
1102
|
+
console.warn(`[Indexer] Failed to replace worker ${workerIndex}: ${err.message}`);
|
|
1103
|
+
} finally {
|
|
1104
|
+
this._workerReplacementPromises.delete(workerIndex);
|
|
1105
|
+
}
|
|
1106
|
+
})();
|
|
1107
|
+
this._workerReplacementPromises.set(workerIndex, replacement);
|
|
1108
|
+
}
|
|
1109
|
+
};
|
|
1110
|
+
|
|
1111
|
+
const handleTimeout = () => {
|
|
1112
|
+
// Terminate first to ensure no more messages arrive
|
|
1113
|
+
void killWorker();
|
|
1114
|
+
worker.off('message', handler);
|
|
1115
|
+
worker.off('error', errorHandler);
|
|
1116
|
+
console.warn(`[Indexer] Worker ${workerIndex} timed out (files)`);
|
|
1117
|
+
this.recordWorkerFailure(`timeout (batch ${batchId})`);
|
|
1118
|
+
resolve([]);
|
|
1119
|
+
};
|
|
1120
|
+
|
|
1121
|
+
let timeout = setTimeout(handleTimeout, WORKER_TIMEOUT);
|
|
1122
|
+
|
|
1123
|
+
const finalize = (results) => {
|
|
1124
|
+
clearTimeout(timeout);
|
|
1125
|
+
worker.off('message', handler);
|
|
1126
|
+
worker.off('error', errorHandler);
|
|
1127
|
+
resolve(results);
|
|
1128
|
+
};
|
|
1129
|
+
|
|
1130
|
+
const handler = (msg) => {
|
|
1131
|
+
if (msg.batchId === batchId) {
|
|
1132
|
+
if (msg.type === 'results') {
|
|
1133
|
+
if (Array.isArray(msg.results)) {
|
|
1134
|
+
batchResults.push(...msg.results);
|
|
1135
|
+
}
|
|
1136
|
+
if (msg.done) {
|
|
1137
|
+
finalize(batchResults);
|
|
1138
|
+
}
|
|
1139
|
+
} else if (msg.type === 'error') {
|
|
1140
|
+
finalize([]);
|
|
1141
|
+
}
|
|
1142
|
+
}
|
|
1143
|
+
};
|
|
1144
|
+
|
|
1145
|
+
const errorHandler = (err) => {
|
|
1146
|
+
console.warn(`[Indexer] Worker ${workerIndex} crashed: ${err.message}`);
|
|
1147
|
+
this.recordWorkerFailure(`crash (${err.message})`);
|
|
1148
|
+
void killWorker();
|
|
1149
|
+
finalize([]);
|
|
1150
|
+
};
|
|
1151
|
+
|
|
1152
|
+
worker.once('error', errorHandler);
|
|
1153
|
+
worker.on('message', handler);
|
|
1154
|
+
|
|
1155
|
+
try {
|
|
1156
|
+
worker.postMessage({
|
|
1157
|
+
type: 'processFiles',
|
|
1158
|
+
files: workerFiles,
|
|
1159
|
+
batchId,
|
|
1160
|
+
chunkConfig: this.config,
|
|
1161
|
+
});
|
|
1162
|
+
} catch (_error) {
|
|
1163
|
+
finalize([]);
|
|
1164
|
+
}
|
|
1165
|
+
});
|
|
1166
|
+
|
|
1167
|
+
workerPromises.push({ promise, files: workerFiles });
|
|
1168
|
+
}
|
|
1169
|
+
|
|
1170
|
+
const workerResults = await Promise.all(workerPromises.map((p) => p.promise));
|
|
1171
|
+
|
|
1172
|
+
// Identify failed files for retry
|
|
1173
|
+
const failedFiles = [];
|
|
1174
|
+
for (let i = 0; i < workerResults.length; i++) {
|
|
1175
|
+
if (workerResults[i].length > 0) {
|
|
1176
|
+
results.push(...workerResults[i]);
|
|
1177
|
+
} else if (workerPromises[i].files.length > 0) {
|
|
1178
|
+
failedFiles.push(...workerPromises[i].files);
|
|
1179
|
+
}
|
|
1180
|
+
}
|
|
1181
|
+
|
|
1182
|
+
// Pass failed files back to be handled by legacy path
|
|
1183
|
+
if (failedFiles.length > 0) {
|
|
1184
|
+
if (this.config.verbose) {
|
|
1185
|
+
console.warn(
|
|
1186
|
+
`[Indexer] ${failedFiles.length} files failed in workers, falling back to main thread`
|
|
1187
|
+
);
|
|
1188
|
+
}
|
|
1189
|
+
// Mark these as failed in the results so the caller knows to process them manually
|
|
1190
|
+
for (const f of failedFiles) {
|
|
1191
|
+
results.push({ file: f.file, status: 'retry' });
|
|
1192
|
+
}
|
|
1193
|
+
}
|
|
1194
|
+
|
|
1195
|
+
return results;
|
|
1196
|
+
}
|
|
1197
|
+
|
|
1198
|
+
/**
|
|
1199
|
+
* Process chunks using worker thread pool with timeout and error recovery
|
|
1200
|
+
*/
|
|
1201
|
+
async processChunksWithWorkers(allChunks) {
|
|
1202
|
+
const activeWorkers = this.workers
|
|
1203
|
+
.map((worker, index) => ({ worker, index }))
|
|
1204
|
+
.filter((entry) => entry.worker);
|
|
1205
|
+
|
|
1206
|
+
if (activeWorkers.length === 0) {
|
|
1207
|
+
// Fallback to single-threaded processing
|
|
1208
|
+
return this.processChunksSingleThreaded(allChunks);
|
|
1209
|
+
}
|
|
1210
|
+
|
|
1211
|
+
const results = [];
|
|
1212
|
+
const allowSingleThreadFallback = this.config.allowSingleThreadFallback !== false;
|
|
1213
|
+
const chunkSize = Math.ceil(allChunks.length / activeWorkers.length);
|
|
1214
|
+
const workerPromises = [];
|
|
1215
|
+
const configuredTimeout = Number.isInteger(this.config.workerBatchTimeoutMs)
|
|
1216
|
+
? this.config.workerBatchTimeoutMs
|
|
1217
|
+
: 300000;
|
|
1218
|
+
const WORKER_TIMEOUT = isTestEnv() ? 1000 : configuredTimeout; // 1s in tests, configurable in prod
|
|
1219
|
+
|
|
1220
|
+
if (this.config.verbose) {
|
|
1221
|
+
console.info(
|
|
1222
|
+
`[Indexer] Distributing ${allChunks.length} chunks across ${activeWorkers.length} workers (~${chunkSize} chunks each)`
|
|
1223
|
+
);
|
|
1224
|
+
}
|
|
1225
|
+
|
|
1226
|
+
for (let i = 0; i < activeWorkers.length; i++) {
|
|
1227
|
+
const { worker, index: workerIndex } = activeWorkers[i];
|
|
1228
|
+
const workerChunks = allChunks.slice(i * chunkSize, (i + 1) * chunkSize);
|
|
1229
|
+
if (workerChunks.length === 0) continue;
|
|
1230
|
+
|
|
1231
|
+
if (this.config.verbose) {
|
|
1232
|
+
console.info(`[Indexer] Worker ${workerIndex}: processing ${workerChunks.length} chunks`);
|
|
1233
|
+
}
|
|
1234
|
+
|
|
1235
|
+
const promise = new Promise((resolve, _reject) => {
|
|
1236
|
+
const batchId = `batch-${i}-${Date.now()}`;
|
|
1237
|
+
const batchResults = [];
|
|
1238
|
+
let workerKilled = false; // Atomic guard against duplicate kills
|
|
1239
|
+
|
|
1240
|
+
// Timeout handler
|
|
1241
|
+
const killWorker = async () => {
|
|
1242
|
+
// Atomic guard: prevent concurrent killWorker calls for same worker
|
|
1243
|
+
if (workerKilled || this.workers[workerIndex] === null) return;
|
|
1244
|
+
workerKilled = true;
|
|
1245
|
+
this.workers[workerIndex] = null; // Mark as dead immediately before async work
|
|
1246
|
+
try {
|
|
1247
|
+
await worker.terminate?.();
|
|
1248
|
+
} catch {
|
|
1249
|
+
// ignore terminate errors
|
|
1250
|
+
}
|
|
1251
|
+
|
|
1252
|
+
// Track worker replacement to prevent concurrent replacements for the same slot
|
|
1253
|
+
if (!this._workerReplacementPromises) {
|
|
1254
|
+
this._workerReplacementPromises = new Map();
|
|
1255
|
+
}
|
|
1256
|
+
if (!this._workerReplacementPromises.has(workerIndex)) {
|
|
1257
|
+
const replacement = this.replaceDeadWorker(workerIndex)
|
|
1258
|
+
.catch((err) => {
|
|
1259
|
+
console.warn(`[Indexer] Failed to replace worker ${workerIndex}: ${err.message}`);
|
|
1260
|
+
})
|
|
1261
|
+
.finally(() => {
|
|
1262
|
+
this._workerReplacementPromises.delete(workerIndex);
|
|
1263
|
+
});
|
|
1264
|
+
this._workerReplacementPromises.set(workerIndex, replacement);
|
|
1265
|
+
}
|
|
1266
|
+
};
|
|
1267
|
+
|
|
1268
|
+
const handleTimeout = (label) => {
|
|
1269
|
+
// Terminate first to ensure no more messages arrive
|
|
1270
|
+
void killWorker();
|
|
1271
|
+
worker.off('message', handler);
|
|
1272
|
+
worker.off('error', errorHandler);
|
|
1273
|
+
if (exitHandler) worker.off('exit', exitHandler);
|
|
1274
|
+
console.warn(`[Indexer] Worker ${workerIndex} timed out, ${label}`);
|
|
1275
|
+
this.recordWorkerFailure(`timeout (batch ${batchId})`);
|
|
1276
|
+
// Return empty and let fallback handle it
|
|
1277
|
+
resolve([]);
|
|
1278
|
+
};
|
|
1279
|
+
|
|
1280
|
+
let timeout = setTimeout(
|
|
1281
|
+
() => handleTimeout('killing worker and falling back to single-threaded for this batch'),
|
|
1282
|
+
WORKER_TIMEOUT
|
|
1283
|
+
);
|
|
1284
|
+
|
|
1285
|
+
const resetTimeout = () => {
|
|
1286
|
+
clearTimeout(timeout);
|
|
1287
|
+
timeout = setTimeout(
|
|
1288
|
+
() =>
|
|
1289
|
+
handleTimeout('killing worker and falling back to single-threaded for this batch'),
|
|
1290
|
+
WORKER_TIMEOUT
|
|
1291
|
+
);
|
|
1292
|
+
};
|
|
1293
|
+
|
|
1294
|
+
let exitHandler;
|
|
1295
|
+
|
|
1296
|
+
const finalize = (results) => {
|
|
1297
|
+
clearTimeout(timeout);
|
|
1298
|
+
worker.off('message', handler);
|
|
1299
|
+
worker.off('error', errorHandler);
|
|
1300
|
+
if (exitHandler) worker.off('exit', exitHandler);
|
|
1301
|
+
resolve(results);
|
|
1302
|
+
};
|
|
1303
|
+
|
|
1304
|
+
const handler = (msg) => {
|
|
1305
|
+
if (msg.batchId === batchId) {
|
|
1306
|
+
resetTimeout();
|
|
1307
|
+
if (msg.type === 'results') {
|
|
1308
|
+
if (Array.isArray(msg.results) && msg.results.length > 0) {
|
|
1309
|
+
batchResults.push(...msg.results);
|
|
1310
|
+
}
|
|
1311
|
+
if (msg.done === false) {
|
|
1312
|
+
return;
|
|
1313
|
+
}
|
|
1314
|
+
finalize(batchResults);
|
|
1315
|
+
} else if (msg.type === 'error') {
|
|
1316
|
+
console.warn(`[Indexer] Worker ${workerIndex} error: ${msg.error}`);
|
|
1317
|
+
finalize([]); // Return empty, don't reject - let fallback handle
|
|
1318
|
+
}
|
|
1319
|
+
}
|
|
1320
|
+
};
|
|
1321
|
+
|
|
1322
|
+
// Handle worker crash
|
|
1323
|
+
const errorHandler = (err) => {
|
|
1324
|
+
console.warn(`[Indexer] Worker ${workerIndex} crashed: ${err.message}`);
|
|
1325
|
+
this.recordWorkerFailure(`crash (${err.message})`);
|
|
1326
|
+
void killWorker();
|
|
1327
|
+
finalize([]); // Return empty, don't reject
|
|
1328
|
+
};
|
|
1329
|
+
worker.once('error', errorHandler);
|
|
1330
|
+
|
|
1331
|
+
exitHandler = (code) => {
|
|
1332
|
+
if (code !== 0) {
|
|
1333
|
+
console.warn(`[Indexer] Worker ${workerIndex} exited unexpectedly with code ${code}`);
|
|
1334
|
+
this.recordWorkerFailure(`exit ${code}`);
|
|
1335
|
+
void killWorker();
|
|
1336
|
+
finalize([]);
|
|
1337
|
+
}
|
|
1338
|
+
};
|
|
1339
|
+
worker.once('exit', exitHandler);
|
|
1340
|
+
|
|
1341
|
+
worker.on('message', handler);
|
|
1342
|
+
try {
|
|
1343
|
+
worker.postMessage({ type: 'process', chunks: workerChunks, batchId });
|
|
1344
|
+
} catch (error) {
|
|
1345
|
+
console.warn(`[Indexer] Worker ${i} postMessage failed: ${error.message}`);
|
|
1346
|
+
finalize([]);
|
|
1347
|
+
}
|
|
1348
|
+
});
|
|
1349
|
+
|
|
1350
|
+
workerPromises.push({ promise, chunks: workerChunks });
|
|
1351
|
+
}
|
|
1352
|
+
|
|
1353
|
+
// Wait for all workers with error recovery
|
|
1354
|
+
const workerResults = await Promise.all(workerPromises.map((p) => p.promise));
|
|
1355
|
+
|
|
1356
|
+
// Collect results and identify failed chunks that need retry
|
|
1357
|
+
const failedChunks = [];
|
|
1358
|
+
for (let i = 0; i < workerResults.length; i++) {
|
|
1359
|
+
if (workerResults[i].length > 0) {
|
|
1360
|
+
results.push(...workerResults[i]);
|
|
1361
|
+
} else if (workerPromises[i].chunks.length > 0) {
|
|
1362
|
+
// Worker failed or timed out, need to retry these chunks
|
|
1363
|
+
failedChunks.push(...workerPromises[i].chunks);
|
|
1364
|
+
}
|
|
1365
|
+
}
|
|
1366
|
+
|
|
1367
|
+
// Retry failed chunks with single-threaded fallback
|
|
1368
|
+
if (failedChunks.length > 0 && allowSingleThreadFallback) {
|
|
1369
|
+
console.warn(
|
|
1370
|
+
`[Indexer] Retrying ${failedChunks.length} chunks with single-threaded fallback...`
|
|
1371
|
+
);
|
|
1372
|
+
const retryResults = await this.processChunksSingleThreaded(failedChunks);
|
|
1373
|
+
results.push(...retryResults);
|
|
1374
|
+
} else if (failedChunks.length > 0) {
|
|
1375
|
+
console.warn(
|
|
1376
|
+
`[Indexer] Skipping ${failedChunks.length} chunks (single-threaded fallback disabled)`
|
|
1377
|
+
);
|
|
1378
|
+
}
|
|
1379
|
+
|
|
1380
|
+
return results;
|
|
1381
|
+
}
|
|
1382
|
+
|
|
1383
|
+
async startEmbeddingProcessSession() {
|
|
1384
|
+
if (this._embeddingChild) return;
|
|
1385
|
+
|
|
1386
|
+
const nodePath = process.execPath || 'node';
|
|
1387
|
+
const scriptPath = fileURLToPath(new URL('../lib/embedding-process.js', import.meta.url));
|
|
1388
|
+
const child = spawn(nodePath, ['--expose-gc', scriptPath], {
|
|
1389
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
1390
|
+
env: {
|
|
1391
|
+
...process.env,
|
|
1392
|
+
EMBEDDING_PROCESS_PERSISTENT: 'true',
|
|
1393
|
+
EMBEDDING_PROCESS_RUN_MAIN: 'true',
|
|
1394
|
+
},
|
|
1395
|
+
});
|
|
1396
|
+
|
|
1397
|
+
this._embeddingChild = child;
|
|
1398
|
+
this._embeddingProcessSessionActive = true;
|
|
1399
|
+
this._embeddingChildStopping = false;
|
|
1400
|
+
this._embeddingChildBuffer = '';
|
|
1401
|
+
this._embeddingChildQueue = [];
|
|
1402
|
+
if (!this._embeddingSessionStats) {
|
|
1403
|
+
this._embeddingSessionStats = {
|
|
1404
|
+
startedAt: Date.now(),
|
|
1405
|
+
requests: 0,
|
|
1406
|
+
chunks: 0,
|
|
1407
|
+
totalRequestMs: 0,
|
|
1408
|
+
};
|
|
1409
|
+
}
|
|
1410
|
+
|
|
1411
|
+
const childPid = child?.pid ?? 'unknown';
|
|
1412
|
+
if (this.config.verbose) {
|
|
1413
|
+
console.info(`[Indexer] Persistent embedding process started pid=${childPid}`);
|
|
1414
|
+
}
|
|
1415
|
+
|
|
1416
|
+
child.stdout.on('data', (chunk) => {
|
|
1417
|
+
this._handleEmbeddingChildStdout(chunk);
|
|
1418
|
+
});
|
|
1419
|
+
|
|
1420
|
+
child.stderr.on('data', (chunk) => {
|
|
1421
|
+
if (this.config.verbose) {
|
|
1422
|
+
const msg = chunk.toString().trim();
|
|
1423
|
+
if (msg) {
|
|
1424
|
+
console.info(`[Indexer] Persistent embedding pid=${childPid}: ${msg}`);
|
|
1425
|
+
}
|
|
1426
|
+
}
|
|
1427
|
+
});
|
|
1428
|
+
|
|
1429
|
+
child.on('error', (err) => {
|
|
1430
|
+
if (this.config.verbose) {
|
|
1431
|
+
console.warn(`[Indexer] Persistent embedding error pid=${childPid}: ${err.message}`);
|
|
1432
|
+
}
|
|
1433
|
+
this._failEmbeddingChildQueue(`child process error (${err.message})`);
|
|
1434
|
+
this._embeddingChild = null;
|
|
1435
|
+
this._embeddingProcessSessionActive = false;
|
|
1436
|
+
});
|
|
1437
|
+
|
|
1438
|
+
child.on('close', (code, signal) => {
|
|
1439
|
+
if (this.config.verbose) {
|
|
1440
|
+
console.info(
|
|
1441
|
+
`[Indexer] Persistent embedding process exit pid=${childPid} code=${code ?? 'null'}${signal ? ` signal=${signal}` : ''}`
|
|
1442
|
+
);
|
|
1443
|
+
}
|
|
1444
|
+
this._failEmbeddingChildQueue(
|
|
1445
|
+
`child process exited (${code ?? 'null'}${signal ? `, signal=${signal}` : ''})`
|
|
1446
|
+
);
|
|
1447
|
+
this._embeddingChild = null;
|
|
1448
|
+
this._embeddingProcessSessionActive = false;
|
|
1449
|
+
});
|
|
1450
|
+
}
|
|
1451
|
+
|
|
1452
|
+
_handleEmbeddingChildStdout(chunk) {
|
|
1453
|
+
this._embeddingChildBuffer += chunk.toString();
|
|
1454
|
+
let newlineIndex = this._embeddingChildBuffer.indexOf('\n');
|
|
1455
|
+
while (newlineIndex !== -1) {
|
|
1456
|
+
const line = this._embeddingChildBuffer.slice(0, newlineIndex).trim();
|
|
1457
|
+
this._embeddingChildBuffer = this._embeddingChildBuffer.slice(newlineIndex + 1);
|
|
1458
|
+
if (line.length > 0) {
|
|
1459
|
+
let parsed = null;
|
|
1460
|
+
try {
|
|
1461
|
+
parsed = JSON.parse(line);
|
|
1462
|
+
} catch (err) {
|
|
1463
|
+
if (this.config.verbose) {
|
|
1464
|
+
console.warn(`[Indexer] Persistent embedding response parse error: ${err.message}`);
|
|
1465
|
+
}
|
|
1466
|
+
}
|
|
1467
|
+
const entry = this._embeddingChildQueue.shift();
|
|
1468
|
+
if (entry) {
|
|
1469
|
+
clearTimeout(entry.timeoutId);
|
|
1470
|
+
entry.done = true;
|
|
1471
|
+
const elapsed = ((Date.now() - entry.startedAt) / 1000).toFixed(1);
|
|
1472
|
+
if (this.config.verbose) {
|
|
1473
|
+
console.info(
|
|
1474
|
+
`[Indexer] Child embedding request done id=${entry.requestId} pid=${entry.pid} chunks=${entry.chunks} elapsed=${elapsed}s`
|
|
1475
|
+
);
|
|
1476
|
+
}
|
|
1477
|
+
if (this._embeddingSessionStats) {
|
|
1478
|
+
this._embeddingSessionStats.totalRequestMs += Date.now() - entry.startedAt;
|
|
1479
|
+
}
|
|
1480
|
+
const rssMb = Number(parsed?.meta?.rssMb);
|
|
1481
|
+
if (Number.isFinite(rssMb) && rssMb >= this._embeddingChildRestartThresholdMb) {
|
|
1482
|
+
if (this.config.verbose) {
|
|
1483
|
+
console.warn(
|
|
1484
|
+
`[Indexer] Child embedding RSS ${rssMb.toFixed(1)}MB exceeds threshold ${this._embeddingChildRestartThresholdMb.toFixed(1)}MB; will restart child after request`
|
|
1485
|
+
);
|
|
1486
|
+
}
|
|
1487
|
+
this._embeddingChildNeedsRestart = true;
|
|
1488
|
+
}
|
|
1489
|
+
entry.resolve(parsed?.results || []);
|
|
1490
|
+
} else if (this.config.verbose) {
|
|
1491
|
+
const isControlResponse =
|
|
1492
|
+
parsed &&
|
|
1493
|
+
typeof parsed === 'object' &&
|
|
1494
|
+
!Array.isArray(parsed) &&
|
|
1495
|
+
Object.prototype.hasOwnProperty.call(parsed, 'success') &&
|
|
1496
|
+
!Object.prototype.hasOwnProperty.call(parsed, 'results');
|
|
1497
|
+
if (isControlResponse || this._embeddingChildStopping) {
|
|
1498
|
+
newlineIndex = this._embeddingChildBuffer.indexOf('\n');
|
|
1499
|
+
continue;
|
|
1500
|
+
}
|
|
1501
|
+
console.warn('[Indexer] Persistent embedding response with no pending request');
|
|
1502
|
+
}
|
|
1503
|
+
}
|
|
1504
|
+
newlineIndex = this._embeddingChildBuffer.indexOf('\n');
|
|
1505
|
+
}
|
|
1506
|
+
}
|
|
1507
|
+
|
|
1508
|
+
_failEmbeddingChildQueue(reason) {
|
|
1509
|
+
while (this._embeddingChildQueue.length > 0) {
|
|
1510
|
+
const entry = this._embeddingChildQueue.shift();
|
|
1511
|
+
clearTimeout(entry.timeoutId);
|
|
1512
|
+
if (entry.done) {
|
|
1513
|
+
continue;
|
|
1514
|
+
}
|
|
1515
|
+
if (this.config.verbose) {
|
|
1516
|
+
console.warn(`[Indexer] Persistent embedding request failed: ${reason}`);
|
|
1517
|
+
}
|
|
1518
|
+
entry.done = true;
|
|
1519
|
+
entry.resolve([]);
|
|
1520
|
+
}
|
|
1521
|
+
}
|
|
1522
|
+
|
|
1523
|
+
async stopEmbeddingProcessSession({ preserveStats = false } = {}) {
|
|
1524
|
+
const child = this._embeddingChild;
|
|
1525
|
+
if (!child) {
|
|
1526
|
+
this._embeddingChildStopping = false;
|
|
1527
|
+
return;
|
|
1528
|
+
}
|
|
1529
|
+
this._embeddingChildStopping = true;
|
|
1530
|
+
const childPid = child?.pid ?? 'unknown';
|
|
1531
|
+
if (this.config.verbose) {
|
|
1532
|
+
console.info(`[Indexer] Stopping persistent embedding process pid=${childPid}`);
|
|
1533
|
+
}
|
|
1534
|
+
try {
|
|
1535
|
+
child.stdin.write(`${JSON.stringify({ type: 'shutdown' })}\n`);
|
|
1536
|
+
} catch {
|
|
1537
|
+
// ignore
|
|
1538
|
+
}
|
|
1539
|
+
await new Promise((resolve) => {
|
|
1540
|
+
const timeout = setTimeout(() => {
|
|
1541
|
+
try {
|
|
1542
|
+
child.kill('SIGKILL');
|
|
1543
|
+
} catch {
|
|
1544
|
+
// ignore
|
|
1545
|
+
}
|
|
1546
|
+
resolve();
|
|
1547
|
+
}, 5000);
|
|
1548
|
+
child.once('exit', () => {
|
|
1549
|
+
clearTimeout(timeout);
|
|
1550
|
+
resolve();
|
|
1551
|
+
});
|
|
1552
|
+
});
|
|
1553
|
+
if (this.config.verbose && this._embeddingSessionStats && !preserveStats) {
|
|
1554
|
+
const elapsedMs = Date.now() - this._embeddingSessionStats.startedAt;
|
|
1555
|
+
const elapsedSec = (elapsedMs / 1000).toFixed(1);
|
|
1556
|
+
const avgRequestMs = this._embeddingSessionStats.requests
|
|
1557
|
+
? (
|
|
1558
|
+
this._embeddingSessionStats.totalRequestMs / this._embeddingSessionStats.requests
|
|
1559
|
+
).toFixed(1)
|
|
1560
|
+
: '0.0';
|
|
1561
|
+
const avgChunksPerReq = this._embeddingSessionStats.requests
|
|
1562
|
+
? (this._embeddingSessionStats.chunks / this._embeddingSessionStats.requests).toFixed(1)
|
|
1563
|
+
: '0.0';
|
|
1564
|
+
const avgMsPerChunk = this._embeddingSessionStats.chunks
|
|
1565
|
+
? (this._embeddingSessionStats.totalRequestMs / this._embeddingSessionStats.chunks).toFixed(
|
|
1566
|
+
1
|
|
1567
|
+
)
|
|
1568
|
+
: '0.0';
|
|
1569
|
+
console.info(
|
|
1570
|
+
`[Indexer] Persistent embedding summary: requests=${this._embeddingSessionStats.requests} chunks=${this._embeddingSessionStats.chunks} avgChunksPerReq=${avgChunksPerReq} avgReqMs=${avgRequestMs} avgMsPerChunk=${avgMsPerChunk} totalElapsed=${elapsedSec}s`
|
|
1571
|
+
);
|
|
1572
|
+
}
|
|
1573
|
+
this._embeddingChild = null;
|
|
1574
|
+
this._embeddingProcessSessionActive = false;
|
|
1575
|
+
this._embeddingChildStopping = false;
|
|
1576
|
+
// Clear buffers to release memory
|
|
1577
|
+
this._embeddingChildBuffer = '';
|
|
1578
|
+
this._embeddingChildQueue = [];
|
|
1579
|
+
if (!preserveStats) {
|
|
1580
|
+
this._embeddingSessionStats = null;
|
|
1581
|
+
}
|
|
1582
|
+
}
|
|
1583
|
+
|
|
1584
|
+
async processChunksInPersistentChild(chunks) {
|
|
1585
|
+
if (!this._embeddingChild) {
|
|
1586
|
+
await this.startEmbeddingProcessSession();
|
|
1587
|
+
}
|
|
1588
|
+
if (!this._embeddingChild) {
|
|
1589
|
+
return [];
|
|
1590
|
+
}
|
|
1591
|
+
|
|
1592
|
+
const child = this._embeddingChild;
|
|
1593
|
+
const childPid = child?.pid ?? 'unknown';
|
|
1594
|
+
const requestId = this._embeddingRequestId++;
|
|
1595
|
+
const { threads, batchSize } = this.getEmbeddingProcessConfig();
|
|
1596
|
+
const payload = {
|
|
1597
|
+
embeddingModel: this.config.embeddingModel,
|
|
1598
|
+
chunks,
|
|
1599
|
+
numThreads: threads,
|
|
1600
|
+
batchSize,
|
|
1601
|
+
enableExplicitGc: this.config.enableExplicitGc,
|
|
1602
|
+
...this.getEmbeddingProcessGcConfig(),
|
|
1603
|
+
requestId,
|
|
1604
|
+
};
|
|
1605
|
+
const timeoutMs = Number.isInteger(this.config.workerBatchTimeoutMs)
|
|
1606
|
+
? this.config.workerBatchTimeoutMs
|
|
1607
|
+
: 120000;
|
|
1608
|
+
|
|
1609
|
+
return new Promise((resolve) => {
|
|
1610
|
+
const startedAt = Date.now();
|
|
1611
|
+
const entry = {
|
|
1612
|
+
resolve,
|
|
1613
|
+
timeoutId: null,
|
|
1614
|
+
startedAt,
|
|
1615
|
+
chunks: Array.isArray(chunks) ? chunks.length : 0,
|
|
1616
|
+
pid: childPid,
|
|
1617
|
+
requestId,
|
|
1618
|
+
done: false,
|
|
1619
|
+
};
|
|
1620
|
+
|
|
1621
|
+
if (this.config.verbose) {
|
|
1622
|
+
console.info(
|
|
1623
|
+
`[Indexer] Child embedding request started id=${requestId} pid=${childPid} chunks=${entry.chunks} queue=${this._embeddingChildQueue.length}`
|
|
1624
|
+
);
|
|
1625
|
+
}
|
|
1626
|
+
if (this._embeddingSessionStats) {
|
|
1627
|
+
this._embeddingSessionStats.requests += 1;
|
|
1628
|
+
this._embeddingSessionStats.chunks += entry.chunks;
|
|
1629
|
+
}
|
|
1630
|
+
|
|
1631
|
+
entry.timeoutId = setTimeout(() => {
|
|
1632
|
+
if (entry.done) {
|
|
1633
|
+
return;
|
|
1634
|
+
}
|
|
1635
|
+
entry.done = true;
|
|
1636
|
+
this._embeddingChildQueue = this._embeddingChildQueue.filter((item) => item !== entry);
|
|
1637
|
+
if (this.config.verbose) {
|
|
1638
|
+
const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1);
|
|
1639
|
+
console.warn(
|
|
1640
|
+
`[Indexer] Child embedding request timeout id=${requestId} pid=${childPid} elapsed=${elapsed}s limit=${(timeoutMs / 1000).toFixed(1)}s`
|
|
1641
|
+
);
|
|
1642
|
+
}
|
|
1643
|
+
this.recordWorkerFailure('child process timeout');
|
|
1644
|
+
try {
|
|
1645
|
+
child.kill('SIGKILL');
|
|
1646
|
+
} catch {
|
|
1647
|
+
// ignore
|
|
1648
|
+
}
|
|
1649
|
+
resolve([]);
|
|
1650
|
+
}, timeoutMs);
|
|
1651
|
+
|
|
1652
|
+
this._embeddingChildQueue.push(entry);
|
|
1653
|
+
try {
|
|
1654
|
+
child.stdin.write(`${JSON.stringify(payload)}\n`);
|
|
1655
|
+
} catch (err) {
|
|
1656
|
+
clearTimeout(entry.timeoutId);
|
|
1657
|
+
this.recordWorkerFailure(`child process error (${err.message})`);
|
|
1658
|
+
resolve([]);
|
|
1659
|
+
}
|
|
1660
|
+
}).then(async (results) => {
|
|
1661
|
+
if (this._embeddingChildNeedsRestart && this._embeddingChildQueue.length === 0) {
|
|
1662
|
+
this._embeddingChildNeedsRestart = false;
|
|
1663
|
+
await this.stopEmbeddingProcessSession({ preserveStats: true });
|
|
1664
|
+
await this.startEmbeddingProcessSession();
|
|
1665
|
+
}
|
|
1666
|
+
return this.applyEmbeddingDimensionToResults(results);
|
|
1667
|
+
});
|
|
1668
|
+
}
|
|
1669
|
+
|
|
1670
|
+
applyEmbeddingDimensionToResults(results) {
|
|
1671
|
+
const targetDim = this.config.embeddingDimension;
|
|
1672
|
+
if (!targetDim || !Array.isArray(results)) {
|
|
1673
|
+
return results;
|
|
1674
|
+
}
|
|
1675
|
+
for (const result of results) {
|
|
1676
|
+
if (!result || !result.vector) continue;
|
|
1677
|
+
const floatVector = toFloat32Array(result.vector);
|
|
1678
|
+
result.vector = sliceAndNormalize(floatVector, targetDim);
|
|
1679
|
+
}
|
|
1680
|
+
return results;
|
|
1681
|
+
}
|
|
1682
|
+
|
|
1683
|
+
async processChunksInChildProcess(chunks) {
|
|
1684
|
+
if (this._embeddingProcessSessionActive) {
|
|
1685
|
+
return this.processChunksInPersistentChild(chunks);
|
|
1686
|
+
}
|
|
1687
|
+
const nodePath = process.execPath || 'node';
|
|
1688
|
+
const scriptPath = fileURLToPath(new URL('../lib/embedding-process.js', import.meta.url));
|
|
1689
|
+
const { threads, batchSize } = this.getEmbeddingProcessConfig();
|
|
1690
|
+
const payload = {
|
|
1691
|
+
embeddingModel: this.config.embeddingModel,
|
|
1692
|
+
chunks,
|
|
1693
|
+
numThreads: threads,
|
|
1694
|
+
batchSize,
|
|
1695
|
+
enableExplicitGc: this.config.enableExplicitGc,
|
|
1696
|
+
...this.getEmbeddingProcessGcConfig(),
|
|
1697
|
+
};
|
|
1698
|
+
|
|
1699
|
+
return new Promise((resolve) => {
|
|
1700
|
+
const startedAt = Date.now();
|
|
1701
|
+
const child = spawn(nodePath, ['--expose-gc', scriptPath], {
|
|
1702
|
+
stdio: ['pipe', 'pipe', 'pipe'],
|
|
1703
|
+
env: {
|
|
1704
|
+
...process.env,
|
|
1705
|
+
EMBEDDING_PROCESS_RUN_MAIN: 'true',
|
|
1706
|
+
},
|
|
1707
|
+
});
|
|
1708
|
+
const childPid = child?.pid ?? 'unknown';
|
|
1709
|
+
if (this.config.verbose) {
|
|
1710
|
+
console.info(
|
|
1711
|
+
`[Indexer] Child embedding process started pid=${childPid} chunks=${Array.isArray(chunks) ? chunks.length : 0}`
|
|
1712
|
+
);
|
|
1713
|
+
}
|
|
1714
|
+
|
|
1715
|
+
let stdout = '';
|
|
1716
|
+
let stderr = '';
|
|
1717
|
+
let closed = false;
|
|
1718
|
+
child.stdout.on('data', (chunk) => {
|
|
1719
|
+
stdout += chunk.toString();
|
|
1720
|
+
});
|
|
1721
|
+
child.stderr.on('data', (chunk) => {
|
|
1722
|
+
stderr += chunk.toString();
|
|
1723
|
+
});
|
|
1724
|
+
|
|
1725
|
+
const timeoutMs = Number.isInteger(this.config.workerBatchTimeoutMs)
|
|
1726
|
+
? this.config.workerBatchTimeoutMs
|
|
1727
|
+
: 120000;
|
|
1728
|
+
const timeout = setTimeout(() => {
|
|
1729
|
+
try {
|
|
1730
|
+
child.kill('SIGKILL');
|
|
1731
|
+
} catch {
|
|
1732
|
+
// ignore
|
|
1733
|
+
}
|
|
1734
|
+
if (this.config.verbose && !closed) {
|
|
1735
|
+
const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1);
|
|
1736
|
+
console.warn(
|
|
1737
|
+
`[Indexer] Child embedding process timeout pid=${childPid} elapsed=${elapsed}s limit=${(timeoutMs / 1000).toFixed(1)}s`
|
|
1738
|
+
);
|
|
1739
|
+
}
|
|
1740
|
+
this.recordWorkerFailure('child process timeout');
|
|
1741
|
+
resolve([]);
|
|
1742
|
+
}, timeoutMs);
|
|
1743
|
+
|
|
1744
|
+
child.on('error', (err) => {
|
|
1745
|
+
clearTimeout(timeout);
|
|
1746
|
+
if (this.config.verbose && !closed) {
|
|
1747
|
+
console.warn(`[Indexer] Child embedding process error pid=${childPid}: ${err.message}`);
|
|
1748
|
+
}
|
|
1749
|
+
this.recordWorkerFailure(`child process error (${err.message})`);
|
|
1750
|
+
resolve([]);
|
|
1751
|
+
});
|
|
1752
|
+
|
|
1753
|
+
child.on('close', (code, signal) => {
|
|
1754
|
+
clearTimeout(timeout);
|
|
1755
|
+
closed = true;
|
|
1756
|
+
if (this.config.verbose) {
|
|
1757
|
+
const elapsed = ((Date.now() - startedAt) / 1000).toFixed(1);
|
|
1758
|
+
console.info(
|
|
1759
|
+
`[Indexer] Child embedding process exit pid=${childPid} code=${code ?? 'null'}${signal ? ` signal=${signal}` : ''} elapsed=${elapsed}s`
|
|
1760
|
+
);
|
|
1761
|
+
const { rss, heapUsed, heapTotal } = process.memoryUsage();
|
|
1762
|
+
const toMb = (value) => `${(value / 1024 / 1024).toFixed(1)}MB`;
|
|
1763
|
+
console.info(
|
|
1764
|
+
`[Indexer] Memory after child exit: rss=${toMb(rss)} heap=${toMb(heapUsed)}/${toMb(heapTotal)}`
|
|
1765
|
+
);
|
|
1766
|
+
}
|
|
1767
|
+
if (code !== 0) {
|
|
1768
|
+
this.recordWorkerFailure(
|
|
1769
|
+
`child process exited (${code ?? 'null'}${signal ? `, signal=${signal}` : ''})`
|
|
1770
|
+
);
|
|
1771
|
+
if (stderr) {
|
|
1772
|
+
console.warn(`[Indexer] Child process error: ${stderr.trim()}`);
|
|
1773
|
+
}
|
|
1774
|
+
return resolve([]);
|
|
1775
|
+
}
|
|
1776
|
+
try {
|
|
1777
|
+
const parsed = JSON.parse(stdout);
|
|
1778
|
+
// Clear large JSON buffer immediately after parsing to release memory
|
|
1779
|
+
stdout = '';
|
|
1780
|
+
stderr = '';
|
|
1781
|
+
resolve(this.applyEmbeddingDimensionToResults(parsed?.results || []));
|
|
1782
|
+
} catch (err) {
|
|
1783
|
+
// Clear buffers on error too
|
|
1784
|
+
stdout = '';
|
|
1785
|
+
stderr = '';
|
|
1786
|
+
this.recordWorkerFailure(`child process parse error (${err.message})`);
|
|
1787
|
+
resolve([]);
|
|
1788
|
+
}
|
|
1789
|
+
});
|
|
1790
|
+
|
|
1791
|
+
child.stdin.end(JSON.stringify(payload));
|
|
1792
|
+
});
|
|
1793
|
+
}
|
|
1794
|
+
|
|
1795
|
+
/**
|
|
1796
|
+
* Single-threaded chunk processing (fallback)
|
|
1797
|
+
*/
|
|
1798
|
+
async processChunksSingleThreaded(chunks) {
|
|
1799
|
+
const results = [];
|
|
1800
|
+
|
|
1801
|
+
// Manual GC and yield loop to prevent CPU lockup
|
|
1802
|
+
let processedSinceGc = 0;
|
|
1803
|
+
|
|
1804
|
+
for (const chunk of chunks) {
|
|
1805
|
+
// Throttle speed (balanced) - yield to event loop but don't wait unnecessarily
|
|
1806
|
+
await delay(0);
|
|
1807
|
+
|
|
1808
|
+
try {
|
|
1809
|
+
const output = await this.embedder(chunk.text, {
|
|
1810
|
+
pooling: 'mean',
|
|
1811
|
+
normalize: true,
|
|
1812
|
+
});
|
|
1813
|
+
// CRITICAL: Deep copy to release ONNX tensor memory
|
|
1814
|
+
let vector = toFloat32Array(output.data);
|
|
1815
|
+
if (this.config.embeddingDimension) {
|
|
1816
|
+
vector = sliceAndNormalize(vector, this.config.embeddingDimension);
|
|
1817
|
+
}
|
|
1818
|
+
// Properly dispose tensor to release ONNX runtime memory
|
|
1819
|
+
if (typeof output.dispose === 'function') {
|
|
1820
|
+
try {
|
|
1821
|
+
output.dispose();
|
|
1822
|
+
} catch {
|
|
1823
|
+
/* frozen tensor */
|
|
1824
|
+
}
|
|
1825
|
+
}
|
|
1826
|
+
results.push({
|
|
1827
|
+
file: chunk.file,
|
|
1828
|
+
startLine: chunk.startLine,
|
|
1829
|
+
endLine: chunk.endLine,
|
|
1830
|
+
content: chunk.text,
|
|
1831
|
+
vector,
|
|
1832
|
+
success: true,
|
|
1833
|
+
});
|
|
1834
|
+
|
|
1835
|
+
// Periodic GC to prevent memory creep
|
|
1836
|
+
processedSinceGc++;
|
|
1837
|
+
if (processedSinceGc >= 100) {
|
|
1838
|
+
this.runExplicitGc({ minIntervalMs: 5000 });
|
|
1839
|
+
processedSinceGc = 0;
|
|
1840
|
+
}
|
|
1841
|
+
} catch (error) {
|
|
1842
|
+
results.push({
|
|
1843
|
+
file: chunk.file,
|
|
1844
|
+
startLine: chunk.startLine,
|
|
1845
|
+
endLine: chunk.endLine,
|
|
1846
|
+
error: error.message,
|
|
1847
|
+
success: false,
|
|
1848
|
+
});
|
|
1849
|
+
}
|
|
1850
|
+
}
|
|
1851
|
+
|
|
1852
|
+
return results;
|
|
1853
|
+
}
|
|
1854
|
+
|
|
1855
|
+
async indexFile(file) {
|
|
1856
|
+
const fileName = path.basename(file);
|
|
1857
|
+
if (typeof this.cache.ensureLoaded === 'function') {
|
|
1858
|
+
const preferDisk = this.shouldPreferDiskCacheLoad();
|
|
1859
|
+
await this.traceIncrementalMemoryPhase(`indexFile.ensureLoaded (${fileName})`, async () => {
|
|
1860
|
+
await this.cache.ensureLoaded({ preferDisk });
|
|
1861
|
+
});
|
|
1862
|
+
}
|
|
1863
|
+
if (!(await this.isPathInsideWorkspaceReal(file))) {
|
|
1864
|
+
console.warn(`[Indexer] Skipped ${path.basename(file)} (outside workspace)`);
|
|
1865
|
+
return 0;
|
|
1866
|
+
}
|
|
1867
|
+
if (this.isExcluded(file)) {
|
|
1868
|
+
if (this.config.verbose) {
|
|
1869
|
+
console.info(`[Indexer] Skipped ${fileName} (excluded by pattern)`);
|
|
1870
|
+
}
|
|
1871
|
+
return 0;
|
|
1872
|
+
}
|
|
1873
|
+
if (this.config.verbose) {
|
|
1874
|
+
console.info(`[Indexer] Processing: ${fileName}...`);
|
|
1875
|
+
}
|
|
1876
|
+
|
|
1877
|
+
try {
|
|
1878
|
+
// Check file size first
|
|
1879
|
+
const stats = await fs.stat(file);
|
|
1880
|
+
|
|
1881
|
+
// Skip directories
|
|
1882
|
+
if (stats.isDirectory()) {
|
|
1883
|
+
return 0;
|
|
1884
|
+
}
|
|
1885
|
+
|
|
1886
|
+
if (stats.size > this.config.maxFileSize) {
|
|
1887
|
+
if (this.config.verbose) {
|
|
1888
|
+
console.warn(
|
|
1889
|
+
`[Indexer] Skipped ${fileName} (too large: ${(stats.size / 1024 / 1024).toFixed(2)}MB)`
|
|
1890
|
+
);
|
|
1891
|
+
}
|
|
1892
|
+
return 0;
|
|
1893
|
+
}
|
|
1894
|
+
|
|
1895
|
+
const content = await fs.readFile(file, 'utf-8');
|
|
1896
|
+
const hash = hashContent(content);
|
|
1897
|
+
|
|
1898
|
+
// Skip if file hasn't changed
|
|
1899
|
+
const cachedHash =
|
|
1900
|
+
typeof this.cache.getFileHash === 'function' ? this.cache.getFileHash(file) : null;
|
|
1901
|
+
if (cachedHash === hash) {
|
|
1902
|
+
if (this.config.verbose) {
|
|
1903
|
+
console.info(`[Indexer] Skipped ${fileName} (unchanged)`);
|
|
1904
|
+
}
|
|
1905
|
+
// Still update metadata (size, mtime) even if hash is same
|
|
1906
|
+
this.cache.setFileHash(file, hash, stats);
|
|
1907
|
+
return 0;
|
|
1908
|
+
}
|
|
1909
|
+
|
|
1910
|
+
if (this.config.verbose) {
|
|
1911
|
+
console.info(`[Indexer] Indexing ${fileName}...`);
|
|
1912
|
+
}
|
|
1913
|
+
|
|
1914
|
+
// Extract call graph data if enabled
|
|
1915
|
+
let callData = null;
|
|
1916
|
+
if (this.config.callGraphEnabled) {
|
|
1917
|
+
try {
|
|
1918
|
+
callData = extractCallData(content, file);
|
|
1919
|
+
} catch (err) {
|
|
1920
|
+
if (this.config.verbose) {
|
|
1921
|
+
console.warn(`[Indexer] Call graph extraction failed for ${fileName}: ${err.message}`);
|
|
1922
|
+
}
|
|
1923
|
+
}
|
|
1924
|
+
}
|
|
1925
|
+
|
|
1926
|
+
const rawChunks = smartChunk(content, file, this.config);
|
|
1927
|
+
const chunks = Array.isArray(rawChunks) ? rawChunks : [];
|
|
1928
|
+
let addedChunks = 0;
|
|
1929
|
+
let successChunks = 0;
|
|
1930
|
+
let failedChunks = 0;
|
|
1931
|
+
const newChunks = [];
|
|
1932
|
+
|
|
1933
|
+
// Use workers for watcher-triggered embedding to keep main thread responsive
|
|
1934
|
+
let useWorkers = this.shouldUseWorkers();
|
|
1935
|
+
if (useWorkers && this.workers.length === 0) {
|
|
1936
|
+
await this.initializeWorkers();
|
|
1937
|
+
if (this.workers.length === 0) {
|
|
1938
|
+
useWorkers = false;
|
|
1939
|
+
}
|
|
1940
|
+
}
|
|
1941
|
+
const useEmbeddingProcessPerBatch = this.shouldUseEmbeddingProcessPerBatch(useWorkers);
|
|
1942
|
+
let embeddingRuntimeSummary = '';
|
|
1943
|
+
if (useWorkers && this.workers.length > 0) {
|
|
1944
|
+
const workerInferenceBatchSize =
|
|
1945
|
+
this.getWorkerInferenceBatchSize({ numWorkers: this.workers.length }) ?? 'default';
|
|
1946
|
+
embeddingRuntimeSummary =
|
|
1947
|
+
`mode=worker-pool workers=${this.workers.length} onnxThreadsPerWorker=1 ` +
|
|
1948
|
+
`effectiveThreads=${this.workers.length} inferenceBatchSize=${workerInferenceBatchSize}`;
|
|
1949
|
+
} else if (useEmbeddingProcessPerBatch) {
|
|
1950
|
+
const { threads, batchSize } = this.getEmbeddingProcessConfig();
|
|
1951
|
+
embeddingRuntimeSummary =
|
|
1952
|
+
`mode=child-process onnxThreads=${threads} ` +
|
|
1953
|
+
`inferenceBatchSize=${batchSize ?? 1} persistentSession=${this._embeddingProcessSessionActive ? 'true' : 'false'}`;
|
|
1954
|
+
} else {
|
|
1955
|
+
embeddingRuntimeSummary = 'mode=main-thread onnxThreads=auto';
|
|
1956
|
+
}
|
|
1957
|
+
console.info(`[Indexer] Embedding runtime: ${embeddingRuntimeSummary}`);
|
|
1958
|
+
|
|
1959
|
+
const chunksToProcess = chunks.map((c) => ({
|
|
1960
|
+
file,
|
|
1961
|
+
text: c.text,
|
|
1962
|
+
startLine: c.startLine,
|
|
1963
|
+
endLine: c.endLine,
|
|
1964
|
+
}));
|
|
1965
|
+
|
|
1966
|
+
const results = await this.traceIncrementalMemoryPhase(
|
|
1967
|
+
`indexFile.embedChunks (${fileName})`,
|
|
1968
|
+
async () => {
|
|
1969
|
+
if (useWorkers && this.workers.length > 0) {
|
|
1970
|
+
return await this.processChunksWithWorkers(chunksToProcess);
|
|
1971
|
+
}
|
|
1972
|
+
if (useEmbeddingProcessPerBatch) {
|
|
1973
|
+
return await this.processChunksInChildProcess(chunksToProcess);
|
|
1974
|
+
}
|
|
1975
|
+
return await this.processChunksSingleThreaded(chunksToProcess);
|
|
1976
|
+
}
|
|
1977
|
+
);
|
|
1978
|
+
|
|
1979
|
+
for (const result of results) {
|
|
1980
|
+
if (result.success) {
|
|
1981
|
+
newChunks.push({
|
|
1982
|
+
file,
|
|
1983
|
+
startLine: result.startLine,
|
|
1984
|
+
endLine: result.endLine,
|
|
1985
|
+
content: result.content,
|
|
1986
|
+
vector: toFloat32Array(result.vector),
|
|
1987
|
+
});
|
|
1988
|
+
addedChunks++;
|
|
1989
|
+
successChunks++;
|
|
1990
|
+
} else {
|
|
1991
|
+
console.warn(`[Indexer] Failed to embed chunk in ${fileName}:`, result.error);
|
|
1992
|
+
failedChunks++;
|
|
1993
|
+
}
|
|
1994
|
+
}
|
|
1995
|
+
|
|
1996
|
+
const totalChunks = chunks.length;
|
|
1997
|
+
const allSucceeded = totalChunks === 0 || failedChunks === 0;
|
|
1998
|
+
|
|
1999
|
+
await this.traceIncrementalMemoryPhase(`indexFile.commit (${fileName})`, async () => {
|
|
2000
|
+
if (allSucceeded) {
|
|
2001
|
+
this.cache.removeFileFromStore(file);
|
|
2002
|
+
for (const chunk of newChunks) {
|
|
2003
|
+
this.cache.addToStore(chunk);
|
|
2004
|
+
}
|
|
2005
|
+
this.cache.setFileHash(file, hash, stats);
|
|
2006
|
+
if (this.config.callGraphEnabled && callData) {
|
|
2007
|
+
this.cache.setFileCallData(file, callData);
|
|
2008
|
+
}
|
|
2009
|
+
} else if (this.config.verbose) {
|
|
2010
|
+
console.warn(
|
|
2011
|
+
`[Indexer] Skipped hash update for ${fileName} (${successChunks}/${totalChunks} chunks embedded)`
|
|
2012
|
+
);
|
|
2013
|
+
}
|
|
2014
|
+
});
|
|
2015
|
+
|
|
2016
|
+
if (this.config.verbose) {
|
|
2017
|
+
console.info(`[Indexer] Completed ${fileName} (${addedChunks} chunks)`);
|
|
2018
|
+
}
|
|
2019
|
+
return addedChunks;
|
|
2020
|
+
} catch (error) {
|
|
2021
|
+
if (this.config.verbose) {
|
|
2022
|
+
console.warn(`[Indexer] Error indexing ${fileName}:`, error.message);
|
|
2023
|
+
}
|
|
2024
|
+
return 0;
|
|
2025
|
+
}
|
|
2026
|
+
}
|
|
2027
|
+
|
|
2028
|
+
/**
|
|
2029
|
+
* Discover files using fdir (3-5x faster than glob)
|
|
2030
|
+
* Uses config.excludePatterns which includes smart patterns from ignore-patterns.js
|
|
2031
|
+
*/
|
|
2032
|
+
async discoverFiles() {
|
|
2033
|
+
const startTime = Date.now();
|
|
2034
|
+
|
|
2035
|
+
// Build extension filter from config
|
|
2036
|
+
const extensions = new Set(
|
|
2037
|
+
this.config.fileExtensions.map((ext) => `.${String(ext).toLowerCase()}`)
|
|
2038
|
+
);
|
|
2039
|
+
const allowedFileNames = new Set(this.config.fileNames || []);
|
|
2040
|
+
|
|
2041
|
+
// Load .gitignore before discovery
|
|
2042
|
+
await this.loadGitignore();
|
|
2043
|
+
|
|
2044
|
+
if (!this.config.searchDirectory) {
|
|
2045
|
+
return [];
|
|
2046
|
+
}
|
|
2047
|
+
|
|
2048
|
+
const api = new fdir()
|
|
2049
|
+
.withFullPaths()
|
|
2050
|
+
.exclude((dirName, dirPath) => {
|
|
2051
|
+
// Always exclude specific heavy folders immediately
|
|
2052
|
+
if (dirName === 'node_modules' || dirName === '.git' || dirName === '.smart-coding-cache')
|
|
2053
|
+
return true;
|
|
2054
|
+
|
|
2055
|
+
// Check exclusion rules for directories
|
|
2056
|
+
const fullPath = path.join(dirPath, dirName);
|
|
2057
|
+
return this.isExcluded(fullPath);
|
|
2058
|
+
})
|
|
2059
|
+
.filter((filePath) => {
|
|
2060
|
+
if (this.isExcluded(filePath)) return false;
|
|
2061
|
+
|
|
2062
|
+
// Check extensions/filenames
|
|
2063
|
+
const base = path.basename(filePath);
|
|
2064
|
+
const ext = path.extname(filePath).toLowerCase();
|
|
2065
|
+
return extensions.has(ext) || allowedFileNames.has(base);
|
|
2066
|
+
})
|
|
2067
|
+
.crawl(this.config.searchDirectory);
|
|
2068
|
+
|
|
2069
|
+
const files = await api.withPromise();
|
|
2070
|
+
|
|
2071
|
+
console.info(`[Indexer] File discovery: ${files.length} files in ${Date.now() - startTime}ms`);
|
|
2072
|
+
return files;
|
|
2073
|
+
}
|
|
2074
|
+
|
|
2075
|
+
/**
|
|
2076
|
+
* Pre-filter files by hash (skip unchanged files before processing)
|
|
2077
|
+
*/
|
|
2078
|
+
async preFilterFiles(files) {
|
|
2079
|
+
const startTime = Date.now();
|
|
2080
|
+
const filesToProcess = [];
|
|
2081
|
+
const skippedCount = { unchanged: 0, tooLarge: 0, error: 0 };
|
|
2082
|
+
|
|
2083
|
+
// Process in parallel batches for speed
|
|
2084
|
+
// We fetch stats for 100 files at a time to keep IO efficient
|
|
2085
|
+
const STAT_BATCH_SIZE = Math.min(100, this.config.batchSize || 100);
|
|
2086
|
+
// Limit concurrent file reads to 50MB to prevent OOM
|
|
2087
|
+
const MAX_READ_BATCH_BYTES = 50 * 1024 * 1024;
|
|
2088
|
+
|
|
2089
|
+
for (let i = 0; i < files.length; i += STAT_BATCH_SIZE) {
|
|
2090
|
+
const batchFiles = files.slice(i, i + STAT_BATCH_SIZE);
|
|
2091
|
+
|
|
2092
|
+
// 1. Get stats for all files in this batch parallel
|
|
2093
|
+
const fileStats = await Promise.all(
|
|
2094
|
+
batchFiles.map(async (file) => {
|
|
2095
|
+
try {
|
|
2096
|
+
const stats = await fs.stat(file);
|
|
2097
|
+
|
|
2098
|
+
if (stats.isDirectory()) {
|
|
2099
|
+
return null;
|
|
2100
|
+
}
|
|
2101
|
+
|
|
2102
|
+
if (stats.size > this.config.maxFileSize) {
|
|
2103
|
+
skippedCount.tooLarge++;
|
|
2104
|
+
return null;
|
|
2105
|
+
}
|
|
2106
|
+
|
|
2107
|
+
return { file, size: stats.size, mtimeMs: stats.mtimeMs };
|
|
2108
|
+
} catch (_err) {
|
|
2109
|
+
skippedCount.error++;
|
|
2110
|
+
return null;
|
|
2111
|
+
}
|
|
2112
|
+
})
|
|
2113
|
+
);
|
|
2114
|
+
|
|
2115
|
+
// 2. Process valid files in size-constrained sub-batches
|
|
2116
|
+
let currentReadBatch = [];
|
|
2117
|
+
let currentReadBytes = 0;
|
|
2118
|
+
|
|
2119
|
+
const mtimeSafeWindowMs = isTestEnv()
|
|
2120
|
+
? 0
|
|
2121
|
+
: Number.isInteger(this.config.mtimeSafeWindowMs)
|
|
2122
|
+
? this.config.mtimeSafeWindowMs
|
|
2123
|
+
: 2000;
|
|
2124
|
+
const processReadBatch = async (batch) => {
|
|
2125
|
+
const results = await Promise.all(
|
|
2126
|
+
batch.map(async ({ file, size, mtimeMs }) => {
|
|
2127
|
+
// Check if we have cached metadata for this file
|
|
2128
|
+
const cachedHash =
|
|
2129
|
+
typeof this.cache.getFileHash === 'function' ? this.cache.getFileHash(file) : null;
|
|
2130
|
+
const cachedMeta = this.cache.getFileMeta ? this.cache.getFileMeta(file) : null;
|
|
2131
|
+
|
|
2132
|
+
const metaMatches =
|
|
2133
|
+
cachedHash &&
|
|
2134
|
+
cachedMeta &&
|
|
2135
|
+
Number.isFinite(cachedMeta.mtimeMs) &&
|
|
2136
|
+
cachedMeta.mtimeMs === mtimeMs &&
|
|
2137
|
+
Number.isFinite(cachedMeta.size) &&
|
|
2138
|
+
cachedMeta.size === size;
|
|
2139
|
+
if (metaMatches) {
|
|
2140
|
+
// Avoid missing rapid edits on coarse timestamp filesystems.
|
|
2141
|
+
const now = Date.now();
|
|
2142
|
+
const isRecent = Math.abs(now - mtimeMs) <= mtimeSafeWindowMs;
|
|
2143
|
+
if (!isRecent) {
|
|
2144
|
+
// Metadata matches exactly, skip reading/hashing
|
|
2145
|
+
skippedCount.unchanged++;
|
|
2146
|
+
return null;
|
|
2147
|
+
}
|
|
2148
|
+
}
|
|
2149
|
+
|
|
2150
|
+
// Suspect file: Either new, or metadata changed.
|
|
2151
|
+
// We pass it to indexAll with the cachedHash as 'expectedHash'
|
|
2152
|
+
// so workers can perform the actual hashing and unchanged check.
|
|
2153
|
+
return { file, hash: null, expectedHash: cachedHash, force: false, size, mtimeMs };
|
|
2154
|
+
})
|
|
2155
|
+
);
|
|
2156
|
+
|
|
2157
|
+
for (const result of results) {
|
|
2158
|
+
if (result) filesToProcess.push(result);
|
|
2159
|
+
}
|
|
2160
|
+
};
|
|
2161
|
+
|
|
2162
|
+
for (const item of fileStats) {
|
|
2163
|
+
if (!item) continue;
|
|
2164
|
+
|
|
2165
|
+
if (currentReadBytes + item.size > MAX_READ_BATCH_BYTES && currentReadBatch.length > 0) {
|
|
2166
|
+
await processReadBatch(currentReadBatch);
|
|
2167
|
+
currentReadBatch = [];
|
|
2168
|
+
currentReadBytes = 0;
|
|
2169
|
+
}
|
|
2170
|
+
|
|
2171
|
+
currentReadBatch.push(item);
|
|
2172
|
+
currentReadBytes += item.size;
|
|
2173
|
+
}
|
|
2174
|
+
|
|
2175
|
+
if (currentReadBatch.length > 0) {
|
|
2176
|
+
await processReadBatch(currentReadBatch);
|
|
2177
|
+
}
|
|
2178
|
+
|
|
2179
|
+
// Pre-warm HybridSearch cache if available
|
|
2180
|
+
if (this.server && this.server.hybridSearch && this.server.hybridSearch.fileModTimes) {
|
|
2181
|
+
for (const stat of fileStats) {
|
|
2182
|
+
if (stat && stat.file && typeof stat.mtimeMs === 'number') {
|
|
2183
|
+
this.server.hybridSearch.fileModTimes.set(stat.file, stat.mtimeMs);
|
|
2184
|
+
}
|
|
2185
|
+
}
|
|
2186
|
+
}
|
|
2187
|
+
}
|
|
2188
|
+
|
|
2189
|
+
if (this.config.verbose) {
|
|
2190
|
+
console.info(
|
|
2191
|
+
`[Indexer] Pre-filter: ${filesToProcess.length} changed, ${skippedCount.unchanged} unchanged, ${skippedCount.tooLarge} too large, ${skippedCount.error} errors (${Date.now() - startTime}ms)`
|
|
2192
|
+
);
|
|
2193
|
+
}
|
|
2194
|
+
|
|
2195
|
+
return filesToProcess;
|
|
2196
|
+
}
|
|
2197
|
+
|
|
2198
|
+
async indexAll(force = false) {
|
|
2199
|
+
if (this.isIndexing || this.processingWatchEvents) {
|
|
2200
|
+
console.warn('[Indexer] Indexing already in progress, skipping concurrent request');
|
|
2201
|
+
return {
|
|
2202
|
+
skipped: true,
|
|
2203
|
+
reason: 'Indexing already in progress or pending file updates are being applied',
|
|
2204
|
+
};
|
|
2205
|
+
}
|
|
2206
|
+
|
|
2207
|
+
this.isIndexing = true;
|
|
2208
|
+
let memoryTimer = null;
|
|
2209
|
+
const logMemory = (label) => {
|
|
2210
|
+
if (!this.config.verbose) return;
|
|
2211
|
+
const { rss, heapUsed, heapTotal } = process.memoryUsage();
|
|
2212
|
+
const toMb = (value) => `${(value / 1024 / 1024).toFixed(1)}MB`;
|
|
2213
|
+
console.info(
|
|
2214
|
+
`[Indexer] Memory ${label}: rss=${toMb(rss)} heap=${toMb(heapUsed)}/${toMb(heapTotal)}`
|
|
2215
|
+
);
|
|
2216
|
+
};
|
|
2217
|
+
|
|
2218
|
+
try {
|
|
2219
|
+
logMemory('start');
|
|
2220
|
+
if (this.config.verbose) {
|
|
2221
|
+
const intervalMs =
|
|
2222
|
+
Number.isInteger(this.config.memoryLogIntervalMs) && this.config.memoryLogIntervalMs >= 0
|
|
2223
|
+
? this.config.memoryLogIntervalMs
|
|
2224
|
+
: 30000;
|
|
2225
|
+
if (intervalMs > 0) {
|
|
2226
|
+
memoryTimer = setInterval(() => logMemory('periodic'), intervalMs);
|
|
2227
|
+
}
|
|
2228
|
+
}
|
|
2229
|
+
|
|
2230
|
+
if (force) {
|
|
2231
|
+
console.info('[Indexer] Force reindex requested: clearing cache');
|
|
2232
|
+
await this.cache.reset();
|
|
2233
|
+
} else {
|
|
2234
|
+
if (typeof this.cache.ensureLoaded === 'function') {
|
|
2235
|
+
await this.cache.ensureLoaded({ preferDisk: this.shouldPreferDiskCacheLoad() });
|
|
2236
|
+
}
|
|
2237
|
+
}
|
|
2238
|
+
|
|
2239
|
+
const totalStartTime = Date.now();
|
|
2240
|
+
const indexStartedAt = new Date(totalStartTime).toISOString();
|
|
2241
|
+
let indexMode = force
|
|
2242
|
+
? 'full'
|
|
2243
|
+
: this.cache.getVectorStore().length === 0
|
|
2244
|
+
? 'initial'
|
|
2245
|
+
: 'incremental';
|
|
2246
|
+
this.currentIndexMode = indexMode;
|
|
2247
|
+
this.sendProgress(0, 100, 'Indexing started');
|
|
2248
|
+
console.info(`[Indexer] Starting optimized indexing in ${this.config.searchDirectory}...`);
|
|
2249
|
+
|
|
2250
|
+
// Step 1: Fast file discovery with fdir
|
|
2251
|
+
const files = await this.discoverFiles();
|
|
2252
|
+
|
|
2253
|
+
if (files.length === 0) {
|
|
2254
|
+
console.info('[Indexer] No files found to index');
|
|
2255
|
+
this.sendProgress(100, 100, 'No files found to index');
|
|
2256
|
+
return {
|
|
2257
|
+
skipped: false,
|
|
2258
|
+
filesProcessed: 0,
|
|
2259
|
+
chunksCreated: 0,
|
|
2260
|
+
message: 'No files found to index',
|
|
2261
|
+
};
|
|
2262
|
+
}
|
|
2263
|
+
|
|
2264
|
+
// Send progress: discovery complete
|
|
2265
|
+
this.sendProgress(5, 100, `Discovered ${files.length} files`);
|
|
2266
|
+
|
|
2267
|
+
const currentFilesSet = new Set(files);
|
|
2268
|
+
|
|
2269
|
+
// Step 1.5: Prune deleted or excluded files from cache
|
|
2270
|
+
if (!force) {
|
|
2271
|
+
const cachedFiles =
|
|
2272
|
+
typeof this.cache.getFileHashKeys === 'function' ? this.cache.getFileHashKeys() : [];
|
|
2273
|
+
let prunedCount = 0;
|
|
2274
|
+
|
|
2275
|
+
for (const cachedFile of cachedFiles) {
|
|
2276
|
+
if (!currentFilesSet.has(cachedFile)) {
|
|
2277
|
+
this.cache.removeFileFromStore(cachedFile);
|
|
2278
|
+
this.cache.deleteFileHash(cachedFile);
|
|
2279
|
+
prunedCount++;
|
|
2280
|
+
}
|
|
2281
|
+
}
|
|
2282
|
+
|
|
2283
|
+
if (prunedCount > 0) {
|
|
2284
|
+
if (this.config.verbose) {
|
|
2285
|
+
console.info(`[Indexer] Pruned ${prunedCount} deleted/excluded files from index`);
|
|
2286
|
+
}
|
|
2287
|
+
// If we pruned files, we should save these changes even if no other files changed
|
|
2288
|
+
}
|
|
2289
|
+
|
|
2290
|
+
const prunedCallGraph = this.cache.pruneCallGraphData(currentFilesSet);
|
|
2291
|
+
if (prunedCallGraph > 0 && this.config.verbose) {
|
|
2292
|
+
console.info(`[Indexer] Pruned ${prunedCallGraph} call-graph entries`);
|
|
2293
|
+
}
|
|
2294
|
+
}
|
|
2295
|
+
|
|
2296
|
+
// Step 2: Pre-filter unchanged files (early hash check)
|
|
2297
|
+
const filesToProcess = await this.preFilterFiles(files);
|
|
2298
|
+
const filesToProcessSet = new Set(filesToProcess.map((entry) => entry.file));
|
|
2299
|
+
const filesToProcessByFile = new Map(filesToProcess.map((entry) => [entry.file, entry]));
|
|
2300
|
+
|
|
2301
|
+
// Re-index files missing call graph data (if enabled)
|
|
2302
|
+
if (this.config.callGraphEnabled && this.cache.getVectorStore().length > 0) {
|
|
2303
|
+
const cachedFiles = new Set(this.cache.getVectorStore().map((c) => c.file));
|
|
2304
|
+
const callDataFiles = new Set(this.cache.getFileCallDataKeys());
|
|
2305
|
+
|
|
2306
|
+
const missingCallData = [];
|
|
2307
|
+
for (const file of cachedFiles) {
|
|
2308
|
+
if (!callDataFiles.has(file) && currentFilesSet.has(file)) {
|
|
2309
|
+
missingCallData.push(file);
|
|
2310
|
+
const existing = filesToProcessByFile.get(file);
|
|
2311
|
+
if (existing) existing.force = true;
|
|
2312
|
+
}
|
|
2313
|
+
}
|
|
2314
|
+
|
|
2315
|
+
if (missingCallData.length > 0) {
|
|
2316
|
+
console.info(
|
|
2317
|
+
`[Indexer] Found ${missingCallData.length} files missing call graph data, re-indexing...`
|
|
2318
|
+
);
|
|
2319
|
+
const BATCH_SIZE = 100;
|
|
2320
|
+
for (let i = 0; i < missingCallData.length; i += BATCH_SIZE) {
|
|
2321
|
+
const batch = missingCallData.slice(i, i + BATCH_SIZE);
|
|
2322
|
+
const results = await Promise.all(
|
|
2323
|
+
batch.map(async (file) => {
|
|
2324
|
+
try {
|
|
2325
|
+
const stats = await fs.stat(file);
|
|
2326
|
+
if (!stats || typeof stats.isDirectory !== 'function') {
|
|
2327
|
+
return null;
|
|
2328
|
+
}
|
|
2329
|
+
if (stats.isDirectory()) return null;
|
|
2330
|
+
if (stats.size > this.config.maxFileSize) return null;
|
|
2331
|
+
const content = await fs.readFile(file, 'utf-8');
|
|
2332
|
+
const hash = hashContent(content);
|
|
2333
|
+
return { file, hash, force: true, size: stats.size, mtimeMs: stats.mtimeMs };
|
|
2334
|
+
} catch {
|
|
2335
|
+
return null;
|
|
2336
|
+
}
|
|
2337
|
+
})
|
|
2338
|
+
);
|
|
2339
|
+
|
|
2340
|
+
for (const result of results) {
|
|
2341
|
+
if (!result) continue;
|
|
2342
|
+
if (!filesToProcessSet.has(result.file)) {
|
|
2343
|
+
filesToProcess.push(result);
|
|
2344
|
+
filesToProcessSet.add(result.file);
|
|
2345
|
+
}
|
|
2346
|
+
}
|
|
2347
|
+
}
|
|
2348
|
+
}
|
|
2349
|
+
}
|
|
2350
|
+
|
|
2351
|
+
indexMode = force
|
|
2352
|
+
? 'full'
|
|
2353
|
+
: this.cache.getVectorStore().length === 0
|
|
2354
|
+
? 'initial'
|
|
2355
|
+
: filesToProcess.length === files.length
|
|
2356
|
+
? 'full'
|
|
2357
|
+
: 'incremental';
|
|
2358
|
+
this.currentIndexMode = indexMode;
|
|
2359
|
+
|
|
2360
|
+
if (filesToProcess.length === 0) {
|
|
2361
|
+
console.info('[Indexer] All files unchanged, nothing to index');
|
|
2362
|
+
this.sendProgress(100, 100, 'All files up to date');
|
|
2363
|
+
await this.cache.save();
|
|
2364
|
+
const vectorStore = this.cache.getVectorStore();
|
|
2365
|
+
return {
|
|
2366
|
+
skipped: false,
|
|
2367
|
+
filesProcessed: 0,
|
|
2368
|
+
chunksCreated: 0,
|
|
2369
|
+
totalFiles: new Set(vectorStore.map((v) => v.file)).size,
|
|
2370
|
+
totalChunks: vectorStore.length,
|
|
2371
|
+
message: 'All files up to date',
|
|
2372
|
+
};
|
|
2373
|
+
}
|
|
2374
|
+
|
|
2375
|
+
// Send progress: filtering complete
|
|
2376
|
+
console.info(`[Indexer] Processing ${filesToProcess.length} changed files`);
|
|
2377
|
+
this.sendProgress(10, 100, `Processing ${filesToProcess.length} changed files`);
|
|
2378
|
+
|
|
2379
|
+
// Step 3: Determine batch size based on project size
|
|
2380
|
+
// Adaptive batch size: use larger batches for larger projects to reduce overhead
|
|
2381
|
+
let adaptiveBatchSize = 10;
|
|
2382
|
+
if (files.length > 500) adaptiveBatchSize = 50;
|
|
2383
|
+
if (files.length > 1000) adaptiveBatchSize = 100;
|
|
2384
|
+
if (files.length > 5000) adaptiveBatchSize = 500;
|
|
2385
|
+
|
|
2386
|
+
if (this.config.verbose) {
|
|
2387
|
+
console.info(
|
|
2388
|
+
`[Indexer] Processing ${filesToProcess.length} files (batch size: ${adaptiveBatchSize})`
|
|
2389
|
+
);
|
|
2390
|
+
}
|
|
2391
|
+
|
|
2392
|
+
// Step 4: Initialize worker threads (skip if explicitly disabled)
|
|
2393
|
+
const allowSingleThreadFallback =
|
|
2394
|
+
this.config.allowSingleThreadFallback !== false ||
|
|
2395
|
+
this.config.workerThreads === 0 ||
|
|
2396
|
+
isTestEnv();
|
|
2397
|
+
let useWorkers = this.shouldUseWorkers();
|
|
2398
|
+
|
|
2399
|
+
if (useWorkers) {
|
|
2400
|
+
await this.initializeWorkers();
|
|
2401
|
+
if (this.workers.length === 0) {
|
|
2402
|
+
useWorkers = false;
|
|
2403
|
+
} else if (this.config.verbose) {
|
|
2404
|
+
console.info(`[Indexer] Multi-threaded mode: ${this.workers.length} workers active`);
|
|
2405
|
+
}
|
|
2406
|
+
}
|
|
2407
|
+
|
|
2408
|
+
const useEmbeddingProcessPerBatch = this.shouldUseEmbeddingProcessPerBatch(useWorkers);
|
|
2409
|
+
let embeddingRuntimeSummary = '';
|
|
2410
|
+
if (useWorkers && this.workers.length > 0) {
|
|
2411
|
+
// Worker pool is intentionally fixed to 1 ONNX thread per worker.
|
|
2412
|
+
const workerInferenceBatchSize =
|
|
2413
|
+
this.getWorkerInferenceBatchSize({ numWorkers: this.workers.length }) ?? 'default';
|
|
2414
|
+
embeddingRuntimeSummary =
|
|
2415
|
+
`mode=worker-pool workers=${this.workers.length} onnxThreadsPerWorker=1 ` +
|
|
2416
|
+
`effectiveThreads=${this.workers.length} inferenceBatchSize=${workerInferenceBatchSize}`;
|
|
2417
|
+
} else if (useEmbeddingProcessPerBatch) {
|
|
2418
|
+
const { threads, batchSize } = this.getEmbeddingProcessConfig();
|
|
2419
|
+
embeddingRuntimeSummary =
|
|
2420
|
+
`mode=child-process onnxThreads=${threads} ` +
|
|
2421
|
+
`inferenceBatchSize=${batchSize ?? 1} persistentSession=true`;
|
|
2422
|
+
} else {
|
|
2423
|
+
embeddingRuntimeSummary = 'mode=main-thread onnxThreads=auto';
|
|
2424
|
+
}
|
|
2425
|
+
console.info(`[Indexer] Embedding runtime: ${embeddingRuntimeSummary}`);
|
|
2426
|
+
|
|
2427
|
+
if (!useWorkers && this.config.verbose) {
|
|
2428
|
+
const cpuCount = Array.isArray(os.cpus()) ? os.cpus().length : 0;
|
|
2429
|
+
const baseDetail = `cpu=${cpuCount}, embeddingProcessPerBatch=${useEmbeddingProcessPerBatch}, workerThreads=${this.config.workerThreads}`;
|
|
2430
|
+
const until = this.workersDisabledUntil - Date.now();
|
|
2431
|
+
if (this.workersDisabledUntil && until > 0) {
|
|
2432
|
+
console.info(
|
|
2433
|
+
`[Indexer] Workers disabled for ${Math.round(until / 1000)}s; using non-worker path (${baseDetail}); single-threaded fallback ${allowSingleThreadFallback ? 'enabled' : 'disabled'}`
|
|
2434
|
+
);
|
|
2435
|
+
} else {
|
|
2436
|
+
console.info(`[Indexer] Workers disabled; using non-worker path (${baseDetail})`);
|
|
2437
|
+
}
|
|
2438
|
+
}
|
|
2439
|
+
|
|
2440
|
+
if (useEmbeddingProcessPerBatch) {
|
|
2441
|
+
try {
|
|
2442
|
+
await this.startEmbeddingProcessSession();
|
|
2443
|
+
} catch (err) {
|
|
2444
|
+
this._embeddingProcessSessionActive = false;
|
|
2445
|
+
if (this.config.verbose) {
|
|
2446
|
+
console.warn(`[Indexer] Failed to start persistent embedding process: ${err.message}`);
|
|
2447
|
+
}
|
|
2448
|
+
}
|
|
2449
|
+
}
|
|
2450
|
+
|
|
2451
|
+
const resolvedWorkerThreads = useWorkers ? this.workers.length : 0;
|
|
2452
|
+
|
|
2453
|
+
let totalChunks = 0;
|
|
2454
|
+
let processedFiles = 0;
|
|
2455
|
+
|
|
2456
|
+
console.info(
|
|
2457
|
+
`[Indexer] Embedding pass started: ${filesToProcess.length} files using ${this.config.embeddingModel}`
|
|
2458
|
+
);
|
|
2459
|
+
|
|
2460
|
+
// Step 5: Process files in adaptive batches
|
|
2461
|
+
for (let i = 0; i < filesToProcess.length; i += adaptiveBatchSize) {
|
|
2462
|
+
const batch = filesToProcess.slice(i, i + adaptiveBatchSize);
|
|
2463
|
+
|
|
2464
|
+
const allChunks = [];
|
|
2465
|
+
const fileStats = new Map();
|
|
2466
|
+
const newChunksByFile = new Map();
|
|
2467
|
+
const callDataByFile = new Map();
|
|
2468
|
+
const filesForWorkers = [];
|
|
2469
|
+
|
|
2470
|
+
// Memory safeguard
|
|
2471
|
+
const mem = process.memoryUsage();
|
|
2472
|
+
if (mem.rss > 2048 * 1024 * 1024) {
|
|
2473
|
+
this.runExplicitGc({ minIntervalMs: 5000 });
|
|
2474
|
+
}
|
|
2475
|
+
|
|
2476
|
+
const useWorkersForBatch =
|
|
2477
|
+
useWorkers && this.workers.length > 0 && !useEmbeddingProcessPerBatch;
|
|
2478
|
+
|
|
2479
|
+
for (const item of batch) {
|
|
2480
|
+
const {
|
|
2481
|
+
file,
|
|
2482
|
+
force,
|
|
2483
|
+
content: presetContent,
|
|
2484
|
+
hash: presetHash,
|
|
2485
|
+
expectedHash: presetExpectedHash,
|
|
2486
|
+
size: presetSize,
|
|
2487
|
+
mtimeMs: presetMtimeMs,
|
|
2488
|
+
} = item;
|
|
2489
|
+
let content = presetContent;
|
|
2490
|
+
let liveHash = presetHash;
|
|
2491
|
+
let size = presetSize;
|
|
2492
|
+
let mtimeMs = presetMtimeMs;
|
|
2493
|
+
const expectedHash =
|
|
2494
|
+
presetExpectedHash ||
|
|
2495
|
+
(typeof this.cache.getFileHash === 'function' ? this.cache.getFileHash(file) : null);
|
|
2496
|
+
|
|
2497
|
+
if (useWorkersForBatch && (content === undefined || content === null)) {
|
|
2498
|
+
// Speed optimization: Offload reading and hashing to workers.
|
|
2499
|
+
// Main thread skips I/O entirely for this file.
|
|
2500
|
+
filesForWorkers.push({ file, content: null, force, expectedHash });
|
|
2501
|
+
// Initialize stats placeholder (will be updated with worker results)
|
|
2502
|
+
fileStats.set(file, { hash: null, totalChunks: 0, successChunks: 0, size, mtimeMs });
|
|
2503
|
+
continue;
|
|
2504
|
+
}
|
|
2505
|
+
|
|
2506
|
+
// Read content if not provided (Legacy Path or workers disabled)
|
|
2507
|
+
if (content === undefined || content === null) {
|
|
2508
|
+
let stats = null;
|
|
2509
|
+
try {
|
|
2510
|
+
stats = await fs.stat(file);
|
|
2511
|
+
} catch (err) {
|
|
2512
|
+
if (this.config.verbose) {
|
|
2513
|
+
console.warn(`[Indexer] Failed to stat ${path.basename(file)}: ${err.message}`);
|
|
2514
|
+
}
|
|
2515
|
+
continue;
|
|
2516
|
+
}
|
|
2517
|
+
if (!stats || typeof stats.isDirectory !== 'function') {
|
|
2518
|
+
if (this.config.verbose) {
|
|
2519
|
+
console.warn(`[Indexer] Invalid stat result for ${path.basename(file)}`);
|
|
2520
|
+
}
|
|
2521
|
+
continue;
|
|
2522
|
+
}
|
|
2523
|
+
if (stats.isDirectory()) continue;
|
|
2524
|
+
if (stats.size > this.config.maxFileSize) {
|
|
2525
|
+
if (this.config.verbose) {
|
|
2526
|
+
console.warn(
|
|
2527
|
+
`[Indexer] Skipped ${path.basename(file)} (too large: ${(stats.size / 1024 / 1024).toFixed(2)}MB)`
|
|
2528
|
+
);
|
|
2529
|
+
}
|
|
2530
|
+
continue;
|
|
2531
|
+
}
|
|
2532
|
+
try {
|
|
2533
|
+
content = await fs.readFile(file, 'utf-8');
|
|
2534
|
+
} catch (err) {
|
|
2535
|
+
if (this.config.verbose) {
|
|
2536
|
+
console.warn(`[Indexer] Failed to read ${path.basename(file)}: ${err.message}`);
|
|
2537
|
+
}
|
|
2538
|
+
continue;
|
|
2539
|
+
}
|
|
2540
|
+
liveHash = hashContent(content);
|
|
2541
|
+
size = stats.size;
|
|
2542
|
+
mtimeMs = stats.mtimeMs;
|
|
2543
|
+
} else {
|
|
2544
|
+
if (typeof content !== 'string') content = String(content);
|
|
2545
|
+
if (!liveHash) liveHash = hashContent(content);
|
|
2546
|
+
if (!Number.isFinite(size)) {
|
|
2547
|
+
// Use character length as approximation to avoid blocking Buffer.byteLength on large strings
|
|
2548
|
+
size = content.length;
|
|
2549
|
+
}
|
|
2550
|
+
if (size > this.config.maxFileSize) {
|
|
2551
|
+
if (this.config.verbose) {
|
|
2552
|
+
console.warn(
|
|
2553
|
+
`[Indexer] Skipped ${path.basename(file)} (too large: ${(size / 1024 / 1024).toFixed(2)}MB)`
|
|
2554
|
+
);
|
|
2555
|
+
}
|
|
2556
|
+
continue;
|
|
2557
|
+
}
|
|
2558
|
+
}
|
|
2559
|
+
|
|
2560
|
+
const cachedFileHash =
|
|
2561
|
+
typeof this.cache.getFileHash === 'function' ? this.cache.getFileHash(file) : null;
|
|
2562
|
+
if (!force && liveHash && cachedFileHash === liveHash) {
|
|
2563
|
+
if (this.config.verbose)
|
|
2564
|
+
console.info(`[Indexer] Skipped ${path.basename(file)} (unchanged)`);
|
|
2565
|
+
this.cache.setFileHash(file, liveHash, { size, mtimeMs });
|
|
2566
|
+
continue;
|
|
2567
|
+
}
|
|
2568
|
+
|
|
2569
|
+
if (useWorkersForBatch) {
|
|
2570
|
+
filesForWorkers.push({ file, content, force, expectedHash });
|
|
2571
|
+
// Initialize stats placeholder (will be updated with worker results)
|
|
2572
|
+
fileStats.set(file, {
|
|
2573
|
+
hash: liveHash,
|
|
2574
|
+
totalChunks: 0,
|
|
2575
|
+
successChunks: 0,
|
|
2576
|
+
size,
|
|
2577
|
+
mtimeMs,
|
|
2578
|
+
});
|
|
2579
|
+
continue;
|
|
2580
|
+
}
|
|
2581
|
+
|
|
2582
|
+
// Legacy / Fallback path: Chunk on main thread
|
|
2583
|
+
if (this.config.callGraphEnabled) {
|
|
2584
|
+
try {
|
|
2585
|
+
const callData = extractCallData(content, file);
|
|
2586
|
+
callDataByFile.set(file, callData);
|
|
2587
|
+
} catch (err) {
|
|
2588
|
+
if (this.config.verbose) {
|
|
2589
|
+
console.warn(
|
|
2590
|
+
`[Indexer] Call graph extraction failed for ${path.basename(file)}: ${err.message}`
|
|
2591
|
+
);
|
|
2592
|
+
}
|
|
2593
|
+
}
|
|
2594
|
+
}
|
|
2595
|
+
|
|
2596
|
+
const rawChunks = smartChunk(content, file, this.config);
|
|
2597
|
+
const chunks = Array.isArray(rawChunks) ? rawChunks : [];
|
|
2598
|
+
fileStats.set(file, {
|
|
2599
|
+
hash: liveHash,
|
|
2600
|
+
totalChunks: chunks.length,
|
|
2601
|
+
successChunks: 0,
|
|
2602
|
+
size,
|
|
2603
|
+
mtimeMs,
|
|
2604
|
+
});
|
|
2605
|
+
|
|
2606
|
+
for (const chunk of chunks) {
|
|
2607
|
+
allChunks.push({
|
|
2608
|
+
file,
|
|
2609
|
+
text: chunk.text,
|
|
2610
|
+
startLine: chunk.startLine,
|
|
2611
|
+
endLine: chunk.endLine,
|
|
2612
|
+
});
|
|
2613
|
+
}
|
|
2614
|
+
}
|
|
2615
|
+
|
|
2616
|
+
// Process files with workers (New Path)
|
|
2617
|
+
if (filesForWorkers.length > 0) {
|
|
2618
|
+
const results = await this.processFilesWithWorkers(filesForWorkers);
|
|
2619
|
+
|
|
2620
|
+
for (const res of results) {
|
|
2621
|
+
const stats = fileStats.get(res.file);
|
|
2622
|
+
if (res.status === 'indexed' && stats) {
|
|
2623
|
+
stats.totalChunks = res.results.length;
|
|
2624
|
+
stats.successChunks = res.results.length;
|
|
2625
|
+
if (res.hash) stats.hash = res.hash; // Update with new hash from worker
|
|
2626
|
+
if (res.callData) callDataByFile.set(res.file, res.callData);
|
|
2627
|
+
|
|
2628
|
+
const chunks = res.results.map((r) => ({
|
|
2629
|
+
file: res.file,
|
|
2630
|
+
startLine: r.startLine,
|
|
2631
|
+
endLine: r.endLine,
|
|
2632
|
+
content: r.text,
|
|
2633
|
+
vector: toFloat32Array(r.vectorBuffer),
|
|
2634
|
+
}));
|
|
2635
|
+
newChunksByFile.set(res.file, chunks);
|
|
2636
|
+
} else if (res.status === 'unchanged' && stats) {
|
|
2637
|
+
// Worker found file hash matches old hash
|
|
2638
|
+
stats.totalChunks = 0; // Signal skip commit
|
|
2639
|
+
stats.successChunks = 0;
|
|
2640
|
+
stats.hash = res.hash;
|
|
2641
|
+
this.cache.setFileHash(res.file, res.hash, { size: res.size, mtimeMs: res.mtimeMs });
|
|
2642
|
+
if (res.callData && this.config.callGraphEnabled) {
|
|
2643
|
+
this.cache.setFileCallData(res.file, res.callData);
|
|
2644
|
+
}
|
|
2645
|
+
} else if ((res.status === 'retry' || res.status === 'error') && stats) {
|
|
2646
|
+
// Worker failed, fallback to local chunking + single threaded
|
|
2647
|
+
const original = filesForWorkers.find((f) => f.file === res.file);
|
|
2648
|
+
if (original) {
|
|
2649
|
+
if (this.config.verbose)
|
|
2650
|
+
console.info(`[Indexer] Fallback for ${path.basename(res.file)}`);
|
|
2651
|
+
|
|
2652
|
+
let fallbackContent = original.content;
|
|
2653
|
+
let fallbackSize = stats.size;
|
|
2654
|
+
let fallbackMtimeMs = stats.mtimeMs;
|
|
2655
|
+
|
|
2656
|
+
if (fallbackContent === undefined || fallbackContent === null) {
|
|
2657
|
+
try {
|
|
2658
|
+
const liveStats = await fs.stat(res.file);
|
|
2659
|
+
if (!liveStats || typeof liveStats.isDirectory !== 'function') {
|
|
2660
|
+
continue;
|
|
2661
|
+
}
|
|
2662
|
+
if (liveStats.isDirectory()) continue;
|
|
2663
|
+
if (liveStats.size > this.config.maxFileSize) {
|
|
2664
|
+
if (this.config.verbose) {
|
|
2665
|
+
console.warn(
|
|
2666
|
+
`[Indexer] Skipped ${path.basename(res.file)} (too large: ${(liveStats.size / 1024 / 1024).toFixed(2)}MB)`
|
|
2667
|
+
);
|
|
2668
|
+
}
|
|
2669
|
+
continue;
|
|
2670
|
+
}
|
|
2671
|
+
fallbackContent = await fs.readFile(res.file, 'utf-8');
|
|
2672
|
+
fallbackSize = liveStats.size;
|
|
2673
|
+
fallbackMtimeMs = liveStats.mtimeMs;
|
|
2674
|
+
} catch (err) {
|
|
2675
|
+
if (this.config.verbose) {
|
|
2676
|
+
console.warn(
|
|
2677
|
+
`[Indexer] Fallback read failed for ${path.basename(res.file)}: ${err.message}`
|
|
2678
|
+
);
|
|
2679
|
+
}
|
|
2680
|
+
continue;
|
|
2681
|
+
}
|
|
2682
|
+
}
|
|
2683
|
+
if (typeof fallbackContent !== 'string') {
|
|
2684
|
+
fallbackContent = String(fallbackContent);
|
|
2685
|
+
}
|
|
2686
|
+
stats.hash = hashContent(fallbackContent);
|
|
2687
|
+
if (Number.isFinite(fallbackSize)) stats.size = fallbackSize;
|
|
2688
|
+
if (Number.isFinite(fallbackMtimeMs)) stats.mtimeMs = fallbackMtimeMs;
|
|
2689
|
+
|
|
2690
|
+
if (this.config.callGraphEnabled) {
|
|
2691
|
+
try {
|
|
2692
|
+
callDataByFile.set(res.file, extractCallData(fallbackContent, res.file));
|
|
2693
|
+
} catch (err) {
|
|
2694
|
+
if (this.config.verbose) {
|
|
2695
|
+
console.warn(
|
|
2696
|
+
`[Indexer] Call graph extraction failed for ${path.basename(res.file)}: ${err.message}`
|
|
2697
|
+
);
|
|
2698
|
+
}
|
|
2699
|
+
}
|
|
2700
|
+
}
|
|
2701
|
+
const fallbackChunks = smartChunk(fallbackContent, res.file, this.config);
|
|
2702
|
+
const chunks = Array.isArray(fallbackChunks) ? fallbackChunks : [];
|
|
2703
|
+
stats.totalChunks = chunks.length;
|
|
2704
|
+
for (const chunk of chunks) {
|
|
2705
|
+
allChunks.push({
|
|
2706
|
+
file: res.file,
|
|
2707
|
+
text: chunk.text,
|
|
2708
|
+
startLine: chunk.startLine,
|
|
2709
|
+
endLine: chunk.endLine,
|
|
2710
|
+
});
|
|
2711
|
+
}
|
|
2712
|
+
}
|
|
2713
|
+
}
|
|
2714
|
+
}
|
|
2715
|
+
}
|
|
2716
|
+
|
|
2717
|
+
// Process chunks (Legacy Path & Fallbacks)
|
|
2718
|
+
if (allChunks.length > 0) {
|
|
2719
|
+
const chunksToProcess = allChunks.slice();
|
|
2720
|
+
let results = [];
|
|
2721
|
+
if (useEmbeddingProcessPerBatch) {
|
|
2722
|
+
results = await this.processChunksInChildProcess(chunksToProcess);
|
|
2723
|
+
} else {
|
|
2724
|
+
// If we are here, either workers are disabled/full or these are retry chunks
|
|
2725
|
+
// Use single threaded fallback if not using child process
|
|
2726
|
+
results = await this.processChunksSingleThreaded(chunksToProcess);
|
|
2727
|
+
}
|
|
2728
|
+
|
|
2729
|
+
for (const result of results) {
|
|
2730
|
+
const stats = fileStats.get(result.file);
|
|
2731
|
+
if (result.success && stats) {
|
|
2732
|
+
const items = newChunksByFile.get(result.file) || [];
|
|
2733
|
+
items.push({
|
|
2734
|
+
file: result.file,
|
|
2735
|
+
startLine: result.startLine,
|
|
2736
|
+
endLine: result.endLine,
|
|
2737
|
+
content: result.content,
|
|
2738
|
+
vector: toFloat32Array(result.vector),
|
|
2739
|
+
});
|
|
2740
|
+
newChunksByFile.set(result.file, items);
|
|
2741
|
+
stats.successChunks++;
|
|
2742
|
+
}
|
|
2743
|
+
}
|
|
2744
|
+
}
|
|
2745
|
+
|
|
2746
|
+
// Commit changes to cache
|
|
2747
|
+
for (const [file, stats] of fileStats) {
|
|
2748
|
+
if (stats.totalChunks > 0 && stats.successChunks === stats.totalChunks) {
|
|
2749
|
+
this.cache.removeFileFromStore(file);
|
|
2750
|
+
const newChunks = newChunksByFile.get(file) || [];
|
|
2751
|
+
for (const chunk of newChunks) {
|
|
2752
|
+
this.cache.addToStore(chunk);
|
|
2753
|
+
totalChunks++;
|
|
2754
|
+
}
|
|
2755
|
+
if (typeof stats.hash === 'string' && stats.hash.length > 0) {
|
|
2756
|
+
this.cache.setFileHash(file, stats.hash, { size: stats.size, mtimeMs: stats.mtimeMs });
|
|
2757
|
+
} else if (this.config.verbose) {
|
|
2758
|
+
console.warn(`[Indexer] Skipped hash update for ${path.basename(file)} (missing hash)`);
|
|
2759
|
+
}
|
|
2760
|
+
const callData = callDataByFile.get(file);
|
|
2761
|
+
if (callData && this.config.callGraphEnabled) {
|
|
2762
|
+
this.cache.setFileCallData(file, callData);
|
|
2763
|
+
}
|
|
2764
|
+
} else if (stats.totalChunks === 0) {
|
|
2765
|
+
// File had no chunks (empty or comments only), just mark as indexed
|
|
2766
|
+
if (typeof stats.hash === 'string' && stats.hash.length > 0) {
|
|
2767
|
+
this.cache.setFileHash(file, stats.hash, { size: stats.size, mtimeMs: stats.mtimeMs });
|
|
2768
|
+
} else if (this.config.verbose) {
|
|
2769
|
+
console.warn(`[Indexer] Skipped hash update for ${path.basename(file)} (missing hash)`);
|
|
2770
|
+
}
|
|
2771
|
+
const callData = callDataByFile.get(file);
|
|
2772
|
+
if (callData && this.config.callGraphEnabled) {
|
|
2773
|
+
this.cache.setFileCallData(file, callData);
|
|
2774
|
+
}
|
|
2775
|
+
} else if (this.config.verbose) {
|
|
2776
|
+
console.warn(
|
|
2777
|
+
`[Indexer] Skipped hash update for ${path.basename(file)} (${stats.successChunks}/${stats.totalChunks} chunks embedded)`
|
|
2778
|
+
);
|
|
2779
|
+
}
|
|
2780
|
+
}
|
|
2781
|
+
|
|
2782
|
+
this.runExplicitGc({ minIntervalMs: 5000 });
|
|
2783
|
+
|
|
2784
|
+
processedFiles += batch.length;
|
|
2785
|
+
|
|
2786
|
+
// Progress indicator
|
|
2787
|
+
if (
|
|
2788
|
+
processedFiles % (adaptiveBatchSize * 2) === 0 ||
|
|
2789
|
+
processedFiles === filesToProcess.length
|
|
2790
|
+
) {
|
|
2791
|
+
const elapsedSeconds = (Date.now() - totalStartTime) / 1000;
|
|
2792
|
+
const elapsed = elapsedSeconds.toFixed(1);
|
|
2793
|
+
const rate = (processedFiles / Math.max(elapsedSeconds, 0.001)).toFixed(1);
|
|
2794
|
+
console.info(
|
|
2795
|
+
`[Indexer] Progress: ${processedFiles}/${filesToProcess.length} files (${rate} files/sec, ${elapsed}s elapsed)`
|
|
2796
|
+
);
|
|
2797
|
+
const progressPercent = Math.floor(10 + (processedFiles / filesToProcess.length) * 85);
|
|
2798
|
+
this.sendProgress(
|
|
2799
|
+
progressPercent,
|
|
2800
|
+
100,
|
|
2801
|
+
`Indexed ${processedFiles}/${filesToProcess.length} files (${rate}/sec)`
|
|
2802
|
+
);
|
|
2803
|
+
}
|
|
2804
|
+
|
|
2805
|
+
// Batch-level memory cleanup to reduce peak usage
|
|
2806
|
+
allChunks.length = 0;
|
|
2807
|
+
filesForWorkers.length = 0;
|
|
2808
|
+
fileStats.clear();
|
|
2809
|
+
newChunksByFile.clear();
|
|
2810
|
+
callDataByFile.clear();
|
|
2811
|
+
await delay(0);
|
|
2812
|
+
}
|
|
2813
|
+
|
|
2814
|
+
// Cleanup workers
|
|
2815
|
+
if (this.workers.length > 0) {
|
|
2816
|
+
await this.terminateWorkers();
|
|
2817
|
+
}
|
|
2818
|
+
this.runExplicitGc({ force: true });
|
|
2819
|
+
|
|
2820
|
+
const totalDurationMs = Date.now() - totalStartTime;
|
|
2821
|
+
const totalTime = (totalDurationMs / 1000).toFixed(1);
|
|
2822
|
+
console.info(
|
|
2823
|
+
`[Indexer] Embedding pass complete: ${totalChunks} chunks from ${filesToProcess.length} files in ${totalTime}s`
|
|
2824
|
+
);
|
|
2825
|
+
|
|
2826
|
+
// Send completion progress
|
|
2827
|
+
this.sendProgress(
|
|
2828
|
+
100,
|
|
2829
|
+
100,
|
|
2830
|
+
`Complete: ${totalChunks} chunks from ${filesToProcess.length} files in ${totalTime}s`
|
|
2831
|
+
);
|
|
2832
|
+
|
|
2833
|
+
this.cache.setLastIndexDuration?.(totalDurationMs);
|
|
2834
|
+
this.cache.setLastIndexStats?.({
|
|
2835
|
+
lastIndexStartedAt: indexStartedAt,
|
|
2836
|
+
lastIndexEndedAt: new Date().toISOString(),
|
|
2837
|
+
lastDiscoveredFiles: files.length,
|
|
2838
|
+
lastFilesProcessed: filesToProcess.length,
|
|
2839
|
+
lastIndexMode: indexMode,
|
|
2840
|
+
lastBatchSize: adaptiveBatchSize,
|
|
2841
|
+
lastWorkerThreads: resolvedWorkerThreads,
|
|
2842
|
+
lastEmbeddingProcessPerBatch: useEmbeddingProcessPerBatch,
|
|
2843
|
+
});
|
|
2844
|
+
await this.cache.save();
|
|
2845
|
+
|
|
2846
|
+
const vectorStoreSnapshot = this.cache.getVectorStore();
|
|
2847
|
+
const totalFiles = new Set(vectorStoreSnapshot.map((v) => v.file)).size;
|
|
2848
|
+
const totalChunksCount = vectorStoreSnapshot.length;
|
|
2849
|
+
|
|
2850
|
+
if (this.config.clearCacheAfterIndex) {
|
|
2851
|
+
console.info(
|
|
2852
|
+
'[Indexer] clearCacheAfterIndex enabled; in-memory vectors will be reloaded on next query'
|
|
2853
|
+
);
|
|
2854
|
+
await this.cache.dropInMemoryVectors();
|
|
2855
|
+
if (this.config.verbose) {
|
|
2856
|
+
console.info('[Cache] Cleared in-memory vectors after indexing');
|
|
2857
|
+
}
|
|
2858
|
+
}
|
|
2859
|
+
|
|
2860
|
+
// Unload embedding models to free RAM
|
|
2861
|
+
if (this.config.unloadModelAfterIndex) {
|
|
2862
|
+
console.info(
|
|
2863
|
+
'[Indexer] unloadModelAfterIndex enabled; embedding model will be reloaded on next query'
|
|
2864
|
+
);
|
|
2865
|
+
await this.unloadEmbeddingModels();
|
|
2866
|
+
}
|
|
2867
|
+
this.maybeShutdownQueryEmbeddingPool('full index');
|
|
2868
|
+
|
|
2869
|
+
// Rebuild call graph in background
|
|
2870
|
+
if (this.config.callGraphEnabled) {
|
|
2871
|
+
this.cache.rebuildCallGraph();
|
|
2872
|
+
}
|
|
2873
|
+
|
|
2874
|
+
if (!this.config.clearCacheAfterIndex) {
|
|
2875
|
+
void this.cache.ensureAnnIndex().catch((error) => {
|
|
2876
|
+
if (this.config.verbose) {
|
|
2877
|
+
console.warn(`[ANN] Background ANN build failed: ${error.message}`);
|
|
2878
|
+
}
|
|
2879
|
+
});
|
|
2880
|
+
}
|
|
2881
|
+
|
|
2882
|
+
return {
|
|
2883
|
+
skipped: false,
|
|
2884
|
+
filesProcessed: filesToProcess.length,
|
|
2885
|
+
chunksCreated: totalChunks,
|
|
2886
|
+
totalFiles,
|
|
2887
|
+
totalChunks: totalChunksCount,
|
|
2888
|
+
duration: totalTime,
|
|
2889
|
+
message: `Indexed ${filesToProcess.length} files (${totalChunks} chunks) in ${totalTime}s`,
|
|
2890
|
+
};
|
|
2891
|
+
} finally {
|
|
2892
|
+
if (memoryTimer) {
|
|
2893
|
+
clearInterval(memoryTimer);
|
|
2894
|
+
}
|
|
2895
|
+
if (this._embeddingProcessSessionActive) {
|
|
2896
|
+
await this.stopEmbeddingProcessSession();
|
|
2897
|
+
}
|
|
2898
|
+
logMemory('end');
|
|
2899
|
+
this.isIndexing = false;
|
|
2900
|
+
try {
|
|
2901
|
+
await this.processPendingWatchEvents();
|
|
2902
|
+
} catch (error) {
|
|
2903
|
+
console.warn(`[Indexer] Failed to apply queued file updates: ${error.message}`);
|
|
2904
|
+
}
|
|
2905
|
+
}
|
|
2906
|
+
}
|
|
2907
|
+
|
|
2908
|
+
enqueueWatchEvent(type, filePath) {
|
|
2909
|
+
// Prevent unbounded memory growth during rapid file churn (e.g., build processes)
|
|
2910
|
+
if (this.pendingWatchEvents.size >= MAX_PENDING_WATCH_EVENTS) {
|
|
2911
|
+
console.warn(
|
|
2912
|
+
`[Indexer] pendingWatchEvents limit reached (${MAX_PENDING_WATCH_EVENTS}), ` +
|
|
2913
|
+
`trimming oldest ${this.pendingWatchEvents.size - PENDING_WATCH_EVENTS_TRIM_SIZE} events`
|
|
2914
|
+
);
|
|
2915
|
+
// Drop oldest events (Map iterates in insertion order)
|
|
2916
|
+
const toRemove = this.pendingWatchEvents.size - PENDING_WATCH_EVENTS_TRIM_SIZE;
|
|
2917
|
+
let count = 0;
|
|
2918
|
+
for (const key of this.pendingWatchEvents.keys()) {
|
|
2919
|
+
if (count++ >= toRemove) break;
|
|
2920
|
+
this.pendingWatchEvents.delete(key);
|
|
2921
|
+
}
|
|
2922
|
+
}
|
|
2923
|
+
|
|
2924
|
+
// If it's a delete, it always wins
|
|
2925
|
+
if (type === 'unlink') {
|
|
2926
|
+
this.pendingWatchEvents.set(filePath, 'unlink');
|
|
2927
|
+
return;
|
|
2928
|
+
}
|
|
2929
|
+
|
|
2930
|
+
// If we're adding/changing, it overwrites a potential unlink (file came back)
|
|
2931
|
+
this.pendingWatchEvents.set(filePath, type);
|
|
2932
|
+
}
|
|
2933
|
+
|
|
2934
|
+
async processPendingWatchEvents() {
|
|
2935
|
+
if (this.processingWatchEvents || this.pendingWatchEvents.size === 0) {
|
|
2936
|
+
return;
|
|
2937
|
+
}
|
|
2938
|
+
|
|
2939
|
+
this.processingWatchEvents = true;
|
|
2940
|
+
try {
|
|
2941
|
+
if (typeof this.cache.ensureLoaded === 'function') {
|
|
2942
|
+
const preferDisk = this.shouldPreferDiskCacheLoad();
|
|
2943
|
+
await this.traceIncrementalMemoryPhase('watchBatch.ensureLoaded', async () => {
|
|
2944
|
+
await this.cache.ensureLoaded({ preferDisk });
|
|
2945
|
+
});
|
|
2946
|
+
}
|
|
2947
|
+
|
|
2948
|
+
while (this.pendingWatchEvents.size > 0) {
|
|
2949
|
+
const pending = Array.from(this.pendingWatchEvents.entries());
|
|
2950
|
+
this.pendingWatchEvents.clear();
|
|
2951
|
+
|
|
2952
|
+
for (const [filePath, type] of pending) {
|
|
2953
|
+
if (this.server && this.server.hybridSearch) {
|
|
2954
|
+
this.server.hybridSearch.clearFileModTime(filePath);
|
|
2955
|
+
}
|
|
2956
|
+
|
|
2957
|
+
if (type === 'unlink') {
|
|
2958
|
+
await this.cache.removeFileFromStore(filePath);
|
|
2959
|
+
this.cache.deleteFileHash(filePath);
|
|
2960
|
+
} else {
|
|
2961
|
+
await this.indexFile(filePath);
|
|
2962
|
+
}
|
|
2963
|
+
}
|
|
2964
|
+
|
|
2965
|
+
await this.traceIncrementalMemoryPhase('watchBatch.cacheSave', async () => {
|
|
2966
|
+
await this.cache.save();
|
|
2967
|
+
});
|
|
2968
|
+
await this.traceIncrementalMemoryPhase('watchBatch.cleanup', async () => {
|
|
2969
|
+
await this.runPostIncrementalCleanup('watch batch');
|
|
2970
|
+
});
|
|
2971
|
+
}
|
|
2972
|
+
} finally {
|
|
2973
|
+
this.processingWatchEvents = false;
|
|
2974
|
+
}
|
|
2975
|
+
}
|
|
2976
|
+
|
|
2977
|
+
/**
|
|
2978
|
+
* Debounced file indexing for watcher events.
|
|
2979
|
+
* Consolidates rapid add/change events and prevents concurrent indexing of the same file.
|
|
2980
|
+
*/
|
|
2981
|
+
debouncedWatchIndexFile(fullPath, eventType) {
|
|
2982
|
+
// Cancel any pending debounce timer for this file
|
|
2983
|
+
const existingTimer = this._watcherDebounceTimers.get(fullPath);
|
|
2984
|
+
if (existingTimer) {
|
|
2985
|
+
clearTimeout(existingTimer);
|
|
2986
|
+
}
|
|
2987
|
+
|
|
2988
|
+
// If file is currently being indexed, just schedule a re-index after it completes
|
|
2989
|
+
if (this._watcherInProgress.has(fullPath)) {
|
|
2990
|
+
// Schedule a follow-up reindex after current one completes
|
|
2991
|
+
this._watcherPendingReindex.set(fullPath, eventType);
|
|
2992
|
+
if (this.config.verbose) {
|
|
2993
|
+
console.info(
|
|
2994
|
+
`[Indexer] Skipping duplicate ${eventType} for ${path.basename(fullPath)} (already indexing)`
|
|
2995
|
+
);
|
|
2996
|
+
}
|
|
2997
|
+
return;
|
|
2998
|
+
}
|
|
2999
|
+
|
|
3000
|
+
// Set a debounce timer to consolidate rapid events
|
|
3001
|
+
const timer = setTimeout(async () => {
|
|
3002
|
+
this._watcherDebounceTimers.delete(fullPath);
|
|
3003
|
+
|
|
3004
|
+
// Mark file as in-progress
|
|
3005
|
+
const indexPromise = (async () => {
|
|
3006
|
+
try {
|
|
3007
|
+
// Invalidate recency cache
|
|
3008
|
+
if (this.server && this.server.hybridSearch) {
|
|
3009
|
+
this.server.hybridSearch.clearFileModTime(fullPath);
|
|
3010
|
+
}
|
|
3011
|
+
|
|
3012
|
+
await this.indexFile(fullPath);
|
|
3013
|
+
await this.traceIncrementalMemoryPhase(
|
|
3014
|
+
`watchSingle.cacheSave (${path.basename(fullPath)})`,
|
|
3015
|
+
async () => {
|
|
3016
|
+
await this.cache.save();
|
|
3017
|
+
}
|
|
3018
|
+
);
|
|
3019
|
+
await this.traceIncrementalMemoryPhase(
|
|
3020
|
+
`watchSingle.cleanup (${path.basename(fullPath)})`,
|
|
3021
|
+
async () => {
|
|
3022
|
+
await this.runPostIncrementalCleanup(`watch ${eventType}`);
|
|
3023
|
+
}
|
|
3024
|
+
);
|
|
3025
|
+
} catch (err) {
|
|
3026
|
+
console.warn(`[Indexer] Failed to index ${path.basename(fullPath)}: ${err.message}`);
|
|
3027
|
+
} finally {
|
|
3028
|
+
this._watcherInProgress.delete(fullPath);
|
|
3029
|
+
const pendingType = this._watcherPendingReindex.get(fullPath);
|
|
3030
|
+
if (pendingType) {
|
|
3031
|
+
this._watcherPendingReindex.delete(fullPath);
|
|
3032
|
+
this.debouncedWatchIndexFile(fullPath, pendingType);
|
|
3033
|
+
}
|
|
3034
|
+
}
|
|
3035
|
+
})();
|
|
3036
|
+
|
|
3037
|
+
this._watcherInProgress.set(fullPath, indexPromise);
|
|
3038
|
+
}, this._watcherDebounceMs);
|
|
3039
|
+
|
|
3040
|
+
this._watcherDebounceTimers.set(fullPath, timer);
|
|
3041
|
+
}
|
|
3042
|
+
|
|
3043
|
+
async setupFileWatcher() {
|
|
3044
|
+
if (!this.config.watchFiles) return;
|
|
3045
|
+
|
|
3046
|
+
// Close existing watcher if active to prevent leaks
|
|
3047
|
+
if (this.watcher) {
|
|
3048
|
+
await this.watcher.close();
|
|
3049
|
+
this.watcher = null;
|
|
3050
|
+
}
|
|
3051
|
+
|
|
3052
|
+
await this.loadGitignore();
|
|
3053
|
+
|
|
3054
|
+
const pattern = [
|
|
3055
|
+
...this.config.fileExtensions.map((ext) => `**/*.${ext}`),
|
|
3056
|
+
...(this.config.fileNames || []).map((name) => `**/${name}`),
|
|
3057
|
+
];
|
|
3058
|
+
|
|
3059
|
+
const ignored = (filePath) => {
|
|
3060
|
+
const fullPath = path.isAbsolute(filePath)
|
|
3061
|
+
? filePath
|
|
3062
|
+
: path.join(this.config.searchDirectory, filePath);
|
|
3063
|
+
const isIgnored = this.isExcluded(fullPath);
|
|
3064
|
+
if (isIgnored && this.config.verbose) {
|
|
3065
|
+
if (!this._watcherIgnoredLogCount) {
|
|
3066
|
+
this._watcherIgnoredLogCount = 0;
|
|
3067
|
+
this._watcherIgnoredLastLogAt = 0;
|
|
3068
|
+
}
|
|
3069
|
+
const now = Date.now();
|
|
3070
|
+
const shouldLog =
|
|
3071
|
+
this._watcherIgnoredLogCount < 5 || now - this._watcherIgnoredLastLogAt > 2000;
|
|
3072
|
+
if (shouldLog) {
|
|
3073
|
+
this._watcherIgnoredLogCount += 1;
|
|
3074
|
+
this._watcherIgnoredLastLogAt = now;
|
|
3075
|
+
console.info(`[Indexer] Watcher ignored: ${fullPath}`);
|
|
3076
|
+
}
|
|
3077
|
+
}
|
|
3078
|
+
return isIgnored;
|
|
3079
|
+
};
|
|
3080
|
+
|
|
3081
|
+
const awaitWriteFinish =
|
|
3082
|
+
this._watcherWriteStabilityMs > 0
|
|
3083
|
+
? {
|
|
3084
|
+
stabilityThreshold: this._watcherWriteStabilityMs,
|
|
3085
|
+
pollInterval: 100,
|
|
3086
|
+
}
|
|
3087
|
+
: undefined;
|
|
3088
|
+
|
|
3089
|
+
this.watcher = chokidar.watch(pattern, {
|
|
3090
|
+
cwd: this.config.searchDirectory,
|
|
3091
|
+
ignored,
|
|
3092
|
+
persistent: true,
|
|
3093
|
+
ignoreInitial: true,
|
|
3094
|
+
...(awaitWriteFinish ? { awaitWriteFinish } : {}),
|
|
3095
|
+
});
|
|
3096
|
+
|
|
3097
|
+
this.watcher
|
|
3098
|
+
.on('add', (filePath) => {
|
|
3099
|
+
const fullPath = path.join(this.config.searchDirectory, filePath);
|
|
3100
|
+
console.info(`[Indexer] New file detected: ${filePath}`);
|
|
3101
|
+
|
|
3102
|
+
// Invalidate recency cache for consistency
|
|
3103
|
+
if (this.server && this.server.hybridSearch) {
|
|
3104
|
+
this.server.hybridSearch.clearFileModTime(fullPath);
|
|
3105
|
+
}
|
|
3106
|
+
|
|
3107
|
+
if (this.isIndexing || this.processingWatchEvents) {
|
|
3108
|
+
if (this.config.verbose) {
|
|
3109
|
+
console.info(`[Indexer] Queued add event during indexing: ${filePath}`);
|
|
3110
|
+
}
|
|
3111
|
+
this.enqueueWatchEvent('add', fullPath);
|
|
3112
|
+
return;
|
|
3113
|
+
}
|
|
3114
|
+
|
|
3115
|
+
// Use debounced indexing to consolidate rapid add/change events
|
|
3116
|
+
this.debouncedWatchIndexFile(fullPath, 'add');
|
|
3117
|
+
})
|
|
3118
|
+
.on('change', (filePath) => {
|
|
3119
|
+
const fullPath = path.join(this.config.searchDirectory, filePath);
|
|
3120
|
+
console.info(`[Indexer] File changed: ${filePath}`);
|
|
3121
|
+
|
|
3122
|
+
// Invalidate recency cache for consistency
|
|
3123
|
+
if (this.server && this.server.hybridSearch) {
|
|
3124
|
+
this.server.hybridSearch.clearFileModTime(fullPath);
|
|
3125
|
+
}
|
|
3126
|
+
|
|
3127
|
+
if (this.isIndexing || this.processingWatchEvents) {
|
|
3128
|
+
if (this.config.verbose) {
|
|
3129
|
+
console.info(`[Indexer] Queued change event during indexing: ${filePath}`);
|
|
3130
|
+
}
|
|
3131
|
+
this.enqueueWatchEvent('change', fullPath);
|
|
3132
|
+
return;
|
|
3133
|
+
}
|
|
3134
|
+
|
|
3135
|
+
// Use debounced indexing to consolidate rapid add/change events
|
|
3136
|
+
this.debouncedWatchIndexFile(fullPath, 'change');
|
|
3137
|
+
})
|
|
3138
|
+
.on('unlink', async (filePath) => {
|
|
3139
|
+
const fullPath = path.join(this.config.searchDirectory, filePath);
|
|
3140
|
+
console.info(`[Indexer] File deleted: ${filePath}`);
|
|
3141
|
+
|
|
3142
|
+
if (this.isIndexing || this.processingWatchEvents) {
|
|
3143
|
+
if (this.config.verbose) {
|
|
3144
|
+
console.info(`[Indexer] Queued delete event during indexing: ${filePath}`);
|
|
3145
|
+
}
|
|
3146
|
+
this.enqueueWatchEvent('unlink', fullPath);
|
|
3147
|
+
return;
|
|
3148
|
+
}
|
|
3149
|
+
|
|
3150
|
+
// Invalidate recency cache
|
|
3151
|
+
if (this.server && this.server.hybridSearch) {
|
|
3152
|
+
this.server.hybridSearch.clearFileModTime(fullPath);
|
|
3153
|
+
}
|
|
3154
|
+
|
|
3155
|
+
if (typeof this.cache.ensureLoaded === 'function') {
|
|
3156
|
+
const preferDisk = this.shouldPreferDiskCacheLoad();
|
|
3157
|
+
await this.traceIncrementalMemoryPhase(`watchUnlink.ensureLoaded (${filePath})`, async () => {
|
|
3158
|
+
await this.cache.ensureLoaded({ preferDisk });
|
|
3159
|
+
});
|
|
3160
|
+
}
|
|
3161
|
+
await this.cache.removeFileFromStore(fullPath);
|
|
3162
|
+
this.cache.deleteFileHash(fullPath);
|
|
3163
|
+
await this.traceIncrementalMemoryPhase(`watchUnlink.cacheSave (${filePath})`, async () => {
|
|
3164
|
+
await this.cache.save();
|
|
3165
|
+
});
|
|
3166
|
+
await this.traceIncrementalMemoryPhase(`watchUnlink.cleanup (${filePath})`, async () => {
|
|
3167
|
+
await this.runPostIncrementalCleanup('watch unlink');
|
|
3168
|
+
});
|
|
3169
|
+
})
|
|
3170
|
+
.on('ready', () => {
|
|
3171
|
+
console.info('[Indexer] File watcher ready and monitoring for changes');
|
|
3172
|
+
if (this.config.verbose) {
|
|
3173
|
+
console.info(`[Indexer] Watch root: ${this.config.searchDirectory || 'unknown'}`);
|
|
3174
|
+
console.info(`[Indexer] Watch patterns: ${pattern.length}`);
|
|
3175
|
+
console.info(
|
|
3176
|
+
`[Indexer] Watching extensions: ${this.config.fileExtensions?.length || 0} types`
|
|
3177
|
+
);
|
|
3178
|
+
console.info(
|
|
3179
|
+
`[Indexer] Watching fileNames: ${(this.config.fileNames || []).join(', ') || 'none'}`
|
|
3180
|
+
);
|
|
3181
|
+
console.info(
|
|
3182
|
+
`[Indexer] Exclude patterns: ${(this.config.excludePatterns || []).length} patterns`
|
|
3183
|
+
);
|
|
3184
|
+
console.info('[Indexer] ignoreInitial: true');
|
|
3185
|
+
}
|
|
3186
|
+
})
|
|
3187
|
+
.on('error', (error) => {
|
|
3188
|
+
console.error(`[Indexer] File watcher error: ${error.message}`);
|
|
3189
|
+
if (this.config.verbose) {
|
|
3190
|
+
console.error(`[Indexer] Watcher error details:`, error);
|
|
3191
|
+
}
|
|
3192
|
+
});
|
|
3193
|
+
|
|
3194
|
+
console.info('[Indexer] File watcher starting...');
|
|
3195
|
+
}
|
|
3196
|
+
}
|
|
3197
|
+
|
|
3198
|
+
/**
 * MCP tool definition for the manual reindex feature.
 *
 * @returns {object} A fresh descriptor with the tool `name`, `description`,
 *   JSON `inputSchema` (a single optional boolean `force`) and `annotations`.
 */
export function getToolDefinition() {
  const forceOption = {
    type: 'boolean',
    description: "Force reindex even if files haven't changed",
    default: false,
  };

  const inputSchema = {
    type: 'object',
    properties: { force: forceOption },
  };

  const annotations = {
    title: 'Reindex Codebase',
    readOnlyHint: false,
    destructiveHint: false,
    idempotentHint: true,
    openWorldHint: false,
  };

  // Sentences are joined with single spaces to form the one-line description.
  const description = [
    'Manually trigger a full reindex of the codebase.',
    'This will scan all files and update the embeddings cache.',
    'Useful after large code changes or if the index seems out of date.',
  ].join(' ');

  return {
    name: 'b_index_codebase',
    description,
    inputSchema,
    annotations,
  };
}
|
|
3223
|
+
|
|
3224
|
+
/**
 * Tool handler: run a reindex through the indexer and report the outcome as
 * an MCP text response.
 *
 * @param {object} request - Tool call request; `request.params.arguments.force`
 *   (optional) is forwarded to `indexer.indexAll`.
 * @param {object} indexer - Object exposing `indexAll(force)` and
 *   `cache.getVectorStore()`.
 * @returns {Promise<{content: Array<{type: string, text: string}>}>} A single
 *   text item: either a "skipped" notice or a success summary with statistics.
 */
export async function handleToolCall(request, indexer) {
  // Wrap a string in the single-item text response shape used by both outcomes.
  const asTextResponse = (text) => ({ content: [{ type: 'text', text }] });

  const { arguments: toolArgs } = request.params;
  const force = toolArgs?.force || false;
  const result = await indexer.indexAll(force);

  // Indexing did not run (the result carries a `skipped` flag and a reason).
  if (result?.skipped) {
    return asTextResponse(
      `Indexing skipped: ${result.reason}\n\nPlease wait for the current indexing operation to complete before requesting another reindex.`
    );
  }

  // Totals come from the result when present, otherwise from the vector store.
  const vectorStore = indexer.cache.getVectorStore();
  const totalChunks = result?.totalChunks ?? vectorStore.length;
  const totalFiles = result?.totalFiles ?? new Set(vectorStore.map((entry) => entry.file)).size;
  const filesProcessed = result?.filesProcessed ?? 0;
  const chunksCreated = result?.chunksCreated ?? 0;

  const statLines = [
    'Statistics:',
    `- Total files in index: ${totalFiles}`,
    `- Total code chunks: ${totalChunks}`,
  ];
  // Per-run figures are only reported when something was processed.
  if (filesProcessed > 0) {
    statLines.push(
      `- Files processed this run: ${filesProcessed}`,
      `- Chunks created this run: ${chunksCreated}`
    );
  }

  const sections = ['Codebase reindexed successfully.'];
  if (result?.message) {
    sections.push(`${result.message}`);
  }
  sections.push(statLines.join('\n'));

  return asTextResponse(sections.join('\n\n'));
}
|