@softerist/heuristic-mcp 3.2.2 → 3.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +387 -376
- package/config.jsonc +800 -800
- package/features/ann-config.js +102 -110
- package/features/clear-cache.js +81 -84
- package/features/find-similar-code.js +265 -286
- package/features/hybrid-search.js +487 -536
- package/features/index-codebase.js +3139 -3270
- package/features/lifecycle.js +1041 -1063
- package/features/package-version.js +277 -291
- package/features/register.js +351 -370
- package/features/resources.js +115 -130
- package/features/set-workspace.js +214 -240
- package/index.js +742 -762
- package/lib/cache-ops.js +22 -22
- package/lib/cache-utils.js +465 -519
- package/lib/cache.js +1699 -1767
- package/lib/call-graph.js +396 -396
- package/lib/cli.js +232 -226
- package/lib/config.js +1483 -1495
- package/lib/constants.js +511 -492
- package/lib/embed-query-process.js +206 -212
- package/lib/embedding-process.js +434 -451
- package/lib/embedding-worker.js +862 -934
- package/lib/ignore-patterns.js +276 -316
- package/lib/json-worker.js +14 -14
- package/lib/json-writer.js +302 -310
- package/lib/logging.js +116 -127
- package/lib/memory-logger.js +13 -13
- package/lib/onnx-backend.js +188 -193
- package/lib/path-utils.js +18 -23
- package/lib/project-detector.js +82 -84
- package/lib/server-lifecycle.js +133 -145
- package/lib/settings-editor.js +738 -739
- package/lib/slice-normalize.js +25 -31
- package/lib/tokenizer.js +168 -203
- package/lib/utils.js +364 -409
- package/lib/vector-store-binary.js +811 -591
- package/lib/vector-store-sqlite.js +377 -414
- package/lib/workspace-env.js +32 -34
- package/mcp_config.json +9 -9
- package/package.json +86 -86
- package/scripts/clear-cache.js +20 -20
- package/scripts/download-model.js +43 -43
- package/scripts/mcp-launcher.js +49 -49
- package/scripts/postinstall.js +12 -12
- package/search-configs.js +36 -36
package/lib/constants.js
CHANGED
|
@@ -1,492 +1,511 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Centralized constants for the heuristic-mcp project.
|
|
3
|
-
* Extracting magic numbers improves maintainability and documents design decisions.
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
// ================================
|
|
7
|
-
// Workspace Resolution Constants
|
|
8
|
-
// ================================
|
|
9
|
-
|
|
10
|
-
/**
|
|
11
|
-
* Environment variables checked for workspace resolution, in precedence order.
|
|
12
|
-
*/
|
|
13
|
-
export const WORKSPACE_ENV_VARS = Object.freeze([
|
|
14
|
-
'HEURISTIC_MCP_WORKSPACE',
|
|
15
|
-
'MCP_WORKSPACE',
|
|
16
|
-
'CODEX_WORKSPACE',
|
|
17
|
-
'CODEX_PROJECT_ROOT',
|
|
18
|
-
'CODEX_CWD',
|
|
19
|
-
'WORKSPACE_FOLDER',
|
|
20
|
-
'WORKSPACE_ROOT',
|
|
21
|
-
'CURSOR_WORKSPACE',
|
|
22
|
-
'CLAUDE_WORKSPACE',
|
|
23
|
-
'ANTIGRAVITY_WORKSPACE',
|
|
24
|
-
'INIT_CWD',
|
|
25
|
-
]);
|
|
26
|
-
|
|
27
|
-
/**
|
|
28
|
-
* Prefixes for dynamic workspace-related env vars (provider-specific).
|
|
29
|
-
*/
|
|
30
|
-
export const DYNAMIC_WORKSPACE_ENV_PREFIXES = Object.freeze([
|
|
31
|
-
'CODEX_',
|
|
32
|
-
'ANTIGRAVITY_',
|
|
33
|
-
'CURSOR_',
|
|
34
|
-
'CLAUDE_',
|
|
35
|
-
'WINDSURF_',
|
|
36
|
-
'WARP_',
|
|
37
|
-
'MCP_',
|
|
38
|
-
'VSCODE_',
|
|
39
|
-
]);
|
|
40
|
-
|
|
41
|
-
/**
|
|
42
|
-
* Backward-compatible alias for legacy single-prefix consumers.
|
|
43
|
-
*/
|
|
44
|
-
export const DYNAMIC_WORKSPACE_ENV_PREFIX = DYNAMIC_WORKSPACE_ENV_PREFIXES[0];
|
|
45
|
-
|
|
46
|
-
/**
|
|
47
|
-
* Pattern used when ranking provider-specific workspace env vars.
|
|
48
|
-
*/
|
|
49
|
-
export const WORKSPACE_ENV_KEY_PATTERN = /(WORKSPACE|PROJECT|ROOT|CWD|DIR)/i;
|
|
50
|
-
|
|
51
|
-
/**
|
|
52
|
-
* Broad token used to discover unknown provider keys safely.
|
|
53
|
-
* We only auto-discover generic env keys containing "WORKSPACE".
|
|
54
|
-
*/
|
|
55
|
-
export const WORKSPACE_ENV_GENERIC_DISCOVERY_PATTERN = /WORKSPACE/i;
|
|
56
|
-
|
|
57
|
-
// ================================
|
|
58
|
-
// Chunking Constants
|
|
59
|
-
// ================================
|
|
60
|
-
|
|
61
|
-
/**
|
|
62
|
-
* Minimum text length for a chunk to be considered valid.
|
|
63
|
-
* Prevents tiny fragments from polluting search results.
|
|
64
|
-
* Chunks shorter than this are discarded.
|
|
65
|
-
*/
|
|
66
|
-
export const MIN_CHUNK_TEXT_LENGTH = 20;
|
|
67
|
-
|
|
68
|
-
/**
|
|
69
|
-
* Absolute limit on overlap calculation iterations.
|
|
70
|
-
* Prevents unbounded loops when processing files with many zero-token lines.
|
|
71
|
-
*/
|
|
72
|
-
export const MAX_OVERLAP_ITERATIONS = 50;
|
|
73
|
-
|
|
74
|
-
/**
|
|
75
|
-
* Target token ratio relative to max tokens.
|
|
76
|
-
* Chunks aim to be 85% of max capacity to leave room for context.
|
|
77
|
-
*/
|
|
78
|
-
export const TARGET_TOKEN_RATIO = 0.85;
|
|
79
|
-
|
|
80
|
-
/**
|
|
81
|
-
* Overlap token ratio relative to target tokens.
|
|
82
|
-
* 18% overlap provides good context continuity between chunks.
|
|
83
|
-
*/
|
|
84
|
-
export const OVERLAP_TOKEN_RATIO = 0.18;
|
|
85
|
-
|
|
86
|
-
// ================================
|
|
87
|
-
// Cache Constants
|
|
88
|
-
// ================================
|
|
89
|
-
|
|
90
|
-
/**
|
|
91
|
-
* Maximum entries in the chunking params LRU cache.
|
|
92
|
-
* Trade-off: memory vs. lookup time. 100 is sufficient for typical workloads.
|
|
93
|
-
*/
|
|
94
|
-
export const CHUNKING_PARAMS_CACHE_SIZE = 100;
|
|
95
|
-
|
|
96
|
-
/**
|
|
97
|
-
* JSON files larger than this threshold are parsed in a worker thread.
|
|
98
|
-
* Prevents main thread blocking on large cache files.
|
|
99
|
-
*/
|
|
100
|
-
export const JSON_WORKER_THRESHOLD_BYTES = 2 * 1024 * 1024; // 2MB
|
|
101
|
-
|
|
102
|
-
// ================================
|
|
103
|
-
// Worker Constants
|
|
104
|
-
// ================================
|
|
105
|
-
|
|
106
|
-
/**
|
|
107
|
-
* Number of results to batch before sending to main thread.
|
|
108
|
-
* Balances IPC overhead vs. memory usage in worker communication.
|
|
109
|
-
*/
|
|
110
|
-
export const RESULT_BATCH_SIZE = 25;
|
|
111
|
-
|
|
112
|
-
/**
|
|
113
|
-
* Default batch size for embedding inference.
|
|
114
|
-
* Balances memory usage with throughput for ONNX runtime.
|
|
115
|
-
*/
|
|
116
|
-
export const DEFAULT_INFERENCE_BATCH_SIZE = 4;
|
|
117
|
-
|
|
118
|
-
/**
|
|
119
|
-
* Timeout for worker batch processing before considering it failed.
|
|
120
|
-
* Generous timeout to handle large files with complex embeddings.
|
|
121
|
-
*/
|
|
122
|
-
export const WORKER_BATCH_TIMEOUT_MS = 300_000; // 5 minutes
|
|
123
|
-
|
|
124
|
-
/**
|
|
125
|
-
* Cooldown period after worker failures before retrying worker use.
|
|
126
|
-
*/
|
|
127
|
-
export const WORKER_FAILURE_COOLDOWN_MS = 10 * 60 * 1000; // 10 minutes
|
|
128
|
-
|
|
129
|
-
/**
|
|
130
|
-
* Delay before starting background indexing after server connects.
|
|
131
|
-
* Allows server to accept first request before CPU-intensive work.
|
|
132
|
-
*/
|
|
133
|
-
export const BACKGROUND_INDEX_DELAY_MS = 3000;
|
|
134
|
-
|
|
135
|
-
/**
|
|
136
|
-
* Concurrency limit for file stat operations during search.
|
|
137
|
-
* Prevents EMFILE (too many open files) errors.
|
|
138
|
-
*/
|
|
139
|
-
export const FILE_STAT_CONCURRENCY_LIMIT = 50;
|
|
140
|
-
|
|
141
|
-
/**
|
|
142
|
-
* Maximum entries in file modification time LRU cache.
|
|
143
|
-
*/
|
|
144
|
-
export const LRU_MAX_ENTRIES = 5000;
|
|
145
|
-
|
|
146
|
-
/**
|
|
147
|
-
* Target entries after LRU eviction (80% of max).
|
|
148
|
-
*/
|
|
149
|
-
export const LRU_TARGET_ENTRIES = 4000;
|
|
150
|
-
|
|
151
|
-
/**
|
|
152
|
-
* Interval for logging memory usage during indexing.
|
|
153
|
-
* Helps diagnose memory leaks and OOM issues.
|
|
154
|
-
*/
|
|
155
|
-
export const MEMORY_LOG_INTERVAL_MS = 15_000; // 15 seconds
|
|
156
|
-
|
|
157
|
-
/**
|
|
158
|
-
* Retry delay when SQLite file is locked or busy.
|
|
159
|
-
*/
|
|
160
|
-
export const SQLITE_FILE_RETRY_DELAY_MS = 50;
|
|
161
|
-
|
|
162
|
-
/**
|
|
163
|
-
* Number of retries when SQLite file is locked or busy.
|
|
164
|
-
* Combined with delay: max wait = 50ms * 40 = 2 seconds.
|
|
165
|
-
*/
|
|
166
|
-
export const SQLITE_FILE_RETRY_COUNT = 40;
|
|
167
|
-
|
|
168
|
-
// ================================
|
|
169
|
-
// Search Constants
|
|
170
|
-
// ================================
|
|
171
|
-
|
|
172
|
-
/**
|
|
173
|
-
* Batch size for scoring chunks during search.
|
|
174
|
-
* Yields to event loop between batches to maintain responsiveness.
|
|
175
|
-
*/
|
|
176
|
-
export const SEARCH_SCORING_BATCH_SIZE = 500;
|
|
177
|
-
|
|
178
|
-
/**
|
|
179
|
-
* Maximum number of files for recency boost IO operations.
|
|
180
|
-
* Above this, we rely on cached metadata only to prevent IO storms.
|
|
181
|
-
*/
|
|
182
|
-
export const RECENCY_BOOST_MAX_IO_FILES = 1000;
|
|
183
|
-
|
|
184
|
-
/**
|
|
185
|
-
* Maximum size for full linear scan fallback.
|
|
186
|
-
* Above this, we skip full scan to prevent performance degradation.
|
|
187
|
-
*/
|
|
188
|
-
export const MAX_FULL_SCAN_SIZE = 50_000;
|
|
189
|
-
|
|
190
|
-
// ================================
|
|
191
|
-
// ANN (Approximate Nearest Neighbor) Constants
|
|
192
|
-
// ================================
|
|
193
|
-
|
|
194
|
-
/**
|
|
195
|
-
* Number of vectors to sample for dimension consistency validation.
|
|
196
|
-
*/
|
|
197
|
-
export const ANN_DIMENSION_SAMPLE_SIZE = 100;
|
|
198
|
-
|
|
199
|
-
/**
|
|
200
|
-
* Minimum chunks required before enabling ANN index.
|
|
201
|
-
* Linear scan is faster for smaller datasets.
|
|
202
|
-
*/
|
|
203
|
-
export const ANN_MIN_CHUNKS_DEFAULT = 5000;
|
|
204
|
-
|
|
205
|
-
/**
|
|
206
|
-
* Cooldown period after hnswlib load errors before retrying.
|
|
207
|
-
* Prevents tight error loops when the native module fails to load.
|
|
208
|
-
*/
|
|
209
|
-
export const HNSWLIB_ERROR_RESET_MS = 5 * 60 * 1000; // 5 minutes
|
|
210
|
-
|
|
211
|
-
/**
|
|
212
|
-
* Default timeout for waiting on active readers before aborting save.
|
|
213
|
-
* Balances data safety with responsiveness.
|
|
214
|
-
*/
|
|
215
|
-
export const DEFAULT_READER_WAIT_TIMEOUT_MS = 5000;
|
|
216
|
-
|
|
217
|
-
// ================================
|
|
218
|
-
// Embedding Process GC Constants
|
|
219
|
-
// ================================
|
|
220
|
-
|
|
221
|
-
/**
|
|
222
|
-
* Default RSS threshold (MB) for adaptive GC in embedding child process.
|
|
223
|
-
*/
|
|
224
|
-
export const EMBEDDING_PROCESS_DEFAULT_GC_RSS_THRESHOLD_MB = 2048;
|
|
225
|
-
|
|
226
|
-
/**
|
|
227
|
-
* Minimum interval (ms) between adaptive GC runs in embedding child process.
|
|
228
|
-
*/
|
|
229
|
-
export const EMBEDDING_PROCESS_DEFAULT_GC_MIN_INTERVAL_MS = 15_000;
|
|
230
|
-
|
|
231
|
-
/**
|
|
232
|
-
* Backstop: run GC after this many requests without collection.
|
|
233
|
-
*/
|
|
234
|
-
export const EMBEDDING_PROCESS_DEFAULT_GC_MAX_REQUESTS_WITHOUT_COLLECTION = 8;
|
|
235
|
-
|
|
236
|
-
/**
|
|
237
|
-
* Initial mutable state shape for embedding child process GC tracking.
|
|
238
|
-
*/
|
|
239
|
-
export const EMBEDDING_PROCESS_GC_STATE_INITIAL = Object.freeze({
|
|
240
|
-
lastRunAtMs: 0,
|
|
241
|
-
requestsSinceLastRun: 0,
|
|
242
|
-
});
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
*
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
*
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
//
|
|
366
|
-
'.
|
|
367
|
-
'.
|
|
368
|
-
'.
|
|
369
|
-
'.
|
|
370
|
-
'.
|
|
371
|
-
'.
|
|
372
|
-
'.
|
|
373
|
-
'.
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
'.
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
'.
|
|
380
|
-
'.
|
|
381
|
-
'.
|
|
382
|
-
'.
|
|
383
|
-
'.
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
'.
|
|
389
|
-
'.
|
|
390
|
-
'.
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
'.
|
|
394
|
-
'.
|
|
395
|
-
'.
|
|
396
|
-
'.
|
|
397
|
-
|
|
398
|
-
//
|
|
399
|
-
'.
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
'.
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
'.
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
'.
|
|
409
|
-
'.
|
|
410
|
-
'.
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
'.
|
|
415
|
-
'.
|
|
416
|
-
'.
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
'.
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
'.
|
|
423
|
-
|
|
424
|
-
//
|
|
425
|
-
'.
|
|
426
|
-
'.
|
|
427
|
-
'.
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
'.
|
|
431
|
-
'.
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
'.
|
|
435
|
-
'.
|
|
436
|
-
'.
|
|
437
|
-
'.
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
'.
|
|
441
|
-
'.
|
|
442
|
-
'.
|
|
443
|
-
|
|
444
|
-
//
|
|
445
|
-
'.
|
|
446
|
-
'.
|
|
447
|
-
'.
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
'.
|
|
453
|
-
'.
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
'.
|
|
457
|
-
'.
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
'.
|
|
461
|
-
'.
|
|
462
|
-
'.
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
'.
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
'.
|
|
469
|
-
'.
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
'.
|
|
473
|
-
'.
|
|
474
|
-
|
|
475
|
-
//
|
|
476
|
-
'.
|
|
477
|
-
'.
|
|
478
|
-
'.
|
|
479
|
-
'.
|
|
480
|
-
'.
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
1
|
+
/**
|
|
2
|
+
* Centralized constants for the heuristic-mcp project.
|
|
3
|
+
* Extracting magic numbers improves maintainability and documents design decisions.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
// ================================
|
|
7
|
+
// Workspace Resolution Constants
|
|
8
|
+
// ================================
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Environment variables checked for workspace resolution, in precedence order.
|
|
12
|
+
*/
|
|
13
|
+
export const WORKSPACE_ENV_VARS = Object.freeze([
|
|
14
|
+
'HEURISTIC_MCP_WORKSPACE',
|
|
15
|
+
'MCP_WORKSPACE',
|
|
16
|
+
'CODEX_WORKSPACE',
|
|
17
|
+
'CODEX_PROJECT_ROOT',
|
|
18
|
+
'CODEX_CWD',
|
|
19
|
+
'WORKSPACE_FOLDER',
|
|
20
|
+
'WORKSPACE_ROOT',
|
|
21
|
+
'CURSOR_WORKSPACE',
|
|
22
|
+
'CLAUDE_WORKSPACE',
|
|
23
|
+
'ANTIGRAVITY_WORKSPACE',
|
|
24
|
+
'INIT_CWD',
|
|
25
|
+
]);
|
|
26
|
+
|
|
27
|
+
/**
|
|
28
|
+
* Prefixes for dynamic workspace-related env vars (provider-specific).
|
|
29
|
+
*/
|
|
30
|
+
export const DYNAMIC_WORKSPACE_ENV_PREFIXES = Object.freeze([
|
|
31
|
+
'CODEX_',
|
|
32
|
+
'ANTIGRAVITY_',
|
|
33
|
+
'CURSOR_',
|
|
34
|
+
'CLAUDE_',
|
|
35
|
+
'WINDSURF_',
|
|
36
|
+
'WARP_',
|
|
37
|
+
'MCP_',
|
|
38
|
+
'VSCODE_',
|
|
39
|
+
]);
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Backward-compatible alias for legacy single-prefix consumers.
|
|
43
|
+
*/
|
|
44
|
+
export const DYNAMIC_WORKSPACE_ENV_PREFIX = DYNAMIC_WORKSPACE_ENV_PREFIXES[0];
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* Pattern used when ranking provider-specific workspace env vars.
|
|
48
|
+
*/
|
|
49
|
+
export const WORKSPACE_ENV_KEY_PATTERN = /(WORKSPACE|PROJECT|ROOT|CWD|DIR)/i;
|
|
50
|
+
|
|
51
|
+
/**
|
|
52
|
+
* Broad token used to discover unknown provider keys safely.
|
|
53
|
+
* We only auto-discover generic env keys containing "WORKSPACE".
|
|
54
|
+
*/
|
|
55
|
+
export const WORKSPACE_ENV_GENERIC_DISCOVERY_PATTERN = /WORKSPACE/i;
|
|
56
|
+
|
|
57
|
+
// ================================
|
|
58
|
+
// Chunking Constants
|
|
59
|
+
// ================================
|
|
60
|
+
|
|
61
|
+
/**
|
|
62
|
+
* Minimum text length for a chunk to be considered valid.
|
|
63
|
+
* Prevents tiny fragments from polluting search results.
|
|
64
|
+
* Chunks shorter than this are discarded.
|
|
65
|
+
*/
|
|
66
|
+
export const MIN_CHUNK_TEXT_LENGTH = 20;
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Absolute limit on overlap calculation iterations.
|
|
70
|
+
* Prevents unbounded loops when processing files with many zero-token lines.
|
|
71
|
+
*/
|
|
72
|
+
export const MAX_OVERLAP_ITERATIONS = 50;
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Target token ratio relative to max tokens.
|
|
76
|
+
* Chunks aim to be 85% of max capacity to leave room for context.
|
|
77
|
+
*/
|
|
78
|
+
export const TARGET_TOKEN_RATIO = 0.85;
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* Overlap token ratio relative to target tokens.
|
|
82
|
+
* 18% overlap provides good context continuity between chunks.
|
|
83
|
+
*/
|
|
84
|
+
export const OVERLAP_TOKEN_RATIO = 0.18;
|
|
85
|
+
|
|
86
|
+
// ================================
|
|
87
|
+
// Cache Constants
|
|
88
|
+
// ================================
|
|
89
|
+
|
|
90
|
+
/**
|
|
91
|
+
* Maximum entries in the chunking params LRU cache.
|
|
92
|
+
* Trade-off: memory vs. lookup time. 100 is sufficient for typical workloads.
|
|
93
|
+
*/
|
|
94
|
+
export const CHUNKING_PARAMS_CACHE_SIZE = 100;
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* JSON files larger than this threshold are parsed in a worker thread.
|
|
98
|
+
* Prevents main thread blocking on large cache files.
|
|
99
|
+
*/
|
|
100
|
+
export const JSON_WORKER_THRESHOLD_BYTES = 2 * 1024 * 1024; // 2MB
|
|
101
|
+
|
|
102
|
+
// ================================
|
|
103
|
+
// Worker Constants
|
|
104
|
+
// ================================
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Number of results to batch before sending to main thread.
|
|
108
|
+
* Balances IPC overhead vs. memory usage in worker communication.
|
|
109
|
+
*/
|
|
110
|
+
export const RESULT_BATCH_SIZE = 25;
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Default batch size for embedding inference.
|
|
114
|
+
* Balances memory usage with throughput for ONNX runtime.
|
|
115
|
+
*/
|
|
116
|
+
export const DEFAULT_INFERENCE_BATCH_SIZE = 4;
|
|
117
|
+
|
|
118
|
+
/**
|
|
119
|
+
* Timeout for worker batch processing before considering it failed.
|
|
120
|
+
* Generous timeout to handle large files with complex embeddings.
|
|
121
|
+
*/
|
|
122
|
+
export const WORKER_BATCH_TIMEOUT_MS = 300_000; // 5 minutes
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Cooldown period after worker failures before retrying worker use.
|
|
126
|
+
*/
|
|
127
|
+
export const WORKER_FAILURE_COOLDOWN_MS = 10 * 60 * 1000; // 10 minutes
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Delay before starting background indexing after server connects.
|
|
131
|
+
* Allows server to accept first request before CPU-intensive work.
|
|
132
|
+
*/
|
|
133
|
+
export const BACKGROUND_INDEX_DELAY_MS = 3000;
|
|
134
|
+
|
|
135
|
+
/**
|
|
136
|
+
* Concurrency limit for file stat operations during search.
|
|
137
|
+
* Prevents EMFILE (too many open files) errors.
|
|
138
|
+
*/
|
|
139
|
+
export const FILE_STAT_CONCURRENCY_LIMIT = 50;
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Maximum entries in file modification time LRU cache.
|
|
143
|
+
*/
|
|
144
|
+
export const LRU_MAX_ENTRIES = 5000;
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Target entries after LRU eviction (80% of max).
|
|
148
|
+
*/
|
|
149
|
+
export const LRU_TARGET_ENTRIES = 4000;
|
|
150
|
+
|
|
151
|
+
/**
|
|
152
|
+
* Interval for logging memory usage during indexing.
|
|
153
|
+
* Helps diagnose memory leaks and OOM issues.
|
|
154
|
+
*/
|
|
155
|
+
export const MEMORY_LOG_INTERVAL_MS = 15_000; // 15 seconds
|
|
156
|
+
|
|
157
|
+
/**
|
|
158
|
+
* Retry delay when SQLite file is locked or busy.
|
|
159
|
+
*/
|
|
160
|
+
export const SQLITE_FILE_RETRY_DELAY_MS = 50;
|
|
161
|
+
|
|
162
|
+
/**
|
|
163
|
+
* Number of retries when SQLite file is locked or busy.
|
|
164
|
+
* Combined with delay: max wait = 50ms * 40 = 2 seconds.
|
|
165
|
+
*/
|
|
166
|
+
export const SQLITE_FILE_RETRY_COUNT = 40;
|
|
167
|
+
|
|
168
|
+
// ================================
|
|
169
|
+
// Search Constants
|
|
170
|
+
// ================================
|
|
171
|
+
|
|
172
|
+
/**
|
|
173
|
+
* Batch size for scoring chunks during search.
|
|
174
|
+
* Yields to event loop between batches to maintain responsiveness.
|
|
175
|
+
*/
|
|
176
|
+
export const SEARCH_SCORING_BATCH_SIZE = 500;
|
|
177
|
+
|
|
178
|
+
/**
|
|
179
|
+
* Maximum number of files for recency boost IO operations.
|
|
180
|
+
* Above this, we rely on cached metadata only to prevent IO storms.
|
|
181
|
+
*/
|
|
182
|
+
export const RECENCY_BOOST_MAX_IO_FILES = 1000;
|
|
183
|
+
|
|
184
|
+
/**
|
|
185
|
+
* Maximum size for full linear scan fallback.
|
|
186
|
+
* Above this, we skip full scan to prevent performance degradation.
|
|
187
|
+
*/
|
|
188
|
+
export const MAX_FULL_SCAN_SIZE = 50_000;
|
|
189
|
+
|
|
190
|
+
// ================================
|
|
191
|
+
// ANN (Approximate Nearest Neighbor) Constants
|
|
192
|
+
// ================================
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* Number of vectors to sample for dimension consistency validation.
|
|
196
|
+
*/
|
|
197
|
+
export const ANN_DIMENSION_SAMPLE_SIZE = 100;
|
|
198
|
+
|
|
199
|
+
/**
|
|
200
|
+
* Minimum chunks required before enabling ANN index.
|
|
201
|
+
* Linear scan is faster for smaller datasets.
|
|
202
|
+
*/
|
|
203
|
+
export const ANN_MIN_CHUNKS_DEFAULT = 5000;
|
|
204
|
+
|
|
205
|
+
/**
|
|
206
|
+
* Cooldown period after hnswlib load errors before retrying.
|
|
207
|
+
* Prevents tight error loops when the native module fails to load.
|
|
208
|
+
*/
|
|
209
|
+
export const HNSWLIB_ERROR_RESET_MS = 5 * 60 * 1000; // 5 minutes
|
|
210
|
+
|
|
211
|
+
/**
|
|
212
|
+
* Default timeout for waiting on active readers before aborting save.
|
|
213
|
+
* Balances data safety with responsiveness.
|
|
214
|
+
*/
|
|
215
|
+
export const DEFAULT_READER_WAIT_TIMEOUT_MS = 5000;
|
|
216
|
+
|
|
217
|
+
// ================================
|
|
218
|
+
// Embedding Process GC Constants
|
|
219
|
+
// ================================
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Default RSS threshold (MB) for adaptive GC in embedding child process.
|
|
223
|
+
*/
|
|
224
|
+
export const EMBEDDING_PROCESS_DEFAULT_GC_RSS_THRESHOLD_MB = 2048;
|
|
225
|
+
|
|
226
|
+
/**
|
|
227
|
+
* Minimum interval (ms) between adaptive GC runs in embedding child process.
|
|
228
|
+
*/
|
|
229
|
+
export const EMBEDDING_PROCESS_DEFAULT_GC_MIN_INTERVAL_MS = 15_000;
|
|
230
|
+
|
|
231
|
+
/**
|
|
232
|
+
* Backstop: run GC after this many requests without collection.
|
|
233
|
+
*/
|
|
234
|
+
export const EMBEDDING_PROCESS_DEFAULT_GC_MAX_REQUESTS_WITHOUT_COLLECTION = 8;
|
|
235
|
+
|
|
236
|
+
/**
|
|
237
|
+
* Frozen template of the initial values for embedding child process GC tracking state (copy it to obtain a mutable state object).
|
|
238
|
+
*/
|
|
239
|
+
export const EMBEDDING_PROCESS_GC_STATE_INITIAL = Object.freeze({
|
|
240
|
+
lastRunAtMs: 0,
|
|
241
|
+
requestsSinceLastRun: 0,
|
|
242
|
+
});
|
|
243
|
+
|
|
244
|
+
/**
|
|
245
|
+
* Default idle timeout for the persistent embedding child process.
|
|
246
|
+
* 10 minutes (600,000 ms) prevents frequent process churn while releasing
|
|
247
|
+
* resources during extended inactivity.
|
|
248
|
+
*/
|
|
249
|
+
export const EMBEDDING_POOL_IDLE_TIMEOUT_MS = 600000; // 10 minutes
|
|
250
|
+
|
|
251
|
+
// ================================
|
|
252
|
+
// Vector Store Format Constants
|
|
253
|
+
// ================================
|
|
254
|
+
|
|
255
|
+
/**
|
|
256
|
+
* Binary vector store format version.
|
|
257
|
+
* Increment when binary format changes to trigger re-indexing.
|
|
258
|
+
* v2: added writeId (4 bytes) + CRC32 (4 bytes) + 8 bytes reserved to all headers.
|
|
259
|
+
*/
|
|
260
|
+
export const BINARY_STORE_VERSION = 2;
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* SQLite vector store format version.
|
|
264
|
+
* Increment when schema changes to trigger re-indexing.
|
|
265
|
+
*/
|
|
266
|
+
export const SQLITE_STORE_VERSION = 1;
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Binary vector file header size in bytes.
|
|
270
|
+
* Contains: magic (4) + version (4) + dim (4) + count (4) + writeId (4) + crc32 (4) + reserved (8)
|
|
271
|
+
*/
|
|
272
|
+
export const BINARY_VECTOR_HEADER_SIZE = 32;
|
|
273
|
+
|
|
274
|
+
/**
|
|
275
|
+
* Binary record file header size in bytes.
|
|
276
|
+
* Contains: magic (4) + version (4) + count (4) + fileCount (4) + writeId (4) + crc32 (4) + reserved (8)
|
|
277
|
+
*/
|
|
278
|
+
export const BINARY_RECORD_HEADER_SIZE = 32;
|
|
279
|
+
|
|
280
|
+
/**
|
|
281
|
+
* Binary content file header size in bytes.
|
|
282
|
+
* Contains: magic (4) + version (4) + totalBytes (8) + writeId (4) + crc32 (4) + reserved (8)
|
|
283
|
+
*/
|
|
284
|
+
export const BINARY_CONTENT_HEADER_SIZE = 32;
|
|
285
|
+
|
|
286
|
+
/**
|
|
287
|
+
* Size of a single record entry in bytes.
|
|
288
|
+
* Contains: file offset (4) + file length (4) + startLine (4) + endLine (4) +
|
|
289
|
+
* content offset (4) + content length (4) + reserved (8)
|
|
290
|
+
*/
|
|
291
|
+
export const BINARY_RECORD_SIZE = 32;
|
|
292
|
+
|
|
293
|
+
// ================================
|
|
294
|
+
// Server Process Constants
|
|
295
|
+
// ================================
|
|
296
|
+
|
|
297
|
+
/**
|
|
298
|
+
* Interval for the robust Keep-Alive mechanism in the main server process.
|
|
299
|
+
* Used to ensure the process doesn't exit when the event loop is empty
|
|
300
|
+
* (e.g., during complex async operations or child process restarts).
|
|
301
|
+
* Value is Max Int32 (approx 24.8 days) to minimize overhead.
|
|
302
|
+
*/
|
|
303
|
+
export const SERVER_KEEP_ALIVE_INTERVAL_MS = 2147483647;
|
|
304
|
+
|
|
305
|
+
// ================================
|
|
306
|
+
// Watcher Constants
|
|
307
|
+
// ================================
|
|
308
|
+
|
|
309
|
+
/**
|
|
310
|
+
* Maximum pending watch events before dropping oldest.
|
|
311
|
+
* Prevents memory exhaustion during rapid file churn (e.g., build processes).
|
|
312
|
+
*/
|
|
313
|
+
export const MAX_PENDING_WATCH_EVENTS = 10000;
|
|
314
|
+
|
|
315
|
+
/**
|
|
316
|
+
* Target size after trimming pending watch events (50% of max).
|
|
317
|
+
* Provides hysteresis to avoid repeated trim operations.
|
|
318
|
+
*/
|
|
319
|
+
export const PENDING_WATCH_EVENTS_TRIM_SIZE = 5000;
|
|
320
|
+
|
|
321
|
+
// ================================
|
|
322
|
+
// ONNX & Search Tuning Constants
|
|
323
|
+
// ================================
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* Thread limit for ONNX runtime when native backend is unavailable.
|
|
327
|
+
* Reduces CPU contention on the main thread.
|
|
328
|
+
*/
|
|
329
|
+
export const ONNX_THREAD_LIMIT = 2;
|
|
330
|
+
|
|
331
|
+
/**
|
|
332
|
+
* Partial match boost factor for keyword matching in search.
|
|
333
|
+
* Applied when query words are found but not exact phrase match.
|
|
334
|
+
*/
|
|
335
|
+
export const PARTIAL_MATCH_BOOST = 0.3;
|
|
336
|
+
|
|
337
|
+
/**
|
|
338
|
+
* Text match candidate limit for deferred text matching.
|
|
339
|
+
* Limits expensive string operations to top candidates.
|
|
340
|
+
*/
|
|
341
|
+
export const TEXT_MATCH_MAX_CANDIDATES = 2000;
|
|
342
|
+
|
|
343
|
+
/**
|
|
344
|
+
* Concurrency limit for file stat operations in search.
|
|
345
|
+
* Prevents EMFILE errors from too many simultaneous file handles (same value as FILE_STAT_CONCURRENCY_LIMIT; keep the two in sync).
|
|
346
|
+
*/
|
|
347
|
+
export const STAT_CONCURRENCY_LIMIT = 50;
|
|
348
|
+
|
|
349
|
+
/**
|
|
350
|
+
* Batch size for scoring chunks in hybrid search.
|
|
351
|
+
* Allows event loop to tick between batches for responsiveness (same value as SEARCH_SCORING_BATCH_SIZE; keep the two in sync).
|
|
352
|
+
*/
|
|
353
|
+
export const SEARCH_BATCH_SIZE = 500;
|
|
354
|
+
|
|
355
|
+
// ================================
|
|
356
|
+
// MIME Type Constants
|
|
357
|
+
// ================================
|
|
358
|
+
|
|
359
|
+
/**
|
|
360
|
+
* MIME type mapping for file extensions.
|
|
361
|
+
* Used by MCP resources and content-type detection.
|
|
362
|
+
* Extensions should be lowercase with leading dot.
|
|
363
|
+
*/
|
|
364
|
+
export const MIME_TYPES = {
|
|
365
|
+
// JavaScript/TypeScript
|
|
366
|
+
'.js': 'text/javascript',
|
|
367
|
+
'.mjs': 'text/javascript',
|
|
368
|
+
'.cjs': 'text/javascript',
|
|
369
|
+
'.ts': 'text/typescript',
|
|
370
|
+
'.tsx': 'text/typescript',
|
|
371
|
+
'.jsx': 'text/javascript',
|
|
372
|
+
'.mts': 'text/typescript',
|
|
373
|
+
'.cts': 'text/typescript',
|
|
374
|
+
|
|
375
|
+
// Data formats
|
|
376
|
+
'.json': 'application/json',
|
|
377
|
+
'.json5': 'application/json',
|
|
378
|
+
'.jsonc': 'application/json',
|
|
379
|
+
'.yaml': 'text/yaml',
|
|
380
|
+
'.yml': 'text/yaml',
|
|
381
|
+
'.toml': 'text/x-toml',
|
|
382
|
+
'.xml': 'application/xml',
|
|
383
|
+
'.csv': 'text/csv',
|
|
384
|
+
|
|
385
|
+
// Web
|
|
386
|
+
'.html': 'text/html',
|
|
387
|
+
'.htm': 'text/html',
|
|
388
|
+
'.xhtml': 'application/xhtml+xml',
|
|
389
|
+
'.css': 'text/css',
|
|
390
|
+
'.scss': 'text/x-scss',
|
|
391
|
+
'.sass': 'text/x-sass',
|
|
392
|
+
'.less': 'text/x-less',
|
|
393
|
+
'.styl': 'text/x-stylus',
|
|
394
|
+
'.vue': 'text/x-vue',
|
|
395
|
+
'.svelte': 'text/x-svelte',
|
|
396
|
+
'.astro': 'text/x-astro',
|
|
397
|
+
|
|
398
|
+
// Documentation
|
|
399
|
+
'.md': 'text/markdown',
|
|
400
|
+
'.markdown': 'text/markdown',
|
|
401
|
+
'.mdx': 'text/markdown',
|
|
402
|
+
'.txt': 'text/plain',
|
|
403
|
+
'.rst': 'text/x-rst',
|
|
404
|
+
'.adoc': 'text/asciidoc',
|
|
405
|
+
'.tex': 'text/x-tex',
|
|
406
|
+
|
|
407
|
+
// Python
|
|
408
|
+
'.py': 'text/x-python',
|
|
409
|
+
'.pyw': 'text/x-python',
|
|
410
|
+
'.pyx': 'text/x-cython',
|
|
411
|
+
|
|
412
|
+
// Ruby
|
|
413
|
+
'.rb': 'text/x-ruby',
|
|
414
|
+
'.erb': 'text/x-ruby',
|
|
415
|
+
'.rake': 'text/x-ruby',
|
|
416
|
+
'.gemspec': 'text/x-ruby',
|
|
417
|
+
|
|
418
|
+
// Go
|
|
419
|
+
'.go': 'text/x-go',
|
|
420
|
+
|
|
421
|
+
// Rust
|
|
422
|
+
'.rs': 'text/x-rust',
|
|
423
|
+
|
|
424
|
+
// Java/JVM
|
|
425
|
+
'.java': 'text/x-java',
|
|
426
|
+
'.kt': 'text/x-kotlin',
|
|
427
|
+
'.kts': 'text/x-kotlin',
|
|
428
|
+
'.groovy': 'text/x-groovy',
|
|
429
|
+
'.scala': 'text/x-scala',
|
|
430
|
+
'.clj': 'text/x-clojure',
|
|
431
|
+
'.cljs': 'text/x-clojure',
|
|
432
|
+
|
|
433
|
+
// C/C++
|
|
434
|
+
'.c': 'text/x-c',
|
|
435
|
+
'.h': 'text/x-c',
|
|
436
|
+
'.cpp': 'text/x-c++',
|
|
437
|
+
'.cc': 'text/x-c++',
|
|
438
|
+
'.cxx': 'text/x-c++',
|
|
439
|
+
'.hpp': 'text/x-c++',
|
|
440
|
+
'.hxx': 'text/x-c++',
|
|
441
|
+
'.m': 'text/x-objectivec',
|
|
442
|
+
'.mm': 'text/x-objectivec',
|
|
443
|
+
|
|
444
|
+
// .NET
|
|
445
|
+
'.cs': 'text/x-csharp',
|
|
446
|
+
'.vb': 'text/x-vb',
|
|
447
|
+
'.fs': 'text/x-fsharp',
|
|
448
|
+
|
|
449
|
+
// Shell
|
|
450
|
+
'.sh': 'text/x-shellscript',
|
|
451
|
+
'.bash': 'text/x-shellscript',
|
|
452
|
+
'.zsh': 'text/x-shellscript',
|
|
453
|
+
'.fish': 'text/x-shellscript',
|
|
454
|
+
'.bat': 'text/x-batch',
|
|
455
|
+
'.cmd': 'text/x-batch',
|
|
456
|
+
'.ps1': 'text/x-powershell',
|
|
457
|
+
'.psm1': 'text/x-powershell',
|
|
458
|
+
|
|
459
|
+
// Database
|
|
460
|
+
'.sql': 'text/x-sql',
|
|
461
|
+
'.pgsql': 'text/x-sql',
|
|
462
|
+
'.mysql': 'text/x-sql',
|
|
463
|
+
|
|
464
|
+
// Config
|
|
465
|
+
'.ini': 'text/x-ini',
|
|
466
|
+
'.cfg': 'text/plain',
|
|
467
|
+
'.conf': 'text/plain',
|
|
468
|
+
'.properties': 'text/x-properties',
|
|
469
|
+
'.env': 'text/plain',
|
|
470
|
+
|
|
471
|
+
// Swift/Dart
|
|
472
|
+
'.swift': 'text/x-swift',
|
|
473
|
+
'.dart': 'text/x-dart',
|
|
474
|
+
|
|
475
|
+
// Functional & scripting
|
|
476
|
+
'.hs': 'text/x-haskell',
|
|
477
|
+
'.ml': 'text/x-ocaml',
|
|
478
|
+
'.ex': 'text/x-elixir',
|
|
479
|
+
'.exs': 'text/x-elixir',
|
|
480
|
+
'.erl': 'text/x-erlang',
|
|
481
|
+
'.lua': 'text/x-lua',
|
|
482
|
+
'.pl': 'text/x-perl',
|
|
483
|
+
'.pm': 'text/x-perl',
|
|
484
|
+
'.r': 'text/x-r',
|
|
485
|
+
'.jl': 'text/x-julia',
|
|
486
|
+
|
|
487
|
+
// Build/IaC
|
|
488
|
+
'.tf': 'text/x-terraform',
|
|
489
|
+
'.hcl': 'text/x-hcl',
|
|
490
|
+
'.nix': 'text/x-nix',
|
|
491
|
+
'.cmake': 'text/x-cmake',
|
|
492
|
+
'.gradle': 'text/x-groovy',
|
|
493
|
+
'.dockerfile': 'text/x-dockerfile',
|
|
494
|
+
|
|
495
|
+
// API/Schema, smart contracts & vector graphics
|
|
496
|
+
'.proto': 'text/x-protobuf',
|
|
497
|
+
'.graphql': 'text/x-graphql',
|
|
498
|
+
'.gql': 'text/x-graphql',
|
|
499
|
+
'.sol': 'text/x-solidity',
|
|
500
|
+
'.svg': 'image/svg+xml',
|
|
501
|
+
};
|
|
502
|
+
|
|
503
|
+
/**
 * Get MIME type for a file extension.
 *
 * The lookup is case-insensitive and accepts the extension with or without
 * its leading dot (`'js'`, `'.js'` and `'.JS'` are equivalent).
 *
 * @param {string} ext - File extension (with or without leading dot)
 * @returns {string} MIME type from MIME_TYPES, or 'text/plain' when the
 *   extension is unknown, empty, or not a string
 */
export function getMimeType(ext) {
  // Non-string input (e.g. undefined/null for a missing extension) would throw
  // a TypeError on `.startsWith`; return the documented default instead.
  if (typeof ext !== 'string' || ext.length === 0) {
    return 'text/plain';
  }

  const lowered = ext.toLowerCase();
  const normalizedExt = lowered.startsWith('.') ? lowered : `.${lowered}`;

  // Own-property check so inherited Object.prototype members can never be
  // returned as a MIME type.
  if (!Object.prototype.hasOwnProperty.call(MIME_TYPES, normalizedExt)) {
    return 'text/plain';
  }
  return MIME_TYPES[normalizedExt] || 'text/plain';
}
|