@softerist/heuristic-mcp 3.2.2 → 3.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/README.md +387 -376
  2. package/config.jsonc +800 -800
  3. package/features/ann-config.js +102 -110
  4. package/features/clear-cache.js +81 -84
  5. package/features/find-similar-code.js +265 -286
  6. package/features/hybrid-search.js +487 -536
  7. package/features/index-codebase.js +3139 -3270
  8. package/features/lifecycle.js +1041 -1063
  9. package/features/package-version.js +277 -291
  10. package/features/register.js +351 -370
  11. package/features/resources.js +115 -130
  12. package/features/set-workspace.js +214 -240
  13. package/index.js +742 -762
  14. package/lib/cache-ops.js +22 -22
  15. package/lib/cache-utils.js +465 -519
  16. package/lib/cache.js +1699 -1767
  17. package/lib/call-graph.js +396 -396
  18. package/lib/cli.js +232 -226
  19. package/lib/config.js +1483 -1495
  20. package/lib/constants.js +511 -492
  21. package/lib/embed-query-process.js +206 -212
  22. package/lib/embedding-process.js +434 -451
  23. package/lib/embedding-worker.js +862 -934
  24. package/lib/ignore-patterns.js +276 -316
  25. package/lib/json-worker.js +14 -14
  26. package/lib/json-writer.js +302 -310
  27. package/lib/logging.js +116 -127
  28. package/lib/memory-logger.js +13 -13
  29. package/lib/onnx-backend.js +188 -193
  30. package/lib/path-utils.js +18 -23
  31. package/lib/project-detector.js +82 -84
  32. package/lib/server-lifecycle.js +133 -145
  33. package/lib/settings-editor.js +738 -739
  34. package/lib/slice-normalize.js +25 -31
  35. package/lib/tokenizer.js +168 -203
  36. package/lib/utils.js +364 -409
  37. package/lib/vector-store-binary.js +811 -591
  38. package/lib/vector-store-sqlite.js +377 -414
  39. package/lib/workspace-env.js +32 -34
  40. package/mcp_config.json +9 -9
  41. package/package.json +86 -86
  42. package/scripts/clear-cache.js +20 -20
  43. package/scripts/download-model.js +43 -43
  44. package/scripts/mcp-launcher.js +49 -49
  45. package/scripts/postinstall.js +12 -12
  46. package/search-configs.js +36 -36
package/lib/constants.js CHANGED
@@ -1,492 +1,511 @@
1
- /**
2
- * Centralized constants for the heuristic-mcp project.
3
- * Extracting magic numbers improves maintainability and documents design decisions.
4
- */
5
-
6
- // ================================
7
- // Workspace Resolution Constants
8
- // ================================
9
-
10
- /**
11
- * Environment variables checked for workspace resolution, in precedence order.
12
- */
13
- export const WORKSPACE_ENV_VARS = Object.freeze([
14
- 'HEURISTIC_MCP_WORKSPACE',
15
- 'MCP_WORKSPACE',
16
- 'CODEX_WORKSPACE',
17
- 'CODEX_PROJECT_ROOT',
18
- 'CODEX_CWD',
19
- 'WORKSPACE_FOLDER',
20
- 'WORKSPACE_ROOT',
21
- 'CURSOR_WORKSPACE',
22
- 'CLAUDE_WORKSPACE',
23
- 'ANTIGRAVITY_WORKSPACE',
24
- 'INIT_CWD',
25
- ]);
26
-
27
- /**
28
- * Prefixes for dynamic workspace-related env vars (provider-specific).
29
- */
30
- export const DYNAMIC_WORKSPACE_ENV_PREFIXES = Object.freeze([
31
- 'CODEX_',
32
- 'ANTIGRAVITY_',
33
- 'CURSOR_',
34
- 'CLAUDE_',
35
- 'WINDSURF_',
36
- 'WARP_',
37
- 'MCP_',
38
- 'VSCODE_',
39
- ]);
40
-
41
- /**
42
- * Backward-compatible alias for legacy single-prefix consumers.
43
- */
44
- export const DYNAMIC_WORKSPACE_ENV_PREFIX = DYNAMIC_WORKSPACE_ENV_PREFIXES[0];
45
-
46
- /**
47
- * Pattern used when ranking provider-specific workspace env vars.
48
- */
49
- export const WORKSPACE_ENV_KEY_PATTERN = /(WORKSPACE|PROJECT|ROOT|CWD|DIR)/i;
50
-
51
- /**
52
- * Broad token used to discover unknown provider keys safely.
53
- * We only auto-discover generic env keys containing "WORKSPACE".
54
- */
55
- export const WORKSPACE_ENV_GENERIC_DISCOVERY_PATTERN = /WORKSPACE/i;
56
-
57
- // ================================
58
- // Chunking Constants
59
- // ================================
60
-
61
- /**
62
- * Minimum text length for a chunk to be considered valid.
63
- * Prevents tiny fragments from polluting search results.
64
- * Chunks shorter than this are discarded.
65
- */
66
- export const MIN_CHUNK_TEXT_LENGTH = 20;
67
-
68
- /**
69
- * Absolute limit on overlap calculation iterations.
70
- * Prevents unbounded loops when processing files with many zero-token lines.
71
- */
72
- export const MAX_OVERLAP_ITERATIONS = 50;
73
-
74
- /**
75
- * Target token ratio relative to max tokens.
76
- * Chunks aim to be 85% of max capacity to leave room for context.
77
- */
78
- export const TARGET_TOKEN_RATIO = 0.85;
79
-
80
- /**
81
- * Overlap token ratio relative to target tokens.
82
- * 18% overlap provides good context continuity between chunks.
83
- */
84
- export const OVERLAP_TOKEN_RATIO = 0.18;
85
-
86
- // ================================
87
- // Cache Constants
88
- // ================================
89
-
90
- /**
91
- * Maximum entries in the chunking params LRU cache.
92
- * Trade-off: memory vs. lookup time. 100 is sufficient for typical workloads.
93
- */
94
- export const CHUNKING_PARAMS_CACHE_SIZE = 100;
95
-
96
- /**
97
- * JSON files larger than this threshold are parsed in a worker thread.
98
- * Prevents main thread blocking on large cache files.
99
- */
100
- export const JSON_WORKER_THRESHOLD_BYTES = 2 * 1024 * 1024; // 2MB
101
-
102
- // ================================
103
- // Worker Constants
104
- // ================================
105
-
106
- /**
107
- * Number of results to batch before sending to main thread.
108
- * Balances IPC overhead vs. memory usage in worker communication.
109
- */
110
- export const RESULT_BATCH_SIZE = 25;
111
-
112
- /**
113
- * Default batch size for embedding inference.
114
- * Balances memory usage with throughput for ONNX runtime.
115
- */
116
- export const DEFAULT_INFERENCE_BATCH_SIZE = 4;
117
-
118
- /**
119
- * Timeout for worker batch processing before considering it failed.
120
- * Generous timeout to handle large files with complex embeddings.
121
- */
122
- export const WORKER_BATCH_TIMEOUT_MS = 300_000; // 5 minutes
123
-
124
- /**
125
- * Cooldown period after worker failures before retrying worker use.
126
- */
127
- export const WORKER_FAILURE_COOLDOWN_MS = 10 * 60 * 1000; // 10 minutes
128
-
129
- /**
130
- * Delay before starting background indexing after server connects.
131
- * Allows server to accept first request before CPU-intensive work.
132
- */
133
- export const BACKGROUND_INDEX_DELAY_MS = 3000;
134
-
135
- /**
136
- * Concurrency limit for file stat operations during search.
137
- * Prevents EMFILE (too many open files) errors.
138
- */
139
- export const FILE_STAT_CONCURRENCY_LIMIT = 50;
140
-
141
- /**
142
- * Maximum entries in file modification time LRU cache.
143
- */
144
- export const LRU_MAX_ENTRIES = 5000;
145
-
146
- /**
147
- * Target entries after LRU eviction (80% of max).
148
- */
149
- export const LRU_TARGET_ENTRIES = 4000;
150
-
151
- /**
152
- * Interval for logging memory usage during indexing.
153
- * Helps diagnose memory leaks and OOM issues.
154
- */
155
- export const MEMORY_LOG_INTERVAL_MS = 15_000; // 15 seconds
156
-
157
- /**
158
- * Retry delay when SQLite file is locked or busy.
159
- */
160
- export const SQLITE_FILE_RETRY_DELAY_MS = 50;
161
-
162
- /**
163
- * Number of retries when SQLite file is locked or busy.
164
- * Combined with delay: max wait = 50ms * 40 = 2 seconds.
165
- */
166
- export const SQLITE_FILE_RETRY_COUNT = 40;
167
-
168
- // ================================
169
- // Search Constants
170
- // ================================
171
-
172
- /**
173
- * Batch size for scoring chunks during search.
174
- * Yields to event loop between batches to maintain responsiveness.
175
- */
176
- export const SEARCH_SCORING_BATCH_SIZE = 500;
177
-
178
- /**
179
- * Maximum number of files for recency boost IO operations.
180
- * Above this, we rely on cached metadata only to prevent IO storms.
181
- */
182
- export const RECENCY_BOOST_MAX_IO_FILES = 1000;
183
-
184
- /**
185
- * Maximum size for full linear scan fallback.
186
- * Above this, we skip full scan to prevent performance degradation.
187
- */
188
- export const MAX_FULL_SCAN_SIZE = 50_000;
189
-
190
- // ================================
191
- // ANN (Approximate Nearest Neighbor) Constants
192
- // ================================
193
-
194
- /**
195
- * Number of vectors to sample for dimension consistency validation.
196
- */
197
- export const ANN_DIMENSION_SAMPLE_SIZE = 100;
198
-
199
- /**
200
- * Minimum chunks required before enabling ANN index.
201
- * Linear scan is faster for smaller datasets.
202
- */
203
- export const ANN_MIN_CHUNKS_DEFAULT = 5000;
204
-
205
- /**
206
- * Cooldown period after hnswlib load errors before retrying.
207
- * Prevents tight error loops when the native module fails to load.
208
- */
209
- export const HNSWLIB_ERROR_RESET_MS = 5 * 60 * 1000; // 5 minutes
210
-
211
- /**
212
- * Default timeout for waiting on active readers before aborting save.
213
- * Balances data safety with responsiveness.
214
- */
215
- export const DEFAULT_READER_WAIT_TIMEOUT_MS = 5000;
216
-
217
- // ================================
218
- // Embedding Process GC Constants
219
- // ================================
220
-
221
- /**
222
- * Default RSS threshold (MB) for adaptive GC in embedding child process.
223
- */
224
- export const EMBEDDING_PROCESS_DEFAULT_GC_RSS_THRESHOLD_MB = 2048;
225
-
226
- /**
227
- * Minimum interval (ms) between adaptive GC runs in embedding child process.
228
- */
229
- export const EMBEDDING_PROCESS_DEFAULT_GC_MIN_INTERVAL_MS = 15_000;
230
-
231
- /**
232
- * Backstop: run GC after this many requests without collection.
233
- */
234
- export const EMBEDDING_PROCESS_DEFAULT_GC_MAX_REQUESTS_WITHOUT_COLLECTION = 8;
235
-
236
- /**
237
- * Initial mutable state shape for embedding child process GC tracking.
238
- */
239
- export const EMBEDDING_PROCESS_GC_STATE_INITIAL = Object.freeze({
240
- lastRunAtMs: 0,
241
- requestsSinceLastRun: 0,
242
- });
243
-
244
- // ================================
245
- // Vector Store Format Constants
246
- // ================================
247
-
248
- /**
249
- * Binary vector store format version.
250
- * Increment when binary format changes to trigger re-indexing.
251
- */
252
- export const BINARY_STORE_VERSION = 1;
253
-
254
- /**
255
- * SQLite vector store format version.
256
- * Increment when schema changes to trigger re-indexing.
257
- */
258
- export const SQLITE_STORE_VERSION = 1;
259
-
260
- /**
261
- * Binary vector file header size in bytes.
262
- * Contains: magic (4) + version (4) + dim (4) + count (4) + reserved (4)
263
- */
264
- export const BINARY_VECTOR_HEADER_SIZE = 20;
265
-
266
- /**
267
- * Binary record file header size in bytes.
268
- * Contains: magic (4) + version (4) + count (4) + reserved (8)
269
- */
270
- export const BINARY_RECORD_HEADER_SIZE = 20;
271
-
272
- /**
273
- * Binary content file header size in bytes.
274
- * Contains: magic (4) + version (4) + count (4) + reserved (8)
275
- */
276
- export const BINARY_CONTENT_HEADER_SIZE = 20;
277
-
278
- /**
279
- * Size of a single record entry in bytes.
280
- * Contains: file offset (4) + file length (4) + startLine (4) + endLine (4) +
281
- * content offset (4) + content length (4) + reserved (8)
282
- */
283
- export const BINARY_RECORD_SIZE = 32;
284
-
285
- // ================================
286
- // Watcher Constants
287
- // ================================
288
-
289
- /**
290
- * Maximum pending watch events before dropping oldest.
291
- * Prevents memory exhaustion during rapid file churn (e.g., build processes).
292
- */
293
- export const MAX_PENDING_WATCH_EVENTS = 10000;
294
-
295
- /**
296
- * Target size after trimming pending watch events (50% of max).
297
- * Provides hysteresis to avoid repeated trim operations.
298
- */
299
- export const PENDING_WATCH_EVENTS_TRIM_SIZE = 5000;
300
-
301
- // ================================
302
- // ONNX Constants
303
- // ================================
304
-
305
- /**
306
- * Thread limit for ONNX runtime when native backend is unavailable.
307
- * Reduces CPU contention on the main thread.
308
- */
309
- export const ONNX_THREAD_LIMIT = 2;
310
-
311
- /**
312
- * Partial match boost factor for keyword matching in search.
313
- * Applied when query words are found but not exact phrase match.
314
- */
315
- export const PARTIAL_MATCH_BOOST = 0.3;
316
-
317
- /**
318
- * Text match candidate limit for deferred text matching.
319
- * Limits expensive string operations to top candidates.
320
- */
321
- export const TEXT_MATCH_MAX_CANDIDATES = 2000;
322
-
323
- /**
324
- * Concurrency limit for file stat operations in search.
325
- * Prevents EMFILE errors from too many simultaneous file handles.
326
- */
327
- export const STAT_CONCURRENCY_LIMIT = 50;
328
-
329
- /**
330
- * Batch size for scoring chunks in hybrid search.
331
- * Allows event loop to tick between batches for responsiveness.
332
- */
333
- export const SEARCH_BATCH_SIZE = 500;
334
-
335
- // ================================
336
- // MIME Type Constants
337
- // ================================
338
-
339
- /**
340
- * MIME type mapping for file extensions.
341
- * Used by MCP resources and content-type detection.
342
- * Extensions should be lowercase with leading dot.
343
- */
344
- export const MIME_TYPES = {
345
- // JavaScript/TypeScript
346
- '.js': 'text/javascript',
347
- '.mjs': 'text/javascript',
348
- '.cjs': 'text/javascript',
349
- '.ts': 'text/typescript',
350
- '.tsx': 'text/typescript',
351
- '.jsx': 'text/javascript',
352
- '.mts': 'text/typescript',
353
- '.cts': 'text/typescript',
354
-
355
- // Data formats
356
- '.json': 'application/json',
357
- '.json5': 'application/json',
358
- '.jsonc': 'application/json',
359
- '.yaml': 'text/yaml',
360
- '.yml': 'text/yaml',
361
- '.toml': 'text/x-toml',
362
- '.xml': 'application/xml',
363
- '.csv': 'text/csv',
364
-
365
- // Web
366
- '.html': 'text/html',
367
- '.htm': 'text/html',
368
- '.xhtml': 'application/xhtml+xml',
369
- '.css': 'text/css',
370
- '.scss': 'text/x-scss',
371
- '.sass': 'text/x-sass',
372
- '.less': 'text/x-less',
373
- '.styl': 'text/x-stylus',
374
- '.vue': 'text/x-vue',
375
- '.svelte': 'text/x-svelte',
376
- '.astro': 'text/x-astro',
377
-
378
- // Documentation
379
- '.md': 'text/markdown',
380
- '.markdown': 'text/markdown',
381
- '.mdx': 'text/markdown',
382
- '.txt': 'text/plain',
383
- '.rst': 'text/x-rst',
384
- '.adoc': 'text/asciidoc',
385
- '.tex': 'text/x-tex',
386
-
387
- // Python
388
- '.py': 'text/x-python',
389
- '.pyw': 'text/x-python',
390
- '.pyx': 'text/x-cython',
391
-
392
- // Ruby
393
- '.rb': 'text/x-ruby',
394
- '.erb': 'text/x-ruby',
395
- '.rake': 'text/x-ruby',
396
- '.gemspec': 'text/x-ruby',
397
-
398
- // Go
399
- '.go': 'text/x-go',
400
-
401
- // Rust
402
- '.rs': 'text/x-rust',
403
-
404
- // Java/JVM
405
- '.java': 'text/x-java',
406
- '.kt': 'text/x-kotlin',
407
- '.kts': 'text/x-kotlin',
408
- '.groovy': 'text/x-groovy',
409
- '.scala': 'text/x-scala',
410
- '.clj': 'text/x-clojure',
411
- '.cljs': 'text/x-clojure',
412
-
413
- // C/C++
414
- '.c': 'text/x-c',
415
- '.h': 'text/x-c',
416
- '.cpp': 'text/x-c++',
417
- '.cc': 'text/x-c++',
418
- '.cxx': 'text/x-c++',
419
- '.hpp': 'text/x-c++',
420
- '.hxx': 'text/x-c++',
421
- '.m': 'text/x-objectivec',
422
- '.mm': 'text/x-objectivec',
423
-
424
- // .NET
425
- '.cs': 'text/x-csharp',
426
- '.vb': 'text/x-vb',
427
- '.fs': 'text/x-fsharp',
428
-
429
- // Shell
430
- '.sh': 'text/x-shellscript',
431
- '.bash': 'text/x-shellscript',
432
- '.zsh': 'text/x-shellscript',
433
- '.fish': 'text/x-shellscript',
434
- '.bat': 'text/x-batch',
435
- '.cmd': 'text/x-batch',
436
- '.ps1': 'text/x-powershell',
437
- '.psm1': 'text/x-powershell',
438
-
439
- // Database
440
- '.sql': 'text/x-sql',
441
- '.pgsql': 'text/x-sql',
442
- '.mysql': 'text/x-sql',
443
-
444
- // Config
445
- '.ini': 'text/x-ini',
446
- '.cfg': 'text/plain',
447
- '.conf': 'text/plain',
448
- '.properties': 'text/x-properties',
449
- '.env': 'text/plain',
450
-
451
- // Swift/Dart
452
- '.swift': 'text/x-swift',
453
- '.dart': 'text/x-dart',
454
-
455
- // Functional
456
- '.hs': 'text/x-haskell',
457
- '.ml': 'text/x-ocaml',
458
- '.ex': 'text/x-elixir',
459
- '.exs': 'text/x-elixir',
460
- '.erl': 'text/x-erlang',
461
- '.lua': 'text/x-lua',
462
- '.pl': 'text/x-perl',
463
- '.pm': 'text/x-perl',
464
- '.r': 'text/x-r',
465
- '.jl': 'text/x-julia',
466
-
467
- // Build/IaC
468
- '.tf': 'text/x-terraform',
469
- '.hcl': 'text/x-hcl',
470
- '.nix': 'text/x-nix',
471
- '.cmake': 'text/x-cmake',
472
- '.gradle': 'text/x-groovy',
473
- '.dockerfile': 'text/x-dockerfile',
474
-
475
- // API/Schema
476
- '.proto': 'text/x-protobuf',
477
- '.graphql': 'text/x-graphql',
478
- '.gql': 'text/x-graphql',
479
- '.sol': 'text/x-solidity',
480
- '.svg': 'image/svg+xml',
481
- };
482
-
483
- /**
484
- * Get MIME type for a file extension.
485
- * @param {string} ext - File extension (with or without leading dot)
486
- * @returns {string} MIME type or 'text/plain' as default
487
- */
488
- export function getMimeType(ext) {
489
- const normalizedExt = ext.startsWith('.') ? ext.toLowerCase() : `.${ext.toLowerCase()}`;
490
- return MIME_TYPES[normalizedExt] || 'text/plain';
491
- }
492
-
1
+ /**
2
+ * Centralized constants for the heuristic-mcp project.
3
+ * Extracting magic numbers improves maintainability and documents design decisions.
4
+ */
5
+
6
+ // ================================
7
+ // Workspace Resolution Constants
8
+ // ================================
9
+
10
+ /**
11
+ * Environment variables checked for workspace resolution, in precedence order.
12
+ */
13
+ export const WORKSPACE_ENV_VARS = Object.freeze([
14
+ 'HEURISTIC_MCP_WORKSPACE',
15
+ 'MCP_WORKSPACE',
16
+ 'CODEX_WORKSPACE',
17
+ 'CODEX_PROJECT_ROOT',
18
+ 'CODEX_CWD',
19
+ 'WORKSPACE_FOLDER',
20
+ 'WORKSPACE_ROOT',
21
+ 'CURSOR_WORKSPACE',
22
+ 'CLAUDE_WORKSPACE',
23
+ 'ANTIGRAVITY_WORKSPACE',
24
+ 'INIT_CWD',
25
+ ]);
26
+
27
+ /**
28
+ * Prefixes for dynamic workspace-related env vars (provider-specific).
29
+ */
30
+ export const DYNAMIC_WORKSPACE_ENV_PREFIXES = Object.freeze([
31
+ 'CODEX_',
32
+ 'ANTIGRAVITY_',
33
+ 'CURSOR_',
34
+ 'CLAUDE_',
35
+ 'WINDSURF_',
36
+ 'WARP_',
37
+ 'MCP_',
38
+ 'VSCODE_',
39
+ ]);
40
+
41
+ /**
42
+ * Backward-compatible alias for legacy single-prefix consumers.
43
+ */
44
+ export const DYNAMIC_WORKSPACE_ENV_PREFIX = DYNAMIC_WORKSPACE_ENV_PREFIXES[0];
45
+
46
+ /**
47
+ * Pattern used when ranking provider-specific workspace env vars.
48
+ */
49
+ export const WORKSPACE_ENV_KEY_PATTERN = /(WORKSPACE|PROJECT|ROOT|CWD|DIR)/i;
50
+
51
+ /**
52
+ * Broad token used to discover unknown provider keys safely.
53
+ * We only auto-discover generic env keys containing "WORKSPACE".
54
+ */
55
+ export const WORKSPACE_ENV_GENERIC_DISCOVERY_PATTERN = /WORKSPACE/i;
56
+
57
+ // ================================
58
+ // Chunking Constants
59
+ // ================================
60
+
61
+ /**
62
+ * Minimum text length for a chunk to be considered valid.
63
+ * Prevents tiny fragments from polluting search results.
64
+ * Chunks shorter than this are discarded.
65
+ */
66
+ export const MIN_CHUNK_TEXT_LENGTH = 20;
67
+
68
+ /**
69
+ * Absolute limit on overlap calculation iterations.
70
+ * Prevents unbounded loops when processing files with many zero-token lines.
71
+ */
72
+ export const MAX_OVERLAP_ITERATIONS = 50;
73
+
74
+ /**
75
+ * Target token ratio relative to max tokens.
76
+ * Chunks aim to be 85% of max capacity to leave room for context.
77
+ */
78
+ export const TARGET_TOKEN_RATIO = 0.85;
79
+
80
+ /**
81
+ * Overlap token ratio relative to target tokens.
82
+ * 18% overlap provides good context continuity between chunks.
83
+ */
84
+ export const OVERLAP_TOKEN_RATIO = 0.18;
85
+
86
+ // ================================
87
+ // Cache Constants
88
+ // ================================
89
+
90
+ /**
91
+ * Maximum entries in the chunking params LRU cache.
92
+ * Trade-off: memory vs. lookup time. 100 is sufficient for typical workloads.
93
+ */
94
+ export const CHUNKING_PARAMS_CACHE_SIZE = 100;
95
+
96
+ /**
97
+ * JSON files larger than this threshold are parsed in a worker thread.
98
+ * Prevents main thread blocking on large cache files.
99
+ */
100
+ export const JSON_WORKER_THRESHOLD_BYTES = 2 * 1024 * 1024; // 2MB
101
+
102
+ // ================================
103
+ // Worker Constants
104
+ // ================================
105
+
106
+ /**
107
+ * Number of results to batch before sending to main thread.
108
+ * Balances IPC overhead vs. memory usage in worker communication.
109
+ */
110
+ export const RESULT_BATCH_SIZE = 25;
111
+
112
+ /**
113
+ * Default batch size for embedding inference.
114
+ * Balances memory usage with throughput for ONNX runtime.
115
+ */
116
+ export const DEFAULT_INFERENCE_BATCH_SIZE = 4;
117
+
118
+ /**
119
+ * Timeout for worker batch processing before considering it failed.
120
+ * Generous timeout to handle large files with complex embeddings.
121
+ */
122
+ export const WORKER_BATCH_TIMEOUT_MS = 300_000; // 5 minutes
123
+
124
+ /**
125
+ * Cooldown period after worker failures before retrying worker use.
126
+ */
127
+ export const WORKER_FAILURE_COOLDOWN_MS = 10 * 60 * 1000; // 10 minutes
128
+
129
+ /**
130
+ * Delay before starting background indexing after server connects.
131
+ * Allows server to accept first request before CPU-intensive work.
132
+ */
133
+ export const BACKGROUND_INDEX_DELAY_MS = 3000;
134
+
135
+ /**
136
+ * Concurrency limit for file stat operations during search.
137
+ * Prevents EMFILE (too many open files) errors.
138
+ */
139
+ export const FILE_STAT_CONCURRENCY_LIMIT = 50;
140
+
141
+ /**
142
+ * Maximum entries in file modification time LRU cache.
143
+ */
144
+ export const LRU_MAX_ENTRIES = 5000;
145
+
146
+ /**
147
+ * Target entries after LRU eviction (80% of max).
148
+ */
149
+ export const LRU_TARGET_ENTRIES = 4000;
150
+
151
+ /**
152
+ * Interval for logging memory usage during indexing.
153
+ * Helps diagnose memory leaks and OOM issues.
154
+ */
155
+ export const MEMORY_LOG_INTERVAL_MS = 15_000; // 15 seconds
156
+
157
+ /**
158
+ * Retry delay when SQLite file is locked or busy.
159
+ */
160
+ export const SQLITE_FILE_RETRY_DELAY_MS = 50;
161
+
162
+ /**
163
+ * Number of retries when SQLite file is locked or busy.
164
+ * Combined with delay: max wait = 50ms * 40 = 2 seconds.
165
+ */
166
+ export const SQLITE_FILE_RETRY_COUNT = 40;
167
+
168
+ // ================================
169
+ // Search Constants
170
+ // ================================
171
+
172
+ /**
173
+ * Batch size for scoring chunks during search.
174
+ * Yields to event loop between batches to maintain responsiveness.
175
+ */
176
+ export const SEARCH_SCORING_BATCH_SIZE = 500;
177
+
178
+ /**
179
+ * Maximum number of files for recency boost IO operations.
180
+ * Above this, we rely on cached metadata only to prevent IO storms.
181
+ */
182
+ export const RECENCY_BOOST_MAX_IO_FILES = 1000;
183
+
184
+ /**
185
+ * Maximum size for full linear scan fallback.
186
+ * Above this, we skip full scan to prevent performance degradation.
187
+ */
188
+ export const MAX_FULL_SCAN_SIZE = 50_000;
189
+
190
+ // ================================
191
+ // ANN (Approximate Nearest Neighbor) Constants
192
+ // ================================
193
+
194
+ /**
195
+ * Number of vectors to sample for dimension consistency validation.
196
+ */
197
+ export const ANN_DIMENSION_SAMPLE_SIZE = 100;
198
+
199
+ /**
200
+ * Minimum chunks required before enabling ANN index.
201
+ * Linear scan is faster for smaller datasets.
202
+ */
203
+ export const ANN_MIN_CHUNKS_DEFAULT = 5000;
204
+
205
+ /**
206
+ * Cooldown period after hnswlib load errors before retrying.
207
+ * Prevents tight error loops when the native module fails to load.
208
+ */
209
+ export const HNSWLIB_ERROR_RESET_MS = 5 * 60 * 1000; // 5 minutes
210
+
211
+ /**
212
+ * Default timeout for waiting on active readers before aborting save.
213
+ * Balances data safety with responsiveness.
214
+ */
215
+ export const DEFAULT_READER_WAIT_TIMEOUT_MS = 5000;
216
+
217
+ // ================================
218
+ // Embedding Process GC Constants
219
+ // ================================
220
+
221
+ /**
222
+ * Default RSS threshold (MB) for adaptive GC in embedding child process.
223
+ */
224
+ export const EMBEDDING_PROCESS_DEFAULT_GC_RSS_THRESHOLD_MB = 2048;
225
+
226
+ /**
227
+ * Minimum interval (ms) between adaptive GC runs in embedding child process.
228
+ */
229
+ export const EMBEDDING_PROCESS_DEFAULT_GC_MIN_INTERVAL_MS = 15_000;
230
+
231
+ /**
232
+ * Backstop: run GC after this many requests without collection.
233
+ */
234
+ export const EMBEDDING_PROCESS_DEFAULT_GC_MAX_REQUESTS_WITHOUT_COLLECTION = 8;
235
+
236
+ /**
237
+ * Initial mutable state shape for embedding child process GC tracking.
238
+ */
239
+ export const EMBEDDING_PROCESS_GC_STATE_INITIAL = Object.freeze({
240
+ lastRunAtMs: 0,
241
+ requestsSinceLastRun: 0,
242
+ });
243
+
244
+ /**
245
+ * Default idle timeout for the persistent embedding child process.
246
+ * 10 minutes (600,000 ms) prevents frequent process churn while releasing
247
+ * resources during extended inactivity.
248
+ */
249
+ export const EMBEDDING_POOL_IDLE_TIMEOUT_MS = 600000; // 10 minutes
250
+
251
+ // ================================
252
+ // Vector Store Format Constants
253
+ // ================================
254
+
255
+ /**
256
+ * Binary vector store format version.
257
+ * Increment when binary format changes to trigger re-indexing.
258
+ * v2: added writeId (4 bytes) + CRC32 (4 bytes) + 8 bytes reserved to all headers.
259
+ */
260
+ export const BINARY_STORE_VERSION = 2;
261
+
262
+ /**
263
+ * SQLite vector store format version.
264
+ * Increment when schema changes to trigger re-indexing.
265
+ */
266
+ export const SQLITE_STORE_VERSION = 1;
267
+
268
+ /**
269
+ * Binary vector file header size in bytes.
270
+ * Contains: magic (4) + version (4) + dim (4) + count (4) + writeId (4) + crc32 (4) + reserved (8)
271
+ */
272
+ export const BINARY_VECTOR_HEADER_SIZE = 32;
273
+
274
+ /**
275
+ * Binary record file header size in bytes.
276
+ * Contains: magic (4) + version (4) + count (4) + fileCount (4) + writeId (4) + crc32 (4) + reserved (8)
277
+ */
278
+ export const BINARY_RECORD_HEADER_SIZE = 32;
279
+
280
+ /**
281
+ * Binary content file header size in bytes.
282
+ * Contains: magic (4) + version (4) + totalBytes (8) + writeId (4) + crc32 (4) + reserved (8)
283
+ */
284
+ export const BINARY_CONTENT_HEADER_SIZE = 32;
285
+
286
+ /**
287
+ * Size of a single record entry in bytes.
288
+ * Contains: file offset (4) + file length (4) + startLine (4) + endLine (4) +
289
+ * content offset (4) + content length (4) + reserved (8)
290
+ */
291
+ export const BINARY_RECORD_SIZE = 32;
292
+
293
+ // ================================
294
+ // Server Process Constants
295
+ // ================================
296
+
297
+ /**
298
+ * Interval for the robust Keep-Alive mechanism in the main server process.
299
+ * Used to ensure the process doesn't exit when the event loop is empty
300
+ * (e.g., during complex async operations or child process restarts).
301
+ * Value is Max Int32 (approx 24.8 days) to minimize overhead.
302
+ */
303
+ export const SERVER_KEEP_ALIVE_INTERVAL_MS = 2147483647;
304
+
305
+ // ================================
306
+ // Watcher Constants
307
+ // ================================
308
+
309
+ /**
310
+ * Maximum pending watch events before dropping oldest.
311
+ * Prevents memory exhaustion during rapid file churn (e.g., build processes).
312
+ */
313
+ export const MAX_PENDING_WATCH_EVENTS = 10000;
314
+
315
+ /**
316
+ * Target size after trimming pending watch events (50% of max).
317
+ * Provides hysteresis to avoid repeated trim operations.
318
+ */
319
+ export const PENDING_WATCH_EVENTS_TRIM_SIZE = 5000;
320
+
321
+ // ================================
322
+ // ONNX Constants
323
+ // ================================
324
+
325
+ /**
326
+ * Thread limit for ONNX runtime when native backend is unavailable.
327
+ * Reduces CPU contention on the main thread.
328
+ */
329
+ export const ONNX_THREAD_LIMIT = 2;
330
+
331
+ /**
332
+ * Partial match boost factor for keyword matching in search.
333
+ * Applied when query words are found but not exact phrase match.
334
+ */
335
+ export const PARTIAL_MATCH_BOOST = 0.3;
336
+
337
+ /**
338
+ * Text match candidate limit for deferred text matching.
339
+ * Limits expensive string operations to top candidates.
340
+ */
341
+ export const TEXT_MATCH_MAX_CANDIDATES = 2000;
342
+
343
+ /**
344
+ * Concurrency limit for file stat operations in search.
345
+ * Prevents EMFILE errors from too many simultaneous file handles.
346
+ */
347
+ export const STAT_CONCURRENCY_LIMIT = 50;
348
+
349
+ /**
350
+ * Batch size for scoring chunks in hybrid search.
351
+ * Allows event loop to tick between batches for responsiveness.
352
+ */
353
+ export const SEARCH_BATCH_SIZE = 500;
354
+
355
+ // ================================
356
+ // MIME Type Constants
357
+ // ================================
358
+
359
+ /**
360
+ * MIME type mapping for file extensions.
361
+ * Used by MCP resources and content-type detection.
362
+ * Extensions should be lowercase with leading dot.
363
+ */
364
+ export const MIME_TYPES = {
365
+ // JavaScript/TypeScript
366
+ '.js': 'text/javascript',
367
+ '.mjs': 'text/javascript',
368
+ '.cjs': 'text/javascript',
369
+ '.ts': 'text/typescript',
370
+ '.tsx': 'text/typescript',
371
+ '.jsx': 'text/javascript',
372
+ '.mts': 'text/typescript',
373
+ '.cts': 'text/typescript',
374
+
375
+ // Data formats
376
+ '.json': 'application/json',
377
+ '.json5': 'application/json',
378
+ '.jsonc': 'application/json',
379
+ '.yaml': 'text/yaml',
380
+ '.yml': 'text/yaml',
381
+ '.toml': 'text/x-toml',
382
+ '.xml': 'application/xml',
383
+ '.csv': 'text/csv',
384
+
385
+ // Web
386
+ '.html': 'text/html',
387
+ '.htm': 'text/html',
388
+ '.xhtml': 'application/xhtml+xml',
389
+ '.css': 'text/css',
390
+ '.scss': 'text/x-scss',
391
+ '.sass': 'text/x-sass',
392
+ '.less': 'text/x-less',
393
+ '.styl': 'text/x-stylus',
394
+ '.vue': 'text/x-vue',
395
+ '.svelte': 'text/x-svelte',
396
+ '.astro': 'text/x-astro',
397
+
398
+ // Documentation
399
+ '.md': 'text/markdown',
400
+ '.markdown': 'text/markdown',
401
+ '.mdx': 'text/markdown',
402
+ '.txt': 'text/plain',
403
+ '.rst': 'text/x-rst',
404
+ '.adoc': 'text/asciidoc',
405
+ '.tex': 'text/x-tex',
406
+
407
+ // Python
408
+ '.py': 'text/x-python',
409
+ '.pyw': 'text/x-python',
410
+ '.pyx': 'text/x-cython',
411
+
412
+ // Ruby
413
+ '.rb': 'text/x-ruby',
414
+ '.erb': 'text/x-ruby',
415
+ '.rake': 'text/x-ruby',
416
+ '.gemspec': 'text/x-ruby',
417
+
418
+ // Go
419
+ '.go': 'text/x-go',
420
+
421
+ // Rust
422
+ '.rs': 'text/x-rust',
423
+
424
+ // Java/JVM
425
+ '.java': 'text/x-java',
426
+ '.kt': 'text/x-kotlin',
427
+ '.kts': 'text/x-kotlin',
428
+ '.groovy': 'text/x-groovy',
429
+ '.scala': 'text/x-scala',
430
+ '.clj': 'text/x-clojure',
431
+ '.cljs': 'text/x-clojure',
432
+
433
+ // C/C++
434
+ '.c': 'text/x-c',
435
+ '.h': 'text/x-c',
436
+ '.cpp': 'text/x-c++',
437
+ '.cc': 'text/x-c++',
438
+ '.cxx': 'text/x-c++',
439
+ '.hpp': 'text/x-c++',
440
+ '.hxx': 'text/x-c++',
441
+ '.m': 'text/x-objectivec',
442
+ '.mm': 'text/x-objectivec',
443
+
444
+ // .NET
445
+ '.cs': 'text/x-csharp',
446
+ '.vb': 'text/x-vb',
447
+ '.fs': 'text/x-fsharp',
448
+
449
+ // Shell
450
+ '.sh': 'text/x-shellscript',
451
+ '.bash': 'text/x-shellscript',
452
+ '.zsh': 'text/x-shellscript',
453
+ '.fish': 'text/x-shellscript',
454
+ '.bat': 'text/x-batch',
455
+ '.cmd': 'text/x-batch',
456
+ '.ps1': 'text/x-powershell',
457
+ '.psm1': 'text/x-powershell',
458
+
459
+ // Database
460
+ '.sql': 'text/x-sql',
461
+ '.pgsql': 'text/x-sql',
462
+ '.mysql': 'text/x-sql',
463
+
464
+ // Config
465
+ '.ini': 'text/x-ini',
466
+ '.cfg': 'text/plain',
467
+ '.conf': 'text/plain',
468
+ '.properties': 'text/x-properties',
469
+ '.env': 'text/plain',
470
+
471
+ // Swift/Dart
472
+ '.swift': 'text/x-swift',
473
+ '.dart': 'text/x-dart',
474
+
475
+ // Functional
476
+ '.hs': 'text/x-haskell',
477
+ '.ml': 'text/x-ocaml',
478
+ '.ex': 'text/x-elixir',
479
+ '.exs': 'text/x-elixir',
480
+ '.erl': 'text/x-erlang',
481
+ '.lua': 'text/x-lua',
482
+ '.pl': 'text/x-perl',
483
+ '.pm': 'text/x-perl',
484
+ '.r': 'text/x-r',
485
+ '.jl': 'text/x-julia',
486
+
487
+ // Build/IaC
488
+ '.tf': 'text/x-terraform',
489
+ '.hcl': 'text/x-hcl',
490
+ '.nix': 'text/x-nix',
491
+ '.cmake': 'text/x-cmake',
492
+ '.gradle': 'text/x-groovy',
493
+ '.dockerfile': 'text/x-dockerfile',
494
+
495
+ // API/Schema
496
+ '.proto': 'text/x-protobuf',
497
+ '.graphql': 'text/x-graphql',
498
+ '.gql': 'text/x-graphql',
499
+ '.sol': 'text/x-solidity',
500
+ '.svg': 'image/svg+xml',
501
+ };
502
+
503
+ /**
504
+ * Get MIME type for a file extension.
505
+ * @param {string} ext - File extension (with or without leading dot)
506
+ * @returns {string} MIME type or 'text/plain' as default
507
+ */
508
+ export function getMimeType(ext) {
509
+ const normalizedExt = ext.startsWith('.') ? ext.toLowerCase() : `.${ext.toLowerCase()}`;
510
+ return MIME_TYPES[normalizedExt] || 'text/plain';
511
+ }