sweet-search 0.0.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (161)
  1. package/LICENSE +190 -0
  2. package/NOTICE +23 -0
  3. package/core/cli.js +51 -0
  4. package/core/config.js +27 -0
  5. package/core/embedding/embedding-cache.js +467 -0
  6. package/core/embedding/embedding-local-model.js +845 -0
  7. package/core/embedding/embedding-remote.js +492 -0
  8. package/core/embedding/embedding-service.js +712 -0
  9. package/core/embedding/embedding-telemetry.js +219 -0
  10. package/core/embedding/index.js +40 -0
  11. package/core/graph/community-detector.js +294 -0
  12. package/core/graph/graph-expansion.js +839 -0
  13. package/core/graph/graph-extractor.js +2304 -0
  14. package/core/graph/graph-search.js +2148 -0
  15. package/core/graph/hcgs-generator.js +666 -0
  16. package/core/graph/index.js +16 -0
  17. package/core/graph/leiden-algorithm.js +547 -0
  18. package/core/graph/relationship-resolver.js +366 -0
  19. package/core/graph/repo-map.js +408 -0
  20. package/core/graph/summary-manager.js +549 -0
  21. package/core/indexing/artifact-builder.js +1054 -0
  22. package/core/indexing/ast-chunker.js +709 -0
  23. package/core/indexing/chunking/chunk-builder.js +170 -0
  24. package/core/indexing/chunking/markdown-chunker.js +503 -0
  25. package/core/indexing/chunking/plaintext-chunker.js +104 -0
  26. package/core/indexing/dedup/dedup-phase.js +159 -0
  27. package/core/indexing/dedup/exemplar-selector.js +65 -0
  28. package/core/indexing/document-chunker.js +56 -0
  29. package/core/indexing/incremental-parser.js +390 -0
  30. package/core/indexing/incremental-tracker.js +761 -0
  31. package/core/indexing/index-codebase-v21.js +472 -0
  32. package/core/indexing/index-maintainer.mjs +1674 -0
  33. package/core/indexing/index.js +90 -0
  34. package/core/indexing/indexer-ann.js +1077 -0
  35. package/core/indexing/indexer-build.js +742 -0
  36. package/core/indexing/indexer-phases.js +800 -0
  37. package/core/indexing/indexer-pool.js +764 -0
  38. package/core/indexing/indexer-sparse-gram.js +98 -0
  39. package/core/indexing/indexer-utils.js +536 -0
  40. package/core/indexing/indexer-worker.js +148 -0
  41. package/core/indexing/li-skip-policy.js +225 -0
  42. package/core/indexing/merkle-tracker.js +244 -0
  43. package/core/indexing/model-pool.js +166 -0
  44. package/core/infrastructure/code-graph-repository.js +120 -0
  45. package/core/infrastructure/codebase-repository.js +131 -0
  46. package/core/infrastructure/config/dedup.js +54 -0
  47. package/core/infrastructure/config/embedding.js +298 -0
  48. package/core/infrastructure/config/graph.js +80 -0
  49. package/core/infrastructure/config/index.js +82 -0
  50. package/core/infrastructure/config/indexing.js +8 -0
  51. package/core/infrastructure/config/platform.js +254 -0
  52. package/core/infrastructure/config/ranking.js +221 -0
  53. package/core/infrastructure/config/search.js +396 -0
  54. package/core/infrastructure/config/translation.js +89 -0
  55. package/core/infrastructure/config/vector-store.js +114 -0
  56. package/core/infrastructure/constants.js +86 -0
  57. package/core/infrastructure/coreml-cascade.js +909 -0
  58. package/core/infrastructure/coreml-cascade.json +46 -0
  59. package/core/infrastructure/coreml-provider.js +81 -0
  60. package/core/infrastructure/db-utils.js +69 -0
  61. package/core/infrastructure/dedup-hashing.js +83 -0
  62. package/core/infrastructure/hardware-capability.js +332 -0
  63. package/core/infrastructure/index.js +104 -0
  64. package/core/infrastructure/language-patterns/maps.js +121 -0
  65. package/core/infrastructure/language-patterns/registry-core.js +323 -0
  66. package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
  67. package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
  68. package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
  69. package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
  70. package/core/infrastructure/language-patterns/registry.js +19 -0
  71. package/core/infrastructure/language-patterns.js +141 -0
  72. package/core/infrastructure/llm-provider.js +733 -0
  73. package/core/infrastructure/manifest.json +46 -0
  74. package/core/infrastructure/maxsim.wasm +0 -0
  75. package/core/infrastructure/model-fetcher.js +423 -0
  76. package/core/infrastructure/model-registry.js +214 -0
  77. package/core/infrastructure/native-inference.js +587 -0
  78. package/core/infrastructure/native-resolver.js +187 -0
  79. package/core/infrastructure/native-sparse-gram.js +257 -0
  80. package/core/infrastructure/native-tokenizer.js +160 -0
  81. package/core/infrastructure/onnx-mutex.js +45 -0
  82. package/core/infrastructure/onnx-session-utils.js +261 -0
  83. package/core/infrastructure/ort-pipeline.js +111 -0
  84. package/core/infrastructure/project-detector.js +102 -0
  85. package/core/infrastructure/quantization.js +410 -0
  86. package/core/infrastructure/simd-distance.js +502 -0
  87. package/core/infrastructure/simd-distance.wasm +0 -0
  88. package/core/infrastructure/tree-sitter-provider.js +665 -0
  89. package/core/infrastructure/webgpu-maxsim.js +222 -0
  90. package/core/query/index.js +35 -0
  91. package/core/query/intent-detector.js +201 -0
  92. package/core/query/intent-router.js +156 -0
  93. package/core/query/query-router-catboost.js +222 -0
  94. package/core/query/query-router-ml.js +266 -0
  95. package/core/query/query-router.js +213 -0
  96. package/core/ranking/cascaded-scorer.js +379 -0
  97. package/core/ranking/flashrank.js +810 -0
  98. package/core/ranking/index.js +49 -0
  99. package/core/ranking/late-interaction-index.js +2383 -0
  100. package/core/ranking/late-interaction-model.js +812 -0
  101. package/core/ranking/local-reranker.js +374 -0
  102. package/core/ranking/mmr.js +379 -0
  103. package/core/ranking/quality-scorer.js +363 -0
  104. package/core/search/context-expander.js +1167 -0
  105. package/core/search/dedup/sibling-expander.js +327 -0
  106. package/core/search/index.js +16 -0
  107. package/core/search/search-boost.js +259 -0
  108. package/core/search/search-cli.js +544 -0
  109. package/core/search/search-format.js +282 -0
  110. package/core/search/search-fusion.js +327 -0
  111. package/core/search/search-hybrid.js +204 -0
  112. package/core/search/search-pattern-chunks.js +337 -0
  113. package/core/search/search-pattern-planner.js +439 -0
  114. package/core/search/search-pattern-prefilter.js +412 -0
  115. package/core/search/search-pattern-ripgrep.js +663 -0
  116. package/core/search/search-pattern.js +463 -0
  117. package/core/search/search-postprocess.js +452 -0
  118. package/core/search/search-semantic.js +706 -0
  119. package/core/search/search-server.js +554 -0
  120. package/core/search/session-daemon-prewarm.mjs +164 -0
  121. package/core/search/session-warmup.js +595 -0
  122. package/core/search/sweet-search.js +632 -0
  123. package/core/search/warmup-metrics.js +532 -0
  124. package/core/start-server.js +6 -0
  125. package/core/training/query-router/features/extractor.js +762 -0
  126. package/core/training/query-router/features/multilingual-patterns.js +431 -0
  127. package/core/training/query-router/features/text-segmenter.js +303 -0
  128. package/core/training/query-router/features/unicode-utils.js +383 -0
  129. package/core/training/query-router/output/v45_router_d4.js +11521 -0
  130. package/core/training/query-router/output/v46_router_d4.js +11498 -0
  131. package/core/vector-store/binary-heap.js +227 -0
  132. package/core/vector-store/binary-hnsw-index.js +1004 -0
  133. package/core/vector-store/float-vector-store.js +234 -0
  134. package/core/vector-store/hnsw-index.js +580 -0
  135. package/core/vector-store/index.js +39 -0
  136. package/core/vector-store/seismic-index.js +498 -0
  137. package/core/vocabulary/index.js +84 -0
  138. package/core/vocabulary/vocab-constants.js +20 -0
  139. package/core/vocabulary/vocab-miner-extractors.js +375 -0
  140. package/core/vocabulary/vocab-miner-nl.js +404 -0
  141. package/core/vocabulary/vocab-miner-utils.js +146 -0
  142. package/core/vocabulary/vocab-miner.js +574 -0
  143. package/core/vocabulary/vocab-prewarm-cli.js +110 -0
  144. package/core/vocabulary/vocab-ranker.js +492 -0
  145. package/core/vocabulary/vocab-warmer.js +523 -0
  146. package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
  147. package/core/vocabulary/vocabulary-utils.js +704 -0
  148. package/crates/wasm-router/pkg/package.json +13 -0
  149. package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
  150. package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
  151. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
  152. package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
  153. package/mcp/config-gen.js +121 -0
  154. package/mcp/server.js +335 -0
  155. package/mcp/tool-handlers.js +476 -0
  156. package/package.json +131 -9
  157. package/scripts/benchmark-harness.js +794 -0
  158. package/scripts/init.js +1058 -0
  159. package/scripts/smoke-test.js +435 -0
  160. package/scripts/uninstall.js +478 -0
  161. package/scripts/verify-runtime.js +176 -0
@@ -0,0 +1,2304 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * Code Graph Extractor
5
+ *
6
+ * Builds a knowledge graph from codebase:
7
+ * - Entities: classes, interfaces, methods, fields, enums
8
+ * - Relationships: extends, implements, calls, uses, throws, overrides
9
+ *
10
+ * Stores in SQLite with FTS5 for fast lexical search.
11
+ */
12
+
13
+ import { createHash } from 'crypto';
14
+ import path from 'path';
15
+ import fs from 'fs/promises';
16
+ import { GRAPH_CONFIG, DB_PATHS } from '../infrastructure/config/index.js';
17
+ import { getLanguageByPath } from '../infrastructure/language-patterns.js';
18
+ import { getTreeSitterProvider } from '../infrastructure/tree-sitter-provider.js';
19
+
20
// Schema version - increment whenever a schema change requires a full reindex.
// Users should run `/index-codebase --full` after upgrading.
// This value is the default persisted by setSchemaVersion() into schema_meta.
export const SCHEMA_VERSION = 2;
23
+
24
/**
 * Normalize an identifier into searchable alias tokens.
 *
 * Splits camelCase, PascalCase, snake_case and letter/digit boundaries, then
 * emits the lowercased split tokens followed by the collapsed
 * (separator-free) form, with duplicates removed.
 *
 * @param {string} name - The original identifier name. Falsy values yield '';
 *   other non-string values are coerced with String() instead of throwing.
 * @returns {string} Space-separated alias tokens (lowercased, deduped)
 *
 * @example
 * normalizeIdentifier('UserService')   // 'user service userservice'
 * normalizeIdentifier('getUserName')   // 'get user name getusername'
 * normalizeIdentifier('get_user_name') // 'get user name getusername'
 * normalizeIdentifier('HTMLParser2')   // 'html parser 2 htmlparser2'
 * normalizeIdentifier('OAuth2Client')  // 'oauth 2 client oauth2client'
 * normalizeIdentifier('auth.service')  // 'auth service authservice'
 */
export function normalizeIdentifier(name) {
  if (!name) return '';

  // Steps 1-4: split on separators and camelCase/PascalCase boundaries.
  const split = String(name)
    // Insert space before acronym→word transitions (e.g. HTMLParser -> HTML Parser).
    // Requires 2+ uppercase chars before the boundary so single-letter prefixes
    // are not split off (OAuth stays intact).
    .replace(/([A-Z]{2,})([A-Z][a-z])/g, '$1 $2')
    // Insert space at camelCase boundaries (e.g. getUser -> get User).
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    // Insert space at digit boundaries (e.g. Parser2 -> Parser 2, v2Handler -> v 2 Handler).
    .replace(/([a-zA-Z])(\d)/g, '$1 $2')
    .replace(/(\d)([a-zA-Z])/g, '$1 $2')
    // Split on separators: _ - . / : \
    .replace(/[_\-./:\\]/g, ' ');

  // Steps 5-6: lowercase and normalize whitespace.
  const tokens = split.toLowerCase().split(/\s+/).filter((token) => token.length > 0);

  // Step 7: emit both the split tokens and the collapsed form. The Set drops
  // the collapsed form when it duplicates a token (single-word identifiers).
  const collapsed = tokens.join('');
  const uniqueTokens = [...new Set([...tokens, collapsed])];

  return uniqueTokens.join(' ');
}
65
+
66
/**
 * Record the schema version in the `schema_meta` table, creating the table
 * first when it does not exist. Intended to run once schema
 * creation/migration has succeeded.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {number} [version=SCHEMA_VERSION] - Stored as a string under the key 'version'
 */
export function setSchemaVersion(db, version = SCHEMA_VERSION) {
  const createMetaTableSql = `CREATE TABLE IF NOT EXISTS schema_meta (key TEXT PRIMARY KEY, value TEXT)`;
  db.exec(createMetaTableSql);

  const upsertVersion = db.prepare('INSERT OR REPLACE INTO schema_meta (key, value) VALUES (?, ?)');
  upsertVersion.run('version', String(version));
}
76
+
77
/**
 * Look up the CREATE statement stored in sqlite_master for a table.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {string} tableName
 * @returns {string} The stored SQL, or '' when the table is missing or has no SQL
 */
function getTableSql(db, tableName) {
  const lookup = db.prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name = ?");
  const found = lookup.get(tableName);
  if (!found || !found.sql) {
    return '';
  }
  return found.sql;
}
81
+
82
/**
 * Canonicalize SQL text for substring comparison: lowercase it and collapse
 * every whitespace run into a single space.
 *
 * @param {string} sql
 * @returns {string}
 */
function normalizeSql(sql) {
  const lowered = sql.toLowerCase();
  return lowered.replace(/\s+/g, ' ');
}
85
+
86
/**
 * Check whether a stored CREATE statement for `entities_fts` contains the
 * name_alias column, the porter/unicode61 tokenizer and the 2/3/4 prefix
 * option. Matching is textual, on the normalized SQL.
 *
 * @param {string} sql
 * @returns {boolean}
 */
function hasExpectedEntitiesFtsSchema(sql) {
  const haystack = normalizeSql(sql);
  const requiredFragments = [
    'name_alias',
    "tokenize='porter unicode61'",
    "prefix='2 3 4'",
  ];
  return requiredFragments.every((fragment) => haystack.includes(fragment));
}
92
+
93
/**
 * Check whether a stored CREATE statement for `entities_trigram` uses the
 * trigram tokenizer and is an external-content table over `entities` keyed by
 * rowid. Matching is textual, on the normalized SQL.
 *
 * @param {string} sql
 * @returns {boolean}
 */
function hasExpectedTrigramSchema(sql) {
  const haystack = normalizeSql(sql);
  const requiredFragments = [
    "tokenize='trigram'",
    "content='entities'",
    "content_rowid='rowid'",
  ];
  return requiredFragments.every((fragment) => haystack.includes(fragment));
}
99
+
100
/**
 * Fill in `name_alias` for every entity row that has a name but a NULL or
 * blank alias. All updates run inside a single transaction.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {number} Number of rows that were selected for backfill
 */
function backfillNameAliases(db) {
  const pending = db.prepare(`
    SELECT id, name
    FROM entities
    WHERE name IS NOT NULL
      AND (name_alias IS NULL OR trim(name_alias) = '')
  `).all();

  const pendingCount = pending.length;
  if (pendingCount === 0) {
    return 0;
  }

  const aliasWriter = db.prepare(`UPDATE entities SET name_alias = ? WHERE id = ?`);
  const writeAll = db.transaction((rows) => {
    rows.forEach(({ id, name }) => {
      aliasWriter.run(normalizeIdentifier(name), id);
    });
  });

  writeAll(pending);
  return pendingCount;
}
122
+
123
/**
 * Make sure the two lexical FTS5 tables (`entities_fts`, `entities_trigram`)
 * exist with the expected definitions. When either table is missing or its
 * stored SQL does not match, both are dropped before being re-created.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{rebuilt: boolean}} `rebuilt` is true when the tables were dropped
 *   (or absent) and therefore created fresh by this call
 */
function ensureLexicalFtsSchema(db) {
  const ftsSql = getTableSql(db, 'entities_fts');
  const trigramSql = getTableSql(db, 'entities_trigram');

  // Both tables must exist and both definitions must match; otherwise rebuild.
  const schemasAreCurrent = Boolean(ftsSql)
    && Boolean(trigramSql)
    && hasExpectedEntitiesFtsSchema(ftsSql)
    && hasExpectedTrigramSchema(trigramSql);
  const needsRebuild = !schemasAreCurrent;

  if (needsRebuild) {
    for (const staleTable of ['entities_fts', 'entities_trigram']) {
      db.exec(`DROP TABLE IF EXISTS ${staleTable}`);
    }
  }

  const createFtsSql = `
    CREATE VIRTUAL TABLE IF NOT EXISTS entities_fts USING fts5(
      name,
      name_alias,
      signature,
      doc_comment,
      content='entities',
      content_rowid='rowid',
      tokenize='porter unicode61',
      prefix='2 3 4'
    )
  `;
  const createTrigramSql = `
    CREATE VIRTUAL TABLE IF NOT EXISTS entities_trigram USING fts5(
      name,
      signature,
      content='entities',
      content_rowid='rowid',
      tokenize='trigram'
    )
  `;

  db.exec(createFtsSql);
  db.exec(createTrigramSql);

  return { rebuilt: needsRebuild };
}
161
+
162
+ // =============================================================================
163
+ // ENTITY EXTRACTION PATTERNS
164
+ // =============================================================================
165
+
166
// Regular expressions for Java source. Every pattern except `package` carries
// the /g flag, so it is stateful under .test()/.exec() and returns no capture
// groups from String.prototype.match(); use matchAll() or a non-global copy
// when capture groups are needed.
const JAVA_PATTERNS = {
  // Class declarations.
  // Groups: 1 = class name, 2 = superclass (optional), 3 = implements list (optional).
  class: /(?:public|private|protected)?\s*(?:static)?\s*(?:final|abstract)?\s*class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w,\s]+))?/g,

  // Interface declarations.
  // Groups: 1 = interface name, 2 = extended interface list (optional).
  interface: /(?:public)?\s*interface\s+(\w+)(?:\s+extends\s+([\w,\s]+))?/g,

  // Enum declarations. Group 1 = enum name.
  enum: /(?:public)?\s*enum\s+(\w+)/g,

  // Method declarations (optionally preceded by annotations and modifiers).
  // Groups: 1 = return type, 2 = method name, 3 = raw parameter list.
  method: /(?:@\w+\s*(?:\([^)]*\))?\s*)*(?:public|private|protected)?\s*(?:static)?\s*(?:final)?\s*(?:synchronized)?\s*(?:<[\w\s,<>?]+>\s*)?(\w+(?:<[\w\s,<>?]+>)?(?:\[\])?)\s+(\w+)\s*\(([^)]*)\)/g,

  // Field declarations (an access modifier is required).
  // Groups: 1 = field type, 2 = field name.
  field: /(?:public|private|protected)\s+(?:static)?\s*(?:final)?\s*(\w+(?:<[\w\s,<>?]+>)?(?:\[\])?)\s+(\w+)\s*[;=]/g,

  // Method calls of the form `receiver.method(`.
  // Groups: 1 = receiver, 2 = method name.
  methodCall: /(\w+)\s*\.\s*(\w+)\s*\(/g,

  // Imports (supports static and wildcard: import com.foo.*; import static com.bar.Baz.*).
  // Group 1 = dotted import path, including a trailing `.*` when present.
  import: /import\s+(?:static\s+)?([a-zA-Z_][\w.]*(?:\.\*)?)\s*;/g,

  // Throw statements. Group 1 = thrown type name.
  throw: /throw\s+new\s+(\w+)/g,

  // Package declaration (non-global: first match only). Group 1 = package name.
  package: /package\s+([\w.]+)\s*;/,
};
194
+
195
// Regular expressions for JavaScript source. All patterns carry the /g flag,
// so they are stateful under .test()/.exec().
const JS_PATTERNS = {
  // Function declarations (optionally exported and/or async). Group 1 = name.
  function: /(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(/g,

  // Arrow functions bound to const/let/var with a parenthesized parameter
  // list. Group 1 = variable name.
  arrowFunction: /(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>/g,

  // Class declarations. Groups: 1 = class name, 2 = superclass (optional).
  class: /(?:export\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?/g,

  // React components (capitalized names). Group 1 = component name.
  // NOTE(review): the name must be followed by `=` or `:`, so a plain
  // `function Foo(` declaration does not match this pattern — confirm intent.
  component: /(?:export\s+)?(?:const|function)\s+([A-Z]\w+)\s*[=:]/g,

  // Method calls of the form `receiver.method(`.
  // Groups: 1 = receiver, 2 = method name.
  methodCall: /(\w+)\s*\.\s*(\w+)\s*\(/g,

  // ES module imports.
  // Groups: 1 = named import list, 2 = default import name, 3 = module specifier.
  import: /import\s+(?:{([^}]+)}|(\w+))\s+from\s+['"]([^'"]+)['"]/g,
};
214
+
215
// Regular expressions for Protocol Buffers source. All patterns carry the /g
// flag, so they are stateful under .test()/.exec().
const PROTO_PATTERNS = {
  // Message declarations. Group 1 = message name.
  message: /message\s+(\w+)\s*\{/g,

  // Service declarations. Group 1 = service name.
  service: /service\s+(\w+)\s*\{/g,

  // RPC declarations.
  // Groups: 1 = rpc name, 2 = request type, 3 = response type.
  rpc: /rpc\s+(\w+)\s*\(\s*(\w+)\s*\)\s+returns\s+\(\s*(\w+)\s*\)/g,

  // Enum declarations. Group 1 = enum name.
  enum: /enum\s+(\w+)\s*\{/g,
};
228
+
229
// Frozen lookup from a pattern key to a relationship type. The only values
// used are 'imports', 'extends', 'implements' and 'uses'.
// NOTE(review): the keys look like pattern names from the language registry
// used by the generic extractor — confirm against the consumer.
export const GENERIC_RELATIONSHIP_MAPPING = Object.freeze({
  // Keys mapped to 'imports'
  import: 'imports',
  plainImport: 'imports',
  include: 'imports',
  require: 'imports',
  reexport: 'imports',
  dynamicImport: 'imports',
  use: 'imports',
  prepend: 'imports',
  open: 'imports',
  source: 'imports',
  from: 'imports',
  forward: 'imports',
  using: 'imports',
  link: 'imports',
  script: 'imports',
  copyFrom: 'imports',
  alias: 'imports',
  namespace: 'imports',
  ref: 'imports',
  dep: 'imports',
  package: 'imports',
  // Keys mapped to 'extends'
  extends: 'extends',
  inherit: 'extends',
  mixin: 'extends',
  with: 'extends',
  category: 'extends',
  // Keys mapped to 'implements'
  implements: 'implements',
  protocol: 'implements',
  implFor: 'implements',
  // Keys mapped to 'uses'
  decorator: 'uses',
  embed: 'uses',
  extend: 'uses',
  anchor: 'uses',
  derive: 'uses',
  throw: 'uses',
  img: 'uses',
  form: 'uses',
  methodOf: 'uses',
});
269
+
270
// Frozen, currently empty list.
// NOTE(review): presumably names pattern types that are deliberately left out
// of GENERIC_RELATIONSHIP_MAPPING — confirm against the consumer.
export const INTENTIONAL_DEFAULT_RELATIONSHIP_TYPES = Object.freeze([]);
271
// Backslash-escape every regex metacharacter in `value` so the result can be
// embedded in a RegExp source and match the original text literally.
const escapeRegexLiteral = (value) => {
  const regexMetaChars = /[.*+?^${}()|[\]\\]/g;
  return value.replace(regexMetaChars, '\\$&');
};
272
+
273
// Pattern types whose regex capture groups commonly contain comma-separated
// lists of targets rather than a single name.
// Kept as a module-scope constant to avoid allocating a Set on every call.
const MULTI_TARGET_TYPES = new Set([
  'plainImport', 'implements', 'inherit', 'protocol', 'with',
]);
278
+
279
// Frozen table of numeric priorities keyed by entity type label.
// NOTE(review): presumably a higher number wins when tree-sitter symbols are
// normalized into entities — the comparison itself is not visible here; confirm.
export const TREE_SITTER_ENTITY_PRIORITY = Object.freeze({
  component: 40,
  class: 35,
  function: 30,
  method: 25,
  arrowFunction: 20,
  interface: 20,
  typeAlias: 20,
  enum: 20,
  namespace: 20,
  struct: 30,
  record: 30,
  module: 25,
  trait: 25,
  impl: 20,
  decorator: 15,
});
296
+
297
// Module-scope constants for extractJavaScript() — avoid per-call/per-line allocation.

// Names of built-in/global objects (console, Math, JSON, ...).
// NOTE(review): presumably method calls on these receivers are not recorded as
// 'calls' relationships — confirm in extractJavaScript().
const JS_CALL_SKIP_OBJECTS = new Set([
  'console', 'Math', 'JSON', 'Object', 'Array', 'Promise', 'process', 'Buffer', 'Date',
]);

// Control-flow keywords.
// NOTE(review): presumably used to reject matches where a keyword such as
// `if (` or `for (` looks like a function/method name — confirm in extractJavaScript().
const JS_RESERVED_WORDS = new Set([
  'if', 'else', 'for', 'while', 'switch', 'catch', 'with', 'do', 'try', 'return',
]);
304
+
305
// Import-like relationship patterns for extractJavaScript() — DRYs up five inline blocks.
// Each entry pairs a non-global regex with the index of the capture group
// that holds the module specifier (group 1 in every entry).
const JS_IMPORT_PATTERNS = [
  // ES module import: import x from '...' / import { a, b } from '...'
  { regex: /import\s+(?:\{[^}]+\}|\w+)\s+from\s+['"]([^'"]+)['"]/, group: 1 },
  // CommonJS require bound to a variable or destructuring pattern
  { regex: /(?:const|let|var)\s+(?:\{[^}]+\}|\w+)\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)/, group: 1 },
  // Re-export: export { a } from '...' / export * from '...'
  { regex: /export\s+(?:\{[^}]+\}|\*)\s+from\s+['"]([^'"]+)['"]/, group: 1 },
  // Dynamic import with a string-literal specifier
  { regex: /(?:await\s+)?import\s*\(\s*['"]([^'"]+)['"]\s*\)/, group: 1 },
];
312
+
313
/**
 * Split a string on commas that sit outside any bracket pair.
 *
 * Commas nested inside <>, (), [] or {} are left alone. A closing bracket with
 * no matching opener is tolerated: nesting never drops below zero.
 *
 * @param {string} str
 * @returns {string[]} The pieces between top-level commas, untrimmed; always
 *   at least one element
 */
export function splitTopLevelCommas(str) {
  const segments = [];
  let nesting = 0;
  let segmentStart = 0;
  let cursor = 0;

  while (cursor < str.length) {
    switch (str[cursor]) {
      case '<':
      case '(':
      case '[':
      case '{':
        nesting += 1;
        break;
      case '>':
      case ')':
      case ']':
      case '}':
        nesting = nesting > 0 ? nesting - 1 : 0;
        break;
      case ',':
        if (nesting === 0) {
          segments.push(str.slice(segmentStart, cursor));
          segmentStart = cursor + 1;
        }
        break;
      default:
        break;
    }
    cursor += 1;
  }

  segments.push(str.slice(segmentStart));
  return segments;
}
333
+
334
+ // =============================================================================
335
+ // GRAPH EXTRACTOR CLASS
336
+ // =============================================================================
337
+
338
+ export class GraphExtractor {
339
+ constructor(options) {
340
+ this.projectRoot = options?.projectRoot || process.cwd();
341
+ this.entities = new Map();
342
+ this.relationships = [];
343
+ this.currentFile = null;
344
+ this.currentClass = null;
345
+ this.packageName = '';
346
+ this._useTreeSitter = options?.useTreeSitter !== false;
347
+ this.warnOnPatternDrop = options?.warnOnPatternDrop || false;
348
+ this.maxRegexLineLength = options?.maxRegexLineLength || 4000;
349
+ this.debugCounters = {
350
+ emptyCapture: {
351
+ entity: 0,
352
+ relationship: 0,
353
+ },
354
+ skippedLongLines: 0,
355
+ byLanguage: {},
356
+ byPattern: {},
357
+ };
358
+ this.patternPrefilterCache = new Map();
359
+ this.methodCallRegexCache = new Map();
360
+ this.genericPatternPlanCache = new Map();
361
+ }
362
+
363
+ /**
364
+ * Extract entities and relationships from a file.
365
+ * Dispatches to specialized extractors for Java/JS/Proto,
366
+ * generic registry-based extractor for all other languages.
367
+ */
368
+ async extractFromFile(filePath, content) {
369
+ this.currentFile = filePath;
370
+ const lines = content.split('\n');
371
+ const langInfo = getLanguageByPath(filePath);
372
+
373
+ if (!langInfo) {
374
+ return { entities: [], relationships: [] };
375
+ }
376
+
377
+ // Try tree-sitter extraction first (more accurate than regex)
378
+ if (this._useTreeSitter) {
379
+ try {
380
+ const provider = getTreeSitterProvider();
381
+ if (await provider.isAvailable() && provider.hasLanguage(langInfo.id)) {
382
+ const symbols = await provider.extractSymbols(content, langInfo.id);
383
+ if (symbols && symbols.length > 0) {
384
+ // Convert tree-sitter symbols to graph entities format and align
385
+ // labels with regex semantics (component/object arrow distinctions).
386
+ const entities = this._normalizeTreeSitterEntities(filePath, symbols, langInfo.id);
387
+ // Still extract relationships with regex (tree-sitter only gives definitions)
388
+ const relationships = this._extractRelationships(content, lines, filePath, langInfo, entities);
389
+ return { entities, relationships };
390
+ }
391
+ }
392
+ } catch {
393
+ // Fall through to regex extraction
394
+ }
395
+ }
396
+
397
+ // Specialized extractors for languages with complex logic
398
+ if (langInfo.id === 'java') {
399
+ return this.extractJava(content, lines, filePath);
400
+ }
401
+ if (langInfo.id === 'javascript') {
402
+ return this.extractJavaScript(content, lines, filePath);
403
+ }
404
+ if (langInfo.id === 'proto') {
405
+ return this.extractProto(content, lines, filePath);
406
+ }
407
+
408
+ // Generic registry-based extraction for all other languages
409
+ if (langInfo.graph) {
410
+ return this.extractGeneric(content, lines, filePath, langInfo);
411
+ }
412
+
413
+ return { entities: [], relationships: [] };
414
+ }
415
+
416
+ /**
417
+ * Extract from Java file
418
+ */
419
+ extractJava(content, lines, filePath) {
420
+ const entities = [];
421
+ const relationships = [];
422
+
423
+ // Extract package
424
+ const pkgMatch = content.match(JAVA_PATTERNS.package);
425
+ this.packageName = pkgMatch ? pkgMatch[1] : '';
426
+
427
+ // Extract Java imports (Phase 3.2: Java Import Extraction)
428
+ // Creates 'imports' relationships for dependency tracking
429
+ const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
430
+ const importMatches = content.matchAll(JAVA_PATTERNS.import);
431
+
432
+ for (const match of importMatches) {
433
+ const importPath = match[1];
434
+ const isStatic = match[0].includes('static');
435
+ const isWildcard = importPath.endsWith('.*');
436
+
437
+ // Find the line number of this import by counting newlines before match position
438
+ // Note: Uses regex match which creates an array; for truly allocation-free counting,
439
+ // would need a manual loop, but this is fast enough for typical file sizes (<10k lines)
440
+ const importLine = (content.substring(0, match.index).match(/\n/g) || []).length + 1;
441
+
442
+ // Extract the class name for target resolution
443
+ // For "com.example.services.AuthService" -> target_name = "AuthService"
444
+ // For "com.example.services.*" -> target_name = "services" (package - won't resolve)
445
+ // For static "com.example.utils.Constants.MAX_VALUE" -> target_name = "Constants" (class only)
446
+ // For static "com.example.utils.Constants.*" -> target_name = "Constants" (class only)
447
+ const pathWithoutWildcard = importPath.replace(/\.\*$/, '');
448
+ const parts = pathWithoutWildcard.split('.');
449
+
450
+ // Static import logic explanation:
451
+ // - Regular import "com.foo.Bar" → target = "Bar" (last part)
452
+ // - Regular wildcard "com.foo.*" → target = "foo" (last part after removing *)
453
+ // - Static import "com.foo.Bar.METHOD" → target = "Bar" (second-to-last, the class)
454
+ // - Static wildcard "com.foo.Bar.*" → target = "Bar" (last part after removing *, the class)
455
+ // The key insight: static imports reference CLASS members, so we need the class name,
456
+ // not the member name, for entity resolution to work correctly.
457
+ let targetName;
458
+ if (isWildcard && !isStatic) {
459
+ // Regular wildcard: import com.foo.* -> package name (won't resolve to entity)
460
+ targetName = parts[parts.length - 1];
461
+ } else if (isStatic) {
462
+ // Static import: import static com.foo.Bar.METHOD or com.foo.Bar.*
463
+ // The class is second-to-last part (Bar), member is last (METHOD or *)
464
+ // For resolution, we want the CLASS name (Bar), not the member
465
+ targetName = parts.length >= 2 ? parts[parts.length - (isWildcard ? 1 : 2)] : parts[parts.length - 1];
466
+ } else {
467
+ // Regular import: import com.foo.Bar -> class name
468
+ targetName = parts[parts.length - 1];
469
+ }
470
+
471
+ // Skip empty or invalid target names
472
+ if (!targetName || targetName.length === 0) continue;
473
+
474
+ relationships.push({
475
+ source_id: fileEntityId,
476
+ target_id: null, // Will be resolved by resolveRelationshipTargets()
477
+ target_name: targetName,
478
+ full_import_path: importPath, // Store full path for better resolution
479
+ type: 'imports',
480
+ weight: GRAPH_CONFIG.relationshipWeights.imports,
481
+ context_line: importLine,
482
+ is_static: isStatic,
483
+ is_wildcard: isWildcard,
484
+ });
485
+ }
486
+
487
+ // Track current class for method/field association
488
+ let currentClass = null;
489
+ let braceDepth = 0;
490
+ let classStartDepth = 0;
491
+
492
+ for (let i = 0; i < lines.length; i++) {
493
+ const line = lines[i];
494
+ const lineNum = i + 1;
495
+
496
+ // Track brace depth
497
+ braceDepth += (line.match(/{/g) || []).length;
498
+ braceDepth -= (line.match(/}/g) || []).length;
499
+
500
+ // Reset current class when we exit its scope
501
+ if (currentClass && braceDepth < classStartDepth) {
502
+ currentClass = null;
503
+ }
504
+
505
+ // Class declarations
506
+ const classMatch = line.match(/(?:public|private|protected)?\s*(?:static)?\s*(?:final|abstract)?\s*class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w,\s]+))?/);
507
+ if (classMatch) {
508
+ const className = classMatch[1];
509
+ const extendsClass = classMatch[2];
510
+ const implementsStr = classMatch[3];
511
+
512
+ const id = this.makeId(filePath, 'class', className);
513
+ const entity = {
514
+ id,
515
+ file_path: filePath,
516
+ type: 'class',
517
+ name: className,
518
+ signature: line.trim(),
519
+ doc_comment: this.extractDocComment(lines, i),
520
+ start_line: lineNum,
521
+ end_line: this.findEndLine(lines, i),
522
+ package: this.packageName,
523
+ };
524
+ entities.push(entity);
525
+ currentClass = entity;
526
+ classStartDepth = braceDepth;
527
+
528
+ // Extends relationship
529
+ if (extendsClass) {
530
+ relationships.push({
531
+ source_id: id,
532
+ target_id: this.makeId(filePath, 'class', extendsClass),
533
+ target_name: extendsClass,
534
+ type: 'extends',
535
+ weight: GRAPH_CONFIG.relationshipWeights.extends,
536
+ });
537
+ }
538
+
539
+ // Implements relationships
540
+ if (implementsStr) {
541
+ const interfaces = implementsStr.split(',').map(s => s.trim());
542
+ for (const iface of interfaces) {
543
+ relationships.push({
544
+ source_id: id,
545
+ target_id: this.makeId(filePath, 'interface', iface),
546
+ target_name: iface,
547
+ type: 'implements',
548
+ weight: GRAPH_CONFIG.relationshipWeights.implements,
549
+ });
550
+ }
551
+ }
552
+ }
553
+
554
+ // Interface declarations
555
+ const ifaceMatch = line.match(/(?:public)?\s*interface\s+(\w+)(?:\s+extends\s+([\w,\s]+))?/);
556
+ if (ifaceMatch) {
557
+ const ifaceName = ifaceMatch[1];
558
+ const id = this.makeId(filePath, 'interface', ifaceName);
559
+
560
+ entities.push({
561
+ id,
562
+ file_path: filePath,
563
+ type: 'interface',
564
+ name: ifaceName,
565
+ signature: line.trim(),
566
+ doc_comment: this.extractDocComment(lines, i),
567
+ start_line: lineNum,
568
+ end_line: this.findEndLine(lines, i),
569
+ package: this.packageName,
570
+ });
571
+
572
+ // Extends relationships for interfaces
573
+ const extendsStr = ifaceMatch[2];
574
+ if (extendsStr) {
575
+ const extended = extendsStr.split(',').map(s => s.trim());
576
+ for (const ext of extended) {
577
+ relationships.push({
578
+ source_id: id,
579
+ target_id: this.makeId(filePath, 'interface', ext),
580
+ target_name: ext,
581
+ type: 'extends',
582
+ weight: GRAPH_CONFIG.relationshipWeights.extends,
583
+ });
584
+ }
585
+ }
586
+ }
587
+
588
+ // Method declarations
589
+ const methodMatch = line.match(/(?:@\w+\s*(?:\([^)]*\))?\s*)*(?:public|private|protected)?\s*(?:static)?\s*(?:final)?\s*(?:synchronized)?\s*(?:<[\w\s,<>?]+>\s*)?(\w+(?:<[\w\s,<>?]+>)?(?:\[\])?)\s+(\w+)\s*\(([^)]*)\)/);
590
+ if (methodMatch && !line.includes('class ') && !line.includes('interface ')) {
591
+ const returnType = methodMatch[1];
592
+ const methodName = methodMatch[2];
593
+ const params = methodMatch[3];
594
+
595
+ // Skip if this looks like a constructor
596
+ if (returnType === currentClass?.name) continue;
597
+
598
+ // Build full signature for collision-proof ID (overloaded methods)
599
+ const fullSignature = `${returnType} ${methodName}(${params})`;
600
+ const signatureHash = this.makeSignatureHash(fullSignature);
601
+
602
+ // Use signature hash for disambiguation of overloaded methods
603
+ const id = this.makeId(filePath, 'method', `${currentClass?.name || 'Unknown'}.${methodName}`, {
604
+ signature: fullSignature,
605
+ startLine: lineNum,
606
+ });
607
+
608
+ entities.push({
609
+ id,
610
+ file_path: filePath,
611
+ type: 'method',
612
+ name: methodName,
613
+ signature: fullSignature,
614
+ signature_hash: signatureHash, // Store for backup/restore matching
615
+ doc_comment: this.extractDocComment(lines, i),
616
+ start_line: lineNum,
617
+ end_line: this.findMethodEndLine(lines, i),
618
+ parent_class: currentClass?.name,
619
+ package: this.packageName,
620
+ });
621
+
622
+ // Check for @Override
623
+ if (i > 0 && lines[i - 1].includes('@Override')) {
624
+ relationships.push({
625
+ source_id: id,
626
+ target_id: null, // Will be resolved later
627
+ target_name: methodName,
628
+ type: 'overrides',
629
+ weight: GRAPH_CONFIG.relationshipWeights.overrides,
630
+ });
631
+ }
632
+ }
633
+
634
+ // Method calls (within method bodies)
635
+ const callMatches = line.matchAll(/(\w+)\s*\.\s*(\w+)\s*\(/g);
636
+ for (const callMatch of callMatches) {
637
+ const object = callMatch[1];
638
+ const method = callMatch[2];
639
+
640
+ // Skip common patterns
641
+ if (['System', 'log', 'LOG', 'logger', 'String', 'Integer', 'Long'].includes(object)) continue;
642
+
643
+ relationships.push({
644
+ source_id: currentClass ? this.makeId(filePath, 'class', currentClass.name) : null,
645
+ target_id: null,
646
+ target_name: `${object}.${method}`,
647
+ type: 'calls',
648
+ weight: GRAPH_CONFIG.relationshipWeights.calls,
649
+ context_line: lineNum,
650
+ });
651
+ }
652
+
653
+ // Throw statements
654
+ const throwMatch = line.match(/throw\s+new\s+(\w+)/);
655
+ if (throwMatch && currentClass) {
656
+ relationships.push({
657
+ source_id: this.makeId(filePath, 'class', currentClass.name),
658
+ target_id: null,
659
+ target_name: throwMatch[1],
660
+ type: 'throws',
661
+ weight: GRAPH_CONFIG.relationshipWeights.throws,
662
+ });
663
+ }
664
+ }
665
+
666
+ return { entities, relationships };
667
+ }
668
+
669
+ /**
670
+ * Extract from JavaScript/TypeScript file
671
+ */
672
+ extractJavaScript(content, lines, filePath) {
673
+ const entities = [];
674
+ const relationships = [];
675
+ const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
676
+
677
+ for (let i = 0; i < lines.length; i++) {
678
+ const line = lines[i];
679
+ const lineNum = i + 1;
680
+
681
+ // --- Entity extraction (if-else chain: first match wins per line) ---
682
+
683
+ const classMatch = line.match(/(?:export\s+(?:default\s+)?)?class\s+(\w+)(?:\s+extends\s+(\w+))?/);
684
+ if (classMatch) {
685
+ const className = classMatch[1];
686
+ const id = this.makeId(filePath, 'class', className);
687
+ entities.push({
688
+ id,
689
+ file_path: filePath,
690
+ type: 'class',
691
+ name: className,
692
+ signature: line.trim(),
693
+ doc_comment: this.extractDocComment(lines, i),
694
+ start_line: lineNum,
695
+ end_line: this.findEndLine(lines, i),
696
+ });
697
+ if (classMatch[2]) {
698
+ relationships.push({
699
+ source_id: id,
700
+ target_id: null,
701
+ target_name: classMatch[2],
702
+ type: 'extends',
703
+ weight: GRAPH_CONFIG.relationshipWeights.extends,
704
+ });
705
+ }
706
+ } else {
707
+ const funcMatch = line.match(/(?:export\s+(?:default\s+)?)?(?:async\s+)?function\s*\*?\s+(\w+)\s*\(/);
708
+ if (funcMatch) {
709
+ const sig = line.trim().slice(0, 100);
710
+ entities.push({
711
+ id: this.makeId(filePath, 'function', funcMatch[1], { signature: sig, startLine: lineNum }),
712
+ file_path: filePath,
713
+ type: 'function',
714
+ name: funcMatch[1],
715
+ signature: sig,
716
+ signature_hash: this.makeSignatureHash(sig),
717
+ doc_comment: this.extractDocComment(lines, i),
718
+ start_line: lineNum,
719
+ end_line: this.findEndLine(lines, i),
720
+ });
721
+ } else {
722
+ const componentMatch = line.match(/(?:export\s+)?(?:const|function)\s+([A-Z]\w+)\s*[=:]/);
723
+ if (componentMatch) {
724
+ const sig = line.trim().slice(0, 100);
725
+ entities.push({
726
+ id: this.makeId(filePath, 'component', componentMatch[1], { startLine: lineNum }),
727
+ file_path: filePath,
728
+ type: 'component',
729
+ name: componentMatch[1],
730
+ signature: sig,
731
+ signature_hash: this.makeSignatureHash(sig),
732
+ doc_comment: this.extractDocComment(lines, i),
733
+ start_line: lineNum,
734
+ end_line: this.findEndLine(lines, i),
735
+ });
736
+ } else {
737
+ const arrowMatch = line.match(/(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>/);
738
+ if (arrowMatch) {
739
+ const sig = line.trim().slice(0, 100);
740
+ entities.push({
741
+ id: this.makeId(filePath, 'arrowFunction', arrowMatch[1], { signature: sig, startLine: lineNum }),
742
+ file_path: filePath,
743
+ type: 'arrowFunction',
744
+ name: arrowMatch[1],
745
+ signature: sig,
746
+ signature_hash: this.makeSignatureHash(sig),
747
+ doc_comment: this.extractDocComment(lines, i),
748
+ start_line: lineNum,
749
+ end_line: this.findEndLine(lines, i),
750
+ });
751
+ } else {
752
+ const objArrowMatch = line.match(/(\w+)\s*:\s*(?:async\s*)?\([^)]*\)\s*=>/);
753
+ if (objArrowMatch) {
754
+ entities.push({
755
+ id: this.makeId(filePath, 'arrowFunction', objArrowMatch[1], { startLine: lineNum }),
756
+ file_path: filePath,
757
+ type: 'arrowFunction',
758
+ name: objArrowMatch[1],
759
+ signature: line.trim().slice(0, 100),
760
+ doc_comment: this.extractDocComment(lines, i),
761
+ start_line: lineNum,
762
+ end_line: this.findEndLine(lines, i),
763
+ });
764
+ } else {
765
+ const objMethodMatch = line.match(/^\s+(\w+)\s*\([^)]*\)\s*\{/);
766
+ if (objMethodMatch && !JS_RESERVED_WORDS.has(objMethodMatch[1])) {
767
+ entities.push({
768
+ id: this.makeId(filePath, 'method', objMethodMatch[1], { startLine: lineNum }),
769
+ file_path: filePath,
770
+ type: 'method',
771
+ name: objMethodMatch[1],
772
+ signature: line.trim().slice(0, 100),
773
+ doc_comment: this.extractDocComment(lines, i),
774
+ start_line: lineNum,
775
+ end_line: this.findEndLine(lines, i),
776
+ });
777
+ }
778
+ }
779
+ }
780
+ }
781
+ }
782
+ }
783
+
784
+ // --- Relationship extraction ---
785
+
786
+ // Module-level import patterns (ESM import, CJS require, re-export, dynamic import)
787
+ for (const { regex, group } of JS_IMPORT_PATTERNS) {
788
+ const m = line.match(regex);
789
+ if (m) {
790
+ const source = m[group];
791
+ if (source && !source.startsWith('.')) {
792
+ relationships.push({
793
+ source_id: fileEntityId,
794
+ target_id: null,
795
+ target_name: source,
796
+ type: 'imports',
797
+ weight: GRAPH_CONFIG.relationshipWeights.imports,
798
+ });
799
+ }
800
+ }
801
+ }
802
+
803
+ // Destructured require — per-name import relationships
804
+ this._appendDestructuredRequireRelationships(line, fileEntityId, relationships);
805
+
806
+ // Method call relationships
807
+ const methodCalls = line.matchAll(/(\w+)\s*\.\s*(\w+)\s*\(/g);
808
+ for (const callMatch of methodCalls) {
809
+ const obj = callMatch[1];
810
+ const method = callMatch[2];
811
+ if (!obj || !method || JS_CALL_SKIP_OBJECTS.has(obj)) continue;
812
+ relationships.push({
813
+ source_id: fileEntityId,
814
+ target_id: null,
815
+ target_name: `${obj}.${method}`,
816
+ type: 'calls',
817
+ weight: GRAPH_CONFIG.relationshipWeights.calls,
818
+ });
819
+ }
820
+ }
821
+
822
+ return { entities, relationships };
823
+ }
824
+
825
+ /**
826
+ * Extract from Proto file
827
+ */
828
+ extractProto(content, lines, filePath) {
829
+ const entities = [];
830
+ const relationships = [];
831
+
832
+ for (let i = 0; i < lines.length; i++) {
833
+ const line = lines[i];
834
+ const lineNum = i + 1;
835
+
836
+ // Message declarations
837
+ const msgMatch = line.match(/message\s+(\w+)\s*\{/);
838
+ if (msgMatch) {
839
+ entities.push({
840
+ id: this.makeId(filePath, 'message', msgMatch[1]),
841
+ file_path: filePath,
842
+ type: 'message',
843
+ name: msgMatch[1],
844
+ signature: line.trim(),
845
+ doc_comment: this.extractDocComment(lines, i),
846
+ start_line: lineNum,
847
+ end_line: this.findEndLine(lines, i),
848
+ });
849
+ }
850
+
851
+ // Service declarations
852
+ const svcMatch = line.match(/service\s+(\w+)\s*\{/);
853
+ if (svcMatch) {
854
+ entities.push({
855
+ id: this.makeId(filePath, 'service', svcMatch[1]),
856
+ file_path: filePath,
857
+ type: 'service',
858
+ name: svcMatch[1],
859
+ signature: line.trim(),
860
+ doc_comment: this.extractDocComment(lines, i),
861
+ start_line: lineNum,
862
+ end_line: this.findEndLine(lines, i),
863
+ });
864
+ }
865
+
866
+ // RPC declarations
867
+ const rpcMatch = line.match(/rpc\s+(\w+)\s*\(\s*(\w+)\s*\)\s+returns\s+\(\s*(\w+)\s*\)/);
868
+ if (rpcMatch) {
869
+ const rpcName = rpcMatch[1];
870
+ const inputType = rpcMatch[2];
871
+ const outputType = rpcMatch[3];
872
+
873
+ const id = this.makeId(filePath, 'rpc', rpcName);
874
+ entities.push({
875
+ id,
876
+ file_path: filePath,
877
+ type: 'rpc',
878
+ name: rpcName,
879
+ signature: line.trim(),
880
+ doc_comment: this.extractDocComment(lines, i),
881
+ start_line: lineNum,
882
+ end_line: lineNum,
883
+ });
884
+
885
+ // RPC uses input and output messages
886
+ relationships.push({
887
+ source_id: id,
888
+ target_id: null,
889
+ target_name: inputType,
890
+ type: 'uses',
891
+ weight: GRAPH_CONFIG.relationshipWeights.uses,
892
+ });
893
+ relationships.push({
894
+ source_id: id,
895
+ target_id: null,
896
+ target_name: outputType,
897
+ type: 'uses',
898
+ weight: GRAPH_CONFIG.relationshipWeights.uses,
899
+ });
900
+ }
901
+ }
902
+
903
+ return { entities, relationships };
904
+ }
905
+
906
+ /**
907
+ * Generic extraction using registry patterns.
908
+ * Works for all languages that have graph patterns in language-patterns.js.
909
+ */
910
+ extractGeneric(content, lines, filePath, langInfo) {
911
+ const entities = [];
912
+ const relationships = [];
913
+ const { graph, id: language } = langInfo;
914
+ const {
915
+ entityPatterns,
916
+ relationshipPatterns,
917
+ methodCallPattern,
918
+ methodCallPrefilter,
919
+ } = this.getGenericPatternPlan(language, graph);
920
+ const skipCallObjects = new Set(graph.skipCallObjects || []);
921
+ const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
922
+ const jsonDependencySections = new Set(['dependencies', 'devDependencies', 'peerDependencies']);
923
+ let jsonBraceDepth = 0;
924
+ let activeJsonDependencyDepth = null;
925
+ // Track active entity scopes to attribute call source_id by lexical range.
926
+ const activeEntityScopes = [];
927
+
928
+ // Choose findEndLine strategy based on language type
929
+ const findEndLineFn = (startIdx) => {
930
+ if (langInfo.indentBased) {
931
+ return this.findEndLineIndent(lines, startIdx);
932
+ }
933
+ if (langInfo.endKeyword) {
934
+ return this.findEndLineKeyword(lines, startIdx, langInfo.endKeyword, langInfo.blockKeywords);
935
+ }
936
+ return this.findEndLine(lines, startIdx);
937
+ };
938
+
939
+ for (let i = 0; i < lines.length; i++) {
940
+ const line = lines[i];
941
+ const trimmed = line.trimStart();
942
+ const lineNum = i + 1;
943
+ while (
944
+ activeEntityScopes.length > 0 &&
945
+ activeEntityScopes[activeEntityScopes.length - 1].end_line < lineNum
946
+ ) {
947
+ activeEntityScopes.pop();
948
+ }
949
+ const openBraces = (line.match(/{/g) || []).length;
950
+ const closeBraces = (line.match(/}/g) || []).length;
951
+ const depthBefore = jsonBraceDepth;
952
+ const depthAfter = depthBefore + openBraces - closeBraces;
953
+ if (trimmed.length > this.maxRegexLineLength) {
954
+ this._recordLongLineSkip(language, lineNum, trimmed.length);
955
+ if (language === 'json') {
956
+ if (activeJsonDependencyDepth !== null && depthAfter < activeJsonDependencyDepth) {
957
+ activeJsonDependencyDepth = null;
958
+ }
959
+ jsonBraceDepth = depthAfter;
960
+ }
961
+ continue;
962
+ }
963
+
964
+ // JSON dependency extraction:
965
+ // "dependencies"/"devDependencies"/"peerDependencies" are section markers.
966
+ // Actual imports are package keys inside those objects.
967
+ if (language === 'json' && activeJsonDependencyDepth !== null && depthBefore === activeJsonDependencyDepth) {
968
+ const depEntry = trimmed.match(/^"([^"]+)"\s*:\s*"([^"]+)"/);
969
+ if (depEntry && depEntry[1]) {
970
+ relationships.push({
971
+ source_id: fileEntityId,
972
+ target_id: null,
973
+ target_name: depEntry[1],
974
+ type: 'imports',
975
+ weight: GRAPH_CONFIG.relationshipWeights.imports,
976
+ context_line: lineNum,
977
+ });
978
+ }
979
+ }
980
+
981
+ // Entity extraction
982
+ for (const { type, pattern, prefilter } of entityPatterns) {
983
+ if (prefilter && !prefilter(trimmed)) continue;
984
+ const match = trimmed.match(pattern);
985
+ if (match) {
986
+ const name = match[1];
987
+ if (!name) {
988
+ this._recordEmptyCapture('entity', language, type, lineNum, trimmed);
989
+ continue;
990
+ }
991
+ const sig = trimmed.slice(0, 120);
992
+ const sigHash = this.makeSignatureHash(sig);
993
+ const entityId = this.makeId(filePath, type, name, { signature: sig, startLine: lineNum });
994
+ const endLine = findEndLineFn(i);
995
+
996
+ entities.push({
997
+ id: entityId,
998
+ file_path: filePath,
999
+ type,
1000
+ name,
1001
+ signature: sig,
1002
+ signature_hash: sigHash,
1003
+ doc_comment: this.extractDocComment(lines, i),
1004
+ start_line: lineNum,
1005
+ end_line: endLine,
1006
+ });
1007
+ activeEntityScopes.push({ id: entityId, start_line: lineNum, end_line: endLine });
1008
+ break; // one entity per line
1009
+ }
1010
+ }
1011
+
1012
+ // Relationship extraction
1013
+ const sourceEntityId = activeEntityScopes.length > 0
1014
+ ? activeEntityScopes[activeEntityScopes.length - 1].id
1015
+ : null;
1016
+ // Method calls need special handling: group1=object, group2=method.
1017
+ // Reuse compiled global regex to avoid per-line RegExp allocations.
1018
+ if (methodCallPattern && (!methodCallPrefilter || methodCallPrefilter(trimmed))) {
1019
+ methodCallPattern.lastIndex = 0;
1020
+ let m;
1021
+ while ((m = methodCallPattern.exec(trimmed)) !== null) {
1022
+ const obj = m[1];
1023
+ const method = m[2];
1024
+ if (!obj || !method) {
1025
+ this._recordEmptyCapture('relationship', language, 'methodCall', lineNum, trimmed);
1026
+ if (m[0] === '') methodCallPattern.lastIndex++;
1027
+ continue;
1028
+ }
1029
+ if (skipCallObjects.has(obj)) {
1030
+ if (m[0] === '') methodCallPattern.lastIndex++;
1031
+ continue;
1032
+ }
1033
+ relationships.push({
1034
+ source_id: sourceEntityId,
1035
+ target_id: null,
1036
+ target_name: `${obj}.${method}`,
1037
+ type: 'calls',
1038
+ weight: GRAPH_CONFIG.relationshipWeights.calls,
1039
+ context_line: lineNum,
1040
+ });
1041
+ if (m[0] === '') methodCallPattern.lastIndex++;
1042
+ }
1043
+ }
1044
+
1045
+ this._appendDestructuredRequireRelationships(trimmed, sourceEntityId || fileEntityId, relationships);
1046
+
1047
+ for (const { type: relType, pattern, prefilter } of relationshipPatterns) {
1048
+ if (relType === 'methodCall') continue;
1049
+ if (prefilter && !prefilter(trimmed)) continue;
1050
+
1051
+ const match = trimmed.match(pattern);
1052
+ if (relType === 'dep' && language === 'json') {
1053
+ if (match && match[1] && jsonDependencySections.has(match[1]) && depthAfter > depthBefore) {
1054
+ activeJsonDependencyDepth = depthAfter;
1055
+ }
1056
+ continue;
1057
+ }
1058
+ if (match) {
1059
+ const { targets, filtered } = this._resolveRelationshipTargets(relType, match, language);
1060
+ if (targets.length === 0) {
1061
+ if (!filtered) this._recordEmptyCapture('relationship', language, relType, lineNum, trimmed);
1062
+ continue;
1063
+ }
1064
+ const mappedType = GENERIC_RELATIONSHIP_MAPPING[relType] || 'uses';
1065
+ const weight = GRAPH_CONFIG.relationshipWeights[mappedType] || 1.0;
1066
+ for (const target of targets) {
1067
+ relationships.push({
1068
+ source_id: sourceEntityId || fileEntityId,
1069
+ target_id: null,
1070
+ target_name: target,
1071
+ type: mappedType,
1072
+ weight,
1073
+ context_line: lineNum,
1074
+ });
1075
+ }
1076
+ }
1077
+ }
1078
+
1079
+ if (language === 'json') {
1080
+ if (activeJsonDependencyDepth !== null && depthAfter < activeJsonDependencyDepth) {
1081
+ activeJsonDependencyDepth = null;
1082
+ }
1083
+ jsonBraceDepth = depthAfter;
1084
+ }
1085
+ }
1086
+
1087
+ return { entities, relationships };
1088
+ }
1089
+
1090
+ getGenericPatternPlan(language, graph) {
1091
+ const cached = this.genericPatternPlanCache.get(language);
1092
+ if (cached) return cached;
1093
+
1094
+ const entityPatterns = Object.entries(graph.entities || {}).map(([type, pattern]) => ({
1095
+ type,
1096
+ pattern,
1097
+ prefilter: this.getPatternPrefilter(pattern),
1098
+ }));
1099
+ const relationshipPatterns = Object.entries(graph.relationships || {}).map(([type, pattern]) => ({
1100
+ type,
1101
+ pattern,
1102
+ prefilter: this.getPatternPrefilter(pattern),
1103
+ }));
1104
+
1105
+ const methodCallEntry = relationshipPatterns.find((entry) => entry.type === 'methodCall');
1106
+ const methodCallPattern = methodCallEntry
1107
+ ? this.getCachedGlobalRegex(language, methodCallEntry.pattern)
1108
+ : null;
1109
+ const plan = {
1110
+ entityPatterns,
1111
+ relationshipPatterns,
1112
+ methodCallPattern,
1113
+ methodCallPrefilter: methodCallEntry?.prefilter || null,
1114
+ };
1115
+ this.genericPatternPlanCache.set(language, plan);
1116
+ return plan;
1117
+ }
1118
+
1119
+ getCachedGlobalRegex(language, pattern) {
1120
+ const key = `${language}:${pattern.source}:${pattern.flags}`;
1121
+ const cached = this.methodCallRegexCache.get(key);
1122
+ if (cached) return cached;
1123
+
1124
+ const uniqueFlags = [...new Set(`${pattern.flags || ''}g`)].join('');
1125
+ const compiled = new RegExp(pattern.source, uniqueFlags);
1126
+ this.methodCallRegexCache.set(key, compiled);
1127
+ return compiled;
1128
+ }
1129
+
1130
+ getPatternPrefilter(pattern) {
1131
+ const key = `${pattern.source}:${pattern.flags}`;
1132
+ if (this.patternPrefilterCache.has(key)) {
1133
+ return this.patternPrefilterCache.get(key);
1134
+ }
1135
+
1136
+ const caseInsensitive = pattern.flags.includes('i');
1137
+ let tokens = this.extractLineStartTokens(pattern.source);
1138
+ const optionalPrefixMatch = pattern.source.match(/^\^(\\?.)\?/);
1139
+ if (optionalPrefixMatch && tokens.length > 0) {
1140
+ const prefix = optionalPrefixMatch[1].startsWith('\\')
1141
+ ? optionalPrefixMatch[1].slice(1)
1142
+ : optionalPrefixMatch[1];
1143
+ tokens = [...tokens, ...tokens.map((token) => `${prefix}${token}`)];
1144
+ }
1145
+ if (tokens.length === 0) {
1146
+ this.patternPrefilterCache.set(key, null);
1147
+ return null;
1148
+ }
1149
+
1150
+ const normalizedTokens = caseInsensitive
1151
+ ? [...new Set(tokens.map((t) => t.toLowerCase()))]
1152
+ : [...new Set(tokens)];
1153
+ const prefilter = (line) => {
1154
+ const value = caseInsensitive ? line.toLowerCase() : line;
1155
+ return normalizedTokens.some((token) => value.startsWith(token));
1156
+ };
1157
+ this.patternPrefilterCache.set(key, prefilter);
1158
+ return prefilter;
1159
+ }
1160
+
1161
+ extractLineStartTokens(source) {
1162
+ if (!source.startsWith('^')) return [];
1163
+
1164
+ let i = 1;
1165
+ const tokens = [];
1166
+
1167
+ const skipLeadingWhitespace = () => {
1168
+ if (source.slice(i).startsWith('\\s*')) {
1169
+ i += 3;
1170
+ return true;
1171
+ }
1172
+ if (source.slice(i).startsWith('\\s+')) {
1173
+ i += 3;
1174
+ return true;
1175
+ }
1176
+ return false;
1177
+ };
1178
+
1179
+ while (skipLeadingWhitespace()) {}
1180
+
1181
+ while (source.slice(i).startsWith('(?:')) {
1182
+ const start = i + 3;
1183
+ let depth = 1;
1184
+ let j = start;
1185
+ let inClass = false;
1186
+ while (j < source.length && depth > 0) {
1187
+ const ch = source[j];
1188
+ if (ch === '\\') {
1189
+ j += 2;
1190
+ continue;
1191
+ }
1192
+ if (ch === '[') inClass = true;
1193
+ else if (ch === ']' && inClass) inClass = false;
1194
+ else if (!inClass && ch === '(') depth++;
1195
+ else if (!inClass && ch === ')') depth--;
1196
+ j++;
1197
+ }
1198
+ if (depth !== 0) return [];
1199
+
1200
+ const groupEnd = j - 1;
1201
+ const groupContent = source.slice(start, groupEnd);
1202
+ const isOptional = source[groupEnd + 1] === '?';
1203
+ if (!isOptional) {
1204
+ const alternatives = groupContent.split('|').map((alt) => alt.trim()).filter(Boolean);
1205
+ const altTokens = [];
1206
+ for (const alt of alternatives) {
1207
+ const token = this.extractLiteralPrefix(alt);
1208
+ if (!token) return [];
1209
+ altTokens.push(token);
1210
+ }
1211
+ tokens.push(...altTokens);
1212
+ return [...new Set(tokens)];
1213
+ }
1214
+ const optionalAlternatives = groupContent.split('|').map((alt) => alt.trim()).filter(Boolean);
1215
+ for (const alt of optionalAlternatives) {
1216
+ const token = this.extractLiteralPrefix(alt);
1217
+ if (token) tokens.push(token);
1218
+ }
1219
+ i = groupEnd + 2;
1220
+ while (skipLeadingWhitespace()) {}
1221
+ }
1222
+
1223
+ const literal = this.extractLiteralPrefix(source.slice(i));
1224
+ if (!literal) {
1225
+ // If no mandatory literal prefix can be derived, disable prefilter to avoid false negatives.
1226
+ return [];
1227
+ }
1228
+ tokens.push(literal);
1229
+ return [...new Set(tokens)];
1230
+ }
1231
+
1232
+ extractLiteralPrefix(fragment) {
1233
+ let result = '';
1234
+
1235
+ for (let i = 0; i < fragment.length; i++) {
1236
+ const ch = fragment[i];
1237
+ if (ch === '\\') {
1238
+ const next = fragment[i + 1];
1239
+ if (!next) break;
1240
+ if (/[A-Za-z0-9]/.test(next)) break;
1241
+ result += next;
1242
+ i++;
1243
+ continue;
1244
+ }
1245
+ if (fragment[i + 1] === '?' && result.length === 0 && /[@#<./:_-]/.test(ch)) {
1246
+ // Skip optional leading literal chars (e.g. -?include).
1247
+ i++;
1248
+ continue;
1249
+ }
1250
+ if (/[A-Za-z0-9_@#<./:-]/.test(ch)) {
1251
+ result += ch;
1252
+ continue;
1253
+ }
1254
+ break;
1255
+ }
1256
+
1257
+ return result;
1258
+ }
1259
+
1260
+ expandRelationshipTargets(relType, target) {
1261
+ if (typeof target !== 'string') return [target];
1262
+ if (!MULTI_TARGET_TYPES.has(relType)) return [target];
1263
+
1264
+ // Bracket-depth-aware top-level comma splitter.
1265
+ // Naive .split(',') would break generics: Base<Foo, Bar>, IFace
1266
+ const parts = splitTopLevelCommas(target);
1267
+
1268
+ return parts
1269
+ .map((entry) => entry.trim()
1270
+ .replace(/\s+as\s+\w+$/i, '') // import aliases
1271
+ .replace(/^(?:(?:public|protected|private|virtual)\s+)+/, '') // C++ access specifiers
1272
+ .replace(/<.*$/, '') // strip generics from first <: Map<K, V> → Map
1273
+ .replace(/\([^)]*\)/g, '') // strip constructor args: Base(x) → Base
1274
+ .replace(/[;{}]+$/, '') // strip trailing punctuation
1275
+ .trim()
1276
+ )
1277
+ .filter(Boolean);
1278
+ }
1279
+
1280
+ _normalizeTreeSitterEntities(filePath, symbols, language) {
1281
+ const dedupedBySymbolAndLine = new Map();
1282
+
1283
+ for (const sym of symbols) {
1284
+ if (!sym?.name || !sym?.type) continue;
1285
+ const normalizedType = this._normalizeTreeSitterSymbolType(sym.type, sym.name);
1286
+ if ((language === 'javascript' || language === 'typescript') && normalizedType === 'variable') {
1287
+ continue;
1288
+ }
1289
+ const startLine = Number.isInteger(sym.startLine) ? sym.startLine : 0;
1290
+ const endLine = Number.isInteger(sym.endLine) ? sym.endLine : startLine;
1291
+ const rank = TREE_SITTER_ENTITY_PRIORITY[normalizedType] || 0;
1292
+ const key = `${sym.name}:${startLine}`;
1293
+ const existing = dedupedBySymbolAndLine.get(key);
1294
+
1295
+ if (!existing || rank > existing.rank) {
1296
+ dedupedBySymbolAndLine.set(key, {
1297
+ id: this._makeEntityId(filePath, sym.name, normalizedType, startLine),
1298
+ file_path: filePath,
1299
+ type: normalizedType,
1300
+ name: sym.name,
1301
+ signature: sym.signature || null,
1302
+ start_line: startLine + 1, // tree-sitter is 0-indexed
1303
+ end_line: endLine + 1,
1304
+ rank,
1305
+ });
1306
+ }
1307
+ }
1308
+
1309
+ return Array.from(dedupedBySymbolAndLine.values())
1310
+ .sort((a, b) => a.start_line - b.start_line)
1311
+ .map(({ rank, ...entity }) => entity);
1312
+ }
1313
+
1314
+ _normalizeTreeSitterSymbolType(type, name) {
1315
+ if (type === 'arrowFunction' && /^[A-Z]/.test(name)) {
1316
+ return 'component';
1317
+ }
1318
+ return type;
1319
+ }
1320
+
1321
+ _resolveRelationshipTargets(relType, match, language) {
1322
+ const isJsTs = language === 'javascript' || language === 'typescript';
1323
+
1324
+ if (isJsTs && relType === 'import') {
1325
+ const source = match[3]?.trim();
1326
+ if (!source) return { targets: [], filtered: false };
1327
+ if (source.startsWith('.')) return { targets: [], filtered: true };
1328
+ return { targets: [source], filtered: false };
1329
+ }
1330
+
1331
+ if (isJsTs && (relType === 'require' || relType === 'reexport' || relType === 'dynamicImport')) {
1332
+ const source = match[1]?.trim();
1333
+ if (!source) return { targets: [], filtered: false };
1334
+ if (source.startsWith('.')) return { targets: [], filtered: true };
1335
+ return { targets: [source], filtered: false };
1336
+ }
1337
+
1338
+ const rawTarget = typeof match[1] === 'string' ? match[1].trim() : match[1];
1339
+ if (!rawTarget) return { targets: [], filtered: false };
1340
+
1341
+ return {
1342
+ targets: this.expandRelationshipTargets(relType, rawTarget),
1343
+ filtered: false,
1344
+ };
1345
+ }
1346
+
1347
+ _appendDestructuredRequireRelationships(line, sourceId, relationships) {
1348
+ const destructuredRequire = line.match(/(?:const|let|var)\s+\{([^}]+)\}\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)/);
1349
+ if (!destructuredRequire) return;
1350
+
1351
+ const names = this._extractDestructuredRequireNames(destructuredRequire[1]);
1352
+ for (const name of names) {
1353
+ relationships.push({
1354
+ source_id: sourceId,
1355
+ target_id: null,
1356
+ target_name: name,
1357
+ type: 'imports',
1358
+ weight: GRAPH_CONFIG.relationshipWeights.imports,
1359
+ });
1360
+ }
1361
+ }
1362
+
1363
+ _extractDestructuredRequireNames(rawNames) {
1364
+ return rawNames
1365
+ .split(',')
1366
+ .map(part => part.trim())
1367
+ .map((name) => {
1368
+ if (!name) return null;
1369
+
1370
+ // JS destructuring alias: { readFile: read }.
1371
+ if (name.includes(':')) {
1372
+ name = name.split(':').pop().trim();
1373
+ }
1374
+
1375
+ // TS-style docs aliasing: { foo as bar }.
1376
+ const asAlias = name.match(/\bas\s+([A-Za-z_$][\w$]*)$/);
1377
+ if (asAlias) {
1378
+ name = asAlias[1];
1379
+ }
1380
+
1381
+ // Remove default value patterns: { foo = fallback }.
1382
+ name = name.replace(/=.*/, '').trim();
1383
+ name = name.replace(/^\.\.\./, '').trim();
1384
+
1385
+ return /^[A-Za-z_$][\w$]*$/.test(name) ? name : null;
1386
+ })
1387
+ .filter(Boolean);
1388
+ }
1389
+
1390
+ /**
1391
+ * Generate a deterministic entity ID for tree-sitter symbols.
1392
+ * Uses the same hash pattern as makeId() for consistency.
1393
+ */
1394
+ _makeEntityId(filePath, name, type, startLine) {
1395
+ const relativePath = this.projectRoot ? path.relative(this.projectRoot, filePath) : filePath;
1396
+ const key = `${relativePath}:${type}:${name}:${startLine}`;
1397
+ return createHash('sha256').update(key).digest('hex').slice(0, 16);
1398
+ }
1399
+
1400
+ /**
1401
+ * Extract relationships using regex patterns from langInfo.graph.
1402
+ * Used by tree-sitter path where entities come from AST but relationships
1403
+ * still need regex (tree-sitter tags.scm only gives definitions).
1404
+ */
1405
+ _extractRelationships(content, lines, filePath, langInfo, entities) {
1406
+ const relationships = [];
1407
+ if (!langInfo.graph) return relationships;
1408
+
1409
+ const { graph, id: language } = langInfo;
1410
+ const {
1411
+ relationshipPatterns,
1412
+ methodCallPattern,
1413
+ methodCallPrefilter,
1414
+ } = this.getGenericPatternPlan(language, graph);
1415
+ const skipCallObjects = new Set(graph.skipCallObjects || []);
1416
+ const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
1417
+
1418
+ // Build scope lookup from tree-sitter entities for source_id attribution
1419
+ const sortedEntities = [...entities].sort((a, b) => a.start_line - b.start_line);
1420
+
1421
+ const findScopeEntity = (lineNum) => {
1422
+ for (let i = sortedEntities.length - 1; i >= 0; i--) {
1423
+ const e = sortedEntities[i];
1424
+ if (e.start_line <= lineNum && e.end_line >= lineNum) {
1425
+ return e.id;
1426
+ }
1427
+ }
1428
+ return null;
1429
+ };
1430
+
1431
+ for (let i = 0; i < lines.length; i++) {
1432
+ const line = lines[i];
1433
+ const trimmed = line.trimStart();
1434
+ const lineNum = i + 1;
1435
+
1436
+ if (trimmed.length > this.maxRegexLineLength) continue;
1437
+
1438
+ const sourceEntityId = findScopeEntity(lineNum);
1439
+
1440
+ // Method calls
1441
+ if (methodCallPattern && (!methodCallPrefilter || methodCallPrefilter(trimmed))) {
1442
+ methodCallPattern.lastIndex = 0;
1443
+ let m;
1444
+ while ((m = methodCallPattern.exec(trimmed)) !== null) {
1445
+ const obj = m[1];
1446
+ const method = m[2];
1447
+ if (!obj || !method) {
1448
+ if (m[0] === '') methodCallPattern.lastIndex++;
1449
+ continue;
1450
+ }
1451
+ if (skipCallObjects.has(obj)) {
1452
+ if (m[0] === '') methodCallPattern.lastIndex++;
1453
+ continue;
1454
+ }
1455
+ relationships.push({
1456
+ source_id: sourceEntityId,
1457
+ target_id: null,
1458
+ target_name: `${obj}.${method}`,
1459
+ type: 'calls',
1460
+ weight: GRAPH_CONFIG.relationshipWeights.calls,
1461
+ context_line: lineNum,
1462
+ });
1463
+ if (m[0] === '') methodCallPattern.lastIndex++;
1464
+ }
1465
+ }
1466
+
1467
+ this._appendDestructuredRequireRelationships(trimmed, sourceEntityId || fileEntityId, relationships);
1468
+
1469
+ // Other relationships (imports, extends, etc.)
1470
+ for (const { type: relType, pattern, prefilter } of relationshipPatterns) {
1471
+ if (relType === 'methodCall') continue;
1472
+ if (prefilter && !prefilter(trimmed)) continue;
1473
+
1474
+ const match = trimmed.match(pattern);
1475
+ if (match) {
1476
+ const { targets, filtered } = this._resolveRelationshipTargets(relType, match, language);
1477
+ if (targets.length === 0) {
1478
+ if (!filtered) this._recordEmptyCapture('relationship', language, relType, lineNum, trimmed);
1479
+ continue;
1480
+ }
1481
+ const mappedType = GENERIC_RELATIONSHIP_MAPPING[relType] || 'uses';
1482
+ const weight = GRAPH_CONFIG.relationshipWeights[mappedType] || 1.0;
1483
+ for (const target of targets) {
1484
+ relationships.push({
1485
+ source_id: sourceEntityId || fileEntityId,
1486
+ target_id: null,
1487
+ target_name: target,
1488
+ type: mappedType,
1489
+ weight,
1490
+ context_line: lineNum,
1491
+ });
1492
+ }
1493
+ }
1494
+ }
1495
+ }
1496
+
1497
+ return relationships;
1498
+ }
1499
+
1500
+ _recordEmptyCapture(kind, language, patternType, lineNum, line) {
1501
+ this.debugCounters.emptyCapture[kind] = (this.debugCounters.emptyCapture[kind] || 0) + 1;
1502
+
1503
+ if (!this.debugCounters.byLanguage[language]) {
1504
+ this.debugCounters.byLanguage[language] = { entity: 0, relationship: 0, skippedLongLines: 0 };
1505
+ }
1506
+ this.debugCounters.byLanguage[language][kind] += 1;
1507
+
1508
+ const key = `${language}:${kind}:${patternType}`;
1509
+ this.debugCounters.byPattern[key] = (this.debugCounters.byPattern[key] || 0) + 1;
1510
+
1511
+ if (this.warnOnPatternDrop && this.debugCounters.byPattern[key] <= 3) {
1512
+ console.warn(`[graph-extractor] Empty capture dropped for ${key} at line ${lineNum}: ${line.slice(0, 120)}`);
1513
+ }
1514
+ }
1515
+
1516
+ _recordLongLineSkip(language, lineNum, lineLength) {
1517
+ this.debugCounters.skippedLongLines += 1;
1518
+ if (!this.debugCounters.byLanguage[language]) {
1519
+ this.debugCounters.byLanguage[language] = { entity: 0, relationship: 0, skippedLongLines: 0 };
1520
+ }
1521
+ this.debugCounters.byLanguage[language].skippedLongLines += 1;
1522
+ if (this.warnOnPatternDrop && this.debugCounters.byLanguage[language].skippedLongLines <= 3) {
1523
+ console.warn(`[graph-extractor] Skipping regex extraction for long line (${lineLength} chars) at ${language}:${lineNum}`);
1524
+ }
1525
+ }
1526
+
1527
+ getDebugCounters() {
1528
+ const byLanguage = {};
1529
+ for (const [language, counts] of Object.entries(this.debugCounters.byLanguage)) {
1530
+ byLanguage[language] = { ...counts };
1531
+ }
1532
+ return {
1533
+ emptyCapture: { ...this.debugCounters.emptyCapture },
1534
+ skippedLongLines: this.debugCounters.skippedLongLines,
1535
+ byLanguage,
1536
+ byPattern: { ...this.debugCounters.byPattern },
1537
+ };
1538
+ }
1539
+
1540
+ /**
1541
+ * Generate unique ID for an entity
1542
+ *
1543
+ * For collision-proof IDs (especially overloaded methods), include signature or line info.
1544
+ * ID format: sha256(relativePath:type:name:disambiguator)[0:16]
1545
+ *
1546
+ * @param {string} filePath - Absolute file path
1547
+ * @param {string} type - Entity type (class, method, function, etc.)
1548
+ * @param {string} name - Entity name
1549
+ * @param {object} [options] - Optional disambiguation info
1550
+ * @param {string} [options.signature] - Method/function signature for overload disambiguation
1551
+ * @param {number} [options.startLine] - Start line as fallback disambiguator
1552
+ * @returns {string} 16-char hex ID
1553
+ */
1554
+ makeId(filePath, type, name, options = {}) {
1555
+ const relativePath = this.projectRoot ? path.relative(this.projectRoot, filePath) : filePath;
1556
+
1557
+ // Build disambiguator for overloaded methods or same-name entities
1558
+ let disambiguator = '';
1559
+ if (options.signature) {
1560
+ // Hash the signature for a compact, stable disambiguator
1561
+ disambiguator = createHash('sha256').update(options.signature).digest('hex').slice(0, 8);
1562
+ } else if (options.startLine !== undefined) {
1563
+ // Fallback: use line number if no signature
1564
+ disambiguator = String(options.startLine);
1565
+ }
1566
+
1567
+ const key = disambiguator
1568
+ ? `${relativePath}:${type}:${name}:${disambiguator}`
1569
+ : `${relativePath}:${type}:${name}`;
1570
+
1571
+ return createHash('sha256').update(key).digest('hex').slice(0, 16);
1572
+ }
1573
+
1574
+ /**
1575
+ * Generate a signature hash for stable entity identification.
1576
+ * Used for backup/restore matching when IDs change.
1577
+ *
1578
+ * @param {string} signature - Full method/function signature
1579
+ * @returns {string|null} 8-char hex hash or null if no signature
1580
+ */
1581
+ makeSignatureHash(signature) {
1582
+ if (!signature) return null;
1583
+ return createHash('sha256').update(signature).digest('hex').slice(0, 8);
1584
+ }
1585
+
1586
+ /**
1587
+ * Extract doc comment from lines before a declaration
1588
+ */
1589
+ extractDocComment(lines, lineIndex) {
1590
+ const comments = [];
1591
+ let i = lineIndex - 1;
1592
+
1593
+ while (i >= 0) {
1594
+ const line = lines[i].trim();
1595
+ if (line.startsWith('*') || line.startsWith('//') || line.startsWith('/*') || line.startsWith('/**')) {
1596
+ comments.unshift(line.replace(/^[/*\s]+/, '').replace(/\*\/$/, '').trim());
1597
+ i--;
1598
+ } else if (line === '') {
1599
+ i--;
1600
+ } else {
1601
+ break;
1602
+ }
1603
+ }
1604
+
1605
+ return comments.join(' ').slice(0, 500) || null;
1606
+ }
1607
+
1608
+ /**
1609
+ * Find end line of a block (matching braces)
1610
+ */
1611
+ findEndLine(lines, startIndex) {
1612
+ let braceDepth = 0;
1613
+ let started = false;
1614
+
1615
+ for (let i = startIndex; i < lines.length; i++) {
1616
+ const line = lines[i];
1617
+ const opens = (line.match(/{/g) || []).length;
1618
+ const closes = (line.match(/}/g) || []).length;
1619
+
1620
+ if (opens > 0) started = true;
1621
+ braceDepth += opens - closes;
1622
+
1623
+ if (started && braceDepth === 0) {
1624
+ return i + 1;
1625
+ }
1626
+ }
1627
+
1628
+ return lines.length;
1629
+ }
1630
+
1631
+ /**
1632
+ * Find end line for indent-based languages (Python, YAML, etc.)
1633
+ * Scans forward until a line at the same or lesser indentation is found.
1634
+ */
1635
+ findEndLineIndent(lines, startIndex) {
1636
+ const startLine = lines[startIndex];
1637
+ const startIndent = startLine.length - startLine.trimStart().length;
1638
+
1639
+ for (let i = startIndex + 1; i < lines.length; i++) {
1640
+ const line = lines[i];
1641
+ const trimmed = line.trimStart();
1642
+ if (!trimmed) continue; // skip blank lines
1643
+ const indent = line.length - trimmed.length;
1644
+ if (indent <= startIndent) {
1645
+ return i; // 0-based exclusive → 1-based line number
1646
+ }
1647
+ }
1648
+
1649
+ return lines.length;
1650
+ }
1651
+
1652
+ /**
1653
+ * Find end line for end-keyword languages (Ruby, Elixir, Lua, Obj-C).
1654
+ * Counts matching keyword pairs to find the closing end/keyword.
1655
+ */
1656
+ findEndLineKeyword(lines, startIndex, endKeyword, blockKeywords) {
1657
+ const endRe = new RegExp(`^\\s*${escapeRegexLiteral(endKeyword)}\\b`);
1658
+ const blockStartRe = blockKeywords?.length
1659
+ ? new RegExp(`^\\s*(?:${blockKeywords.join('|')})\\b`)
1660
+ : null;
1661
+ let depth = 1; // start inside the opening block
1662
+
1663
+ for (let i = startIndex + 1; i < lines.length; i++) {
1664
+ const line = lines[i];
1665
+ // Check for nested block openers (boundary patterns or block keywords)
1666
+ if (blockStartRe && blockStartRe.test(line)) {
1667
+ depth++;
1668
+ }
1669
+ if (endRe.test(line)) {
1670
+ depth--;
1671
+ if (depth === 0) {
1672
+ return i + 1; // 1-based
1673
+ }
1674
+ }
1675
+ }
1676
+
1677
+ return lines.length;
1678
+ }
1679
+
1680
+ /**
1681
+ * Find end line of a method (simpler heuristic)
1682
+ */
1683
+ findMethodEndLine(lines, startIndex) {
1684
+ let braceDepth = 0;
1685
+ let started = false;
1686
+
1687
+ for (let i = startIndex; i < Math.min(startIndex + 200, lines.length); i++) {
1688
+ const line = lines[i];
1689
+ const opens = (line.match(/{/g) || []).length;
1690
+ const closes = (line.match(/}/g) || []).length;
1691
+
1692
+ if (opens > 0) started = true;
1693
+ braceDepth += opens - closes;
1694
+
1695
+ if (started && braceDepth === 0) {
1696
+ return i + 1;
1697
+ }
1698
+ }
1699
+
1700
+ return Math.min(startIndex + 50, lines.length);
1701
+ }
1702
+ }
1703
+
1704
+ // =============================================================================
1705
+ // DATABASE OPERATIONS
1706
+ // =============================================================================
1707
+
1708
/**
 * Make sure `entities.stale_since` exists (soft-delete marker) together with
 * the partial indexes that support it.
 *
 * When the column is missing it is added along with `idx_entities_stale`.
 * `idx_entities_active` is then created on a best-effort basis.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {boolean} true if the column exists or was added; false if adding it failed
 */
export function ensureStaleColumn(db) {
  try {
    const columnNames = db
      .prepare("PRAGMA table_info(entities)")
      .all()
      .map((column) => column.name);

    if (!columnNames.includes('stale_since')) {
      console.log('[graph-extractor] Adding stale_since column for soft-delete support');
      db.exec('ALTER TABLE entities ADD COLUMN stale_since INTEGER DEFAULT NULL');
      // Partial index: only rows that are actually stale are indexed.
      db.exec('CREATE INDEX IF NOT EXISTS idx_entities_stale ON entities(stale_since) WHERE stale_since IS NOT NULL');
    }

    // Companion partial index for queries over active rows (stale_since IS NULL).
    // A failure here is deliberately non-fatal.
    try {
      db.exec(`
        CREATE INDEX IF NOT EXISTS idx_entities_active
        ON entities(id, name, type, file_path)
        WHERE stale_since IS NULL
      `);
    } catch (e) {
      // Best effort only - ignore.
    }

    return true;
  } catch (err) {
    // Another code path already added the column: treat as success.
    const alreadyPresent = err.message.includes('duplicate column');
    if (alreadyPresent) {
      return true;
    }
    console.error(`[graph-extractor] Failed to add stale_since column: ${err.message}`);
    return false;
  }
}
1752
+
1753
/**
 * Check whether the database schema is compatible with the current code.
 *
 * The version is stored under the key `version` in the `schema_meta`
 * key-value table, which is created on demand.
 *
 * Outcomes:
 *  - no version row, no other tables: fresh database, compatible
 *  - no version row, other tables exist: pre-versioning database, incompatible
 *  - version row that is not an integer: unreadable, incompatible (dbVersion null)
 *  - version lower than SCHEMA_VERSION: incompatible
 *  - otherwise: compatible
 *
 * If the check itself throws, the database is assumed compatible so that
 * indexing can continue.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{compatible: boolean, dbVersion: number|null}}
 */
export function checkSchemaVersion(db) {
  try {
    // Create metadata table if it does not exist yet.
    db.exec(`CREATE TABLE IF NOT EXISTS schema_meta (key TEXT PRIMARY KEY, value TEXT)`);

    const row = db.prepare('SELECT value FROM schema_meta WHERE key = ?').get('version');

    if (!row) {
      const existingTableCount = db.prepare(`
        SELECT COUNT(*) AS count
        FROM sqlite_master
        WHERE type = 'table'
        AND name NOT LIKE 'sqlite_%'
        AND name != 'schema_meta'
      `).get().count;

      // Fresh databases can continue; pre-versioning databases must be migrated.
      return { compatible: existingTableCount === 0, dbVersion: null };
    }

    const dbVersion = Number.parseInt(row.value, 10);

    // A NULL or non-numeric value parses to NaN. Every comparison with NaN is
    // false, so without this guard a corrupt version would fall through and be
    // reported as compatible.
    if (Number.isNaN(dbVersion)) {
      console.warn(`⚠️ Schema version is unreadable (${JSON.stringify(row.value)}); treating database as incompatible`);
      return { compatible: false, dbVersion: null };
    }

    if (dbVersion < SCHEMA_VERSION) {
      console.warn(`⚠️ Schema version mismatch: DB has v${dbVersion}, code expects v${SCHEMA_VERSION}`);
      console.warn(` Run: /index-codebase --full (or node index-codebase-v21.js --full)`);
      return { compatible: false, dbVersion };
    }

    return { compatible: true, dbVersion };
  } catch (err) {
    // If the check fails, assume compatible and continue.
    return { compatible: true, dbVersion: null };
  }
}
1792
+
1793
/**
 * Create the code-graph schema (tables, lexical FTS schema, indexes) and apply
 * in-place column migrations to databases created by earlier versions.
 * Uses better-sqlite3 (native SQLite binding with full FTS5 trigram support).
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {boolean} true when the lexical FTS5 schema was set up without error
 */
export function createGraphSchema(db) {
  const { compatible, dbVersion } = checkSchemaVersion(db);
  if (!compatible) {
    console.log(` Updating schema from ${dbVersion ?? 'unversioned'} to v${SCHEMA_VERSION}`);
  }

  // Entities table with HCGS summary support.
  //  - signature_hash: collision-proof backup/restore of overloaded methods
  //  - code: actual source code for HCGS summary generation
  db.exec(`
    CREATE TABLE IF NOT EXISTS entities (
      id TEXT PRIMARY KEY,
      file_path TEXT NOT NULL,
      type TEXT NOT NULL,
      name TEXT NOT NULL,
      signature TEXT,
      signature_hash TEXT,
      doc_comment TEXT,
      start_line INTEGER,
      end_line INTEGER,
      package TEXT,
      parent_class TEXT,
      search_text TEXT,
      summary TEXT,
      summary_embedding BLOB,
      parent_id TEXT,
      hierarchy_level INTEGER DEFAULT 0,
      code TEXT,
      name_alias TEXT,
      stale_since INTEGER DEFAULT NULL
    )
  `);

  // Migration: add TEXT columns that older databases may lack. Errors are
  // ignored - the column might already exist or the table might not be there.
  try {
    const existingColumns = new Set(
      db.prepare("PRAGMA table_info(entities)").all().map((col) => col.name),
    );
    for (const column of ['code', 'name_alias']) {
      if (existingColumns.has(column)) continue;
      db.exec(`ALTER TABLE entities ADD COLUMN ${column} TEXT`);
      console.log(` Migrated: added ${column} column to entities table`);
    }
  } catch (err) {
    // Ignore errors - column might already exist or table not created yet
  }

  const aliasBackfillCount = backfillNameAliases(db);
  if (aliasBackfillCount > 0) {
    console.log(` Migrated: backfilled name_alias for ${aliasBackfillCount} entities`);
  }

  // Migration: stale_since column for soft-delete support. The result is
  // checked so that a failed migration is at least visible in the log.
  const staleColumnReady = ensureStaleColumn(db);
  if (!staleColumnReady) {
    console.warn('[graph-extractor] WARN: Failed to add stale_since column - searches may include deleted files');
  }

  // Relationships table (source_id can be NULL for unresolved references).
  db.exec(`
    CREATE TABLE IF NOT EXISTS relationships (
      source_id TEXT,
      target_id TEXT,
      target_name TEXT NOT NULL,
      type TEXT NOT NULL,
      weight REAL DEFAULT 1.0,
      context_line INTEGER,
      full_import_path TEXT,
      is_static INTEGER DEFAULT 0,
      is_wildcard INTEGER DEFAULT 0
    )
  `);

  // Try FTS5 first; if it is unavailable the regular indexes below still apply.
  let hasFts5 = false;
  try {
    const { rebuilt } = ensureLexicalFtsSchema(db);
    hasFts5 = true;
    console.log(rebuilt ? ' FTS5 schema rebuilt (porter + trigram)' : ' FTS5 enabled (porter + trigram)');
  } catch (err) {
    console.log(' FTS5 not available:', err.message);
  }

  // Indexes for graph traversal and text search, executed in this order.
  const indexStatements = [
    `CREATE INDEX IF NOT EXISTS idx_entities_file ON entities(file_path)`,
    `CREATE INDEX IF NOT EXISTS idx_entities_type ON entities(type)`,
    `CREATE INDEX IF NOT EXISTS idx_entities_name ON entities(name)`,
    `CREATE INDEX IF NOT EXISTS idx_entities_search ON entities(search_text)`,
    `CREATE INDEX IF NOT EXISTS idx_entities_parent ON entities(parent_id)`,
    `CREATE INDEX IF NOT EXISTS idx_entities_level ON entities(hierarchy_level)`,
    // Partial index for soft-delete queries: only rows where stale_since IS NOT NULL.
    `CREATE INDEX IF NOT EXISTS idx_entities_stale ON entities(stale_since) WHERE stale_since IS NOT NULL`,
    // Partial index for active entity queries (WHERE stale_since IS NULL).
    `CREATE INDEX IF NOT EXISTS idx_entities_active ON entities(id, name, type, file_path) WHERE stale_since IS NULL`,
    // Composite index for collision-proof backup/restore of overloaded methods.
    `CREATE INDEX IF NOT EXISTS idx_entities_sig_hash ON entities(file_path, type, name, signature_hash)`,
    `CREATE INDEX IF NOT EXISTS idx_rel_source ON relationships(source_id)`,
    `CREATE INDEX IF NOT EXISTS idx_rel_target ON relationships(target_id)`,
    `CREATE INDEX IF NOT EXISTS idx_rel_target_name ON relationships(target_name)`,
    `CREATE INDEX IF NOT EXISTS idx_rel_type ON relationships(type)`,
    // Uniqueness of (source, target, type, name); rows with NULL source_id are excluded.
    `CREATE UNIQUE INDEX IF NOT EXISTS idx_rel_unique ON relationships(source_id, target_id, type, target_name) WHERE source_id IS NOT NULL`,
    // Reverse lookups ("what calls X") over resolved targets only.
    `CREATE INDEX IF NOT EXISTS idx_rel_target_id ON relationships(target_id) WHERE target_id IS NOT NULL`,
  ];
  for (const statement of indexStatements) {
    db.exec(statement);
  }

  setSchemaVersion(db);

  return hasFts5;
}
1915
+
1916
/**
 * Fill in `target_id` for relationships whose target is still NULL.
 *
 * Loads all entities into in-memory lookup maps, then resolves `calls`,
 * `imports`, `overrides` and `throws` relationships with the matching
 * resolver. Relationships of any other type are counted as unresolved.
 * All updates run inside one transaction.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{calls: number, imports: number, overrides: number, throws: number, unresolved: number}}
 */
function resolveRelationshipTargets(db) {
  const stats = { calls: 0, imports: 0, overrides: 0, throws: 0, unresolved: 0 };

  console.log(' Building entity lookup maps...');

  const entityByName = new Map(); // name -> [entities]
  const entityByFQN = new Map(); // qualified name -> entity (last one wins)
  const entityById = new Map(); // id -> entity

  const allEntities = db.prepare('SELECT id, name, type, parent_class, package, file_path FROM entities').all();
  console.log(` Loaded ${allEntities.length} entities`);

  for (const entity of allEntities) {
    entityById.set(entity.id, entity);

    // Names are not unique, so each name maps to a list.
    const sameName = entityByName.get(entity.name);
    if (sameName) {
      sameName.push(entity);
    } else {
      entityByName.set(entity.name, [entity]);
    }

    // Qualified name: package.Class.member, package.Name or Class.member,
    // depending on which qualifiers are present.
    const qualifiers = [entity.package, entity.parent_class].filter(Boolean);
    if (qualifiers.length > 0) {
      entityByFQN.set(`${qualifiers.join('.')}.${entity.name}`, entity);
    }
  }

  console.log(' Resolving unresolved relationships...');

  const updateStmt = db.prepare('UPDATE relationships SET target_id = ? WHERE rowid = ?');

  const unresolvedRels = db.prepare(`
    SELECT rowid, source_id, target_name, type
    FROM relationships
    WHERE target_id IS NULL
  `).all();

  console.log(` Found ${unresolvedRels.length} unresolved relationships`);

  // Relationship type -> resolver. The type name doubles as the stats key.
  const resolverByType = new Map([
    ['calls', (rel) => resolveMethodCall(rel.target_name, rel.source_id, entityByName, entityByFQN, entityById)],
    ['imports', (rel) => resolveImport(rel.target_name, entityByName, entityByFQN)],
    ['overrides', (rel) => resolveOverride(rel.target_name, rel.source_id, entityByName, entityById, db)],
    ['throws', (rel) => resolveThrows(rel.target_name, entityByName)],
  ]);

  // One transaction for all updates (much faster than autocommit per row).
  const updateMany = db.transaction(() => {
    unresolvedRels.forEach((rel, index) => {
      const resolver = resolverByType.get(rel.type);
      const targetId = resolver ? resolver(rel) : null;

      if (targetId) {
        stats[rel.type] += 1;
        updateStmt.run(targetId, rel.rowid);
      } else {
        stats.unresolved += 1;
      }

      const processed = index + 1;
      if (processed % 1000 === 0) {
        process.stdout.write(`\r Processed ${processed}/${unresolvedRels.length}...`);
      }
    });
    process.stdout.write('\n');
  });

  updateMany();

  return stats;
}
2021
+
2022
/**
 * Resolve a call target such as "object.method", "ClassName.method" or a longer
 * dotted chain ("this.service.method") to an entity id.
 *
 * Strategies, in order:
 *  1. exact qualified-name match on the whole target name;
 *  2. a method with that name whose parent class equals the receiver segment;
 *  3. any method with that name, preferring the caller's package (only when the
 *     caller has one), then the caller's file, then the first candidate.
 *
 * @param {string} targetName - Dotted call target
 * @param {string|null} sourceId - Id of the calling entity
 * @param {Map<string, object[]>} entityByName - name -> entities
 * @param {Map<string, object>} entityByFQN - qualified name -> entity
 * @param {Map<string, object>} entityById - id -> entity
 * @returns {string|null} Resolved entity id, or null
 */
function resolveMethodCall(targetName, sourceId, entityByName, entityByFQN, entityById) {
  const parts = targetName.split('.');
  if (parts.length < 2) return null;

  // The method is the LAST segment and its receiver the one before it. Taking
  // the first two segments would mis-read "app.UserService.findById" as a call
  // to a method named "UserService".
  const methodName = parts[parts.length - 1];
  const objName = parts[parts.length - 2];

  // Strategy 1: exact qualified-name match (e.g. "UserService.findById").
  if (entityByFQN.has(targetName)) {
    return entityByFQN.get(targetName).id;
  }

  const methods = (entityByName.get(methodName) || []).filter((e) => e.type === 'method');

  // Strategy 2: method declared in a class named like the receiver.
  const inReceiverClass = methods.find((m) => m.parent_class === objName);
  if (inReceiverClass) return inReceiverClass.id;

  if (methods.length === 0) return null;

  // Strategy 3: fuzzy match by method name alone.
  const sourceEntity = entityById.get(sourceId);
  if (sourceEntity) {
    // Only compare packages when the caller actually has one; otherwise two
    // missing packages would compare equal and hide the same-file preference.
    if (sourceEntity.package) {
      const samePackage = methods.find((m) => m.package === sourceEntity.package);
      if (samePackage) return samePackage.id;
    }

    const sameFile = methods.find((m) => m.file_path === sourceEntity.file_path);
    if (sameFile) return sameFile.id;
  }

  // Otherwise pick the first match.
  return methods[0].id;
}
2068
+
2069
/**
 * Resolve an import target (dotted package path or plain name) to an entity id.
 *
 * Tries an exact qualified-name match first, then falls back to the segment
 * after the last dot, preferring class-like entities (class, interface, enum).
 *
 * @param {string} targetName - Imported name or dotted path
 * @param {Map<string, object[]>} entityByName - name -> entities
 * @param {Map<string, object>} entityByFQN - qualified name -> entity
 * @returns {string|null} Resolved entity id, or null
 */
function resolveImport(targetName, entityByName, entityByFQN) {
  // Exact qualified-name hit wins outright.
  if (entityByFQN.has(targetName)) {
    return entityByFQN.get(targetName).id;
  }

  // Otherwise match on the last dotted component (the whole name if undotted).
  const simpleName = targetName.slice(targetName.lastIndexOf('.') + 1);
  const matches = entityByName.get(simpleName) || [];
  if (matches.length === 0) {
    return null;
  }

  const CLASS_LIKE = ['class', 'interface', 'enum'];
  const classLike = matches.find((entity) => CLASS_LIKE.includes(entity.type));
  return classLike ? classLike.id : matches[0].id;
}
2092
+
2093
/**
 * Resolve a method override to the id of an overridden method.
 *
 * Simplified strategy: match by method name only. The overriding method itself
 * is never returned, and when the overriding method's class is known, a
 * same-named method from a different class is preferred over an overload in
 * the same class.
 *
 * @param {string} methodName - Name of the overridden method
 * @param {string|null} sourceId - Id of the overriding method
 * @param {Map<string, object[]>} entityByName - name -> entities
 * @param {Map<string, object>} [entityById] - id -> entity, used to look up the source's class
 * @param {object} [db] - Unused; kept so existing call sites stay valid
 * @returns {string|null} Resolved entity id, or null
 */
function resolveOverride(methodName, sourceId, entityByName, entityById, db) {
  // A method cannot override itself, so the source is excluded up front;
  // otherwise the first same-named method found could be the source, giving a
  // self-referencing "overrides" edge.
  const candidates = (entityByName.get(methodName) || [])
    .filter((e) => e.type === 'method' && e.id !== sourceId);
  if (candidates.length === 0) return null;

  // Prefer a method declared in another class than the overriding method's.
  const sourceClass = entityById?.get(sourceId)?.parent_class;
  if (sourceClass) {
    const inOtherClass = candidates.find((m) => m.parent_class !== sourceClass);
    if (inOtherClass) return inOtherClass.id;
  }

  // Fall back to the first remaining match (no parent-class information).
  return candidates[0].id;
}
2110
+
2111
/**
 * Resolve a thrown exception name to an entity id.
 *
 * @param {string} exceptionName - Exception name as written at the throw site
 * @param {Map<string, object[]>} entityByName - name -> entities
 * @returns {string|null} Id of a class with that name if one exists, otherwise
 *   the first entity with that name, otherwise null
 */
function resolveThrows(exceptionName, entityByName) {
  const matches = entityByName.get(exceptionName) || [];
  if (matches.length === 0) {
    return null;
  }

  // A class declaration is the preferred target; anything else is a fallback.
  const asClass = matches.find((entity) => entity.type === 'class');
  return asClass ? asClass.id : matches[0].id;
}
2124
+
2125
/**
 * Insert entities and relationships into the graph database.
 * Uses better-sqlite3 (synchronous API; statements do not need .free()).
 *
 * Entities are upserted with INSERT OR REPLACE. Relationships without a
 * `target_name` are skipped, and relationship rows that fail to insert are
 * dropped. When `hasFts5` is true the FTS tables are rebuilt and optimized
 * afterwards on a best-effort basis.
 *
 * @param {import('better-sqlite3').Database} db
 * @param {object[]} entities - Extracted entities
 * @param {object[]} relationships - Extracted relationships
 * @param {boolean} [hasFts5=false] - Whether the FTS5 tables exist
 * @returns {void}
 */
export function insertGraph(db, entities, relationships, hasFts5 = false) {
  // Types that can own members, and the member types that sit one level below.
  const CONTAINER_TYPES = ['class', 'interface', 'enum', 'service'];
  const MEMBER_TYPES = ['method', 'field', 'rpc'];

  const entityStmt = db.prepare(`
    INSERT OR REPLACE INTO entities
    (id, file_path, type, name, signature, signature_hash, doc_comment, start_line, end_line, package, parent_class, search_text, name_alias, parent_id, hierarchy_level)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);

  // "file:ContainerName" -> container id, used to link members to their parent.
  const parentLookup = new Map();
  for (const entity of entities) {
    if (CONTAINER_TYPES.includes(entity.type)) {
      parentLookup.set(`${entity.file_path}:${entity.name}`, entity.id);
    }
  }

  console.log(` Inserting ${entities.length} entities...`);

  // Single transaction for all entity rows.
  const insertEntities = db.transaction(() => {
    for (const e of entities) {
      // Searchable text: name + signature + doc comment, lower-cased and capped.
      const searchText = [e.name, e.signature, e.doc_comment]
        .filter(Boolean)
        .join(' ')
        .toLowerCase()
        .slice(0, 1000);

      // Members live at level 1 under their container; everything else is level 0.
      const isMember = MEMBER_TYPES.includes(e.type);
      const hierarchyLevel = isMember ? 1 : 0;
      let parentId = null;
      if (isMember && e.parent_class) {
        parentId = parentLookup.get(`${e.file_path}:${e.parent_class}`);
      }

      // Normalized identifier alias for cross-style search.
      const nameAlias = normalizeIdentifier(e.name);

      entityStmt.run(
        e.id,
        e.file_path,
        e.type,
        e.name,
        e.signature || null,
        e.signature_hash || null,
        e.doc_comment || null,
        e.start_line || null,
        e.end_line || null,
        e.package || null,
        e.parent_class || null,
        searchText,
        nameAlias || null,
        parentId,
        hierarchyLevel
      );
    }
  });

  insertEntities();
  console.log(` ✓ Inserted ${entities.length} entities`);

  console.log(` Inserting ${relationships.length} relationships...`);

  const relStmt = db.prepare(`
    INSERT INTO relationships
    (source_id, target_id, target_name, type, weight, context_line, full_import_path, is_static, is_wildcard)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);

  // Single transaction for all relationship rows.
  let relInserted = 0;
  const insertRelationships = db.transaction(() => {
    for (const r of relationships) {
      // A relationship without a target name cannot be stored.
      if (!r.target_name) continue;

      try {
        relStmt.run(
          r.source_id || null,
          r.target_id || null,
          r.target_name,
          r.type,
          r.weight || 1.0,
          r.context_line || null,
          r.full_import_path || null,
          r.is_static ? 1 : 0,
          r.is_wildcard ? 1 : 0
        );
        relInserted++;
      } catch (err) {
        // UNIQUE violations are expected for duplicate relationships; anything
        // else is only reported when DEBUG is set.
        const isDuplicate = err.message.includes('UNIQUE constraint');
        if (!isDuplicate && process.env.DEBUG) {
          console.debug(` [debug] Relationship insert failed: ${err.message} (target: ${r.target_name})`);
        }
      }
    }
  });

  insertRelationships();
  console.log(` ✓ Inserted ${relInserted} relationships`);

  // Target resolution (resolveRelationshipTargets) is currently disabled in
  // this step; relationships keep whatever target_id they were inserted with.

  if (!hasFts5) return;

  // Issue one FTS5 special command against both lexical tables.
  const runFtsCommand = (command) => {
    db.exec(`INSERT INTO entities_fts(entities_fts) VALUES('${command}')`);
    db.exec(`INSERT INTO entities_trigram(entities_trigram) VALUES('${command}')`);
  };

  try {
    runFtsCommand('rebuild');
    console.log(' FTS5 indexes rebuilt (porter + trigram)');

    // Best-effort post-build compaction for faster reads.
    runFtsCommand('optimize');
    console.log(' FTS5 indexes optimized (segments merged)');
  } catch (err) {
    // FTS5 rebuild/optimize failed, ignore
  }
}
2274
+
2275
+ // =============================================================================
2276
+ // CLI
2277
+ // =============================================================================
2278
+
2279
// True only when this module is the process entry point (run directly with
// `node graph-extractor.js <file>`), not when it is imported.
//
// import.meta.url is a percent-encoded URL, so argv[1] is normalised through
// the URL parser before comparing. A raw `file://${argv[1]}` string does not
// match when the path contains spaces or other characters that get encoded.
const isCliEntryPoint = (() => {
  const entryPath = process.argv[1];
  if (!entryPath) return false;
  try {
    return import.meta.url === new URL(`file://${entryPath}`).href;
  } catch {
    // argv[1] could not be expressed as a file URL: not a direct invocation.
    return false;
  }
})();

if (isCliEntryPoint) {
  const args = process.argv.slice(2);

  if (args.length === 0) {
    console.log('Usage: graph-extractor.js <file>');
    process.exit(1);
  }

  const filePath = args[0];

  (async () => {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const extractor = new GraphExtractor();
      const result = await extractor.extractFromFile(filePath, content);

      // JSON goes to stdout; the human-readable summary goes to stderr so the
      // JSON stays machine-parseable when piped.
      console.log(JSON.stringify(result, null, 2));
      console.error(`\nExtracted ${result.entities.length} entities, ${result.relationships.length} relationships`);
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  })();
}
2303
+
2304
+ export default GraphExtractor;