sweet-search 0.0.1 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +190 -0
- package/NOTICE +23 -0
- package/core/cli.js +51 -0
- package/core/config.js +27 -0
- package/core/embedding/embedding-cache.js +467 -0
- package/core/embedding/embedding-local-model.js +845 -0
- package/core/embedding/embedding-remote.js +492 -0
- package/core/embedding/embedding-service.js +712 -0
- package/core/embedding/embedding-telemetry.js +219 -0
- package/core/embedding/index.js +40 -0
- package/core/graph/community-detector.js +294 -0
- package/core/graph/graph-expansion.js +839 -0
- package/core/graph/graph-extractor.js +2304 -0
- package/core/graph/graph-search.js +2148 -0
- package/core/graph/hcgs-generator.js +666 -0
- package/core/graph/index.js +16 -0
- package/core/graph/leiden-algorithm.js +547 -0
- package/core/graph/relationship-resolver.js +366 -0
- package/core/graph/repo-map.js +408 -0
- package/core/graph/summary-manager.js +549 -0
- package/core/indexing/artifact-builder.js +1054 -0
- package/core/indexing/ast-chunker.js +709 -0
- package/core/indexing/chunking/chunk-builder.js +170 -0
- package/core/indexing/chunking/markdown-chunker.js +503 -0
- package/core/indexing/chunking/plaintext-chunker.js +104 -0
- package/core/indexing/dedup/dedup-phase.js +159 -0
- package/core/indexing/dedup/exemplar-selector.js +65 -0
- package/core/indexing/document-chunker.js +56 -0
- package/core/indexing/incremental-parser.js +390 -0
- package/core/indexing/incremental-tracker.js +761 -0
- package/core/indexing/index-codebase-v21.js +472 -0
- package/core/indexing/index-maintainer.mjs +1674 -0
- package/core/indexing/index.js +90 -0
- package/core/indexing/indexer-ann.js +1077 -0
- package/core/indexing/indexer-build.js +742 -0
- package/core/indexing/indexer-phases.js +800 -0
- package/core/indexing/indexer-pool.js +764 -0
- package/core/indexing/indexer-sparse-gram.js +98 -0
- package/core/indexing/indexer-utils.js +536 -0
- package/core/indexing/indexer-worker.js +148 -0
- package/core/indexing/li-skip-policy.js +225 -0
- package/core/indexing/merkle-tracker.js +244 -0
- package/core/indexing/model-pool.js +166 -0
- package/core/infrastructure/code-graph-repository.js +120 -0
- package/core/infrastructure/codebase-repository.js +131 -0
- package/core/infrastructure/config/dedup.js +54 -0
- package/core/infrastructure/config/embedding.js +298 -0
- package/core/infrastructure/config/graph.js +80 -0
- package/core/infrastructure/config/index.js +82 -0
- package/core/infrastructure/config/indexing.js +8 -0
- package/core/infrastructure/config/platform.js +254 -0
- package/core/infrastructure/config/ranking.js +221 -0
- package/core/infrastructure/config/search.js +396 -0
- package/core/infrastructure/config/translation.js +89 -0
- package/core/infrastructure/config/vector-store.js +114 -0
- package/core/infrastructure/constants.js +86 -0
- package/core/infrastructure/coreml-cascade.js +909 -0
- package/core/infrastructure/coreml-cascade.json +46 -0
- package/core/infrastructure/coreml-provider.js +81 -0
- package/core/infrastructure/db-utils.js +69 -0
- package/core/infrastructure/dedup-hashing.js +83 -0
- package/core/infrastructure/hardware-capability.js +332 -0
- package/core/infrastructure/index.js +104 -0
- package/core/infrastructure/language-patterns/maps.js +121 -0
- package/core/infrastructure/language-patterns/registry-core.js +323 -0
- package/core/infrastructure/language-patterns/registry-data-query.js +155 -0
- package/core/infrastructure/language-patterns/registry-object-oriented.js +285 -0
- package/core/infrastructure/language-patterns/registry-tooling.js +240 -0
- package/core/infrastructure/language-patterns/registry-web-style.js +143 -0
- package/core/infrastructure/language-patterns/registry.js +19 -0
- package/core/infrastructure/language-patterns.js +141 -0
- package/core/infrastructure/llm-provider.js +733 -0
- package/core/infrastructure/manifest.json +46 -0
- package/core/infrastructure/maxsim.wasm +0 -0
- package/core/infrastructure/model-fetcher.js +423 -0
- package/core/infrastructure/model-registry.js +214 -0
- package/core/infrastructure/native-inference.js +587 -0
- package/core/infrastructure/native-resolver.js +187 -0
- package/core/infrastructure/native-sparse-gram.js +257 -0
- package/core/infrastructure/native-tokenizer.js +160 -0
- package/core/infrastructure/onnx-mutex.js +45 -0
- package/core/infrastructure/onnx-session-utils.js +261 -0
- package/core/infrastructure/ort-pipeline.js +111 -0
- package/core/infrastructure/project-detector.js +102 -0
- package/core/infrastructure/quantization.js +410 -0
- package/core/infrastructure/simd-distance.js +502 -0
- package/core/infrastructure/simd-distance.wasm +0 -0
- package/core/infrastructure/tree-sitter-provider.js +665 -0
- package/core/infrastructure/webgpu-maxsim.js +222 -0
- package/core/query/index.js +35 -0
- package/core/query/intent-detector.js +201 -0
- package/core/query/intent-router.js +156 -0
- package/core/query/query-router-catboost.js +222 -0
- package/core/query/query-router-ml.js +266 -0
- package/core/query/query-router.js +213 -0
- package/core/ranking/cascaded-scorer.js +379 -0
- package/core/ranking/flashrank.js +810 -0
- package/core/ranking/index.js +49 -0
- package/core/ranking/late-interaction-index.js +2383 -0
- package/core/ranking/late-interaction-model.js +812 -0
- package/core/ranking/local-reranker.js +374 -0
- package/core/ranking/mmr.js +379 -0
- package/core/ranking/quality-scorer.js +363 -0
- package/core/search/context-expander.js +1167 -0
- package/core/search/dedup/sibling-expander.js +327 -0
- package/core/search/index.js +16 -0
- package/core/search/search-boost.js +259 -0
- package/core/search/search-cli.js +544 -0
- package/core/search/search-format.js +282 -0
- package/core/search/search-fusion.js +327 -0
- package/core/search/search-hybrid.js +204 -0
- package/core/search/search-pattern-chunks.js +337 -0
- package/core/search/search-pattern-planner.js +439 -0
- package/core/search/search-pattern-prefilter.js +412 -0
- package/core/search/search-pattern-ripgrep.js +663 -0
- package/core/search/search-pattern.js +463 -0
- package/core/search/search-postprocess.js +452 -0
- package/core/search/search-semantic.js +706 -0
- package/core/search/search-server.js +554 -0
- package/core/search/session-daemon-prewarm.mjs +164 -0
- package/core/search/session-warmup.js +595 -0
- package/core/search/sweet-search.js +632 -0
- package/core/search/warmup-metrics.js +532 -0
- package/core/start-server.js +6 -0
- package/core/training/query-router/features/extractor.js +762 -0
- package/core/training/query-router/features/multilingual-patterns.js +431 -0
- package/core/training/query-router/features/text-segmenter.js +303 -0
- package/core/training/query-router/features/unicode-utils.js +383 -0
- package/core/training/query-router/output/v45_router_d4.js +11521 -0
- package/core/training/query-router/output/v46_router_d4.js +11498 -0
- package/core/vector-store/binary-heap.js +227 -0
- package/core/vector-store/binary-hnsw-index.js +1004 -0
- package/core/vector-store/float-vector-store.js +234 -0
- package/core/vector-store/hnsw-index.js +580 -0
- package/core/vector-store/index.js +39 -0
- package/core/vector-store/seismic-index.js +498 -0
- package/core/vocabulary/index.js +84 -0
- package/core/vocabulary/vocab-constants.js +20 -0
- package/core/vocabulary/vocab-miner-extractors.js +375 -0
- package/core/vocabulary/vocab-miner-nl.js +404 -0
- package/core/vocabulary/vocab-miner-utils.js +146 -0
- package/core/vocabulary/vocab-miner.js +574 -0
- package/core/vocabulary/vocab-prewarm-cli.js +110 -0
- package/core/vocabulary/vocab-ranker.js +492 -0
- package/core/vocabulary/vocab-warmer.js +523 -0
- package/core/vocabulary/vocab-warmup-orchestrator.js +425 -0
- package/core/vocabulary/vocabulary-utils.js +704 -0
- package/crates/wasm-router/pkg/package.json +13 -0
- package/crates/wasm-router/pkg/query_router_wasm.d.ts +36 -0
- package/crates/wasm-router/pkg/query_router_wasm.js +271 -0
- package/crates/wasm-router/pkg/query_router_wasm_bg.wasm +0 -0
- package/crates/wasm-router/pkg/query_router_wasm_bg.wasm.d.ts +19 -0
- package/mcp/config-gen.js +121 -0
- package/mcp/server.js +335 -0
- package/mcp/tool-handlers.js +476 -0
- package/package.json +131 -9
- package/scripts/benchmark-harness.js +794 -0
- package/scripts/init.js +1058 -0
- package/scripts/smoke-test.js +435 -0
- package/scripts/uninstall.js +478 -0
- package/scripts/verify-runtime.js +176 -0
|
@@ -0,0 +1,2304 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Code Graph Extractor
|
|
5
|
+
*
|
|
6
|
+
* Builds a knowledge graph from codebase:
|
|
7
|
+
* - Entities: classes, interfaces, methods, fields, enums
|
|
8
|
+
* - Relationships: extends, implements, calls, uses, throws, overrides
|
|
9
|
+
*
|
|
10
|
+
* Stores in SQLite with FTS5 for fast lexical search.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { createHash } from 'crypto';
|
|
14
|
+
import path from 'path';
|
|
15
|
+
import fs from 'fs/promises';
|
|
16
|
+
import { GRAPH_CONFIG, DB_PATHS } from '../infrastructure/config/index.js';
|
|
17
|
+
import { getLanguageByPath } from '../infrastructure/language-patterns.js';
|
|
18
|
+
import { getTreeSitterProvider } from '../infrastructure/tree-sitter-provider.js';
|
|
19
|
+
|
|
20
|
+
// Schema version - increment when schema changes require full reindex
|
|
21
|
+
// Users should run `/index-codebase --full` after upgrading
|
|
22
|
+
export const SCHEMA_VERSION = 2;
|
|
23
|
+
|
|
24
|
+
/**
 * Normalize an identifier into searchable alias tokens.
 *
 * The identifier is split at acronym→word, camelCase and letter/digit
 * boundaries and at the separators `_ - . / : \`. The result contains every
 * distinct lowercased piece followed by the collapsed form (all pieces joined
 * with no separator).
 *
 * A single leading capital is NOT treated as an acronym, so `OAuth` stays one
 * token; only runs of two or more capitals followed by a capitalised word are
 * split (`HTMLParser` -> `HTML Parser`).
 *
 * @param {string} name - The original identifier name
 * @returns {string} Space-separated alias tokens (lowercased, deduped);
 *   an empty string when `name` is falsy
 *
 * @example
 * normalizeIdentifier('UserService')   // 'user service userservice'
 * normalizeIdentifier('getUserName')   // 'get user name getusername'
 * normalizeIdentifier('get_user_name') // 'get user name getusername'
 * normalizeIdentifier('HTMLParser2')   // 'html parser 2 htmlparser2'
 * normalizeIdentifier('OAuth2Client')  // 'oauth 2 client oauth2client'
 * normalizeIdentifier('auth.service')  // 'auth service authservice'
 */
export function normalizeIdentifier(name) {
  if (!name) return '';

  // Insert spaces at every logical word boundary, then turn separators into spaces.
  const split = name
    // Acronym→word transition (e.g. HTMLParser -> HTML Parser).
    // Requires 2+ uppercase chars so single-letter prefixes are kept (OAuth stays intact).
    .replace(/([A-Z]{2,})([A-Z][a-z])/g, '$1 $2')
    // camelCase boundary (e.g. getUser -> get User)
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    // Letter/digit boundaries (e.g. Parser2 -> Parser 2, v2Handler -> v 2 Handler)
    .replace(/([a-zA-Z])(\d)/g, '$1 $2')
    .replace(/(\d)([a-zA-Z])/g, '$1 $2')
    // Separators: _ - . / : and backslash
    .replace(/[_\-./:\\]/g, ' ');

  // Lowercase, then drop the empty pieces produced by repeated whitespace.
  const tokens = split.toLowerCase().split(/\s+/).filter((t) => t.length > 0);

  // Emit the split tokens plus the collapsed form; the Set removes duplicates
  // while keeping first-seen order (a single-token name yields just that token).
  const collapsed = tokens.join('');
  const uniqueTokens = [...new Set([...tokens, collapsed])];

  return uniqueTokens.join(' ');
}
|
|
65
|
+
|
|
66
|
+
/**
 * Store the schema version under the `version` key of the `schema_meta`
 * key/value table, creating that table first when it does not exist.
 * Intended to be called once schema creation/migration has succeeded.
 *
 * @param {import('better-sqlite3').Database} db - Open database handle
 * @param {number} [version=SCHEMA_VERSION] - Version to record (stored as text)
 */
export function setSchemaVersion(db, version = SCHEMA_VERSION) {
  const createMetaTable = `CREATE TABLE IF NOT EXISTS schema_meta (key TEXT PRIMARY KEY, value TEXT)`;
  db.exec(createMetaTable);

  // INSERT OR REPLACE overwrites any previously recorded version.
  const upsertVersion = db.prepare('INSERT OR REPLACE INTO schema_meta (key, value) VALUES (?, ?)');
  upsertVersion.run('version', String(version));
}
|
|
76
|
+
|
|
77
|
+
/**
 * Look up the CREATE statement recorded in `sqlite_master` for a table.
 *
 * @param {import('better-sqlite3').Database} db - Open database handle
 * @param {string} tableName - Name of the table to look up
 * @returns {string} The stored SQL, or '' when the table is absent or has no SQL
 */
function getTableSql(db, tableName) {
  const lookup = db.prepare("SELECT sql FROM sqlite_master WHERE type='table' AND name = ?");
  const found = lookup.get(tableName);
  if (!found) {
    return '';
  }
  return found.sql || '';
}
|
|
81
|
+
|
|
82
|
+
/**
 * Canonicalise SQL text for substring comparison: lowercase it and collapse
 * every run of whitespace into a single space.
 *
 * @param {string} sql - SQL text
 * @returns {string} Lowercased, whitespace-collapsed SQL
 */
function normalizeSql(sql) {
  const lowered = sql.toLowerCase();
  return lowered.replace(/\s+/g, ' ');
}
|
|
85
|
+
|
|
86
|
+
/**
 * Check whether a CREATE statement for `entities_fts` contains every fragment
 * this module expects: the `name_alias` column, the porter/unicode61
 * tokenizer and the 2/3/4 prefix configuration.
 *
 * @param {string} sql - CREATE statement to inspect
 * @returns {boolean} true when all expected fragments are present
 */
function hasExpectedEntitiesFtsSchema(sql) {
  const ddl = normalizeSql(sql);
  const requiredFragments = [
    'name_alias',
    "tokenize='porter unicode61'",
    "prefix='2 3 4'",
  ];
  return requiredFragments.every((fragment) => ddl.includes(fragment));
}
|
|
92
|
+
|
|
93
|
+
/**
 * Check whether a CREATE statement for `entities_trigram` contains every
 * fragment this module expects: the trigram tokenizer and the external-content
 * binding to `entities` keyed by `rowid`.
 *
 * @param {string} sql - CREATE statement to inspect
 * @returns {boolean} true when all expected fragments are present
 */
function hasExpectedTrigramSchema(sql) {
  const ddl = normalizeSql(sql);
  const requiredFragments = [
    "tokenize='trigram'",
    "content='entities'",
    "content_rowid='rowid'",
  ];
  return requiredFragments.every((fragment) => ddl.includes(fragment));
}
|
|
99
|
+
|
|
100
|
+
/**
 * Fill in `entities.name_alias` for rows that have a name but a NULL or blank
 * alias, computing the alias with normalizeIdentifier(). All updates are
 * applied through a single `db.transaction` wrapper.
 *
 * @param {import('better-sqlite3').Database} db - Open database handle
 * @returns {number} Number of rows that were selected for backfill
 */
function backfillNameAliases(db) {
  const pending = db.prepare(`
    SELECT id, name
    FROM entities
    WHERE name IS NOT NULL
    AND (name_alias IS NULL OR trim(name_alias) = '')
  `).all();

  const pendingCount = pending.length;
  if (pendingCount === 0) {
    return 0;
  }

  const aliasWriter = db.prepare(`UPDATE entities SET name_alias = ? WHERE id = ?`);
  const writeAll = db.transaction((rows) => {
    rows.forEach(({ id, name }) => {
      aliasWriter.run(normalizeIdentifier(name), id);
    });
  });

  writeAll(pending);
  return pendingCount;
}
|
|
122
|
+
|
|
123
|
+
/**
 * Make sure the two lexical FTS5 tables (`entities_fts`, `entities_trigram`)
 * exist with the definitions this module expects.
 *
 * If either table is missing, or its recorded CREATE statement lacks the
 * expected fragments, BOTH tables are dropped before being recreated. This
 * function only (re)creates the table definitions; it issues no statement
 * that populates them.
 *
 * @param {import('better-sqlite3').Database} db - Open database handle
 * @returns {{ rebuilt: boolean }} `rebuilt` is true when the drop path ran
 *   (which includes the case where a table did not exist yet)
 */
function ensureLexicalFtsSchema(db) {
  const ftsSql = getTableSql(db, 'entities_fts');
  const trigramSql = getTableSql(db, 'entities_trigram');

  // Up to date only when both tables exist and both definitions match.
  const upToDate = ftsSql
    && trigramSql
    && hasExpectedEntitiesFtsSchema(ftsSql)
    && hasExpectedTrigramSchema(trigramSql);
  const needsRebuild = !upToDate;

  if (needsRebuild) {
    db.exec(`DROP TABLE IF EXISTS entities_fts`);
    db.exec(`DROP TABLE IF EXISTS entities_trigram`);
  }

  // Word-level index (porter stemming, prefix indexes for 2/3/4 characters).
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS entities_fts USING fts5(
      name,
      name_alias,
      signature,
      doc_comment,
      content='entities',
      content_rowid='rowid',
      tokenize='porter unicode61',
      prefix='2 3 4'
    )
  `);

  // Trigram index over name and signature.
  db.exec(`
    CREATE VIRTUAL TABLE IF NOT EXISTS entities_trigram USING fts5(
      name,
      signature,
      content='entities',
      content_rowid='rowid',
      tokenize='trigram'
    )
  `);

  return { rebuilt: needsRebuild };
}
|
|
161
|
+
|
|
162
|
+
// =============================================================================
|
|
163
|
+
// ENTITY EXTRACTION PATTERNS
|
|
164
|
+
// =============================================================================
|
|
165
|
+
|
|
166
|
+
/**
 * Regular expressions for Java source.
 * All patterns carry the `g` flag except `package`.
 */
const JAVA_PATTERNS = {
  // class Name [extends Base] [implements A, B] — groups: name, base, interface list
  class: /(?:public|private|protected)?\s*(?:static)?\s*(?:final|abstract)?\s*class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w,\s]+))?/g,

  // interface Name [extends A, B] — groups: name, extended list
  interface: /(?:public)?\s*interface\s+(\w+)(?:\s+extends\s+([\w,\s]+))?/g,

  // enum Name — group: name
  enum: /(?:public)?\s*enum\s+(\w+)/g,

  // [annotations] [modifiers] ReturnType name(params) — groups: return type, name, params
  method: /(?:@\w+\s*(?:\([^)]*\))?\s*)*(?:public|private|protected)?\s*(?:static)?\s*(?:final)?\s*(?:synchronized)?\s*(?:<[\w\s,<>?]+>\s*)?(\w+(?:<[\w\s,<>?]+>)?(?:\[\])?)\s+(\w+)\s*\(([^)]*)\)/g,

  // visibility [static] [final] Type name followed by ; or = — groups: type, name
  field: /(?:public|private|protected)\s+(?:static)?\s*(?:final)?\s*(\w+(?:<[\w\s,<>?]+>)?(?:\[\])?)\s+(\w+)\s*[;=]/g,

  // receiver.method( — groups: receiver, method
  methodCall: /(\w+)\s*\.\s*(\w+)\s*\(/g,

  // import [static] a.b.C; or a.b.*; — group: dotted path (with trailing .* for wildcards)
  import: /import\s+(?:static\s+)?([a-zA-Z_][\w.]*(?:\.\*)?)\s*;/g,

  // throw new ExceptionType — group: exception type
  throw: /throw\s+new\s+(\w+)/g,

  // package a.b.c; — group: package name
  package: /package\s+([\w.]+)\s*;/,
};
|
|
194
|
+
|
|
195
|
+
/**
 * Regular expressions for JavaScript source. Every pattern carries the `g` flag.
 */
const JS_PATTERNS = {
  // [export] [async] function name( — group: name
  function: /(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\(/g,

  // [export] const|let|var name = [async] (...) => — group: name
  arrowFunction: /(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>/g,

  // [export] class Name [extends Base] — groups: name, base
  class: /(?:export\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?/g,

  // Capitalised const/function binding followed by = or : (React-component heuristic) — group: name
  component: /(?:export\s+)?(?:const|function)\s+([A-Z]\w+)\s*[=:]/g,

  // receiver.method( — groups: receiver, method
  methodCall: /(\w+)\s*\.\s*(\w+)\s*\(/g,

  // import {a, b} from 'x' / import a from 'x' — groups: named list, default name, module specifier
  import: /import\s+(?:{([^}]+)}|(\w+))\s+from\s+['"]([^'"]+)['"]/g,
};
|
|
214
|
+
|
|
215
|
+
/**
 * Regular expressions for Protocol Buffers source. Every pattern carries the `g` flag.
 */
const PROTO_PATTERNS = {
  // message Name { — group: name
  message: /message\s+(\w+)\s*\{/g,

  // service Name { — group: name
  service: /service\s+(\w+)\s*\{/g,

  // rpc Name(Request) returns (Response) — groups: name, request type, response type
  rpc: /rpc\s+(\w+)\s*\(\s*(\w+)\s*\)\s+returns\s+\(\s*(\w+)\s*\)/g,

  // enum Name { — group: name
  enum: /enum\s+(\w+)\s*\{/g,
};
|
|
228
|
+
|
|
229
|
+
/**
 * Frozen lookup from a pattern key to the relationship type it produces
 * ('imports', 'extends', 'implements' or 'uses'). Keys are listed per
 * relationship type; insertion order follows the listing order below.
 */
export const GENERIC_RELATIONSHIP_MAPPING = Object.freeze(
  Object.fromEntries(
    [
      ['imports', [
        'import', 'plainImport', 'include', 'require', 'reexport', 'dynamicImport', 'use',
        'prepend', 'open', 'source', 'from', 'forward', 'using', 'link', 'script', 'copyFrom',
        'alias', 'namespace', 'ref', 'dep', 'package',
      ]],
      ['extends', ['extends', 'inherit', 'mixin', 'with', 'category']],
      ['implements', ['implements', 'protocol', 'implFor']],
      ['uses', [
        'decorator', 'embed', 'extend', 'anchor', 'derive', 'throw', 'img', 'form', 'methodOf',
      ]],
    ].flatMap(([relationship, patternKeys]) => patternKeys.map((key) => [key, relationship])),
  ),
);
|
|
269
|
+
|
|
270
|
+
// Frozen, currently empty list of relationship types.
export const INTENTIONAL_DEFAULT_RELATIONSHIP_TYPES = Object.freeze([]);

/**
 * Backslash-escape every regex metacharacter in `value` so the result can be
 * embedded in a RegExp source and match the text literally.
 */
const escapeRegexLiteral = (value) => {
  const metaCharacters = /[.*+?^${}()|[\]\\]/g;
  return value.replace(metaCharacters, '\\$&');
};
|
|
272
|
+
|
|
273
|
+
// Pattern keys whose regex capture groups commonly hold a comma-separated
// list rather than a single name. Kept at module scope so the Set is built
// once instead of on every call.
const MULTI_TARGET_TYPES = new Set([
  'plainImport',
  'implements',
  'inherit',
  'protocol',
  'with',
]);
|
|
278
|
+
|
|
279
|
+
/**
 * Frozen numeric priority per entity type (values range from 15 to 40).
 */
export const TREE_SITTER_ENTITY_PRIORITY = Object.freeze({
  component: 40,
  class: 35,
  function: 30,
  method: 25,

  // Priority 20 group
  arrowFunction: 20,
  interface: 20,
  typeAlias: 20,
  enum: 20,
  namespace: 20,

  struct: 30,
  record: 30,
  module: 25,
  trait: 25,
  impl: 20,
  decorator: 15,
});
|
|
296
|
+
|
|
297
|
+
// Lookup sets used by extractJavaScript(); defined once at module scope so
// they are not rebuilt per call or per line.

// Names of built-in objects (console, Math, JSON, ...).
const JS_CALL_SKIP_OBJECTS = new Set([
  'console',
  'Math',
  'JSON',
  'Object',
  'Array',
  'Promise',
  'process',
  'Buffer',
  'Date',
]);

// Control-flow keywords.
const JS_RESERVED_WORDS = new Set([
  'if',
  'else',
  'for',
  'while',
  'switch',
  'catch',
  'with',
  'do',
  'try',
  'return',
]);
|
|
304
|
+
|
|
305
|
+
// Import-like relationship patterns for extractJavaScript(). Each entry pairs
// a non-global regex with the index of the capture group that holds the
// module specifier (group 1 for every pattern here).
const JS_IMPORT_PATTERNS = [
  // import x from '...' / import { a, b } from '...'
  /import\s+(?:\{[^}]+\}|\w+)\s+from\s+['"]([^'"]+)['"]/,
  // const x = require('...') / const { a } = require('...')
  /(?:const|let|var)\s+(?:\{[^}]+\}|\w+)\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)/,
  // export { a } from '...' / export * from '...'
  /export\s+(?:\{[^}]+\}|\*)\s+from\s+['"]([^'"]+)['"]/,
  // [await] import('...')
  /(?:await\s+)?import\s*\(\s*['"]([^'"]+)['"]\s*\)/,
].map((regex) => ({ regex, group: 1 }));
|
|
312
|
+
|
|
313
|
+
/**
 * Split a string at commas that sit outside any bracket pair.
 *
 * Nesting is tracked with one shared counter for all of `<>`, `()`, `[]` and
 * `{}`: any opener increments it, any closer decrements it (never below
 * zero). Pieces are returned untrimmed, and an input without top-level commas
 * yields a single-element array.
 *
 * @param {string} str - Text to split
 * @returns {string[]} The top-level comma-separated pieces
 */
export function splitTopLevelCommas(str) {
  const segments = [];
  let nesting = 0;
  let segmentStart = 0;
  let index = 0;

  while (index < str.length) {
    switch (str[index]) {
      case '<':
      case '(':
      case '[':
      case '{':
        nesting += 1;
        break;
      case '>':
      case ')':
      case ']':
      case '}':
        // Unbalanced closers are tolerated: the counter is clamped at zero.
        nesting = Math.max(0, nesting - 1);
        break;
      case ',':
        if (nesting === 0) {
          segments.push(str.slice(segmentStart, index));
          segmentStart = index + 1;
        }
        break;
      default:
        break;
    }
    index += 1;
  }

  // Whatever follows the last top-level comma (possibly the whole string).
  segments.push(str.slice(segmentStart));
  return segments;
}
|
|
333
|
+
|
|
334
|
+
// =============================================================================
|
|
335
|
+
// GRAPH EXTRACTOR CLASS
|
|
336
|
+
// =============================================================================
|
|
337
|
+
|
|
338
|
+
export class GraphExtractor {
|
|
339
|
+
constructor(options) {
|
|
340
|
+
this.projectRoot = options?.projectRoot || process.cwd();
|
|
341
|
+
this.entities = new Map();
|
|
342
|
+
this.relationships = [];
|
|
343
|
+
this.currentFile = null;
|
|
344
|
+
this.currentClass = null;
|
|
345
|
+
this.packageName = '';
|
|
346
|
+
this._useTreeSitter = options?.useTreeSitter !== false;
|
|
347
|
+
this.warnOnPatternDrop = options?.warnOnPatternDrop || false;
|
|
348
|
+
this.maxRegexLineLength = options?.maxRegexLineLength || 4000;
|
|
349
|
+
this.debugCounters = {
|
|
350
|
+
emptyCapture: {
|
|
351
|
+
entity: 0,
|
|
352
|
+
relationship: 0,
|
|
353
|
+
},
|
|
354
|
+
skippedLongLines: 0,
|
|
355
|
+
byLanguage: {},
|
|
356
|
+
byPattern: {},
|
|
357
|
+
};
|
|
358
|
+
this.patternPrefilterCache = new Map();
|
|
359
|
+
this.methodCallRegexCache = new Map();
|
|
360
|
+
this.genericPatternPlanCache = new Map();
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
/**
|
|
364
|
+
* Extract entities and relationships from a file.
|
|
365
|
+
* Dispatches to specialized extractors for Java/JS/Proto,
|
|
366
|
+
* generic registry-based extractor for all other languages.
|
|
367
|
+
*/
|
|
368
|
+
async extractFromFile(filePath, content) {
|
|
369
|
+
this.currentFile = filePath;
|
|
370
|
+
const lines = content.split('\n');
|
|
371
|
+
const langInfo = getLanguageByPath(filePath);
|
|
372
|
+
|
|
373
|
+
if (!langInfo) {
|
|
374
|
+
return { entities: [], relationships: [] };
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
// Try tree-sitter extraction first (more accurate than regex)
|
|
378
|
+
if (this._useTreeSitter) {
|
|
379
|
+
try {
|
|
380
|
+
const provider = getTreeSitterProvider();
|
|
381
|
+
if (await provider.isAvailable() && provider.hasLanguage(langInfo.id)) {
|
|
382
|
+
const symbols = await provider.extractSymbols(content, langInfo.id);
|
|
383
|
+
if (symbols && symbols.length > 0) {
|
|
384
|
+
// Convert tree-sitter symbols to graph entities format and align
|
|
385
|
+
// labels with regex semantics (component/object arrow distinctions).
|
|
386
|
+
const entities = this._normalizeTreeSitterEntities(filePath, symbols, langInfo.id);
|
|
387
|
+
// Still extract relationships with regex (tree-sitter only gives definitions)
|
|
388
|
+
const relationships = this._extractRelationships(content, lines, filePath, langInfo, entities);
|
|
389
|
+
return { entities, relationships };
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
} catch {
|
|
393
|
+
// Fall through to regex extraction
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// Specialized extractors for languages with complex logic
|
|
398
|
+
if (langInfo.id === 'java') {
|
|
399
|
+
return this.extractJava(content, lines, filePath);
|
|
400
|
+
}
|
|
401
|
+
if (langInfo.id === 'javascript') {
|
|
402
|
+
return this.extractJavaScript(content, lines, filePath);
|
|
403
|
+
}
|
|
404
|
+
if (langInfo.id === 'proto') {
|
|
405
|
+
return this.extractProto(content, lines, filePath);
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
// Generic registry-based extraction for all other languages
|
|
409
|
+
if (langInfo.graph) {
|
|
410
|
+
return this.extractGeneric(content, lines, filePath, langInfo);
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
return { entities: [], relationships: [] };
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
/**
|
|
417
|
+
* Extract from Java file
|
|
418
|
+
*/
|
|
419
|
+
extractJava(content, lines, filePath) {
|
|
420
|
+
const entities = [];
|
|
421
|
+
const relationships = [];
|
|
422
|
+
|
|
423
|
+
// Extract package
|
|
424
|
+
const pkgMatch = content.match(JAVA_PATTERNS.package);
|
|
425
|
+
this.packageName = pkgMatch ? pkgMatch[1] : '';
|
|
426
|
+
|
|
427
|
+
// Extract Java imports (Phase 3.2: Java Import Extraction)
|
|
428
|
+
// Creates 'imports' relationships for dependency tracking
|
|
429
|
+
const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
|
|
430
|
+
const importMatches = content.matchAll(JAVA_PATTERNS.import);
|
|
431
|
+
|
|
432
|
+
for (const match of importMatches) {
|
|
433
|
+
const importPath = match[1];
|
|
434
|
+
const isStatic = match[0].includes('static');
|
|
435
|
+
const isWildcard = importPath.endsWith('.*');
|
|
436
|
+
|
|
437
|
+
// Find the line number of this import by counting newlines before match position
|
|
438
|
+
// Note: Uses regex match which creates an array; for truly allocation-free counting,
|
|
439
|
+
// would need a manual loop, but this is fast enough for typical file sizes (<10k lines)
|
|
440
|
+
const importLine = (content.substring(0, match.index).match(/\n/g) || []).length + 1;
|
|
441
|
+
|
|
442
|
+
// Extract the class name for target resolution
|
|
443
|
+
// For "com.example.services.AuthService" -> target_name = "AuthService"
|
|
444
|
+
// For "com.example.services.*" -> target_name = "services" (package - won't resolve)
|
|
445
|
+
// For static "com.example.utils.Constants.MAX_VALUE" -> target_name = "Constants" (class only)
|
|
446
|
+
// For static "com.example.utils.Constants.*" -> target_name = "Constants" (class only)
|
|
447
|
+
const pathWithoutWildcard = importPath.replace(/\.\*$/, '');
|
|
448
|
+
const parts = pathWithoutWildcard.split('.');
|
|
449
|
+
|
|
450
|
+
// Static import logic explanation:
|
|
451
|
+
// - Regular import "com.foo.Bar" → target = "Bar" (last part)
|
|
452
|
+
// - Regular wildcard "com.foo.*" → target = "foo" (last part after removing *)
|
|
453
|
+
// - Static import "com.foo.Bar.METHOD" → target = "Bar" (second-to-last, the class)
|
|
454
|
+
// - Static wildcard "com.foo.Bar.*" → target = "Bar" (last part after removing *, the class)
|
|
455
|
+
// The key insight: static imports reference CLASS members, so we need the class name,
|
|
456
|
+
// not the member name, for entity resolution to work correctly.
|
|
457
|
+
let targetName;
|
|
458
|
+
if (isWildcard && !isStatic) {
|
|
459
|
+
// Regular wildcard: import com.foo.* -> package name (won't resolve to entity)
|
|
460
|
+
targetName = parts[parts.length - 1];
|
|
461
|
+
} else if (isStatic) {
|
|
462
|
+
// Static import: import static com.foo.Bar.METHOD or com.foo.Bar.*
|
|
463
|
+
// The class is second-to-last part (Bar), member is last (METHOD or *)
|
|
464
|
+
// For resolution, we want the CLASS name (Bar), not the member
|
|
465
|
+
targetName = parts.length >= 2 ? parts[parts.length - (isWildcard ? 1 : 2)] : parts[parts.length - 1];
|
|
466
|
+
} else {
|
|
467
|
+
// Regular import: import com.foo.Bar -> class name
|
|
468
|
+
targetName = parts[parts.length - 1];
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
// Skip empty or invalid target names
|
|
472
|
+
if (!targetName || targetName.length === 0) continue;
|
|
473
|
+
|
|
474
|
+
relationships.push({
|
|
475
|
+
source_id: fileEntityId,
|
|
476
|
+
target_id: null, // Will be resolved by resolveRelationshipTargets()
|
|
477
|
+
target_name: targetName,
|
|
478
|
+
full_import_path: importPath, // Store full path for better resolution
|
|
479
|
+
type: 'imports',
|
|
480
|
+
weight: GRAPH_CONFIG.relationshipWeights.imports,
|
|
481
|
+
context_line: importLine,
|
|
482
|
+
is_static: isStatic,
|
|
483
|
+
is_wildcard: isWildcard,
|
|
484
|
+
});
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
// Track current class for method/field association
|
|
488
|
+
let currentClass = null;
|
|
489
|
+
let braceDepth = 0;
|
|
490
|
+
let classStartDepth = 0;
|
|
491
|
+
|
|
492
|
+
for (let i = 0; i < lines.length; i++) {
|
|
493
|
+
const line = lines[i];
|
|
494
|
+
const lineNum = i + 1;
|
|
495
|
+
|
|
496
|
+
// Track brace depth
|
|
497
|
+
braceDepth += (line.match(/{/g) || []).length;
|
|
498
|
+
braceDepth -= (line.match(/}/g) || []).length;
|
|
499
|
+
|
|
500
|
+
// Reset current class when we exit its scope
|
|
501
|
+
if (currentClass && braceDepth < classStartDepth) {
|
|
502
|
+
currentClass = null;
|
|
503
|
+
}
|
|
504
|
+
|
|
505
|
+
// Class declarations
|
|
506
|
+
const classMatch = line.match(/(?:public|private|protected)?\s*(?:static)?\s*(?:final|abstract)?\s*class\s+(\w+)(?:\s+extends\s+(\w+))?(?:\s+implements\s+([\w,\s]+))?/);
|
|
507
|
+
if (classMatch) {
|
|
508
|
+
const className = classMatch[1];
|
|
509
|
+
const extendsClass = classMatch[2];
|
|
510
|
+
const implementsStr = classMatch[3];
|
|
511
|
+
|
|
512
|
+
const id = this.makeId(filePath, 'class', className);
|
|
513
|
+
const entity = {
|
|
514
|
+
id,
|
|
515
|
+
file_path: filePath,
|
|
516
|
+
type: 'class',
|
|
517
|
+
name: className,
|
|
518
|
+
signature: line.trim(),
|
|
519
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
520
|
+
start_line: lineNum,
|
|
521
|
+
end_line: this.findEndLine(lines, i),
|
|
522
|
+
package: this.packageName,
|
|
523
|
+
};
|
|
524
|
+
entities.push(entity);
|
|
525
|
+
currentClass = entity;
|
|
526
|
+
classStartDepth = braceDepth;
|
|
527
|
+
|
|
528
|
+
// Extends relationship
|
|
529
|
+
if (extendsClass) {
|
|
530
|
+
relationships.push({
|
|
531
|
+
source_id: id,
|
|
532
|
+
target_id: this.makeId(filePath, 'class', extendsClass),
|
|
533
|
+
target_name: extendsClass,
|
|
534
|
+
type: 'extends',
|
|
535
|
+
weight: GRAPH_CONFIG.relationshipWeights.extends,
|
|
536
|
+
});
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
// Implements relationships
|
|
540
|
+
if (implementsStr) {
|
|
541
|
+
const interfaces = implementsStr.split(',').map(s => s.trim());
|
|
542
|
+
for (const iface of interfaces) {
|
|
543
|
+
relationships.push({
|
|
544
|
+
source_id: id,
|
|
545
|
+
target_id: this.makeId(filePath, 'interface', iface),
|
|
546
|
+
target_name: iface,
|
|
547
|
+
type: 'implements',
|
|
548
|
+
weight: GRAPH_CONFIG.relationshipWeights.implements,
|
|
549
|
+
});
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
// Interface declarations
|
|
555
|
+
const ifaceMatch = line.match(/(?:public)?\s*interface\s+(\w+)(?:\s+extends\s+([\w,\s]+))?/);
|
|
556
|
+
if (ifaceMatch) {
|
|
557
|
+
const ifaceName = ifaceMatch[1];
|
|
558
|
+
const id = this.makeId(filePath, 'interface', ifaceName);
|
|
559
|
+
|
|
560
|
+
entities.push({
|
|
561
|
+
id,
|
|
562
|
+
file_path: filePath,
|
|
563
|
+
type: 'interface',
|
|
564
|
+
name: ifaceName,
|
|
565
|
+
signature: line.trim(),
|
|
566
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
567
|
+
start_line: lineNum,
|
|
568
|
+
end_line: this.findEndLine(lines, i),
|
|
569
|
+
package: this.packageName,
|
|
570
|
+
});
|
|
571
|
+
|
|
572
|
+
// Extends relationships for interfaces
|
|
573
|
+
const extendsStr = ifaceMatch[2];
|
|
574
|
+
if (extendsStr) {
|
|
575
|
+
const extended = extendsStr.split(',').map(s => s.trim());
|
|
576
|
+
for (const ext of extended) {
|
|
577
|
+
relationships.push({
|
|
578
|
+
source_id: id,
|
|
579
|
+
target_id: this.makeId(filePath, 'interface', ext),
|
|
580
|
+
target_name: ext,
|
|
581
|
+
type: 'extends',
|
|
582
|
+
weight: GRAPH_CONFIG.relationshipWeights.extends,
|
|
583
|
+
});
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
// Method declarations
|
|
589
|
+
const methodMatch = line.match(/(?:@\w+\s*(?:\([^)]*\))?\s*)*(?:public|private|protected)?\s*(?:static)?\s*(?:final)?\s*(?:synchronized)?\s*(?:<[\w\s,<>?]+>\s*)?(\w+(?:<[\w\s,<>?]+>)?(?:\[\])?)\s+(\w+)\s*\(([^)]*)\)/);
|
|
590
|
+
if (methodMatch && !line.includes('class ') && !line.includes('interface ')) {
|
|
591
|
+
const returnType = methodMatch[1];
|
|
592
|
+
const methodName = methodMatch[2];
|
|
593
|
+
const params = methodMatch[3];
|
|
594
|
+
|
|
595
|
+
// Skip if this looks like a constructor
|
|
596
|
+
if (returnType === currentClass?.name) continue;
|
|
597
|
+
|
|
598
|
+
// Build full signature for collision-proof ID (overloaded methods)
|
|
599
|
+
const fullSignature = `${returnType} ${methodName}(${params})`;
|
|
600
|
+
const signatureHash = this.makeSignatureHash(fullSignature);
|
|
601
|
+
|
|
602
|
+
// Use signature hash for disambiguation of overloaded methods
|
|
603
|
+
const id = this.makeId(filePath, 'method', `${currentClass?.name || 'Unknown'}.${methodName}`, {
|
|
604
|
+
signature: fullSignature,
|
|
605
|
+
startLine: lineNum,
|
|
606
|
+
});
|
|
607
|
+
|
|
608
|
+
entities.push({
|
|
609
|
+
id,
|
|
610
|
+
file_path: filePath,
|
|
611
|
+
type: 'method',
|
|
612
|
+
name: methodName,
|
|
613
|
+
signature: fullSignature,
|
|
614
|
+
signature_hash: signatureHash, // Store for backup/restore matching
|
|
615
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
616
|
+
start_line: lineNum,
|
|
617
|
+
end_line: this.findMethodEndLine(lines, i),
|
|
618
|
+
parent_class: currentClass?.name,
|
|
619
|
+
package: this.packageName,
|
|
620
|
+
});
|
|
621
|
+
|
|
622
|
+
// Check for @Override
|
|
623
|
+
if (i > 0 && lines[i - 1].includes('@Override')) {
|
|
624
|
+
relationships.push({
|
|
625
|
+
source_id: id,
|
|
626
|
+
target_id: null, // Will be resolved later
|
|
627
|
+
target_name: methodName,
|
|
628
|
+
type: 'overrides',
|
|
629
|
+
weight: GRAPH_CONFIG.relationshipWeights.overrides,
|
|
630
|
+
});
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
// Method calls (within method bodies)
|
|
635
|
+
const callMatches = line.matchAll(/(\w+)\s*\.\s*(\w+)\s*\(/g);
|
|
636
|
+
for (const callMatch of callMatches) {
|
|
637
|
+
const object = callMatch[1];
|
|
638
|
+
const method = callMatch[2];
|
|
639
|
+
|
|
640
|
+
// Skip common patterns
|
|
641
|
+
if (['System', 'log', 'LOG', 'logger', 'String', 'Integer', 'Long'].includes(object)) continue;
|
|
642
|
+
|
|
643
|
+
relationships.push({
|
|
644
|
+
source_id: currentClass ? this.makeId(filePath, 'class', currentClass.name) : null,
|
|
645
|
+
target_id: null,
|
|
646
|
+
target_name: `${object}.${method}`,
|
|
647
|
+
type: 'calls',
|
|
648
|
+
weight: GRAPH_CONFIG.relationshipWeights.calls,
|
|
649
|
+
context_line: lineNum,
|
|
650
|
+
});
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
// Throw statements
|
|
654
|
+
const throwMatch = line.match(/throw\s+new\s+(\w+)/);
|
|
655
|
+
if (throwMatch && currentClass) {
|
|
656
|
+
relationships.push({
|
|
657
|
+
source_id: this.makeId(filePath, 'class', currentClass.name),
|
|
658
|
+
target_id: null,
|
|
659
|
+
target_name: throwMatch[1],
|
|
660
|
+
type: 'throws',
|
|
661
|
+
weight: GRAPH_CONFIG.relationshipWeights.throws,
|
|
662
|
+
});
|
|
663
|
+
}
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
return { entities, relationships };
|
|
667
|
+
}
|
|
668
|
+
|
|
669
|
+
/**
|
|
670
|
+
* Extract from JavaScript/TypeScript file
|
|
671
|
+
*/
|
|
672
|
+
extractJavaScript(content, lines, filePath) {
|
|
673
|
+
const entities = [];
|
|
674
|
+
const relationships = [];
|
|
675
|
+
const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
|
|
676
|
+
|
|
677
|
+
for (let i = 0; i < lines.length; i++) {
|
|
678
|
+
const line = lines[i];
|
|
679
|
+
const lineNum = i + 1;
|
|
680
|
+
|
|
681
|
+
// --- Entity extraction (if-else chain: first match wins per line) ---
|
|
682
|
+
|
|
683
|
+
const classMatch = line.match(/(?:export\s+(?:default\s+)?)?class\s+(\w+)(?:\s+extends\s+(\w+))?/);
|
|
684
|
+
if (classMatch) {
|
|
685
|
+
const className = classMatch[1];
|
|
686
|
+
const id = this.makeId(filePath, 'class', className);
|
|
687
|
+
entities.push({
|
|
688
|
+
id,
|
|
689
|
+
file_path: filePath,
|
|
690
|
+
type: 'class',
|
|
691
|
+
name: className,
|
|
692
|
+
signature: line.trim(),
|
|
693
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
694
|
+
start_line: lineNum,
|
|
695
|
+
end_line: this.findEndLine(lines, i),
|
|
696
|
+
});
|
|
697
|
+
if (classMatch[2]) {
|
|
698
|
+
relationships.push({
|
|
699
|
+
source_id: id,
|
|
700
|
+
target_id: null,
|
|
701
|
+
target_name: classMatch[2],
|
|
702
|
+
type: 'extends',
|
|
703
|
+
weight: GRAPH_CONFIG.relationshipWeights.extends,
|
|
704
|
+
});
|
|
705
|
+
}
|
|
706
|
+
} else {
|
|
707
|
+
const funcMatch = line.match(/(?:export\s+(?:default\s+)?)?(?:async\s+)?function\s*\*?\s+(\w+)\s*\(/);
|
|
708
|
+
if (funcMatch) {
|
|
709
|
+
const sig = line.trim().slice(0, 100);
|
|
710
|
+
entities.push({
|
|
711
|
+
id: this.makeId(filePath, 'function', funcMatch[1], { signature: sig, startLine: lineNum }),
|
|
712
|
+
file_path: filePath,
|
|
713
|
+
type: 'function',
|
|
714
|
+
name: funcMatch[1],
|
|
715
|
+
signature: sig,
|
|
716
|
+
signature_hash: this.makeSignatureHash(sig),
|
|
717
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
718
|
+
start_line: lineNum,
|
|
719
|
+
end_line: this.findEndLine(lines, i),
|
|
720
|
+
});
|
|
721
|
+
} else {
|
|
722
|
+
const componentMatch = line.match(/(?:export\s+)?(?:const|function)\s+([A-Z]\w+)\s*[=:]/);
|
|
723
|
+
if (componentMatch) {
|
|
724
|
+
const sig = line.trim().slice(0, 100);
|
|
725
|
+
entities.push({
|
|
726
|
+
id: this.makeId(filePath, 'component', componentMatch[1], { startLine: lineNum }),
|
|
727
|
+
file_path: filePath,
|
|
728
|
+
type: 'component',
|
|
729
|
+
name: componentMatch[1],
|
|
730
|
+
signature: sig,
|
|
731
|
+
signature_hash: this.makeSignatureHash(sig),
|
|
732
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
733
|
+
start_line: lineNum,
|
|
734
|
+
end_line: this.findEndLine(lines, i),
|
|
735
|
+
});
|
|
736
|
+
} else {
|
|
737
|
+
const arrowMatch = line.match(/(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s*)?\([^)]*\)\s*=>/);
|
|
738
|
+
if (arrowMatch) {
|
|
739
|
+
const sig = line.trim().slice(0, 100);
|
|
740
|
+
entities.push({
|
|
741
|
+
id: this.makeId(filePath, 'arrowFunction', arrowMatch[1], { signature: sig, startLine: lineNum }),
|
|
742
|
+
file_path: filePath,
|
|
743
|
+
type: 'arrowFunction',
|
|
744
|
+
name: arrowMatch[1],
|
|
745
|
+
signature: sig,
|
|
746
|
+
signature_hash: this.makeSignatureHash(sig),
|
|
747
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
748
|
+
start_line: lineNum,
|
|
749
|
+
end_line: this.findEndLine(lines, i),
|
|
750
|
+
});
|
|
751
|
+
} else {
|
|
752
|
+
const objArrowMatch = line.match(/(\w+)\s*:\s*(?:async\s*)?\([^)]*\)\s*=>/);
|
|
753
|
+
if (objArrowMatch) {
|
|
754
|
+
entities.push({
|
|
755
|
+
id: this.makeId(filePath, 'arrowFunction', objArrowMatch[1], { startLine: lineNum }),
|
|
756
|
+
file_path: filePath,
|
|
757
|
+
type: 'arrowFunction',
|
|
758
|
+
name: objArrowMatch[1],
|
|
759
|
+
signature: line.trim().slice(0, 100),
|
|
760
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
761
|
+
start_line: lineNum,
|
|
762
|
+
end_line: this.findEndLine(lines, i),
|
|
763
|
+
});
|
|
764
|
+
} else {
|
|
765
|
+
const objMethodMatch = line.match(/^\s+(\w+)\s*\([^)]*\)\s*\{/);
|
|
766
|
+
if (objMethodMatch && !JS_RESERVED_WORDS.has(objMethodMatch[1])) {
|
|
767
|
+
entities.push({
|
|
768
|
+
id: this.makeId(filePath, 'method', objMethodMatch[1], { startLine: lineNum }),
|
|
769
|
+
file_path: filePath,
|
|
770
|
+
type: 'method',
|
|
771
|
+
name: objMethodMatch[1],
|
|
772
|
+
signature: line.trim().slice(0, 100),
|
|
773
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
774
|
+
start_line: lineNum,
|
|
775
|
+
end_line: this.findEndLine(lines, i),
|
|
776
|
+
});
|
|
777
|
+
}
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
// --- Relationship extraction ---
|
|
785
|
+
|
|
786
|
+
// Module-level import patterns (ESM import, CJS require, re-export, dynamic import)
|
|
787
|
+
for (const { regex, group } of JS_IMPORT_PATTERNS) {
|
|
788
|
+
const m = line.match(regex);
|
|
789
|
+
if (m) {
|
|
790
|
+
const source = m[group];
|
|
791
|
+
if (source && !source.startsWith('.')) {
|
|
792
|
+
relationships.push({
|
|
793
|
+
source_id: fileEntityId,
|
|
794
|
+
target_id: null,
|
|
795
|
+
target_name: source,
|
|
796
|
+
type: 'imports',
|
|
797
|
+
weight: GRAPH_CONFIG.relationshipWeights.imports,
|
|
798
|
+
});
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
// Destructured require — per-name import relationships
|
|
804
|
+
this._appendDestructuredRequireRelationships(line, fileEntityId, relationships);
|
|
805
|
+
|
|
806
|
+
// Method call relationships
|
|
807
|
+
const methodCalls = line.matchAll(/(\w+)\s*\.\s*(\w+)\s*\(/g);
|
|
808
|
+
for (const callMatch of methodCalls) {
|
|
809
|
+
const obj = callMatch[1];
|
|
810
|
+
const method = callMatch[2];
|
|
811
|
+
if (!obj || !method || JS_CALL_SKIP_OBJECTS.has(obj)) continue;
|
|
812
|
+
relationships.push({
|
|
813
|
+
source_id: fileEntityId,
|
|
814
|
+
target_id: null,
|
|
815
|
+
target_name: `${obj}.${method}`,
|
|
816
|
+
type: 'calls',
|
|
817
|
+
weight: GRAPH_CONFIG.relationshipWeights.calls,
|
|
818
|
+
});
|
|
819
|
+
}
|
|
820
|
+
}
|
|
821
|
+
|
|
822
|
+
return { entities, relationships };
|
|
823
|
+
}
|
|
824
|
+
|
|
825
|
+
/**
|
|
826
|
+
* Extract from Proto file
|
|
827
|
+
*/
|
|
828
|
+
extractProto(content, lines, filePath) {
|
|
829
|
+
const entities = [];
|
|
830
|
+
const relationships = [];
|
|
831
|
+
|
|
832
|
+
for (let i = 0; i < lines.length; i++) {
|
|
833
|
+
const line = lines[i];
|
|
834
|
+
const lineNum = i + 1;
|
|
835
|
+
|
|
836
|
+
// Message declarations
|
|
837
|
+
const msgMatch = line.match(/message\s+(\w+)\s*\{/);
|
|
838
|
+
if (msgMatch) {
|
|
839
|
+
entities.push({
|
|
840
|
+
id: this.makeId(filePath, 'message', msgMatch[1]),
|
|
841
|
+
file_path: filePath,
|
|
842
|
+
type: 'message',
|
|
843
|
+
name: msgMatch[1],
|
|
844
|
+
signature: line.trim(),
|
|
845
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
846
|
+
start_line: lineNum,
|
|
847
|
+
end_line: this.findEndLine(lines, i),
|
|
848
|
+
});
|
|
849
|
+
}
|
|
850
|
+
|
|
851
|
+
// Service declarations
|
|
852
|
+
const svcMatch = line.match(/service\s+(\w+)\s*\{/);
|
|
853
|
+
if (svcMatch) {
|
|
854
|
+
entities.push({
|
|
855
|
+
id: this.makeId(filePath, 'service', svcMatch[1]),
|
|
856
|
+
file_path: filePath,
|
|
857
|
+
type: 'service',
|
|
858
|
+
name: svcMatch[1],
|
|
859
|
+
signature: line.trim(),
|
|
860
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
861
|
+
start_line: lineNum,
|
|
862
|
+
end_line: this.findEndLine(lines, i),
|
|
863
|
+
});
|
|
864
|
+
}
|
|
865
|
+
|
|
866
|
+
// RPC declarations
|
|
867
|
+
const rpcMatch = line.match(/rpc\s+(\w+)\s*\(\s*(\w+)\s*\)\s+returns\s+\(\s*(\w+)\s*\)/);
|
|
868
|
+
if (rpcMatch) {
|
|
869
|
+
const rpcName = rpcMatch[1];
|
|
870
|
+
const inputType = rpcMatch[2];
|
|
871
|
+
const outputType = rpcMatch[3];
|
|
872
|
+
|
|
873
|
+
const id = this.makeId(filePath, 'rpc', rpcName);
|
|
874
|
+
entities.push({
|
|
875
|
+
id,
|
|
876
|
+
file_path: filePath,
|
|
877
|
+
type: 'rpc',
|
|
878
|
+
name: rpcName,
|
|
879
|
+
signature: line.trim(),
|
|
880
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
881
|
+
start_line: lineNum,
|
|
882
|
+
end_line: lineNum,
|
|
883
|
+
});
|
|
884
|
+
|
|
885
|
+
// RPC uses input and output messages
|
|
886
|
+
relationships.push({
|
|
887
|
+
source_id: id,
|
|
888
|
+
target_id: null,
|
|
889
|
+
target_name: inputType,
|
|
890
|
+
type: 'uses',
|
|
891
|
+
weight: GRAPH_CONFIG.relationshipWeights.uses,
|
|
892
|
+
});
|
|
893
|
+
relationships.push({
|
|
894
|
+
source_id: id,
|
|
895
|
+
target_id: null,
|
|
896
|
+
target_name: outputType,
|
|
897
|
+
type: 'uses',
|
|
898
|
+
weight: GRAPH_CONFIG.relationshipWeights.uses,
|
|
899
|
+
});
|
|
900
|
+
}
|
|
901
|
+
}
|
|
902
|
+
|
|
903
|
+
return { entities, relationships };
|
|
904
|
+
}
|
|
905
|
+
|
|
906
|
+
/**
|
|
907
|
+
* Generic extraction using registry patterns.
|
|
908
|
+
* Works for all languages that have graph patterns in language-patterns.js.
|
|
909
|
+
*/
|
|
910
|
+
extractGeneric(content, lines, filePath, langInfo) {
|
|
911
|
+
const entities = [];
|
|
912
|
+
const relationships = [];
|
|
913
|
+
const { graph, id: language } = langInfo;
|
|
914
|
+
const {
|
|
915
|
+
entityPatterns,
|
|
916
|
+
relationshipPatterns,
|
|
917
|
+
methodCallPattern,
|
|
918
|
+
methodCallPrefilter,
|
|
919
|
+
} = this.getGenericPatternPlan(language, graph);
|
|
920
|
+
const skipCallObjects = new Set(graph.skipCallObjects || []);
|
|
921
|
+
const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
|
|
922
|
+
const jsonDependencySections = new Set(['dependencies', 'devDependencies', 'peerDependencies']);
|
|
923
|
+
let jsonBraceDepth = 0;
|
|
924
|
+
let activeJsonDependencyDepth = null;
|
|
925
|
+
// Track active entity scopes to attribute call source_id by lexical range.
|
|
926
|
+
const activeEntityScopes = [];
|
|
927
|
+
|
|
928
|
+
// Choose findEndLine strategy based on language type
|
|
929
|
+
const findEndLineFn = (startIdx) => {
|
|
930
|
+
if (langInfo.indentBased) {
|
|
931
|
+
return this.findEndLineIndent(lines, startIdx);
|
|
932
|
+
}
|
|
933
|
+
if (langInfo.endKeyword) {
|
|
934
|
+
return this.findEndLineKeyword(lines, startIdx, langInfo.endKeyword, langInfo.blockKeywords);
|
|
935
|
+
}
|
|
936
|
+
return this.findEndLine(lines, startIdx);
|
|
937
|
+
};
|
|
938
|
+
|
|
939
|
+
for (let i = 0; i < lines.length; i++) {
|
|
940
|
+
const line = lines[i];
|
|
941
|
+
const trimmed = line.trimStart();
|
|
942
|
+
const lineNum = i + 1;
|
|
943
|
+
while (
|
|
944
|
+
activeEntityScopes.length > 0 &&
|
|
945
|
+
activeEntityScopes[activeEntityScopes.length - 1].end_line < lineNum
|
|
946
|
+
) {
|
|
947
|
+
activeEntityScopes.pop();
|
|
948
|
+
}
|
|
949
|
+
const openBraces = (line.match(/{/g) || []).length;
|
|
950
|
+
const closeBraces = (line.match(/}/g) || []).length;
|
|
951
|
+
const depthBefore = jsonBraceDepth;
|
|
952
|
+
const depthAfter = depthBefore + openBraces - closeBraces;
|
|
953
|
+
if (trimmed.length > this.maxRegexLineLength) {
|
|
954
|
+
this._recordLongLineSkip(language, lineNum, trimmed.length);
|
|
955
|
+
if (language === 'json') {
|
|
956
|
+
if (activeJsonDependencyDepth !== null && depthAfter < activeJsonDependencyDepth) {
|
|
957
|
+
activeJsonDependencyDepth = null;
|
|
958
|
+
}
|
|
959
|
+
jsonBraceDepth = depthAfter;
|
|
960
|
+
}
|
|
961
|
+
continue;
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
// JSON dependency extraction:
|
|
965
|
+
// "dependencies"/"devDependencies"/"peerDependencies" are section markers.
|
|
966
|
+
// Actual imports are package keys inside those objects.
|
|
967
|
+
if (language === 'json' && activeJsonDependencyDepth !== null && depthBefore === activeJsonDependencyDepth) {
|
|
968
|
+
const depEntry = trimmed.match(/^"([^"]+)"\s*:\s*"([^"]+)"/);
|
|
969
|
+
if (depEntry && depEntry[1]) {
|
|
970
|
+
relationships.push({
|
|
971
|
+
source_id: fileEntityId,
|
|
972
|
+
target_id: null,
|
|
973
|
+
target_name: depEntry[1],
|
|
974
|
+
type: 'imports',
|
|
975
|
+
weight: GRAPH_CONFIG.relationshipWeights.imports,
|
|
976
|
+
context_line: lineNum,
|
|
977
|
+
});
|
|
978
|
+
}
|
|
979
|
+
}
|
|
980
|
+
|
|
981
|
+
// Entity extraction
|
|
982
|
+
for (const { type, pattern, prefilter } of entityPatterns) {
|
|
983
|
+
if (prefilter && !prefilter(trimmed)) continue;
|
|
984
|
+
const match = trimmed.match(pattern);
|
|
985
|
+
if (match) {
|
|
986
|
+
const name = match[1];
|
|
987
|
+
if (!name) {
|
|
988
|
+
this._recordEmptyCapture('entity', language, type, lineNum, trimmed);
|
|
989
|
+
continue;
|
|
990
|
+
}
|
|
991
|
+
const sig = trimmed.slice(0, 120);
|
|
992
|
+
const sigHash = this.makeSignatureHash(sig);
|
|
993
|
+
const entityId = this.makeId(filePath, type, name, { signature: sig, startLine: lineNum });
|
|
994
|
+
const endLine = findEndLineFn(i);
|
|
995
|
+
|
|
996
|
+
entities.push({
|
|
997
|
+
id: entityId,
|
|
998
|
+
file_path: filePath,
|
|
999
|
+
type,
|
|
1000
|
+
name,
|
|
1001
|
+
signature: sig,
|
|
1002
|
+
signature_hash: sigHash,
|
|
1003
|
+
doc_comment: this.extractDocComment(lines, i),
|
|
1004
|
+
start_line: lineNum,
|
|
1005
|
+
end_line: endLine,
|
|
1006
|
+
});
|
|
1007
|
+
activeEntityScopes.push({ id: entityId, start_line: lineNum, end_line: endLine });
|
|
1008
|
+
break; // one entity per line
|
|
1009
|
+
}
|
|
1010
|
+
}
|
|
1011
|
+
|
|
1012
|
+
// Relationship extraction
|
|
1013
|
+
const sourceEntityId = activeEntityScopes.length > 0
|
|
1014
|
+
? activeEntityScopes[activeEntityScopes.length - 1].id
|
|
1015
|
+
: null;
|
|
1016
|
+
// Method calls need special handling: group1=object, group2=method.
|
|
1017
|
+
// Reuse compiled global regex to avoid per-line RegExp allocations.
|
|
1018
|
+
if (methodCallPattern && (!methodCallPrefilter || methodCallPrefilter(trimmed))) {
|
|
1019
|
+
methodCallPattern.lastIndex = 0;
|
|
1020
|
+
let m;
|
|
1021
|
+
while ((m = methodCallPattern.exec(trimmed)) !== null) {
|
|
1022
|
+
const obj = m[1];
|
|
1023
|
+
const method = m[2];
|
|
1024
|
+
if (!obj || !method) {
|
|
1025
|
+
this._recordEmptyCapture('relationship', language, 'methodCall', lineNum, trimmed);
|
|
1026
|
+
if (m[0] === '') methodCallPattern.lastIndex++;
|
|
1027
|
+
continue;
|
|
1028
|
+
}
|
|
1029
|
+
if (skipCallObjects.has(obj)) {
|
|
1030
|
+
if (m[0] === '') methodCallPattern.lastIndex++;
|
|
1031
|
+
continue;
|
|
1032
|
+
}
|
|
1033
|
+
relationships.push({
|
|
1034
|
+
source_id: sourceEntityId,
|
|
1035
|
+
target_id: null,
|
|
1036
|
+
target_name: `${obj}.${method}`,
|
|
1037
|
+
type: 'calls',
|
|
1038
|
+
weight: GRAPH_CONFIG.relationshipWeights.calls,
|
|
1039
|
+
context_line: lineNum,
|
|
1040
|
+
});
|
|
1041
|
+
if (m[0] === '') methodCallPattern.lastIndex++;
|
|
1042
|
+
}
|
|
1043
|
+
}
|
|
1044
|
+
|
|
1045
|
+
this._appendDestructuredRequireRelationships(trimmed, sourceEntityId || fileEntityId, relationships);
|
|
1046
|
+
|
|
1047
|
+
for (const { type: relType, pattern, prefilter } of relationshipPatterns) {
|
|
1048
|
+
if (relType === 'methodCall') continue;
|
|
1049
|
+
if (prefilter && !prefilter(trimmed)) continue;
|
|
1050
|
+
|
|
1051
|
+
const match = trimmed.match(pattern);
|
|
1052
|
+
if (relType === 'dep' && language === 'json') {
|
|
1053
|
+
if (match && match[1] && jsonDependencySections.has(match[1]) && depthAfter > depthBefore) {
|
|
1054
|
+
activeJsonDependencyDepth = depthAfter;
|
|
1055
|
+
}
|
|
1056
|
+
continue;
|
|
1057
|
+
}
|
|
1058
|
+
if (match) {
|
|
1059
|
+
const { targets, filtered } = this._resolveRelationshipTargets(relType, match, language);
|
|
1060
|
+
if (targets.length === 0) {
|
|
1061
|
+
if (!filtered) this._recordEmptyCapture('relationship', language, relType, lineNum, trimmed);
|
|
1062
|
+
continue;
|
|
1063
|
+
}
|
|
1064
|
+
const mappedType = GENERIC_RELATIONSHIP_MAPPING[relType] || 'uses';
|
|
1065
|
+
const weight = GRAPH_CONFIG.relationshipWeights[mappedType] || 1.0;
|
|
1066
|
+
for (const target of targets) {
|
|
1067
|
+
relationships.push({
|
|
1068
|
+
source_id: sourceEntityId || fileEntityId,
|
|
1069
|
+
target_id: null,
|
|
1070
|
+
target_name: target,
|
|
1071
|
+
type: mappedType,
|
|
1072
|
+
weight,
|
|
1073
|
+
context_line: lineNum,
|
|
1074
|
+
});
|
|
1075
|
+
}
|
|
1076
|
+
}
|
|
1077
|
+
}
|
|
1078
|
+
|
|
1079
|
+
if (language === 'json') {
|
|
1080
|
+
if (activeJsonDependencyDepth !== null && depthAfter < activeJsonDependencyDepth) {
|
|
1081
|
+
activeJsonDependencyDepth = null;
|
|
1082
|
+
}
|
|
1083
|
+
jsonBraceDepth = depthAfter;
|
|
1084
|
+
}
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
return { entities, relationships };
|
|
1088
|
+
}
|
|
1089
|
+
|
|
1090
|
+
getGenericPatternPlan(language, graph) {
|
|
1091
|
+
const cached = this.genericPatternPlanCache.get(language);
|
|
1092
|
+
if (cached) return cached;
|
|
1093
|
+
|
|
1094
|
+
const entityPatterns = Object.entries(graph.entities || {}).map(([type, pattern]) => ({
|
|
1095
|
+
type,
|
|
1096
|
+
pattern,
|
|
1097
|
+
prefilter: this.getPatternPrefilter(pattern),
|
|
1098
|
+
}));
|
|
1099
|
+
const relationshipPatterns = Object.entries(graph.relationships || {}).map(([type, pattern]) => ({
|
|
1100
|
+
type,
|
|
1101
|
+
pattern,
|
|
1102
|
+
prefilter: this.getPatternPrefilter(pattern),
|
|
1103
|
+
}));
|
|
1104
|
+
|
|
1105
|
+
const methodCallEntry = relationshipPatterns.find((entry) => entry.type === 'methodCall');
|
|
1106
|
+
const methodCallPattern = methodCallEntry
|
|
1107
|
+
? this.getCachedGlobalRegex(language, methodCallEntry.pattern)
|
|
1108
|
+
: null;
|
|
1109
|
+
const plan = {
|
|
1110
|
+
entityPatterns,
|
|
1111
|
+
relationshipPatterns,
|
|
1112
|
+
methodCallPattern,
|
|
1113
|
+
methodCallPrefilter: methodCallEntry?.prefilter || null,
|
|
1114
|
+
};
|
|
1115
|
+
this.genericPatternPlanCache.set(language, plan);
|
|
1116
|
+
return plan;
|
|
1117
|
+
}
|
|
1118
|
+
|
|
1119
|
+
getCachedGlobalRegex(language, pattern) {
|
|
1120
|
+
const key = `${language}:${pattern.source}:${pattern.flags}`;
|
|
1121
|
+
const cached = this.methodCallRegexCache.get(key);
|
|
1122
|
+
if (cached) return cached;
|
|
1123
|
+
|
|
1124
|
+
const uniqueFlags = [...new Set(`${pattern.flags || ''}g`)].join('');
|
|
1125
|
+
const compiled = new RegExp(pattern.source, uniqueFlags);
|
|
1126
|
+
this.methodCallRegexCache.set(key, compiled);
|
|
1127
|
+
return compiled;
|
|
1128
|
+
}
|
|
1129
|
+
|
|
1130
|
+
getPatternPrefilter(pattern) {
|
|
1131
|
+
const key = `${pattern.source}:${pattern.flags}`;
|
|
1132
|
+
if (this.patternPrefilterCache.has(key)) {
|
|
1133
|
+
return this.patternPrefilterCache.get(key);
|
|
1134
|
+
}
|
|
1135
|
+
|
|
1136
|
+
const caseInsensitive = pattern.flags.includes('i');
|
|
1137
|
+
let tokens = this.extractLineStartTokens(pattern.source);
|
|
1138
|
+
const optionalPrefixMatch = pattern.source.match(/^\^(\\?.)\?/);
|
|
1139
|
+
if (optionalPrefixMatch && tokens.length > 0) {
|
|
1140
|
+
const prefix = optionalPrefixMatch[1].startsWith('\\')
|
|
1141
|
+
? optionalPrefixMatch[1].slice(1)
|
|
1142
|
+
: optionalPrefixMatch[1];
|
|
1143
|
+
tokens = [...tokens, ...tokens.map((token) => `${prefix}${token}`)];
|
|
1144
|
+
}
|
|
1145
|
+
if (tokens.length === 0) {
|
|
1146
|
+
this.patternPrefilterCache.set(key, null);
|
|
1147
|
+
return null;
|
|
1148
|
+
}
|
|
1149
|
+
|
|
1150
|
+
const normalizedTokens = caseInsensitive
|
|
1151
|
+
? [...new Set(tokens.map((t) => t.toLowerCase()))]
|
|
1152
|
+
: [...new Set(tokens)];
|
|
1153
|
+
const prefilter = (line) => {
|
|
1154
|
+
const value = caseInsensitive ? line.toLowerCase() : line;
|
|
1155
|
+
return normalizedTokens.some((token) => value.startsWith(token));
|
|
1156
|
+
};
|
|
1157
|
+
this.patternPrefilterCache.set(key, prefilter);
|
|
1158
|
+
return prefilter;
|
|
1159
|
+
}
|
|
1160
|
+
|
|
1161
|
+
extractLineStartTokens(source) {
|
|
1162
|
+
if (!source.startsWith('^')) return [];
|
|
1163
|
+
|
|
1164
|
+
let i = 1;
|
|
1165
|
+
const tokens = [];
|
|
1166
|
+
|
|
1167
|
+
const skipLeadingWhitespace = () => {
|
|
1168
|
+
if (source.slice(i).startsWith('\\s*')) {
|
|
1169
|
+
i += 3;
|
|
1170
|
+
return true;
|
|
1171
|
+
}
|
|
1172
|
+
if (source.slice(i).startsWith('\\s+')) {
|
|
1173
|
+
i += 3;
|
|
1174
|
+
return true;
|
|
1175
|
+
}
|
|
1176
|
+
return false;
|
|
1177
|
+
};
|
|
1178
|
+
|
|
1179
|
+
while (skipLeadingWhitespace()) {}
|
|
1180
|
+
|
|
1181
|
+
while (source.slice(i).startsWith('(?:')) {
|
|
1182
|
+
const start = i + 3;
|
|
1183
|
+
let depth = 1;
|
|
1184
|
+
let j = start;
|
|
1185
|
+
let inClass = false;
|
|
1186
|
+
while (j < source.length && depth > 0) {
|
|
1187
|
+
const ch = source[j];
|
|
1188
|
+
if (ch === '\\') {
|
|
1189
|
+
j += 2;
|
|
1190
|
+
continue;
|
|
1191
|
+
}
|
|
1192
|
+
if (ch === '[') inClass = true;
|
|
1193
|
+
else if (ch === ']' && inClass) inClass = false;
|
|
1194
|
+
else if (!inClass && ch === '(') depth++;
|
|
1195
|
+
else if (!inClass && ch === ')') depth--;
|
|
1196
|
+
j++;
|
|
1197
|
+
}
|
|
1198
|
+
if (depth !== 0) return [];
|
|
1199
|
+
|
|
1200
|
+
const groupEnd = j - 1;
|
|
1201
|
+
const groupContent = source.slice(start, groupEnd);
|
|
1202
|
+
const isOptional = source[groupEnd + 1] === '?';
|
|
1203
|
+
if (!isOptional) {
|
|
1204
|
+
const alternatives = groupContent.split('|').map((alt) => alt.trim()).filter(Boolean);
|
|
1205
|
+
const altTokens = [];
|
|
1206
|
+
for (const alt of alternatives) {
|
|
1207
|
+
const token = this.extractLiteralPrefix(alt);
|
|
1208
|
+
if (!token) return [];
|
|
1209
|
+
altTokens.push(token);
|
|
1210
|
+
}
|
|
1211
|
+
tokens.push(...altTokens);
|
|
1212
|
+
return [...new Set(tokens)];
|
|
1213
|
+
}
|
|
1214
|
+
const optionalAlternatives = groupContent.split('|').map((alt) => alt.trim()).filter(Boolean);
|
|
1215
|
+
for (const alt of optionalAlternatives) {
|
|
1216
|
+
const token = this.extractLiteralPrefix(alt);
|
|
1217
|
+
if (token) tokens.push(token);
|
|
1218
|
+
}
|
|
1219
|
+
i = groupEnd + 2;
|
|
1220
|
+
while (skipLeadingWhitespace()) {}
|
|
1221
|
+
}
|
|
1222
|
+
|
|
1223
|
+
const literal = this.extractLiteralPrefix(source.slice(i));
|
|
1224
|
+
if (!literal) {
|
|
1225
|
+
// If no mandatory literal prefix can be derived, disable prefilter to avoid false negatives.
|
|
1226
|
+
return [];
|
|
1227
|
+
}
|
|
1228
|
+
tokens.push(literal);
|
|
1229
|
+
return [...new Set(tokens)];
|
|
1230
|
+
}
|
|
1231
|
+
|
|
1232
|
+
extractLiteralPrefix(fragment) {
|
|
1233
|
+
let result = '';
|
|
1234
|
+
|
|
1235
|
+
for (let i = 0; i < fragment.length; i++) {
|
|
1236
|
+
const ch = fragment[i];
|
|
1237
|
+
if (ch === '\\') {
|
|
1238
|
+
const next = fragment[i + 1];
|
|
1239
|
+
if (!next) break;
|
|
1240
|
+
if (/[A-Za-z0-9]/.test(next)) break;
|
|
1241
|
+
result += next;
|
|
1242
|
+
i++;
|
|
1243
|
+
continue;
|
|
1244
|
+
}
|
|
1245
|
+
if (fragment[i + 1] === '?' && result.length === 0 && /[@#<./:_-]/.test(ch)) {
|
|
1246
|
+
// Skip optional leading literal chars (e.g. -?include).
|
|
1247
|
+
i++;
|
|
1248
|
+
continue;
|
|
1249
|
+
}
|
|
1250
|
+
if (/[A-Za-z0-9_@#<./:-]/.test(ch)) {
|
|
1251
|
+
result += ch;
|
|
1252
|
+
continue;
|
|
1253
|
+
}
|
|
1254
|
+
break;
|
|
1255
|
+
}
|
|
1256
|
+
|
|
1257
|
+
return result;
|
|
1258
|
+
}
|
|
1259
|
+
|
|
1260
|
+
expandRelationshipTargets(relType, target) {
|
|
1261
|
+
if (typeof target !== 'string') return [target];
|
|
1262
|
+
if (!MULTI_TARGET_TYPES.has(relType)) return [target];
|
|
1263
|
+
|
|
1264
|
+
// Bracket-depth-aware top-level comma splitter.
|
|
1265
|
+
// Naive .split(',') would break generics: Base<Foo, Bar>, IFace
|
|
1266
|
+
const parts = splitTopLevelCommas(target);
|
|
1267
|
+
|
|
1268
|
+
return parts
|
|
1269
|
+
.map((entry) => entry.trim()
|
|
1270
|
+
.replace(/\s+as\s+\w+$/i, '') // import aliases
|
|
1271
|
+
.replace(/^(?:(?:public|protected|private|virtual)\s+)+/, '') // C++ access specifiers
|
|
1272
|
+
.replace(/<.*$/, '') // strip generics from first <: Map<K, V> → Map
|
|
1273
|
+
.replace(/\([^)]*\)/g, '') // strip constructor args: Base(x) → Base
|
|
1274
|
+
.replace(/[;{}]+$/, '') // strip trailing punctuation
|
|
1275
|
+
.trim()
|
|
1276
|
+
)
|
|
1277
|
+
.filter(Boolean);
|
|
1278
|
+
}
|
|
1279
|
+
|
|
1280
|
+
_normalizeTreeSitterEntities(filePath, symbols, language) {
|
|
1281
|
+
const dedupedBySymbolAndLine = new Map();
|
|
1282
|
+
|
|
1283
|
+
for (const sym of symbols) {
|
|
1284
|
+
if (!sym?.name || !sym?.type) continue;
|
|
1285
|
+
const normalizedType = this._normalizeTreeSitterSymbolType(sym.type, sym.name);
|
|
1286
|
+
if ((language === 'javascript' || language === 'typescript') && normalizedType === 'variable') {
|
|
1287
|
+
continue;
|
|
1288
|
+
}
|
|
1289
|
+
const startLine = Number.isInteger(sym.startLine) ? sym.startLine : 0;
|
|
1290
|
+
const endLine = Number.isInteger(sym.endLine) ? sym.endLine : startLine;
|
|
1291
|
+
const rank = TREE_SITTER_ENTITY_PRIORITY[normalizedType] || 0;
|
|
1292
|
+
const key = `${sym.name}:${startLine}`;
|
|
1293
|
+
const existing = dedupedBySymbolAndLine.get(key);
|
|
1294
|
+
|
|
1295
|
+
if (!existing || rank > existing.rank) {
|
|
1296
|
+
dedupedBySymbolAndLine.set(key, {
|
|
1297
|
+
id: this._makeEntityId(filePath, sym.name, normalizedType, startLine),
|
|
1298
|
+
file_path: filePath,
|
|
1299
|
+
type: normalizedType,
|
|
1300
|
+
name: sym.name,
|
|
1301
|
+
signature: sym.signature || null,
|
|
1302
|
+
start_line: startLine + 1, // tree-sitter is 0-indexed
|
|
1303
|
+
end_line: endLine + 1,
|
|
1304
|
+
rank,
|
|
1305
|
+
});
|
|
1306
|
+
}
|
|
1307
|
+
}
|
|
1308
|
+
|
|
1309
|
+
return Array.from(dedupedBySymbolAndLine.values())
|
|
1310
|
+
.sort((a, b) => a.start_line - b.start_line)
|
|
1311
|
+
.map(({ rank, ...entity }) => entity);
|
|
1312
|
+
}
|
|
1313
|
+
|
|
1314
|
+
_normalizeTreeSitterSymbolType(type, name) {
|
|
1315
|
+
if (type === 'arrowFunction' && /^[A-Z]/.test(name)) {
|
|
1316
|
+
return 'component';
|
|
1317
|
+
}
|
|
1318
|
+
return type;
|
|
1319
|
+
}
|
|
1320
|
+
|
|
1321
|
+
_resolveRelationshipTargets(relType, match, language) {
|
|
1322
|
+
const isJsTs = language === 'javascript' || language === 'typescript';
|
|
1323
|
+
|
|
1324
|
+
if (isJsTs && relType === 'import') {
|
|
1325
|
+
const source = match[3]?.trim();
|
|
1326
|
+
if (!source) return { targets: [], filtered: false };
|
|
1327
|
+
if (source.startsWith('.')) return { targets: [], filtered: true };
|
|
1328
|
+
return { targets: [source], filtered: false };
|
|
1329
|
+
}
|
|
1330
|
+
|
|
1331
|
+
if (isJsTs && (relType === 'require' || relType === 'reexport' || relType === 'dynamicImport')) {
|
|
1332
|
+
const source = match[1]?.trim();
|
|
1333
|
+
if (!source) return { targets: [], filtered: false };
|
|
1334
|
+
if (source.startsWith('.')) return { targets: [], filtered: true };
|
|
1335
|
+
return { targets: [source], filtered: false };
|
|
1336
|
+
}
|
|
1337
|
+
|
|
1338
|
+
const rawTarget = typeof match[1] === 'string' ? match[1].trim() : match[1];
|
|
1339
|
+
if (!rawTarget) return { targets: [], filtered: false };
|
|
1340
|
+
|
|
1341
|
+
return {
|
|
1342
|
+
targets: this.expandRelationshipTargets(relType, rawTarget),
|
|
1343
|
+
filtered: false,
|
|
1344
|
+
};
|
|
1345
|
+
}
|
|
1346
|
+
|
|
1347
|
+
_appendDestructuredRequireRelationships(line, sourceId, relationships) {
|
|
1348
|
+
const destructuredRequire = line.match(/(?:const|let|var)\s+\{([^}]+)\}\s*=\s*require\s*\(\s*['"]([^'"]+)['"]\s*\)/);
|
|
1349
|
+
if (!destructuredRequire) return;
|
|
1350
|
+
|
|
1351
|
+
const names = this._extractDestructuredRequireNames(destructuredRequire[1]);
|
|
1352
|
+
for (const name of names) {
|
|
1353
|
+
relationships.push({
|
|
1354
|
+
source_id: sourceId,
|
|
1355
|
+
target_id: null,
|
|
1356
|
+
target_name: name,
|
|
1357
|
+
type: 'imports',
|
|
1358
|
+
weight: GRAPH_CONFIG.relationshipWeights.imports,
|
|
1359
|
+
});
|
|
1360
|
+
}
|
|
1361
|
+
}
|
|
1362
|
+
|
|
1363
|
+
_extractDestructuredRequireNames(rawNames) {
|
|
1364
|
+
return rawNames
|
|
1365
|
+
.split(',')
|
|
1366
|
+
.map(part => part.trim())
|
|
1367
|
+
.map((name) => {
|
|
1368
|
+
if (!name) return null;
|
|
1369
|
+
|
|
1370
|
+
// JS destructuring alias: { readFile: read }.
|
|
1371
|
+
if (name.includes(':')) {
|
|
1372
|
+
name = name.split(':').pop().trim();
|
|
1373
|
+
}
|
|
1374
|
+
|
|
1375
|
+
// TS-style docs aliasing: { foo as bar }.
|
|
1376
|
+
const asAlias = name.match(/\bas\s+([A-Za-z_$][\w$]*)$/);
|
|
1377
|
+
if (asAlias) {
|
|
1378
|
+
name = asAlias[1];
|
|
1379
|
+
}
|
|
1380
|
+
|
|
1381
|
+
// Remove default value patterns: { foo = fallback }.
|
|
1382
|
+
name = name.replace(/=.*/, '').trim();
|
|
1383
|
+
name = name.replace(/^\.\.\./, '').trim();
|
|
1384
|
+
|
|
1385
|
+
return /^[A-Za-z_$][\w$]*$/.test(name) ? name : null;
|
|
1386
|
+
})
|
|
1387
|
+
.filter(Boolean);
|
|
1388
|
+
}
|
|
1389
|
+
|
|
1390
|
+
/**
|
|
1391
|
+
* Generate a deterministic entity ID for tree-sitter symbols.
|
|
1392
|
+
* Uses the same hash pattern as makeId() for consistency.
|
|
1393
|
+
*/
|
|
1394
|
+
_makeEntityId(filePath, name, type, startLine) {
|
|
1395
|
+
const relativePath = this.projectRoot ? path.relative(this.projectRoot, filePath) : filePath;
|
|
1396
|
+
const key = `${relativePath}:${type}:${name}:${startLine}`;
|
|
1397
|
+
return createHash('sha256').update(key).digest('hex').slice(0, 16);
|
|
1398
|
+
}
|
|
1399
|
+
|
|
1400
|
+
/**
|
|
1401
|
+
* Extract relationships using regex patterns from langInfo.graph.
|
|
1402
|
+
* Used by tree-sitter path where entities come from AST but relationships
|
|
1403
|
+
* still need regex (tree-sitter tags.scm only gives definitions).
|
|
1404
|
+
*/
|
|
1405
|
+
_extractRelationships(content, lines, filePath, langInfo, entities) {
|
|
1406
|
+
const relationships = [];
|
|
1407
|
+
if (!langInfo.graph) return relationships;
|
|
1408
|
+
|
|
1409
|
+
const { graph, id: language } = langInfo;
|
|
1410
|
+
const {
|
|
1411
|
+
relationshipPatterns,
|
|
1412
|
+
methodCallPattern,
|
|
1413
|
+
methodCallPrefilter,
|
|
1414
|
+
} = this.getGenericPatternPlan(language, graph);
|
|
1415
|
+
const skipCallObjects = new Set(graph.skipCallObjects || []);
|
|
1416
|
+
const fileEntityId = this.makeId(filePath, 'file', path.basename(filePath));
|
|
1417
|
+
|
|
1418
|
+
// Build scope lookup from tree-sitter entities for source_id attribution
|
|
1419
|
+
const sortedEntities = [...entities].sort((a, b) => a.start_line - b.start_line);
|
|
1420
|
+
|
|
1421
|
+
const findScopeEntity = (lineNum) => {
|
|
1422
|
+
for (let i = sortedEntities.length - 1; i >= 0; i--) {
|
|
1423
|
+
const e = sortedEntities[i];
|
|
1424
|
+
if (e.start_line <= lineNum && e.end_line >= lineNum) {
|
|
1425
|
+
return e.id;
|
|
1426
|
+
}
|
|
1427
|
+
}
|
|
1428
|
+
return null;
|
|
1429
|
+
};
|
|
1430
|
+
|
|
1431
|
+
for (let i = 0; i < lines.length; i++) {
|
|
1432
|
+
const line = lines[i];
|
|
1433
|
+
const trimmed = line.trimStart();
|
|
1434
|
+
const lineNum = i + 1;
|
|
1435
|
+
|
|
1436
|
+
if (trimmed.length > this.maxRegexLineLength) continue;
|
|
1437
|
+
|
|
1438
|
+
const sourceEntityId = findScopeEntity(lineNum);
|
|
1439
|
+
|
|
1440
|
+
// Method calls
|
|
1441
|
+
if (methodCallPattern && (!methodCallPrefilter || methodCallPrefilter(trimmed))) {
|
|
1442
|
+
methodCallPattern.lastIndex = 0;
|
|
1443
|
+
let m;
|
|
1444
|
+
while ((m = methodCallPattern.exec(trimmed)) !== null) {
|
|
1445
|
+
const obj = m[1];
|
|
1446
|
+
const method = m[2];
|
|
1447
|
+
if (!obj || !method) {
|
|
1448
|
+
if (m[0] === '') methodCallPattern.lastIndex++;
|
|
1449
|
+
continue;
|
|
1450
|
+
}
|
|
1451
|
+
if (skipCallObjects.has(obj)) {
|
|
1452
|
+
if (m[0] === '') methodCallPattern.lastIndex++;
|
|
1453
|
+
continue;
|
|
1454
|
+
}
|
|
1455
|
+
relationships.push({
|
|
1456
|
+
source_id: sourceEntityId,
|
|
1457
|
+
target_id: null,
|
|
1458
|
+
target_name: `${obj}.${method}`,
|
|
1459
|
+
type: 'calls',
|
|
1460
|
+
weight: GRAPH_CONFIG.relationshipWeights.calls,
|
|
1461
|
+
context_line: lineNum,
|
|
1462
|
+
});
|
|
1463
|
+
if (m[0] === '') methodCallPattern.lastIndex++;
|
|
1464
|
+
}
|
|
1465
|
+
}
|
|
1466
|
+
|
|
1467
|
+
this._appendDestructuredRequireRelationships(trimmed, sourceEntityId || fileEntityId, relationships);
|
|
1468
|
+
|
|
1469
|
+
// Other relationships (imports, extends, etc.)
|
|
1470
|
+
for (const { type: relType, pattern, prefilter } of relationshipPatterns) {
|
|
1471
|
+
if (relType === 'methodCall') continue;
|
|
1472
|
+
if (prefilter && !prefilter(trimmed)) continue;
|
|
1473
|
+
|
|
1474
|
+
const match = trimmed.match(pattern);
|
|
1475
|
+
if (match) {
|
|
1476
|
+
const { targets, filtered } = this._resolveRelationshipTargets(relType, match, language);
|
|
1477
|
+
if (targets.length === 0) {
|
|
1478
|
+
if (!filtered) this._recordEmptyCapture('relationship', language, relType, lineNum, trimmed);
|
|
1479
|
+
continue;
|
|
1480
|
+
}
|
|
1481
|
+
const mappedType = GENERIC_RELATIONSHIP_MAPPING[relType] || 'uses';
|
|
1482
|
+
const weight = GRAPH_CONFIG.relationshipWeights[mappedType] || 1.0;
|
|
1483
|
+
for (const target of targets) {
|
|
1484
|
+
relationships.push({
|
|
1485
|
+
source_id: sourceEntityId || fileEntityId,
|
|
1486
|
+
target_id: null,
|
|
1487
|
+
target_name: target,
|
|
1488
|
+
type: mappedType,
|
|
1489
|
+
weight,
|
|
1490
|
+
context_line: lineNum,
|
|
1491
|
+
});
|
|
1492
|
+
}
|
|
1493
|
+
}
|
|
1494
|
+
}
|
|
1495
|
+
}
|
|
1496
|
+
|
|
1497
|
+
return relationships;
|
|
1498
|
+
}
|
|
1499
|
+
|
|
1500
|
+
_recordEmptyCapture(kind, language, patternType, lineNum, line) {
|
|
1501
|
+
this.debugCounters.emptyCapture[kind] = (this.debugCounters.emptyCapture[kind] || 0) + 1;
|
|
1502
|
+
|
|
1503
|
+
if (!this.debugCounters.byLanguage[language]) {
|
|
1504
|
+
this.debugCounters.byLanguage[language] = { entity: 0, relationship: 0, skippedLongLines: 0 };
|
|
1505
|
+
}
|
|
1506
|
+
this.debugCounters.byLanguage[language][kind] += 1;
|
|
1507
|
+
|
|
1508
|
+
const key = `${language}:${kind}:${patternType}`;
|
|
1509
|
+
this.debugCounters.byPattern[key] = (this.debugCounters.byPattern[key] || 0) + 1;
|
|
1510
|
+
|
|
1511
|
+
if (this.warnOnPatternDrop && this.debugCounters.byPattern[key] <= 3) {
|
|
1512
|
+
console.warn(`[graph-extractor] Empty capture dropped for ${key} at line ${lineNum}: ${line.slice(0, 120)}`);
|
|
1513
|
+
}
|
|
1514
|
+
}
|
|
1515
|
+
|
|
1516
|
+
_recordLongLineSkip(language, lineNum, lineLength) {
|
|
1517
|
+
this.debugCounters.skippedLongLines += 1;
|
|
1518
|
+
if (!this.debugCounters.byLanguage[language]) {
|
|
1519
|
+
this.debugCounters.byLanguage[language] = { entity: 0, relationship: 0, skippedLongLines: 0 };
|
|
1520
|
+
}
|
|
1521
|
+
this.debugCounters.byLanguage[language].skippedLongLines += 1;
|
|
1522
|
+
if (this.warnOnPatternDrop && this.debugCounters.byLanguage[language].skippedLongLines <= 3) {
|
|
1523
|
+
console.warn(`[graph-extractor] Skipping regex extraction for long line (${lineLength} chars) at ${language}:${lineNum}`);
|
|
1524
|
+
}
|
|
1525
|
+
}
|
|
1526
|
+
|
|
1527
|
+
getDebugCounters() {
|
|
1528
|
+
const byLanguage = {};
|
|
1529
|
+
for (const [language, counts] of Object.entries(this.debugCounters.byLanguage)) {
|
|
1530
|
+
byLanguage[language] = { ...counts };
|
|
1531
|
+
}
|
|
1532
|
+
return {
|
|
1533
|
+
emptyCapture: { ...this.debugCounters.emptyCapture },
|
|
1534
|
+
skippedLongLines: this.debugCounters.skippedLongLines,
|
|
1535
|
+
byLanguage,
|
|
1536
|
+
byPattern: { ...this.debugCounters.byPattern },
|
|
1537
|
+
};
|
|
1538
|
+
}
|
|
1539
|
+
|
|
1540
|
+
/**
|
|
1541
|
+
* Generate unique ID for an entity
|
|
1542
|
+
*
|
|
1543
|
+
* For collision-proof IDs (especially overloaded methods), include signature or line info.
|
|
1544
|
+
* ID format: sha256(relativePath:type:name:disambiguator)[0:16]
|
|
1545
|
+
*
|
|
1546
|
+
* @param {string} filePath - Absolute file path
|
|
1547
|
+
* @param {string} type - Entity type (class, method, function, etc.)
|
|
1548
|
+
* @param {string} name - Entity name
|
|
1549
|
+
* @param {object} [options] - Optional disambiguation info
|
|
1550
|
+
* @param {string} [options.signature] - Method/function signature for overload disambiguation
|
|
1551
|
+
* @param {number} [options.startLine] - Start line as fallback disambiguator
|
|
1552
|
+
* @returns {string} 16-char hex ID
|
|
1553
|
+
*/
|
|
1554
|
+
makeId(filePath, type, name, options = {}) {
|
|
1555
|
+
const relativePath = this.projectRoot ? path.relative(this.projectRoot, filePath) : filePath;
|
|
1556
|
+
|
|
1557
|
+
// Build disambiguator for overloaded methods or same-name entities
|
|
1558
|
+
let disambiguator = '';
|
|
1559
|
+
if (options.signature) {
|
|
1560
|
+
// Hash the signature for a compact, stable disambiguator
|
|
1561
|
+
disambiguator = createHash('sha256').update(options.signature).digest('hex').slice(0, 8);
|
|
1562
|
+
} else if (options.startLine !== undefined) {
|
|
1563
|
+
// Fallback: use line number if no signature
|
|
1564
|
+
disambiguator = String(options.startLine);
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
const key = disambiguator
|
|
1568
|
+
? `${relativePath}:${type}:${name}:${disambiguator}`
|
|
1569
|
+
: `${relativePath}:${type}:${name}`;
|
|
1570
|
+
|
|
1571
|
+
return createHash('sha256').update(key).digest('hex').slice(0, 16);
|
|
1572
|
+
}
|
|
1573
|
+
|
|
1574
|
+
/**
|
|
1575
|
+
* Generate a signature hash for stable entity identification.
|
|
1576
|
+
* Used for backup/restore matching when IDs change.
|
|
1577
|
+
*
|
|
1578
|
+
* @param {string} signature - Full method/function signature
|
|
1579
|
+
* @returns {string|null} 8-char hex hash or null if no signature
|
|
1580
|
+
*/
|
|
1581
|
+
makeSignatureHash(signature) {
|
|
1582
|
+
if (!signature) return null;
|
|
1583
|
+
return createHash('sha256').update(signature).digest('hex').slice(0, 8);
|
|
1584
|
+
}
|
|
1585
|
+
|
|
1586
|
+
/**
|
|
1587
|
+
* Extract doc comment from lines before a declaration
|
|
1588
|
+
*/
|
|
1589
|
+
extractDocComment(lines, lineIndex) {
|
|
1590
|
+
const comments = [];
|
|
1591
|
+
let i = lineIndex - 1;
|
|
1592
|
+
|
|
1593
|
+
while (i >= 0) {
|
|
1594
|
+
const line = lines[i].trim();
|
|
1595
|
+
if (line.startsWith('*') || line.startsWith('//') || line.startsWith('/*') || line.startsWith('/**')) {
|
|
1596
|
+
comments.unshift(line.replace(/^[/*\s]+/, '').replace(/\*\/$/, '').trim());
|
|
1597
|
+
i--;
|
|
1598
|
+
} else if (line === '') {
|
|
1599
|
+
i--;
|
|
1600
|
+
} else {
|
|
1601
|
+
break;
|
|
1602
|
+
}
|
|
1603
|
+
}
|
|
1604
|
+
|
|
1605
|
+
return comments.join(' ').slice(0, 500) || null;
|
|
1606
|
+
}
|
|
1607
|
+
|
|
1608
|
+
/**
|
|
1609
|
+
* Find end line of a block (matching braces)
|
|
1610
|
+
*/
|
|
1611
|
+
findEndLine(lines, startIndex) {
|
|
1612
|
+
let braceDepth = 0;
|
|
1613
|
+
let started = false;
|
|
1614
|
+
|
|
1615
|
+
for (let i = startIndex; i < lines.length; i++) {
|
|
1616
|
+
const line = lines[i];
|
|
1617
|
+
const opens = (line.match(/{/g) || []).length;
|
|
1618
|
+
const closes = (line.match(/}/g) || []).length;
|
|
1619
|
+
|
|
1620
|
+
if (opens > 0) started = true;
|
|
1621
|
+
braceDepth += opens - closes;
|
|
1622
|
+
|
|
1623
|
+
if (started && braceDepth === 0) {
|
|
1624
|
+
return i + 1;
|
|
1625
|
+
}
|
|
1626
|
+
}
|
|
1627
|
+
|
|
1628
|
+
return lines.length;
|
|
1629
|
+
}
|
|
1630
|
+
|
|
1631
|
+
/**
|
|
1632
|
+
* Find end line for indent-based languages (Python, YAML, etc.)
|
|
1633
|
+
* Scans forward until a line at the same or lesser indentation is found.
|
|
1634
|
+
*/
|
|
1635
|
+
findEndLineIndent(lines, startIndex) {
|
|
1636
|
+
const startLine = lines[startIndex];
|
|
1637
|
+
const startIndent = startLine.length - startLine.trimStart().length;
|
|
1638
|
+
|
|
1639
|
+
for (let i = startIndex + 1; i < lines.length; i++) {
|
|
1640
|
+
const line = lines[i];
|
|
1641
|
+
const trimmed = line.trimStart();
|
|
1642
|
+
if (!trimmed) continue; // skip blank lines
|
|
1643
|
+
const indent = line.length - trimmed.length;
|
|
1644
|
+
if (indent <= startIndent) {
|
|
1645
|
+
return i; // 0-based exclusive → 1-based line number
|
|
1646
|
+
}
|
|
1647
|
+
}
|
|
1648
|
+
|
|
1649
|
+
return lines.length;
|
|
1650
|
+
}
|
|
1651
|
+
|
|
1652
|
+
/**
|
|
1653
|
+
* Find end line for end-keyword languages (Ruby, Elixir, Lua, Obj-C).
|
|
1654
|
+
* Counts matching keyword pairs to find the closing end/keyword.
|
|
1655
|
+
*/
|
|
1656
|
+
findEndLineKeyword(lines, startIndex, endKeyword, blockKeywords) {
|
|
1657
|
+
const endRe = new RegExp(`^\\s*${escapeRegexLiteral(endKeyword)}\\b`);
|
|
1658
|
+
const blockStartRe = blockKeywords?.length
|
|
1659
|
+
? new RegExp(`^\\s*(?:${blockKeywords.join('|')})\\b`)
|
|
1660
|
+
: null;
|
|
1661
|
+
let depth = 1; // start inside the opening block
|
|
1662
|
+
|
|
1663
|
+
for (let i = startIndex + 1; i < lines.length; i++) {
|
|
1664
|
+
const line = lines[i];
|
|
1665
|
+
// Check for nested block openers (boundary patterns or block keywords)
|
|
1666
|
+
if (blockStartRe && blockStartRe.test(line)) {
|
|
1667
|
+
depth++;
|
|
1668
|
+
}
|
|
1669
|
+
if (endRe.test(line)) {
|
|
1670
|
+
depth--;
|
|
1671
|
+
if (depth === 0) {
|
|
1672
|
+
return i + 1; // 1-based
|
|
1673
|
+
}
|
|
1674
|
+
}
|
|
1675
|
+
}
|
|
1676
|
+
|
|
1677
|
+
return lines.length;
|
|
1678
|
+
}
|
|
1679
|
+
|
|
1680
|
+
/**
|
|
1681
|
+
* Find end line of a method (simpler heuristic)
|
|
1682
|
+
*/
|
|
1683
|
+
findMethodEndLine(lines, startIndex) {
|
|
1684
|
+
let braceDepth = 0;
|
|
1685
|
+
let started = false;
|
|
1686
|
+
|
|
1687
|
+
for (let i = startIndex; i < Math.min(startIndex + 200, lines.length); i++) {
|
|
1688
|
+
const line = lines[i];
|
|
1689
|
+
const opens = (line.match(/{/g) || []).length;
|
|
1690
|
+
const closes = (line.match(/}/g) || []).length;
|
|
1691
|
+
|
|
1692
|
+
if (opens > 0) started = true;
|
|
1693
|
+
braceDepth += opens - closes;
|
|
1694
|
+
|
|
1695
|
+
if (started && braceDepth === 0) {
|
|
1696
|
+
return i + 1;
|
|
1697
|
+
}
|
|
1698
|
+
}
|
|
1699
|
+
|
|
1700
|
+
return Math.min(startIndex + 50, lines.length);
|
|
1701
|
+
}
|
|
1702
|
+
}
|
|
1703
|
+
|
|
1704
|
+
// =============================================================================
|
|
1705
|
+
// DATABASE OPERATIONS
|
|
1706
|
+
// =============================================================================
|
|
1707
|
+
|
|
1708
|
+
/**
 * Ensure stale_since column exists for soft-delete support.
 * Handles branch switching gracefully by marking entities as stale instead of deleting.
 * Files marked as stale can be pruned after 30 days.
 *
 * Also creates the partial index over active rows (stale_since IS NULL).
 * That step is best-effort: a failure is logged as a warning and does not
 * change the return value.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {boolean} true if column exists or was added successfully
 */
export function ensureStaleColumn(db) {
  try {
    // Check if column exists
    const columns = db.prepare("PRAGMA table_info(entities)").all();
    const hasStaleColumn = columns.some((c) => c.name === 'stale_since');

    if (!hasStaleColumn) {
      console.log('[graph-extractor] Adding stale_since column for soft-delete support');
      db.exec('ALTER TABLE entities ADD COLUMN stale_since INTEGER DEFAULT NULL');
      // Create partial index for efficient stale entity queries
      db.exec('CREATE INDEX IF NOT EXISTS idx_entities_stale ON entities(stale_since) WHERE stale_since IS NOT NULL');
    }

    // P1 FIX: Add covering index for active entities (stale_since IS NULL)
    // The idx_entities_stale helps find stale entries, but queries filtering for
    // active entries (WHERE stale_since IS NULL) need their own index
    try {
      db.exec(`
CREATE INDEX IF NOT EXISTS idx_entities_active
ON entities(id, name, type, file_path)
WHERE stale_since IS NULL
`);
    } catch (indexErr) {
      // IF NOT EXISTS already covers "index exists", so anything caught here
      // is a real failure. The index is only an optimization: report and go on.
      console.warn(`[graph-extractor] Could not create idx_entities_active: ${indexErr?.message ?? indexErr}`);
    }

    return true;
  } catch (err) {
    // Thrown values are not guaranteed to be Error instances.
    const message = typeof err?.message === 'string' ? err.message : String(err);
    if (message.includes('duplicate column')) {
      return true; // Column already exists
    }
    console.error(`[graph-extractor] Failed to add stale_since column: ${message}`);
    return false;
  }
}
|
|
1752
|
+
|
|
1753
|
+
/**
 * Check if database schema is compatible with current version.
 * The version is stored under the 'version' key of the schema_meta
 * key-value table, which is created here when missing.
 *
 * With no stored version, the database counts as compatible only when it
 * contains no tables other than schema_meta. A stored version lower than
 * SCHEMA_VERSION is incompatible and triggers a warning. Any error raised
 * during the check is treated as "compatible".
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{compatible: boolean, dbVersion: number|null}}
 */
export function checkSchemaVersion(db) {
  try {
    db.exec(`CREATE TABLE IF NOT EXISTS schema_meta (key TEXT PRIMARY KEY, value TEXT)`);

    const versionRow = db.prepare('SELECT value FROM schema_meta WHERE key = ?').get('version');

    if (!versionRow) {
      const { count: userTableCount } = db.prepare(`
SELECT COUNT(*) AS count
FROM sqlite_master
WHERE type = 'table'
AND name NOT LIKE 'sqlite_%'
AND name != 'schema_meta'
`).get();

      // Fresh databases can continue; pre-versioning databases must be migrated.
      return { compatible: userTableCount === 0, dbVersion: null };
    }

    const dbVersion = parseInt(versionRow.value, 10);
    const isOutdated = dbVersion < SCHEMA_VERSION;
    if (!isOutdated) {
      return { compatible: true, dbVersion };
    }

    console.warn(`⚠️ Schema version mismatch: DB has v${dbVersion}, code expects v${SCHEMA_VERSION}`);
    console.warn(` Run: /index-codebase --full (or node index-codebase-v21.js --full)`);
    return { compatible: false, dbVersion };
  } catch (err) {
    // If check fails, assume compatible and continue
    return { compatible: true, dbVersion: null };
  }
}
|
|
1792
|
+
|
|
1793
|
+
/**
 * Create code graph database schema
 * Uses better-sqlite3 (native SQLite binding with full FTS5 trigram support)
 *
 * Steps, in order: check the schema version (used here for logging only),
 * create the entities table, add the `code` / `name_alias` columns to
 * pre-existing tables, backfill name aliases, ensure the stale_since column,
 * create the relationships table, set up the FTS schema, create all indexes
 * and finally call setSchemaVersion().
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {boolean} true when ensureLexicalFtsSchema() completed without
 *   throwing, false otherwise
 */
export function createGraphSchema(db) {
  // The result is only used for the log line below; schema creation proceeds either way.
  const versionStatus = checkSchemaVersion(db);
  if (!versionStatus.compatible) {
    console.log(` Updating schema from ${versionStatus.dbVersion ?? 'unversioned'} to v${SCHEMA_VERSION}`);
  }

  // Entities table with HCGS summary support
  // signature_hash added for collision-proof backup/restore of overloaded methods
  // code column stores actual source code for HCGS summary generation
  db.exec(`
CREATE TABLE IF NOT EXISTS entities (
id TEXT PRIMARY KEY,
file_path TEXT NOT NULL,
type TEXT NOT NULL,
name TEXT NOT NULL,
signature TEXT,
signature_hash TEXT,
doc_comment TEXT,
start_line INTEGER,
end_line INTEGER,
package TEXT,
parent_class TEXT,
search_text TEXT,
summary TEXT,
summary_embedding BLOB,
parent_id TEXT,
hierarchy_level INTEGER DEFAULT 0,
code TEXT,
name_alias TEXT,
stale_since INTEGER DEFAULT NULL
)
`);

  // Migration: Add code column to existing tables that don't have it
  // (CREATE TABLE IF NOT EXISTS above leaves an existing table's columns unchanged.)
  try {
    // Column list is read once, before either ALTER TABLE runs.
    const columns = db.prepare("PRAGMA table_info(entities)").all();
    const hasCodeColumn = columns.some(col => col.name === 'code');
    if (!hasCodeColumn) {
      db.exec('ALTER TABLE entities ADD COLUMN code TEXT');
      console.log(' Migrated: added code column to entities table');
    }
    const hasAliasColumn = columns.some(col => col.name === 'name_alias');
    if (!hasAliasColumn) {
      db.exec('ALTER TABLE entities ADD COLUMN name_alias TEXT');
      console.log(' Migrated: added name_alias column to entities table');
    }
  } catch (err) {
    // Ignore errors - column might already exist or table not created yet
  }

  // NOTE(review): backfillNameAliases is defined outside this view; it is
  // used here as returning the number of rows it updated — confirm.
  const aliasBackfillCount = backfillNameAliases(db);
  if (aliasBackfillCount > 0) {
    console.log(` Migrated: backfilled name_alias for ${aliasBackfillCount} entities`);
  }

  // Migration: Add stale_since column for soft-delete support
  // Files marked as stale (removed from filesystem but kept in DB) can be pruned after 30 days
  // This handles branch switches gracefully
  // E4 FIX: Check return value and warn if migration failed
  if (!ensureStaleColumn(db)) {
    console.warn('[graph-extractor] WARN: Failed to add stale_since column - searches may include deleted files');
  }

  // Relationships table (source_id can be NULL for unresolved references)
  db.exec(`
CREATE TABLE IF NOT EXISTS relationships (
source_id TEXT,
target_id TEXT,
target_name TEXT NOT NULL,
type TEXT NOT NULL,
weight REAL DEFAULT 1.0,
context_line INTEGER,
full_import_path TEXT,
is_static INTEGER DEFAULT 0,
is_wildcard INTEGER DEFAULT 0
)
`);

  // Try FTS5 first, fallback to regular indexes if not available
  // better-sqlite3 bundles SQLite 3.51.1 which has native FTS5 trigram support
  let hasFts5 = false;
  try {
    // ensureLexicalFtsSchema is defined outside this view; `rebuilt` only selects the log message.
    const { rebuilt } = ensureLexicalFtsSchema(db);
    hasFts5 = true;
    console.log(rebuilt ? ' FTS5 schema rebuilt (porter + trigram)' : ' FTS5 enabled (porter + trigram)');
  } catch (err) {
    // FTS failure is not fatal: the regular indexes below are created either way.
    console.log(' FTS5 not available:', err.message);
  }

  // Indexes for graph traversal and text search
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_file ON entities(file_path)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_type ON entities(type)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_name ON entities(name)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_search ON entities(search_text)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_parent ON entities(parent_id)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_level ON entities(hierarchy_level)`);
  // Partial index for soft-delete queries: efficiently find stale entities
  // Only indexes rows where stale_since IS NOT NULL (smaller index, faster lookups)
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_stale ON entities(stale_since) WHERE stale_since IS NOT NULL`);
  // P1 FIX: Covering index for active entity queries (WHERE stale_since IS NULL)
  // Provides 5-20ms savings on all active entity lookups (BM25, graph expansion, etc.)
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_active ON entities(id, name, type, file_path) WHERE stale_since IS NULL`);
  // Composite index for collision-proof backup/restore of overloaded methods
  db.exec(`CREATE INDEX IF NOT EXISTS idx_entities_sig_hash ON entities(file_path, type, name, signature_hash)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_rel_source ON relationships(source_id)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_rel_target ON relationships(target_id)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_rel_target_name ON relationships(target_name)`);
  db.exec(`CREATE INDEX IF NOT EXISTS idx_rel_type ON relationships(type)`);
  // Unique constraint to prevent duplicate relationships (same source→target with same type)
  // Allows NULL source_id (unresolved refs) by excluding them from uniqueness check
  db.exec(`CREATE UNIQUE INDEX IF NOT EXISTS idx_rel_unique ON relationships(source_id, target_id, type, target_name) WHERE source_id IS NOT NULL`);
  // Index on target_id for efficient reverse lookups ("what calls X")
  // NOTE(review): idx_rel_target above already indexes target_id; this one is
  // the partial variant restricted to non-NULL values.
  db.exec(`CREATE INDEX IF NOT EXISTS idx_rel_target_id ON relationships(target_id) WHERE target_id IS NOT NULL`);

  // NOTE(review): setSchemaVersion is defined outside this view; presumably it
  // stores SCHEMA_VERSION in schema_meta — confirm.
  setSchemaVersion(db);

  return hasFts5;
}
|
|
1915
|
+
|
|
1916
|
+
/**
 * Fill in target_id for relationships whose target_id is still NULL.
 *
 * Loads every entity into in-memory lookup maps (by id, by name and by
 * qualified name). Each unresolved 'calls', 'imports', 'overrides' or
 * 'throws' relationship is passed to its resolver, and all successful
 * matches are written back inside a single transaction. Relationships of
 * any other type are counted as unresolved.
 *
 * @param {import('better-sqlite3').Database} db
 * @returns {{calls: number, imports: number, overrides: number, throws: number, unresolved: number}}
 *   Number of relationships resolved per type, plus the number left unresolved.
 */
function resolveRelationshipTargets(db) {
  const stats = { calls: 0, imports: 0, overrides: 0, throws: 0, unresolved: 0 };

  console.log(' Building entity lookup maps...');

  const entityByName = new Map(); // name -> [entities]
  const entityByFQN = new Map(); // fully qualified name -> entity
  const entityById = new Map(); // id -> entity

  const allEntities = db.prepare('SELECT id, name, type, parent_class, package, file_path FROM entities').all();
  console.log(` Loaded ${allEntities.length} entities`);

  allEntities.forEach((entity) => {
    entityById.set(entity.id, entity);

    // Several entities may share a name, so the name map holds lists.
    const sameName = entityByName.get(entity.name);
    if (sameName) {
      sameName.push(entity);
    } else {
      entityByName.set(entity.name, [entity]);
    }

    // Qualified name: package.Class.member, package.Class or Class.member.
    // A later entity with the same qualified name replaces the earlier one.
    const { package: pkg, parent_class: owner, name } = entity;
    let fqn = null;
    if (pkg && owner) {
      fqn = `${pkg}.${owner}.${name}`;
    } else if (pkg) {
      fqn = `${pkg}.${name}`;
    } else if (owner) {
      fqn = `${owner}.${name}`;
    }
    if (fqn !== null) {
      entityByFQN.set(fqn, entity);
    }
  });

  console.log(' Resolving unresolved relationships...');

  const updateStmt = db.prepare('UPDATE relationships SET target_id = ? WHERE rowid = ?');

  const unresolvedRels = db.prepare(`
SELECT rowid, source_id, target_name, type
FROM relationships
WHERE target_id IS NULL
`).all();

  console.log(` Found ${unresolvedRels.length} unresolved relationships`);

  // One transaction for all updates (much faster than one per row).
  const applyResolutions = db.transaction(() => {
    unresolvedRels.forEach((rel, index) => {
      let targetId = null;

      switch (rel.type) {
        case 'calls':
          // Method calls: "object.method" or "ClassName.method"
          targetId = resolveMethodCall(rel.target_name, rel.source_id, entityByName, entityByFQN, entityById);
          if (targetId) stats.calls++;
          break;
        case 'imports':
          // Imports: package path or module name
          targetId = resolveImport(rel.target_name, entityByName, entityByFQN);
          if (targetId) stats.imports++;
          break;
        case 'overrides':
          // Method overrides: methodName (need to find parent class method)
          targetId = resolveOverride(rel.target_name, rel.source_id, entityByName, entityById, db);
          if (targetId) stats.overrides++;
          break;
        case 'throws':
          // Exception classes
          targetId = resolveThrows(rel.target_name, entityByName);
          if (targetId) stats.throws++;
          break;
        default:
          break;
      }

      if (targetId) {
        updateStmt.run(targetId, rel.rowid);
      } else {
        stats.unresolved++;
      }

      const processed = index + 1;
      if (processed % 1000 === 0) {
        process.stdout.write(`\r Processed ${processed}/${unresolvedRels.length}...`);
      }
    });
    process.stdout.write('\n');
  });

  applyResolutions();

  return stats;
}
|
|
2021
|
+
|
|
2022
|
+
/**
 * Resolve a method call target ("object.method" or "ClassName.method") to
 * an entity id.
 *
 * Strategies, in order:
 *  1. exact qualified-name match on the whole target name;
 *  2. a method with this name whose parent_class equals the receiver;
 *  3. any method with this name, preferring the caller's package (only when
 *     the caller has one), then the caller's file, then the first candidate.
 *
 * For names with more than two segments ("this.repo.save") the last segment
 * is the method and the one before it is the receiver.
 *
 * @param {string} targetName - Dotted call target.
 * @param {string|null} sourceId - Id of the calling entity, if known.
 * @param {Map<string, object[]>} entityByName - Entities grouped by name.
 * @param {Map<string, object>} entityByFQN - Entities keyed by qualified name.
 * @param {Map<string, object>} entityById - Entities keyed by id.
 * @returns {string|null} Id of the resolved method, or null.
 */
function resolveMethodCall(targetName, sourceId, entityByName, entityByFQN, entityById) {
  const parts = targetName.split('.');
  if (parts.length < 2) return null;

  const methodName = parts[parts.length - 1];
  const objName = parts[parts.length - 2];

  // Strategy 1: Exact FQN match (e.g., "UserService.findById")
  if (entityByFQN.has(targetName)) {
    return entityByFQN.get(targetName).id;
  }

  const methods = (entityByName.get(methodName) || []).filter((e) => e.type === 'method');
  if (methods.length === 0) return null;

  // Strategy 2: method declared on a class named like the receiver
  const ownedByReceiver = methods.find((m) => m.parent_class === objName);
  if (ownedByReceiver) return ownedByReceiver.id;

  // Strategy 3: fuzzy match, preferring candidates close to the caller
  const sourceEntity = entityById.get(sourceId);
  if (sourceEntity) {
    // Compare packages only when the caller has one. Two missing packages
    // compare equal and would select an arbitrary candidate before the
    // same-file check below gets a chance.
    if (sourceEntity.package) {
      const samePackage = methods.find((m) => m.package === sourceEntity.package);
      if (samePackage) return samePackage.id;
    }

    const sameFile = methods.find((m) => m.file_path === sourceEntity.file_path);
    if (sameFile) return sameFile.id;
  }

  // Otherwise pick first match
  return methods[0].id;
}
|
|
2068
|
+
|
|
2069
|
+
/**
 * Resolve an import target ("package.Class" or a dotted module path) to an
 * entity id.
 *
 * The full target is first looked up verbatim in entityByFQN. Failing that,
 * the segment after the last dot is looked up in entityByName; among those
 * candidates a class, interface or enum is preferred, otherwise the first
 * candidate is used.
 *
 * @param {string} targetName - Imported name as written in the source.
 * @param {Map<string, Array<object>>} entityByName - Entities grouped by simple name.
 * @param {Map<string, object>} entityByFQN - Entities keyed by the string used for exact lookup.
 * @returns {*} The id of the resolved entity, or null when nothing matches.
 */
function resolveImport(targetName, entityByName, entityByFQN) {
  // Exact lookup short-circuits everything else.
  if (entityByFQN.has(targetName)) {
    const exact = entityByFQN.get(targetName);
    return exact.id;
  }

  // Otherwise fall back to the trailing segment (the simple name).
  const simpleName = targetName.split('.').pop();
  const matches = entityByName.get(simpleName) || [];
  if (matches.length === 0) {
    return null;
  }

  // Type-like declarations take precedence over anything else sharing the name.
  for (const entity of matches) {
    const isTypeLike =
      entity.type === 'class' || entity.type === 'interface' || entity.type === 'enum';
    if (isTypeLike) {
      return entity.id;
    }
  }

  return matches[0].id;
}
|
|
2092
|
+
|
|
2093
|
+
/**
 * Resolve a method override to the id of a method with the same name.
 *
 * Simplified strategy: the parent class/interface is not consulted; the first
 * entity of type 'method' registered under methodName is used. The entity
 * whose id equals sourceId is skipped, because the overriding method shares
 * its name with the overridden one and would otherwise resolve to itself.
 *
 * @param {string} methodName - Name of the overridden method.
 * @param {*} sourceId - Id of the entity the relationship originates from.
 * @param {Map<string, Array<object>>} entityByName - Entities grouped by simple name.
 * @param {Map<*, object>} entityById - Unused; kept for signature compatibility.
 * @param {*} db - Unused; kept for signature compatibility.
 * @returns {*} The id of the matched method, or null when there is none.
 */
function resolveOverride(methodName, sourceId, entityByName, entityById, db) {
  const candidates = entityByName.get(methodName) || [];

  for (const entity of candidates) {
    if (entity.type !== 'method') continue;
    // Never link an override back to the method that declares it.
    if (entity.id === sourceId) continue;
    return entity.id;
  }

  return null;
}
|
|
2110
|
+
|
|
2111
|
+
/**
 * Resolve the exception named in a "throws" relationship to an entity id.
 *
 * Looks the name up in entityByName and returns the first entity of type
 * 'class'; when none of the candidates is a class, the first candidate of any
 * type is returned instead.
 *
 * @param {string} exceptionName - Simple name of the thrown exception.
 * @param {Map<string, Array<object>>} entityByName - Entities grouped by simple name.
 * @returns {*} The id of the matched entity, or null when the name is unknown.
 */
function resolveThrows(exceptionName, entityByName) {
  const matches = entityByName.get(exceptionName) || [];
  if (matches.length === 0) {
    return null;
  }

  // A class declaration wins over any other entity sharing the name.
  for (const entity of matches) {
    if (entity.type === 'class') {
      return entity.id;
    }
  }

  return matches[0].id;
}
|
|
2124
|
+
|
|
2125
|
+
/**
 * Insert entities and relationships into the database.
 * Uses better-sqlite3 (sync API, no .free() needed).
 *
 * Entities are written with INSERT OR REPLACE inside one transaction, and
 * relationships with INSERT inside a second one. Relationships without a
 * target_name are skipped, and a failing relationship insert is tolerated
 * (UNIQUE violations silently, anything else logged when DEBUG is set).
 * When hasFts5 is true the entities_fts and entities_trigram indexes are
 * rebuilt and optimized afterwards on a best-effort basis.
 *
 * @param {object} db - better-sqlite3 database handle (prepare/transaction/exec are used).
 * @param {Array<object>} entities - Extracted entities to store.
 * @param {Array<object>} relationships - Extracted relationships to store.
 * @param {boolean} [hasFts5=false] - Whether to rebuild/optimize the FTS5 indexes.
 * @returns {void}
 */
export function insertGraph(db, entities, relationships, hasFts5 = false) {
  // Types that can own members, and the member types that get a parent_id.
  const containerTypes = new Set(['class', 'interface', 'enum', 'service']);
  const memberTypes = new Set(['method', 'field', 'rpc']);

  // Insert entities with HCGS hierarchy support.
  // Includes signature_hash for collision-proof backup/restore.
  const entityStmt = db.prepare(`
    INSERT OR REPLACE INTO entities
    (id, file_path, type, name, signature, signature_hash, doc_comment, start_line, end_line, package, parent_class, search_text, name_alias, parent_id, hierarchy_level)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);

  // Build parent lookup for hierarchy, keyed by "<file_path>:<name>".
  const parentLookup = new Map();
  for (const e of entities) {
    if (containerTypes.has(e.type)) {
      parentLookup.set(`${e.file_path}:${e.name}`, e.id);
    }
  }

  console.log(` Inserting ${entities.length} entities...`);

  // Use transaction for bulk entity inserts (much faster).
  const insertEntities = db.transaction(() => {
    for (const e of entities) {
      // Searchable text: name, signature and doc comment, lower-cased and
      // capped at 1000 characters.
      const searchText = [e.name, e.signature, e.doc_comment]
        .filter(Boolean)
        .join(' ')
        .toLowerCase()
        .slice(0, 1000);

      // Members sit one level below their container; everything else
      // (classes, interfaces, functions, components, ...) is level 0.
      let hierarchyLevel = 0;
      let parentId = null;

      if (memberTypes.has(e.type)) {
        hierarchyLevel = 1;
        if (e.parent_class) {
          const parentKey = `${e.file_path}:${e.parent_class}`;
          // Bind an explicit NULL when the parent is not part of this batch.
          parentId = parentLookup.has(parentKey) ? parentLookup.get(parentKey) : null;
        }
      }

      // Fix 7: Generate normalized identifier alias for cross-style search.
      const nameAlias = normalizeIdentifier(e.name);

      // better-sqlite3: use spread params instead of array.
      entityStmt.run(
        e.id,
        e.file_path,
        e.type,
        e.name,
        e.signature || null,
        e.signature_hash || null, // For collision-proof backup/restore
        e.doc_comment || null,
        e.start_line || null,
        e.end_line || null,
        e.package || null,
        e.parent_class || null,
        searchText,
        nameAlias || null,
        parentId,
        hierarchyLevel
      );
    }
  });

  insertEntities();
  console.log(` ✓ Inserted ${entities.length} entities`);
  // Note: better-sqlite3 doesn't need .free()

  // Insert relationships (filter out invalid ones).
  console.log(` Inserting ${relationships.length} relationships...`);

  const relStmt = db.prepare(`
    INSERT INTO relationships
    (source_id, target_id, target_name, type, weight, context_line, full_import_path, is_static, is_wildcard)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
  `);

  // Use transaction for bulk relationship inserts.
  let relInserted = 0;
  const insertRelationships = db.transaction(() => {
    for (const r of relationships) {
      // Skip relationships without target_name.
      if (!r.target_name) continue;

      try {
        // better-sqlite3: use spread params instead of array.
        relStmt.run(
          r.source_id || null,
          r.target_id || null,
          r.target_name,
          r.type,
          r.weight || 1.0, // any falsy weight (including 0) falls back to 1.0
          r.context_line || null,
          r.full_import_path || null,
          r.is_static ? 1 : 0,
          r.is_wildcard ? 1 : 0
        );
        relInserted++;
      } catch (err) {
        // Expected: UNIQUE constraint violations for duplicate relationships.
        // Log unexpected errors at debug level for troubleshooting.
        if (!err.message.includes('UNIQUE constraint')) {
          if (process.env.DEBUG) {
            console.debug(` [debug] Relationship insert failed: ${err.message} (target: ${r.target_name})`);
          }
        }
      }
    }
  });

  insertRelationships();
  console.log(` ✓ Inserted ${relInserted} relationships`);

  // PHASE 2: Resolve target_id for relationships with NULL target_id
  // TEMPORARILY DISABLED to test basic indexing
  // console.log(' Resolving relationship targets...');
  // try {
  // const resolveStats = resolveRelationshipTargets(db);
  // console.log(` Resolved ${resolveStats.calls} calls, ${resolveStats.imports} imports, ${resolveStats.overrides} overrides, ${resolveStats.throws} throws (${resolveStats.unresolved} unresolved)`);
  // } catch (err) {
  // console.log(` ⚠ Resolution failed: ${err.message}`);
  // if (process.env.DEBUG) console.error(err.stack);
  // }

  // Rebuild FTS indexes if available.
  if (hasFts5) {
    try {
      db.exec(`INSERT INTO entities_fts(entities_fts) VALUES('rebuild')`);
      db.exec(`INSERT INTO entities_trigram(entities_trigram) VALUES('rebuild')`);
      console.log(' FTS5 indexes rebuilt (porter + trigram)');

      // Best-effort post-build compaction for faster reads.
      db.exec(`INSERT INTO entities_fts(entities_fts) VALUES('optimize')`);
      db.exec(`INSERT INTO entities_trigram(entities_trigram) VALUES('optimize')`);
      console.log(' FTS5 indexes optimized (segments merged)');
    } catch (err) {
      // Best-effort: a rebuild/optimize failure must not abort the insert,
      // but surface it under DEBUG like the relationship-insert errors above.
      if (process.env.DEBUG) {
        console.debug(` [debug] FTS5 rebuild/optimize failed: ${err && err.message ? err.message : err}`);
      }
    }
  }
}
|
|
2274
|
+
|
|
2275
|
+
// =============================================================================
|
|
2276
|
+
// CLI
|
|
2277
|
+
// =============================================================================
|
|
2278
|
+
|
|
2279
|
+
// Script mode: only runs when this module's URL equals the file:// form of
// the path Node was started with (process.argv[1]).
if (import.meta.url === `file://${process.argv[1]}`) {
  const cliArgs = process.argv.slice(2);

  // A target file is mandatory; print usage and fail otherwise.
  if (!cliArgs.length) {
    console.log('Usage: graph-extractor.js <file>');
    process.exit(1);
  }

  const filePath = cliArgs[0];

  // Reads the file, extracts its graph, and prints the result as JSON on
  // stdout with a one-line summary on stderr. Any failure exits with code 1.
  const runCli = async () => {
    try {
      const content = await fs.readFile(filePath, 'utf-8');
      const extractor = new GraphExtractor();
      const result = await extractor.extractFromFile(filePath, content);

      console.log(JSON.stringify(result, null, 2));
      console.error(`\nExtracted ${result.entities.length} entities, ${result.relationships.length} relationships`);
    } catch (err) {
      console.error('Error:', err.message);
      process.exit(1);
    }
  };

  runCli();
}
|
|
2303
|
+
|
|
2304
|
+
export default GraphExtractor;
|