code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_review_graph/__init__.py +20 -0
- code_review_graph/__main__.py +4 -0
- code_review_graph/analysis.py +410 -0
- code_review_graph/changes.py +409 -0
- code_review_graph/cli.py +1255 -0
- code_review_graph/communities.py +874 -0
- code_review_graph/constants.py +23 -0
- code_review_graph/context_savings.py +317 -0
- code_review_graph/custom_languages.py +322 -0
- code_review_graph/daemon.py +1009 -0
- code_review_graph/daemon_cli.py +320 -0
- code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
- code_review_graph/embeddings.py +1006 -0
- code_review_graph/enrich.py +303 -0
- code_review_graph/eval/__init__.py +33 -0
- code_review_graph/eval/benchmarks/__init__.py +1 -0
- code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
- code_review_graph/eval/benchmarks/build_performance.py +60 -0
- code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
- code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
- code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
- code_review_graph/eval/benchmarks/search_quality.py +59 -0
- code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
- code_review_graph/eval/configs/code-review-graph.yaml +50 -0
- code_review_graph/eval/configs/express.yaml +45 -0
- code_review_graph/eval/configs/fastapi.yaml +48 -0
- code_review_graph/eval/configs/flask.yaml +50 -0
- code_review_graph/eval/configs/gin.yaml +51 -0
- code_review_graph/eval/configs/httpx.yaml +48 -0
- code_review_graph/eval/reporter.py +301 -0
- code_review_graph/eval/runner.py +211 -0
- code_review_graph/eval/scorer.py +85 -0
- code_review_graph/eval/token_benchmark.py +182 -0
- code_review_graph/exports.py +409 -0
- code_review_graph/flows.py +698 -0
- code_review_graph/graph.py +1427 -0
- code_review_graph/graph_diff.py +122 -0
- code_review_graph/hints.py +384 -0
- code_review_graph/incremental.py +1245 -0
- code_review_graph/jedi_resolver.py +303 -0
- code_review_graph/main.py +1079 -0
- code_review_graph/memory.py +142 -0
- code_review_graph/migrations.py +284 -0
- code_review_graph/parser.py +6957 -0
- code_review_graph/postprocessing.py +134 -0
- code_review_graph/prompts.py +159 -0
- code_review_graph/refactor.py +852 -0
- code_review_graph/registry.py +319 -0
- code_review_graph/rescript_resolver.py +206 -0
- code_review_graph/search.py +447 -0
- code_review_graph/skills.py +1481 -0
- code_review_graph/spring_resolver.py +200 -0
- code_review_graph/temporal_resolver.py +199 -0
- code_review_graph/token_benchmark.py +125 -0
- code_review_graph/tools/__init__.py +156 -0
- code_review_graph/tools/_common.py +176 -0
- code_review_graph/tools/analysis_tools.py +184 -0
- code_review_graph/tools/build.py +541 -0
- code_review_graph/tools/community_tools.py +246 -0
- code_review_graph/tools/context.py +152 -0
- code_review_graph/tools/docs.py +274 -0
- code_review_graph/tools/flows_tools.py +176 -0
- code_review_graph/tools/query.py +692 -0
- code_review_graph/tools/refactor_tools.py +168 -0
- code_review_graph/tools/registry_tools.py +125 -0
- code_review_graph/tools/review.py +477 -0
- code_review_graph/tsconfig_resolver.py +257 -0
- code_review_graph/visualization.py +2184 -0
- code_review_graph/wiki.py +305 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1427 @@
|
|
|
1
|
+
"""SQLite-backed knowledge graph storage and query engine.
|
|
2
|
+
|
|
3
|
+
Stores code structure as nodes (File, Class, Function, Type, Test) and
|
|
4
|
+
edges (CALLS, IMPORTS_FROM, INHERITS, IMPLEMENTS, CONTAINS, TESTED_BY, DEPENDS_ON, REFERENCES).
|
|
5
|
+
Supports impact-radius queries and subgraph extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
import sqlite3
|
|
15
|
+
import threading
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any, Optional
|
|
20
|
+
|
|
21
|
+
import networkx as nx
|
|
22
|
+
|
|
23
|
+
from .constants import BFS_ENGINE, MAX_IMPACT_DEPTH, MAX_IMPACT_NODES
|
|
24
|
+
from .migrations import get_schema_version, run_migrations
|
|
25
|
+
from .parser import EdgeInfo, NodeInfo
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
# Recognises, after the fact, the tag that ``_qualified_names_for_file``
# appends to the second and later same-file name collisions, for example
# "path::getById:L394". The pattern also accepts an optional "#<digits>"
# suffix following the line number.
_DISAMBIGUATED_RE = re.compile(r":L\d+(?:#\d+)?$")
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Schema
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
# Base DDL executed on every open; every statement is guarded by
# IF NOT EXISTS, so re-running it against an existing database is harmless.
_SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS nodes (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    kind TEXT NOT NULL, -- File, Class, Function, Type, Test
    name TEXT NOT NULL,
    qualified_name TEXT NOT NULL UNIQUE,
    file_path TEXT NOT NULL,
    line_start INTEGER,
    line_end INTEGER,
    language TEXT,
    parent_name TEXT,
    params TEXT,
    return_type TEXT,
    modifiers TEXT,
    is_test INTEGER DEFAULT 0,
    file_hash TEXT,
    extra TEXT DEFAULT '{}',
    updated_at REAL NOT NULL
);

CREATE TABLE IF NOT EXISTS edges (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    kind TEXT NOT NULL, -- CALLS, IMPORTS_FROM, INHERITS, REFERENCES, etc.
    source_qualified TEXT NOT NULL,
    target_qualified TEXT NOT NULL,
    file_path TEXT NOT NULL,
    line INTEGER DEFAULT 0,
    extra TEXT DEFAULT '{}',
    confidence REAL DEFAULT 1.0,
    confidence_tier TEXT DEFAULT 'EXTRACTED',
    updated_at REAL NOT NULL
);

CREATE TABLE IF NOT EXISTS metadata (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_nodes_file ON nodes(file_path);
CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);
CREATE INDEX IF NOT EXISTS idx_nodes_qualified ON nodes(qualified_name);
CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source_qualified);
CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target_qualified);
CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind);
CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target_qualified, kind);
CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source_qualified, kind);
CREATE INDEX IF NOT EXISTS idx_edges_file ON edges(file_path);
"""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
class GraphNode:
    """Node record mirroring the ``nodes`` table columns.

    The ``modifiers`` and ``updated_at`` columns have no counterpart here.
    """

    # Identity and classification.
    id: int
    kind: str
    name: str
    qualified_name: str
    # Source location.
    file_path: str
    line_start: int
    line_end: int
    language: str
    # Optional descriptive details.
    parent_name: Optional[str]
    params: Optional[str]
    return_type: Optional[str]
    is_test: bool
    file_hash: Optional[str]
    extra: dict
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
@dataclass
class GraphEdge:
    """Edge record mirroring the ``edges`` table columns (minus ``updated_at``)."""

    id: int
    kind: str
    # Qualified names of the two endpoints.
    source_qualified: str
    target_qualified: str
    # Location recorded for the edge.
    file_path: str
    line: int
    extra: dict
    # Same defaults as the corresponding column defaults in the schema.
    confidence: float = 1.0
    confidence_tier: str = "EXTRACTED"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@dataclass
class FlowAdjacency:
    """Pre-loaded adjacency data used when tracing flows.

    An instance is built once by :meth:`GraphStore.load_flow_adjacency` and
    handed to ``trace_flows`` / ``compute_criticality`` so those routines do
    not have to issue one SQLite point query per edge on large graphs.
    """

    # presumably caller qualified name -> callee qualified names; confirm
    # against load_flow_adjacency.
    calls_out: dict[str, list[str]]
    # presumably qualified names that have a TESTED_BY edge; confirm.
    has_tested_by: set[str]
    # Node lookups keyed by qualified name and by database id.
    nodes_by_qn: dict[str, "GraphNode"]
    nodes_by_id: dict[int, "GraphNode"]
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
@dataclass
class GraphStats:
    """Summary counters describing the contents of a graph."""

    # Overall totals.
    total_nodes: int
    total_edges: int
    # Per-kind breakdowns, keyed by the ``kind`` string.
    nodes_by_kind: dict[str, int]
    edges_by_kind: dict[str, int]
    languages: list[str]
    files_count: int
    last_updated: Optional[str]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------------------
|
|
144
|
+
# GraphStore
|
|
145
|
+
# ---------------------------------------------------------------------------
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class GraphStore:
|
|
149
|
+
"""SQLite-backed code knowledge graph."""
|
|
150
|
+
|
|
151
|
+
def __init__(self, db_path: str | Path) -> None:
    """Open the SQLite database at *db_path*, creating it when missing.

    The parent directory is created if needed. The connection is opened
    with implicit transactions disabled, switched to WAL journaling with a
    5000 ms busy timeout, the base schema is applied and pending migrations
    are run.

    Args:
        db_path: Location of the SQLite database file.
    """
    self.db_path = Path(db_path)
    self.db_path.parent.mkdir(parents=True, exist_ok=True)

    connect_options = {
        "timeout": 30,
        "check_same_thread": False,
        "isolation_level": None,  # Disable implicit transactions (#135)
    }
    self._conn = sqlite3.connect(str(self.db_path), **connect_options)
    self._conn.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA busy_timeout=5000"):
        self._conn.execute(pragma)

    self._init_schema()

    # Make sure a schema_version row exists before migrations run; a
    # version below 1 indicates a fresh DB whose metadata table was just
    # created by _init_schema.
    if get_schema_version(self._conn) < 1:
        self._conn.execute(
            "INSERT OR IGNORE INTO metadata (key, value) "
            "VALUES ('schema_version', '1')"
        )
        self._conn.commit()
    run_migrations(self._conn)

    self._nxg_cache: nx.DiGraph | None = None
    self._cache_lock = threading.Lock()
|
|
173
|
+
|
|
174
|
+
def __enter__(self) -> "GraphStore":
    """Enter a ``with`` block; the store itself is the context value."""
    store = self
    return store
|
|
176
|
+
|
|
177
|
+
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    """Leave a ``with`` block by closing the store.

    Returns ``None``, so an exception raised inside the block propagates.
    """
    self.close()
    return None
|
|
179
|
+
|
|
180
|
+
def _init_schema(self) -> None:
    """Run the base schema script on the connection and commit."""
    connection = self._conn
    connection.executescript(_SCHEMA_SQL)
    connection.commit()
|
|
183
|
+
|
|
184
|
+
def _invalidate_cache(self) -> None:
    """Discard the cached NetworkX graph; meant to follow write operations."""
    lock = self._cache_lock
    with lock:
        self._nxg_cache = None
|
|
188
|
+
|
|
189
|
+
def close(self) -> None:
    """Close the underlying SQLite connection."""
    connection = self._conn
    connection.close()
|
|
191
|
+
|
|
192
|
+
# --- Write operations ---
|
|
193
|
+
|
|
194
|
+
def upsert_node(
    self, node: NodeInfo, file_hash: str = "", qualified: str | None = None
) -> int:
    """Insert *node*, or overwrite the row sharing its qualified name.

    Args:
        node: Parsed node whose fields are written to the ``nodes`` table.
        file_hash: Hash stored in the ``file_hash`` column.
        qualified: Explicit key to store the node under. The batch store
            functions pass it to keep same-named symbols of one file apart
            (see ``_qualified_names_for_file``); without it such collisions
            would be merged by ``ON CONFLICT(qualified_name) DO UPDATE`` and
            nodes would silently disappear. When omitted, the key comes
            from ``_make_qualified``.

    Returns:
        The ``id`` of the inserted or updated row.
    """
    timestamp = time.time()
    key = self._make_qualified(node) if qualified is None else qualified
    extra_json = "{}" if not node.extra else json.dumps(node.extra)

    values = (
        node.kind,
        node.name,
        key,
        node.file_path,
        node.line_start,
        node.line_end,
        node.language,
        node.parent_name,
        node.params,
        node.return_type,
        node.modifiers,
        int(node.is_test),
        file_hash,
        extra_json,
        timestamp,
    )
    connection = self._conn
    connection.execute(
        """INSERT INTO nodes
        (kind, name, qualified_name, file_path, line_start, line_end,
        language, parent_name, params, return_type, modifiers, is_test,
        file_hash, extra, updated_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ON CONFLICT(qualified_name) DO UPDATE SET
        kind=excluded.kind, name=excluded.name,
        file_path=excluded.file_path, line_start=excluded.line_start,
        line_end=excluded.line_end, language=excluded.language,
        parent_name=excluded.parent_name, params=excluded.params,
        return_type=excluded.return_type, modifiers=excluded.modifiers,
        is_test=excluded.is_test, file_hash=excluded.file_hash,
        extra=excluded.extra, updated_at=excluded.updated_at
        """,
        values,
    )
    # Look the id up by key: it is the same lookup for insert and update.
    stored = connection.execute(
        "SELECT id FROM nodes WHERE qualified_name = ?", (key,)
    ).fetchone()
    return stored["id"]
|
|
236
|
+
|
|
237
|
+
def upsert_edge(self, edge: EdgeInfo) -> int:
    """Insert *edge*, or refresh the matching row if one already exists.

    Two edges are the same row only when kind, source, target, file path
    and line all match, so separate call sites keep separate rows.
    ``confidence`` and ``confidence_tier`` are read from ``edge.extra``,
    defaulting to ``1.0`` and ``"EXTRACTED"``.

    Returns:
        The ``id`` of the updated or newly inserted row.
    """
    timestamp = time.time()
    details = edge.extra or {}
    confidence = float(details.get("confidence", 1.0))
    tier = str(details.get("confidence_tier", "EXTRACTED"))
    extra_json = json.dumps(details)
    connection = self._conn

    identity = (edge.kind, edge.source, edge.target, edge.file_path, edge.line)
    match = connection.execute(
        """SELECT id FROM edges
        WHERE kind=? AND source_qualified=? AND target_qualified=?
        AND file_path=? AND line=?""",
        identity,
    ).fetchone()

    if not match:
        connection.execute(
            """INSERT INTO edges
            (kind, source_qualified, target_qualified, file_path, line, extra,
            confidence, confidence_tier, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            identity + (extra_json, confidence, tier, timestamp),
        )
        return connection.execute("SELECT last_insert_rowid()").fetchone()[0]

    edge_id = match["id"]
    connection.execute(
        "UPDATE edges SET line=?, extra=?, confidence=?, confidence_tier=?,"
        " updated_at=? WHERE id=?",
        (edge.line, extra_json, confidence, tier, timestamp, edge_id),
    )
    return edge_id
|
|
270
|
+
|
|
271
|
+
def remove_file_data(self, file_path: str) -> None:
    """Delete every node and edge row recorded for *file_path*.

    No commit is issued here. The cached graph is invalidated afterwards.
    """
    params = (file_path,)
    statements = (
        "DELETE FROM nodes WHERE file_path = ?",
        "DELETE FROM edges WHERE file_path = ?",
    )
    for statement in statements:
        self._conn.execute(statement, params)
    self._invalidate_cache()
|
|
276
|
+
|
|
277
|
+
def _begin_immediate(self) -> None:
    """Open an IMMEDIATE transaction on the connection.

    If a transaction is still open it is rolled back first, with a warning
    logged (regression guard for #135 / #489).
    """
    connection = self._conn
    if connection.in_transaction:
        logger.warning("Rolling back uncommitted transaction before BEGIN IMMEDIATE")
        connection.rollback()
    connection.execute("BEGIN IMMEDIATE")
|
|
285
|
+
|
|
286
|
+
def store_file_nodes_edges(
    self, file_path: str, nodes: list[NodeInfo], edges: list[EdgeInfo], fhash: str = ""
) -> None:
    """Replace everything stored for one file inside a single transaction.

    Existing rows for *file_path* are removed, then *nodes* (each under its
    disambiguated qualified name) and *edges* are written. Any failure
    rolls the transaction back and re-raises; the graph cache is
    invalidated once the commit has succeeded.
    """
    self._begin_immediate()
    try:
        self.remove_file_data(file_path)
        keys = self._qualified_names_for_file(nodes)
        for item, key in zip(nodes, keys):
            self.upsert_node(item, file_hash=fhash, qualified=key)
        for link in edges:
            self.upsert_edge(link)
        self._conn.commit()
    except BaseException:
        self._conn.rollback()
        raise
    else:
        self._invalidate_cache()
|
|
303
|
+
|
|
304
|
+
def store_file_batch(
    self, batch: list[tuple[str, list[NodeInfo], list[EdgeInfo], str]]
) -> None:
    """Replace the stored data of several files in one transaction.

    Args:
        batch: ``(file_path, nodes, edges, file_hash)`` tuples, one per file.

    Either every file in the batch is written or, on any failure, the
    transaction is rolled back and the exception re-raised. The graph
    cache is invalidated once the commit has succeeded.
    """
    self._begin_immediate()
    try:
        for entry in batch:
            path, file_nodes, file_edges, digest = entry
            self.remove_file_data(path)
            keys = self._qualified_names_for_file(file_nodes)
            for item, key in zip(file_nodes, keys):
                self.upsert_node(item, file_hash=digest, qualified=key)
            for link in file_edges:
                self.upsert_edge(link)
        self._conn.commit()
    except BaseException:
        self._conn.rollback()
        raise
    else:
        self._invalidate_cache()
|
|
322
|
+
|
|
323
|
+
def set_metadata(self, key: str, value: str) -> None:
    """Store *value* under *key* in the metadata table, replacing any old value, and commit."""
    connection = self._conn
    connection.execute(
        "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)", (key, value)
    )
    connection.commit()
|
|
328
|
+
|
|
329
|
+
def get_metadata(self, key: str) -> Optional[str]:
    """Return the metadata value stored under *key*, or ``None`` when absent."""
    cursor = self._conn.execute("SELECT value FROM metadata WHERE key=?", (key,))
    record = cursor.fetchone()
    if not record:
        return None
    return record["value"]
|
|
332
|
+
|
|
333
|
+
def commit(self) -> None:
    """Commit the current transaction on the underlying connection."""
    connection = self._conn
    connection.commit()
|
|
335
|
+
|
|
336
|
+
def rollback(self) -> None:
    """Undo the current transaction on the underlying connection."""
    connection = self._conn
    connection.rollback()
|
|
339
|
+
|
|
340
|
+
# --- Read operations ---
|
|
341
|
+
|
|
342
|
+
def get_node(self, qualified_name: str) -> Optional[GraphNode]:
    """Return the node stored under *qualified_name*, or ``None`` if there is none."""
    cursor = self._conn.execute(
        "SELECT * FROM nodes WHERE qualified_name = ?", (qualified_name,)
    )
    record = cursor.fetchone()
    if not record:
        return None
    return self._row_to_node(record)
|
|
347
|
+
|
|
348
|
+
def get_nodes_by_file(self, file_path: str) -> list[GraphNode]:
    """Return every node whose ``file_path`` column equals *file_path*."""
    cursor = self._conn.execute(
        "SELECT * FROM nodes WHERE file_path = ?", (file_path,)
    )
    return list(map(self._row_to_node, cursor.fetchall()))
|
|
353
|
+
|
|
354
|
+
def get_all_nodes(self, exclude_files: bool = True) -> list[GraphNode]:
    """Return every stored node.

    Args:
        exclude_files: When true (the default), nodes of kind ``File`` are
            left out.
    """
    sql = (
        "SELECT * FROM nodes WHERE kind != 'File'"
        if exclude_files
        else "SELECT * FROM nodes"
    )
    records = self._conn.execute(sql).fetchall()
    return list(map(self._row_to_node, records))
|
|
363
|
+
|
|
364
|
+
def get_edges_by_source(self, qualified_name: str) -> list[GraphEdge]:
    """Return every edge whose source is *qualified_name* (outgoing edges)."""
    cursor = self._conn.execute(
        "SELECT * FROM edges WHERE source_qualified = ?", (qualified_name,)
    )
    return list(map(self._row_to_edge, cursor.fetchall()))
|
|
369
|
+
|
|
370
|
+
def get_edges_by_target(self, qualified_name: str) -> list[GraphEdge]:
    """Return every edge whose target is *qualified_name* (incoming edges)."""
    cursor = self._conn.execute(
        "SELECT * FROM edges WHERE target_qualified = ?", (qualified_name,)
    )
    return list(map(self._row_to_edge, cursor.fetchall()))
|
|
375
|
+
|
|
376
|
+
def search_edges_by_target_name(self, name: str, kind: str = "CALLS") -> list[GraphEdge]:
    """Find edges of *kind* whose stored target is exactly the plain *name*.

    CALLS edges frequently keep an unqualified target such as
    ``generateTestCode`` instead of ``file.ts::generateTestCode``. Matching
    on the bare name lets reverse call tracing (callers_of) still find
    those edges when a lookup by qualified name comes back empty.

    Args:
        name: Unqualified target name, compared for equality.
        kind: Edge kind to restrict the search to.
    """
    cursor = self._conn.execute(
        "SELECT * FROM edges WHERE target_qualified = ? AND kind = ?",
        (name, kind),
    )
    return list(map(self._row_to_edge, cursor.fetchall()))
|
|
390
|
+
|
|
391
|
+
def get_transitive_tests(
    self, qualified_name: str, max_depth: int = 1, max_frontier: int | None = None,
) -> list[dict]:
    """Find tests covering a node, including indirect (transitive) coverage.

    1. Direct: TESTED_BY edges targeting this node (and, when the node is a
       Class, the targets of its CONTAINS edges), plus a fallback that
       matches edges stored against the bare name after the last ``::``.
    2. Indirect: follow outgoing CALLS edges up to *max_depth* hops and
       collect the TESTED_BY edges of every callee reached.

    Args:
        qualified_name: Qualified name of the node to find tests for.
        max_depth: Number of CALLS hops to follow for indirect coverage.
        max_frontier: Cap on the number of callees kept per hop, to prevent
            O(N*M) query explosion on hub functions in large graphs.
            Defaults to the ``CRG_MAX_TRANSITIVE_FRONTIER`` env var
            (50 if unset). Negative values are treated as 0.

    Returns:
        One dict per distinct test with ``name``, ``qualified_name``,
        ``file_path``, ``kind`` and ``indirect`` (``False`` for direct
        coverage). Test names with no row in ``nodes`` are skipped.

    Raises:
        ValueError: If ``CRG_MAX_TRANSITIVE_FRONTIER`` is not an integer.
    """
    if max_frontier is None:
        max_frontier = int(os.environ.get("CRG_MAX_TRANSITIVE_FRONTIER", "50"))
    # A negative cap would otherwise slice off just the last element.
    max_frontier = max(0, max_frontier)

    conn = self._conn
    seen: set[str] = set()
    results: list[dict] = []

    def _collect_tests(target: str, indirect: bool) -> None:
        """Append each not-yet-seen test linked to *target* by TESTED_BY."""
        for edge_row in conn.execute(
            "SELECT source_qualified FROM edges "
            "WHERE target_qualified = ? AND kind = 'TESTED_BY'",
            (target,),
        ).fetchall():
            src = edge_row["source_qualified"]
            if src in seen:
                continue
            seen.add(src)
            node_row = conn.execute(
                "SELECT * FROM nodes WHERE qualified_name = ?", (src,)
            ).fetchone()
            if node_row:
                results.append({
                    "name": node_row["name"],
                    "qualified_name": node_row["qualified_name"],
                    "file_path": node_row["file_path"],
                    "kind": node_row["kind"],
                    "indirect": indirect,
                })

    # If the input is a class, expand to its methods first.
    input_qns = [qualified_name]
    kind_row = conn.execute(
        "SELECT kind FROM nodes WHERE qualified_name = ?",
        (qualified_name,),
    ).fetchone()
    if kind_row and kind_row["kind"] == "Class":
        input_qns.extend(
            member["target_qualified"]
            for member in conn.execute(
                "SELECT target_qualified FROM edges "
                "WHERE source_qualified = ? AND kind = 'CONTAINS'",
                (qualified_name,),
            ).fetchall()
        )

    # Direct TESTED_BY
    for qn in input_qns:
        _collect_tests(qn, indirect=False)

    # Bare-name fallback for direct
    _collect_tests(qualified_name.rsplit("::", 1)[-1], indirect=False)

    # Transitive: follow CALLS edges, then collect TESTED_BY on callees.
    # Nodes that were already expanded are skipped: their tests are already
    # in ``seen``, so revisiting them (e.g. on call cycles) would only use
    # up slots under the per-hop cap.
    visited: set[str] = set(input_qns)
    frontier: set[str] = set(input_qns)
    for _ in range(max_depth):
        if not frontier:
            break
        callees: set[str] = set()
        for qn in frontier:
            for call_row in conn.execute(
                "SELECT target_qualified FROM edges "
                "WHERE source_qualified = ? AND kind = 'CALLS'",
                (qn,),
            ).fetchall():
                callee = call_row["target_qualified"]
                if callee not in visited:
                    callees.add(callee)
        # Sort before truncating so the kept subset, and therefore the
        # result, does not depend on set iteration order.
        ordered = sorted(callees)[:max_frontier]
        for callee in ordered:
            _collect_tests(callee, indirect=True)
        frontier = set(ordered)
        visited |= frontier

    return results
|
|
496
|
+
|
|
497
|
+
def resolve_bare_call_targets(self) -> int:
    """Batch-resolve bare-name CALLS targets using the global node table.

    After parsing, some CALLS edges have bare targets (no ``::`` separator)
    because the parser couldn't resolve them cross-file. This method
    matches them against Function, Test and Class nodes by name and
    rewrites unambiguous matches in place.

    Disambiguation strategy:
    1. Single node with that name -> resolve directly.
    2. Multiple candidates -> resolve only when exactly one of them lives
       in a file imported by the source file (via IMPORTS_FROM edges).
       Otherwise the edge is left untouched.

    Because edges are modified, the cached graph is invalidated whenever at
    least one edge was resolved.

    Returns:
        The number of resolved edges.
    """
    conn = self._conn

    bare_edges = conn.execute(
        "SELECT id, source_qualified, target_qualified, file_path "
        "FROM edges WHERE kind = 'CALLS' AND target_qualified NOT LIKE '%::%'"
    ).fetchall()
    if not bare_edges:
        return 0

    # bare_name -> list of qualified_names
    node_lookup: dict[str, list[str]] = {}
    for row in conn.execute(
        "SELECT name, qualified_name FROM nodes "
        "WHERE kind IN ('Function', 'Test', 'Class')"
    ).fetchall():
        node_lookup.setdefault(row["name"], []).append(row["qualified_name"])

    # source_file -> set of imported files (for disambiguation)
    import_targets: dict[str, set[str]] = {}
    for row in conn.execute(
        "SELECT DISTINCT file_path, target_qualified FROM edges "
        "WHERE kind = 'IMPORTS_FROM'"
    ).fetchall():
        # The file part is whatever precedes the first "::"; a target
        # without a separator is taken whole.
        target_file = row["target_qualified"].split("::", 1)[0]
        import_targets.setdefault(row["file_path"], set()).add(target_file)

    updates: list[tuple[str, int]] = []
    for edge in bare_edges:
        candidates = node_lookup.get(edge["target_qualified"], [])
        if not candidates:
            continue

        if len(candidates) > 1:
            # Disambiguate via imports
            src_qn = edge["source_qualified"]
            src_file = (
                src_qn.split("::", 1)[0] if "::" in src_qn
                else edge["file_path"]
            )
            imported_files = import_targets.get(src_file, set())
            candidates = [
                c for c in candidates
                if c.split("::", 1)[0] in imported_files
            ]
            if len(candidates) != 1:
                continue

        updates.append((candidates[0], edge["id"]))

    resolved = len(updates)
    if resolved:
        conn.executemany(
            "UPDATE edges SET target_qualified = ? WHERE id = ?", updates
        )
        conn.commit()
        # Edge targets changed, so a previously cached graph is stale.
        self._invalidate_cache()
        logger.info("Resolved %d bare-name CALLS targets", resolved)
    return resolved
|
|
574
|
+
|
|
575
|
+
def get_all_files(self) -> list[str]:
    """Return the distinct ``file_path`` values of all nodes of kind ``File``."""
    cursor = self._conn.execute(
        "SELECT DISTINCT file_path FROM nodes WHERE kind = 'File'"
    )
    paths = []
    for record in cursor.fetchall():
        paths.append(record["file_path"])
    return paths
|
|
580
|
+
|
|
581
|
+
def search_nodes(self, query: str, limit: int = 20) -> list[GraphNode]:
    """Keyword search across node names.

    Tries FTS5 first (fast, tokenized matching), then falls back to
    LIKE-based substring search when FTS5 returns no results or the FTS
    query fails (for example because the ``nodes_fts`` table is missing on
    an older schema).

    In the fallback every whitespace-separated word must occur, case
    insensitively, in the node's name or qualified name. ``%`` and ``_``
    in the query are matched literally, so searching for ``get_node`` does
    not also return ``getXnode``.

    Args:
        query: Search text; an empty or whitespace-only query returns ``[]``.
        limit: Maximum number of nodes to return.
    """
    words = query.split()
    if not words:
        return []

    # Phase 1: FTS5 search (uses the indexed nodes_fts table)
    if len(words) == 1:
        fts_query = '"' + query.replace('"', '""') + '"'
    else:
        fts_query = " AND ".join(
            '"' + w.replace('"', '""') + '"' for w in words
        )
    try:
        rows = self._conn.execute(
            "SELECT n.* FROM nodes_fts f "
            "JOIN nodes n ON f.rowid = n.id "
            "WHERE nodes_fts MATCH ? LIMIT ?",
            (fts_query, limit),
        ).fetchall()
    except sqlite3.Error:
        # FTS5 table may not exist on older schemas; use the LIKE fallback.
        rows = []
    if rows:
        return [self._row_to_node(r) for r in rows]

    # Phase 2: LIKE fallback (substring matching)
    conditions: list[str] = []
    params: list[str | int] = []
    for word in words:
        # Escape the LIKE wildcards (and the escape character itself) so
        # the word is matched as a literal substring.
        literal = (
            word.lower()
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        pattern = f"%{literal}%"
        conditions.append(
            "(LOWER(name) LIKE ? ESCAPE '\\'"
            " OR LOWER(qualified_name) LIKE ? ESCAPE '\\')"
        )
        params.extend([pattern, pattern])

    where = " AND ".join(conditions)
    sql = f"SELECT * FROM nodes WHERE {where} LIMIT ?"  # nosec B608
    params.append(limit)
    rows = self._conn.execute(sql, params).fetchall()
    return [self._row_to_node(r) for r in rows]
|
|
625
|
+
|
|
626
|
+
# --- Impact / Graph traversal ---
|
|
627
|
+
|
|
628
|
+
def get_impact_radius(
    self,
    changed_files: list[str],
    max_depth: int = MAX_IMPACT_DEPTH,
    max_nodes: int = MAX_IMPACT_NODES,
) -> dict[str, Any]:
    """Collect the nodes impacted by *changed_files* within *max_depth* hops.

    This is a dispatcher. ``get_impact_radius_sql()`` is used by default
    (faster for large graphs); setting ``CRG_BFS_ENGINE=networkx`` selects
    the legacy Python-side BFS via NetworkX instead.

    Returns dict with:
    - changed_nodes: nodes in changed files
    - impacted_nodes: nodes reachable via edges
    - impacted_files: unique set of affected files
    - edges: connecting edges
    """
    engine = (
        self._get_impact_radius_networkx
        if BFS_ENGINE == "networkx"
        else self.get_impact_radius_sql
    )
    return engine(changed_files, max_depth=max_depth, max_nodes=max_nodes)
|
|
653
|
+
|
|
654
|
+
# -- SQLite recursive CTE version (default) ---------------------------
|
|
655
|
+
|
|
656
|
+
def get_impact_radius_sql(
    self,
    changed_files: list[str],
    max_depth: int = MAX_IMPACT_DEPTH,
    max_nodes: int = MAX_IMPACT_NODES,
) -> dict[str, Any]:
    """Impact radius via a SQLite recursive CTE.

    The nodes defined in ``changed_files`` are the seeds.  Starting from
    them, edges are followed in both directions (source -> target and
    target -> source) for at most ``max_depth`` hops.  The traversal runs
    inside SQLite, so the graph is never materialised in Python.

    Rows are ordered by their minimum depth before the ``LIMIT`` is applied,
    so seeds (depth 0) are always kept and truncation drops the farthest
    nodes first.  One extra sentinel row is requested so that truncation by
    the row limit can actually be detected.

    Args:
        changed_files: File paths whose nodes seed the traversal.
        max_depth: Maximum number of edge hops from any seed.
        max_nodes: Maximum number of impacted (non-seed) nodes returned.

    Returns:
        Dict with keys:
        - ``changed_nodes``: nodes found in the changed files.
        - ``impacted_nodes``: non-seed nodes reached, nearest first, capped
          at ``max_nodes``.
        - ``impacted_files``: unique file paths of ``impacted_nodes``.
        - ``edges``: edges whose endpoints are both in the returned set.
        - ``truncated``: True when the row limit or ``max_nodes`` cut the
          result short.
        - ``total_impacted``: number of impacted nodes fetched before the
          ``max_nodes`` cap (a lower bound when ``truncated`` is True).
    """
    # Seed qualified names.  An empty/None ``changed_files`` simply yields
    # no seeds, so a single guard below covers both "no files" and
    # "files without nodes".
    seeds: set[str] = set()
    for f in changed_files or ():
        for n in self.get_nodes_by_file(f):
            seeds.add(n.qualified_name)

    if not seeds:
        return {
            "changed_nodes": [],
            "impacted_nodes": [],
            "impacted_files": [],
            "edges": [],
            "truncated": False,
            "total_impacted": 0,
        }

    # Build recursive CTE — use a temp table for the seed set to
    # keep the query plan efficient and stay under variable limits.
    self._conn.execute(
        "CREATE TEMP TABLE IF NOT EXISTS _impact_seeds "
        "(qn TEXT PRIMARY KEY)"
    )
    self._conn.execute("DELETE FROM _impact_seeds")
    batch_size = 450
    seed_list = list(seeds)
    for i in range(0, len(seed_list), batch_size):
        batch = seed_list[i:i + batch_size]
        placeholders = ",".join("(?)" for _ in batch)
        self._conn.execute(  # nosec B608
            f"INSERT OR IGNORE INTO _impact_seeds (qn) VALUES {placeholders}",
            batch,
        )

    # GROUP BY already yields one row per node; ORDER BY makes the LIMIT
    # deterministic and nearest-first instead of keeping arbitrary rows.
    cte_sql = """
    WITH RECURSIVE impacted(node_qn, depth) AS (
        SELECT qn, 0 FROM _impact_seeds
        UNION
        SELECT e.target_qualified, i.depth + 1
        FROM impacted i
        JOIN edges e ON e.source_qualified = i.node_qn
        WHERE i.depth < ?
        UNION
        SELECT e.source_qualified, i.depth + 1
        FROM impacted i
        JOIN edges e ON e.target_qualified = i.node_qn
        WHERE i.depth < ?
    )
    SELECT node_qn, MIN(depth) AS min_depth
    FROM impacted
    GROUP BY node_qn
    ORDER BY min_depth, node_qn
    LIMIT ?
    """
    row_budget = max_nodes + len(seeds)
    rows = self._conn.execute(
        # +1 sentinel row: its presence proves the CTE had more to give.
        cte_sql, (max_depth, max_depth, row_budget + 1),
    ).fetchall()
    hit_row_limit = len(rows) > row_budget

    # Split into seeds vs impacted, remembering each impacted node's depth.
    depth_by_qn: dict[str, int] = {
        r[0]: r[1] for r in rows if r[0] not in seeds
    }
    impacted_qns: set[str] = set(depth_by_qn)

    # Batch-fetch nodes; names that appear only as edge endpoints and have
    # no row in ``nodes`` are silently absent from the fetched list.
    changed_nodes = self._batch_get_nodes(seeds)
    impacted_nodes = sorted(
        self._batch_get_nodes(impacted_qns),
        key=lambda node: (
            depth_by_qn.get(node.qualified_name, max_depth),
            node.qualified_name,
        ),
    )

    total_impacted = len(impacted_nodes)
    truncated = hit_row_limit or total_impacted > max_nodes
    if total_impacted > max_nodes:
        impacted_nodes = impacted_nodes[:max_nodes]

    impacted_files = list({n.file_path for n in impacted_nodes})

    relevant_edges: list[GraphEdge] = []
    all_qns = seeds | {n.qualified_name for n in impacted_nodes}
    if all_qns:
        relevant_edges = self.get_edges_among(all_qns)

    return {
        "changed_nodes": changed_nodes,
        "impacted_nodes": impacted_nodes,
        "impacted_files": impacted_files,
        "edges": relevant_edges,
        "truncated": truncated,
        "total_impacted": total_impacted,
    }
|
|
765
|
+
|
|
766
|
+
# -- NetworkX BFS version (legacy) ------------------------------------
|
|
767
|
+
|
|
768
|
+
def _get_impact_radius_networkx(
    self,
    changed_files: list[str],
    max_depth: int = MAX_IMPACT_DEPTH,
    max_nodes: int = MAX_IMPACT_NODES,
) -> dict[str, Any]:
    """Legacy level-by-level BFS over the in-memory NetworkX graph.

    Selected when ``CRG_BFS_ENGINE=networkx``.  Starting from the nodes of
    ``changed_files``, each level expands to both successors and
    predecessors.  Expansion stops after ``max_depth`` levels, or as soon as
    the visited set plus the next frontier would exceed ``max_nodes``.

    Returns:
        Dict with ``changed_nodes``, ``impacted_nodes`` (capped at
        ``max_nodes``), ``impacted_files``, ``edges``, ``truncated`` and
        ``total_impacted`` (count before the cap).
    """
    graph = self._build_networkx_graph()

    seeds: set[str] = set()
    for path in changed_files:
        seeds.update(node.qualified_name for node in self.get_nodes_by_file(path))

    seen: set[str] = set()
    reached: set[str] = set()
    current = seeds.copy()
    level = 0

    while current and level < max_depth:
        seen.update(current)
        upcoming: set[str] = set()
        for qn in current:
            if qn not in graph:
                continue
            # Outgoing side first, then incoming — same order as before.
            for succ in graph.neighbors(qn):
                if succ not in seen:
                    upcoming.add(succ)
                    reached.add(succ)
            for pred in graph.predecessors(qn):
                if pred not in seen:
                    upcoming.add(pred)
                    reached.add(pred)
        upcoming -= seen
        # Nodes already added to ``reached`` stay there even when we stop.
        if len(seen) + len(upcoming) > max_nodes:
            break
        current = upcoming
        level += 1

    changed_nodes = self._batch_get_nodes(seeds)
    impacted_nodes = self._batch_get_nodes(reached - seeds)

    total_impacted = len(impacted_nodes)
    truncated = total_impacted > max_nodes
    if truncated:
        impacted_nodes = impacted_nodes[:max_nodes]

    impacted_files = list({node.file_path for node in impacted_nodes})

    all_qns = seeds | {node.qualified_name for node in impacted_nodes}
    relevant_edges: list[GraphEdge] = (
        self.get_edges_among(all_qns) if all_qns else []
    )

    return {
        "changed_nodes": changed_nodes,
        "impacted_nodes": impacted_nodes,
        "impacted_files": impacted_files,
        "edges": relevant_edges,
        "truncated": truncated,
        "total_impacted": total_impacted,
    }
|
|
832
|
+
|
|
833
|
+
def get_subgraph(self, qualified_names: list[str]) -> dict[str, Any]:
    """Extract a subgraph containing the specified nodes and their connecting edges.

    Args:
        qualified_names: Qualified names of the nodes to include.  Repeated
            names are considered once, in first-seen order, so the result
            never contains the same node or edge twice because of a
            duplicated input.

    Returns:
        ``{"nodes": [...], "edges": [...]}`` where ``nodes`` holds every
        requested node that ``get_node`` could resolve and ``edges`` holds
        the edges sourced from a requested name whose target is also a
        requested name.
    """
    # dict.fromkeys de-duplicates while keeping the caller's order.
    unique_names = list(dict.fromkeys(qualified_names))
    wanted = set(unique_names)

    nodes = []
    for qn in unique_names:
        node = self.get_node(qn)
        if node:
            nodes.append(node)

    edges = []
    for qn in unique_names:
        edges.extend(
            e for e in self.get_edges_by_source(qn)
            if e.target_qualified in wanted
        )

    return {"nodes": nodes, "edges": edges}
|
|
849
|
+
|
|
850
|
+
def get_stats(self) -> GraphStats:
    """Summarise the graph: totals, per-kind counts, languages and file count.

    Runs aggregate queries against the ``nodes`` and ``edges`` tables and
    reads the ``last_updated`` metadata entry, then packs everything into a
    ``GraphStats``.
    """
    db = self._conn

    node_total = db.execute("SELECT COUNT(*) FROM nodes").fetchone()[0]
    edge_total = db.execute("SELECT COUNT(*) FROM edges").fetchone()[0]

    node_kind_counts: dict[str, int] = {
        rec["kind"]: rec["cnt"]
        for rec in db.execute("SELECT kind, COUNT(*) as cnt FROM nodes GROUP BY kind")
    }
    edge_kind_counts: dict[str, int] = {
        rec["kind"]: rec["cnt"]
        for rec in db.execute("SELECT kind, COUNT(*) as cnt FROM edges GROUP BY kind")
    }

    # NULL and empty language values are excluded by the query itself.
    known_languages = []
    for rec in db.execute(
        "SELECT DISTINCT language FROM nodes WHERE language IS NOT NULL AND language != ''"
    ):
        known_languages.append(rec["language"])

    file_total = db.execute(
        "SELECT COUNT(*) FROM nodes WHERE kind = 'File'"
    ).fetchone()[0]

    updated_at = self.get_metadata("last_updated")

    return GraphStats(
        total_nodes=node_total,
        total_edges=edge_total,
        nodes_by_kind=node_kind_counts,
        edges_by_kind=edge_kind_counts,
        languages=known_languages,
        files_count=file_total,
        last_updated=updated_at,
    )
|
|
884
|
+
|
|
885
|
+
def find_disambiguated_nodes(self) -> list[str]:
    """List qualified names that carry a same-file collision suffix.

    ``_qualified_names_for_file`` keeps the bare key for the first symbol
    and appends ``:L<line>`` to later symbols with the same key.  This
    method reports those suffixed names so the collisions become visible.

    The SQL ``GLOB`` is a coarse pre-filter; ``_DISAMBIGUATED_RE`` makes the
    final decision for each candidate.

    Returns:
        The matching qualified names, sorted ascending.
    """
    cursor = self._conn.execute(
        "SELECT qualified_name FROM nodes WHERE qualified_name GLOB '*:L[0-9]*'"
    )
    matches = [
        rec["qualified_name"]
        for rec in cursor.fetchall()
        if _DISAMBIGUATED_RE.search(rec["qualified_name"])
    ]
    matches.sort()
    return matches
|
|
899
|
+
|
|
900
|
+
def get_nodes_by_size(
    self,
    min_lines: int = 50,
    max_lines: int | None = None,
    kind: str | None = None,
    file_path_pattern: str | None = None,
    limit: int = 50,
) -> list[GraphNode]:
    """Return nodes whose line span falls in a range, biggest first.

    A node's size is ``line_end - line_start + 1``; nodes missing either
    bound are ignored.

    Args:
        min_lines: Smallest size to include (inclusive).
        max_lines: Largest size to include (inclusive); ``None`` disables
            the upper bound.
        kind: When truthy, keep only nodes of this kind.
        file_path_pattern: When truthy, keep only nodes whose ``file_path``
            matches ``%<pattern>%`` under SQL ``LIKE``.
        limit: Maximum number of rows returned.

    Returns:
        ``GraphNode`` objects ordered by size, descending.
    """
    clauses = [
        "line_start IS NOT NULL",
        "line_end IS NOT NULL",
        "(line_end - line_start + 1) >= ?",
    ]
    args: list = [min_lines]

    # (enabled?, SQL fragment, bound value) for each optional filter.
    optional_filters = (
        (max_lines is not None, "(line_end - line_start + 1) <= ?", max_lines),
        (bool(kind), "kind = ?", kind),
        (bool(file_path_pattern), "file_path LIKE ?", f"%{file_path_pattern}%"),
    )
    for enabled, fragment, value in optional_filters:
        if enabled:
            clauses.append(fragment)
            args.append(value)

    args.append(limit)
    where = " AND ".join(clauses)
    query = (
        f"SELECT * FROM nodes WHERE {where} "  # nosec B608
        "ORDER BY (line_end - line_start + 1) DESC LIMIT ?"
    )
    fetched = self._conn.execute(query, args).fetchall()
    return [self._row_to_node(rec) for rec in fetched]
|
|
945
|
+
|
|
946
|
+
# --- Public query helpers (used by flows, changes, communities, etc.) ---
|
|
947
|
+
|
|
948
|
+
def get_node_by_id(self, node_id: int) -> Optional[GraphNode]:
    """Look up one node by its integer ``id``; ``None`` when no row matches."""
    cursor = self._conn.execute("SELECT * FROM nodes WHERE id = ?", (node_id,))
    record = cursor.fetchone()
    if not record:
        return None
    return self._row_to_node(record)
|
|
954
|
+
|
|
955
|
+
def get_nodes_by_kind(
    self,
    kinds: list[str],
    file_pattern: str | None = None,
) -> list[GraphNode]:
    """Fetch nodes whose kind is one of *kinds*.

    Args:
        kinds: Node kind strings to accept (e.g. ``["Function", "Test"]``).
            An empty list short-circuits to an empty result.
        file_pattern: When truthy, additionally require ``file_path`` to
            match ``%<file_pattern>%`` under SQL ``LIKE``.

    Returns:
        The matching nodes as ``GraphNode`` objects.
    """
    if not kinds:
        return []

    marks = ",".join(["?"] * len(kinds))
    query = f"SELECT * FROM nodes WHERE kind IN ({marks})"  # nosec B608
    args: list[str] = [*kinds]
    if file_pattern:
        query += " AND file_path LIKE ?"
        args.append(f"%{file_pattern}%")

    fetched = self._conn.execute(query, args).fetchall()
    return [self._row_to_node(rec) for rec in fetched]
|
|
981
|
+
|
|
982
|
+
def count_flow_memberships(self, node_id: int) -> int:
    """Count the ``flow_memberships`` rows that reference *node_id*."""
    query = (
        "SELECT COUNT(*) as cnt FROM flow_memberships "
        "WHERE node_id = ?"
    )
    record = self._conn.execute(query, (node_id,)).fetchone()
    if record:
        return record["cnt"]
    return 0
|
|
990
|
+
|
|
991
|
+
def get_flow_criticalities_for_node(self, node_id: int) -> list[float]:
    """Collect the ``criticality`` of every flow that contains *node_id*.

    Joins ``flows`` to ``flow_memberships`` and returns one value per
    membership row; an empty list means the node is in no flow.
    """
    query = (
        "SELECT f.criticality FROM flows f "
        "JOIN flow_memberships fm ON fm.flow_id = f.id "
        "WHERE fm.node_id = ?"
    )
    fetched = self._conn.execute(query, (node_id,)).fetchall()
    values = []
    for rec in fetched:
        values.append(rec["criticality"])
    return values
|
|
1000
|
+
|
|
1001
|
+
def get_node_community_id(self, node_id: int) -> int | None:
    """Return a node's ``community_id``.

    ``None`` is returned both when the node does not exist and when its
    ``community_id`` column is NULL.
    """
    record = self._conn.execute(
        "SELECT community_id FROM nodes WHERE id = ?",
        (node_id,),
    ).fetchone()
    # A NULL column already reads back as None, so one guard is enough.
    return record["community_id"] if record else None
|
|
1010
|
+
|
|
1011
|
+
def get_community_ids_by_qualified_names(
    self, qns: list[str],
) -> dict[str, int | None]:
    """Map qualified names to their ``community_id`` in batched queries.

    Names are looked up 450 at a time to stay under SQLite's bound-variable
    limit.  Names with no matching node are absent from the result; a node
    without a community maps to ``None``.
    """
    chunk = 450
    mapping: dict[str, int | None] = {}
    for start in range(0, len(qns), chunk):
        names = qns[start:start + chunk]
        marks = ",".join(["?"] * len(names))
        fetched = self._conn.execute(  # nosec B608
            "SELECT qualified_name, community_id FROM nodes "
            f"WHERE qualified_name IN ({marks})",
            names,
        ).fetchall()
        mapping.update(
            (rec["qualified_name"], rec["community_id"]) for rec in fetched
        )
    return mapping
|
|
1032
|
+
|
|
1033
|
+
def get_files_matching(self, pattern: str) -> list[str]:
    """List distinct ``file_path`` values that end with *pattern*.

    The match is SQL ``LIKE`` against ``%<pattern>``, i.e. a suffix match.
    """
    query = (
        "SELECT DISTINCT file_path FROM nodes "
        "WHERE file_path LIKE ?"
    )
    fetched = self._conn.execute(query, (f"%{pattern}",)).fetchall()
    paths = []
    for rec in fetched:
        paths.append(rec["file_path"])
    return paths
|
|
1041
|
+
|
|
1042
|
+
def get_nodes_without_signature(self) -> list[sqlite3.Row]:
    """Fetch raw rows for every node whose ``signature`` is still NULL.

    Each row exposes ``id``, ``name``, ``kind``, ``params`` and
    ``return_type``.
    """
    query = (
        "SELECT id, name, kind, params, return_type "
        "FROM nodes WHERE signature IS NULL"
    )
    cursor = self._conn.execute(query)
    return cursor.fetchall()
|
|
1048
|
+
|
|
1049
|
+
def update_node_signature(
    self, node_id: int, signature: str,
) -> None:
    """Write *signature* into the ``signature`` column of node *node_id*.

    Only the ``UPDATE`` is issued here; no commit is performed by this
    method.
    """
    statement = "UPDATE nodes SET signature = ? WHERE id = ?"
    bindings = (signature, node_id)
    self._conn.execute(statement, bindings)
|
|
1057
|
+
|
|
1058
|
+
def get_all_community_ids(self) -> dict[str, int | None]:
    """Map every node's qualified name to its ``community_id``.

    Used primarily by the visualization exporter.  If the query fails with
    ``sqlite3.OperationalError`` (for example because the ``community_id``
    column is missing on an old schema) the failure is logged at debug
    level and an empty dict is returned.
    """
    try:
        fetched = self._conn.execute(
            "SELECT qualified_name, community_id FROM nodes"
        ).fetchall()
    except sqlite3.OperationalError as exc:
        # community_id column may not exist yet on pre-v6 schemas
        logger.debug("Community IDs unavailable (schema not yet migrated): %s", exc)
        return {}

    mapping = {}
    for rec in fetched:
        mapping[rec["qualified_name"]] = rec["community_id"]
    return mapping
|
|
1075
|
+
|
|
1076
|
+
def get_node_ids_by_files(
    self, file_paths: list[str],
) -> set[int]:
    """Collect the ids of all nodes located in any of *file_paths*.

    Paths are queried 450 at a time to stay under SQLite's bound-variable
    limit.  An empty input returns an empty set without touching the
    database.
    """
    if not file_paths:
        return set()

    chunk = 450
    found: set[int] = set()
    for start in range(0, len(file_paths), chunk):
        paths = file_paths[start:start + chunk]
        marks = ",".join(["?"] * len(paths))
        fetched = self._conn.execute(  # nosec B608
            "SELECT id FROM nodes "
            f"WHERE file_path IN ({marks})",
            paths,
        ).fetchall()
        for rec in fetched:
            found.add(rec["id"])
    return found
|
|
1094
|
+
|
|
1095
|
+
def get_flow_ids_by_node_ids(
    self, node_ids: set[int],
) -> list[int]:
    """List the distinct flow ids that include at least one of *node_ids*.

    Node ids are queried 450 at a time.  ``DISTINCT`` only de-duplicates
    within one batch, so ids are also de-duplicated across batches, keeping
    the first occurrence.
    """
    if not node_ids:
        return []

    chunk = 450
    pending = list(node_ids)
    # Insertion-ordered dict doubles as an ordered set of flow ids.
    ordered: dict[int, None] = {}
    for start in range(0, len(pending), chunk):
        ids = pending[start:start + chunk]
        marks = ",".join(["?"] * len(ids))
        fetched = self._conn.execute(  # nosec B608
            "SELECT DISTINCT flow_id FROM flow_memberships "
            f"WHERE node_id IN ({marks})",
            ids,
        ).fetchall()
        for rec in fetched:
            ordered.setdefault(rec["flow_id"], None)
    return list(ordered)
|
|
1115
|
+
|
|
1116
|
+
def get_flow_qualified_names(self, flow_id: int) -> set[str]:
    """Return the qualified names of all nodes that belong to *flow_id*.

    Joins ``flow_memberships`` to ``nodes`` on the node id.
    """
    query = (
        "SELECT n.qualified_name FROM flow_memberships fm "
        "JOIN nodes n ON fm.node_id = n.id WHERE fm.flow_id = ?"
    )
    fetched = self._conn.execute(query, (flow_id,)).fetchall()
    members: set[str] = set()
    for rec in fetched:
        members.add(rec["qualified_name"])
    return members
|
|
1124
|
+
|
|
1125
|
+
def get_node_kind_by_id(self, node_id: int) -> str | None:
    """Read only the ``kind`` of node *node_id*; ``None`` if it does not exist."""
    cursor = self._conn.execute(
        "SELECT kind FROM nodes WHERE id = ?", (node_id,),
    )
    record = cursor.fetchone()
    if not record:
        return None
    return record["kind"]
|
|
1131
|
+
|
|
1132
|
+
def get_all_call_targets(self, include_file_sources: bool = True) -> set[str]:
    """Collect the target qualified names of all ``CALLS`` edges.

    Args:
        include_file_sources: When False, ``CALLS`` edges whose source is a
            ``File`` node (module-scope calls) are left out.  Callers that
            read "has an incoming call" as "is not a root", such as
            entry-point detection, should pass False so that a callee used
            only from script-level code is not hidden.

    Returns:
        The set of ``target_qualified`` values of the selected edges.

    The File filter joins on ``nodes.kind`` instead of pattern-matching
    ``source_qualified``, so it does not depend on how File nodes are named.
    Edges whose source has no row in ``nodes`` are kept.
    """
    if include_file_sources:
        query = (
            "SELECT DISTINCT target_qualified FROM edges "
            "WHERE kind = 'CALLS'"
        )
    else:
        query = (
            "SELECT DISTINCT e.target_qualified FROM edges e "
            "LEFT JOIN nodes n ON n.qualified_name = e.source_qualified "
            "WHERE e.kind = 'CALLS' "
            "AND (n.kind IS NULL OR n.kind != 'File')"
        )
    fetched = self._conn.execute(query).fetchall()
    return {rec["target_qualified"] for rec in fetched}
|
|
1160
|
+
|
|
1161
|
+
def get_communities_list(
    self,
) -> list[sqlite3.Row]:
    """Fetch the ``id`` and ``name`` of every row in ``communities``.

    If the query fails with ``sqlite3.OperationalError`` (for example
    because the table is missing on an old schema) the failure is logged at
    debug level and an empty list is returned.
    """
    try:
        cursor = self._conn.execute("SELECT id, name FROM communities")
        fetched = cursor.fetchall()
    except sqlite3.OperationalError as exc:
        # communities table doesn't exist yet on pre-v4 schemas
        logger.debug("Communities list unavailable (table missing): %s", exc)
        return []
    return fetched
|
|
1173
|
+
|
|
1174
|
+
def get_community_member_qns(
    self, community_id: int,
) -> list[str]:
    """List the qualified names of the nodes assigned to *community_id*."""
    query = (
        "SELECT qualified_name FROM nodes "
        "WHERE community_id = ?"
    )
    fetched = self._conn.execute(query, (community_id,)).fetchall()
    members = []
    for rec in fetched:
        members.append(rec["qualified_name"])
    return members
|
|
1184
|
+
|
|
1185
|
+
def get_nodes_by_community_id(
    self, community_id: int,
) -> list[GraphNode]:
    """Load every node assigned to *community_id* as a ``GraphNode``."""
    cursor = self._conn.execute(
        "SELECT * FROM nodes WHERE community_id = ?",
        (community_id,),
    )
    members = []
    for rec in cursor.fetchall():
        members.append(self._row_to_node(rec))
    return members
|
|
1194
|
+
|
|
1195
|
+
def get_outgoing_targets(
    self, source_qns: list[str],
) -> list[str]:
    """List ``target_qualified`` of every edge leaving one of *source_qns*.

    Sources are queried 450 at a time.  One entry is returned per edge, so
    a target reached by several edges appears several times.
    """
    chunk = 450
    targets: list[str] = []
    for start in range(0, len(source_qns), chunk):
        sources = source_qns[start:start + chunk]
        marks = ",".join(["?"] * len(sources))
        fetched = self._conn.execute(  # nosec B608
            "SELECT target_qualified FROM edges "
            f"WHERE source_qualified IN ({marks})",
            sources,
        ).fetchall()
        for rec in fetched:
            targets.append(rec["target_qualified"])
    return targets
|
|
1211
|
+
|
|
1212
|
+
def get_incoming_sources(
    self, target_qns: list[str],
) -> list[str]:
    """List ``source_qualified`` of every edge arriving at one of *target_qns*.

    Targets are queried 450 at a time.  One entry is returned per edge, so
    a source with several matching edges appears several times.
    """
    chunk = 450
    sources: list[str] = []
    for start in range(0, len(target_qns), chunk):
        targets = target_qns[start:start + chunk]
        marks = ",".join(["?"] * len(targets))
        fetched = self._conn.execute(  # nosec B608
            "SELECT source_qualified FROM edges "
            f"WHERE target_qualified IN ({marks})",
            targets,
        ).fetchall()
        for rec in fetched:
            sources.append(rec["source_qualified"])
    return sources
|
|
1228
|
+
|
|
1229
|
+
# --- Public edge access (for visualization etc.) ---
|
|
1230
|
+
|
|
1231
|
+
def get_all_edges(self) -> list[GraphEdge]:
    """Load every row of the ``edges`` table as a ``GraphEdge``."""
    fetched = self._conn.execute("SELECT * FROM edges").fetchall()
    loaded = []
    for rec in fetched:
        loaded.append(self._row_to_edge(rec))
    return loaded
|
|
1235
|
+
|
|
1236
|
+
def get_edges_among(self, qualified_names: set[str]) -> list[GraphEdge]:
    """Return the edges whose source and target are both in *qualified_names*.

    The source side is filtered in SQL, 450 names per query, to stay under
    SQLite's default ``SQLITE_MAX_VARIABLE_NUMBER``.  The target side is
    then filtered in Python on the converted edge objects.
    """
    if not qualified_names:
        return []

    chunk = 450  # Stay well under SQLite's default 999 limit
    names = list(qualified_names)
    kept: list[GraphEdge] = []
    for start in range(0, len(names), chunk):
        sources = names[start:start + chunk]
        marks = ",".join(["?"] * len(sources))
        fetched = self._conn.execute(  # nosec B608
            f"SELECT * FROM edges WHERE source_qualified IN ({marks})",
            sources,
        ).fetchall()
        # Every row is converted first, then tested, as in the loop form.
        converted = (self._row_to_edge(rec) for rec in fetched)
        kept.extend(
            edge for edge in converted
            if edge.target_qualified in qualified_names
        )
    return kept
|
|
1259
|
+
|
|
1260
|
+
def _batch_get_nodes(self, qualified_names: set[str]) -> list[GraphNode]:
    """Load the nodes named in *qualified_names*, 450 names per query.

    Batching keeps each statement under SQLite's bound-variable limit.
    Names without a matching row are simply missing from the result.
    """
    if not qualified_names:
        return []

    chunk = 450
    names = list(qualified_names)
    loaded: list[GraphNode] = []
    for start in range(0, len(names), chunk):
        wanted = names[start:start + chunk]
        marks = ",".join(["?"] * len(wanted))
        fetched = self._conn.execute(  # nosec B608
            f"SELECT * FROM nodes WHERE qualified_name IN ({marks})",
            wanted,
        ).fetchall()
        for rec in fetched:
            loaded.append(self._row_to_node(rec))
    return loaded
|
|
1276
|
+
|
|
1277
|
+
def load_flow_adjacency(self) -> "FlowAdjacency":
    """Snapshot nodes and CALLS/TESTED_BY edges into an in-memory structure.

    Two queries are issued: one over all of ``nodes`` and one over the
    ``CALLS`` and ``TESTED_BY`` rows of ``edges``.  The result lets flow
    tracing and criticality scoring work from memory instead of issuing a
    SQLite point query per lookup.

    Returns:
        A ``FlowAdjacency`` holding:
        - ``calls_out``: source qualified name -> list of CALLS targets.
        - ``has_tested_by``: targets of TESTED_BY edges.
        - ``nodes_by_qn`` / ``nodes_by_id``: node lookup tables.
    """
    by_qn: dict[str, GraphNode] = {}
    by_id: dict[int, GraphNode] = {}
    for rec in self._conn.execute("SELECT * FROM nodes"):
        loaded = self._row_to_node(rec)
        by_qn[loaded.qualified_name] = loaded
        by_id[loaded.id] = loaded

    outgoing_calls: dict[str, list[str]] = {}
    tested_targets: set[str] = set()
    edge_cursor = self._conn.execute(
        "SELECT kind, source_qualified, target_qualified FROM edges "
        "WHERE kind IN ('CALLS', 'TESTED_BY')"
    )
    for rec in edge_cursor:
        source = rec["source_qualified"]
        target = rec["target_qualified"]
        if rec["kind"] != "CALLS":
            # The query admits only two kinds, so this is TESTED_BY.
            tested_targets.add(target)
            continue
        outgoing_calls.setdefault(source, []).append(target)

    return FlowAdjacency(
        calls_out=outgoing_calls,
        has_tested_by=tested_targets,
        nodes_by_qn=by_qn,
        nodes_by_id=by_id,
    )
|
|
1312
|
+
|
|
1313
|
+
# --- Internal helpers ---
|
|
1314
|
+
|
|
1315
|
+
def _build_networkx_graph(self) -> nx.DiGraph:
    """Return the directed NetworkX graph of all edges, building it once.

    The check, the build and the cache store all happen while
    ``self._cache_lock`` is held.  Each edge row becomes one graph edge
    carrying its ``kind`` as an attribute.
    """
    with self._cache_lock:
        cached = self._nxg_cache
        if cached is not None:
            return cached

        graph: nx.DiGraph = nx.DiGraph()
        for rec in self._conn.execute("SELECT * FROM edges").fetchall():
            graph.add_edge(
                rec["source_qualified"],
                rec["target_qualified"],
                kind=rec["kind"],
            )
        self._nxg_cache = graph
        return graph
|
|
1326
|
+
|
|
1327
|
+
def _make_qualified(self, node: NodeInfo) -> str:
    """Derive the base qualified name of *node*.

    ``File`` nodes use their path as-is.  Other nodes use
    ``<file_path>::<parent_name>.<name>`` when they have a parent and
    ``<file_path>::<name>`` otherwise.
    """
    if node.kind == "File":
        return node.file_path
    owner = node.parent_name
    symbol = f"{owner}.{node.name}" if owner else f"{node.name}"
    return f"{node.file_path}::{symbol}"
|
|
1333
|
+
|
|
1334
|
+
def _qualified_names_for_file(self, nodes: list[NodeInfo]) -> list[str]:
    """Assign a unique qualified name to each node of one file.

    The first node producing a given key keeps the bare key, so edges that
    reference the parser-computed key still resolve to it.  A later node
    with the same key gets ``:L<line_start>`` appended.  If that suffixed
    name is taken as well, ``#<index>`` (the node's position in *nodes*) is
    added on top.

    Returns:
        One name per input node, in input order.
    """
    taken: set[str] = set()
    resolved: list[str] = []
    for position, node in enumerate(nodes):
        base = self._make_qualified(node)
        chosen = base
        if base in taken:
            chosen = f"{base}:L{node.line_start}"
            if chosen in taken:
                chosen = f"{base}:L{node.line_start}#{position}"
        taken.add(chosen)
        resolved.append(chosen)
    return resolved
|
|
1356
|
+
|
|
1357
|
+
def _row_to_node(self, row: sqlite3.Row) -> GraphNode:
    """Convert one ``nodes`` row into a ``GraphNode``.

    Most columns are copied unchanged.  ``language`` falls back to ``""``
    when empty or NULL, ``is_test`` is coerced to ``bool``, and ``extra`` is
    decoded from JSON (``{}`` when empty or NULL).
    """
    copied_columns = (
        "id", "kind", "name", "qualified_name", "file_path",
        "line_start", "line_end", "parent_name", "params",
        "return_type", "file_hash",
    )
    fields = {column: row[column] for column in copied_columns}
    fields["language"] = row["language"] or ""
    fields["is_test"] = bool(row["is_test"])
    raw_extra = row["extra"]
    fields["extra"] = json.loads(raw_extra) if raw_extra else {}
    return GraphNode(**fields)
|
|
1374
|
+
|
|
1375
|
+
def _row_to_edge(self, row: sqlite3.Row) -> GraphEdge:
    """Convert one ``edges`` row into a ``GraphEdge``.

    ``extra`` is decoded from JSON (``{}`` when empty or NULL).  The
    ``confidence`` and ``confidence_tier`` columns are optional: when a row
    lacks them the defaults ``1.0`` and ``"EXTRACTED"`` are used.
    """
    columns = row.keys()
    raw_extra = row["extra"]
    decoded_extra = json.loads(raw_extra) if raw_extra else {}

    if "confidence" in columns:
        confidence = row["confidence"]
    else:
        confidence = 1.0

    if "confidence_tier" in columns:
        confidence_tier = row["confidence_tier"]
    else:
        confidence_tier = "EXTRACTED"

    return GraphEdge(
        id=row["id"],
        kind=row["kind"],
        source_qualified=row["source_qualified"],
        target_qualified=row["target_qualified"],
        file_path=row["file_path"],
        line=row["line"],
        extra=decoded_extra,
        confidence=confidence,
        confidence_tier=confidence_tier,
    )
|
|
1390
|
+
|
|
1391
|
+
|
|
1392
|
+
def _sanitize_name(s: str, max_len: int = 256) -> str:
    """Strip ASCII control characters and truncate to prevent prompt injection.

    Node names extracted from source code could contain adversarial strings
    (e.g. ``IGNORE_ALL_PREVIOUS_INSTRUCTIONS``).  This function removes the
    ASCII control characters — 0x00-0x1F except tab and newline, plus DEL
    (0x7F) — and enforces a length limit so that names flowing through MCP
    tool responses cannot easily influence AI agent behaviour.

    Args:
        s: The raw name.
        max_len: Maximum number of characters kept after stripping.

    Returns:
        The cleaned string, at most ``max_len`` characters long.
    """
    # Keep \t (0x09) and \n (0x0A); drop every other C0 control and DEL.
    cleaned = "".join(
        ch for ch in s
        if ch in ("\t", "\n") or (ord(ch) >= 0x20 and ch != "\x7f")
    )
    return cleaned[:max_len]
|
|
1407
|
+
|
|
1408
|
+
|
|
1409
|
+
def node_to_dict(n: GraphNode) -> dict:
    """Serialise a ``GraphNode`` into a plain dict.

    ``name``, ``qualified_name`` and a truthy ``parent_name`` are passed
    through ``_sanitize_name``; all other fields are copied as-is.
    """
    parent = n.parent_name
    payload = {
        "id": n.id,
        "kind": n.kind,
        "name": _sanitize_name(n.name),
        "qualified_name": _sanitize_name(n.qualified_name),
        "file_path": n.file_path,
        "line_start": n.line_start,
        "line_end": n.line_end,
        "language": n.language,
    }
    # A falsy parent (None or "") is emitted unchanged.
    payload["parent_name"] = _sanitize_name(parent) if parent else parent
    payload["is_test"] = n.is_test
    return payload
|
|
1418
|
+
|
|
1419
|
+
|
|
1420
|
+
def edge_to_dict(e: GraphEdge) -> dict:
    """Serialise a ``GraphEdge`` into a plain dict.

    The endpoints are exposed as ``source`` and ``target`` after passing
    through ``_sanitize_name``; all other fields are copied as-is.
    """
    source = _sanitize_name(e.source_qualified)
    target = _sanitize_name(e.target_qualified)
    return {
        "id": e.id,
        "kind": e.kind,
        "source": source,
        "target": target,
        "file_path": e.file_path,
        "line": e.line,
        "confidence": e.confidence,
        "confidence_tier": e.confidence_tier,
    }
|