code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1427 @@
1
+ """SQLite-backed knowledge graph storage and query engine.
2
+
3
+ Stores code structure as nodes (File, Class, Function, Type, Test) and
4
+ edges (CALLS, IMPORTS_FROM, INHERITS, IMPLEMENTS, CONTAINS, TESTED_BY, DEPENDS_ON, REFERENCES).
5
+ Supports impact-radius queries and subgraph extraction.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ import os
13
+ import re
14
+ import sqlite3
15
+ import threading
16
+ import time
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Any, Optional
20
+
21
+ import networkx as nx
22
+
23
+ from .constants import BFS_ENGINE, MAX_IMPACT_DEPTH, MAX_IMPACT_NODES
24
+ from .migrations import get_schema_version, run_migrations
25
+ from .parser import EdgeInfo, NodeInfo
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ # Suffix appended by _qualified_names_for_file to a 2nd+ same-file collision,
30
+ # e.g. "path::getById:L394". This regex recognizes those keys after the fact.
31
+ _DISAMBIGUATED_RE = re.compile(r":L\d+(?:#\d+)?$")
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Schema
35
+ # ---------------------------------------------------------------------------
36
+
37
+ _SCHEMA_SQL = """
38
+ CREATE TABLE IF NOT EXISTS nodes (
39
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
40
+ kind TEXT NOT NULL, -- File, Class, Function, Type, Test
41
+ name TEXT NOT NULL,
42
+ qualified_name TEXT NOT NULL UNIQUE,
43
+ file_path TEXT NOT NULL,
44
+ line_start INTEGER,
45
+ line_end INTEGER,
46
+ language TEXT,
47
+ parent_name TEXT,
48
+ params TEXT,
49
+ return_type TEXT,
50
+ modifiers TEXT,
51
+ is_test INTEGER DEFAULT 0,
52
+ file_hash TEXT,
53
+ extra TEXT DEFAULT '{}',
54
+ updated_at REAL NOT NULL
55
+ );
56
+
57
+ CREATE TABLE IF NOT EXISTS edges (
58
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
59
+ kind TEXT NOT NULL, -- CALLS, IMPORTS_FROM, INHERITS, REFERENCES, etc.
60
+ source_qualified TEXT NOT NULL,
61
+ target_qualified TEXT NOT NULL,
62
+ file_path TEXT NOT NULL,
63
+ line INTEGER DEFAULT 0,
64
+ extra TEXT DEFAULT '{}',
65
+ confidence REAL DEFAULT 1.0,
66
+ confidence_tier TEXT DEFAULT 'EXTRACTED',
67
+ updated_at REAL NOT NULL
68
+ );
69
+
70
+ CREATE TABLE IF NOT EXISTS metadata (
71
+ key TEXT PRIMARY KEY,
72
+ value TEXT NOT NULL
73
+ );
74
+
75
+ CREATE INDEX IF NOT EXISTS idx_nodes_file ON nodes(file_path);
76
+ CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);
77
+ CREATE INDEX IF NOT EXISTS idx_nodes_qualified ON nodes(qualified_name);
78
+ CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source_qualified);
79
+ CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target_qualified);
80
+ CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind);
81
+ CREATE INDEX IF NOT EXISTS idx_edges_target_kind ON edges(target_qualified, kind);
82
+ CREATE INDEX IF NOT EXISTS idx_edges_source_kind ON edges(source_qualified, kind);
83
+ CREATE INDEX IF NOT EXISTS idx_edges_file ON edges(file_path);
84
+ """
85
+
86
+
87
@dataclass
class GraphNode:
    """A graph node as handed back by the read APIs of the store.

    The fields correspond to columns of the ``nodes`` table; the
    ``modifiers`` and ``updated_at`` columns have no counterpart here.
    """

    id: int  # INTEGER PRIMARY KEY AUTOINCREMENT in the schema
    kind: str  # File, Class, Function, Type, Test
    name: str
    qualified_name: str  # UNIQUE in the schema
    file_path: str
    line_start: int
    line_end: int
    language: str
    parent_name: str | None
    params: str | None
    return_type: str | None
    is_test: bool
    file_hash: str | None
    extra: dict
103
+
104
+
105
@dataclass
class GraphEdge:
    """A directed, typed relation between two qualified names.

    The fields correspond to columns of the ``edges`` table
    (``updated_at`` has no counterpart here).
    """

    id: int  # INTEGER PRIMARY KEY AUTOINCREMENT in the schema
    kind: str  # CALLS, IMPORTS_FROM, INHERITS, REFERENCES, etc.
    source_qualified: str
    target_qualified: str
    file_path: str
    line: int
    extra: dict
    # Same defaults as the column defaults in the schema.
    confidence: float = 1.0
    confidence_tier: str = "EXTRACTED"
116
+
117
+
118
@dataclass
class FlowAdjacency:
    """Adjacency data held in memory for flow tracing.

    Built once by :meth:`GraphStore.load_flow_adjacency` and handed to
    ``trace_flows`` / ``compute_criticality`` so that large graphs do not
    trigger a SQLite point query per edge.
    """

    calls_out: dict[str, list[str]]
    has_tested_by: set[str]
    nodes_by_qn: dict[str, "GraphNode"]
    nodes_by_id: dict[int, "GraphNode"]
130
+
131
+
132
@dataclass
class GraphStats:
    """Aggregate counters describing the stored graph."""

    total_nodes: int
    total_edges: int
    nodes_by_kind: dict[str, int]
    edges_by_kind: dict[str, int]
    languages: list[str]
    files_count: int
    last_updated: str | None
141
+
142
+
143
+ # ---------------------------------------------------------------------------
144
+ # GraphStore
145
+ # ---------------------------------------------------------------------------
146
+
147
+
148
+ class GraphStore:
149
+ """SQLite-backed code knowledge graph."""
150
+
151
def __init__(self, db_path: str | Path) -> None:
    """Open the graph database at *db_path*, creating it when missing.

    Creates the parent directory, opens the connection, switches it to
    WAL journaling with a 5000 ms busy timeout, applies the base schema,
    stamps a fresh database with ``schema_version`` 1 and finally runs
    the pending migrations.
    """
    self.db_path = Path(db_path)
    self.db_path.parent.mkdir(parents=True, exist_ok=True)

    self._conn = conn = sqlite3.connect(
        str(self.db_path),
        timeout=30,
        check_same_thread=False,
        isolation_level=None,  # Disable implicit transactions (#135)
    )
    conn.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA busy_timeout=5000"):
        conn.execute(pragma)

    self._init_schema()

    # A version below 1 means the metadata table was only just created by
    # _init_schema, so record the baseline before migrating.
    needs_baseline = get_schema_version(conn) < 1
    if needs_baseline:
        conn.execute(
            "INSERT OR IGNORE INTO metadata (key, value) "
            "VALUES ('schema_version', '1')"
        )
        conn.commit()
    run_migrations(conn)

    self._nxg_cache: nx.DiGraph | None = None
    self._cache_lock = threading.Lock()
173
+
174
+ def __enter__(self) -> "GraphStore":
175
+ return self
176
+
177
+ def __exit__(self, exc_type, exc_val, exc_tb) -> None:
178
+ self.close()
179
+
180
def _init_schema(self) -> None:
    """Run the ``_SCHEMA_SQL`` script and commit.

    Every statement in the script is ``CREATE ... IF NOT EXISTS``, so
    running it against an existing database leaves it unchanged.
    """
    conn = self._conn
    conn.executescript(_SCHEMA_SQL)
    conn.commit()
183
+
184
+ def _invalidate_cache(self) -> None:
185
+ """Invalidate the cached NetworkX graph after write operations."""
186
+ with self._cache_lock:
187
+ self._nxg_cache = None
188
+
189
def close(self) -> None:
    """Close the underlying SQLite connection."""
    self._conn.close()
191
+
192
+ # --- Write operations ---
193
+
194
def upsert_node(
    self, node: NodeInfo, file_hash: str = "", qualified: str | None = None
) -> int:
    """Write *node* to the ``nodes`` table and return its row id.

    The row is keyed by its qualified name: a new key inserts, an existing
    key has every other column overwritten.  Callers that must keep
    same-named symbols of one file apart (the batch store functions, see
    ``_qualified_names_for_file``) pass an explicit ``qualified`` key;
    otherwise the key is derived from the node, and colliding symbols
    would overwrite each other through the ``ON CONFLICT`` clause.
    """
    key = self._make_qualified(node) if qualified is None else qualified
    timestamp = time.time()
    if node.extra:
        extra_json = json.dumps(node.extra)
    else:
        extra_json = "{}"

    values = (
        node.kind,
        node.name,
        key,
        node.file_path,
        node.line_start,
        node.line_end,
        node.language,
        node.parent_name,
        node.params,
        node.return_type,
        node.modifiers,
        int(node.is_test),
        file_hash,
        extra_json,
        timestamp,
    )
    conn = self._conn
    conn.execute(
        """INSERT INTO nodes
           (kind, name, qualified_name, file_path, line_start, line_end,
            language, parent_name, params, return_type, modifiers, is_test,
            file_hash, extra, updated_at)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
           ON CONFLICT(qualified_name) DO UPDATE SET
             kind=excluded.kind, name=excluded.name,
             file_path=excluded.file_path, line_start=excluded.line_start,
             line_end=excluded.line_end, language=excluded.language,
             parent_name=excluded.parent_name, params=excluded.params,
             return_type=excluded.return_type, modifiers=excluded.modifiers,
             is_test=excluded.is_test, file_hash=excluded.file_hash,
             extra=excluded.extra, updated_at=excluded.updated_at
        """,
        values,
    )
    # Look the id up by key: it is the same whether the row was inserted
    # or updated.
    stored = conn.execute(
        "SELECT id FROM nodes WHERE qualified_name = ?", (key,)
    ).fetchone()
    return stored["id"]
236
+
237
def upsert_edge(self, edge: EdgeInfo) -> int:
    """Write *edge* to the ``edges`` table and return its row id.

    Two edges are the same row when kind, source, target, file and line
    all agree; the line is part of the identity so that several call
    sites between the same pair of symbols are kept apart.  Confidence
    values are taken from ``edge.extra`` (defaults ``1.0`` /
    ``"EXTRACTED"``).
    """
    timestamp = time.time()
    payload = edge.extra or {}
    confidence = float(payload.get("confidence", 1.0))
    tier = str(payload.get("confidence_tier", "EXTRACTED"))
    payload_json = json.dumps(payload)
    conn = self._conn

    match = conn.execute(
        """SELECT id FROM edges
           WHERE kind=? AND source_qualified=? AND target_qualified=?
             AND file_path=? AND line=?""",
        (edge.kind, edge.source, edge.target, edge.file_path, edge.line),
    ).fetchone()

    if not match:
        inserted = conn.execute(
            """INSERT INTO edges
               (kind, source_qualified, target_qualified, file_path, line, extra,
                confidence, confidence_tier, updated_at)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (
                edge.kind, edge.source, edge.target, edge.file_path, edge.line,
                payload_json, confidence, tier, timestamp,
            ),
        )
        return inserted.lastrowid

    edge_id = match["id"]
    conn.execute(
        "UPDATE edges SET line=?, extra=?, confidence=?, confidence_tier=?,"
        " updated_at=? WHERE id=?",
        (edge.line, payload_json, confidence, tier, timestamp, edge_id),
    )
    return edge_id
270
+
271
def remove_file_data(self, file_path: str) -> None:
    """Delete every node and edge row recorded for *file_path*.

    Nodes are deleted before edges; afterwards the cached NetworkX graph
    is invalidated.  No commit is issued here.
    """
    for statement in (
        "DELETE FROM nodes WHERE file_path = ?",
        "DELETE FROM edges WHERE file_path = ?",
    ):
        self._conn.execute(statement, (file_path,))
    self._invalidate_cache()
276
+
277
+ def _begin_immediate(self) -> None:
278
+ """Start an IMMEDIATE transaction, rolling back any prior uncommitted
279
+ transaction first (regression guard for #135 / #489).
280
+ """
281
+ if self._conn.in_transaction:
282
+ logger.warning("Rolling back uncommitted transaction before BEGIN IMMEDIATE")
283
+ self._conn.rollback()
284
+ self._conn.execute("BEGIN IMMEDIATE")
285
+
286
+ def store_file_nodes_edges(
287
+ self, file_path: str, nodes: list[NodeInfo], edges: list[EdgeInfo], fhash: str = ""
288
+ ) -> None:
289
+ """Atomically replace all data for a file."""
290
+ self._begin_immediate()
291
+ try:
292
+ self.remove_file_data(file_path)
293
+ qualified_names = self._qualified_names_for_file(nodes)
294
+ for node, qualified in zip(nodes, qualified_names):
295
+ self.upsert_node(node, file_hash=fhash, qualified=qualified)
296
+ for edge in edges:
297
+ self.upsert_edge(edge)
298
+ self._conn.commit()
299
+ except BaseException:
300
+ self._conn.rollback()
301
+ raise
302
+ self._invalidate_cache()
303
+
304
+ def store_file_batch(
305
+ self, batch: list[tuple[str, list[NodeInfo], list[EdgeInfo], str]]
306
+ ) -> None:
307
+ """Atomically replace data for a batch of files in one transaction."""
308
+ self._begin_immediate()
309
+ try:
310
+ for file_path, nodes, edges, fhash in batch:
311
+ self.remove_file_data(file_path)
312
+ qualified_names = self._qualified_names_for_file(nodes)
313
+ for node, qualified in zip(nodes, qualified_names):
314
+ self.upsert_node(node, file_hash=fhash, qualified=qualified)
315
+ for edge in edges:
316
+ self.upsert_edge(edge)
317
+ self._conn.commit()
318
+ except BaseException:
319
+ self._conn.rollback()
320
+ raise
321
+ self._invalidate_cache()
322
+
323
def set_metadata(self, key: str, value: str) -> None:
    """Store *value* under *key* in the metadata table and commit.

    An existing entry for *key* is replaced.
    """
    conn = self._conn
    conn.execute(
        "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
        (key, value),
    )
    conn.commit()
328
+
329
def get_metadata(self, key: str) -> Optional[str]:
    """Return the metadata value stored under *key*, or ``None`` if absent."""
    found = self._conn.execute(
        "SELECT value FROM metadata WHERE key=?", (key,)
    ).fetchone()
    if found is None:
        return None
    return found["value"]
332
+
333
def commit(self) -> None:
    """Commit the current transaction on the store's connection."""
    self._conn.commit()
335
+
336
def rollback(self) -> None:
    """Discard the changes of the current transaction."""
    self._conn.rollback()
339
+
340
+ # --- Read operations ---
341
+
342
def get_node(self, qualified_name: str) -> Optional[GraphNode]:
    """Return the node stored under *qualified_name*, or ``None`` if absent."""
    found = self._conn.execute(
        "SELECT * FROM nodes WHERE qualified_name = ?", (qualified_name,)
    ).fetchone()
    if not found:
        return None
    return self._row_to_node(found)
347
+
348
def get_nodes_by_file(self, file_path: str) -> list[GraphNode]:
    """Return every node whose ``file_path`` column equals *file_path*."""
    cursor = self._conn.execute(
        "SELECT * FROM nodes WHERE file_path = ?", (file_path,)
    )
    return [self._row_to_node(record) for record in cursor.fetchall()]
353
+
354
def get_all_nodes(self, exclude_files: bool = True) -> list[GraphNode]:
    """Return all stored nodes.

    With ``exclude_files`` set (the default), nodes of kind ``File`` are
    left out.
    """
    sql = "SELECT * FROM nodes"
    if exclude_files:
        sql = "SELECT * FROM nodes WHERE kind != 'File'"
    records = self._conn.execute(sql).fetchall()
    return [self._row_to_node(record) for record in records]
363
+
364
def get_edges_by_source(self, qualified_name: str) -> list[GraphEdge]:
    """Return every edge whose source is *qualified_name*."""
    cursor = self._conn.execute(
        "SELECT * FROM edges WHERE source_qualified = ?", (qualified_name,)
    )
    return [self._row_to_edge(record) for record in cursor.fetchall()]
369
+
370
def get_edges_by_target(self, qualified_name: str) -> list[GraphEdge]:
    """Return every edge whose target is *qualified_name*."""
    cursor = self._conn.execute(
        "SELECT * FROM edges WHERE target_qualified = ?", (qualified_name,)
    )
    return [self._row_to_edge(record) for record in cursor.fetchall()]
375
+
376
def search_edges_by_target_name(self, name: str, kind: str = "CALLS") -> list[GraphEdge]:
    """Return edges of *kind* whose stored target is exactly *name*.

    CALLS edges frequently keep a plain function name as their target
    (``generateTestCode``) instead of a fully qualified one
    (``file.ts::generateTestCode``).  Matching on the plain name lets
    reverse call tracing (callers_of) still find those edges when a lookup
    by qualified name comes back empty.
    """
    cursor = self._conn.execute(
        "SELECT * FROM edges WHERE target_qualified = ? AND kind = ?",
        (name, kind),
    )
    return [self._row_to_edge(record) for record in cursor.fetchall()]
390
+
391
def get_transitive_tests(
    self, qualified_name: str, max_depth: int = 1, max_frontier: int | None = None,
) -> list[dict]:
    """Find tests covering a node, including indirect (transitive) coverage.

    1. Direct: TESTED_BY edges targeting this node (+ bare-name fallback).
       A ``Class`` node is first expanded to the targets of its CONTAINS
       edges, and those are treated as direct inputs as well.
    2. Indirect: follow outgoing CALLS edges up to *max_depth* hops,
       then collect TESTED_BY edges on each callee.

    Args:
        qualified_name: Key of the node whose tests are wanted.
        max_depth: Number of CALLS hops to follow for indirect coverage.
        max_frontier: Cap on the CALLS fan-out kept per hop, to prevent
            O(N*M) query explosion on hub functions in large graphs.
            Defaults to the ``CRG_MAX_TRANSITIVE_FRONTIER`` env var
            (50 if unset).

    Returns:
        One dict per test with ``name``, ``qualified_name``,
        ``file_path``, ``kind`` and ``indirect``.  Each test appears once,
        under the first way it was reached (direct before indirect).

    When the cap applies, the callees that sort first by qualified name
    are kept, so the result is the same on every run instead of depending
    on set iteration order.
    """
    if max_frontier is None:
        max_frontier = int(os.environ.get("CRG_MAX_TRANSITIVE_FRONTIER", "50"))
    conn = self._conn
    seen: set[str] = set()
    results: list[dict] = []

    def _collect_tests(target: str, indirect: bool) -> None:
        """Append every not-yet-seen test linked to *target* via TESTED_BY."""
        for edge_row in conn.execute(
            "SELECT source_qualified FROM edges "
            "WHERE target_qualified = ? AND kind = 'TESTED_BY'",
            (target,),
        ).fetchall():
            src = edge_row["source_qualified"]
            if src in seen:
                continue
            # Marked as seen even when no node row exists, so a dangling
            # test reference is looked up only once.
            seen.add(src)
            node_row = conn.execute(
                "SELECT * FROM nodes WHERE qualified_name = ?", (src,)
            ).fetchone()
            if node_row:
                results.append({
                    "name": node_row["name"],
                    "qualified_name": node_row["qualified_name"],
                    "file_path": node_row["file_path"],
                    "kind": node_row["kind"],
                    "indirect": indirect,
                })

    # If the input is a class, expand to its methods first.
    input_qns = [qualified_name]
    kind_row = conn.execute(
        "SELECT kind FROM nodes WHERE qualified_name = ?",
        (qualified_name,),
    ).fetchone()
    if kind_row and kind_row["kind"] == "Class":
        input_qns.extend(
            member["target_qualified"]
            for member in conn.execute(
                "SELECT target_qualified FROM edges "
                "WHERE source_qualified = ? AND kind = 'CONTAINS'",
                (qualified_name,),
            ).fetchall()
        )

    # Direct TESTED_BY
    for qn in input_qns:
        _collect_tests(qn, indirect=False)

    # Bare-name fallback for direct (rsplit leaves names without "::" as-is)
    _collect_tests(qualified_name.rsplit("::", 1)[-1], indirect=False)

    # Transitive: follow CALLS edges, then collect TESTED_BY on callees
    frontier = set(input_qns)
    for _ in range(max_depth):
        callees: set[str] = set()
        for qn in frontier:
            for call_row in conn.execute(
                "SELECT target_qualified FROM edges "
                "WHERE source_qualified = ? AND kind = 'CALLS'",
                (qn,),
            ).fetchall():
                callees.add(call_row["target_qualified"])
        # Sort before slicing: slicing a set directly would keep an
        # arbitrary, run-dependent subset.
        kept = sorted(callees)[:max_frontier]
        for callee in kept:
            _collect_tests(callee, indirect=True)
        frontier = set(kept)

    return results
496
+
497
def resolve_bare_call_targets(self) -> int:
    """Batch-resolve bare-name CALLS targets using the global node table.

    After parsing, some CALLS edges have bare targets (no ``::`` separator)
    because the parser couldn't resolve cross-file. This method matches
    them against Function/Test/Class nodes and rewrites unambiguous
    matches in-place.

    Disambiguation strategy:
    1. Single node with that name -> resolve directly
    2. Multiple candidates -> resolve only if exactly one of them lives in
       a file imported by the source file (via IMPORTS_FROM edges)

    All rewrites are applied together: if no transaction is open, one is
    started here and rolled back on failure.  After a successful rewrite
    the cached NetworkX graph is invalidated, because edge targets changed.

    Returns the number of resolved edges.
    """
    conn = self._conn

    bare_edges = conn.execute(
        "SELECT id, source_qualified, target_qualified, file_path "
        "FROM edges WHERE kind = 'CALLS' AND target_qualified NOT LIKE '%::%'"
    ).fetchall()
    if not bare_edges:
        return 0

    # bare_name -> list of qualified_names
    node_lookup: dict[str, list[str]] = {}
    for row in conn.execute(
        "SELECT name, qualified_name FROM nodes "
        "WHERE kind IN ('Function', 'Test', 'Class')"
    ).fetchall():
        node_lookup.setdefault(row["name"], []).append(row["qualified_name"])

    # source_file -> set of imported files (for disambiguation)
    import_targets: dict[str, set[str]] = {}
    for row in conn.execute(
        "SELECT DISTINCT file_path, target_qualified FROM edges "
        "WHERE kind = 'IMPORTS_FROM'"
    ).fetchall():
        # split() returns the whole string when there is no "::".
        target_file = row["target_qualified"].split("::", 1)[0]
        import_targets.setdefault(row["file_path"], set()).add(target_file)

    updates: list[tuple[str, int]] = []
    for edge in bare_edges:
        candidates = node_lookup.get(edge["target_qualified"])
        if not candidates:
            continue

        if len(candidates) > 1:
            # Disambiguate via imports
            src_qn = edge["source_qualified"]
            src_file = (
                src_qn.split("::", 1)[0] if "::" in src_qn
                else edge["file_path"]
            )
            imported_files = import_targets.get(src_file, set())
            candidates = [
                c for c in candidates
                if c.split("::", 1)[0] in imported_files
            ]
            if len(candidates) != 1:
                continue

        updates.append((candidates[0], edge["id"]))

    if not updates:
        return 0

    # Group the rewrites into one transaction unless the caller already
    # has one open (in which case it is committed below, as before).
    started_txn = not conn.in_transaction
    if started_txn:
        conn.execute("BEGIN")
    try:
        conn.executemany(
            "UPDATE edges SET target_qualified = ? WHERE id = ?", updates
        )
        conn.commit()
    except BaseException:
        if started_txn:
            conn.rollback()
        raise

    self._invalidate_cache()
    logger.info("Resolved %d bare-name CALLS targets", len(updates))
    return len(updates)
574
+
575
def get_all_files(self) -> list[str]:
    """Return the distinct ``file_path`` values of all ``File`` nodes."""
    cursor = self._conn.execute(
        "SELECT DISTINCT file_path FROM nodes WHERE kind = 'File'"
    )
    return [record["file_path"] for record in cursor.fetchall()]
580
+
581
def search_nodes(self, query: str, limit: int = 20) -> list[GraphNode]:
    """Keyword search across node names.

    Tries FTS5 first (fast, tokenized matching), then falls back to
    LIKE-based substring search when FTS5 returns no results or the FTS
    query fails (for example because the ``nodes_fts`` table is missing).

    Args:
        query: Whitespace-separated words; every word must match.
        limit: Maximum number of nodes to return.

    Returns:
        Matching nodes, or an empty list for a blank query.

    In the fallback, ``%`` and ``_`` in a word are matched literally
    rather than acting as LIKE wildcards, so ``get_user`` does not match
    ``getXuser``.
    """
    words = query.split()
    if not words:
        return []

    # Phase 1: FTS5 search (uses the indexed nodes_fts table).  Each word
    # becomes a quoted phrase so FTS operators in the input stay inert.
    fts_query = " AND ".join(
        '"' + w.replace('"', '""') + '"' for w in words
    )
    try:
        rows = self._conn.execute(
            "SELECT n.* FROM nodes_fts f "
            "JOIN nodes n ON f.rowid = n.id "
            "WHERE nodes_fts MATCH ? LIMIT ?",
            (fts_query, limit),
        ).fetchall()
    except sqlite3.Error:
        # FTS5 table may not exist on older schemas; only database errors
        # are absorbed here, anything else propagates.
        rows = []
    if rows:
        return [self._row_to_node(r) for r in rows]

    # Phase 2: LIKE fallback (substring matching)
    conditions: list[str] = []
    params: list[str | int] = []
    for word in words:
        # Escape the escape character first, then the two LIKE wildcards.
        literal = (
            word.lower()
            .replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        pattern = f"%{literal}%"
        conditions.append(
            "(LOWER(name) LIKE ? ESCAPE '\\' "
            "OR LOWER(qualified_name) LIKE ? ESCAPE '\\')"
        )
        params.extend([pattern, pattern])

    where = " AND ".join(conditions)
    # Only fixed condition text is interpolated; all values are bound.
    sql = f"SELECT * FROM nodes WHERE {where} LIMIT ?"  # nosec B608
    params.append(limit)
    rows = self._conn.execute(sql, params).fetchall()
    return [self._row_to_node(r) for r in rows]
625
+
626
+ # --- Impact / Graph traversal ---
627
+
628
def get_impact_radius(
    self,
    changed_files: list[str],
    max_depth: int = MAX_IMPACT_DEPTH,
    max_nodes: int = MAX_IMPACT_NODES,
) -> dict[str, Any]:
    """Find the nodes impacted by *changed_files* within *max_depth* hops.

    The work is done by ``get_impact_radius_sql()`` unless the
    ``BFS_ENGINE`` constant equals ``"networkx"`` (set through
    ``CRG_BFS_ENGINE=networkx``), in which case the legacy Python-side
    BFS over NetworkX is used.

    Returns dict with:
    - changed_nodes: nodes in changed files
    - impacted_nodes: nodes reachable via edges
    - impacted_files: unique set of affected files
    - edges: connecting edges
    """
    if BFS_ENGINE == "networkx":
        engine = self._get_impact_radius_networkx
    else:
        engine = self.get_impact_radius_sql
    return engine(changed_files, max_depth=max_depth, max_nodes=max_nodes)
653
+
654
+ # -- SQLite recursive CTE version (default) ---------------------------
655
+
656
def get_impact_radius_sql(
    self,
    changed_files: list[str],
    max_depth: int = MAX_IMPACT_DEPTH,
    max_nodes: int = MAX_IMPACT_NODES,
) -> dict[str, Any]:
    """Impact radius via SQLite recursive CTE.

    Faster than NetworkX for large graphs because it avoids
    materialising the full graph in Python.

    Args:
        changed_files: File paths whose nodes seed the traversal.
        max_depth: Maximum number of edge hops (edges are followed in
            both directions).
        max_nodes: Maximum number of impacted nodes to return.

    Returns:
        Dict with ``changed_nodes``, ``impacted_nodes``,
        ``impacted_files``, ``edges``, ``truncated`` and
        ``total_impacted`` (number of impacted nodes found before the
        ``max_nodes`` cut).

    Rows are read nearest-first, so when the budget is exceeded the nodes
    closest to the change are the ones kept, and ``truncated`` is set
    whenever the traversal produced more rows than the budget allows.
    """
    def _empty() -> dict[str, Any]:
        return {
            "changed_nodes": [],
            "impacted_nodes": [],
            "impacted_files": [],
            "edges": [],
            "truncated": False,
            "total_impacted": 0,
        }

    if not changed_files:
        return _empty()

    # Seed qualified names
    seeds: set[str] = {
        n.qualified_name
        for f in changed_files
        for n in self.get_nodes_by_file(f)
    }
    if not seeds:
        return _empty()

    # Build recursive CTE — use a temp table for the seed set to
    # keep the query plan efficient and stay under variable limits.
    self._conn.execute(
        "CREATE TEMP TABLE IF NOT EXISTS _impact_seeds "
        "(qn TEXT PRIMARY KEY)"
    )
    self._conn.execute("DELETE FROM _impact_seeds")
    batch_size = 450
    seed_list = list(seeds)
    for i in range(0, len(seed_list), batch_size):
        batch = seed_list[i:i + batch_size]
        placeholders = ",".join("(?)" for _ in batch)
        self._conn.execute(  # nosec B608
            f"INSERT OR IGNORE INTO _impact_seeds (qn) VALUES {placeholders}",
            batch,
        )

    # ORDER BY makes the LIMIT cut deterministic and nearest-first: every
    # seed has min_depth 0 and therefore sorts ahead of all impacted rows.
    cte_sql = """
        WITH RECURSIVE impacted(node_qn, depth) AS (
            SELECT qn, 0 FROM _impact_seeds
            UNION
            SELECT e.target_qualified, i.depth + 1
            FROM impacted i
            JOIN edges e ON e.source_qualified = i.node_qn
            WHERE i.depth < ?
            UNION
            SELECT e.source_qualified, i.depth + 1
            FROM impacted i
            JOIN edges e ON e.target_qualified = i.node_qn
            WHERE i.depth < ?
        )
        SELECT node_qn, MIN(depth) AS min_depth
        FROM impacted
        GROUP BY node_qn
        ORDER BY min_depth, node_qn
        LIMIT ?
    """
    # One row beyond the budget acts as a sentinel for truncation.
    row_budget = len(seeds) + max_nodes
    rows = self._conn.execute(
        cte_sql, (max_depth, max_depth, row_budget + 1),
    ).fetchall()
    hit_row_limit = len(rows) > row_budget

    # Split into seeds vs impacted, remembering each impacted depth
    depth_of: dict[str, int] = {
        r[0]: r[1] for r in rows if r[0] not in seeds
    }

    # Batch-fetch nodes
    changed_nodes = self._batch_get_nodes(seeds)
    fallback_depth = max_depth + 1
    impacted_nodes = sorted(
        self._batch_get_nodes(set(depth_of)),
        key=lambda n: (
            depth_of.get(n.qualified_name, fallback_depth),
            n.qualified_name,
        ),
    )

    total_impacted = len(impacted_nodes)
    truncated = hit_row_limit or total_impacted > max_nodes
    if total_impacted > max_nodes:
        impacted_nodes = impacted_nodes[:max_nodes]

    impacted_files = list({n.file_path for n in impacted_nodes})

    relevant_edges: list[GraphEdge] = []
    all_qns = seeds | {n.qualified_name for n in impacted_nodes}
    if all_qns:
        relevant_edges = self.get_edges_among(all_qns)

    return {
        "changed_nodes": changed_nodes,
        "impacted_nodes": impacted_nodes,
        "impacted_files": impacted_files,
        "edges": relevant_edges,
        "truncated": truncated,
        "total_impacted": total_impacted,
    }
765
+
766
+ # -- NetworkX BFS version (legacy) ------------------------------------
767
+
768
def _get_impact_radius_networkx(
    self,
    changed_files: list[str],
    max_depth: int = MAX_IMPACT_DEPTH,
    max_nodes: int = MAX_IMPACT_NODES,
) -> dict[str, Any]:
    """Legacy impact radius: level-by-level BFS over the NetworkX graph.

    Selected when CRG_BFS_ENGINE=networkx.  Starting from the nodes of
    *changed_files*, both successors and predecessors are followed for at
    most *max_depth* levels; expansion stops early once the visited set
    plus the next level would exceed *max_nodes*.  Returns the same dict
    layout as ``get_impact_radius_sql``.
    """
    graph = self._build_networkx_graph()

    seeds: set[str] = {
        node.qualified_name
        for path in changed_files
        for node in self.get_nodes_by_file(path)
    }

    visited: set[str] = set()
    impacted: set[str] = set()
    frontier = seeds.copy()
    level = 0

    while frontier and level < max_depth:
        visited.update(frontier)
        upcoming: set[str] = set()
        for qn in frontier:
            if qn not in graph:
                continue
            # Successors first, then predecessors: edges count both ways.
            for adjacent in (*graph.neighbors(qn), *graph.predecessors(qn)):
                if adjacent not in visited:
                    upcoming.add(adjacent)
                    impacted.add(adjacent)
        upcoming -= visited
        if len(visited) + len(upcoming) > max_nodes:
            # The level that overflowed is already recorded in `impacted`.
            break
        frontier = upcoming
        level += 1

    changed_nodes = self._batch_get_nodes(seeds)
    impacted_nodes = self._batch_get_nodes(impacted - seeds)

    total_impacted = len(impacted_nodes)
    truncated = total_impacted > max_nodes
    if truncated:
        impacted_nodes = impacted_nodes[:max_nodes]

    impacted_files = list({node.file_path for node in impacted_nodes})

    all_qns = seeds | {node.qualified_name for node in impacted_nodes}
    relevant_edges: list[GraphEdge] = (
        self.get_edges_among(all_qns) if all_qns else []
    )

    return {
        "changed_nodes": changed_nodes,
        "impacted_nodes": impacted_nodes,
        "impacted_files": impacted_files,
        "edges": relevant_edges,
        "truncated": truncated,
        "total_impacted": total_impacted,
    }
832
+
833
def get_subgraph(self, qualified_names: list[str]) -> dict[str, Any]:
    """Return the nodes named in *qualified_names* plus the edges between them.

    Names that do not resolve to a node are skipped.  An edge is included
    when its source is one of the requested names and its target is also
    in the requested set.
    """
    looked_up = [self.get_node(qn) for qn in qualified_names]
    nodes = [found for found in looked_up if found]

    wanted = set(qualified_names)
    edges = [
        edge
        for qn in qualified_names
        for edge in self.get_edges_by_source(qn)
        if edge.target_qualified in wanted
    ]

    return {"nodes": nodes, "edges": edges}
849
+
850
def get_stats(self) -> GraphStats:
    """Collect aggregate counts describing the stored graph.

    Gathers total node/edge counts, per-kind histograms for nodes and
    edges, the distinct non-empty languages, the number of ``File``
    nodes, and the ``last_updated`` metadata value.
    """
    conn = self._conn

    node_total = conn.execute("SELECT COUNT(*) FROM nodes").fetchone()[0]
    edge_total = conn.execute("SELECT COUNT(*) FROM edges").fetchone()[0]

    node_kinds: dict[str, int] = {
        rec["kind"]: rec["cnt"]
        for rec in conn.execute("SELECT kind, COUNT(*) as cnt FROM nodes GROUP BY kind")
    }
    edge_kinds: dict[str, int] = {
        rec["kind"]: rec["cnt"]
        for rec in conn.execute("SELECT kind, COUNT(*) as cnt FROM edges GROUP BY kind")
    }

    language_rows = conn.execute(
        "SELECT DISTINCT language FROM nodes WHERE language IS NOT NULL AND language != ''"
    )
    languages = [rec["language"] for rec in language_rows]

    file_total = conn.execute(
        "SELECT COUNT(*) FROM nodes WHERE kind = 'File'"
    ).fetchone()[0]

    return GraphStats(
        total_nodes=node_total,
        total_edges=edge_total,
        nodes_by_kind=node_kinds,
        edges_by_kind=edge_kinds,
        languages=languages,
        files_count=file_total,
        last_updated=self.get_metadata("last_updated"),
    )
884
+
885
def find_disambiguated_nodes(self) -> list[str]:
    """List qualified names that carry a collision-resolving suffix.

    When several same-named symbols live in one file, the first keeps its
    bare key and later ones get a ``:L<line>`` suffix (see
    ``_qualified_names_for_file``).  Reporting them makes those otherwise
    invisible collisions visible.

    The SQL ``GLOB`` is a coarse pre-filter; ``_DISAMBIGUATED_RE`` makes
    the final decision.  The result is sorted.
    """
    cursor = self._conn.execute(
        "SELECT qualified_name FROM nodes WHERE qualified_name GLOB '*:L[0-9]*'"
    )
    candidates = [rec["qualified_name"] for rec in cursor.fetchall()]
    return sorted(qn for qn in candidates if _DISAMBIGUATED_RE.search(qn))
899
+
900
def get_nodes_by_size(
    self,
    min_lines: int = 50,
    max_lines: int | None = None,
    kind: str | None = None,
    file_path_pattern: str | None = None,
    limit: int = 50,
) -> list[GraphNode]:
    """Find nodes whose line span falls in a range, largest first.

    The span is ``line_end - line_start + 1``; nodes with a NULL start or
    end line are never returned.

    Args:
        min_lines: Minimum line count (inclusive).
        max_lines: Maximum line count (inclusive). None = no upper bound.
        kind: Restrict to one node kind (Function, Class, File, etc.).
        file_path_pattern: Text matched against ``file_path`` with SQL
            LIKE; it is wrapped in ``%`` on both sides before matching.
        limit: Maximum number of rows to return.

    Returns:
        List of GraphNode objects, ordered by line count descending.
    """
    clauses = [
        "line_start IS NOT NULL",
        "line_end IS NOT NULL",
        "(line_end - line_start + 1) >= ?",
    ]
    bindings: list = [min_lines]

    # (enabled, SQL clause, bound value) for each optional filter.
    optional_filters = (
        (max_lines is not None, "(line_end - line_start + 1) <= ?", max_lines),
        (bool(kind), "kind = ?", kind),
        (bool(file_path_pattern), "file_path LIKE ?", f"%{file_path_pattern}%"),
    )
    for enabled, clause, value in optional_filters:
        if enabled:
            clauses.append(clause)
            bindings.append(value)

    bindings.append(limit)
    sql = (
        "SELECT * FROM nodes WHERE " + " AND ".join(clauses) + " "  # nosec B608
        "ORDER BY (line_end - line_start + 1) DESC LIMIT ?"
    )
    return [
        self._row_to_node(rec)
        for rec in self._conn.execute(sql, bindings).fetchall()
    ]
945
+
946
+ # --- Public query helpers (used by flows, changes, communities, etc.) ---
947
+
948
def get_node_by_id(self, node_id: int) -> Optional[GraphNode]:
    """Look up one node by integer primary key; ``None`` when no row matches."""
    cursor = self._conn.execute(
        "SELECT * FROM nodes WHERE id = ?", (node_id,)
    )
    match = cursor.fetchone()
    if not match:
        return None
    return self._row_to_node(match)
954
+
955
def get_nodes_by_kind(
    self,
    kinds: list[str],
    file_pattern: str | None = None,
) -> list[GraphNode]:
    """Fetch nodes whose kind is one of *kinds*, optionally limited by file.

    Args:
        kinds: Node kind strings to accept (e.g. ``["Function", "Test"]``).
            An empty list returns an empty result without querying.
        file_pattern: When given, keep only nodes whose ``file_path``
            contains it (SQL LIKE ``%pattern%``).
    """
    if not kinds:
        return []

    bindings: list[str] = list(kinds)
    marks = ",".join(["?"] * len(bindings))
    where = f"kind IN ({marks})"
    if file_pattern:
        where = " AND ".join([where, "file_path LIKE ?"])
        bindings.append(f"%{file_pattern}%")

    cursor = self._conn.execute(  # nosec B608
        f"SELECT * FROM nodes WHERE {where}", bindings,
    )
    return [self._row_to_node(rec) for rec in cursor.fetchall()]
981
+
982
def count_flow_memberships(self, node_id: int) -> int:
    """Count the ``flow_memberships`` rows that reference *node_id*."""
    sql = (
        "SELECT COUNT(*) as cnt FROM flow_memberships "
        "WHERE node_id = ?"
    )
    hit = self._conn.execute(sql, (node_id,)).fetchone()
    if not hit:
        return 0
    return hit["cnt"]
990
+
991
def get_flow_criticalities_for_node(self, node_id: int) -> list[float]:
    """List the ``criticality`` of every flow that *node_id* is a member of."""
    sql = (
        "SELECT f.criticality FROM flows f "
        "JOIN flow_memberships fm ON fm.flow_id = f.id "
        "WHERE fm.node_id = ?"
    )
    cursor = self._conn.execute(sql, (node_id,))
    scores = []
    for rec in cursor.fetchall():
        scores.append(rec["criticality"])
    return scores
1000
+
1001
def get_node_community_id(self, node_id: int) -> int | None:
    """Return the node's ``community_id``.

    ``None`` is returned both when no node has that id and when the
    stored ``community_id`` is NULL.
    """
    found = self._conn.execute(
        "SELECT community_id FROM nodes WHERE id = ?",
        (node_id,),
    ).fetchone()
    if not found:
        return None
    return found["community_id"]
1010
+
1011
def get_community_ids_by_qualified_names(
    self, qns: list[str],
) -> dict[str, int | None]:
    """Look up ``community_id`` for many qualified names at once.

    The names are queried in slices of 450 so each ``IN (...)`` clause
    stays small.  Names with no matching node are absent from the result;
    a present key may map to ``None`` when no community is assigned.
    """
    mapping: dict[str, int | None] = {}
    chunk = 450
    for start in range(0, len(qns), chunk):
        window = qns[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            "SELECT qualified_name, community_id FROM nodes "
            f"WHERE qualified_name IN ({marks})",
            window,
        )
        mapping.update(
            (rec["qualified_name"], rec["community_id"])
            for rec in cursor.fetchall()
        )
    return mapping
1032
+
1033
def get_files_matching(self, pattern: str) -> list[str]:
    """List distinct ``file_path`` values that end with *pattern*.

    The match is a SQL ``LIKE`` with a leading ``%`` wildcard, so any LIKE
    wildcard characters inside *pattern* keep their wildcard meaning.
    """
    suffix_like = f"%{pattern}"
    cursor = self._conn.execute(
        "SELECT DISTINCT file_path FROM nodes "
        "WHERE file_path LIKE ?",
        (suffix_like,),
    )
    paths = []
    for rec in cursor.fetchall():
        paths.append(rec["file_path"])
    return paths
1041
+
1042
def get_nodes_without_signature(self) -> list[sqlite3.Row]:
    """Fetch raw rows for every node whose ``signature`` is still NULL.

    Each row carries ``id``, ``name``, ``kind``, ``params`` and
    ``return_type``.
    """
    sql = (
        "SELECT id, name, kind, params, return_type "
        "FROM nodes WHERE signature IS NULL"
    )
    cursor = self._conn.execute(sql)
    return cursor.fetchall()
1048
+
1049
def update_node_signature(
    self, node_id: int, signature: str,
) -> None:
    """Write *signature* into the ``signature`` column of node *node_id*.

    Only the UPDATE is issued here; no commit is performed by this method.
    """
    bindings = (signature, node_id)
    self._conn.execute(
        "UPDATE nodes SET signature = ? WHERE id = ?",
        bindings,
    )
1057
+
1058
def get_all_community_ids(self) -> dict[str, int | None]:
    """Map every node's qualified name to its ``community_id``.

    Used primarily by the visualization exporter.  If the query raises
    ``sqlite3.OperationalError`` the failure is logged at debug level and
    an empty dict is returned.
    """
    try:
        cursor = self._conn.execute(
            "SELECT qualified_name, community_id FROM nodes"
        )
        mapping: dict[str, int | None] = {}
        for rec in cursor.fetchall():
            mapping[rec["qualified_name"]] = rec["community_id"]
        return mapping
    except sqlite3.OperationalError as exc:
        # community_id column may not exist yet on pre-v6 schemas
        logger.debug("Community IDs unavailable (schema not yet migrated): %s", exc)
        return {}
1075
+
1076
def get_node_ids_by_files(
    self, file_paths: list[str],
) -> set[int]:
    """Collect the ids of all nodes whose ``file_path`` is in *file_paths*.

    Paths are queried in slices of 450 per ``IN (...)`` clause.
    """
    if not file_paths:
        return set()

    node_ids: set[int] = set()
    chunk = 450
    for start in range(0, len(file_paths), chunk):
        window = file_paths[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            "SELECT id FROM nodes "
            f"WHERE file_path IN ({marks})",
            window,
        )
        for rec in cursor.fetchall():
            node_ids.add(rec["id"])
    return node_ids
1094
+
1095
def get_flow_ids_by_node_ids(
    self, node_ids: set[int],
) -> list[int]:
    """List the distinct flow ids that contain at least one of *node_ids*.

    Node ids are queried in slices of 450.  ``DISTINCT`` only dedupes
    within one slice, so first-seen order is tracked across slices to
    drop repeats.
    """
    if not node_ids:
        return []

    pending = list(node_ids)
    first_seen: dict[int, None] = {}
    chunk = 450
    for start in range(0, len(pending), chunk):
        window = pending[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            "SELECT DISTINCT flow_id FROM flow_memberships "
            f"WHERE node_id IN ({marks})",
            window,
        )
        for rec in cursor.fetchall():
            first_seen.setdefault(rec["flow_id"], None)
    return list(first_seen)
1115
+
1116
def get_flow_qualified_names(self, flow_id: int) -> set[str]:
    """Collect the qualified names of all nodes that are members of *flow_id*."""
    sql = (
        "SELECT n.qualified_name FROM flow_memberships fm "
        "JOIN nodes n ON fm.node_id = n.id WHERE fm.flow_id = ?"
    )
    members: set[str] = set()
    for rec in self._conn.execute(sql, (flow_id,)).fetchall():
        members.add(rec["qualified_name"])
    return members
1124
+
1125
def get_node_kind_by_id(self, node_id: int) -> str | None:
    """Fetch only the ``kind`` of node *node_id*; ``None`` if there is no such node."""
    found = self._conn.execute(
        "SELECT kind FROM nodes WHERE id = ?", (node_id,),
    ).fetchone()
    if not found:
        return None
    return found["kind"]
1131
+
1132
def get_all_call_targets(self, include_file_sources: bool = True) -> set[str]:
    """Collect the target qualified names of all CALLS edges.

    With ``include_file_sources=False``, CALLS edges whose source is a
    File node (module-scope calls such as top-level script glue, CLI
    entrypoints, or notebook cells) are left out.  Callers that read "has
    an incoming call" as "is not a root" (e.g. entry-point detection)
    should pass ``False``; otherwise a callee reached only from script
    scope looks called and is hidden from flow analysis.

    File sources are identified by joining on ``nodes.kind`` instead of
    pattern-matching ``source_qualified``, so file paths containing ``::``
    or a future change to File-node naming cannot silently misclassify
    edges.  Edges whose source has no node row are kept.
    """
    if include_file_sources:
        sql = (
            "SELECT DISTINCT target_qualified FROM edges "
            "WHERE kind = 'CALLS'"
        )
    else:
        sql = (
            "SELECT DISTINCT e.target_qualified FROM edges e "
            "LEFT JOIN nodes n ON n.qualified_name = e.source_qualified "
            "WHERE e.kind = 'CALLS' "
            "AND (n.kind IS NULL OR n.kind != 'File')"
        )
    cursor = self._conn.execute(sql)
    return {rec["target_qualified"] for rec in cursor.fetchall()}
1160
+
1161
def get_communities_list(
    self,
) -> list[sqlite3.Row]:
    """Fetch the raw ``id`` / ``name`` rows of the ``communities`` table.

    If the query raises ``sqlite3.OperationalError`` the failure is logged
    at debug level and an empty list is returned.
    """
    try:
        cursor = self._conn.execute(
            "SELECT id, name FROM communities"
        )
        return cursor.fetchall()
    except sqlite3.OperationalError as exc:
        # communities table doesn't exist yet on pre-v4 schemas
        logger.debug("Communities list unavailable (table missing): %s", exc)
        return []
1173
+
1174
def get_community_member_qns(
    self, community_id: int,
) -> list[str]:
    """List the qualified names of nodes whose ``community_id`` equals *community_id*."""
    sql = (
        "SELECT qualified_name FROM nodes "
        "WHERE community_id = ?"
    )
    cursor = self._conn.execute(sql, (community_id,))
    members = []
    for rec in cursor.fetchall():
        members.append(rec["qualified_name"])
    return members
1184
+
1185
def get_nodes_by_community_id(
    self, community_id: int,
) -> list[GraphNode]:
    """Fetch full node objects for every node assigned to *community_id*."""
    cursor = self._conn.execute(
        "SELECT * FROM nodes WHERE community_id = ?",
        (community_id,),
    )
    members = []
    for rec in cursor.fetchall():
        members.append(self._row_to_node(rec))
    return members
1194
+
1195
def get_outgoing_targets(
    self, source_qns: list[str],
) -> list[str]:
    """List ``target_qualified`` of every edge whose source is in *source_qns*.

    Sources are queried in slices of 450.  One entry is produced per
    matching edge, so the result may contain repeated targets.
    """
    targets: list[str] = []
    chunk = 450
    for start in range(0, len(source_qns), chunk):
        window = source_qns[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            "SELECT target_qualified FROM edges "
            f"WHERE source_qualified IN ({marks})",
            window,
        )
        targets += [rec["target_qualified"] for rec in cursor.fetchall()]
    return targets
1211
+
1212
def get_incoming_sources(
    self, target_qns: list[str],
) -> list[str]:
    """List ``source_qualified`` of every edge whose target is in *target_qns*.

    Targets are queried in slices of 450.  One entry is produced per
    matching edge, so the result may contain repeated sources.
    """
    sources: list[str] = []
    chunk = 450
    for start in range(0, len(target_qns), chunk):
        window = target_qns[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            "SELECT source_qualified FROM edges "
            f"WHERE target_qualified IN ({marks})",
            window,
        )
        sources += [rec["source_qualified"] for rec in cursor.fetchall()]
    return sources
1228
+
1229
+ # --- Public edge access (for visualization etc.) ---
1230
+
1231
def get_all_edges(self) -> list[GraphEdge]:
    """Load every row of the ``edges`` table as an edge object."""
    cursor = self._conn.execute("SELECT * FROM edges")
    edges = []
    for rec in cursor.fetchall():
        edges.append(self._row_to_edge(rec))
    return edges
1235
+
1236
def get_edges_among(self, qualified_names: set[str]) -> list[GraphEdge]:
    """Fetch edges whose source and target both belong to *qualified_names*.

    Only the source side is pushed into SQL, in ``IN (...)`` slices sized
    to stay under SQLite's default SQLITE_MAX_VARIABLE_NUMBER limit; the
    target side is checked in Python afterwards.
    """
    if not qualified_names:
        return []

    sources = list(qualified_names)
    kept: list[GraphEdge] = []
    chunk = 450  # Stay well under SQLite's default 999 limit
    for start in range(0, len(sources), chunk):
        window = sources[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            f"SELECT * FROM edges WHERE source_qualified IN ({marks})",
            window,
        )
        candidates = [self._row_to_edge(rec) for rec in cursor.fetchall()]
        kept += [
            edge for edge in candidates
            if edge.target_qualified in qualified_names
        ]
    return kept
1259
+
1260
def _batch_get_nodes(self, qualified_names: set[str]) -> list[GraphNode]:
    """Fetch node objects for a set of qualified names.

    The names are queried in slices of 450 to stay under SQLite variable
    limits.  Names with no matching row yield nothing.
    """
    if not qualified_names:
        return []

    pending = list(qualified_names)
    fetched: list[GraphNode] = []
    chunk = 450
    for start in range(0, len(pending), chunk):
        window = pending[start:start + chunk]
        marks = ",".join(["?"] * len(window))
        cursor = self._conn.execute(  # nosec B608
            f"SELECT * FROM nodes WHERE qualified_name IN ({marks})",
            window,
        )
        fetched += [self._row_to_node(rec) for rec in cursor.fetchall()]
    return fetched
1276
+
1277
def load_flow_adjacency(self) -> "FlowAdjacency":
    """Read all nodes and CALLS/TESTED_BY edges into an in-memory structure.

    Two streaming queries cover the whole ``nodes`` table and the
    CALLS/TESTED_BY rows of ``edges``.  The resulting adjacency structure
    is meant for flow tracing and criticality scoring.  At ~500k nodes /
    3M edges it fits in a few hundred MB and replaces the tens of millions
    of single-row SQLite point queries that otherwise dominate
    ``trace_flows`` / ``compute_criticality`` runtime.
    """
    by_qn: dict[str, GraphNode] = {}
    by_id: dict[int, GraphNode] = {}
    for rec in self._conn.execute("SELECT * FROM nodes"):
        loaded = self._row_to_node(rec)
        by_qn[loaded.qualified_name] = loaded
        by_id[loaded.id] = loaded

    outgoing_calls: dict[str, list[str]] = {}
    tested_targets: set[str] = set()
    edge_cursor = self._conn.execute(
        "SELECT kind, source_qualified, target_qualified FROM edges "
        "WHERE kind IN ('CALLS', 'TESTED_BY')"
    )
    for rec in edge_cursor:
        source = rec["source_qualified"]
        target = rec["target_qualified"]
        if rec["kind"] != "CALLS":
            # TESTED_BY: record the edge's target
            tested_targets.add(target)
            continue
        callees = outgoing_calls.get(source)
        if callees is None:
            outgoing_calls[source] = [target]
        else:
            callees.append(target)

    return FlowAdjacency(
        calls_out=outgoing_calls,
        has_tested_by=tested_targets,
        nodes_by_qn=by_qn,
        nodes_by_id=by_id,
    )
1312
+
1313
+ # --- Internal helpers ---
1314
+
1315
def _build_networkx_graph(self) -> nx.DiGraph:
    """Return the NetworkX directed graph of all edges, building it on first use.

    The graph is stored in ``self._nxg_cache``; both the cache check and
    the build happen while holding ``self._cache_lock``.  Each edge keeps
    its ``kind`` as an edge attribute.
    """
    with self._cache_lock:
        graph = self._nxg_cache
        if graph is None:
            graph = nx.DiGraph()
            for rec in self._conn.execute("SELECT * FROM edges").fetchall():
                graph.add_edge(
                    rec["source_qualified"],
                    rec["target_qualified"],
                    kind=rec["kind"],
                )
            # Publish to the cache only once the graph is complete.
            self._nxg_cache = graph
        return graph
1326
+
1327
def _make_qualified(self, node: NodeInfo) -> str:
    """Build the qualified-name key for *node*.

    File nodes use their path as-is.  Other nodes use
    ``<file_path>::<name>``, with ``<parent_name>.`` inserted before the
    name when a parent is set.
    """
    path = node.file_path
    if node.kind == "File":
        return path
    owner_prefix = f"{node.parent_name}." if node.parent_name else ""
    return f"{path}::{owner_prefix}{node.name}"
1333
+
1334
def _qualified_names_for_file(self, nodes: list[NodeInfo]) -> list[str]:
    """Assign a distinct qualified name to each node of one file.

    The first node producing a given key keeps the bare key, so existing
    edges (which reference the same parser-computed key) still resolve to
    it.  A later node with the same key gets ``:L<line_start>`` appended;
    if that is taken too, ``#<index>`` (its position in *nodes*) is added
    as well.  The result is parallel to *nodes*.
    """
    assigned: list[str] = []
    taken: set[str] = set()
    for position, node in enumerate(nodes):
        base = self._make_qualified(node)
        chosen = base
        if base in taken:
            chosen = f"{base}:L{node.line_start}"
            if chosen in taken:
                chosen = f"{base}:L{node.line_start}#{position}"
        taken.add(chosen)
        assigned.append(chosen)
    return assigned
1356
+
1357
def _row_to_node(self, row: sqlite3.Row) -> GraphNode:
    """Convert one ``nodes`` table row into a ``GraphNode``.

    A falsy ``language`` becomes ``""``, ``is_test`` is coerced to bool,
    and ``extra`` is decoded from JSON (an empty/NULL value becomes ``{}``).
    """
    raw_extra = row["extra"]
    # Columns that are copied through unchanged.
    passthrough = {
        column: row[column]
        for column in (
            "id", "kind", "name", "qualified_name", "file_path",
            "line_start", "line_end", "parent_name", "params",
            "return_type", "file_hash",
        )
    }
    return GraphNode(
        language=row["language"] or "",
        is_test=bool(row["is_test"]),
        extra=json.loads(raw_extra) if raw_extra else {},
        **passthrough,
    )
1374
+
1375
def _row_to_edge(self, row: sqlite3.Row) -> GraphEdge:
    """Convert one ``edges`` table row into a ``GraphEdge``.

    ``extra`` is decoded from JSON (an empty/NULL value becomes ``{}``).
    When the row has no ``confidence`` / ``confidence_tier`` column, the
    defaults ``1.0`` and ``"EXTRACTED"`` are used.
    """
    available = row.keys()
    raw_extra = row["extra"]
    return GraphEdge(
        id=row["id"],
        kind=row["kind"],
        source_qualified=row["source_qualified"],
        target_qualified=row["target_qualified"],
        file_path=row["file_path"],
        line=row["line"],
        extra=json.loads(raw_extra) if raw_extra else {},
        confidence=row["confidence"] if "confidence" in available else 1.0,
        confidence_tier=(
            row["confidence_tier"] if "confidence_tier" in available else "EXTRACTED"
        ),
    )
1390
+
1391
+
1392
def _sanitize_name(s: str, max_len: int = 256) -> str:
    """Strip ASCII control characters and truncate to prevent prompt injection.

    Node names extracted from source code could contain adversarial strings
    (e.g. ``IGNORE_ALL_PREVIOUS_INSTRUCTIONS``). This function removes the
    ASCII control characters (0x00-0x1F except tab and newline, plus DEL,
    0x7F) and enforces a length limit so that names flowing through MCP tool
    responses cannot easily influence AI agent behaviour.

    Args:
        s: The raw name.
        max_len: Maximum number of characters kept, counted after stripping.

    Returns:
        The cleaned string, at most *max_len* characters long.
    """
    # Keep \t (0x09) and \n (0x0A); drop every other C0 control and DEL.
    # DEL is an ASCII control character too, but it sits above 0x20, so a
    # plain ``ord(ch) >= 0x20`` test would let it through.
    cleaned = "".join(
        ch for ch in s
        if ch in ("\t", "\n") or (ord(ch) >= 0x20 and ch != "\x7f")
    )
    return cleaned[:max_len]
1407
+
1408
+
1409
def node_to_dict(n: GraphNode) -> dict:
    """Serialize a node to a plain dict.

    ``name``, ``qualified_name`` and a non-empty ``parent_name`` are passed
    through ``_sanitize_name``; the other fields are copied unchanged.
    """
    parent = n.parent_name
    if parent:
        parent = _sanitize_name(parent)
    return {
        "id": n.id,
        "kind": n.kind,
        "name": _sanitize_name(n.name),
        "qualified_name": _sanitize_name(n.qualified_name),
        "file_path": n.file_path,
        "line_start": n.line_start,
        "line_end": n.line_end,
        "language": n.language,
        "parent_name": parent,
        "is_test": n.is_test,
    }
1418
+
1419
+
1420
def edge_to_dict(e: GraphEdge) -> dict:
    """Serialize an edge to a plain dict.

    The source and target qualified names are passed through
    ``_sanitize_name`` and exposed as ``source`` / ``target``; the other
    fields are copied unchanged.
    """
    source = _sanitize_name(e.source_qualified)
    target = _sanitize_name(e.target_qualified)
    return {
        "id": e.id,
        "kind": e.kind,
        "source": source,
        "target": target,
        "file_path": e.file_path,
        "line": e.line,
        "confidence": e.confidence,
        "confidence_tier": e.confidence_tier,
    }