codebase-intel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. codebase_intel/__init__.py +3 -0
  2. codebase_intel/analytics/__init__.py +1 -0
  3. codebase_intel/analytics/benchmark.py +406 -0
  4. codebase_intel/analytics/feedback.py +496 -0
  5. codebase_intel/analytics/tracker.py +439 -0
  6. codebase_intel/cli/__init__.py +1 -0
  7. codebase_intel/cli/main.py +740 -0
  8. codebase_intel/contracts/__init__.py +1 -0
  9. codebase_intel/contracts/auto_generator.py +438 -0
  10. codebase_intel/contracts/evaluator.py +531 -0
  11. codebase_intel/contracts/models.py +433 -0
  12. codebase_intel/contracts/registry.py +225 -0
  13. codebase_intel/core/__init__.py +1 -0
  14. codebase_intel/core/config.py +248 -0
  15. codebase_intel/core/exceptions.py +454 -0
  16. codebase_intel/core/types.py +375 -0
  17. codebase_intel/decisions/__init__.py +1 -0
  18. codebase_intel/decisions/miner.py +297 -0
  19. codebase_intel/decisions/models.py +302 -0
  20. codebase_intel/decisions/store.py +411 -0
  21. codebase_intel/drift/__init__.py +1 -0
  22. codebase_intel/drift/detector.py +443 -0
  23. codebase_intel/graph/__init__.py +1 -0
  24. codebase_intel/graph/builder.py +391 -0
  25. codebase_intel/graph/parser.py +1232 -0
  26. codebase_intel/graph/query.py +377 -0
  27. codebase_intel/graph/storage.py +736 -0
  28. codebase_intel/mcp/__init__.py +1 -0
  29. codebase_intel/mcp/server.py +710 -0
  30. codebase_intel/orchestrator/__init__.py +1 -0
  31. codebase_intel/orchestrator/assembler.py +649 -0
  32. codebase_intel-0.1.0.dist-info/METADATA +361 -0
  33. codebase_intel-0.1.0.dist-info/RECORD +36 -0
  34. codebase_intel-0.1.0.dist-info/WHEEL +4 -0
  35. codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
  36. codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,736 @@
1
+ """SQLite-backed graph storage with WAL mode for concurrent access.
2
+
3
+ Design decisions:
4
+ - SQLite over a graph DB (Neo4j, etc.) to maintain zero-dependency portability
5
+ - WAL mode enables concurrent readers (MCP queries) + single writer (git hook updates)
6
+ - Schema versioning for forward migration without data loss
7
+ - Batch operations for initial build (10k+ nodes), single ops for incremental
8
+
9
+ Edge cases handled:
10
+ - Concurrent writes: WAL mode + busy timeout + application-level retry
11
+ - Corrupt database: detect via integrity check, offer re-initialization
12
+ - Schema migration: version table tracks schema, auto-migrate on open
13
+ - Large codebases: batch inserts with transaction chunking (100k+ nodes)
14
+ - Partial writes: crash during build → incomplete graph → detect via marker table
15
+ - Disk full: catch and surface meaningful error
16
+ - Path encoding: SQLite stores paths as TEXT, we normalize to POSIX format
17
+ """
18
+
19
from __future__ import annotations

import json
import logging
from collections import deque
from contextlib import asynccontextmanager
from pathlib import Path, PurePosixPath
from typing import TYPE_CHECKING, AsyncIterator

import aiosqlite

from codebase_intel.core.exceptions import (
    ErrorContext,
    StorageConcurrencyError,
    StorageCorruptError,
    StorageMigrationError,
)
from codebase_intel.core.types import (
    EdgeKind,
    GraphEdge,
    GraphNode,
    Language,
    LineRange,
    NodeKind,
)
42
+
43
if TYPE_CHECKING:
    # Imported only for annotations to avoid a runtime import cycle with config.
    from codebase_intel.core.config import GraphConfig

logger = logging.getLogger(__name__)

# Version of the DDL below; persisted in the schema_version table so that
# _ensure_schema can detect stale (migrate) or too-new (refuse) databases.
SCHEMA_VERSION = 1

# The DDL is idempotent (CREATE ... IF NOT EXISTS throughout), so re-applying
# it during migration or on every open is safe.
SCHEMA_SQL = """
-- Schema version tracking
CREATE TABLE IF NOT EXISTS schema_version (
    version INTEGER NOT NULL,
    migrated_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Build status tracking (detect incomplete builds)
CREATE TABLE IF NOT EXISTS build_status (
    build_id TEXT PRIMARY KEY,
    started_at TEXT NOT NULL,
    completed_at TEXT,
    file_count INTEGER DEFAULT 0,
    node_count INTEGER DEFAULT 0,
    edge_count INTEGER DEFAULT 0
);

-- Graph nodes
CREATE TABLE IF NOT EXISTS nodes (
    node_id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    name TEXT NOT NULL,
    qualified_name TEXT NOT NULL,
    file_path TEXT NOT NULL, -- POSIX-normalized, relative to project root
    line_start INTEGER,
    line_end INTEGER,
    language TEXT NOT NULL DEFAULT 'unknown',
    content_hash TEXT,
    docstring TEXT,
    is_generated INTEGER NOT NULL DEFAULT 0,
    is_external INTEGER NOT NULL DEFAULT 0,
    is_test INTEGER NOT NULL DEFAULT 0,
    is_entry_point INTEGER NOT NULL DEFAULT 0,
    metadata_json TEXT DEFAULT '{}',
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Graph edges
CREATE TABLE IF NOT EXISTS edges (
    source_id TEXT NOT NULL REFERENCES nodes(node_id) ON DELETE CASCADE,
    target_id TEXT NOT NULL REFERENCES nodes(node_id) ON DELETE CASCADE,
    kind TEXT NOT NULL,
    confidence REAL NOT NULL DEFAULT 1.0,
    is_type_only INTEGER NOT NULL DEFAULT 0,
    metadata_json TEXT DEFAULT '{}',
    PRIMARY KEY (source_id, target_id, kind)
);

-- File fingerprints (for incremental updates — only re-parse changed files)
CREATE TABLE IF NOT EXISTS file_fingerprints (
    file_path TEXT PRIMARY KEY,
    content_hash TEXT NOT NULL,
    size_bytes INTEGER NOT NULL,
    last_modified TEXT NOT NULL,
    language TEXT NOT NULL DEFAULT 'unknown',
    node_count INTEGER NOT NULL DEFAULT 0
);

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_nodes_file ON nodes(file_path);
CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);
CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
CREATE INDEX IF NOT EXISTS idx_nodes_qualified ON nodes(qualified_name);
CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source_id);
CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target_id);
CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind);
"""
117
+
118
+
119
class GraphStorage:
    """Async SQLite storage for the semantic code graph.

    Holds an open aiosqlite connection plus the project root used to
    normalize file paths to POSIX-relative form for storage.

    Usage:
        async with GraphStorage.open(config, project_root) as storage:
            await storage.upsert_node(node)
            nodes = await storage.get_dependents("node_id")
    """

    def __init__(self, db: aiosqlite.Connection, project_root: Path) -> None:
        # Prefer GraphStorage.open(); it applies PRAGMAs, the integrity
        # check, and schema setup before handing out an instance.
        self._db = db
        self._project_root = project_root
131
+
132
    @classmethod
    @asynccontextmanager
    async def open(cls, config: GraphConfig, project_root: Path) -> AsyncIterator[GraphStorage]:
        """Open (or create) the graph database with proper configuration.

        Args:
            config: graph settings; supplies ``db_path`` and ``enable_wal_mode``.
            project_root: root used to normalize stored file paths.

        Yields:
            A ready-to-use GraphStorage; the connection is closed on exit.

        Raises:
            StorageCorruptError: if PRAGMA integrity_check does not report "ok".
            StorageConcurrencyError: if the database is locked by another
                process (after the busy timeout expires).

        Edge cases:
        - DB file doesn't exist: create with full schema
        - DB file exists but wrong version: migrate or error
        - DB file is corrupt: integrity check fails, raise StorageCorruptError
        - DB locked by another process: retry with busy_timeout
        """
        db_path = config.db_path
        # First run: the state directory may not exist yet.
        db_path.parent.mkdir(parents=True, exist_ok=True)

        db = await aiosqlite.connect(str(db_path))
        try:
            # Enable WAL mode for concurrent read/write
            if config.enable_wal_mode:
                await db.execute("PRAGMA journal_mode=WAL")

            # Busy timeout: wait up to 5s for locks instead of failing immediately
            # Edge case: git hook and MCP server both active
            await db.execute("PRAGMA busy_timeout=5000")

            # Foreign keys for cascade deletes (removing a node removes its edges)
            await db.execute("PRAGMA foreign_keys=ON")

            # Integrity check on first open
            result = await db.execute("PRAGMA integrity_check")
            check = await result.fetchone()
            if check and check[0] != "ok":
                raise StorageCorruptError(
                    f"Database integrity check failed: {check[0]}",
                    ErrorContext(file_path=db_path, operation="integrity_check"),
                )

            storage = cls(db, project_root)
            await storage._ensure_schema()
            # Yield inside the try: lock errors raised by the caller's own
            # queries are translated by the except clause below as well.
            yield storage

        except aiosqlite.OperationalError as exc:
            # Translate SQLite's generic lock error into a domain error.
            if "database is locked" in str(exc):
                raise StorageConcurrencyError(
                    "Database is locked by another process",
                    ErrorContext(file_path=db_path, operation="open"),
                ) from exc
            raise
        finally:
            # Close whether setup failed or the caller's block exited.
            await db.close()
181
+
182
+ async def _ensure_schema(self) -> None:
183
+ """Create or migrate the schema.
184
+
185
+ Edge case: schema_version table doesn't exist (fresh DB) vs.
186
+ exists with older version (needs migration) vs.
187
+ exists with newer version (user downgraded the tool — refuse).
188
+ """
189
+ # Check if schema_version table exists
190
+ cursor = await self._db.execute(
191
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='schema_version'"
192
+ )
193
+ table_exists = await cursor.fetchone()
194
+
195
+ if not table_exists:
196
+ # Fresh database — create everything
197
+ await self._db.executescript(SCHEMA_SQL)
198
+ await self._db.execute(
199
+ "INSERT INTO schema_version (version) VALUES (?)",
200
+ (SCHEMA_VERSION,),
201
+ )
202
+ await self._db.commit()
203
+ return
204
+
205
+ # Check version
206
+ cursor = await self._db.execute(
207
+ "SELECT MAX(version) FROM schema_version"
208
+ )
209
+ row = await cursor.fetchone()
210
+ current_version = row[0] if row else 0
211
+
212
+ if current_version == SCHEMA_VERSION:
213
+ return
214
+
215
+ if current_version > SCHEMA_VERSION:
216
+ raise StorageMigrationError(
217
+ f"Database schema version {current_version} is newer than "
218
+ f"supported version {SCHEMA_VERSION}. Please upgrade codebase-intel.",
219
+ )
220
+
221
+ # Future: migration logic goes here
222
+ # For now, v1 is the only version
223
+ logger.info("Migrating schema from v%d to v%d", current_version, SCHEMA_VERSION)
224
+ await self._db.executescript(SCHEMA_SQL)
225
+ await self._db.execute(
226
+ "INSERT INTO schema_version (version) VALUES (?)",
227
+ (SCHEMA_VERSION,),
228
+ )
229
+ await self._db.commit()
230
+
231
+ # -------------------------------------------------------------------
232
+ # Path normalization
233
+ # -------------------------------------------------------------------
234
+
235
+ def _to_stored_path(self, path: Path) -> str:
236
+ """Convert absolute path to POSIX-relative for storage.
237
+
238
+ Edge case: path outside project root (symlink target, monorepo ref).
239
+ We store absolute POSIX path in that case.
240
+ """
241
+ try:
242
+ return str(PurePosixPath(path.resolve().relative_to(self._project_root)))
243
+ except ValueError:
244
+ return str(PurePosixPath(path.resolve()))
245
+
246
+ def _from_stored_path(self, stored: str) -> Path:
247
+ """Convert stored POSIX path back to absolute Path."""
248
+ p = Path(stored)
249
+ if p.is_absolute():
250
+ return p
251
+ return self._project_root / p
252
+
253
+ # -------------------------------------------------------------------
254
+ # Node operations
255
+ # -------------------------------------------------------------------
256
+
257
+ async def upsert_node(self, node: GraphNode) -> None:
258
+ """Insert or update a graph node.
259
+
260
+ Edge case: node with same ID but different content (file changed).
261
+ We update in place — the node_id is deterministic from (path, kind, name).
262
+ """
263
+ import json
264
+
265
+ await self._db.execute(
266
+ """
267
+ INSERT INTO nodes (
268
+ node_id, kind, name, qualified_name, file_path,
269
+ line_start, line_end, language, content_hash, docstring,
270
+ is_generated, is_external, is_test, is_entry_point,
271
+ metadata_json
272
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
273
+ ON CONFLICT(node_id) DO UPDATE SET
274
+ kind=excluded.kind, name=excluded.name,
275
+ qualified_name=excluded.qualified_name,
276
+ file_path=excluded.file_path,
277
+ line_start=excluded.line_start, line_end=excluded.line_end,
278
+ language=excluded.language, content_hash=excluded.content_hash,
279
+ docstring=excluded.docstring,
280
+ is_generated=excluded.is_generated,
281
+ is_external=excluded.is_external,
282
+ is_test=excluded.is_test,
283
+ is_entry_point=excluded.is_entry_point,
284
+ metadata_json=excluded.metadata_json,
285
+ updated_at=datetime('now')
286
+ """,
287
+ (
288
+ node.node_id,
289
+ node.kind.value,
290
+ node.name,
291
+ node.qualified_name,
292
+ self._to_stored_path(node.file_path),
293
+ node.line_range.start if node.line_range else None,
294
+ node.line_range.end if node.line_range else None,
295
+ node.language.value,
296
+ node.content_hash,
297
+ node.docstring,
298
+ int(node.is_generated),
299
+ int(node.is_external),
300
+ int(node.is_test),
301
+ int(node.is_entry_point),
302
+ json.dumps(node.metadata),
303
+ ),
304
+ )
305
+
306
+ async def upsert_nodes_batch(self, nodes: list[GraphNode]) -> None:
307
+ """Batch upsert for initial graph build.
308
+
309
+ Edge case: 100k+ nodes on a large codebase. We chunk into transactions
310
+ of 1000 to balance speed vs. memory and allow partial progress visibility.
311
+ """
312
+ import json
313
+
314
+ chunk_size = 1000
315
+ for i in range(0, len(nodes), chunk_size):
316
+ chunk = nodes[i : i + chunk_size]
317
+ await self._db.executemany(
318
+ """
319
+ INSERT INTO nodes (
320
+ node_id, kind, name, qualified_name, file_path,
321
+ line_start, line_end, language, content_hash, docstring,
322
+ is_generated, is_external, is_test, is_entry_point,
323
+ metadata_json
324
+ ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
325
+ ON CONFLICT(node_id) DO UPDATE SET
326
+ kind=excluded.kind, name=excluded.name,
327
+ qualified_name=excluded.qualified_name,
328
+ file_path=excluded.file_path,
329
+ line_start=excluded.line_start, line_end=excluded.line_end,
330
+ language=excluded.language, content_hash=excluded.content_hash,
331
+ docstring=excluded.docstring,
332
+ is_generated=excluded.is_generated,
333
+ is_external=excluded.is_external,
334
+ is_test=excluded.is_test,
335
+ is_entry_point=excluded.is_entry_point,
336
+ metadata_json=excluded.metadata_json,
337
+ updated_at=datetime('now')
338
+ """,
339
+ [
340
+ (
341
+ n.node_id,
342
+ n.kind.value,
343
+ n.name,
344
+ n.qualified_name,
345
+ self._to_stored_path(n.file_path),
346
+ n.line_range.start if n.line_range else None,
347
+ n.line_range.end if n.line_range else None,
348
+ n.language.value,
349
+ n.content_hash,
350
+ n.docstring,
351
+ int(n.is_generated),
352
+ int(n.is_external),
353
+ int(n.is_test),
354
+ int(n.is_entry_point),
355
+ json.dumps(n.metadata),
356
+ )
357
+ for n in chunk
358
+ ],
359
+ )
360
+ await self._db.commit()
361
+
362
+ async def upsert_edge(self, edge: GraphEdge) -> None:
363
+ """Insert or update a graph edge."""
364
+ import json
365
+
366
+ await self._db.execute(
367
+ """
368
+ INSERT INTO edges (source_id, target_id, kind, confidence, is_type_only, metadata_json)
369
+ VALUES (?, ?, ?, ?, ?, ?)
370
+ ON CONFLICT(source_id, target_id, kind) DO UPDATE SET
371
+ confidence=excluded.confidence,
372
+ is_type_only=excluded.is_type_only,
373
+ metadata_json=excluded.metadata_json
374
+ """,
375
+ (
376
+ edge.source_id,
377
+ edge.target_id,
378
+ edge.kind.value,
379
+ edge.confidence,
380
+ int(edge.is_type_only),
381
+ json.dumps(edge.metadata),
382
+ ),
383
+ )
384
+
385
+ async def upsert_edges_batch(self, edges: list[GraphEdge]) -> None:
386
+ """Batch upsert edges."""
387
+ import json
388
+
389
+ chunk_size = 1000
390
+ for i in range(0, len(edges), chunk_size):
391
+ chunk = edges[i : i + chunk_size]
392
+ await self._db.executemany(
393
+ """
394
+ INSERT INTO edges (source_id, target_id, kind, confidence, is_type_only, metadata_json)
395
+ VALUES (?, ?, ?, ?, ?, ?)
396
+ ON CONFLICT(source_id, target_id, kind) DO UPDATE SET
397
+ confidence=excluded.confidence,
398
+ is_type_only=excluded.is_type_only,
399
+ metadata_json=excluded.metadata_json
400
+ """,
401
+ [
402
+ (
403
+ e.source_id,
404
+ e.target_id,
405
+ e.kind.value,
406
+ e.confidence,
407
+ int(e.is_type_only),
408
+ json.dumps(e.metadata),
409
+ )
410
+ for e in chunk
411
+ ],
412
+ )
413
+ await self._db.commit()
414
+
415
+ async def remove_file_nodes(self, file_path: Path) -> int:
416
+ """Remove all nodes (and their edges via CASCADE) for a file.
417
+
418
+ Used during incremental update: delete old nodes before re-parsing.
419
+
420
+ Edge case: file renamed — old path nodes are removed, new path
421
+ nodes are added. The graph handles this as delete+insert, but
422
+ the orchestrator can detect renames via content_hash matching.
423
+ """
424
+ stored_path = self._to_stored_path(file_path)
425
+ cursor = await self._db.execute(
426
+ "DELETE FROM nodes WHERE file_path = ?", (stored_path,)
427
+ )
428
+ await self._db.commit()
429
+ return cursor.rowcount # type: ignore[return-value]
430
+
431
+ # -------------------------------------------------------------------
432
+ # Query operations
433
+ # -------------------------------------------------------------------
434
+
435
+ def _row_to_node(self, row: aiosqlite.Row) -> GraphNode:
436
+ """Convert a database row to a GraphNode."""
437
+ import json
438
+
439
+ line_range = None
440
+ if row[5] is not None and row[6] is not None:
441
+ line_range = LineRange(start=row[5], end=row[6])
442
+
443
+ return GraphNode(
444
+ node_id=row[0],
445
+ kind=NodeKind(row[1]),
446
+ name=row[2],
447
+ qualified_name=row[3],
448
+ file_path=self._from_stored_path(row[4]),
449
+ line_range=line_range,
450
+ language=Language(row[7]),
451
+ content_hash=row[8],
452
+ docstring=row[9],
453
+ is_generated=bool(row[10]),
454
+ is_external=bool(row[11]),
455
+ is_test=bool(row[12]),
456
+ is_entry_point=bool(row[13]),
457
+ metadata=json.loads(row[14]) if row[14] else {},
458
+ )
459
+
460
+ async def get_node(self, node_id: str) -> GraphNode | None:
461
+ """Get a single node by ID."""
462
+ self._db.row_factory = None
463
+ cursor = await self._db.execute(
464
+ """
465
+ SELECT node_id, kind, name, qualified_name, file_path,
466
+ line_start, line_end, language, content_hash, docstring,
467
+ is_generated, is_external, is_test, is_entry_point,
468
+ metadata_json
469
+ FROM nodes WHERE node_id = ?
470
+ """,
471
+ (node_id,),
472
+ )
473
+ row = await cursor.fetchone()
474
+ return self._row_to_node(row) if row else None
475
+
476
+ async def get_nodes_by_file(self, file_path: Path) -> list[GraphNode]:
477
+ """Get all nodes defined in a file."""
478
+ stored_path = self._to_stored_path(file_path)
479
+ cursor = await self._db.execute(
480
+ """
481
+ SELECT node_id, kind, name, qualified_name, file_path,
482
+ line_start, line_end, language, content_hash, docstring,
483
+ is_generated, is_external, is_test, is_entry_point,
484
+ metadata_json
485
+ FROM nodes WHERE file_path = ?
486
+ ORDER BY line_start
487
+ """,
488
+ (stored_path,),
489
+ )
490
+ return [self._row_to_node(row) for row in await cursor.fetchall()]
491
+
492
+ async def get_dependents(
493
+ self,
494
+ node_id: str,
495
+ *,
496
+ edge_kinds: list[EdgeKind] | None = None,
497
+ include_type_only: bool = False,
498
+ max_depth: int = 1,
499
+ ) -> list[GraphNode]:
500
+ """Find all nodes that depend ON the given node (reverse traversal).
501
+
502
+ This answers: "what breaks if I change this?"
503
+
504
+ Edge cases:
505
+ - Circular dependencies: tracked via visited set, no infinite loop
506
+ - Deep chains: capped at max_depth to prevent runaway
507
+ - Type-only deps: excluded by default (changing implementation doesn't
508
+ break a TYPE_CHECKING import)
509
+ - Dynamic imports: included but with lower confidence
510
+ """
511
+ visited: set[str] = set()
512
+ result: list[GraphNode] = []
513
+ queue: list[tuple[str, int]] = [(node_id, 0)]
514
+
515
+ kind_filter = ""
516
+ if edge_kinds:
517
+ kinds_sql = ",".join(f"'{k.value}'" for k in edge_kinds)
518
+ kind_filter = f"AND kind IN ({kinds_sql})"
519
+
520
+ type_filter = "" if include_type_only else "AND is_type_only = 0"
521
+
522
+ while queue:
523
+ current_id, depth = queue.pop(0)
524
+ if current_id in visited or depth > max_depth:
525
+ continue
526
+ visited.add(current_id)
527
+
528
+ cursor = await self._db.execute(
529
+ f"""
530
+ SELECT source_id FROM edges
531
+ WHERE target_id = ? {kind_filter} {type_filter}
532
+ """, # noqa: S608
533
+ (current_id,),
534
+ )
535
+ for row in await cursor.fetchall():
536
+ source_id = row[0]
537
+ if source_id not in visited:
538
+ node = await self.get_node(source_id)
539
+ if node:
540
+ result.append(node)
541
+ if depth + 1 <= max_depth:
542
+ queue.append((source_id, depth + 1))
543
+
544
+ return result
545
+
546
+ async def get_dependencies(
547
+ self,
548
+ node_id: str,
549
+ *,
550
+ edge_kinds: list[EdgeKind] | None = None,
551
+ include_type_only: bool = True,
552
+ max_depth: int = 1,
553
+ ) -> list[GraphNode]:
554
+ """Find all nodes that the given node depends on (forward traversal).
555
+
556
+ This answers: "what context do I need to understand this?"
557
+ """
558
+ visited: set[str] = set()
559
+ result: list[GraphNode] = []
560
+ queue: list[tuple[str, int]] = [(node_id, 0)]
561
+
562
+ kind_filter = ""
563
+ if edge_kinds:
564
+ kinds_sql = ",".join(f"'{k.value}'" for k in edge_kinds)
565
+ kind_filter = f"AND kind IN ({kinds_sql})"
566
+
567
+ type_filter = "" if include_type_only else "AND is_type_only = 0"
568
+
569
+ while queue:
570
+ current_id, depth = queue.pop(0)
571
+ if current_id in visited or depth > max_depth:
572
+ continue
573
+ visited.add(current_id)
574
+
575
+ cursor = await self._db.execute(
576
+ f"""
577
+ SELECT target_id FROM edges
578
+ WHERE source_id = ? {kind_filter} {type_filter}
579
+ """, # noqa: S608
580
+ (current_id,),
581
+ )
582
+ for row in await cursor.fetchall():
583
+ target_id = row[0]
584
+ if target_id not in visited:
585
+ node = await self.get_node(target_id)
586
+ if node:
587
+ result.append(node)
588
+ if depth + 1 <= max_depth:
589
+ queue.append((target_id, depth + 1))
590
+
591
+ return result
592
+
593
    async def find_cycles(self, max_cycle_length: int = 10) -> list[list[str]]:
        """Detect circular dependency chains in the graph.

        Edge case: large graphs can have many cycles. We cap detection
        at max_cycle_length to keep this practical. Reports the shortest
        cycles first (most actionable).

        Uses DFS with back-edge detection: a neighbor found on the current
        recursion stack closes a cycle. At most 50 cycles are collected.

        Returns:
            Cycles as node-ID lists, each ending with a repeat of its first
            element, sorted shortest-first.
        """
        # Get all node IDs
        cursor = await self._db.execute("SELECT node_id FROM nodes")
        all_nodes = [row[0] for row in await cursor.fetchall()]

        visited: set[str] = set()        # ever entered by the DFS
        rec_stack: set[str] = set()      # on the current recursion stack
        path: list[str] = []             # current DFS path, mirrors rec_stack
        cycles: list[list[str]] = []

        async def _dfs(node_id: str) -> None:
            if len(cycles) >= 50:  # Cap to prevent excessive output
                return
            # NOTE(review): this depth cutoff returns before marking the node
            # visited, so a node beyond the cap can be re-explored later via
            # a shorter path — appears intentional, but confirm.
            if len(path) > max_cycle_length:
                return

            visited.add(node_id)
            rec_stack.add(node_id)
            path.append(node_id)

            cursor = await self._db.execute(
                "SELECT target_id FROM edges WHERE source_id = ?",
                (node_id,),
            )
            for row in await cursor.fetchall():
                target_id = row[0]
                if target_id not in visited:
                    await _dfs(target_id)
                elif target_id in rec_stack:
                    # Found a cycle: slice the path from the first occurrence
                    # of the target and close it by repeating the target.
                    cycle_start = path.index(target_id)
                    cycle = path[cycle_start:] + [target_id]
                    cycles.append(cycle)

            # Backtrack: node leaves the active path but stays in `visited`.
            path.pop()
            rec_stack.discard(node_id)

        # Restart the DFS from every still-unvisited node so disconnected
        # components are covered too.
        for node_id in all_nodes:
            if node_id not in visited:
                await _dfs(node_id)

        return sorted(cycles, key=len)
643
+
644
+ async def impact_analysis(
645
+ self,
646
+ file_paths: list[Path],
647
+ max_depth: int = 3,
648
+ ) -> dict[str, list[GraphNode]]:
649
+ """For a set of changed files, find all transitively affected nodes.
650
+
651
+ This is the core query for the orchestrator: "these files changed,
652
+ what else needs to be in context?"
653
+
654
+ Edge cases:
655
+ - Changed file not in graph: it's new, return empty (no dependents yet)
656
+ - Changed file is a barrel/index: many dependents, may blow up results
657
+ → cap at 100 dependents per file and flag truncation
658
+ - Changed file is generated: lower priority in results
659
+ - Changed file is a config: flag as potentially affecting everything
660
+ that reads this config
661
+
662
+ Returns: dict mapping file path → list of affected nodes
663
+ """
664
+ result: dict[str, list[GraphNode]] = {}
665
+ max_dependents_per_file = 100
666
+
667
+ for fp in file_paths:
668
+ nodes = await self.get_nodes_by_file(fp)
669
+ affected: list[GraphNode] = []
670
+ seen: set[str] = set()
671
+
672
+ for node in nodes:
673
+ dependents = await self.get_dependents(
674
+ node.node_id, max_depth=max_depth
675
+ )
676
+ for dep in dependents:
677
+ if dep.node_id not in seen:
678
+ seen.add(dep.node_id)
679
+ affected.append(dep)
680
+
681
+ if len(affected) >= max_dependents_per_file:
682
+ logger.warning(
683
+ "Impact analysis for %s truncated at %d dependents",
684
+ fp,
685
+ max_dependents_per_file,
686
+ )
687
+ break
688
+
689
+ result[str(fp)] = affected
690
+
691
+ return result
692
+
693
+ async def get_stats(self) -> dict[str, int]:
694
+ """Return graph statistics for health checks and CLI display."""
695
+ stats: dict[str, int] = {}
696
+ for table in ("nodes", "edges", "file_fingerprints"):
697
+ cursor = await self._db.execute(f"SELECT COUNT(*) FROM {table}") # noqa: S608
698
+ row = await cursor.fetchone()
699
+ stats[f"{table}_count"] = row[0] if row else 0
700
+ return stats
701
+
702
+ async def get_fingerprint(self, file_path: Path) -> str | None:
703
+ """Get stored content hash for a file (for incremental update checks)."""
704
+ stored_path = self._to_stored_path(file_path)
705
+ cursor = await self._db.execute(
706
+ "SELECT content_hash FROM file_fingerprints WHERE file_path = ?",
707
+ (stored_path,),
708
+ )
709
+ row = await cursor.fetchone()
710
+ return row[0] if row else None
711
+
712
+ async def update_fingerprint(
713
+ self,
714
+ file_path: Path,
715
+ content_hash: str,
716
+ size_bytes: int,
717
+ last_modified: str,
718
+ language: Language,
719
+ node_count: int,
720
+ ) -> None:
721
+ """Update the stored fingerprint for a file."""
722
+ stored_path = self._to_stored_path(file_path)
723
+ await self._db.execute(
724
+ """
725
+ INSERT INTO file_fingerprints (file_path, content_hash, size_bytes, last_modified, language, node_count)
726
+ VALUES (?, ?, ?, ?, ?, ?)
727
+ ON CONFLICT(file_path) DO UPDATE SET
728
+ content_hash=excluded.content_hash,
729
+ size_bytes=excluded.size_bytes,
730
+ last_modified=excluded.last_modified,
731
+ language=excluded.language,
732
+ node_count=excluded.node_count
733
+ """,
734
+ (stored_path, content_hash, size_bytes, last_modified, language.value, node_count),
735
+ )
736
+ await self._db.commit()