codebase-intel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_intel/__init__.py +3 -0
- codebase_intel/analytics/__init__.py +1 -0
- codebase_intel/analytics/benchmark.py +406 -0
- codebase_intel/analytics/feedback.py +496 -0
- codebase_intel/analytics/tracker.py +439 -0
- codebase_intel/cli/__init__.py +1 -0
- codebase_intel/cli/main.py +740 -0
- codebase_intel/contracts/__init__.py +1 -0
- codebase_intel/contracts/auto_generator.py +438 -0
- codebase_intel/contracts/evaluator.py +531 -0
- codebase_intel/contracts/models.py +433 -0
- codebase_intel/contracts/registry.py +225 -0
- codebase_intel/core/__init__.py +1 -0
- codebase_intel/core/config.py +248 -0
- codebase_intel/core/exceptions.py +454 -0
- codebase_intel/core/types.py +375 -0
- codebase_intel/decisions/__init__.py +1 -0
- codebase_intel/decisions/miner.py +297 -0
- codebase_intel/decisions/models.py +302 -0
- codebase_intel/decisions/store.py +411 -0
- codebase_intel/drift/__init__.py +1 -0
- codebase_intel/drift/detector.py +443 -0
- codebase_intel/graph/__init__.py +1 -0
- codebase_intel/graph/builder.py +391 -0
- codebase_intel/graph/parser.py +1232 -0
- codebase_intel/graph/query.py +377 -0
- codebase_intel/graph/storage.py +736 -0
- codebase_intel/mcp/__init__.py +1 -0
- codebase_intel/mcp/server.py +710 -0
- codebase_intel/orchestrator/__init__.py +1 -0
- codebase_intel/orchestrator/assembler.py +649 -0
- codebase_intel-0.1.0.dist-info/METADATA +361 -0
- codebase_intel-0.1.0.dist-info/RECORD +36 -0
- codebase_intel-0.1.0.dist-info/WHEEL +4 -0
- codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
- codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,736 @@
|
|
|
1
|
+
"""SQLite-backed graph storage with WAL mode for concurrent access.
|
|
2
|
+
|
|
3
|
+
Design decisions:
|
|
4
|
+
- SQLite over a graph DB (Neo4j, etc.) to maintain zero-dependency portability
|
|
5
|
+
- WAL mode enables concurrent readers (MCP queries) + single writer (git hook updates)
|
|
6
|
+
- Schema versioning for forward migration without data loss
|
|
7
|
+
- Batch operations for initial build (10k+ nodes), single ops for incremental
|
|
8
|
+
|
|
9
|
+
Edge cases handled:
|
|
10
|
+
- Concurrent writes: WAL mode + busy timeout + application-level retry
|
|
11
|
+
- Corrupt database: detect via integrity check, offer re-initialization
|
|
12
|
+
- Schema migration: version table tracks schema, auto-migrate on open
|
|
13
|
+
- Large codebases: batch inserts with transaction chunking (100k+ nodes)
|
|
14
|
+
- Partial writes: crash during build → incomplete graph → detect via marker table
|
|
15
|
+
- Disk full: catch and surface meaningful error
|
|
16
|
+
- Path encoding: SQLite stores paths as TEXT, we normalize to POSIX format
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import logging
|
|
22
|
+
from contextlib import asynccontextmanager
|
|
23
|
+
from pathlib import Path, PurePosixPath
|
|
24
|
+
from typing import TYPE_CHECKING, AsyncIterator
|
|
25
|
+
|
|
26
|
+
import aiosqlite
|
|
27
|
+
|
|
28
|
+
from codebase_intel.core.exceptions import (
|
|
29
|
+
ErrorContext,
|
|
30
|
+
StorageConcurrencyError,
|
|
31
|
+
StorageCorruptError,
|
|
32
|
+
StorageMigrationError,
|
|
33
|
+
)
|
|
34
|
+
from codebase_intel.core.types import (
|
|
35
|
+
EdgeKind,
|
|
36
|
+
GraphEdge,
|
|
37
|
+
GraphNode,
|
|
38
|
+
Language,
|
|
39
|
+
LineRange,
|
|
40
|
+
NodeKind,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
if TYPE_CHECKING:
|
|
44
|
+
from codebase_intel.core.config import GraphConfig
|
|
45
|
+
|
|
46
|
+
logger = logging.getLogger(__name__)
|
|
47
|
+
|
|
48
|
+
# Current schema revision. GraphStorage._ensure_schema compares this against
# the value recorded in the schema_version table and migrates (or refuses)
# accordingly.
SCHEMA_VERSION = 1

# Idempotent DDL: every statement uses IF NOT EXISTS, so re-running
# executescript(SCHEMA_SQL) against an existing database is safe (used both
# for fresh creation and as the v1 "migration").
SCHEMA_SQL = """
-- Schema version tracking
CREATE TABLE IF NOT EXISTS schema_version (
    version INTEGER NOT NULL,
    migrated_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Build status tracking (detect incomplete builds)
CREATE TABLE IF NOT EXISTS build_status (
    build_id TEXT PRIMARY KEY,
    started_at TEXT NOT NULL,
    completed_at TEXT,
    file_count INTEGER DEFAULT 0,
    node_count INTEGER DEFAULT 0,
    edge_count INTEGER DEFAULT 0
);

-- Graph nodes
CREATE TABLE IF NOT EXISTS nodes (
    node_id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    name TEXT NOT NULL,
    qualified_name TEXT NOT NULL,
    file_path TEXT NOT NULL, -- POSIX-normalized, relative to project root
    line_start INTEGER,
    line_end INTEGER,
    language TEXT NOT NULL DEFAULT 'unknown',
    content_hash TEXT,
    docstring TEXT,
    is_generated INTEGER NOT NULL DEFAULT 0,
    is_external INTEGER NOT NULL DEFAULT 0,
    is_test INTEGER NOT NULL DEFAULT 0,
    is_entry_point INTEGER NOT NULL DEFAULT 0,
    metadata_json TEXT DEFAULT '{}',
    updated_at TEXT NOT NULL DEFAULT (datetime('now'))
);

-- Graph edges
CREATE TABLE IF NOT EXISTS edges (
    source_id TEXT NOT NULL REFERENCES nodes(node_id) ON DELETE CASCADE,
    target_id TEXT NOT NULL REFERENCES nodes(node_id) ON DELETE CASCADE,
    kind TEXT NOT NULL,
    confidence REAL NOT NULL DEFAULT 1.0,
    is_type_only INTEGER NOT NULL DEFAULT 0,
    metadata_json TEXT DEFAULT '{}',
    PRIMARY KEY (source_id, target_id, kind)
);

-- File fingerprints (for incremental updates — only re-parse changed files)
CREATE TABLE IF NOT EXISTS file_fingerprints (
    file_path TEXT PRIMARY KEY,
    content_hash TEXT NOT NULL,
    size_bytes INTEGER NOT NULL,
    last_modified TEXT NOT NULL,
    language TEXT NOT NULL DEFAULT 'unknown',
    node_count INTEGER NOT NULL DEFAULT 0
);

-- Indexes for common query patterns
CREATE INDEX IF NOT EXISTS idx_nodes_file ON nodes(file_path);
CREATE INDEX IF NOT EXISTS idx_nodes_kind ON nodes(kind);
CREATE INDEX IF NOT EXISTS idx_nodes_name ON nodes(name);
CREATE INDEX IF NOT EXISTS idx_nodes_qualified ON nodes(qualified_name);
CREATE INDEX IF NOT EXISTS idx_edges_source ON edges(source_id);
CREATE INDEX IF NOT EXISTS idx_edges_target ON edges(target_id);
CREATE INDEX IF NOT EXISTS idx_edges_kind ON edges(kind);
"""
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class GraphStorage:
    """Async SQLite storage for the semantic code graph.

    Wraps a single aiosqlite connection configured by :meth:`open`
    (WAL journal, busy timeout, foreign keys enforced).

    Usage:
        async with GraphStorage.open(config) as storage:
            await storage.upsert_node(node)
            nodes = await storage.get_dependents("node_id")
    """

    def __init__(self, db: aiosqlite.Connection, project_root: Path) -> None:
        # Callers should prefer GraphStorage.open(); __init__ only records an
        # already-configured connection and the root used to relativize
        # stored file paths.
        self._db = db
        self._project_root = project_root
|
|
131
|
+
|
|
132
|
+
    @classmethod
    @asynccontextmanager
    async def open(cls, config: GraphConfig, project_root: Path) -> AsyncIterator[GraphStorage]:
        """Open (or create) the graph database with proper configuration.

        Yields a ready-to-use GraphStorage; the connection is always closed
        when the context exits (the ``finally`` below), even on error.

        Edge cases:
        - DB file doesn't exist: create with full schema
        - DB file exists but wrong version: migrate or error
        - DB file is corrupt: integrity check fails, raise StorageCorruptError
        - DB locked by another process: retry with busy_timeout
        """
        db_path = config.db_path
        db_path.parent.mkdir(parents=True, exist_ok=True)

        db = await aiosqlite.connect(str(db_path))
        try:
            # Enable WAL mode for concurrent read/write
            if config.enable_wal_mode:
                await db.execute("PRAGMA journal_mode=WAL")

            # Busy timeout: wait up to 5s for locks instead of failing immediately
            # Edge case: git hook and MCP server both active
            await db.execute("PRAGMA busy_timeout=5000")

            # Foreign keys for cascade deletes (removing a node removes its edges)
            await db.execute("PRAGMA foreign_keys=ON")

            # Integrity check on first open
            result = await db.execute("PRAGMA integrity_check")
            check = await result.fetchone()
            if check and check[0] != "ok":
                raise StorageCorruptError(
                    f"Database integrity check failed: {check[0]}",
                    ErrorContext(file_path=db_path, operation="integrity_check"),
                )

            storage = cls(db, project_root)
            await storage._ensure_schema()
            # Suspends here while the caller uses the storage; control returns
            # when the async-with block exits.
            yield storage

        except aiosqlite.OperationalError as exc:
            # NOTE(review): exceptions raised inside the caller's async-with
            # body propagate back through the yield, so caller-side
            # OperationalErrors are also translated here — confirm intended.
            if "database is locked" in str(exc):
                raise StorageConcurrencyError(
                    "Database is locked by another process",
                    ErrorContext(file_path=db_path, operation="open"),
                ) from exc
            raise
        finally:
            await db.close()
|
|
181
|
+
|
|
182
|
+
async def _ensure_schema(self) -> None:
|
|
183
|
+
"""Create or migrate the schema.
|
|
184
|
+
|
|
185
|
+
Edge case: schema_version table doesn't exist (fresh DB) vs.
|
|
186
|
+
exists with older version (needs migration) vs.
|
|
187
|
+
exists with newer version (user downgraded the tool — refuse).
|
|
188
|
+
"""
|
|
189
|
+
# Check if schema_version table exists
|
|
190
|
+
cursor = await self._db.execute(
|
|
191
|
+
"SELECT name FROM sqlite_master WHERE type='table' AND name='schema_version'"
|
|
192
|
+
)
|
|
193
|
+
table_exists = await cursor.fetchone()
|
|
194
|
+
|
|
195
|
+
if not table_exists:
|
|
196
|
+
# Fresh database — create everything
|
|
197
|
+
await self._db.executescript(SCHEMA_SQL)
|
|
198
|
+
await self._db.execute(
|
|
199
|
+
"INSERT INTO schema_version (version) VALUES (?)",
|
|
200
|
+
(SCHEMA_VERSION,),
|
|
201
|
+
)
|
|
202
|
+
await self._db.commit()
|
|
203
|
+
return
|
|
204
|
+
|
|
205
|
+
# Check version
|
|
206
|
+
cursor = await self._db.execute(
|
|
207
|
+
"SELECT MAX(version) FROM schema_version"
|
|
208
|
+
)
|
|
209
|
+
row = await cursor.fetchone()
|
|
210
|
+
current_version = row[0] if row else 0
|
|
211
|
+
|
|
212
|
+
if current_version == SCHEMA_VERSION:
|
|
213
|
+
return
|
|
214
|
+
|
|
215
|
+
if current_version > SCHEMA_VERSION:
|
|
216
|
+
raise StorageMigrationError(
|
|
217
|
+
f"Database schema version {current_version} is newer than "
|
|
218
|
+
f"supported version {SCHEMA_VERSION}. Please upgrade codebase-intel.",
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
# Future: migration logic goes here
|
|
222
|
+
# For now, v1 is the only version
|
|
223
|
+
logger.info("Migrating schema from v%d to v%d", current_version, SCHEMA_VERSION)
|
|
224
|
+
await self._db.executescript(SCHEMA_SQL)
|
|
225
|
+
await self._db.execute(
|
|
226
|
+
"INSERT INTO schema_version (version) VALUES (?)",
|
|
227
|
+
(SCHEMA_VERSION,),
|
|
228
|
+
)
|
|
229
|
+
await self._db.commit()
|
|
230
|
+
|
|
231
|
+
# -------------------------------------------------------------------
|
|
232
|
+
# Path normalization
|
|
233
|
+
# -------------------------------------------------------------------
|
|
234
|
+
|
|
235
|
+
def _to_stored_path(self, path: Path) -> str:
|
|
236
|
+
"""Convert absolute path to POSIX-relative for storage.
|
|
237
|
+
|
|
238
|
+
Edge case: path outside project root (symlink target, monorepo ref).
|
|
239
|
+
We store absolute POSIX path in that case.
|
|
240
|
+
"""
|
|
241
|
+
try:
|
|
242
|
+
return str(PurePosixPath(path.resolve().relative_to(self._project_root)))
|
|
243
|
+
except ValueError:
|
|
244
|
+
return str(PurePosixPath(path.resolve()))
|
|
245
|
+
|
|
246
|
+
def _from_stored_path(self, stored: str) -> Path:
|
|
247
|
+
"""Convert stored POSIX path back to absolute Path."""
|
|
248
|
+
p = Path(stored)
|
|
249
|
+
if p.is_absolute():
|
|
250
|
+
return p
|
|
251
|
+
return self._project_root / p
|
|
252
|
+
|
|
253
|
+
# -------------------------------------------------------------------
|
|
254
|
+
# Node operations
|
|
255
|
+
# -------------------------------------------------------------------
|
|
256
|
+
|
|
257
|
+
    async def upsert_node(self, node: GraphNode) -> None:
        """Insert or update a graph node.

        Edge case: node with same ID but different content (file changed).
        We update in place — the node_id is deterministic from (path, kind, name).

        Note: does not commit; callers control transaction boundaries.
        """
        import json

        await self._db.execute(
            """
            INSERT INTO nodes (
                node_id, kind, name, qualified_name, file_path,
                line_start, line_end, language, content_hash, docstring,
                is_generated, is_external, is_test, is_entry_point,
                metadata_json
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(node_id) DO UPDATE SET
                kind=excluded.kind, name=excluded.name,
                qualified_name=excluded.qualified_name,
                file_path=excluded.file_path,
                line_start=excluded.line_start, line_end=excluded.line_end,
                language=excluded.language, content_hash=excluded.content_hash,
                docstring=excluded.docstring,
                is_generated=excluded.is_generated,
                is_external=excluded.is_external,
                is_test=excluded.is_test,
                is_entry_point=excluded.is_entry_point,
                metadata_json=excluded.metadata_json,
                updated_at=datetime('now')
            """,
            (
                node.node_id,
                node.kind.value,
                node.name,
                node.qualified_name,
                # Paths are stored POSIX-relative to the project root.
                self._to_stored_path(node.file_path),
                node.line_range.start if node.line_range else None,
                node.line_range.end if node.line_range else None,
                node.language.value,
                node.content_hash,
                node.docstring,
                # Booleans stored as 0/1 INTEGER columns.
                int(node.is_generated),
                int(node.is_external),
                int(node.is_test),
                int(node.is_entry_point),
                json.dumps(node.metadata),
            ),
        )
|
|
305
|
+
|
|
306
|
+
    async def upsert_nodes_batch(self, nodes: list[GraphNode]) -> None:
        """Batch upsert for initial graph build.

        Edge case: 100k+ nodes on a large codebase. We chunk into transactions
        of 1000 to balance speed vs. memory and allow partial progress visibility.

        Commits after each chunk, so a crash mid-build leaves earlier chunks
        persisted.
        """
        import json

        chunk_size = 1000
        for i in range(0, len(nodes), chunk_size):
            chunk = nodes[i : i + chunk_size]
            # Same UPSERT statement as upsert_node, executed per chunk.
            await self._db.executemany(
                """
                INSERT INTO nodes (
                    node_id, kind, name, qualified_name, file_path,
                    line_start, line_end, language, content_hash, docstring,
                    is_generated, is_external, is_test, is_entry_point,
                    metadata_json
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                ON CONFLICT(node_id) DO UPDATE SET
                    kind=excluded.kind, name=excluded.name,
                    qualified_name=excluded.qualified_name,
                    file_path=excluded.file_path,
                    line_start=excluded.line_start, line_end=excluded.line_end,
                    language=excluded.language, content_hash=excluded.content_hash,
                    docstring=excluded.docstring,
                    is_generated=excluded.is_generated,
                    is_external=excluded.is_external,
                    is_test=excluded.is_test,
                    is_entry_point=excluded.is_entry_point,
                    metadata_json=excluded.metadata_json,
                    updated_at=datetime('now')
                """,
                [
                    (
                        n.node_id,
                        n.kind.value,
                        n.name,
                        n.qualified_name,
                        self._to_stored_path(n.file_path),
                        n.line_range.start if n.line_range else None,
                        n.line_range.end if n.line_range else None,
                        n.language.value,
                        n.content_hash,
                        n.docstring,
                        int(n.is_generated),
                        int(n.is_external),
                        int(n.is_test),
                        int(n.is_entry_point),
                        json.dumps(n.metadata),
                    )
                    for n in chunk
                ],
            )
            await self._db.commit()
|
|
361
|
+
|
|
362
|
+
    async def upsert_edge(self, edge: GraphEdge) -> None:
        """Insert or update a graph edge.

        (source_id, target_id, kind) is the table's primary key; on conflict
        the confidence, type-only flag and metadata are refreshed.

        Note: does not commit; callers control transaction boundaries.
        """
        import json

        await self._db.execute(
            """
            INSERT INTO edges (source_id, target_id, kind, confidence, is_type_only, metadata_json)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(source_id, target_id, kind) DO UPDATE SET
                confidence=excluded.confidence,
                is_type_only=excluded.is_type_only,
                metadata_json=excluded.metadata_json
            """,
            (
                edge.source_id,
                edge.target_id,
                edge.kind.value,
                edge.confidence,
                int(edge.is_type_only),
                json.dumps(edge.metadata),
            ),
        )
|
|
384
|
+
|
|
385
|
+
    async def upsert_edges_batch(self, edges: list[GraphEdge]) -> None:
        """Batch upsert edges.

        Mirrors upsert_nodes_batch: chunks of 1000 with a commit per chunk.
        """
        import json

        chunk_size = 1000
        for i in range(0, len(edges), chunk_size):
            chunk = edges[i : i + chunk_size]
            await self._db.executemany(
                """
                INSERT INTO edges (source_id, target_id, kind, confidence, is_type_only, metadata_json)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(source_id, target_id, kind) DO UPDATE SET
                    confidence=excluded.confidence,
                    is_type_only=excluded.is_type_only,
                    metadata_json=excluded.metadata_json
                """,
                [
                    (
                        e.source_id,
                        e.target_id,
                        e.kind.value,
                        e.confidence,
                        int(e.is_type_only),
                        json.dumps(e.metadata),
                    )
                    for e in chunk
                ],
            )
            await self._db.commit()
|
|
414
|
+
|
|
415
|
+
    async def remove_file_nodes(self, file_path: Path) -> int:
        """Remove all nodes (and their edges via CASCADE) for a file.

        Used during incremental update: delete old nodes before re-parsing.

        Edge case: file renamed — old path nodes are removed, new path
        nodes are added. The graph handles this as delete+insert, but
        the orchestrator can detect renames via content_hash matching.

        Returns:
            Number of node rows deleted.
        """
        stored_path = self._to_stored_path(file_path)
        cursor = await self._db.execute(
            "DELETE FROM nodes WHERE file_path = ?", (stored_path,)
        )
        await self._db.commit()
        # aiosqlite types rowcount loosely; for a DELETE it is an int.
        return cursor.rowcount  # type: ignore[return-value]
|
|
430
|
+
|
|
431
|
+
# -------------------------------------------------------------------
|
|
432
|
+
# Query operations
|
|
433
|
+
# -------------------------------------------------------------------
|
|
434
|
+
|
|
435
|
+
def _row_to_node(self, row: aiosqlite.Row) -> GraphNode:
|
|
436
|
+
"""Convert a database row to a GraphNode."""
|
|
437
|
+
import json
|
|
438
|
+
|
|
439
|
+
line_range = None
|
|
440
|
+
if row[5] is not None and row[6] is not None:
|
|
441
|
+
line_range = LineRange(start=row[5], end=row[6])
|
|
442
|
+
|
|
443
|
+
return GraphNode(
|
|
444
|
+
node_id=row[0],
|
|
445
|
+
kind=NodeKind(row[1]),
|
|
446
|
+
name=row[2],
|
|
447
|
+
qualified_name=row[3],
|
|
448
|
+
file_path=self._from_stored_path(row[4]),
|
|
449
|
+
line_range=line_range,
|
|
450
|
+
language=Language(row[7]),
|
|
451
|
+
content_hash=row[8],
|
|
452
|
+
docstring=row[9],
|
|
453
|
+
is_generated=bool(row[10]),
|
|
454
|
+
is_external=bool(row[11]),
|
|
455
|
+
is_test=bool(row[12]),
|
|
456
|
+
is_entry_point=bool(row[13]),
|
|
457
|
+
metadata=json.loads(row[14]) if row[14] else {},
|
|
458
|
+
)
|
|
459
|
+
|
|
460
|
+
    async def get_node(self, node_id: str) -> GraphNode | None:
        """Get a single node by ID, or None if it does not exist."""
        # Reset any row factory so _row_to_node can index columns positionally.
        # NOTE(review): this mutates shared connection state for all users of
        # the connection — confirm no caller relies on a custom row_factory.
        self._db.row_factory = None
        cursor = await self._db.execute(
            """
            SELECT node_id, kind, name, qualified_name, file_path,
                   line_start, line_end, language, content_hash, docstring,
                   is_generated, is_external, is_test, is_entry_point,
                   metadata_json
            FROM nodes WHERE node_id = ?
            """,
            (node_id,),
        )
        row = await cursor.fetchone()
        return self._row_to_node(row) if row else None
|
|
475
|
+
|
|
476
|
+
    async def get_nodes_by_file(self, file_path: Path) -> list[GraphNode]:
        """Get all nodes defined in a file, ordered by starting line."""
        # Stored keys are POSIX-relative, so normalize the lookup path the
        # same way they were written.
        stored_path = self._to_stored_path(file_path)
        cursor = await self._db.execute(
            """
            SELECT node_id, kind, name, qualified_name, file_path,
                   line_start, line_end, language, content_hash, docstring,
                   is_generated, is_external, is_test, is_entry_point,
                   metadata_json
            FROM nodes WHERE file_path = ?
            ORDER BY line_start
            """,
            (stored_path,),
        )
        return [self._row_to_node(row) for row in await cursor.fetchall()]
|
|
491
|
+
|
|
492
|
+
async def get_dependents(
|
|
493
|
+
self,
|
|
494
|
+
node_id: str,
|
|
495
|
+
*,
|
|
496
|
+
edge_kinds: list[EdgeKind] | None = None,
|
|
497
|
+
include_type_only: bool = False,
|
|
498
|
+
max_depth: int = 1,
|
|
499
|
+
) -> list[GraphNode]:
|
|
500
|
+
"""Find all nodes that depend ON the given node (reverse traversal).
|
|
501
|
+
|
|
502
|
+
This answers: "what breaks if I change this?"
|
|
503
|
+
|
|
504
|
+
Edge cases:
|
|
505
|
+
- Circular dependencies: tracked via visited set, no infinite loop
|
|
506
|
+
- Deep chains: capped at max_depth to prevent runaway
|
|
507
|
+
- Type-only deps: excluded by default (changing implementation doesn't
|
|
508
|
+
break a TYPE_CHECKING import)
|
|
509
|
+
- Dynamic imports: included but with lower confidence
|
|
510
|
+
"""
|
|
511
|
+
visited: set[str] = set()
|
|
512
|
+
result: list[GraphNode] = []
|
|
513
|
+
queue: list[tuple[str, int]] = [(node_id, 0)]
|
|
514
|
+
|
|
515
|
+
kind_filter = ""
|
|
516
|
+
if edge_kinds:
|
|
517
|
+
kinds_sql = ",".join(f"'{k.value}'" for k in edge_kinds)
|
|
518
|
+
kind_filter = f"AND kind IN ({kinds_sql})"
|
|
519
|
+
|
|
520
|
+
type_filter = "" if include_type_only else "AND is_type_only = 0"
|
|
521
|
+
|
|
522
|
+
while queue:
|
|
523
|
+
current_id, depth = queue.pop(0)
|
|
524
|
+
if current_id in visited or depth > max_depth:
|
|
525
|
+
continue
|
|
526
|
+
visited.add(current_id)
|
|
527
|
+
|
|
528
|
+
cursor = await self._db.execute(
|
|
529
|
+
f"""
|
|
530
|
+
SELECT source_id FROM edges
|
|
531
|
+
WHERE target_id = ? {kind_filter} {type_filter}
|
|
532
|
+
""", # noqa: S608
|
|
533
|
+
(current_id,),
|
|
534
|
+
)
|
|
535
|
+
for row in await cursor.fetchall():
|
|
536
|
+
source_id = row[0]
|
|
537
|
+
if source_id not in visited:
|
|
538
|
+
node = await self.get_node(source_id)
|
|
539
|
+
if node:
|
|
540
|
+
result.append(node)
|
|
541
|
+
if depth + 1 <= max_depth:
|
|
542
|
+
queue.append((source_id, depth + 1))
|
|
543
|
+
|
|
544
|
+
return result
|
|
545
|
+
|
|
546
|
+
async def get_dependencies(
|
|
547
|
+
self,
|
|
548
|
+
node_id: str,
|
|
549
|
+
*,
|
|
550
|
+
edge_kinds: list[EdgeKind] | None = None,
|
|
551
|
+
include_type_only: bool = True,
|
|
552
|
+
max_depth: int = 1,
|
|
553
|
+
) -> list[GraphNode]:
|
|
554
|
+
"""Find all nodes that the given node depends on (forward traversal).
|
|
555
|
+
|
|
556
|
+
This answers: "what context do I need to understand this?"
|
|
557
|
+
"""
|
|
558
|
+
visited: set[str] = set()
|
|
559
|
+
result: list[GraphNode] = []
|
|
560
|
+
queue: list[tuple[str, int]] = [(node_id, 0)]
|
|
561
|
+
|
|
562
|
+
kind_filter = ""
|
|
563
|
+
if edge_kinds:
|
|
564
|
+
kinds_sql = ",".join(f"'{k.value}'" for k in edge_kinds)
|
|
565
|
+
kind_filter = f"AND kind IN ({kinds_sql})"
|
|
566
|
+
|
|
567
|
+
type_filter = "" if include_type_only else "AND is_type_only = 0"
|
|
568
|
+
|
|
569
|
+
while queue:
|
|
570
|
+
current_id, depth = queue.pop(0)
|
|
571
|
+
if current_id in visited or depth > max_depth:
|
|
572
|
+
continue
|
|
573
|
+
visited.add(current_id)
|
|
574
|
+
|
|
575
|
+
cursor = await self._db.execute(
|
|
576
|
+
f"""
|
|
577
|
+
SELECT target_id FROM edges
|
|
578
|
+
WHERE source_id = ? {kind_filter} {type_filter}
|
|
579
|
+
""", # noqa: S608
|
|
580
|
+
(current_id,),
|
|
581
|
+
)
|
|
582
|
+
for row in await cursor.fetchall():
|
|
583
|
+
target_id = row[0]
|
|
584
|
+
if target_id not in visited:
|
|
585
|
+
node = await self.get_node(target_id)
|
|
586
|
+
if node:
|
|
587
|
+
result.append(node)
|
|
588
|
+
if depth + 1 <= max_depth:
|
|
589
|
+
queue.append((target_id, depth + 1))
|
|
590
|
+
|
|
591
|
+
return result
|
|
592
|
+
|
|
593
|
+
    async def find_cycles(self, max_cycle_length: int = 10) -> list[list[str]]:
        """Detect circular dependency chains in the graph.

        Edge case: large graphs can have many cycles. We cap detection
        at max_cycle_length to keep this practical. Reports the shortest
        cycles first (most actionable).

        Uses DFS with back-edge detection; `rec_stack` holds the nodes on
        the current recursion path, `path` their order.

        Returns:
            Up to 50 cycles, each as a node-id list that starts and ends on
            the same id, sorted shortest-first.
        """
        # Get all node IDs
        cursor = await self._db.execute("SELECT node_id FROM nodes")
        all_nodes = [row[0] for row in await cursor.fetchall()]

        visited: set[str] = set()
        rec_stack: set[str] = set()
        path: list[str] = []
        cycles: list[list[str]] = []

        async def _dfs(node_id: str) -> None:
            # Caps: stop once enough cycles are collected, and bound the
            # recursion depth via the current path length. Note both returns
            # happen before node_id is marked visited, so a skipped node can
            # be revisited from a shallower path later.
            if len(cycles) >= 50:  # Cap to prevent excessive output
                return
            if len(path) > max_cycle_length:
                return

            visited.add(node_id)
            rec_stack.add(node_id)
            path.append(node_id)

            cursor = await self._db.execute(
                "SELECT target_id FROM edges WHERE source_id = ?",
                (node_id,),
            )
            for row in await cursor.fetchall():
                target_id = row[0]
                if target_id not in visited:
                    await _dfs(target_id)
                elif target_id in rec_stack:
                    # Found a cycle: slice the current path from the first
                    # occurrence of target_id and close the loop.
                    cycle_start = path.index(target_id)
                    cycle = path[cycle_start:] + [target_id]
                    cycles.append(cycle)

            path.pop()
            rec_stack.discard(node_id)

        # Start a DFS from every unvisited node so disconnected components
        # are covered too.
        for node_id in all_nodes:
            if node_id not in visited:
                await _dfs(node_id)

        return sorted(cycles, key=len)
|
|
643
|
+
|
|
644
|
+
async def impact_analysis(
|
|
645
|
+
self,
|
|
646
|
+
file_paths: list[Path],
|
|
647
|
+
max_depth: int = 3,
|
|
648
|
+
) -> dict[str, list[GraphNode]]:
|
|
649
|
+
"""For a set of changed files, find all transitively affected nodes.
|
|
650
|
+
|
|
651
|
+
This is the core query for the orchestrator: "these files changed,
|
|
652
|
+
what else needs to be in context?"
|
|
653
|
+
|
|
654
|
+
Edge cases:
|
|
655
|
+
- Changed file not in graph: it's new, return empty (no dependents yet)
|
|
656
|
+
- Changed file is a barrel/index: many dependents, may blow up results
|
|
657
|
+
→ cap at 100 dependents per file and flag truncation
|
|
658
|
+
- Changed file is generated: lower priority in results
|
|
659
|
+
- Changed file is a config: flag as potentially affecting everything
|
|
660
|
+
that reads this config
|
|
661
|
+
|
|
662
|
+
Returns: dict mapping file path → list of affected nodes
|
|
663
|
+
"""
|
|
664
|
+
result: dict[str, list[GraphNode]] = {}
|
|
665
|
+
max_dependents_per_file = 100
|
|
666
|
+
|
|
667
|
+
for fp in file_paths:
|
|
668
|
+
nodes = await self.get_nodes_by_file(fp)
|
|
669
|
+
affected: list[GraphNode] = []
|
|
670
|
+
seen: set[str] = set()
|
|
671
|
+
|
|
672
|
+
for node in nodes:
|
|
673
|
+
dependents = await self.get_dependents(
|
|
674
|
+
node.node_id, max_depth=max_depth
|
|
675
|
+
)
|
|
676
|
+
for dep in dependents:
|
|
677
|
+
if dep.node_id not in seen:
|
|
678
|
+
seen.add(dep.node_id)
|
|
679
|
+
affected.append(dep)
|
|
680
|
+
|
|
681
|
+
if len(affected) >= max_dependents_per_file:
|
|
682
|
+
logger.warning(
|
|
683
|
+
"Impact analysis for %s truncated at %d dependents",
|
|
684
|
+
fp,
|
|
685
|
+
max_dependents_per_file,
|
|
686
|
+
)
|
|
687
|
+
break
|
|
688
|
+
|
|
689
|
+
result[str(fp)] = affected
|
|
690
|
+
|
|
691
|
+
return result
|
|
692
|
+
|
|
693
|
+
async def get_stats(self) -> dict[str, int]:
|
|
694
|
+
"""Return graph statistics for health checks and CLI display."""
|
|
695
|
+
stats: dict[str, int] = {}
|
|
696
|
+
for table in ("nodes", "edges", "file_fingerprints"):
|
|
697
|
+
cursor = await self._db.execute(f"SELECT COUNT(*) FROM {table}") # noqa: S608
|
|
698
|
+
row = await cursor.fetchone()
|
|
699
|
+
stats[f"{table}_count"] = row[0] if row else 0
|
|
700
|
+
return stats
|
|
701
|
+
|
|
702
|
+
    async def get_fingerprint(self, file_path: Path) -> str | None:
        """Get stored content hash for a file (for incremental update checks).

        Returns None when the file has never been fingerprinted.
        """
        stored_path = self._to_stored_path(file_path)
        cursor = await self._db.execute(
            "SELECT content_hash FROM file_fingerprints WHERE file_path = ?",
            (stored_path,),
        )
        row = await cursor.fetchone()
        return row[0] if row else None
|
|
711
|
+
|
|
712
|
+
    async def update_fingerprint(
        self,
        file_path: Path,
        content_hash: str,
        size_bytes: int,
        last_modified: str,
        language: Language,
        node_count: int,
    ) -> None:
        """Update the stored fingerprint for a file.

        file_path is the PRIMARY KEY, so this is a straight UPSERT that
        refreshes every other column. Commits immediately.
        """
        stored_path = self._to_stored_path(file_path)
        await self._db.execute(
            """
            INSERT INTO file_fingerprints (file_path, content_hash, size_bytes, last_modified, language, node_count)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(file_path) DO UPDATE SET
                content_hash=excluded.content_hash,
                size_bytes=excluded.size_bytes,
                last_modified=excluded.last_modified,
                language=excluded.language,
                node_count=excluded.node_count
            """,
            (stored_path, content_hash, size_bytes, last_modified, language.value, node_count),
        )
        await self._db.commit()