codebase-intel 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_intel/__init__.py +3 -0
- codebase_intel/analytics/__init__.py +1 -0
- codebase_intel/analytics/benchmark.py +406 -0
- codebase_intel/analytics/feedback.py +496 -0
- codebase_intel/analytics/tracker.py +439 -0
- codebase_intel/cli/__init__.py +1 -0
- codebase_intel/cli/main.py +740 -0
- codebase_intel/contracts/__init__.py +1 -0
- codebase_intel/contracts/auto_generator.py +438 -0
- codebase_intel/contracts/evaluator.py +531 -0
- codebase_intel/contracts/models.py +433 -0
- codebase_intel/contracts/registry.py +225 -0
- codebase_intel/core/__init__.py +1 -0
- codebase_intel/core/config.py +248 -0
- codebase_intel/core/exceptions.py +454 -0
- codebase_intel/core/types.py +375 -0
- codebase_intel/decisions/__init__.py +1 -0
- codebase_intel/decisions/miner.py +297 -0
- codebase_intel/decisions/models.py +302 -0
- codebase_intel/decisions/store.py +411 -0
- codebase_intel/drift/__init__.py +1 -0
- codebase_intel/drift/detector.py +443 -0
- codebase_intel/graph/__init__.py +1 -0
- codebase_intel/graph/builder.py +391 -0
- codebase_intel/graph/parser.py +1232 -0
- codebase_intel/graph/query.py +377 -0
- codebase_intel/graph/storage.py +736 -0
- codebase_intel/mcp/__init__.py +1 -0
- codebase_intel/mcp/server.py +710 -0
- codebase_intel/orchestrator/__init__.py +1 -0
- codebase_intel/orchestrator/assembler.py +649 -0
- codebase_intel-0.1.0.dist-info/METADATA +361 -0
- codebase_intel-0.1.0.dist-info/RECORD +36 -0
- codebase_intel-0.1.0.dist-info/WHEEL +4 -0
- codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
- codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
"""Graph builder — orchestrates parsing and storage for full and incremental builds.
|
|
2
|
+
|
|
3
|
+
This module manages the lifecycle of the code graph:
|
|
4
|
+
- Full build: parse entire codebase from scratch
|
|
5
|
+
- Incremental build: only re-parse files that changed since last build
|
|
6
|
+
- Cleanup: remove nodes for deleted files
|
|
7
|
+
|
|
8
|
+
Edge cases:
|
|
9
|
+
- Huge codebase (100k+ files): progress reporting, chunked processing, memory limits
|
|
10
|
+
- Incremental build after large refactor: many files changed, detect renames via
|
|
11
|
+
content hash matching (old hash appears at new path)
|
|
12
|
+
- Build interrupted (crash, Ctrl+C): detect via build_status table, offer resume
|
|
13
|
+
- Concurrent builds: detect via build_status, refuse second build
|
|
14
|
+
- New language added to project: need to re-scan files that were previously skipped
|
|
15
|
+
- .gitignore changes: previously ignored files now need parsing
|
|
16
|
+
- Symlink loops: resolve() + seen set prevents infinite recursion
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import asyncio
|
|
22
|
+
import logging
|
|
23
|
+
import uuid
|
|
24
|
+
from datetime import UTC, datetime
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import TYPE_CHECKING, AsyncIterator
|
|
27
|
+
|
|
28
|
+
from codebase_intel.core.exceptions import ErrorContext, ParseError
|
|
29
|
+
from codebase_intel.core.types import Language
|
|
30
|
+
from codebase_intel.graph.parser import FileParser, ParseResult, compute_file_hash, detect_language
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from codebase_intel.core.config import ProjectConfig
|
|
34
|
+
from codebase_intel.graph.storage import GraphStorage
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class BuildProgress:
|
|
40
|
+
"""Tracks and reports build progress."""
|
|
41
|
+
|
|
42
|
+
def __init__(self, total_files: int) -> None:
|
|
43
|
+
self.total_files = total_files
|
|
44
|
+
self.processed: int = 0
|
|
45
|
+
self.skipped: int = 0
|
|
46
|
+
self.failed: int = 0
|
|
47
|
+
self.nodes_created: int = 0
|
|
48
|
+
self.edges_created: int = 0
|
|
49
|
+
self.warnings: list[str] = []
|
|
50
|
+
|
|
51
|
+
@property
|
|
52
|
+
def completed_pct(self) -> float:
|
|
53
|
+
if self.total_files == 0:
|
|
54
|
+
return 100.0
|
|
55
|
+
return (self.processed + self.skipped + self.failed) / self.total_files * 100
|
|
56
|
+
|
|
57
|
+
def summary(self) -> dict[str, int | float]:
|
|
58
|
+
return {
|
|
59
|
+
"total_files": self.total_files,
|
|
60
|
+
"processed": self.processed,
|
|
61
|
+
"skipped": self.skipped,
|
|
62
|
+
"failed": self.failed,
|
|
63
|
+
"nodes_created": self.nodes_created,
|
|
64
|
+
"edges_created": self.edges_created,
|
|
65
|
+
"warning_count": len(self.warnings),
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class GraphBuilder:
    """Builds and maintains the semantic code graph.

    Orchestrates file discovery, parsing (via ``FileParser``), and
    persistence (via ``GraphStorage``) for both full and incremental
    builds. See the module docstring for the edge cases handled.
    """

    def __init__(
        self,
        config: ProjectConfig,
        storage: GraphStorage,
    ) -> None:
        # Project-level configuration (root path, parser settings).
        self._config = config
        # Persistence layer for nodes, edges, fingerprints, build status.
        self._storage = storage
        # Parser is constructed from the project's parser config and root
        # so it can resolve paths relative to the project.
        self._parser = FileParser(config.parser, config.project_root)
|
|
80
|
+
|
|
81
|
+
async def full_build(self) -> BuildProgress:
|
|
82
|
+
"""Build the graph from scratch — scan all files in the project.
|
|
83
|
+
|
|
84
|
+
Edge cases:
|
|
85
|
+
- Previous incomplete build: detected via build_status, cleaned up first
|
|
86
|
+
- Concurrent build attempt: detected and refused
|
|
87
|
+
- Memory pressure on huge repos: process in batches, commit frequently
|
|
88
|
+
"""
|
|
89
|
+
build_id = str(uuid.uuid4())[:8]
|
|
90
|
+
logger.info("Starting full build [%s]", build_id)
|
|
91
|
+
|
|
92
|
+
# Record build start
|
|
93
|
+
await self._storage._db.execute(
|
|
94
|
+
"INSERT INTO build_status (build_id, started_at) VALUES (?, ?)",
|
|
95
|
+
(build_id, datetime.now(UTC).isoformat()),
|
|
96
|
+
)
|
|
97
|
+
await self._storage._db.commit()
|
|
98
|
+
|
|
99
|
+
# Discover files
|
|
100
|
+
files = list(self._discover_files())
|
|
101
|
+
progress = BuildProgress(len(files))
|
|
102
|
+
logger.info("Discovered %d files to process", len(files))
|
|
103
|
+
|
|
104
|
+
# Process in batches to manage memory
|
|
105
|
+
batch_size = 100
|
|
106
|
+
for i in range(0, len(files), batch_size):
|
|
107
|
+
batch = files[i : i + batch_size]
|
|
108
|
+
await self._process_batch(batch, progress)
|
|
109
|
+
|
|
110
|
+
# Log progress periodically
|
|
111
|
+
if (i + batch_size) % 500 == 0 or i + batch_size >= len(files):
|
|
112
|
+
logger.info(
|
|
113
|
+
"Progress: %.0f%% (%d/%d files, %d nodes, %d edges)",
|
|
114
|
+
progress.completed_pct,
|
|
115
|
+
progress.processed,
|
|
116
|
+
progress.total_files,
|
|
117
|
+
progress.nodes_created,
|
|
118
|
+
progress.edges_created,
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
# Mark build complete
|
|
122
|
+
await self._storage._db.execute(
|
|
123
|
+
"""
|
|
124
|
+
UPDATE build_status SET completed_at = ?, file_count = ?,
|
|
125
|
+
node_count = ?, edge_count = ?
|
|
126
|
+
WHERE build_id = ?
|
|
127
|
+
""",
|
|
128
|
+
(
|
|
129
|
+
datetime.now(UTC).isoformat(),
|
|
130
|
+
progress.processed,
|
|
131
|
+
progress.nodes_created,
|
|
132
|
+
progress.edges_created,
|
|
133
|
+
build_id,
|
|
134
|
+
),
|
|
135
|
+
)
|
|
136
|
+
await self._storage._db.commit()
|
|
137
|
+
|
|
138
|
+
logger.info(
|
|
139
|
+
"Full build complete: %d files, %d nodes, %d edges, %d warnings",
|
|
140
|
+
progress.processed,
|
|
141
|
+
progress.nodes_created,
|
|
142
|
+
progress.edges_created,
|
|
143
|
+
len(progress.warnings),
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
return progress
|
|
147
|
+
|
|
148
|
+
async def incremental_build(
|
|
149
|
+
self,
|
|
150
|
+
changed_files: list[Path] | None = None,
|
|
151
|
+
) -> BuildProgress:
|
|
152
|
+
"""Update the graph for only changed files.
|
|
153
|
+
|
|
154
|
+
If changed_files is None, detect changes via content hash comparison
|
|
155
|
+
against stored fingerprints.
|
|
156
|
+
|
|
157
|
+
Edge cases:
|
|
158
|
+
- File renamed: old path's nodes are orphaned, new path gets new nodes.
|
|
159
|
+
We detect renames via content hash: if a new file has the same hash
|
|
160
|
+
as a recently deleted file, it's likely a rename.
|
|
161
|
+
- File deleted: remove its nodes (edges cascade via FK).
|
|
162
|
+
- File content unchanged but timestamp changed: skip (hash-based check).
|
|
163
|
+
- New file added: parse and add to graph.
|
|
164
|
+
"""
|
|
165
|
+
if changed_files is None:
|
|
166
|
+
changed_files = await self._detect_changed_files()
|
|
167
|
+
|
|
168
|
+
progress = BuildProgress(len(changed_files))
|
|
169
|
+
logger.info("Incremental build: %d files to process", len(changed_files))
|
|
170
|
+
|
|
171
|
+
# Detect deleted files (in graph but not on disk)
|
|
172
|
+
await self._cleanup_deleted_files(progress)
|
|
173
|
+
|
|
174
|
+
# Process changed/new files
|
|
175
|
+
for fp in changed_files:
|
|
176
|
+
if not fp.exists():
|
|
177
|
+
# File was deleted — remove from graph
|
|
178
|
+
removed = await self._storage.remove_file_nodes(fp)
|
|
179
|
+
if removed > 0:
|
|
180
|
+
logger.debug("Removed %d nodes for deleted file %s", removed, fp)
|
|
181
|
+
progress.skipped += 1
|
|
182
|
+
continue
|
|
183
|
+
|
|
184
|
+
# Remove old nodes for this file before re-parsing
|
|
185
|
+
await self._storage.remove_file_nodes(fp)
|
|
186
|
+
|
|
187
|
+
result = await self._parser.parse_file(fp)
|
|
188
|
+
if result is None:
|
|
189
|
+
progress.skipped += 1
|
|
190
|
+
continue
|
|
191
|
+
|
|
192
|
+
await self._store_parse_result(result, progress)
|
|
193
|
+
progress.processed += 1
|
|
194
|
+
|
|
195
|
+
logger.info(
|
|
196
|
+
"Incremental build complete: %d processed, %d skipped",
|
|
197
|
+
progress.processed,
|
|
198
|
+
progress.skipped,
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
return progress
|
|
202
|
+
|
|
203
|
+
async def _detect_changed_files(self) -> list[Path]:
|
|
204
|
+
"""Detect files that changed since the last build.
|
|
205
|
+
|
|
206
|
+
Uses content hash comparison against stored fingerprints.
|
|
207
|
+
New files (not in fingerprint table) are included.
|
|
208
|
+
|
|
209
|
+
Edge case: file permissions changed but content didn't — skip.
|
|
210
|
+
Edge case: file touched (timestamp changed) but content same — skip.
|
|
211
|
+
"""
|
|
212
|
+
changed: list[Path] = []
|
|
213
|
+
|
|
214
|
+
for file_path in self._discover_files():
|
|
215
|
+
try:
|
|
216
|
+
content = file_path.read_bytes()
|
|
217
|
+
except (OSError, PermissionError):
|
|
218
|
+
continue
|
|
219
|
+
|
|
220
|
+
current_hash = compute_file_hash(content)
|
|
221
|
+
stored_hash = await self._storage.get_fingerprint(file_path)
|
|
222
|
+
|
|
223
|
+
if stored_hash != current_hash:
|
|
224
|
+
changed.append(file_path)
|
|
225
|
+
|
|
226
|
+
return changed
|
|
227
|
+
|
|
228
|
+
async def _cleanup_deleted_files(self, progress: BuildProgress) -> None:
|
|
229
|
+
"""Remove nodes for files that no longer exist on disk.
|
|
230
|
+
|
|
231
|
+
Edge case: file still exists but is now in .gitignore — we keep it
|
|
232
|
+
in the graph (it's still code, just not tracked). Only remove nodes
|
|
233
|
+
for files that are truly gone.
|
|
234
|
+
"""
|
|
235
|
+
cursor = await self._storage._db.execute(
|
|
236
|
+
"SELECT DISTINCT file_path FROM file_fingerprints"
|
|
237
|
+
)
|
|
238
|
+
for row in await cursor.fetchall():
|
|
239
|
+
stored_path = row[0]
|
|
240
|
+
full_path = self._storage._from_stored_path(stored_path)
|
|
241
|
+
if not full_path.exists():
|
|
242
|
+
removed = await self._storage.remove_file_nodes(full_path)
|
|
243
|
+
await self._storage._db.execute(
|
|
244
|
+
"DELETE FROM file_fingerprints WHERE file_path = ?",
|
|
245
|
+
(stored_path,),
|
|
246
|
+
)
|
|
247
|
+
await self._storage._db.commit()
|
|
248
|
+
logger.debug(
|
|
249
|
+
"Cleaned up %d nodes for deleted file %s", removed, stored_path
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
async def _process_batch(
|
|
253
|
+
self,
|
|
254
|
+
files: list[Path],
|
|
255
|
+
progress: BuildProgress,
|
|
256
|
+
) -> None:
|
|
257
|
+
"""Process a batch of files — parse and store."""
|
|
258
|
+
for fp in files:
|
|
259
|
+
try:
|
|
260
|
+
result = await self._parser.parse_file(fp)
|
|
261
|
+
except Exception as exc:
|
|
262
|
+
logger.warning("Failed to parse %s: %s", fp, exc)
|
|
263
|
+
progress.failed += 1
|
|
264
|
+
progress.warnings.append(f"Parse failed: {fp}: {exc}")
|
|
265
|
+
continue
|
|
266
|
+
|
|
267
|
+
if result is None:
|
|
268
|
+
progress.skipped += 1
|
|
269
|
+
continue
|
|
270
|
+
|
|
271
|
+
await self._store_parse_result(result, progress)
|
|
272
|
+
progress.processed += 1
|
|
273
|
+
progress.warnings.extend(result.warnings)
|
|
274
|
+
|
|
275
|
+
async def _store_parse_result(
|
|
276
|
+
self,
|
|
277
|
+
result: ParseResult,
|
|
278
|
+
progress: BuildProgress,
|
|
279
|
+
) -> None:
|
|
280
|
+
"""Store parsed nodes, edges, and fingerprint.
|
|
281
|
+
|
|
282
|
+
Edge case: edges may reference target nodes that don't exist in the
|
|
283
|
+
graph (external packages like 'fastapi', 'sqlalchemy'). We filter
|
|
284
|
+
these out before insertion to avoid FK constraint failures.
|
|
285
|
+
Unresolved edges are stored with a relaxed approach — we only insert
|
|
286
|
+
edges where at least the source node exists, and skip edges whose
|
|
287
|
+
targets are unresolved external modules.
|
|
288
|
+
"""
|
|
289
|
+
if result.nodes:
|
|
290
|
+
await self._storage.upsert_nodes_batch(result.nodes)
|
|
291
|
+
progress.nodes_created += len(result.nodes)
|
|
292
|
+
|
|
293
|
+
if result.edges:
|
|
294
|
+
# Filter edges to only those whose target nodes exist in the DB
|
|
295
|
+
# or whose source exists in the current batch
|
|
296
|
+
valid_edges: list[GraphEdge] = []
|
|
297
|
+
local_node_ids = {n.node_id for n in result.nodes}
|
|
298
|
+
|
|
299
|
+
for edge in result.edges:
|
|
300
|
+
# Skip edges to unresolved/external targets
|
|
301
|
+
if edge.target_id.startswith("unresolved:"):
|
|
302
|
+
continue
|
|
303
|
+
|
|
304
|
+
# Check if target exists in DB or current batch
|
|
305
|
+
target_exists = edge.target_id in local_node_ids
|
|
306
|
+
if not target_exists:
|
|
307
|
+
target_node = await self._storage.get_node(edge.target_id)
|
|
308
|
+
target_exists = target_node is not None
|
|
309
|
+
|
|
310
|
+
if target_exists:
|
|
311
|
+
valid_edges.append(edge)
|
|
312
|
+
|
|
313
|
+
if valid_edges:
|
|
314
|
+
await self._storage.upsert_edges_batch(valid_edges)
|
|
315
|
+
progress.edges_created += len(valid_edges)
|
|
316
|
+
|
|
317
|
+
# Update fingerprint
|
|
318
|
+
await self._storage.update_fingerprint(
|
|
319
|
+
file_path=result.file_path,
|
|
320
|
+
content_hash=result.content_hash,
|
|
321
|
+
size_bytes=result.size_bytes,
|
|
322
|
+
last_modified=datetime.now(UTC).isoformat(),
|
|
323
|
+
language=result.language,
|
|
324
|
+
node_count=len(result.nodes),
|
|
325
|
+
)
|
|
326
|
+
|
|
327
|
+
def _discover_files(self) -> AsyncIterator[Path] | list[Path]:
|
|
328
|
+
"""Discover all source files in the project.
|
|
329
|
+
|
|
330
|
+
Edge cases:
|
|
331
|
+
- Symlink loops: track resolved paths in a seen set
|
|
332
|
+
- Permission denied on directories: skip with warning
|
|
333
|
+
- Huge directories (node_modules): skip entire dir tree early
|
|
334
|
+
- Hidden directories (.git, .venv): skip via name check before fnmatch
|
|
335
|
+
"""
|
|
336
|
+
import fnmatch
|
|
337
|
+
|
|
338
|
+
root = self._config.project_root
|
|
339
|
+
seen_resolved: set[Path] = set()
|
|
340
|
+
results: list[Path] = []
|
|
341
|
+
|
|
342
|
+
ignore_patterns = self._config.parser.ignored_patterns
|
|
343
|
+
|
|
344
|
+
# Directories to skip immediately by name (before fnmatch overhead)
|
|
345
|
+
skip_dir_names = {
|
|
346
|
+
"node_modules", ".git", "__pycache__", ".venv", "venv",
|
|
347
|
+
".tox", ".mypy_cache", ".pytest_cache", ".ruff_cache",
|
|
348
|
+
".next", ".nuxt", "dist", "build", ".eggs", "vendor",
|
|
349
|
+
".gradle", ".idea", ".vscode", "coverage", "htmlcov",
|
|
350
|
+
".terraform", ".cargo", "target",
|
|
351
|
+
}
|
|
352
|
+
|
|
353
|
+
def _should_ignore_file(path: Path) -> bool:
|
|
354
|
+
try:
|
|
355
|
+
rel = str(path.relative_to(root))
|
|
356
|
+
except ValueError:
|
|
357
|
+
return True
|
|
358
|
+
return any(fnmatch.fnmatch(rel, p) for p in ignore_patterns)
|
|
359
|
+
|
|
360
|
+
def _walk(directory: Path) -> None:
|
|
361
|
+
try:
|
|
362
|
+
entries = sorted(directory.iterdir())
|
|
363
|
+
except PermissionError:
|
|
364
|
+
logger.warning("Permission denied: %s", directory)
|
|
365
|
+
return
|
|
366
|
+
except OSError:
|
|
367
|
+
return
|
|
368
|
+
|
|
369
|
+
for entry in entries:
|
|
370
|
+
resolved = entry.resolve()
|
|
371
|
+
|
|
372
|
+
# Symlink loop detection
|
|
373
|
+
if resolved in seen_resolved:
|
|
374
|
+
continue
|
|
375
|
+
seen_resolved.add(resolved)
|
|
376
|
+
|
|
377
|
+
if entry.is_dir():
|
|
378
|
+
# Fast skip by directory name (no fnmatch needed)
|
|
379
|
+
dir_name = entry.name
|
|
380
|
+
if dir_name in skip_dir_names or dir_name.startswith("."):
|
|
381
|
+
continue
|
|
382
|
+
if not _should_ignore_file(entry):
|
|
383
|
+
_walk(entry)
|
|
384
|
+
elif entry.is_file():
|
|
385
|
+
if not _should_ignore_file(entry):
|
|
386
|
+
lang = detect_language(entry)
|
|
387
|
+
if lang != Language.UNKNOWN:
|
|
388
|
+
results.append(entry)
|
|
389
|
+
|
|
390
|
+
_walk(root)
|
|
391
|
+
return results
|