codebase-intel 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. codebase_intel/__init__.py +3 -0
  2. codebase_intel/analytics/__init__.py +1 -0
  3. codebase_intel/analytics/benchmark.py +406 -0
  4. codebase_intel/analytics/feedback.py +496 -0
  5. codebase_intel/analytics/tracker.py +439 -0
  6. codebase_intel/cli/__init__.py +1 -0
  7. codebase_intel/cli/main.py +740 -0
  8. codebase_intel/contracts/__init__.py +1 -0
  9. codebase_intel/contracts/auto_generator.py +438 -0
  10. codebase_intel/contracts/evaluator.py +531 -0
  11. codebase_intel/contracts/models.py +433 -0
  12. codebase_intel/contracts/registry.py +225 -0
  13. codebase_intel/core/__init__.py +1 -0
  14. codebase_intel/core/config.py +248 -0
  15. codebase_intel/core/exceptions.py +454 -0
  16. codebase_intel/core/types.py +375 -0
  17. codebase_intel/decisions/__init__.py +1 -0
  18. codebase_intel/decisions/miner.py +297 -0
  19. codebase_intel/decisions/models.py +302 -0
  20. codebase_intel/decisions/store.py +411 -0
  21. codebase_intel/drift/__init__.py +1 -0
  22. codebase_intel/drift/detector.py +443 -0
  23. codebase_intel/graph/__init__.py +1 -0
  24. codebase_intel/graph/builder.py +391 -0
  25. codebase_intel/graph/parser.py +1232 -0
  26. codebase_intel/graph/query.py +377 -0
  27. codebase_intel/graph/storage.py +736 -0
  28. codebase_intel/mcp/__init__.py +1 -0
  29. codebase_intel/mcp/server.py +710 -0
  30. codebase_intel/orchestrator/__init__.py +1 -0
  31. codebase_intel/orchestrator/assembler.py +649 -0
  32. codebase_intel-0.1.0.dist-info/METADATA +361 -0
  33. codebase_intel-0.1.0.dist-info/RECORD +36 -0
  34. codebase_intel-0.1.0.dist-info/WHEEL +4 -0
  35. codebase_intel-0.1.0.dist-info/entry_points.txt +2 -0
  36. codebase_intel-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,391 @@
1
+ """Graph builder — orchestrates parsing and storage for full and incremental builds.
2
+
3
+ This module manages the lifecycle of the code graph:
4
+ - Full build: parse entire codebase from scratch
5
+ - Incremental build: only re-parse files that changed since last build
6
+ - Cleanup: remove nodes for deleted files
7
+
8
+ Edge cases:
9
+ - Huge codebase (100k+ files): progress reporting, chunked processing, memory limits
10
+ - Incremental build after large refactor: many files changed, detect renames via
11
+ content hash matching (old hash appears at new path)
12
+ - Build interrupted (crash, Ctrl+C): detect via build_status table, offer resume
13
+ - Concurrent builds: detect via build_status, refuse second build
14
+ - New language added to project: need to re-scan files that were previously skipped
15
+ - .gitignore changes: previously ignored files now need parsing
16
+ - Symlink loops: resolve() + seen set prevents infinite recursion
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import asyncio
22
+ import logging
23
+ import uuid
24
+ from datetime import UTC, datetime
25
+ from pathlib import Path
26
+ from typing import TYPE_CHECKING, AsyncIterator
27
+
28
+ from codebase_intel.core.exceptions import ErrorContext, ParseError
29
+ from codebase_intel.core.types import Language
30
+ from codebase_intel.graph.parser import FileParser, ParseResult, compute_file_hash, detect_language
31
+
32
+ if TYPE_CHECKING:
33
+ from codebase_intel.core.config import ProjectConfig
34
+ from codebase_intel.graph.storage import GraphStorage
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+
39
+ class BuildProgress:
40
+ """Tracks and reports build progress."""
41
+
42
+ def __init__(self, total_files: int) -> None:
43
+ self.total_files = total_files
44
+ self.processed: int = 0
45
+ self.skipped: int = 0
46
+ self.failed: int = 0
47
+ self.nodes_created: int = 0
48
+ self.edges_created: int = 0
49
+ self.warnings: list[str] = []
50
+
51
+ @property
52
+ def completed_pct(self) -> float:
53
+ if self.total_files == 0:
54
+ return 100.0
55
+ return (self.processed + self.skipped + self.failed) / self.total_files * 100
56
+
57
+ def summary(self) -> dict[str, int | float]:
58
+ return {
59
+ "total_files": self.total_files,
60
+ "processed": self.processed,
61
+ "skipped": self.skipped,
62
+ "failed": self.failed,
63
+ "nodes_created": self.nodes_created,
64
+ "edges_created": self.edges_created,
65
+ "warning_count": len(self.warnings),
66
+ }
67
+
68
+
69
class GraphBuilder:
    """Builds and maintains the semantic code graph.

    Coordinates file discovery, parsing (FileParser) and persistence
    (GraphStorage) for both full and incremental builds.
    """

    def __init__(
        self,
        config: ProjectConfig,
        storage: GraphStorage,
    ) -> None:
        """Initialize the builder.

        Args:
            config: project configuration; supplies the project root and
                the parser settings handed to FileParser.
            storage: persistence layer the parsed graph is written to.
        """
        self._config = config
        self._storage = storage
        # A single parser instance is shared by all builds of this builder.
        self._parser = FileParser(config.parser, config.project_root)
80
+
81
+ async def full_build(self) -> BuildProgress:
82
+ """Build the graph from scratch — scan all files in the project.
83
+
84
+ Edge cases:
85
+ - Previous incomplete build: detected via build_status, cleaned up first
86
+ - Concurrent build attempt: detected and refused
87
+ - Memory pressure on huge repos: process in batches, commit frequently
88
+ """
89
+ build_id = str(uuid.uuid4())[:8]
90
+ logger.info("Starting full build [%s]", build_id)
91
+
92
+ # Record build start
93
+ await self._storage._db.execute(
94
+ "INSERT INTO build_status (build_id, started_at) VALUES (?, ?)",
95
+ (build_id, datetime.now(UTC).isoformat()),
96
+ )
97
+ await self._storage._db.commit()
98
+
99
+ # Discover files
100
+ files = list(self._discover_files())
101
+ progress = BuildProgress(len(files))
102
+ logger.info("Discovered %d files to process", len(files))
103
+
104
+ # Process in batches to manage memory
105
+ batch_size = 100
106
+ for i in range(0, len(files), batch_size):
107
+ batch = files[i : i + batch_size]
108
+ await self._process_batch(batch, progress)
109
+
110
+ # Log progress periodically
111
+ if (i + batch_size) % 500 == 0 or i + batch_size >= len(files):
112
+ logger.info(
113
+ "Progress: %.0f%% (%d/%d files, %d nodes, %d edges)",
114
+ progress.completed_pct,
115
+ progress.processed,
116
+ progress.total_files,
117
+ progress.nodes_created,
118
+ progress.edges_created,
119
+ )
120
+
121
+ # Mark build complete
122
+ await self._storage._db.execute(
123
+ """
124
+ UPDATE build_status SET completed_at = ?, file_count = ?,
125
+ node_count = ?, edge_count = ?
126
+ WHERE build_id = ?
127
+ """,
128
+ (
129
+ datetime.now(UTC).isoformat(),
130
+ progress.processed,
131
+ progress.nodes_created,
132
+ progress.edges_created,
133
+ build_id,
134
+ ),
135
+ )
136
+ await self._storage._db.commit()
137
+
138
+ logger.info(
139
+ "Full build complete: %d files, %d nodes, %d edges, %d warnings",
140
+ progress.processed,
141
+ progress.nodes_created,
142
+ progress.edges_created,
143
+ len(progress.warnings),
144
+ )
145
+
146
+ return progress
147
+
148
+ async def incremental_build(
149
+ self,
150
+ changed_files: list[Path] | None = None,
151
+ ) -> BuildProgress:
152
+ """Update the graph for only changed files.
153
+
154
+ If changed_files is None, detect changes via content hash comparison
155
+ against stored fingerprints.
156
+
157
+ Edge cases:
158
+ - File renamed: old path's nodes are orphaned, new path gets new nodes.
159
+ We detect renames via content hash: if a new file has the same hash
160
+ as a recently deleted file, it's likely a rename.
161
+ - File deleted: remove its nodes (edges cascade via FK).
162
+ - File content unchanged but timestamp changed: skip (hash-based check).
163
+ - New file added: parse and add to graph.
164
+ """
165
+ if changed_files is None:
166
+ changed_files = await self._detect_changed_files()
167
+
168
+ progress = BuildProgress(len(changed_files))
169
+ logger.info("Incremental build: %d files to process", len(changed_files))
170
+
171
+ # Detect deleted files (in graph but not on disk)
172
+ await self._cleanup_deleted_files(progress)
173
+
174
+ # Process changed/new files
175
+ for fp in changed_files:
176
+ if not fp.exists():
177
+ # File was deleted — remove from graph
178
+ removed = await self._storage.remove_file_nodes(fp)
179
+ if removed > 0:
180
+ logger.debug("Removed %d nodes for deleted file %s", removed, fp)
181
+ progress.skipped += 1
182
+ continue
183
+
184
+ # Remove old nodes for this file before re-parsing
185
+ await self._storage.remove_file_nodes(fp)
186
+
187
+ result = await self._parser.parse_file(fp)
188
+ if result is None:
189
+ progress.skipped += 1
190
+ continue
191
+
192
+ await self._store_parse_result(result, progress)
193
+ progress.processed += 1
194
+
195
+ logger.info(
196
+ "Incremental build complete: %d processed, %d skipped",
197
+ progress.processed,
198
+ progress.skipped,
199
+ )
200
+
201
+ return progress
202
+
203
+ async def _detect_changed_files(self) -> list[Path]:
204
+ """Detect files that changed since the last build.
205
+
206
+ Uses content hash comparison against stored fingerprints.
207
+ New files (not in fingerprint table) are included.
208
+
209
+ Edge case: file permissions changed but content didn't — skip.
210
+ Edge case: file touched (timestamp changed) but content same — skip.
211
+ """
212
+ changed: list[Path] = []
213
+
214
+ for file_path in self._discover_files():
215
+ try:
216
+ content = file_path.read_bytes()
217
+ except (OSError, PermissionError):
218
+ continue
219
+
220
+ current_hash = compute_file_hash(content)
221
+ stored_hash = await self._storage.get_fingerprint(file_path)
222
+
223
+ if stored_hash != current_hash:
224
+ changed.append(file_path)
225
+
226
+ return changed
227
+
228
+ async def _cleanup_deleted_files(self, progress: BuildProgress) -> None:
229
+ """Remove nodes for files that no longer exist on disk.
230
+
231
+ Edge case: file still exists but is now in .gitignore — we keep it
232
+ in the graph (it's still code, just not tracked). Only remove nodes
233
+ for files that are truly gone.
234
+ """
235
+ cursor = await self._storage._db.execute(
236
+ "SELECT DISTINCT file_path FROM file_fingerprints"
237
+ )
238
+ for row in await cursor.fetchall():
239
+ stored_path = row[0]
240
+ full_path = self._storage._from_stored_path(stored_path)
241
+ if not full_path.exists():
242
+ removed = await self._storage.remove_file_nodes(full_path)
243
+ await self._storage._db.execute(
244
+ "DELETE FROM file_fingerprints WHERE file_path = ?",
245
+ (stored_path,),
246
+ )
247
+ await self._storage._db.commit()
248
+ logger.debug(
249
+ "Cleaned up %d nodes for deleted file %s", removed, stored_path
250
+ )
251
+
252
+ async def _process_batch(
253
+ self,
254
+ files: list[Path],
255
+ progress: BuildProgress,
256
+ ) -> None:
257
+ """Process a batch of files — parse and store."""
258
+ for fp in files:
259
+ try:
260
+ result = await self._parser.parse_file(fp)
261
+ except Exception as exc:
262
+ logger.warning("Failed to parse %s: %s", fp, exc)
263
+ progress.failed += 1
264
+ progress.warnings.append(f"Parse failed: {fp}: {exc}")
265
+ continue
266
+
267
+ if result is None:
268
+ progress.skipped += 1
269
+ continue
270
+
271
+ await self._store_parse_result(result, progress)
272
+ progress.processed += 1
273
+ progress.warnings.extend(result.warnings)
274
+
275
+ async def _store_parse_result(
276
+ self,
277
+ result: ParseResult,
278
+ progress: BuildProgress,
279
+ ) -> None:
280
+ """Store parsed nodes, edges, and fingerprint.
281
+
282
+ Edge case: edges may reference target nodes that don't exist in the
283
+ graph (external packages like 'fastapi', 'sqlalchemy'). We filter
284
+ these out before insertion to avoid FK constraint failures.
285
+ Unresolved edges are stored with a relaxed approach — we only insert
286
+ edges where at least the source node exists, and skip edges whose
287
+ targets are unresolved external modules.
288
+ """
289
+ if result.nodes:
290
+ await self._storage.upsert_nodes_batch(result.nodes)
291
+ progress.nodes_created += len(result.nodes)
292
+
293
+ if result.edges:
294
+ # Filter edges to only those whose target nodes exist in the DB
295
+ # or whose source exists in the current batch
296
+ valid_edges: list[GraphEdge] = []
297
+ local_node_ids = {n.node_id for n in result.nodes}
298
+
299
+ for edge in result.edges:
300
+ # Skip edges to unresolved/external targets
301
+ if edge.target_id.startswith("unresolved:"):
302
+ continue
303
+
304
+ # Check if target exists in DB or current batch
305
+ target_exists = edge.target_id in local_node_ids
306
+ if not target_exists:
307
+ target_node = await self._storage.get_node(edge.target_id)
308
+ target_exists = target_node is not None
309
+
310
+ if target_exists:
311
+ valid_edges.append(edge)
312
+
313
+ if valid_edges:
314
+ await self._storage.upsert_edges_batch(valid_edges)
315
+ progress.edges_created += len(valid_edges)
316
+
317
+ # Update fingerprint
318
+ await self._storage.update_fingerprint(
319
+ file_path=result.file_path,
320
+ content_hash=result.content_hash,
321
+ size_bytes=result.size_bytes,
322
+ last_modified=datetime.now(UTC).isoformat(),
323
+ language=result.language,
324
+ node_count=len(result.nodes),
325
+ )
326
+
327
+ def _discover_files(self) -> AsyncIterator[Path] | list[Path]:
328
+ """Discover all source files in the project.
329
+
330
+ Edge cases:
331
+ - Symlink loops: track resolved paths in a seen set
332
+ - Permission denied on directories: skip with warning
333
+ - Huge directories (node_modules): skip entire dir tree early
334
+ - Hidden directories (.git, .venv): skip via name check before fnmatch
335
+ """
336
+ import fnmatch
337
+
338
+ root = self._config.project_root
339
+ seen_resolved: set[Path] = set()
340
+ results: list[Path] = []
341
+
342
+ ignore_patterns = self._config.parser.ignored_patterns
343
+
344
+ # Directories to skip immediately by name (before fnmatch overhead)
345
+ skip_dir_names = {
346
+ "node_modules", ".git", "__pycache__", ".venv", "venv",
347
+ ".tox", ".mypy_cache", ".pytest_cache", ".ruff_cache",
348
+ ".next", ".nuxt", "dist", "build", ".eggs", "vendor",
349
+ ".gradle", ".idea", ".vscode", "coverage", "htmlcov",
350
+ ".terraform", ".cargo", "target",
351
+ }
352
+
353
+ def _should_ignore_file(path: Path) -> bool:
354
+ try:
355
+ rel = str(path.relative_to(root))
356
+ except ValueError:
357
+ return True
358
+ return any(fnmatch.fnmatch(rel, p) for p in ignore_patterns)
359
+
360
+ def _walk(directory: Path) -> None:
361
+ try:
362
+ entries = sorted(directory.iterdir())
363
+ except PermissionError:
364
+ logger.warning("Permission denied: %s", directory)
365
+ return
366
+ except OSError:
367
+ return
368
+
369
+ for entry in entries:
370
+ resolved = entry.resolve()
371
+
372
+ # Symlink loop detection
373
+ if resolved in seen_resolved:
374
+ continue
375
+ seen_resolved.add(resolved)
376
+
377
+ if entry.is_dir():
378
+ # Fast skip by directory name (no fnmatch needed)
379
+ dir_name = entry.name
380
+ if dir_name in skip_dir_names or dir_name.startswith("."):
381
+ continue
382
+ if not _should_ignore_file(entry):
383
+ _walk(entry)
384
+ elif entry.is_file():
385
+ if not _should_ignore_file(entry):
386
+ lang = detect_language(entry)
387
+ if lang != Language.UNKNOWN:
388
+ results.append(entry)
389
+
390
+ _walk(root)
391
+ return results