codedebrief 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48):
  1. codedebrief/__init__.py +12 -0
  2. codedebrief/analysis/__init__.py +16 -0
  3. codedebrief/analysis/common.py +527 -0
  4. codedebrief/analysis/discovery.py +100 -0
  5. codedebrief/analysis/languages/__init__.py +6 -0
  6. codedebrief/analysis/languages/_common.py +68 -0
  7. codedebrief/analysis/languages/c.py +96 -0
  8. codedebrief/analysis/languages/cpp.py +146 -0
  9. codedebrief/analysis/languages/csharp.py +137 -0
  10. codedebrief/analysis/languages/go.py +157 -0
  11. codedebrief/analysis/languages/java.py +158 -0
  12. codedebrief/analysis/languages/php.py +83 -0
  13. codedebrief/analysis/languages/ruby.py +75 -0
  14. codedebrief/analysis/languages/rust.py +96 -0
  15. codedebrief/analysis/project.py +373 -0
  16. codedebrief/analysis/python.py +939 -0
  17. codedebrief/analysis/registry.py +320 -0
  18. codedebrief/analysis/treesitter.py +884 -0
  19. codedebrief/analysis/typescript.py +1019 -0
  20. codedebrief/artifacts.py +49 -0
  21. codedebrief/cli.py +585 -0
  22. codedebrief/config.py +226 -0
  23. codedebrief/doctor.py +175 -0
  24. codedebrief/install.py +441 -0
  25. codedebrief/mcp_server.py +2720 -0
  26. codedebrief/model.py +189 -0
  27. codedebrief/py.typed +1 -0
  28. codedebrief/quality.py +392 -0
  29. codedebrief/query.py +641 -0
  30. codedebrief/render/__init__.py +6 -0
  31. codedebrief/render/assets/generated/codedebrief-viewer-runtime.iife.js +10 -0
  32. codedebrief/render/assets/panels.js +462 -0
  33. codedebrief/render/assets/shell.js +1649 -0
  34. codedebrief/render/assets/styles.css +1715 -0
  35. codedebrief/render/assets/tree.js +616 -0
  36. codedebrief/render/html.py +191 -0
  37. codedebrief/render/markdown.py +153 -0
  38. codedebrief/render/payload.py +326 -0
  39. codedebrief/render/snapshot.py +769 -0
  40. codedebrief/schema/codedebrief.schema.json +449 -0
  41. codedebrief/util.py +65 -0
  42. codedebrief/validation.py +214 -0
  43. codedebrief-0.11.0.dist-info/METADATA +426 -0
  44. codedebrief-0.11.0.dist-info/RECORD +48 -0
  45. codedebrief-0.11.0.dist-info/WHEEL +4 -0
  46. codedebrief-0.11.0.dist-info/entry_points.txt +2 -0
  47. codedebrief-0.11.0.dist-info/licenses/LICENSE +176 -0
  48. codedebrief-0.11.0.dist-info/licenses/NOTICE +9 -0
@@ -0,0 +1,373 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from dataclasses import dataclass, field
5
+ from datetime import datetime, timezone
6
+ from pathlib import Path
7
+
8
+ from codedebrief.analysis.common import (
9
+ CONFIDENCE_HIGH,
10
+ CONFIDENCE_LOW,
11
+ CONFIDENCE_MEDIUM,
12
+ CONFIDENCE_NONE,
13
+ DEFAULT_EXPORT_MARKER,
14
+ )
15
+ from codedebrief.analysis.discovery import discover_source_files
16
+ from codedebrief.analysis.registry import (
17
+ LanguageAnalyzer,
18
+ language_capability_matrix,
19
+ language_for,
20
+ spec_for_language,
21
+ )
22
+ from codedebrief.config import CodeDebriefConfig
23
+ from codedebrief.model import (
24
+ FileAnalysis,
25
+ FileRecord,
26
+ Flow,
27
+ FlowNode,
28
+ NodeKind,
29
+ ProjectModel,
30
+ )
31
+ from codedebrief.quality import model_quality
32
+ from codedebrief.util import (
33
+ compact_text,
34
+ file_sha256,
35
+ read_json,
36
+ relpath,
37
+ stable_id,
38
+ write_json,
39
+ )
40
+
41
+ CACHE_VERSION = "7"
42
+
43
+ # One bad file (mid-edit syntax error, non-UTF-8 bytes, a merge-conflict marker,
44
+ # or a missing lazy language grammar in the current Python environment) must never
45
+ # abort the whole run - the tool's promise is to stay in sync on every commit.
46
+ # These are the errors the analyzers raise while ingesting one file.
47
+ _INGEST_ERRORS = (SyntaxError, UnicodeDecodeError, ValueError, OSError, ImportError)
48
+
49
+
50
+ @dataclass(slots=True)
51
+ class AnalysisResult:
52
+ model: ProjectModel
53
+ changed_files: list[str]
54
+ deleted_files: list[str]
55
+ cache_hits: int
56
+ skipped_files: list[tuple[str, str]] = field(default_factory=list)
57
+
58
+
59
class ProjectAnalyzer:
    """Analyze every discovered source file under a root into one ``ProjectModel``.

    Per-file analyses are cached as JSON under ``<root>/.codedebrief/cache`` and
    indexed by the file's sha256 in ``index.json``. On an incremental run, a file
    whose digest matches its index entry is reloaded from the cache instead of
    being analyzed again.
    """

    def __init__(self, root: Path, config: CodeDebriefConfig | None = None) -> None:
        """Bind the analyzer to ``root``; load the config from it when none is given."""
        self.root = root.resolve()
        self.config = config or CodeDebriefConfig.load(self.root)
        self.cache_dir = self.root / ".codedebrief" / "cache"
        self.index_path = self.cache_dir / "index.json"
        # Timestamp of the previous run, filled in by `_load_index`.
        self.previous_generated_at: str | None = None
        # Language analyzers are built lazily from the registry, so a grammar is loaded
        # only when a file of that language is actually present.
        self._analyzers: dict[str, LanguageAnalyzer] = {}

    def analyze(self, *, full: bool = False) -> AnalysisResult:
        """Analyze the project and rewrite the cache index.

        Args:
            full: When true, ignore the previous cache index and analyze every file.

        Returns:
            The combined model plus the changed, deleted, cache-hit and skipped files.
        """
        files = discover_source_files(self.root, self.config)
        previous_index = {} if full else self._load_index()
        current_paths = {relpath(path, self.root) for path in files}
        deleted_files = sorted(set(previous_index) - current_paths)
        analyses: list[FileAnalysis] = []
        changed_files: list[str] = []
        skipped_files: list[tuple[str, str]] = []
        cache_hits = 0
        new_index: dict[str, dict[str, str]] = {}

        for path in files:
            relative = relpath(path, self.root)
            cache_file = self.cache_dir / f"{stable_id(relative, length=24)}.json"
            # Hashing reads the file from disk, so a file deleted or locked mid-run raises
            # OSError. Keep the digest inside the guarded unit: one vanishing file must
            # degrade to a skipped record, never abort the whole run.
            digest, reason = self._safe_digest(path)
            if reason is not None:
                skipped_files.append((relative, reason))
                analysis = self._degraded_file(path, relative, digest)
                write_json(cache_file, analysis.to_dict())
                changed_files.append(relative)
                analyses.append(analysis)
                new_index[relative] = _index_entry(cache_file.name, digest, reason)
                continue

            cached = previous_index.get(relative)
            reused: FileAnalysis | None = None
            if not full and cached is not None and cached.get("sha256") == digest:
                reused = self._load_cached_analysis(cache_file)

            if reused is not None:
                # Cache hit: the stored analysis is reused as-is, so the skip reason
                # recorded with it (if any) still describes this exact content.
                analysis = reused
                reason = cached.get("skip_reason") if cached else None
                if reason:
                    skipped_files.append((relative, reason))
                cache_hits += 1
            else:
                # Fresh analysis: only the reason produced right now is recorded. The
                # previous entry's skip reason belongs to older content (or to a
                # cache entry that could not be reloaded) and must not be carried
                # over - otherwise a file that has since been fixed would be reported
                # as skipped on every later cache hit.
                analysis, reason = self._safe_analyze_file(path, relative, digest)
                if reason is not None:
                    skipped_files.append((relative, reason))
                write_json(cache_file, analysis.to_dict())
                changed_files.append(relative)
            analyses.append(analysis)
            new_index[relative] = _index_entry(cache_file.name, digest, reason)

        model = self._combine(analyses, skipped_files)
        # Nothing changed since the last run: keep the previous timestamp.
        if not full and not changed_files and not deleted_files and self.previous_generated_at:
            model.generated_at = self.previous_generated_at
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        write_json(
            self.index_path,
            {
                "cache_version": CACHE_VERSION,
                "generated_at": model.generated_at,
                "files": new_index,
            },
        )
        return AnalysisResult(
            model=model,
            changed_files=changed_files,
            deleted_files=deleted_files,
            cache_hits=cache_hits,
            skipped_files=skipped_files,
        )

    def _analyze_file(self, path: Path) -> FileAnalysis:
        """Analyze ``path`` with the analyzer registered for its language."""
        return self._analyzer_for(language_for(path)).analyze(path)

    def _analyzer_for(self, language: str) -> LanguageAnalyzer:
        """Return the analyzer for ``language``, building and memoizing it on first use."""
        analyzer = self._analyzers.get(language)
        if analyzer is None:
            analyzer = spec_for_language(language).factory(self.root, self.config)
            self._analyzers[language] = analyzer
        return analyzer

    def _safe_digest(self, path: Path) -> tuple[str, str | None]:
        """Hash one file, degrading to a sentinel digest instead of aborting the run.

        A file deleted or locked between discovery and hashing raises OSError here; that
        single file must skip, not crash the whole analysis. The sentinel (path-derived,
        prefixed so it can never collide with a real sha256) keeps the cache index well
        formed and forces a re-hash on the next run once the file is readable again.

        Returns:
            ``(digest, None)`` on success, or ``(sentinel, reason)`` on OSError.
        """
        try:
            return file_sha256(path), None
        except OSError as error:
            return f"unreadable:{stable_id(str(path), length=24)}", _skip_reason(error)

    def _safe_analyze_file(
        self, path: Path, relative: str, digest: str
    ) -> tuple[FileAnalysis, str | None]:
        """Analyze one file, degrading to an empty record instead of aborting the run.

        A single un-parseable or non-UTF-8 file (common while editing, on a merge
        conflict, or in a mixed-language repo) is recorded as skipped and the rest of
        the model is still built - "always in sync" can't hinge on every file parsing.

        Returns:
            ``(analysis, None)`` on success, or ``(degraded record, reason)`` when one
            of ``_INGEST_ERRORS`` is raised.
        """
        try:
            return self._analyze_file(path), None
        except _INGEST_ERRORS as error:
            return self._degraded_file(path, relative, digest), _skip_reason(error)

    def _degraded_file(self, path: Path, relative: str, digest: str) -> FileAnalysis:
        """Build the placeholder record (path, language, digest only) for a skipped file."""
        return FileAnalysis(path=relative, language=language_for(path), sha256=digest)

    def _load_cached_analysis(self, cache_file: Path) -> FileAnalysis | None:
        """Load one cached analysis; return None when it is missing or unusable."""
        if not cache_file.exists():
            return None
        try:
            return FileAnalysis.from_dict(read_json(cache_file))
        except (ValueError, KeyError, TypeError, AttributeError, OSError):
            # A corrupt cache entry is never fatal - fall back to a fresh analysis.
            # AttributeError is included as a precaution for a payload that parses to
            # something other than an object.
            return None

    def _load_index(self) -> dict[str, dict[str, str]]:
        """Load the previous cache index as ``{relative path: entry}``.

        Also records the previous run's ``generated_at`` on the instance. Returns an
        empty mapping when the index is missing, has another cache version, or cannot
        be parsed.
        """
        if not self.index_path.exists():
            return {}
        try:
            data = read_json(self.index_path)
            if data.get("cache_version") != CACHE_VERSION:
                return {}
            generated_at = data.get("generated_at")
            self.previous_generated_at = str(generated_at) if generated_at else None
            file_data = data.get("files", {})
            return {
                str(path): {
                    "sha256": str(item["sha256"]),
                    "cache": str(item["cache"]),
                    **(
                        {"skip_reason": str(item["skip_reason"])} if item.get("skip_reason") else {}
                    ),
                }
                for path, item in file_data.items()
            }
        except (ValueError, KeyError, TypeError, AttributeError, OSError):
            # A corrupt or unreadable index is never fatal - discard it and force a clean
            # full re-analyze, exactly as a corrupt per-file cache entry already does.
            # AttributeError covers an index that parses as JSON but is not an object,
            # so `.get()` / `.items()` are missing.
            self.previous_generated_at = None
            return {}

    def _combine(
        self, analyses: list[FileAnalysis], skipped_files: list[tuple[str, str]]
    ) -> ProjectModel:
        """Merge the per-file analyses into one linked, scoped ``ProjectModel``."""
        flows = [flow for analysis in analyses for flow in analysis.flows]
        self._link_calls(flows)
        self._link_tests(flows)
        # Keyed by language so a Python enum and a same-named TS union stay distinct
        # value universes (they are different closed sets).
        enums: dict[str, dict[str, list[str]]] = {}
        for analysis in analyses:
            language_enums = enums.setdefault(analysis.language, {})
            for name, members in analysis.enums.items():
                known = language_enums.setdefault(name, [])
                known.extend(member for member in members if member not in known)
        # Tag every flow with the macro-part(s) it belongs to (backend/frontend/infra),
        # so the model can be viewed whole or restricted to a scope.
        scope_counts: Counter[str] = Counter()
        for flow in flows:
            scope = self.config.scopes_for(flow.location.path)
            flow.metadata["scope"] = scope
            scope_counts.update(scope)
        files = [
            FileRecord(
                path=analysis.path,
                language=analysis.language,
                sha256=analysis.sha256,
                flow_ids=[flow.id for flow in analysis.flows],
                dependencies=analysis.dependencies,
            )
            for analysis in analyses
        ]
        model = ProjectModel(
            schema_version="2.0",
            generated_at=datetime.now(timezone.utc).isoformat(),
            root=".",
            # Entrypoints first, then alphabetical by symbol.
            flows=sorted(flows, key=lambda item: (not item.is_entrypoint, item.symbol)),
            files=files,
            metadata={
                "languages": sorted({item.language for item in analyses}),
                "entrypoint_count": sum(flow.is_entrypoint for flow in flows),
                "flow_count": len(flows),
                "enums": enums,
                "language_capabilities": language_capability_matrix(),
                "scopes": dict(sorted(scope_counts.items())),
                "skipped_files": _skipped_file_records(skipped_files),
            },
        )
        model.metadata["quality"] = model_quality(model)
        return model

    def _link_calls(self, flows: list[Flow]) -> None:
        """Resolve CALL nodes to target flows and fill ``calls`` / ``called_by`` in place."""
        # Import-aware first (`qualified_calls` from the analyzers), short name as a
        # fallback. Ambiguous candidates are recorded, not dropped, and each link
        # carries a `link_confidence` so agents can explain whether an edge is direct,
        # inferred from imports, or only a short-name fallback.
        # Key on the flow symbol as-is (``module:qualified``) so a module-path boundary
        # never collides with an attribute boundary; a default-export flow also answers
        # to the module's default marker.
        #
        # Both tables are partitioned by language: module/symbol namespaces never span
        # languages, so a TS `charge(request)` whose qualified target is missing must
        # not fall back onto a same-named PYTHON `charge`.
        by_qualified: dict[str, dict[str, list[Flow]]] = {}
        by_name: dict[str, dict[str, list[Flow]]] = {}
        for flow in flows:
            qualified = by_qualified.setdefault(flow.language, {})
            named = by_name.setdefault(flow.language, {})
            qualified.setdefault(flow.symbol, []).append(flow)
            if flow.metadata.get("default_export"):
                module = flow.symbol.split(":", 1)[0]
                qualified.setdefault(f"{module}:{DEFAULT_EXPORT_MARKER}", []).append(flow)
            short = flow.symbol.split(":", 1)[-1].split(".")[-1]
            named.setdefault(short, []).append(flow)

        # Sets mirror the edge lists so the duplicate checks stay O(1).
        calls_seen = {flow.id: set(flow.calls) for flow in flows}
        called_by_seen = {flow.id: set(flow.called_by) for flow in flows}
        for flow in flows:
            lang_qualified = by_qualified.get(flow.language, {})
            lang_name = by_name.get(flow.language, {})
            for node in flow.nodes:
                if node.kind is not NodeKind.CALL:
                    continue
                candidates, confidence = self._resolve_call(flow, node, lang_qualified, lang_name)
                if not candidates:
                    continue
                node.metadata["link_confidence"] = confidence
                node.metadata["call_candidates"] = sorted(candidates)
                # Only an unambiguous resolution becomes a real edge.
                if len(candidates) == 1:
                    target = next(iter(candidates.values()))
                    node.metadata["target_flow"] = target.id
                    node.metadata["target_symbol"] = target.symbol
                    if target.id not in calls_seen[flow.id]:
                        flow.calls.append(target.id)
                        calls_seen[flow.id].add(target.id)
                    if flow.id not in called_by_seen[target.id]:
                        target.called_by.append(flow.id)
                        called_by_seen[target.id].add(flow.id)

    @staticmethod
    def _resolve_call(
        flow: Flow,
        node: FlowNode,
        by_qualified: dict[str, list[Flow]],
        by_name: dict[str, list[Flow]],
    ) -> tuple[dict[str, Flow], str]:
        """Find candidate target flows for one call node.

        Returns:
            ``(candidates keyed by flow id, confidence)``. Qualified matches win over
            short-name matches; more than one candidate is always low confidence, and
            no candidate at all yields ``({}, CONFIDENCE_NONE)``.
        """
        # `by_qualified` / `by_name` are already scoped to the caller flow's language
        # (see `_link_calls`), so every candidate here shares the caller's language and
        # no cross-language edge can be created.
        qualified: dict[str, Flow] = {}
        for name in node.metadata.get("qualified_calls", []):
            for candidate in by_qualified.get(str(name), []):
                # A flow never links to itself.
                if candidate.id != flow.id:
                    qualified[candidate.id] = candidate
        if qualified:
            return qualified, (CONFIDENCE_HIGH if len(qualified) == 1 else CONFIDENCE_LOW)

        short_name: dict[str, Flow] = {}
        for raw in node.metadata.get("calls", []):
            for candidate in by_name.get(str(raw).split(".")[-1], []):
                if candidate.id != flow.id:
                    short_name[candidate.id] = candidate
        if short_name:
            return short_name, (CONFIDENCE_MEDIUM if len(short_name) == 1 else CONFIDENCE_LOW)
        return {}, CONFIDENCE_NONE

    def _link_tests(self, flows: list[Flow]) -> None:
        """Record, on every flow called by a test flow, that test's symbol (no duplicates)."""
        by_id = {flow.id: flow for flow in flows}
        for flow in flows:
            if not flow.metadata.get("test"):
                continue
            for target_id in flow.calls:
                target = by_id.get(target_id)
                if target and flow.symbol not in target.tests:
                    target.tests.append(flow.symbol)
350
+
351
+
352
def _skip_reason(error: Exception) -> str:
    """Describe why a file was skipped, as a short human-readable string.

    Uses the exception's message, or its class name when the message is blank,
    passed through ``compact_text`` with a limit of 200.
    """
    message = str(error).strip()
    if not message:
        message = error.__class__.__name__
    return compact_text(message, 200)
356
+
357
+
358
+ def _index_entry(cache_name: str, digest: str, reason: str | None = None) -> dict[str, str]:
359
+ entry = {"sha256": digest, "cache": cache_name}
360
+ if reason:
361
+ entry["skip_reason"] = reason
362
+ return entry
363
+
364
+
365
def _skipped_file_records(skipped_files: list[tuple[str, str]]) -> list[dict[str, str]]:
    """Turn ``(path, reason)`` pairs into sorted metadata records.

    Each record carries the path, the language detected for that path, and the
    skip reason; records are ordered by the sorted ``(path, reason)`` pairs.
    """
    records: list[dict[str, str]] = []
    for skipped_path, why in sorted(skipped_files):
        records.append(
            {
                "path": skipped_path,
                "language": language_for(Path(skipped_path)),
                "reason": why,
            }
        )
    return records