suitable-loop 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,510 @@
1
+ """Git-based change analysis engine for CodeZero.
2
+
3
+ Reads repository history via GitPython, computes per-commit risk scores
4
+ using a weighted formula over complexity delta, blast radius, churn,
5
+ lines changed, and file count, and persists results to the database.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from collections import Counter
12
+ from pathlib import Path
13
+
14
+ import git
15
+
16
+ from suitable_loop.config import SuitableLoopConfig
17
+ from suitable_loop.db import Database
18
+ from suitable_loop.models import CommitFile, CommitInfo
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class GitAnalyzer:
    """Analyzes Git repository history to surface risky changes and hotspots.

    Commits are read through GitPython; file and dependency information is
    resolved through the injected database handle.
    """

    def __init__(self, db: Database, config: SuitableLoopConfig) -> None:
        """Keep references to the database and the configuration.

        Args:
            db: Database used to persist commits and to look up files and
                their dependents.
            config: Configuration object; ``config.git.default_commit_depth``
                and ``config.git.risk_weights`` are read by the analyses.
        """
        self.db = db
        self.config = config

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def analyze_recent_changes(
        self, repo_path: str, n_commits: int | None = None
    ) -> list[dict]:
        """Analyze the last *n_commits* commits and return risk-scored results.

        Each commit is scored with a weighted combination of complexity delta,
        blast radius, churn rate, lines changed, and file count. Every factor
        is min-max normalized across the analyzed batch before weighting.
        Results are persisted to the database and returned sorted by
        ``risk_score`` descending.

        Args:
            repo_path: Path of (or inside) the Git repository.
            n_commits: Number of commits to inspect. A falsy value (``None``
                or ``0``) falls back to ``config.git.default_commit_depth``.

        Returns:
            One dict per commit with commit metadata, the rounded
            ``risk_score`` and the normalized ``factors``. An empty list is
            returned when the repository cannot be opened or has no commits.
        """
        depth = n_commits or self.config.git.default_commit_depth
        weights = self.config.git.risk_weights

        repo = self._open_repo(repo_path)
        if repo is None:
            return []

        commits = self._iter_commits(repo, depth)
        if not commits:
            logger.info("No commits found in %s", repo_path)
            return []

        # ----------------------------------------------------------
        # Phase 1: extract raw data per commit
        # ----------------------------------------------------------
        raw_records: list[dict] = []
        # How many commits of the batch touch each file path.
        file_appearance: Counter[str] = Counter()

        for commit in commits:
            changed_files = self._extract_changed_files(commit)
            total_insertions = 0
            total_deletions = 0
            commit_file_records: list[dict] = []

            for cf in changed_files:
                total_insertions += cf["insertions"]
                total_deletions += cf["deletions"]
                file_appearance[cf["file_path"]] += 1

                # Complexity delta (only computed for .py files).
                complexity_before, complexity_after = self._compute_complexity_delta(
                    commit, cf["file_path"], cf["change_type"]
                )
                cf["complexity_before"] = complexity_before
                cf["complexity_after"] = complexity_after
                commit_file_records.append(cf)

            raw_records.append({
                "sha": commit.hexsha,
                "author": str(commit.author),
                "timestamp": commit.committed_date,
                "message": commit.message.strip(),
                "files_changed": len(changed_files),
                "insertions": total_insertions,
                "deletions": total_deletions,
                "commit_files": commit_file_records,
                # Raw factors (pre-normalization)
                "_complexity_delta": sum(
                    abs((cf.get("complexity_after") or 0) - (cf.get("complexity_before") or 0))
                    for cf in commit_file_records
                ),
                "_blast_radius": self._commit_blast_radius(commit_file_records, repo_path),
                # Filled in below, once the whole batch has been counted.
                "_churn_rate": 0,
                "_lines_changed": total_insertions + total_deletions,
                "_file_count": len(changed_files),
            })

        # Churn must be derived from the *complete* appearance counts.
        # Computing it inside the loop above would make a commit's churn
        # depend on how many commits happened to be processed before it.
        for record in raw_records:
            record["_churn_rate"] = sum(
                file_appearance[cf["file_path"]] for cf in record["commit_files"]
            )

        # ----------------------------------------------------------
        # Phase 2: normalize factors and compute risk scores
        # ----------------------------------------------------------
        norm_complexity = self._normalize(
            [float(r["_complexity_delta"]) for r in raw_records]
        )
        norm_blast = self._normalize([float(r["_blast_radius"]) for r in raw_records])
        norm_churn = self._normalize([float(r["_churn_rate"]) for r in raw_records])
        norm_lines = self._normalize([float(r["_lines_changed"]) for r in raw_records])
        norm_files = self._normalize([float(r["_file_count"]) for r in raw_records])

        w_complexity = weights.get("complexity", 0.30)
        w_blast = weights.get("blast_radius", 0.25)
        w_churn = weights.get("churn", 0.20)
        w_lines = weights.get("lines", 0.15)
        w_files = weights.get("files", 0.10)

        results: list[dict] = []
        for i, record in enumerate(raw_records):
            risk_score = (
                w_complexity * norm_complexity[i]
                + w_blast * norm_blast[i]
                + w_churn * norm_churn[i]
                + w_lines * norm_lines[i]
                + w_files * norm_files[i]
            )

            # Persist commit
            commit_info = CommitInfo(
                repo_path=repo_path,
                sha=record["sha"],
                author=record["author"],
                timestamp=record["timestamp"],
                message=record["message"],
                files_changed=record["files_changed"],
                insertions=record["insertions"],
                deletions=record["deletions"],
                risk_score=risk_score,
            )
            commit_id = self.db.upsert_commit(commit_info)

            # Persist commit files
            for cf in record["commit_files"]:
                commit_file = CommitFile(
                    commit_id=commit_id,
                    file_path=cf["file_path"],
                    change_type=cf["change_type"],
                    insertions=cf["insertions"],
                    deletions=cf["deletions"],
                    complexity_before=cf.get("complexity_before"),
                    complexity_after=cf.get("complexity_after"),
                )
                self.db.insert_commit_file(commit_file)

            results.append({
                "sha": record["sha"],
                "author": record["author"],
                "timestamp": record["timestamp"],
                "message": record["message"],
                "files_changed": record["files_changed"],
                "insertions": record["insertions"],
                "deletions": record["deletions"],
                "risk_score": round(risk_score, 4),
                "factors": {
                    "complexity_delta": norm_complexity[i],
                    "blast_radius": norm_blast[i],
                    "churn_rate": norm_churn[i],
                    "lines_changed": norm_lines[i],
                    "file_count": norm_files[i],
                },
            })

        self.db.commit()
        results.sort(key=lambda r: r["risk_score"], reverse=True)
        logger.info(
            "Analyzed %d commits in %s; highest risk %.4f (%s)",
            len(results),
            repo_path,
            results[0]["risk_score"] if results else 0.0,
            results[0]["sha"][:8] if results else "n/a",
        )
        return results

    def analyze_commit(self, repo_path: str, sha: str) -> dict:
        """Return a detailed breakdown of a single commit.

        Includes per-file diffs, complexity deltas, and blast radius.

        Args:
            repo_path: Path of (or inside) the Git repository.
            sha: Revision identifier passed to ``repo.commit``.

        Returns:
            A dict with commit metadata and a ``files`` list, or a dict with
            a single ``error`` key when the repository or commit cannot be
            resolved.
        """
        repo = self._open_repo(repo_path)
        if repo is None:
            return {"error": f"Cannot open repository at {repo_path}"}

        try:
            commit = repo.commit(sha)
        except (git.BadName, git.GitCommandError, ValueError) as exc:
            logger.warning("Cannot resolve commit %s: %s", sha, exc)
            return {"error": f"Cannot resolve commit {sha}: {exc}"}

        changed_files = self._extract_changed_files(commit)
        file_details: list[dict] = []
        total_insertions = 0
        total_deletions = 0

        for cf in changed_files:
            total_insertions += cf["insertions"]
            total_deletions += cf["deletions"]

            complexity_before, complexity_after = self._compute_complexity_delta(
                commit, cf["file_path"], cf["change_type"]
            )
            blast = self.blast_radius(cf["file_path"])
            # Empty string when no patch text could be obtained.
            diff_text = self._file_diff_text(commit, cf["file_path"])

            file_details.append({
                "file_path": cf["file_path"],
                "change_type": cf["change_type"],
                "insertions": cf["insertions"],
                "deletions": cf["deletions"],
                "complexity_before": complexity_before,
                "complexity_after": complexity_after,
                # Signed delta; a missing (None) side counts as 0.
                "complexity_delta": (complexity_after or 0) - (complexity_before or 0),
                "blast_radius": blast,
                "diff": diff_text,
            })

        return {
            "sha": commit.hexsha,
            "author": str(commit.author),
            "timestamp": commit.committed_date,
            "message": commit.message.strip(),
            "files_changed": len(changed_files),
            "insertions": total_insertions,
            "deletions": total_deletions,
            "files": file_details,
        }

    def hotspot_report(
        self, repo_path: str, n_commits: int | None = None
    ) -> list[dict]:
        """Identify hotspot files by cross-referencing churn with dependency count.

        Args:
            repo_path: Path of (or inside) the Git repository.
            n_commits: Number of commits to inspect. A falsy value falls back
                to ``config.git.default_commit_depth``.

        Returns:
            Files ranked by ``churn * dependency_count`` descending; an empty
            list when the repository cannot be opened or has no commits.
        """
        depth = n_commits or self.config.git.default_commit_depth

        repo = self._open_repo(repo_path)
        if repo is None:
            return []

        commits = self._iter_commits(repo, depth)
        if not commits:
            return []

        # Count how often each file path appears across commits.
        file_churn: Counter[str] = Counter()
        for commit in commits:
            file_churn.update(
                cf["file_path"] for cf in self._extract_changed_files(commit)
            )

        # For each file, look up dependency count from the database.
        hotspots: list[dict] = []
        for file_path, churn in file_churn.items():
            dep_count = self._dependency_count(file_path)
            hotspots.append({
                "file_path": file_path,
                "churn": churn,
                "dependency_count": dep_count,
                "hotspot_score": churn * dep_count,
            })

        hotspots.sort(key=lambda h: h["hotspot_score"], reverse=True)
        return hotspots

    def blast_radius(self, file_path: str) -> dict:
        """Find all files that transitively depend on *file_path*.

        The file is looked up by exact path first and by suffix second.

        Returns:
            A dict with ``count`` and ``dependents`` (sorted list of file
            paths). Both are empty when the file is unknown to the database.
        """
        file_entity = self.db.get_file_by_path(file_path)
        if file_entity is None:
            file_entity = self.db.find_file_by_suffix(file_path)
        if file_entity is None:
            return {"count": 0, "dependents": []}

        root_id = file_entity.id
        visited: set[int] = set()
        # Depth-first traversal; the ``visited`` set guards against cycles.
        stack: list[int] = [root_id]  # type: ignore[list-item]

        while stack:
            current_id = stack.pop()
            if current_id in visited:
                continue
            visited.add(current_id)
            for dep in self.db.get_file_dependents(current_id):
                if dep.id not in visited:
                    stack.append(dep.id)  # type: ignore[arg-type]

        # Remove the original file itself from the result set.
        visited.discard(root_id)  # type: ignore[arg-type]

        dependent_paths: list[str] = []
        for fid in visited:
            entity = self.db.get_file_by_id(fid)
            if entity is not None:
                dependent_paths.append(entity.path)

        return {"count": len(dependent_paths), "dependents": sorted(dependent_paths)}

    # ------------------------------------------------------------------
    # Normalization helper
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize(values: list[float]) -> list[float]:
        """Normalize *values* to the ``[0, 1]`` range using min-max scaling.

        Returns an empty list for empty input and a list of zeros when all
        values are identical (the span is zero, so no scaling is possible).
        """
        if not values:
            return []
        min_val = min(values)
        span = max(values) - min_val
        if span == 0:
            return [0.0] * len(values)
        return [(v - min_val) / span for v in values]

    # ------------------------------------------------------------------
    # Git helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _open_repo(repo_path: str) -> git.Repo | None:
        """Open a Git repository, returning ``None`` on failure.

        Parent directories are searched, so *repo_path* may point inside a
        working tree.
        """
        try:
            return git.Repo(repo_path, search_parent_directories=True)
        except (git.InvalidGitRepositoryError, git.NoSuchPathError) as exc:
            logger.error("Cannot open Git repository at %s: %s", repo_path, exc)
            return None

    @staticmethod
    def _iter_commits(repo: git.Repo, max_count: int) -> list[git.Commit]:
        """Return up to *max_count* commits from ``repo.iter_commits``.

        Returns an empty list (and logs a warning) when iteration fails.
        """
        try:
            return list(repo.iter_commits(max_count=max_count))
        except (git.GitCommandError, ValueError) as exc:
            logger.warning("Cannot iterate commits: %s", exc)
            return []

    @staticmethod
    def _extract_changed_files(commit: git.Commit) -> list[dict]:
        """Extract changed file metadata from a commit's diff against its parent.

        Only the first parent is considered. A commit without parents is
        diffed against ``git.NULL_TREE``.

        Returns:
            A list of dicts with keys ``file_path``, ``change_type``,
            ``insertions``, and ``deletions``. Line counts default to 0 when
            ``commit.stats`` has no entry for the path.
        """
        if commit.parents:
            diffs = commit.parents[0].diff(commit, create_patch=False)
        else:
            diffs = commit.diff(git.NULL_TREE, create_patch=False)

        # Use commit.stats for line-level counts since Diff objects do not
        # always expose insertions/deletions directly.
        stats_files: dict[str, dict] = {}
        try:
            stats_files = commit.stats.files
        except Exception as exc:
            # Best effort: without stats the line counts simply stay at 0.
            logger.debug(
                "Cannot read stats for commit %s: %s", commit.hexsha[:8], exc
            )

        results: list[dict] = []
        for diff_item in diffs:
            # Prefer the post-change path; fall back to the pre-change one.
            file_path = diff_item.b_path or diff_item.a_path or ""
            if not file_path:
                continue

            file_stats = stats_files.get(file_path, {})
            results.append({
                "file_path": file_path,
                "change_type": _diff_change_type(diff_item),
                "insertions": file_stats.get("insertions", 0),
                "deletions": file_stats.get("deletions", 0),
            })

        return results

    # ------------------------------------------------------------------
    # Complexity helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _compute_complexity_delta(
        commit: git.Commit, file_path: str, change_type: str
    ) -> tuple[int | None, int | None]:
        """Compute radon cyclomatic complexity before and after the commit.

        Only applies to ``.py`` files. Returns ``(None, None)`` for
        non-Python files or when radon is unavailable.

        The "after" value stays ``None`` for deletions (``"D"``); the
        "before" value stays ``None`` for additions (``"A"``) and for commits
        without parents. When a blob cannot be read, the affected side is
        reported as ``0``.
        """
        if not file_path.endswith(".py"):
            return None, None

        try:
            from radon.complexity import cc_visit  # type: ignore[import-untyped]
        except ImportError:
            return None, None

        complexity_before: int | None = None
        complexity_after: int | None = None

        # Complexity *after* the commit (file in the commit's tree).
        if change_type != "D":
            try:
                blob = commit.tree / file_path
                source_after = blob.data_stream.read().decode("utf-8", errors="replace")
                complexity_after = _total_complexity(cc_visit, source_after)
            except Exception as exc:
                logger.debug(
                    "Cannot compute complexity of %s at %s: %s",
                    file_path, commit.hexsha[:8], exc,
                )
                complexity_after = 0

        # Complexity *before* the commit (file in the parent's tree).
        if change_type != "A" and commit.parents:
            try:
                parent_blob = commit.parents[0].tree / file_path
                source_before = parent_blob.data_stream.read().decode("utf-8", errors="replace")
                complexity_before = _total_complexity(cc_visit, source_before)
            except Exception as exc:
                logger.debug(
                    "Cannot compute complexity of %s before %s: %s",
                    file_path, commit.hexsha[:8], exc,
                )
                complexity_before = 0

        return complexity_before, complexity_after

    @staticmethod
    def _file_diff_text(commit: git.Commit, file_path: str) -> str:
        """Return the unified diff text for *file_path* within *commit*.

        Returns the first non-empty patch found, decoded as UTF-8 with
        replacement characters, or ``""`` when there is none or the diff
        cannot be produced.
        """
        try:
            if commit.parents:
                diffs = commit.parents[0].diff(commit, paths=[file_path], create_patch=True)
            else:
                diffs = commit.diff(git.NULL_TREE, paths=[file_path], create_patch=True)

            for diff_item in diffs:
                raw = diff_item.diff
                if raw:
                    if isinstance(raw, bytes):
                        return raw.decode("utf-8", errors="replace")
                    return str(raw)
        except Exception as exc:
            logger.debug("Cannot get diff for %s in %s: %s", file_path, commit.hexsha[:8], exc)
        return ""

    # ------------------------------------------------------------------
    # Blast radius / dependency helpers
    # ------------------------------------------------------------------

    def _commit_blast_radius(self, commit_files: list[dict], repo_path: str) -> float:
        """Sum transitive dependent counts for all files in a commit.

        A dependent shared by several changed files is counted once per
        changed file. *repo_path* is accepted for interface compatibility but
        is not used.
        """
        return float(
            sum(self._dependency_count(cf["file_path"]) for cf in commit_files)
        )

    def _dependency_count(self, file_path: str) -> int:
        """Return the number of files that transitively depend on *file_path*."""
        return self.blast_radius(file_path)["count"]
484
+
485
+
486
+ # ------------------------------------------------------------------
487
+ # Module-level helpers
488
+ # ------------------------------------------------------------------
489
+
490
def _diff_change_type(diff_item: git.Diff) -> str:  # type: ignore[name-defined]
    """Return the change-type code of a GitPython ``Diff`` object.

    The codes this module handles explicitly ("A", "D", "M", "R", "C",
    "T") are returned as-is, and so is any other non-empty value. A
    missing or empty ``change_type`` attribute is reported as "M".
    """
    change_type = getattr(diff_item, "change_type", None)
    if not change_type:
        return "M"
    return change_type
502
+
503
+
504
def _total_complexity(cc_visit_fn, source: str) -> int:
    """Add up the ``complexity`` of every block *cc_visit_fn* yields for *source*.

    Best effort: any exception raised while visiting or summing results in
    a total of ``0``.
    """
    total = 0
    try:
        for visited_block in cc_visit_fn(source):
            total += visited_block.complexity
    except Exception:
        return 0
    return total