whycode-cli 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
whycode/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """WhyCode — tells you what to be afraid of before touching a file."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.4.2"
whycode/cache.py CHANGED
@@ -112,10 +112,21 @@ class CacheStore:
112
112
  cache misses; this class never invokes ``git`` itself.
113
113
  """
114
114
 
115
- def __init__(self, db_path: Path) -> None:
115
+ def __init__(self, db_path: Path, *, in_memory: bool = False) -> None:
116
+ """Open (creating if needed) the SQLite cache at ``db_path``.
117
+
118
+ ``in_memory=True`` opens a transient ``:memory:`` connection
119
+ instead — the disk file is never created and is never read.
120
+ Used by ``--no-cache`` to retain in-session amortisation
121
+ (matches the cold-fill code path) without persisting anything.
122
+ """
116
123
  self.db_path = db_path
117
- self.db_path.parent.mkdir(parents=True, exist_ok=True)
118
- self._conn = sqlite3.connect(self.db_path)
124
+ self._in_memory = in_memory
125
+ if in_memory:
126
+ self._conn = sqlite3.connect(":memory:")
127
+ else:
128
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
129
+ self._conn = sqlite3.connect(self.db_path)
119
130
  # row_factory makes column access readable in tests / debug.
120
131
  self._conn.row_factory = sqlite3.Row
121
132
  self._conn.execute("PRAGMA foreign_keys = ON")
@@ -402,13 +413,18 @@ class CacheStore:
402
413
  file_row_count = int(
403
414
  self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
404
415
  )
405
- try:
406
- size_bytes = self.db_path.stat().st_size
407
- except OSError:
416
+ if self._in_memory:
408
417
  size_bytes = 0
418
+ exists = False
419
+ else:
420
+ try:
421
+ size_bytes = self.db_path.stat().st_size
422
+ except OSError:
423
+ size_bytes = 0
424
+ exists = self.db_path.exists()
409
425
  return CacheStats(
410
426
  path=self.db_path,
411
- exists=self.db_path.exists(),
427
+ exists=exists,
412
428
  schema_version=self.schema_version,
413
429
  head_sha=self.head_sha,
414
430
  commit_count=commit_count,
@@ -430,6 +446,16 @@ def open_for(repo_root: Path) -> CacheStore:
430
446
  return CacheStore(cache_path_for(repo_root))
431
447
 
432
448
 
449
def open_in_memory(repo_root: Path) -> CacheStore:
    """Return a throwaway ``:memory:`` cache store for ``repo_root``.

    This is what ``--no-cache`` uses: the store behaves like the regular
    cache for the duration of one run (so the usual cold-fill code path
    and its within-session amortisation still apply), but it is backed by
    an in-memory SQLite connection, so nothing is read from or written to
    disk.

    The on-disk cache path is still computed and recorded on the store as
    ``db_path``; with ``in_memory=True`` it is never created or opened.
    """
    nominal_path = cache_path_for(repo_root)
    return CacheStore(nominal_path, in_memory=True)
457
+
458
+
433
459
  def parse_authored_at(value: str) -> datetime:
434
460
  """Parse the ``authored_at`` string we stored from git.
435
461
 
whycode/cli.py CHANGED
@@ -20,10 +20,12 @@ Commands
20
20
 
21
21
  from __future__ import annotations
22
22
 
23
+ import functools
23
24
  import json
24
25
  import sys
26
+ from collections.abc import Callable
25
27
  from pathlib import Path
26
- from typing import Any
28
+ from typing import Any, TypeVar
27
29
 
28
30
  import typer
29
31
  from rich.console import Console
@@ -48,18 +50,27 @@ err = Console(stderr=True)
48
50
 
49
51
 
50
52
def _open_cache(repo_root: Path, no_cache: bool) -> ch.CacheStore | None:
    """Open the cache for ``repo_root`` according to the no-cache flag.

    Modes:
      * ``no_cache=False`` (the default): persistent on-disk SQLite at
        ``.whycode/cache.db``.
      * ``no_cache=True``: a transient ``:memory:`` SQLite store. The
        same git-walk code path runs as for a cold fill, but nothing
        lands on disk and the next run starts cold. Keeping per-run
        amortisation (one ``git log`` walk shared across files) is what
        keeps ``--no-cache`` from re-issuing per-file walks.

    Returns:
        The opened store, or ``None`` if the store could not be opened in
        the requested mode. ``None`` means "do not pass a cache through
        git_facts"; a cache problem must never block the main read path.

    Failure handling:
        ``OSError`` covers filesystem problems such as being unable to
        create the cache directory. ``sqlite3.Error`` covers SQLite itself
        refusing to open or initialise the database (for example an
        unreadable or corrupt ``cache.db``); the store constructor calls
        ``sqlite3.connect`` and executes a PRAGMA, both of which report
        failures through that hierarchy rather than ``OSError``.
    """
    # Imported here because this is the only place in the CLI module that
    # needs to name SQLite's exception hierarchy.
    import sqlite3

    opener = ch.open_in_memory if no_cache else ch.open_for
    try:
        return opener(repo_root)
    except (OSError, sqlite3.Error):
        # Best-effort by design: fall back to the cache-free code path.
        return None
@@ -115,6 +126,37 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
115
126
  return repo_root, rel
116
127
 
117
128
 
129
_F = TypeVar("_F", bound=Callable[..., Any])


def _propagate_failures(func: _F) -> _F:
    """Decorate a command so that any unexpected exception exits with status 2.

    Background: a malformed commit timestamp once raised ``ValueError``
    deep inside log parsing; the traceback was rendered to stderr but the
    process still exited with status 0, so CI steps such as
    ``whycode diff --fail-on history`` were reported green after a crash.

    Behaviour of the wrapped command:
      * ``typer.Exit`` and ``typer.Abort`` are re-raised unchanged, so
        explicit exit codes keep working.
      * ``KeyboardInterrupt`` is re-raised unchanged, so Ctrl-C behaves
        normally.
      * Any other ``Exception`` has its traceback printed on the stderr
        console and is converted to ``typer.Exit(2)``, chained to the
        original error.
    """

    @functools.wraps(func)
    def guarded(*args: Any, **kwargs: Any) -> Any:
        try:
            outcome = func(*args, **kwargs)
        except (typer.Exit, typer.Abort, KeyboardInterrupt):
            # Deliberate control-flow signals: never rewrite these.
            raise
        except Exception as exc:
            err.print_exception(show_locals=False)
            raise typer.Exit(2) from exc
        else:
            return outcome

    return guarded  # type: ignore[return-value]
158
+
159
+
118
160
  # --- shared: band threshold parsing ----------------------------------------
119
161
 
120
162
  _BAND_THRESHOLDS_BY_KEY: dict[str, int] = {
@@ -148,6 +190,7 @@ def _print_brief(card: rc.RiskCard) -> None:
148
190
 
149
191
 
150
192
  @app.command()
193
+ @_propagate_failures
151
194
  def why(
152
195
  path: str = typer.Argument(..., help="File path to inspect."),
153
196
  json_out: bool = typer.Option(
@@ -317,6 +360,7 @@ def _resolve_base_ref(repo_root: Path, requested: str | None) -> str:
317
360
 
318
361
 
319
362
  @app.command()
363
+ @_propagate_failures
320
364
  def diff(
321
365
  base: str | None = typer.Option(
322
366
  None, "--base", help="Base ref (default: origin/main → main → HEAD~1)."
@@ -390,7 +434,9 @@ def diff(
390
434
  cards.append(rc.build(repo_root, f, cache=cache))
391
435
  except gf.GitError:
392
436
  continue
393
- cards.sort(key=lambda c: -c.score.value)
437
+ # Stable tie-break: lex smallest path on identical scores so cache
438
+ # and --no-cache truncate the same files at --top N.
439
+ cards.sort(key=lambda c: (-c.score.value, c.path))
394
440
  cards = cards[:top]
395
441
  finally:
396
442
  if cache is not None:
@@ -482,6 +528,7 @@ def diff(
482
528
 
483
529
 
484
530
  @app.command()
531
+ @_propagate_failures
485
532
  def highlights(
486
533
  invariants: int = typer.Option(
487
534
  5, "--invariants", help="How many invariant lines to surface."
@@ -529,16 +576,17 @@ def highlights(
529
576
 
530
577
  inv_pairs = gf.extract_invariant_quotes(commits)
531
578
  sha_to_commit = {c.sha: c for c in commits}
532
- seen_lines: dict[str, str] = {}
533
- for sha, line in inv_pairs:
534
- seen_lines.setdefault(line, sha)
579
+ deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
535
580
  inv_records: list[tuple[str, str, gf.Commit]] = []
536
- for line, sha in seen_lines.items():
581
+ for sha, line in deduped:
537
582
  commit = sha_to_commit.get(sha)
538
583
  if commit is None:
539
584
  continue
540
585
  inv_records.append((line, sha, commit))
541
- inv_records.sort(key=lambda t: t[2].authored_at, reverse=True)
586
+ # Sort newest first; on identical timestamps fall back to lexicographically
587
+ # smallest sha so cache and --no-cache emit byte-identical output.
588
+ inv_records.sort(key=lambda t: t[1]) # secondary: sha asc
589
+ inv_records.sort(key=lambda t: t[2].authored_at, reverse=True) # primary
542
590
  inv_records = inv_records[:invariants]
543
591
 
544
592
  incident_records = gf.find_incidents(commits)[:incidents]
@@ -636,6 +684,7 @@ def _sample_indices(total: int, max_samples: int) -> list[int]:
636
684
 
637
685
 
638
686
  @app.command()
687
+ @_propagate_failures
639
688
  def timeline(
640
689
  path: str = typer.Argument(..., help="File path to inspect."),
641
690
  samples: int = typer.Option(
@@ -677,6 +726,12 @@ def timeline(
677
726
  top,
678
727
  )
679
728
  )
729
+ # Field-test report F14: ``timeline`` used to render rows in whatever
730
+ # non-monotonic order ``_sample_indices`` produced (uniform-across-index
731
+ # selection on a list whose ordering is git's parent traversal). Sort
732
+ # by date ascending before rendering so a reader can scan left-to-right
733
+ # without misreading the trajectory.
734
+ rows.sort(key=lambda r: r[0])
680
735
 
681
736
  if json_out:
682
737
  console.print_json(
@@ -714,6 +769,7 @@ def timeline(
714
769
 
715
770
 
716
771
  @app.command()
772
+ @_propagate_failures
717
773
  def scan(
718
774
  top: int = typer.Option(10, "--top", help="How many files to list."),
719
775
  sample: int = typer.Option(
@@ -783,7 +839,10 @@ def scan(
783
839
  if cache is not None:
784
840
  cache.close()
785
841
 
786
- cards.sort(key=lambda c: -c.score.value)
842
+ # Stable tie-break on identical scores: lexicographically smallest path
843
+ # so cache and --no-cache produce byte-identical text output for the
844
+ # same HEAD. Without this, the truncation at --top N is non-deterministic.
845
+ cards.sort(key=lambda c: (-c.score.value, c.path))
787
846
  top_cards = cards[:top]
788
847
  if not top_cards:
789
848
  # Be honest about what "no flagged files" actually means. A user who
@@ -811,6 +870,7 @@ def scan(
811
870
 
812
871
 
813
872
  @app.command()
873
+ @_propagate_failures
814
874
  def honest(
815
875
  path: str = typer.Argument(..., help="File path to inspect."),
816
876
  json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of prose."),
@@ -874,6 +934,7 @@ def honest(
874
934
 
875
935
 
876
936
  @app.command()
937
+ @_propagate_failures
877
938
  def show(
878
939
  sha: str = typer.Argument(..., help="Commit SHA (full or short) to inspect."),
879
940
  repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
@@ -903,7 +964,8 @@ def show(
903
964
  cards.append(rc.build(repo_root, change.path))
904
965
  except gf.GitError:
905
966
  continue
906
- cards.sort(key=lambda c: -c.score.value)
967
+ # Stable tie-break on identical scores: lex smallest path.
968
+ cards.sort(key=lambda c: (-c.score.value, c.path))
907
969
 
908
970
  if json_out:
909
971
  console.print_json(
@@ -981,6 +1043,7 @@ _MCP_SNIPPET = ''' {
981
1043
 
982
1044
 
983
1045
  @app.command()
1046
+ @_propagate_failures
984
1047
  def tour(
985
1048
  repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
986
1049
  no_cache: bool = typer.Option(
@@ -1018,29 +1081,50 @@ def tour(
1018
1081
 
1019
1082
  inv_pairs = gf.extract_invariant_quotes(commits)
1020
1083
  sha_to_commit = {c.sha: c for c in commits}
1021
- seen_lines: dict[str, str] = {}
1022
- for sha, line in inv_pairs:
1023
- seen_lines.setdefault(line, sha)
1084
+ deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
1085
+ # Sort newest first with sha-asc tie-break so cache and --no-cache
1086
+ # surface the same three lines in the same order.
1087
+ deduped_sorted = sorted(
1088
+ (p for p in deduped if p[0] in sha_to_commit),
1089
+ key=lambda p: p[0],
1090
+ )
1091
+ deduped_sorted.sort(
1092
+ key=lambda p: sha_to_commit[p[0]].authored_at, reverse=True
1093
+ )
1024
1094
  invariants_top = [
1025
- (line, sha_to_commit[sha])
1026
- for line, sha in seen_lines.items()
1027
- if sha in sha_to_commit
1095
+ (line, sha_to_commit[sha]) for sha, line in deduped_sorted
1028
1096
  ][:3]
1029
1097
  incidents_top = gf.find_incidents(commits)[:3]
1030
1098
 
1031
1099
  if invariants_top or incidents_top:
1032
- console.print("[bold yellow]Decisions and incidents[/bold yellow]")
1033
- for line, c in invariants_top:
1034
- console.print(f" [italic]{line}[/italic]")
1100
+ # Field-test report F16: the original tour rendered both classes
1101
+ # under one ``Decisions and incidents`` header, so a parenthetical
1102
+ # invariant prose line was visually indistinguishable from a real
1103
+ # incident commit. Render two subheads matching the layout
1104
+ # ``highlights`` already uses.
1105
+ if invariants_top:
1035
1106
  console.print(
1036
- f" [dim]{c.sha[:7]} {c.authored_at.date()} {c.author_name}[/dim]\n"
1107
+ f"[bold yellow]Stated invariants[/bold yellow] "
1108
+ f"[dim]({len(invariants_top)} most recent)[/dim]"
1037
1109
  )
1038
- for c in incidents_top:
1039
- subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
1040
- console.print(f" [red]{subj}[/red]")
1110
+ for line, c in invariants_top:
1111
+ console.print(f" [italic]{line}[/italic]")
1112
+ console.print(
1113
+ f" [dim]{c.sha[:7]} {c.authored_at.date()} "
1114
+ f"{c.author_name}[/dim]\n"
1115
+ )
1116
+ if incidents_top:
1041
1117
  console.print(
1042
- f" [dim]{c.sha[:7]} {c.authored_at.date()} {c.author_name}[/dim]\n"
1118
+ f"[bold red]Recent incidents[/bold red] "
1119
+ f"[dim]({len(incidents_top)} most recent)[/dim]"
1043
1120
  )
1121
+ for c in incidents_top:
1122
+ subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
1123
+ console.print(f" [red]{subj}[/red]")
1124
+ console.print(
1125
+ f" [dim]{c.sha[:7]} {c.authored_at.date()} "
1126
+ f"{c.author_name}[/dim]\n"
1127
+ )
1044
1128
  else:
1045
1129
  console.print(
1046
1130
  "[dim]No headline decisions or incidents in recent history.[/dim]"
@@ -1072,7 +1156,8 @@ def tour(
1072
1156
  ]
1073
1157
  if useful:
1074
1158
  cards.append(card)
1075
- cards.sort(key=lambda c: -c.score.value)
1159
+ # Stable tie-break: lex smallest path on identical scores.
1160
+ cards.sort(key=lambda c: (-c.score.value, c.path))
1076
1161
 
1077
1162
  if cards:
1078
1163
  console.print("[bold red]Top 3 risky files[/bold red]")
@@ -1113,6 +1198,7 @@ def tour(
1113
1198
 
1114
1199
 
1115
1200
  @app.command()
1201
+ @_propagate_failures
1116
1202
  def init(
1117
1203
  force: bool = typer.Option(
1118
1204
  False, "--force", "-f", help="Overwrite existing files instead of skipping."
whycode/git_facts.py CHANGED
@@ -18,10 +18,11 @@ from __future__ import annotations
18
18
 
19
19
  import re
20
20
  import subprocess
21
+ import sys
21
22
  from collections import Counter
22
23
  from collections.abc import Sequence
23
24
  from dataclasses import dataclass, field
24
- from datetime import datetime
25
+ from datetime import UTC, datetime
25
26
  from pathlib import Path
26
27
  from typing import TYPE_CHECKING
27
28
 
@@ -31,6 +32,17 @@ if TYPE_CHECKING:
31
32
  UNIT_SEP = "\x1f"
32
33
  RECORD_SEP = "\x1e"
33
34
 
35
+ # Per-process record of commits whose authored timestamp could not be parsed
36
+ # even after defensive normalisation. We surface these once per session via
37
+ # a single stderr line so a single bad record does not spam a per-line warning
38
+ # — never on every read, never to a network.
39
+ _UNPARSEABLE_TIMESTAMPS: set[str] = set()
40
+ _BAD_TZ_WARNING_EMITTED = False
41
+ # The Unix epoch as a tz-aware UTC datetime; used as a safe fallback when a
42
+ # commit's authored_at is irrecoverably malformed. Picked over datetime.min
43
+ # because callers expect a tz-aware value (signal age math compares to UTC).
44
+ _EPOCH_FALLBACK = datetime.fromtimestamp(0, UTC)
45
+
34
46
  # A commit subject/body containing one of these markers is treated as evidence
35
47
  # that the original author flagged something worth carrying forward.
36
48
  INCIDENT_TOKENS: tuple[str, ...] = (
@@ -69,6 +81,39 @@ _BREAKING_CC_RE = re.compile(
69
81
  _ISSUE_ID_RE = re.compile(
70
82
  r"(?:#\d+|\b[A-Z][A-Z0-9_]+-\d+|\bSEV[- ]?\d\b|\bP[01]\b)",
71
83
  )
84
# Security-advisory tokens fire as incidents on subject alone — the deliberate
# act of citing one is unambiguous high-confidence evidence.
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d+\b")
_GHSA_RE = re.compile(r"\bGHSA-[a-z0-9-]+\b", re.IGNORECASE)
# Revert markers that are unambiguous incident-class evidence on subject:
#   * the subject ``git revert`` generates by default (``Revert "<subject>"``)
#     and its past-tense variant (``Reverted "<subject>"``);
#   * the human variant that points at a commit (``Reverts <sha>``).
_REVERTED_SUBJECT_RE = re.compile(r'^Revert(?:ed)?\s+"', re.IGNORECASE)
_REVERTS_SUBJECT_RE = re.compile(r"\bReverts\s+[0-9a-f]{7,}\b", re.IGNORECASE)
# Subject-level "regression" usage that is descriptive rather than incident:
# "regression test(s)", "regression suite", "no regression", "regression nature".
# These prevention/test-housekeeping phrases must NOT fire as incidents.
# The joiner may be whitespace or a hyphen so compound spellings such as
# "regression-test" are recognised as benign too; otherwise the hyphen would
# be picked up by the ``regression -`` alternative of the incident pattern.
_BENIGN_REGRESSION_RE = re.compile(
    r"\b(?:regression[\s-]+(?:tests?|suite|nature)|no\s+regression)\b",
    re.IGNORECASE,
)
# Conversely, a subject using "regression" with a corroborating incident id
# (``#1234``, ``INC-447``, …), or as part of an unambiguous incident phrase
# like ``regression in <something>`` / ``Fixed: regression`` / ``fix the
# regression``, IS an incident. These are the patterns that distinguish
# "split the regression-test files" (housekeeping) from "fix the refund
# regression" (a real outage marker).
_INCIDENT_REGRESSION_RE = re.compile(
    # ``regression in <something>`` — "fix the refund regression in admin".
    r"\bregression\s+in\b"
    # ``fix the regression`` / ``fix a regression`` — explicit incident verb.
    r"|\bfix(?:ed|es)?\s+(?:the\s+|a\s+)?regression\b"
    # ``regression — …`` / ``regression: …`` — stated subject category.
    r"|\bregression\s*[:—-]"
    # ``Fixed: regression`` / ``Hotfix: regression`` — pre-colon incident verb
    # explicitly framing the rest as the incident category itself.
    r"|\b(?:fix(?:ed|es)?|hotfix|revert(?:ed)?)\s*:\s*regression\b",
    re.IGNORECASE,
)
72
117
  INVARIANT_TOKENS: tuple[str, ...] = (
73
118
  "do not",
74
119
  "don't",
@@ -187,8 +232,94 @@ def is_tracked(repo_root: Path, path: str) -> bool:
187
232
  return bool(out.strip())
188
233
 
189
234
 
235
def _normalise_tz_offset(timestamp: str) -> str:
    """Repair pathological tz offsets that ``datetime.fromisoformat`` rejects.

    Real-world git history contains commits authored on systems with broken
    timezone configuration (e.g. an offset of ``+518:00`` or ``+51800`` —
    encountered on a 2011 commit in psf/requests). ``fromisoformat`` raises
    ``ValueError`` on those, which would otherwise poison every command that
    walks history.

    The offset is taken to start at the first ``+`` or ``-`` after the ``T``
    separator. Colons are ignored and the remaining digits are read by count:

    ======  =========================  ============================
    digits  interpretation             example
    ======  =========================  ============================
    2       ``HH``                     ``+05``     -> ``+05:00``
    4       ``HHMM``                   ``-0530``   -> ``-05:30``
    5       ``HMMSS`` (seconds cut)    ``+51800``  -> ``+05:18``
    6       ``HHMMSS`` (seconds cut)   ``+053015`` -> ``+05:30``
    ======  =========================  ============================

    Args:
        timestamp: An ISO-8601-like timestamp; surrounding whitespace is
            ignored.

    Returns:
        The timestamp with its offset rewritten as ``[+-]HH:MM`` when the
        offset is recognised and in range (hours <= 23, minutes <= 59).
        Otherwise the stripped input unchanged, so the caller can fall back
        to a safe default.
    """
    stripped = timestamp.strip()
    date_part, separator, time_part = stripped.partition("T")
    if not separator:
        return stripped

    sign_idx = next(
        (i for i, char in enumerate(time_part) if char in "+-"),
        -1,
    )
    if sign_idx < 0:
        return stripped

    sign = time_part[sign_idx]
    digits = time_part[sign_idx + 1 :].replace(":", "")
    # ``str.isdigit`` alone also accepts non-ASCII digits (superscripts,
    # Arabic-Indic numerals, ...), which would leak into the "canonical"
    # result; insist on plain ASCII so the output is always ``HH:MM``.
    if not (digits.isascii() and digits.isdigit()):
        return stripped

    if len(digits) == 2:
        hh, mm = digits, "00"
    elif len(digits) == 4:
        hh, mm = digits[:2], digits[2:]
    elif len(digits) == 5:
        # Broken single-digit-hour form: ``51800`` is hour 5, minute 18.
        hh, mm = "0" + digits[0], digits[1:3]
    elif len(digits) == 6:
        hh, mm = digits[:2], digits[2:4]
    else:
        return stripped

    if int(hh) > 23 or int(mm) > 59:
        return stripped
    return f"{date_part}T{time_part[:sign_idx]}{sign}{hh}:{mm}"
283
+
284
+
190
285
def _parse_iso(timestamp: str) -> datetime:
    """Parse an ISO 8601 timestamp, tolerating malformed tz offsets.

    Strategy, in order:
      1. Parse the stripped string as-is.
      2. If that fails, run the tz-offset repair and, when it changed
         anything, parse the repaired string.
      3. If both fail, remember the raw string in the per-session set of
         unparseable timestamps and return the Unix-epoch sentinel.

    A single bad record therefore never raises out of this function.
    """
    raw = timestamp.strip()

    try:
        return datetime.fromisoformat(raw)
    except ValueError:
        pass

    candidate = _normalise_tz_offset(raw)
    if candidate != raw:
        try:
            return datetime.fromisoformat(candidate)
        except ValueError:
            pass

    # Irrecoverable: record it for the one-off warning and fall back.
    _UNPARSEABLE_TIMESTAMPS.add(raw)
    return _EPOCH_FALLBACK
305
+
306
+
307
def _maybe_warn_bad_timestamps() -> None:
    """Print one stderr warning per session if any timestamp fell back to epoch.

    Does nothing when no unparseable timestamp has been recorded or when
    the warning has already been printed. Otherwise it sets the
    module-level latch and writes a single summary line to stderr. The
    report is purely local: no network, no telemetry.
    """
    global _BAD_TZ_WARNING_EMITTED

    n = len(_UNPARSEABLE_TIMESTAMPS)
    if n == 0 or _BAD_TZ_WARNING_EMITTED:
        return

    _BAD_TZ_WARNING_EMITTED = True
    plural = "" if n == 1 else "s"
    message = (
        f"warning: {n} commit{plural} had an unparseable "
        "authored timestamp; treating those as epoch for date math."
    )
    print(message, file=sys.stderr)
192
323
 
193
324
 
194
325
  def _log_format() -> str:
@@ -285,6 +416,7 @@ def commits_for_path(
285
416
  cache.store_path_log(path, head_sha, [c.sha for c in commits])
286
417
  if max_count is not None and len(commits) > max_count:
287
418
  commits = commits[:max_count]
419
+ _maybe_warn_bad_timestamps()
288
420
  return commits
289
421
 
290
422
 
@@ -342,12 +474,15 @@ def all_commits(
342
474
  """
343
475
  if cache is not None:
344
476
  full = _all_commits_via_cache(repo_root, cache)
477
+ _maybe_warn_bad_timestamps()
345
478
  return full if max_count is None else full[:max_count]
346
479
  args = ["log", "--no-merges", f"--pretty=format:{_log_format()}"]
347
480
  if max_count is not None:
348
481
  args.append(f"--max-count={max_count}")
349
482
  raw = _run_git(repo_root, *args)
350
- return _parse_log_records(raw)
483
+ out = _parse_log_records(raw)
484
+ _maybe_warn_bad_timestamps()
485
+ return out
351
486
 
352
487
 
353
488
  def _store_commits(cache: CacheStore, commits: Sequence[Commit]) -> None:
@@ -624,26 +759,77 @@ def find_revert_pairs(commits: Sequence[Commit]) -> list[tuple[str, str]]:
624
759
  return pairs
625
760
 
626
761
 
762
def _is_subject_incident(subject: str, body: str) -> bool:
    """Determine whether a single commit's subject signals incident intent.

    Checks, in order:

    1. A security advisory (``CVE-…`` / ``GHSA-…``) in the subject always
       fires — naming an advisory is unambiguous high-confidence evidence.
    2. A revert subject (quoted-subject form or ``Reverts <sha>``) always
       fires — both are explicit rollback markers.
    3. The Conventional Commits breaking marker always fires.
    4. ``regression`` is the tricky case. The 0.4.0 classifier accepted any
       subject containing the word and so flagged routine commits that
       merely mentioned "regression tests". The rule now is:

       * Benign phrases (``regression test(s)``, ``regression suite``,
         ``no regression``, ``regression nature``) are removed from
         consideration first; they never fire on their own.
       * A remaining ``regression`` fires only when corroborated: either it
         is part of an anchoring phrase (``regression in <something>``,
         ``fix the regression``, ``Fixed: regression``, ``regression — …``)
         or an issue / incident id appears in the subject or body.
    5. Any other incident keyword (``hotfix``, ``outage``, ``rollback``, …)
       keeps its subject-level acceptance, whether or not the subject also
       mentions regression.

    Args:
        subject: The commit subject line.
        body: The commit body; consulted only for a corroborating issue id.

    Returns:
        ``True`` when the subject should be treated as incident evidence.
    """
    if _CVE_RE.search(subject) or _GHSA_RE.search(subject):
        return True
    if _REVERTED_SUBJECT_RE.search(subject) or _REVERTS_SUBJECT_RE.search(subject):
        return True
    if _BREAKING_CC_RE.search(subject):
        return True

    # Neutralise benign phrases instead of letting them veto the whole
    # subject: "hotfix: payment outage, add regression test" must still be
    # judged on "hotfix" / "outage", and "fix the regression in X; add
    # regression test" must still be judged on its first clause.
    residual = _BENIGN_REGRESSION_RE.sub(" ", subject)

    if re.search(r"\bregression\b", residual, re.IGNORECASE):
        # A non-benign mention remains: it needs an anchoring phrase or an
        # issue / incident id to count.
        if _INCIDENT_REGRESSION_RE.search(residual):
            return True
        if _ISSUE_ID_RE.search(subject) or _ISSUE_ID_RE.search(body):
            return True
        # Uncorroborated: drop the word so it cannot satisfy the generic
        # keyword check below, but let other keywords carry the subject —
        # a "rollback regression" should still fire on "rollback" alone.
        residual = re.sub(r"\bregression\b", "", residual, flags=re.IGNORECASE)

    return bool(_INCIDENT_RE.search(residual))
808
+
809
+
627
810
  def find_incidents(commits: Sequence[Commit]) -> list[Commit]:
628
811
  """Return commits whose evidence-level signals incident-flavored intent.
629
812
 
630
813
  Acceptance ladder (highest to lowest confidence):
631
- 1. Subject contains an incident keyword. A commit's subject is its
632
- declared purpose, so a subject hit is treated as ground truth.
633
- 2. Subject carries the Conventional Commits breaking marker
814
+ 1. Subject cites a security advisory (``CVE-…`` / ``GHSA-…``) — fires
815
+ on subject alone.
816
+ 2. Subject is a default ``git revert`` body (``Reverted "…"``) or a
817
+ human revert pointer (``Reverts <sha>``) — fires on subject alone.
818
+ 3. Subject carries the Conventional Commits breaking marker
634
819
  (``feat!:`` / ``fix!:`` / …).
635
- 3. Body carries the structured ``BREAKING CHANGE:`` footer. This is a
636
- deliberate, anchored marker, not free-form prose.
637
- 4. Body contains an incident keyword AND an issue / incident
638
- identifier nearby (``#1234``, ``INC-447``, ``SEV-1``, ``P0``).
639
- This filters out passing mentions in prose like "feat: add
640
- incident-aware logging" where the keyword describes a *feature*.
820
+ 4. Subject contains an incident keyword that is NOT a benign
821
+ "regression test/suite/nature" phrase. ``regression`` requires
822
+ either an issue id (on subject or body) or a pre-marker that
823
+ anchors it as an incident reference.
824
+ 5. Body carries the structured ``BREAKING CHANGE:`` footer.
825
+ 6. Body contains an incident keyword AND an issue / incident
826
+ identifier nearby. Filters out passing mentions in prose.
641
827
 
642
828
  A bare body keyword with no corroborating ID does NOT fire.
643
829
  """
644
830
  out: list[Commit] = []
645
831
  for c in commits:
646
- if _INCIDENT_RE.search(c.subject) or _BREAKING_CC_RE.search(c.subject):
832
+ if _is_subject_incident(c.subject, c.body):
647
833
  out.append(c)
648
834
  continue
649
835
  if _BREAKING_FOOTER_RE.search(c.body):
@@ -696,6 +882,41 @@ def _all_matches_are_quoted(line: str, regex: re.Pattern[str]) -> bool:
696
882
  return True
697
883
 
698
884
 
885
# Pasted compiler / linter / spell-checker output typically opens with an
# ALLCAPS severity tag and a colon (``WARNING:``, ``ERROR:``, ``DEBUG:``).
# Human-written invariant statements are assumed to open with an ordinary
# sentence ("Do not...", "Important: ...") instead, so two or more uppercase
# letters followed directly by a colon and whitespace marks tool output.
_TOOL_OUTPUT_ALLCAPS_RE = re.compile(r"^[A-Z]{2,}:\s")
# The other tell-tale shape is a ``path:line:`` (or ``path:line:col:``)
# prefix. Any path-like token (word characters, slashes, dots, hyphens)
# followed by ``:<digits>:`` counts, anchored at the start of the line so
# forms such as ``./foo/bar.py:50:`` are caught as well.
_TOOL_OUTPUT_PATH_RE = re.compile(r"^[\w./-]+:\d+:")
# Upper bound on invariant lines taken from a single commit body. Authors
# rarely state more than two crisp invariants in one message; more than
# that is almost certainly a paste. Kept deliberately low so one noisy
# commit cannot dominate the "highlights" view.
_PER_COMMIT_INVARIANT_CAP = 2


def _is_tool_output_line(line: str, prev_line: str) -> bool:
    """Return True if ``line`` looks like quoted compiler / linter output.

    A line is treated as tool output when any of these holds:
      - it starts with an ALLCAPS tag and a colon (``WARNING: …``);
      - it starts with a ``path/to/file:<line>:`` prefix (the shape
        clang / mypy / aspell diagnostics take);
      - the line before it starts with ``"> "``, i.e. it follows a
        markdown-style block-quote line.
    """
    matches_tool_shape = (
        _TOOL_OUTPUT_ALLCAPS_RE.match(line) is not None
        or _TOOL_OUTPUT_PATH_RE.match(line) is not None
    )
    return matches_tool_shape or prev_line.startswith("> ")
918
+
919
+
699
920
  def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]:
700
921
  """Pull lines from commit *bodies* that match invariant tokens.
701
922
 
@@ -706,23 +927,87 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
706
927
  eliminates the meta-mention failure mode where a commit *about* an
707
928
  invariant token (e.g. "fix invariant matcher") would self-flag.
708
929
 
930
+ Two filters keep pasted tool output out of the "stated invariants"
931
+ surface:
932
+
933
+ 1. Lines that look like quoted compiler / linter / aspell output are
934
+ dropped (``WARNING: …``, ``foo/bar.py:50: …``, lines preceded by a
935
+ ``> `` block-quote). One noisy spell-check commit on django used to
936
+ supply 15 of the top-20 highlights; this rule kills it at the
937
+ source.
938
+ 2. A per-commit cap of two invariants. Real authors rarely state more
939
+ than two crisp constraints in one message; anything beyond is
940
+ almost certainly a paste. The first two matches are preserved
941
+ in body order; any further matches in that commit are dropped.
942
+
709
943
  Lines where every matching token is wrapped in quotes (``"do not"``) are
710
944
  treated as references rather than statements and are skipped.
711
945
  """
712
946
  out: list[tuple[str, str]] = []
713
947
  for commit in commits:
948
+ per_commit = 0
949
+ prev_line = ""
714
950
  for raw_line in commit.body.splitlines():
715
951
  line = raw_line.strip()
716
952
  if not line:
953
+ prev_line = raw_line
717
954
  continue
955
+ if _is_tool_output_line(line, prev_line):
956
+ prev_line = raw_line
957
+ continue
958
+ prev_line = raw_line
718
959
  if not _INVARIANT_RE.search(line):
719
960
  continue
720
961
  if _all_matches_are_quoted(line, _INVARIANT_RE):
721
962
  continue
963
+ if per_commit >= _PER_COMMIT_INVARIANT_CAP:
964
+ continue
722
965
  out.append((commit.sha, line[:200]))
966
+ per_commit += 1
723
967
  return out
724
968
 
725
969
 
970
+ def dedupe_invariant_lines(
971
+ pairs: Sequence[tuple[str, str]],
972
+ sha_to_commit: dict[str, Commit],
973
+ ) -> list[tuple[str, str]]:
974
+ """Collapse identical invariant lines to one canonical (sha, line) pair.
975
+
976
+ When two commits state the same invariant line — typically a cherry-pick
977
+ onto a maintenance branch, or a rebase that duplicated the message — we
978
+ must pick exactly one to surface. Without a deterministic rule the cache
979
+ and ``--no-cache`` paths can disagree (their walk orders differ when
980
+ timestamps tie), and downstream JSON consumers see flaky output across
981
+ runs.
982
+
983
+ The rule:
984
+
985
+ 1. Earliest ``authored_at`` wins. The original statement is canonical;
986
+ cherry-picks and rebases are derivatives.
987
+ 2. Lexicographically smallest ``sha`` breaks ties on identical timestamps.
988
+
989
+ The returned list preserves first-encounter order of the (now-unique)
990
+ lines so downstream code that sorts by date sees a stable input.
991
+ Pairs whose ``sha`` is not in ``sha_to_commit`` keep their first-seen
992
+ record (no metadata to compare on).
993
+ """
994
+ canonical: dict[str, str] = {}
995
+ for sha, line in pairs:
996
+ existing = canonical.get(line)
997
+ if existing is None:
998
+ canonical[line] = sha
999
+ continue
1000
+ old_commit = sha_to_commit.get(existing)
1001
+ new_commit = sha_to_commit.get(sha)
1002
+ if old_commit is None or new_commit is None:
1003
+ continue
1004
+ old_key = (old_commit.authored_at, existing)
1005
+ new_key = (new_commit.authored_at, sha)
1006
+ if new_key < old_key:
1007
+ canonical[line] = sha
1008
+ return [(sha, line) for line, sha in canonical.items()]
1009
+
1010
+
726
1011
  def author_last_activity(repo_root: Path, email: str) -> datetime | None:
727
1012
  """Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
728
1013
  raw = _run_git(
whycode/ignore.py CHANGED
@@ -3,7 +3,16 @@
3
3
  These are paths/files that almost always pollute risk analysis without
4
4
  adding signal: changelogs (touched on every release, so they look "tightly
5
5
  coupled to everything"), lockfiles (regenerated on every dependency bump),
6
- vendored third-party code, and machine-generated stubs.
6
+ vendored third-party code, machine-generated stubs, CI / packaging
7
+ metadata, project-membership files (``AUTHORS``, ``LICENSE``), and
8
+ translation catalogues (``*.po`` / ``*.mo``).
9
+
10
+ A field test against django (10,000 commits, 7,043 files) showed the
11
+ top-10 risk list was dominated by these high-touch metadata files —
12
+ ``AUTHORS``, ``.github/workflows/*.yml``, locale ``.po``, ``.gitignore``
13
+ — and no application code at all reached the top 10. A scan-top list
14
+ that surfaces zero source files is unactionable; demoting these
15
+ metadata files lets real source code rank.
7
16
 
8
17
  Users can extend this list with a ``.whycodeignore`` file at repo root,
9
18
  one ``fnmatch``-style pattern per line. Comments start with ``#``.
@@ -71,6 +80,49 @@ DEFAULT_IGNORE_PATTERNS: tuple[str, ...] = (
71
80
  "*.ttf",
72
81
  "*.otf",
73
82
  "*.eot",
83
+ # CI / repo metadata — high-touch but never the source of risk in code.
84
+ ".github/**",
85
+ ".gitlab/**",
86
+ ".circleci/**",
87
+ ".gitignore",
88
+ ".gitattributes",
89
+ ".editorconfig",
90
+ ".pre-commit-config.yaml",
91
+ ".readthedocs.yaml",
92
+ ".readthedocs.yml",
93
+ ".flake8",
94
+ ".coveragerc",
95
+ "tox.ini",
96
+ "pytest.ini",
97
+ "Makefile",
98
+ # Project-membership / licensing files — touched on every contributor add.
99
+ "AUTHORS",
100
+ "AUTHORS.*",
101
+ "CONTRIBUTORS",
102
+ "CONTRIBUTORS.*",
103
+ "LICENSE",
104
+ "LICENSE.*",
105
+ "LICENSES/**",
106
+ "COPYING",
107
+ "COPYING.*",
108
+ "NOTICE",
109
+ "NOTICE.*",
110
+ # Python packaging metadata — low-signal-per-touch.
111
+ "setup.py",
112
+ "setup.cfg",
113
+ "MANIFEST.in",
114
+ # Translation catalogues — bulk-edited every release, never an indicator
115
+ # of code risk.
116
+ "*.po",
117
+ "*.mo",
118
+ "*.pot",
119
+ # Release-notes-style ``*.txt`` files only — narrow patterns; we are
120
+ # deliberately conservative here so a random ``requirements.txt`` is not
121
+ # ignored. The shapes below match common repo layouts (django, flask).
122
+ "release_notes/*.txt",
123
+ "docs/releases/*.txt",
124
+ "docs/release-notes/*.txt",
125
+ "release-notes/*.txt",
74
126
  )
75
127
 
76
128
  _USER_IGNORE_FILE = ".whycodeignore"
whycode/signals.py CHANGED
@@ -13,6 +13,7 @@ from enum import StrEnum
13
13
  from typing import TYPE_CHECKING
14
14
 
15
15
  from whycode import git_facts as gf
16
+ from whycode import ignore as ign
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from whycode.git_facts import RepoFacts
@@ -148,7 +149,23 @@ def detect_high_churn(facts: RepoFacts) -> Signal | None:
148
149
 
149
150
 
150
151
  def detect_coupling(facts: RepoFacts) -> Signal | None:
151
- paired = [(p, n) for p, n in facts.co_changed_files.items() if n >= COUPLING_MIN_COCHANGES]
152
+ """Files that change together with the target file, ranked by frequency.
153
+
154
+ Co-change candidates are filtered through the same ignore list that
155
+ powers ``whycode scan`` (built-in defaults plus an optional repo-local
156
+ ``.whycodeignore``). Without this filter, a per-file coupling signal
157
+ would surface ``CHANGELOG``, ``.github/workflows/*.yml``, ``AUTHORS``
158
+ and similar high-touch metadata as the file's "tight coupling" — the
159
+ field-test report flagged ``flask/app.py``'s top co-changers as 60%
160
+ metadata, leaving only two genuinely informative entries. Applying
161
+ the same filter here keeps the most-shown signal honest.
162
+ """
163
+ patterns = ign.effective_patterns(facts.repo_root)
164
+ paired = [
165
+ (p, n)
166
+ for p, n in facts.co_changed_files.items()
167
+ if n >= COUPLING_MIN_COCHANGES and not ign.is_ignored(p, patterns)
168
+ ]
152
169
  if not paired:
153
170
  return None
154
171
  paired.sort(key=lambda x: (-x[1], x[0]))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: whycode-cli
3
- Version: 0.4.0
3
+ Version: 0.4.2
4
4
  Summary: Tells you what to be afraid of before you touch a file.
5
5
  Author: Kevin
6
6
  License-Expression: MIT
@@ -1,22 +1,22 @@
1
- whycode/__init__.py,sha256=DT8PsrrvRkCF3U7cRgChch8aCNUobwQ9iFpPowLmLWY,96
1
+ whycode/__init__.py,sha256=YXMeIO9f86OJ3_EonP3wlcLW6Qv9sIHQQZqr-Ja4HV8,96
2
2
  whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
3
- whycode/cache.py,sha256=v55KbSlTqmP_ot1FEFqxCNpAApj6vthpHl2l0lGLX3A,17477
4
- whycode/cli.py,sha256=97LJmxOYBTtHkEtMlCabIogCxJNswOGusAdpvT3-mf8,45146
3
+ whycode/cache.py,sha256=0cEPZHdolQbSiBLAOnMu20tobIrc7G0MNycpldHRpkk,18536
4
+ whycode/cli.py,sha256=uRW5aysC2ufYvs_qPC1gzZcjQTFUZHdXxAmF25d4oY8,49328
5
5
  whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
6
- whycode/git_facts.py,sha256=cKPywdrAsQBsPl7R4kLO5zBAELmXlhoy23g29XjuK18,29044
7
- whycode/ignore.py,sha256=sdRO_0HSedm8aO69CSGl-zQrUVX5MEg9QGcAJWwAvP4,3021
6
+ whycode/git_facts.py,sha256=MLp8e4nGaam6lBGCHY5-sftHj71lyg_HmmBOBx3g-kg,41829
7
+ whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
8
8
  whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
9
9
  whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
10
10
  whycode/risk_card.py,sha256=xOJkHwIkS_6yw_dSowsQ6LHfeD9Mwr2tymL7_wqxs0U,8855
11
11
  whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
12
- whycode/signals.py,sha256=M2x6868G1YQ4eWoIuwE0PMjurCoZn1jyJWySLF7FlW0,13085
12
+ whycode/signals.py,sha256=z0kZfXR60nS-j56nchHd1V3aK8A5CGR1BAyHZZAff3s,13899
13
13
  whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
14
14
  whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
16
16
  whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
17
- whycode_cli-0.4.0.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
18
- whycode_cli-0.4.0.dist-info/METADATA,sha256=3VurI0V9_AtQdTTC8Fyis3C3pulEIdEe_bMC4_iH7xs,10218
19
- whycode_cli-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
- whycode_cli-0.4.0.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
21
- whycode_cli-0.4.0.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
22
- whycode_cli-0.4.0.dist-info/RECORD,,
17
+ whycode_cli-0.4.2.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
18
+ whycode_cli-0.4.2.dist-info/METADATA,sha256=GD3cP18eEcHePHEXxroFuuZ-2pysLn51biNROQKDBXw,10218
19
+ whycode_cli-0.4.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ whycode_cli-0.4.2.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
21
+ whycode_cli-0.4.2.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
22
+ whycode_cli-0.4.2.dist-info/RECORD,,