whycode-cli 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
whycode/__init__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """WhyCode — tells you what to be afraid of before touching a file."""
2
2
 
3
- __version__ = "0.4.0"
3
+ __version__ = "0.4.1"
whycode/cli.py CHANGED
@@ -20,10 +20,12 @@ Commands
20
20
 
21
21
  from __future__ import annotations
22
22
 
23
+ import functools
23
24
  import json
24
25
  import sys
26
+ from collections.abc import Callable
25
27
  from pathlib import Path
26
- from typing import Any
28
+ from typing import Any, TypeVar
27
29
 
28
30
  import typer
29
31
  from rich.console import Console
@@ -115,6 +117,37 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
115
117
  return repo_root, rel
116
118
 
117
119
 
120
+ _F = TypeVar("_F", bound=Callable[..., Any])
121
+
122
+
123
def _propagate_failures(func: _F) -> _F:
    """Decorate a command so an unexpected crash exits with status ``2``.

    Background: a read-only field test against psf/requests hit a commit
    whose timezone could not be parsed. The resulting ``ValueError`` was
    rendered by Rich on stderr, yet the process still exited ``0``, so a CI
    step such as ``whycode diff --fail-on history`` was reported green even
    though it had crashed.

    The wrapper keeps the rich traceback rendering and then raises
    ``typer.Exit(2)`` (general failure), chained to the original exception.
    ``typer.Exit`` and ``typer.Abort`` are re-raised untouched so explicit
    exit-code paths keep working. ``KeyboardInterrupt`` is not an
    ``Exception`` subclass, so Ctrl-C is never intercepted here.

    NOTE(review): any other ``Exception`` subclass is converted, which
    includes click's ``ClickException`` family (e.g. ``typer.BadParameter``)
    if a command body ever raises one — confirm that is intended.
    """

    @functools.wraps(func)
    def guarded(*args: Any, **kwargs: Any) -> Any:
        try:
            result = func(*args, **kwargs)
        except Exception as exc:
            # Deliberate exit requests are not failures; hand them straight
            # back to typer.
            if isinstance(exc, (typer.Exit, typer.Abort)):
                raise
            err.print_exception(show_locals=False)
            raise typer.Exit(2) from exc
        return result

    return guarded  # type: ignore[return-value]
149
+
150
+
118
151
  # --- shared: band threshold parsing ----------------------------------------
119
152
 
120
153
  _BAND_THRESHOLDS_BY_KEY: dict[str, int] = {
@@ -148,6 +181,7 @@ def _print_brief(card: rc.RiskCard) -> None:
148
181
 
149
182
 
150
183
  @app.command()
184
+ @_propagate_failures
151
185
  def why(
152
186
  path: str = typer.Argument(..., help="File path to inspect."),
153
187
  json_out: bool = typer.Option(
@@ -317,6 +351,7 @@ def _resolve_base_ref(repo_root: Path, requested: str | None) -> str:
317
351
 
318
352
 
319
353
  @app.command()
354
+ @_propagate_failures
320
355
  def diff(
321
356
  base: str | None = typer.Option(
322
357
  None, "--base", help="Base ref (default: origin/main → main → HEAD~1)."
@@ -482,6 +517,7 @@ def diff(
482
517
 
483
518
 
484
519
  @app.command()
520
+ @_propagate_failures
485
521
  def highlights(
486
522
  invariants: int = typer.Option(
487
523
  5, "--invariants", help="How many invariant lines to surface."
@@ -636,6 +672,7 @@ def _sample_indices(total: int, max_samples: int) -> list[int]:
636
672
 
637
673
 
638
674
  @app.command()
675
+ @_propagate_failures
639
676
  def timeline(
640
677
  path: str = typer.Argument(..., help="File path to inspect."),
641
678
  samples: int = typer.Option(
@@ -677,6 +714,12 @@ def timeline(
677
714
  top,
678
715
  )
679
716
  )
717
+ # Field-test report F14: ``timeline`` used to render rows in whatever
718
+ # non-monotonic order ``_sample_indices`` produced (uniform-across-index
719
+ # selection on a list whose ordering is git's parent traversal). Sort
720
+ # by date ascending before rendering so a reader can scan left-to-right
721
+ # without misreading the trajectory.
722
+ rows.sort(key=lambda r: r[0])
680
723
 
681
724
  if json_out:
682
725
  console.print_json(
@@ -714,6 +757,7 @@ def timeline(
714
757
 
715
758
 
716
759
  @app.command()
760
+ @_propagate_failures
717
761
  def scan(
718
762
  top: int = typer.Option(10, "--top", help="How many files to list."),
719
763
  sample: int = typer.Option(
@@ -811,6 +855,7 @@ def scan(
811
855
 
812
856
 
813
857
  @app.command()
858
+ @_propagate_failures
814
859
  def honest(
815
860
  path: str = typer.Argument(..., help="File path to inspect."),
816
861
  json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of prose."),
@@ -874,6 +919,7 @@ def honest(
874
919
 
875
920
 
876
921
  @app.command()
922
+ @_propagate_failures
877
923
  def show(
878
924
  sha: str = typer.Argument(..., help="Commit SHA (full or short) to inspect."),
879
925
  repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
@@ -981,6 +1027,7 @@ _MCP_SNIPPET = ''' {
981
1027
 
982
1028
 
983
1029
  @app.command()
1030
+ @_propagate_failures
984
1031
  def tour(
985
1032
  repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
986
1033
  no_cache: bool = typer.Option(
@@ -1029,18 +1076,34 @@ def tour(
1029
1076
  incidents_top = gf.find_incidents(commits)[:3]
1030
1077
 
1031
1078
  if invariants_top or incidents_top:
1032
- console.print("[bold yellow]Decisions and incidents[/bold yellow]")
1033
- for line, c in invariants_top:
1034
- console.print(f" [italic]{line}[/italic]")
1079
+ # Field-test report F16: the original tour rendered both classes
1080
+ # under one ``Decisions and incidents`` header, so a parenthetical
1081
+ # invariant prose line was visually indistinguishable from a real
1082
+ # incident commit. Render two subheads matching the layout
1083
+ # ``highlights`` already uses.
1084
+ if invariants_top:
1035
1085
  console.print(
1036
- f" [dim]{c.sha[:7]} {c.authored_at.date()} {c.author_name}[/dim]\n"
1086
+ f"[bold yellow]Stated invariants[/bold yellow] "
1087
+ f"[dim]({len(invariants_top)} most recent)[/dim]"
1037
1088
  )
1038
- for c in incidents_top:
1039
- subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
1040
- console.print(f" [red]{subj}[/red]")
1089
+ for line, c in invariants_top:
1090
+ console.print(f" [italic]{line}[/italic]")
1091
+ console.print(
1092
+ f" [dim]{c.sha[:7]} {c.authored_at.date()} "
1093
+ f"{c.author_name}[/dim]\n"
1094
+ )
1095
+ if incidents_top:
1041
1096
  console.print(
1042
- f" [dim]{c.sha[:7]} {c.authored_at.date()} {c.author_name}[/dim]\n"
1097
+ f"[bold red]Recent incidents[/bold red] "
1098
+ f"[dim]({len(incidents_top)} most recent)[/dim]"
1043
1099
  )
1100
+ for c in incidents_top:
1101
+ subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
1102
+ console.print(f" [red]{subj}[/red]")
1103
+ console.print(
1104
+ f" [dim]{c.sha[:7]} {c.authored_at.date()} "
1105
+ f"{c.author_name}[/dim]\n"
1106
+ )
1044
1107
  else:
1045
1108
  console.print(
1046
1109
  "[dim]No headline decisions or incidents in recent history.[/dim]"
@@ -1113,6 +1176,7 @@ def tour(
1113
1176
 
1114
1177
 
1115
1178
  @app.command()
1179
+ @_propagate_failures
1116
1180
  def init(
1117
1181
  force: bool = typer.Option(
1118
1182
  False, "--force", "-f", help="Overwrite existing files instead of skipping."
whycode/git_facts.py CHANGED
@@ -18,10 +18,11 @@ from __future__ import annotations
18
18
 
19
19
  import re
20
20
  import subprocess
21
+ import sys
21
22
  from collections import Counter
22
23
  from collections.abc import Sequence
23
24
  from dataclasses import dataclass, field
24
- from datetime import datetime
25
+ from datetime import UTC, datetime
25
26
  from pathlib import Path
26
27
  from typing import TYPE_CHECKING
27
28
 
@@ -31,6 +32,17 @@ if TYPE_CHECKING:
31
32
  UNIT_SEP = "\x1f"
32
33
  RECORD_SEP = "\x1e"
33
34
 
35
+ # Per-process record of commits whose authored timestamp could not be parsed
36
+ # even after defensive normalisation. We surface these once per session via
37
+ # a single stderr line so a single bad record does not spam a per-line warning
38
+ # — never on every read, never to a network.
39
+ _UNPARSEABLE_TIMESTAMPS: set[str] = set()
40
+ _BAD_TZ_WARNING_EMITTED = False
41
+ # The Unix epoch as a tz-aware UTC datetime; used as a safe fallback when a
42
+ # commit's authored_at is irrecoverably malformed. Picked over datetime.min
43
+ # because callers expect a tz-aware value (signal age math compares to UTC).
44
+ _EPOCH_FALLBACK = datetime.fromtimestamp(0, UTC)
45
+
34
46
  # A commit subject/body containing one of these markers is treated as evidence
35
47
  # that the original author flagged something worth carrying forward.
36
48
  INCIDENT_TOKENS: tuple[str, ...] = (
@@ -69,6 +81,39 @@ _BREAKING_CC_RE = re.compile(
69
81
  _ISSUE_ID_RE = re.compile(
70
82
  r"(?:#\d+|\b[A-Z][A-Z0-9_]+-\d+|\bSEV[- ]?\d\b|\bP[01]\b)",
71
83
  )
84
+ # Security-advisory tokens fire as incidents on subject alone — the deliberate
85
+ # act of citing one is unambiguous high-confidence evidence.
86
+ _CVE_RE = re.compile(r"\bCVE-\d{4}-\d+\b")
87
+ _GHSA_RE = re.compile(r"\bGHSA-[a-z0-9-]+\b", re.IGNORECASE)
88
+ # Default ``git revert`` body subject ("Reverted ...") and the human variant
89
+ # ("Reverts <sha>") are both unambiguous incident-class evidence on subject.
90
+ _REVERTED_SUBJECT_RE = re.compile(r'^Reverted\s+"', re.IGNORECASE)
91
+ _REVERTS_SUBJECT_RE = re.compile(r"\bReverts\s+[0-9a-f]{7,}\b", re.IGNORECASE)
92
+ # Subject-level "regression" usage that is descriptive rather than incident:
93
+ # "regression test(s)", "regression suite", "no regression", "regression nature".
94
+ # These prevention/test-housekeeping phrases must NOT fire as incidents.
95
+ _BENIGN_REGRESSION_RE = re.compile(
96
+ r"\b(?:regression\s+(?:tests?|suite|nature)|no\s+regression)\b",
97
+ re.IGNORECASE,
98
+ )
99
+ # Conversely, a subject using "regression" with a corroborating incident id
100
+ # (``#1234``, ``INC-447``, …), or as part of an unambiguous incident phrase
101
+ # like ``regression in <something>`` / ``Fixed: regression`` / ``fix the
102
+ # regression``, IS an incident. These are the patterns that distinguish
103
+ # "split the regression-test files" (housekeeping) from "fix the refund
104
+ # regression" (a real outage marker).
105
+ _INCIDENT_REGRESSION_RE = re.compile(
106
+ # ``regression in <something>`` — "fix the refund regression in admin".
107
+ r"\bregression\s+in\b"
108
+ # ``fix the regression`` / ``fix a regression`` — explicit incident verb.
109
+ r"|\bfix(?:ed|es)?\s+(?:the\s+|a\s+)?regression\b"
110
+ # ``regression — …`` / ``regression: …`` — stated subject category.
111
+ r"|\bregression\s*[:—-]"
112
+ # ``Fixed: regression`` / ``Hotfix: regression`` — pre-colon incident verb
113
+ # explicitly framing the rest as the incident category itself.
114
+ r"|\b(?:fix(?:ed|es)?|hotfix|revert(?:ed)?)\s*:\s*regression\b",
115
+ re.IGNORECASE,
116
+ )
72
117
  INVARIANT_TOKENS: tuple[str, ...] = (
73
118
  "do not",
74
119
  "don't",
@@ -187,8 +232,94 @@ def is_tracked(repo_root: Path, path: str) -> bool:
187
232
  return bool(out.strip())
188
233
 
189
234
 
235
+ def _normalise_tz_offset(timestamp: str) -> str:
236
+ """Repair pathological tz offsets that ``datetime.fromisoformat`` rejects.
237
+
238
+ Real-world git history contains commits authored on systems with broken
239
+ timezone configuration (e.g. an offset of ``+518:00`` or ``+51800`` —
240
+ encountered on a 2011 commit in psf/requests, where the underlying object
241
+ really stores ``+51800``). ``fromisoformat`` raises ``ValueError`` on
242
+ those, which would otherwise poison every command that walks history.
243
+
244
+ We coerce the suffix into the canonical ``[+-]HH:MM`` form when we can
245
+ recognise it. Anything else is left untouched and the caller falls back
246
+ to a safe default.
247
+ """
248
+ stripped = timestamp.strip()
249
+ if "T" not in stripped:
250
+ return stripped
251
+ body, _, after_t = stripped.partition("T")
252
+ sign_idx = -1
253
+ for i, ch in enumerate(after_t):
254
+ if ch in "+-":
255
+ sign_idx = i
256
+ break
257
+ if sign_idx < 0:
258
+ return stripped
259
+ prefix = body + "T" + after_t[:sign_idx]
260
+ sign = after_t[sign_idx]
261
+ rest = after_t[sign_idx + 1 :]
262
+ digits = rest.replace(":", "")
263
+ if not digits.isdigit():
264
+ return stripped
265
+ # Acceptable shapes: 4 digits → HHMM; 5 digits → HHHMM (broken, e.g.
266
+ # ``+51800`` → hours ``5``, minutes ``18``); 6 digits → HHMMSS (rare).
267
+ if len(digits) == 4:
268
+ hh, mm = digits[:2], digits[2:]
269
+ elif len(digits) == 5:
270
+ hh, mm = "0" + digits[0], digits[1:3]
271
+ elif len(digits) == 6:
272
+ hh, mm = digits[:2], digits[2:4]
273
+ elif len(digits) == 2:
274
+ hh, mm = digits, "00"
275
+ else:
276
+ return stripped
277
+ try:
278
+ if int(hh) > 23 or int(mm) > 59:
279
+ return stripped
280
+ except ValueError:
281
+ return stripped
282
+ return f"{prefix}{sign}{hh}:{mm}"
283
+
284
+
190
285
def _parse_iso(timestamp: str) -> datetime:
    """Parse an ISO 8601 timestamp without ever raising on a bad offset.

    The stripped string is parsed as-is first. If that fails, the offset is
    passed through ``_normalise_tz_offset`` and, when that changed anything,
    parsed again. If both attempts fail, the stripped raw string is added
    to ``_UNPARSEABLE_TIMESTAMPS`` and ``_EPOCH_FALLBACK`` is returned, so a
    single bad record cannot abort a whole-repo analysis.
    """
    text = timestamp.strip()
    try:
        return datetime.fromisoformat(text)
    except ValueError:
        # Only pay for the repair attempt once the fast path has failed.
        fixed = _normalise_tz_offset(text)

    if fixed != text:
        try:
            return datetime.fromisoformat(fixed)
        except ValueError:
            pass  # repair did not help; fall through to the sentinel

    _UNPARSEABLE_TIMESTAMPS.add(text)
    return _EPOCH_FALLBACK
305
+
306
+
307
def _maybe_warn_bad_timestamps() -> None:
    """Print one stderr warning per process if any timestamp fell back to epoch.

    Does nothing when ``_UNPARSEABLE_TIMESTAMPS`` is empty or the warning
    has already been emitted. Otherwise it sets ``_BAD_TZ_WARNING_EMITTED``
    and writes a single summary line, so repeated reads never repeat the
    message. Output goes to local stderr only.
    """
    global _BAD_TZ_WARNING_EMITTED

    count = len(_UNPARSEABLE_TIMESTAMPS)
    if count == 0 or _BAD_TZ_WARNING_EMITTED:
        return

    _BAD_TZ_WARNING_EMITTED = True
    plural = "" if count == 1 else "s"
    message = (
        f"warning: {count} commit{plural} had an unparseable "
        "authored timestamp; treating those as epoch for date math."
    )
    print(message, file=sys.stderr)
192
323
 
193
324
 
194
325
  def _log_format() -> str:
@@ -285,6 +416,7 @@ def commits_for_path(
285
416
  cache.store_path_log(path, head_sha, [c.sha for c in commits])
286
417
  if max_count is not None and len(commits) > max_count:
287
418
  commits = commits[:max_count]
419
+ _maybe_warn_bad_timestamps()
288
420
  return commits
289
421
 
290
422
 
@@ -342,12 +474,15 @@ def all_commits(
342
474
  """
343
475
  if cache is not None:
344
476
  full = _all_commits_via_cache(repo_root, cache)
477
+ _maybe_warn_bad_timestamps()
345
478
  return full if max_count is None else full[:max_count]
346
479
  args = ["log", "--no-merges", f"--pretty=format:{_log_format()}"]
347
480
  if max_count is not None:
348
481
  args.append(f"--max-count={max_count}")
349
482
  raw = _run_git(repo_root, *args)
350
- return _parse_log_records(raw)
483
+ out = _parse_log_records(raw)
484
+ _maybe_warn_bad_timestamps()
485
+ return out
351
486
 
352
487
 
353
488
  def _store_commits(cache: CacheStore, commits: Sequence[Commit]) -> None:
@@ -624,26 +759,77 @@ def find_revert_pairs(commits: Sequence[Commit]) -> list[tuple[str, str]]:
624
759
  return pairs
625
760
 
626
761
 
762
def _is_subject_incident(subject: str, body: str) -> bool:
    """Decide whether a commit subject signals incident intent.

    Args:
        subject: The commit's subject line.
        body: The commit's body; consulted only for an issue / incident id
            that corroborates a ``regression`` mention in the subject.

    Returns:
        ``True`` when the subject should be treated as an incident.

    Rules, in evaluation order:

    - A security advisory (``CVE-…`` / ``GHSA-…``) always fires.
    - The default ``git revert`` subject (``Reverted "…"``) and the human
      variant (``Reverts <sha>``) always fire.
    - The Conventional Commits breaking marker always fires.
    - A subject without the word ``regression`` fires on any incident
      keyword (``hotfix``, ``outage``, ``rollback``, …), as before.
    - A subject with ``regression`` fires only when corroborated:
        * an anchoring phrase such as ``regression in <something>`` /
          ``Fixed: regression`` / ``fix the regression`` /
          ``regression — …``; OR
        * an issue / incident id in the subject or anywhere in the body
          (``#1234``, ``INC-447``, ``SEV-1``, …).
      Neither corroboration is considered when the subject uses a benign
      phrase (``regression test(s)``, ``regression suite``,
      ``no regression``, ``regression nature``): those never fire on
      their own.
    - In every ``regression`` case, a *different* incident keyword in the
      subject still fires ("rollback regression" fires on "rollback").
      This now also holds for benign-phrase subjects; previously a benign
      phrase returned ``False`` outright and masked e.g. a ``hotfix``
      keyword in the same subject.
    """
    if _CVE_RE.search(subject) or _GHSA_RE.search(subject):
        return True
    if _REVERTED_SUBJECT_RE.search(subject) or _REVERTS_SUBJECT_RE.search(subject):
        return True
    if _BREAKING_CC_RE.search(subject):
        return True

    if not re.search(r"\bregression\b", subject, re.IGNORECASE):
        return bool(_INCIDENT_RE.search(subject))

    # "regression" demands corroboration, and a benign housekeeping phrase
    # disqualifies both kinds of corroboration.
    if not _BENIGN_REGRESSION_RE.search(subject):
        if _INCIDENT_REGRESSION_RE.search(subject):
            return True
        if _ISSUE_ID_RE.search(subject) or _ISSUE_ID_RE.search(body):
            return True

    # Strip the word and check whether any other incident keyword carries
    # the subject by itself.
    without_regression = re.sub(r"\bregression\b", "", subject, flags=re.IGNORECASE)
    return bool(_INCIDENT_RE.search(without_regression))
808
+
809
+
627
810
  def find_incidents(commits: Sequence[Commit]) -> list[Commit]:
628
811
  """Return commits whose evidence-level signals incident-flavored intent.
629
812
 
630
813
  Acceptance ladder (highest to lowest confidence):
631
- 1. Subject contains an incident keyword. A commit's subject is its
632
- declared purpose, so a subject hit is treated as ground truth.
633
- 2. Subject carries the Conventional Commits breaking marker
814
+ 1. Subject cites a security advisory (``CVE-…`` / ``GHSA-…``) — fires
815
+ on subject alone.
816
+ 2. Subject is a default ``git revert`` body (``Reverted "…"``) or a
817
+ human revert pointer (``Reverts <sha>``) — fires on subject alone.
818
+ 3. Subject carries the Conventional Commits breaking marker
634
819
  (``feat!:`` / ``fix!:`` / …).
635
- 3. Body carries the structured ``BREAKING CHANGE:`` footer. This is a
636
- deliberate, anchored marker, not free-form prose.
637
- 4. Body contains an incident keyword AND an issue / incident
638
- identifier nearby (``#1234``, ``INC-447``, ``SEV-1``, ``P0``).
639
- This filters out passing mentions in prose like "feat: add
640
- incident-aware logging" where the keyword describes a *feature*.
820
+ 4. Subject contains an incident keyword that is NOT a benign
821
+ "regression test/suite/nature" phrase. ``regression`` requires
822
+ either an issue id (on subject or body) or a pre-marker that
823
+ anchors it as an incident reference.
824
+ 5. Body carries the structured ``BREAKING CHANGE:`` footer.
825
+ 6. Body contains an incident keyword AND an issue / incident
826
+ identifier nearby. Filters out passing mentions in prose.
641
827
 
642
828
  A bare body keyword with no corroborating ID does NOT fire.
643
829
  """
644
830
  out: list[Commit] = []
645
831
  for c in commits:
646
- if _INCIDENT_RE.search(c.subject) or _BREAKING_CC_RE.search(c.subject):
832
+ if _is_subject_incident(c.subject, c.body):
647
833
  out.append(c)
648
834
  continue
649
835
  if _BREAKING_FOOTER_RE.search(c.body):
@@ -696,6 +882,41 @@ def _all_matches_are_quoted(line: str, regex: re.Pattern[str]) -> bool:
696
882
  return True
697
883
 
698
884
 
885
+ # An ALLCAPS line prefix (e.g. ``WARNING:``, ``ERROR:``, ``DEBUG:``) is the
886
+ # canonical signature of pasted compiler / linter / spell-checker output.
887
+ # A genuine human invariant statement opens with a normal sentence ("Do
888
+ # not...", "Important: ...") and never with two or more uppercase letters
889
+ # followed by an immediate colon.
890
+ _TOOL_OUTPUT_ALLCAPS_RE = re.compile(r"^[A-Z]{2,}:\s")
891
+ # A ``path:line:`` or ``path:line:col:`` prefix near the start of a line is
892
+ # the unmistakable shape of compiler / aspell output. We accept any path-
893
+ # shaped token (slashes, dots, hyphens, underscores, alnum) followed by
894
+ # ``:<digits>:`` — anchored so it also catches ``./foo/bar.py:50:``.
895
+ _TOOL_OUTPUT_PATH_RE = re.compile(r"^[\w./-]+:\d+:")
896
+ # Per-(commit, file) cap on invariant lines pulled from one body. A real
897
+ # author rarely states more than two crisp invariants in a single message;
898
+ # anything beyond is almost certainly a paste. Set deliberately low so a
899
+ # single noisy commit can no longer dominate the "highlights" view.
900
+ _PER_COMMIT_INVARIANT_CAP = 2
901
+
902
+
903
def _is_tool_output_line(line: str, prev_line: str) -> bool:
    """Report whether ``line`` looks like pasted compiler / linter output.

    A line is treated as tool output when any of these holds:

    - it matches ``_TOOL_OUTPUT_ALLCAPS_RE`` — an ALLCAPS word followed
      immediately by a colon (``WARNING:``, ``ERROR:``, ``DEBUG:`` …);
    - it matches ``_TOOL_OUTPUT_PATH_RE`` — a ``path/to/file:line:``
      prefix in the style of clang / mypy / aspell;
    - ``prev_line`` starts with ``"> "`` — markdown block-quote framing
      on the line before.
    """
    shape_patterns = (_TOOL_OUTPUT_ALLCAPS_RE, _TOOL_OUTPUT_PATH_RE)
    has_tool_shape = any(pattern.match(line) for pattern in shape_patterns)
    return has_tool_shape or prev_line.startswith("> ")
918
+
919
+
699
920
  def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]:
700
921
  """Pull lines from commit *bodies* that match invariant tokens.
701
922
 
@@ -706,20 +927,43 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
706
927
  eliminates the meta-mention failure mode where a commit *about* an
707
928
  invariant token (e.g. "fix invariant matcher") would self-flag.
708
929
 
930
+ Two filters keep pasted tool output out of the "stated invariants"
931
+ surface:
932
+
933
+ 1. Lines that look like quoted compiler / linter / aspell output are
934
+ dropped (``WARNING: …``, ``foo/bar.py:50: …``, lines preceded by a
935
+ ``> `` block-quote). One noisy spell-check commit on django used to
936
+ supply 15 of the top-20 highlights; this rule kills it at the
937
+ source.
938
+ 2. A per-commit cap of two invariants. Real authors rarely state more
939
+ than two crisp constraints in one message; anything beyond is
940
+ almost certainly a paste. The first two matches are preserved
941
+ (most informative-looking entries rank).
942
+
709
943
  Lines where every matching token is wrapped in quotes (``"do not"``) are
710
944
  treated as references rather than statements and are skipped.
711
945
  """
712
946
  out: list[tuple[str, str]] = []
713
947
  for commit in commits:
948
+ per_commit = 0
949
+ prev_line = ""
714
950
  for raw_line in commit.body.splitlines():
715
951
  line = raw_line.strip()
716
952
  if not line:
953
+ prev_line = raw_line
717
954
  continue
955
+ if _is_tool_output_line(line, prev_line):
956
+ prev_line = raw_line
957
+ continue
958
+ prev_line = raw_line
718
959
  if not _INVARIANT_RE.search(line):
719
960
  continue
720
961
  if _all_matches_are_quoted(line, _INVARIANT_RE):
721
962
  continue
963
+ if per_commit >= _PER_COMMIT_INVARIANT_CAP:
964
+ continue
722
965
  out.append((commit.sha, line[:200]))
966
+ per_commit += 1
723
967
  return out
724
968
 
725
969
 
whycode/ignore.py CHANGED
@@ -3,7 +3,16 @@
3
3
  These are paths/files that almost always pollute risk analysis without
4
4
  adding signal: changelogs (touched on every release, so they look "tightly
5
5
  coupled to everything"), lockfiles (regenerated on every dependency bump),
6
- vendored third-party code, and machine-generated stubs.
6
+ vendored third-party code, machine-generated stubs, CI / packaging
7
+ metadata, project-membership files (``AUTHORS``, ``LICENSE``), and
8
+ translation catalogues (``*.po`` / ``*.mo``).
9
+
10
+ A field test against django (10,000 commits, 7,043 files) showed the
11
+ top-10 risk list was dominated by these high-touch metadata files —
12
+ ``AUTHORS``, ``.github/workflows/*.yml``, locale ``.po``, ``.gitignore``
13
+ — and no application code at all reached the top 10. A scan-top list
14
+ that surfaces zero source files is unactionable; demoting these
15
+ metadata files lets real source code rank.
7
16
 
8
17
  Users can extend this list with a ``.whycodeignore`` file at repo root,
9
18
  one ``fnmatch``-style pattern per line. Comments start with ``#``.
@@ -71,6 +80,49 @@ DEFAULT_IGNORE_PATTERNS: tuple[str, ...] = (
71
80
  "*.ttf",
72
81
  "*.otf",
73
82
  "*.eot",
83
+ # CI / repo metadata — high-touch but never the source of risk in code.
84
+ ".github/**",
85
+ ".gitlab/**",
86
+ ".circleci/**",
87
+ ".gitignore",
88
+ ".gitattributes",
89
+ ".editorconfig",
90
+ ".pre-commit-config.yaml",
91
+ ".readthedocs.yaml",
92
+ ".readthedocs.yml",
93
+ ".flake8",
94
+ ".coveragerc",
95
+ "tox.ini",
96
+ "pytest.ini",
97
+ "Makefile",
98
+ # Project-membership / licensing files — touched on every contributor add.
99
+ "AUTHORS",
100
+ "AUTHORS.*",
101
+ "CONTRIBUTORS",
102
+ "CONTRIBUTORS.*",
103
+ "LICENSE",
104
+ "LICENSE.*",
105
+ "LICENSES/**",
106
+ "COPYING",
107
+ "COPYING.*",
108
+ "NOTICE",
109
+ "NOTICE.*",
110
+ # Python packaging metadata — low-signal-per-touch.
111
+ "setup.py",
112
+ "setup.cfg",
113
+ "MANIFEST.in",
114
+ # Translation catalogues — bulk-edited every release, never an indicator
115
+ # of code risk.
116
+ "*.po",
117
+ "*.mo",
118
+ "*.pot",
119
+ # Release-notes-style ``*.txt`` files only — narrow patterns; we are
120
+ # deliberately conservative here so a random ``requirements.txt`` is not
121
+ # ignored. The shapes below match common repo layouts (django, flask).
122
+ "release_notes/*.txt",
123
+ "docs/releases/*.txt",
124
+ "docs/release-notes/*.txt",
125
+ "release-notes/*.txt",
74
126
  )
75
127
 
76
128
  _USER_IGNORE_FILE = ".whycodeignore"
whycode/signals.py CHANGED
@@ -13,6 +13,7 @@ from enum import StrEnum
13
13
  from typing import TYPE_CHECKING
14
14
 
15
15
  from whycode import git_facts as gf
16
+ from whycode import ignore as ign
16
17
 
17
18
  if TYPE_CHECKING:
18
19
  from whycode.git_facts import RepoFacts
@@ -148,7 +149,23 @@ def detect_high_churn(facts: RepoFacts) -> Signal | None:
148
149
 
149
150
 
150
151
  def detect_coupling(facts: RepoFacts) -> Signal | None:
151
- paired = [(p, n) for p, n in facts.co_changed_files.items() if n >= COUPLING_MIN_COCHANGES]
152
+ """Files that change together with the target file, ranked by frequency.
153
+
154
+ Co-change candidates are filtered through the same ignore list that
155
+ powers ``whycode scan`` (built-in defaults plus an optional repo-local
156
+ ``.whycodeignore``). Without this filter, a per-file coupling signal
157
+ would surface ``CHANGELOG``, ``.github/workflows/*.yml``, ``AUTHORS``
158
+ and similar high-touch metadata as the file's "tight coupling" — the
159
+ field-test report flagged ``flask/app.py``'s top co-changers as 60%
160
+ metadata, leaving only two genuinely informative entries. Applying
161
+ the same filter here keeps the most-shown signal honest.
162
+ """
163
+ patterns = ign.effective_patterns(facts.repo_root)
164
+ paired = [
165
+ (p, n)
166
+ for p, n in facts.co_changed_files.items()
167
+ if n >= COUPLING_MIN_COCHANGES and not ign.is_ignored(p, patterns)
168
+ ]
152
169
  if not paired:
153
170
  return None
154
171
  paired.sort(key=lambda x: (-x[1], x[0]))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: whycode-cli
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Tells you what to be afraid of before you touch a file.
5
5
  Author: Kevin
6
6
  License-Expression: MIT
@@ -1,22 +1,22 @@
1
- whycode/__init__.py,sha256=DT8PsrrvRkCF3U7cRgChch8aCNUobwQ9iFpPowLmLWY,96
1
+ whycode/__init__.py,sha256=dPQOppaGvPoPBoACrHwxqGykCdDMNZRROtDjOmyRuf8,96
2
2
  whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
3
3
  whycode/cache.py,sha256=v55KbSlTqmP_ot1FEFqxCNpAApj6vthpHl2l0lGLX3A,17477
4
- whycode/cli.py,sha256=97LJmxOYBTtHkEtMlCabIogCxJNswOGusAdpvT3-mf8,45146
4
+ whycode/cli.py,sha256=OTYPhp8ItBXPRrQ1y6zGt0BwKyAYEuHAo3T0hMHqINk,47836
5
5
  whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
6
- whycode/git_facts.py,sha256=cKPywdrAsQBsPl7R4kLO5zBAELmXlhoy23g29XjuK18,29044
7
- whycode/ignore.py,sha256=sdRO_0HSedm8aO69CSGl-zQrUVX5MEg9QGcAJWwAvP4,3021
6
+ whycode/git_facts.py,sha256=vAeyhxZTrqa_6zmVuBV-06JhZ-TFBiRmcaISK1oOQjM,40162
7
+ whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
8
8
  whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
9
9
  whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
10
10
  whycode/risk_card.py,sha256=xOJkHwIkS_6yw_dSowsQ6LHfeD9Mwr2tymL7_wqxs0U,8855
11
11
  whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
12
- whycode/signals.py,sha256=M2x6868G1YQ4eWoIuwE0PMjurCoZn1jyJWySLF7FlW0,13085
12
+ whycode/signals.py,sha256=z0kZfXR60nS-j56nchHd1V3aK8A5CGR1BAyHZZAff3s,13899
13
13
  whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
14
14
  whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
15
  whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
16
16
  whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
17
- whycode_cli-0.4.0.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
18
- whycode_cli-0.4.0.dist-info/METADATA,sha256=3VurI0V9_AtQdTTC8Fyis3C3pulEIdEe_bMC4_iH7xs,10218
19
- whycode_cli-0.4.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
- whycode_cli-0.4.0.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
21
- whycode_cli-0.4.0.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
22
- whycode_cli-0.4.0.dist-info/RECORD,,
17
+ whycode_cli-0.4.1.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
18
+ whycode_cli-0.4.1.dist-info/METADATA,sha256=M2XBAL02LMRZtW4Pj4L3Gcuifqh2lIAQa_1Hpt3xfPI,10218
19
+ whycode_cli-0.4.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
20
+ whycode_cli-0.4.1.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
21
+ whycode_cli-0.4.1.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
22
+ whycode_cli-0.4.1.dist-info/RECORD,,