whycode-cli 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whycode/__init__.py +1 -1
- whycode/cli.py +73 -9
- whycode/git_facts.py +257 -13
- whycode/ignore.py +53 -1
- whycode/signals.py +18 -1
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.1.dist-info}/METADATA +1 -1
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.1.dist-info}/RECORD +11 -11
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.1.dist-info}/WHEEL +0 -0
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.1.dist-info}/entry_points.txt +0 -0
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.1.dist-info}/top_level.txt +0 -0
whycode/__init__.py
CHANGED
whycode/cli.py
CHANGED
|
@@ -20,10 +20,12 @@ Commands
|
|
|
20
20
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
+
import functools
|
|
23
24
|
import json
|
|
24
25
|
import sys
|
|
26
|
+
from collections.abc import Callable
|
|
25
27
|
from pathlib import Path
|
|
26
|
-
from typing import Any
|
|
28
|
+
from typing import Any, TypeVar
|
|
27
29
|
|
|
28
30
|
import typer
|
|
29
31
|
from rich.console import Console
|
|
@@ -115,6 +117,37 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
|
|
|
115
117
|
return repo_root, rel
|
|
116
118
|
|
|
117
119
|
|
|
120
|
+
_F = TypeVar("_F", bound=Callable[..., Any])
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _propagate_failures(func: _F) -> _F:
    """Wrap a command so an unhandled exception exits with status ``2``.

    The returned callable invokes ``func`` with the same arguments and hands
    back its result.  ``typer.Exit``, ``typer.Abort`` and
    ``KeyboardInterrupt`` are re-raised untouched, so explicit exit-code
    paths and Ctrl-C behave as before.  Any other ``Exception`` is rendered
    through ``err.print_exception`` and then replaced by ``typer.Exit(2)``
    (chained to the original), so a crashed run can no longer finish with
    exit status 0 and be mistaken for a green CI step.
    """

    @functools.wraps(func)
    def guarded(*args: Any, **kwargs: Any) -> Any:
        try:
            outcome = func(*args, **kwargs)
        except (KeyboardInterrupt, typer.Abort, typer.Exit):
            # Deliberate exits and interrupts are not failures; pass them on.
            raise
        except Exception as failure:
            # Show the traceback, then force a non-zero exit status.
            err.print_exception(show_locals=False)
            raise typer.Exit(2) from failure
        return outcome

    return guarded  # type: ignore[return-value]
|
|
149
|
+
|
|
150
|
+
|
|
118
151
|
# --- shared: band threshold parsing ----------------------------------------
|
|
119
152
|
|
|
120
153
|
_BAND_THRESHOLDS_BY_KEY: dict[str, int] = {
|
|
@@ -148,6 +181,7 @@ def _print_brief(card: rc.RiskCard) -> None:
|
|
|
148
181
|
|
|
149
182
|
|
|
150
183
|
@app.command()
|
|
184
|
+
@_propagate_failures
|
|
151
185
|
def why(
|
|
152
186
|
path: str = typer.Argument(..., help="File path to inspect."),
|
|
153
187
|
json_out: bool = typer.Option(
|
|
@@ -317,6 +351,7 @@ def _resolve_base_ref(repo_root: Path, requested: str | None) -> str:
|
|
|
317
351
|
|
|
318
352
|
|
|
319
353
|
@app.command()
|
|
354
|
+
@_propagate_failures
|
|
320
355
|
def diff(
|
|
321
356
|
base: str | None = typer.Option(
|
|
322
357
|
None, "--base", help="Base ref (default: origin/main → main → HEAD~1)."
|
|
@@ -482,6 +517,7 @@ def diff(
|
|
|
482
517
|
|
|
483
518
|
|
|
484
519
|
@app.command()
|
|
520
|
+
@_propagate_failures
|
|
485
521
|
def highlights(
|
|
486
522
|
invariants: int = typer.Option(
|
|
487
523
|
5, "--invariants", help="How many invariant lines to surface."
|
|
@@ -636,6 +672,7 @@ def _sample_indices(total: int, max_samples: int) -> list[int]:
|
|
|
636
672
|
|
|
637
673
|
|
|
638
674
|
@app.command()
|
|
675
|
+
@_propagate_failures
|
|
639
676
|
def timeline(
|
|
640
677
|
path: str = typer.Argument(..., help="File path to inspect."),
|
|
641
678
|
samples: int = typer.Option(
|
|
@@ -677,6 +714,12 @@ def timeline(
|
|
|
677
714
|
top,
|
|
678
715
|
)
|
|
679
716
|
)
|
|
717
|
+
# Field-test report F14: ``timeline`` used to render rows in whatever
|
|
718
|
+
# non-monotonic order ``_sample_indices`` produced (uniform-across-index
|
|
719
|
+
# selection on a list whose ordering is git's parent traversal). Sort
|
|
720
|
+
# by date ascending before rendering so a reader can scan left-to-right
|
|
721
|
+
# without misreading the trajectory.
|
|
722
|
+
rows.sort(key=lambda r: r[0])
|
|
680
723
|
|
|
681
724
|
if json_out:
|
|
682
725
|
console.print_json(
|
|
@@ -714,6 +757,7 @@ def timeline(
|
|
|
714
757
|
|
|
715
758
|
|
|
716
759
|
@app.command()
|
|
760
|
+
@_propagate_failures
|
|
717
761
|
def scan(
|
|
718
762
|
top: int = typer.Option(10, "--top", help="How many files to list."),
|
|
719
763
|
sample: int = typer.Option(
|
|
@@ -811,6 +855,7 @@ def scan(
|
|
|
811
855
|
|
|
812
856
|
|
|
813
857
|
@app.command()
|
|
858
|
+
@_propagate_failures
|
|
814
859
|
def honest(
|
|
815
860
|
path: str = typer.Argument(..., help="File path to inspect."),
|
|
816
861
|
json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of prose."),
|
|
@@ -874,6 +919,7 @@ def honest(
|
|
|
874
919
|
|
|
875
920
|
|
|
876
921
|
@app.command()
|
|
922
|
+
@_propagate_failures
|
|
877
923
|
def show(
|
|
878
924
|
sha: str = typer.Argument(..., help="Commit SHA (full or short) to inspect."),
|
|
879
925
|
repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
|
|
@@ -981,6 +1027,7 @@ _MCP_SNIPPET = ''' {
|
|
|
981
1027
|
|
|
982
1028
|
|
|
983
1029
|
@app.command()
|
|
1030
|
+
@_propagate_failures
|
|
984
1031
|
def tour(
|
|
985
1032
|
repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
|
|
986
1033
|
no_cache: bool = typer.Option(
|
|
@@ -1029,18 +1076,34 @@ def tour(
|
|
|
1029
1076
|
incidents_top = gf.find_incidents(commits)[:3]
|
|
1030
1077
|
|
|
1031
1078
|
if invariants_top or incidents_top:
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1079
|
+
# Field-test report F16: the original tour rendered both classes
|
|
1080
|
+
# under one ``Decisions and incidents`` header, so a parenthetical
|
|
1081
|
+
# invariant prose line was visually indistinguishable from a real
|
|
1082
|
+
# incident commit. Render two subheads matching the layout
|
|
1083
|
+
# ``highlights`` already uses.
|
|
1084
|
+
if invariants_top:
|
|
1035
1085
|
console.print(
|
|
1036
|
-
f"
|
|
1086
|
+
f"[bold yellow]Stated invariants[/bold yellow] "
|
|
1087
|
+
f"[dim]({len(invariants_top)} most recent)[/dim]"
|
|
1037
1088
|
)
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1089
|
+
for line, c in invariants_top:
|
|
1090
|
+
console.print(f" [italic]{line}[/italic]")
|
|
1091
|
+
console.print(
|
|
1092
|
+
f" [dim]{c.sha[:7]} {c.authored_at.date()} "
|
|
1093
|
+
f"{c.author_name}[/dim]\n"
|
|
1094
|
+
)
|
|
1095
|
+
if incidents_top:
|
|
1041
1096
|
console.print(
|
|
1042
|
-
f"
|
|
1097
|
+
f"[bold red]Recent incidents[/bold red] "
|
|
1098
|
+
f"[dim]({len(incidents_top)} most recent)[/dim]"
|
|
1043
1099
|
)
|
|
1100
|
+
for c in incidents_top:
|
|
1101
|
+
subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
|
|
1102
|
+
console.print(f" [red]{subj}[/red]")
|
|
1103
|
+
console.print(
|
|
1104
|
+
f" [dim]{c.sha[:7]} {c.authored_at.date()} "
|
|
1105
|
+
f"{c.author_name}[/dim]\n"
|
|
1106
|
+
)
|
|
1044
1107
|
else:
|
|
1045
1108
|
console.print(
|
|
1046
1109
|
"[dim]No headline decisions or incidents in recent history.[/dim]"
|
|
@@ -1113,6 +1176,7 @@ def tour(
|
|
|
1113
1176
|
|
|
1114
1177
|
|
|
1115
1178
|
@app.command()
|
|
1179
|
+
@_propagate_failures
|
|
1116
1180
|
def init(
|
|
1117
1181
|
force: bool = typer.Option(
|
|
1118
1182
|
False, "--force", "-f", help="Overwrite existing files instead of skipping."
|
whycode/git_facts.py
CHANGED
|
@@ -18,10 +18,11 @@ from __future__ import annotations
|
|
|
18
18
|
|
|
19
19
|
import re
|
|
20
20
|
import subprocess
|
|
21
|
+
import sys
|
|
21
22
|
from collections import Counter
|
|
22
23
|
from collections.abc import Sequence
|
|
23
24
|
from dataclasses import dataclass, field
|
|
24
|
-
from datetime import datetime
|
|
25
|
+
from datetime import UTC, datetime
|
|
25
26
|
from pathlib import Path
|
|
26
27
|
from typing import TYPE_CHECKING
|
|
27
28
|
|
|
@@ -31,6 +32,17 @@ if TYPE_CHECKING:
|
|
|
31
32
|
UNIT_SEP = "\x1f"
|
|
32
33
|
RECORD_SEP = "\x1e"
|
|
33
34
|
|
|
35
|
+
# Per-process record of commits whose authored timestamp could not be parsed
|
|
36
|
+
# even after defensive normalisation. We surface these once per session via
|
|
37
|
+
# a single stderr line so a single bad record does not spam a per-line warning
|
|
38
|
+
# — never on every read, never to a network.
|
|
39
|
+
_UNPARSEABLE_TIMESTAMPS: set[str] = set()
|
|
40
|
+
_BAD_TZ_WARNING_EMITTED = False
|
|
41
|
+
# The Unix epoch as a tz-aware UTC datetime; used as a safe fallback when a
|
|
42
|
+
# commit's authored_at is irrecoverably malformed. Picked over datetime.min
|
|
43
|
+
# because callers expect a tz-aware value (signal age math compares to UTC).
|
|
44
|
+
_EPOCH_FALLBACK = datetime.fromtimestamp(0, UTC)
|
|
45
|
+
|
|
34
46
|
# A commit subject/body containing one of these markers is treated as evidence
|
|
35
47
|
# that the original author flagged something worth carrying forward.
|
|
36
48
|
INCIDENT_TOKENS: tuple[str, ...] = (
|
|
@@ -69,6 +81,39 @@ _BREAKING_CC_RE = re.compile(
|
|
|
69
81
|
_ISSUE_ID_RE = re.compile(
|
|
70
82
|
r"(?:#\d+|\b[A-Z][A-Z0-9_]+-\d+|\bSEV[- ]?\d\b|\bP[01]\b)",
|
|
71
83
|
)
|
|
84
|
+
# Security-advisory tokens fire as incidents on subject alone — the deliberate
|
|
85
|
+
# act of citing one is unambiguous high-confidence evidence.
|
|
86
|
+
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d+\b")
|
|
87
|
+
_GHSA_RE = re.compile(r"\bGHSA-[a-z0-9-]+\b", re.IGNORECASE)
|
|
88
|
+
# A ``Reverted "..."`` subject (note: git's own default is ``Revert "..."``) and the human variant
|
|
89
|
+
# ("Reverts <sha>") are both unambiguous incident-class evidence on subject.
|
|
90
|
+
_REVERTED_SUBJECT_RE = re.compile(r'^Reverted\s+"', re.IGNORECASE)
|
|
91
|
+
_REVERTS_SUBJECT_RE = re.compile(r"\bReverts\s+[0-9a-f]{7,}\b", re.IGNORECASE)
|
|
92
|
+
# Subject-level "regression" usage that is descriptive rather than incident:
|
|
93
|
+
# "regression test(s)", "regression suite", "no regression", "regression nature".
|
|
94
|
+
# These prevention/test-housekeeping phrases must NOT fire as incidents.
|
|
95
|
+
_BENIGN_REGRESSION_RE = re.compile(
|
|
96
|
+
r"\b(?:regression\s+(?:tests?|suite|nature)|no\s+regression)\b",
|
|
97
|
+
re.IGNORECASE,
|
|
98
|
+
)
|
|
99
|
+
# Conversely, a subject using "regression" with a corroborating incident id
|
|
100
|
+
# (``#1234``, ``INC-447``, …), or as part of an unambiguous incident phrase
|
|
101
|
+
# like ``regression in <something>`` / ``Fixed: regression`` / ``fix the
|
|
102
|
+
# regression``, IS an incident. These are the patterns that distinguish
|
|
103
|
+
# "split the regression-test files" (housekeeping) from "fix the refund
|
|
104
|
+
# regression" (a real outage marker).
|
|
105
|
+
_INCIDENT_REGRESSION_RE = re.compile(
|
|
106
|
+
# ``regression in <something>`` — "fix the refund regression in admin".
|
|
107
|
+
r"\bregression\s+in\b"
|
|
108
|
+
# ``fix the regression`` / ``fix a regression`` — explicit incident verb.
|
|
109
|
+
r"|\bfix(?:ed|es)?\s+(?:the\s+|a\s+)?regression\b"
|
|
110
|
+
# ``regression — …`` / ``regression: …`` — stated subject category.
|
|
111
|
+
r"|\bregression\s*[:—-]"
|
|
112
|
+
# ``Fixed: regression`` / ``Hotfix: regression`` — pre-colon incident verb
|
|
113
|
+
# explicitly framing the rest as the incident category itself.
|
|
114
|
+
r"|\b(?:fix(?:ed|es)?|hotfix|revert(?:ed)?)\s*:\s*regression\b",
|
|
115
|
+
re.IGNORECASE,
|
|
116
|
+
)
|
|
72
117
|
INVARIANT_TOKENS: tuple[str, ...] = (
|
|
73
118
|
"do not",
|
|
74
119
|
"don't",
|
|
@@ -187,8 +232,94 @@ def is_tracked(repo_root: Path, path: str) -> bool:
|
|
|
187
232
|
return bool(out.strip())
|
|
188
233
|
|
|
189
234
|
|
|
235
|
+
def _normalise_tz_offset(timestamp: str) -> str:
    """Rewrite a malformed UTC-offset suffix into canonical ``[+-]HH:MM``.

    The input is stripped of surrounding whitespace and split at the first
    ``T``.  The first ``+`` or ``-`` found in the time part marks the start
    of the offset; colons in the offset are ignored and the remaining digits
    are interpreted by count:

    * 2 digits -> ``HH`` (minutes become ``00``)
    * 4 digits -> ``HHMM``
    * 5 digits -> first digit is the hour, next two the minutes, the last
      two are discarded (``+51800`` -> ``+05:18``)
    * 6 digits -> ``HHMMSS`` with the seconds dropped

    Returns:
        The repaired timestamp, or the stripped input unchanged when there
        is no ``T``, no sign, a non-digit offset, an unsupported digit count,
        or an hour above 23 / minute above 59.
    """
    text = timestamp.strip()
    date_part, separator, time_part = text.partition("T")
    if not separator:
        return text

    # The date part was split off first so its hyphens are never mistaken
    # for the sign of the offset.
    sign_pos = next(
        (pos for pos, char in enumerate(time_part) if char in "+-"),
        None,
    )
    if sign_pos is None:
        return text

    offset_digits = time_part[sign_pos + 1 :].replace(":", "")
    if not offset_digits.isdigit():
        return text

    # Digit count -> how to carve out (hours, minutes).
    splitters = {
        2: lambda d: (d, "00"),
        4: lambda d: (d[:2], d[2:]),
        5: lambda d: ("0" + d[0], d[1:3]),
        6: lambda d: (d[:2], d[2:4]),
    }
    splitter = splitters.get(len(offset_digits))
    if splitter is None:
        return text
    hours, minutes = splitter(offset_digits)

    try:
        plausible = int(hours) <= 23 and int(minutes) <= 59
    except ValueError:
        # ``str.isdigit`` accepts a few characters that ``int`` rejects.
        return text
    if not plausible:
        return text

    clock = time_part[:sign_pos]
    sign = time_part[sign_pos]
    return f"{date_part}T{clock}{sign}{hours}:{minutes}"
|
|
283
|
+
|
|
284
|
+
|
|
190
285
|
def _parse_iso(timestamp: str) -> datetime:
    """Parse an ISO 8601 timestamp, tolerating a malformed UTC offset.

    The stripped input is first handed to ``datetime.fromisoformat``.  If
    that raises ``ValueError`` the offset is passed through
    ``_normalise_tz_offset`` and, when that changed the string, parsing is
    attempted once more.

    Returns:
        The parsed datetime, or ``_EPOCH_FALLBACK`` when both attempts fail.
        In the fallback case the stripped raw string is added to
        ``_UNPARSEABLE_TIMESTAMPS``.
    """
    cleaned = timestamp.strip()
    try:
        return datetime.fromisoformat(cleaned)
    except ValueError:
        pass  # fall through to the repair attempt below

    fixed = _normalise_tz_offset(cleaned)
    if fixed != cleaned:
        try:
            return datetime.fromisoformat(fixed)
        except ValueError:
            pass  # repair did not help; use the fallback

    # Remember the offending value so the session-level warning can count it.
    _UNPARSEABLE_TIMESTAMPS.add(cleaned)
    return _EPOCH_FALLBACK
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _maybe_warn_bad_timestamps() -> None:
    """Print one stderr warning if any timestamp fell back to the epoch.

    Nothing is printed when ``_UNPARSEABLE_TIMESTAMPS`` is empty or when the
    warning was already emitted (tracked by the module-level flag
    ``_BAD_TZ_WARNING_EMITTED``, which this function sets on first output).
    """
    global _BAD_TZ_WARNING_EMITTED
    if not _BAD_TZ_WARNING_EMITTED and _UNPARSEABLE_TIMESTAMPS:
        # Flip the flag first so later calls stay silent.
        _BAD_TZ_WARNING_EMITTED = True
        count = len(_UNPARSEABLE_TIMESTAMPS)
        plural = "" if count == 1 else "s"
        print(
            f"warning: {count} commit{plural} had an unparseable "
            "authored timestamp; treating those as epoch for date math.",
            file=sys.stderr,
        )
|
|
192
323
|
|
|
193
324
|
|
|
194
325
|
def _log_format() -> str:
|
|
@@ -285,6 +416,7 @@ def commits_for_path(
|
|
|
285
416
|
cache.store_path_log(path, head_sha, [c.sha for c in commits])
|
|
286
417
|
if max_count is not None and len(commits) > max_count:
|
|
287
418
|
commits = commits[:max_count]
|
|
419
|
+
_maybe_warn_bad_timestamps()
|
|
288
420
|
return commits
|
|
289
421
|
|
|
290
422
|
|
|
@@ -342,12 +474,15 @@ def all_commits(
|
|
|
342
474
|
"""
|
|
343
475
|
if cache is not None:
|
|
344
476
|
full = _all_commits_via_cache(repo_root, cache)
|
|
477
|
+
_maybe_warn_bad_timestamps()
|
|
345
478
|
return full if max_count is None else full[:max_count]
|
|
346
479
|
args = ["log", "--no-merges", f"--pretty=format:{_log_format()}"]
|
|
347
480
|
if max_count is not None:
|
|
348
481
|
args.append(f"--max-count={max_count}")
|
|
349
482
|
raw = _run_git(repo_root, *args)
|
|
350
|
-
|
|
483
|
+
out = _parse_log_records(raw)
|
|
484
|
+
_maybe_warn_bad_timestamps()
|
|
485
|
+
return out
|
|
351
486
|
|
|
352
487
|
|
|
353
488
|
def _store_commits(cache: CacheStore, commits: Sequence[Commit]) -> None:
|
|
@@ -624,26 +759,77 @@ def find_revert_pairs(commits: Sequence[Commit]) -> list[tuple[str, str]]:
|
|
|
624
759
|
return pairs
|
|
625
760
|
|
|
626
761
|
|
|
762
|
+
def _is_subject_incident(subject: str, body: str) -> bool:
    """Decide whether a commit subject signals incident intent.

    Checks run in this order:

    1. The subject matches any always-fire pattern: a ``CVE-…`` / ``GHSA-…``
       advisory id, a ``Reverted "…"`` opener, a ``Reverts <sha>`` pointer,
       or the Conventional Commits breaking marker -> ``True``.
    2. The subject does not contain the word ``regression`` -> the result of
       the general incident-keyword pattern on the subject.
    3. The subject contains a benign phrase (``regression test(s)``,
       ``regression suite``, ``regression nature``, ``no regression``)
       -> ``False``, regardless of anything else in the subject.
    4. The subject contains an anchoring phrase (``regression in``,
       ``fix the regression``, ``regression:``, ``Fixed: regression``, …)
       -> ``True``.
    5. An issue / incident id appears in the subject or the body -> ``True``.
    6. Otherwise the word ``regression`` is removed and the general
       incident-keyword pattern is tried on what remains, so another keyword
       in the same subject can still fire.

    NOTE(review): git's own default revert subject is ``Revert "…"``, which
    the ``Reverted`` pattern in step 1 does not match; presumably the general
    keyword pattern covers it — confirm.

    Args:
        subject: The commit subject line.
        body: The commit body; only consulted for an issue id in step 5.
    """
    always_fire = (
        _CVE_RE,
        _GHSA_RE,
        _REVERTED_SUBJECT_RE,
        _REVERTS_SUBJECT_RE,
        _BREAKING_CC_RE,
    )
    if any(pattern.search(subject) for pattern in always_fire):
        return True

    if re.search(r"\bregression\b", subject, re.IGNORECASE) is None:
        return _INCIDENT_RE.search(subject) is not None

    # From here on the subject mentions "regression" and needs corroboration.
    if _BENIGN_REGRESSION_RE.search(subject):
        return False
    if _INCIDENT_REGRESSION_RE.search(subject):
        return True
    if _ISSUE_ID_RE.search(subject) or _ISSUE_ID_RE.search(body):
        return True

    # Drop the word itself and let any other incident keyword carry the
    # subject (e.g. "rollback regression" still fires on "rollback").
    remainder = re.sub(r"\bregression\b", "", subject, flags=re.IGNORECASE)
    return _INCIDENT_RE.search(remainder) is not None
|
|
808
|
+
|
|
809
|
+
|
|
627
810
|
def find_incidents(commits: Sequence[Commit]) -> list[Commit]:
|
|
628
811
|
"""Return commits whose evidence-level signals incident-flavored intent.
|
|
629
812
|
|
|
630
813
|
Acceptance ladder (highest to lowest confidence):
|
|
631
|
-
1. Subject
|
|
632
|
-
|
|
633
|
-
2. Subject
|
|
814
|
+
1. Subject cites a security advisory (``CVE-…`` / ``GHSA-…``) — fires
|
|
815
|
+
on subject alone.
|
|
816
|
+
2. Subject is a default ``git revert`` body (``Reverted "…"``) or a
|
|
817
|
+
human revert pointer (``Reverts <sha>``) — fires on subject alone.
|
|
818
|
+
3. Subject carries the Conventional Commits breaking marker
|
|
634
819
|
(``feat!:`` / ``fix!:`` / …).
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
820
|
+
4. Subject contains an incident keyword that is NOT a benign
|
|
821
|
+
"regression test/suite/nature" phrase. ``regression`` requires
|
|
822
|
+
either an issue id (on subject or body) or a pre-marker that
|
|
823
|
+
anchors it as an incident reference.
|
|
824
|
+
5. Body carries the structured ``BREAKING CHANGE:`` footer.
|
|
825
|
+
6. Body contains an incident keyword AND an issue / incident
|
|
826
|
+
identifier nearby. Filters out passing mentions in prose.
|
|
641
827
|
|
|
642
828
|
A bare body keyword with no corroborating ID does NOT fire.
|
|
643
829
|
"""
|
|
644
830
|
out: list[Commit] = []
|
|
645
831
|
for c in commits:
|
|
646
|
-
if
|
|
832
|
+
if _is_subject_incident(c.subject, c.body):
|
|
647
833
|
out.append(c)
|
|
648
834
|
continue
|
|
649
835
|
if _BREAKING_FOOTER_RE.search(c.body):
|
|
@@ -696,6 +882,41 @@ def _all_matches_are_quoted(line: str, regex: re.Pattern[str]) -> bool:
|
|
|
696
882
|
return True
|
|
697
883
|
|
|
698
884
|
|
|
885
|
+
# An ALLCAPS line prefix (e.g. ``WARNING:``, ``ERROR:``, ``DEBUG:``) is the
|
|
886
|
+
# canonical signature of pasted compiler / linter / spell-checker output.
|
|
887
|
+
# A genuine human invariant statement opens with a normal sentence ("Do
|
|
888
|
+
# not...", "Important: ...") and never with two or more uppercase letters
|
|
889
|
+
# followed by an immediate colon.
|
|
890
|
+
_TOOL_OUTPUT_ALLCAPS_RE = re.compile(r"^[A-Z]{2,}:\s")
|
|
891
|
+
# A ``path:line:`` or ``path:line:col:`` prefix near the start of a line is
|
|
892
|
+
# the unmistakable shape of compiler / aspell output. We accept any path-
|
|
893
|
+
# shaped token (slashes, dots, hyphens, underscores, alnum) followed by
|
|
894
|
+
# ``:<digits>:`` — anchored so it also catches ``./foo/bar.py:50:``.
|
|
895
|
+
_TOOL_OUTPUT_PATH_RE = re.compile(r"^[\w./-]+:\d+:")
|
|
896
|
+
# Per-(commit, file) cap on invariant lines pulled from one body. A real
|
|
897
|
+
# author rarely states more than two crisp invariants in a single message;
|
|
898
|
+
# anything beyond is almost certainly a paste. Set deliberately low so a
|
|
899
|
+
# single noisy commit can no longer dominate the "highlights" view.
|
|
900
|
+
_PER_COMMIT_INVARIANT_CAP = 2
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
def _is_tool_output_line(line: str, prev_line: str) -> bool:
    """Report whether ``line`` looks like pasted tool output.

    A line counts as tool output when any of these holds:

    * it starts with two or more capital letters followed by a colon and
      whitespace (``WARNING: …``, ``ERROR: …``);
    * it starts with a ``path:<digits>:`` prefix (``foo/bar.py:50: …``);
    * the previous line starts with ``"> "`` (block-quote framing).

    Args:
        line: The candidate line; both patterns are anchored at its start.
        prev_line: The line before it, compared exactly as passed in (no
            stripping is done here).
    """
    matches_tool_prefix = any(
        pattern.match(line)
        for pattern in (_TOOL_OUTPUT_ALLCAPS_RE, _TOOL_OUTPUT_PATH_RE)
    )
    return matches_tool_prefix or prev_line.startswith("> ")
|
|
918
|
+
|
|
919
|
+
|
|
699
920
|
def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]:
|
|
700
921
|
"""Pull lines from commit *bodies* that match invariant tokens.
|
|
701
922
|
|
|
@@ -706,20 +927,43 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
|
|
|
706
927
|
eliminates the meta-mention failure mode where a commit *about* an
|
|
707
928
|
invariant token (e.g. "fix invariant matcher") would self-flag.
|
|
708
929
|
|
|
930
|
+
Two filters keep pasted tool output out of the "stated invariants"
|
|
931
|
+
surface:
|
|
932
|
+
|
|
933
|
+
1. Lines that look like quoted compiler / linter / aspell output are
|
|
934
|
+
dropped (``WARNING: …``, ``foo/bar.py:50: …``, lines preceded by a
|
|
935
|
+
``> `` block-quote). One noisy spell-check commit on django used to
|
|
936
|
+
supply 15 of the top-20 highlights; this rule kills it at the
|
|
937
|
+
source.
|
|
938
|
+
2. A per-commit cap of two invariants. Real authors rarely state more
|
|
939
|
+
than two crisp constraints in one message; anything beyond is
|
|
940
|
+
almost certainly a paste. The first two matches are preserved
|
|
941
|
+
(most informative-looking entries rank).
|
|
942
|
+
|
|
709
943
|
Lines where every matching token is wrapped in quotes (``"do not"``) are
|
|
710
944
|
treated as references rather than statements and are skipped.
|
|
711
945
|
"""
|
|
712
946
|
out: list[tuple[str, str]] = []
|
|
713
947
|
for commit in commits:
|
|
948
|
+
per_commit = 0
|
|
949
|
+
prev_line = ""
|
|
714
950
|
for raw_line in commit.body.splitlines():
|
|
715
951
|
line = raw_line.strip()
|
|
716
952
|
if not line:
|
|
953
|
+
prev_line = raw_line
|
|
717
954
|
continue
|
|
955
|
+
if _is_tool_output_line(line, prev_line):
|
|
956
|
+
prev_line = raw_line
|
|
957
|
+
continue
|
|
958
|
+
prev_line = raw_line
|
|
718
959
|
if not _INVARIANT_RE.search(line):
|
|
719
960
|
continue
|
|
720
961
|
if _all_matches_are_quoted(line, _INVARIANT_RE):
|
|
721
962
|
continue
|
|
963
|
+
if per_commit >= _PER_COMMIT_INVARIANT_CAP:
|
|
964
|
+
continue
|
|
722
965
|
out.append((commit.sha, line[:200]))
|
|
966
|
+
per_commit += 1
|
|
723
967
|
return out
|
|
724
968
|
|
|
725
969
|
|
whycode/ignore.py
CHANGED
|
@@ -3,7 +3,16 @@
|
|
|
3
3
|
These are paths/files that almost always pollute risk analysis without
|
|
4
4
|
adding signal: changelogs (touched on every release, so they look "tightly
|
|
5
5
|
coupled to everything"), lockfiles (regenerated on every dependency bump),
|
|
6
|
-
vendored third-party code,
|
|
6
|
+
vendored third-party code, machine-generated stubs, CI / packaging
|
|
7
|
+
metadata, project-membership files (``AUTHORS``, ``LICENSE``), and
|
|
8
|
+
translation catalogues (``*.po`` / ``*.mo``).
|
|
9
|
+
|
|
10
|
+
A field test against django (10,000 commits, 7,043 files) showed the
|
|
11
|
+
top-10 risk list was dominated by these high-touch metadata files —
|
|
12
|
+
``AUTHORS``, ``.github/workflows/*.yml``, locale ``.po``, ``.gitignore``
|
|
13
|
+
— and no application code at all reached the top 10. A scan-top list
|
|
14
|
+
that surfaces zero source files is unactionable; demoting these
|
|
15
|
+
metadata files lets real source code rank.
|
|
7
16
|
|
|
8
17
|
Users can extend this list with a ``.whycodeignore`` file at repo root,
|
|
9
18
|
one ``fnmatch``-style pattern per line. Comments start with ``#``.
|
|
@@ -71,6 +80,49 @@ DEFAULT_IGNORE_PATTERNS: tuple[str, ...] = (
|
|
|
71
80
|
"*.ttf",
|
|
72
81
|
"*.otf",
|
|
73
82
|
"*.eot",
|
|
83
|
+
# CI / repo metadata — high-touch but never the source of risk in code.
|
|
84
|
+
".github/**",
|
|
85
|
+
".gitlab/**",
|
|
86
|
+
".circleci/**",
|
|
87
|
+
".gitignore",
|
|
88
|
+
".gitattributes",
|
|
89
|
+
".editorconfig",
|
|
90
|
+
".pre-commit-config.yaml",
|
|
91
|
+
".readthedocs.yaml",
|
|
92
|
+
".readthedocs.yml",
|
|
93
|
+
".flake8",
|
|
94
|
+
".coveragerc",
|
|
95
|
+
"tox.ini",
|
|
96
|
+
"pytest.ini",
|
|
97
|
+
"Makefile",
|
|
98
|
+
# Project-membership / licensing files — touched on every contributor add.
|
|
99
|
+
"AUTHORS",
|
|
100
|
+
"AUTHORS.*",
|
|
101
|
+
"CONTRIBUTORS",
|
|
102
|
+
"CONTRIBUTORS.*",
|
|
103
|
+
"LICENSE",
|
|
104
|
+
"LICENSE.*",
|
|
105
|
+
"LICENSES/**",
|
|
106
|
+
"COPYING",
|
|
107
|
+
"COPYING.*",
|
|
108
|
+
"NOTICE",
|
|
109
|
+
"NOTICE.*",
|
|
110
|
+
# Python packaging metadata — low-signal-per-touch.
|
|
111
|
+
"setup.py",
|
|
112
|
+
"setup.cfg",
|
|
113
|
+
"MANIFEST.in",
|
|
114
|
+
# Translation catalogues — bulk-edited every release, never an indicator
|
|
115
|
+
# of code risk.
|
|
116
|
+
"*.po",
|
|
117
|
+
"*.mo",
|
|
118
|
+
"*.pot",
|
|
119
|
+
# Release-notes-style ``*.txt`` files only — narrow patterns; we are
|
|
120
|
+
# deliberately conservative here so a random ``requirements.txt`` is not
|
|
121
|
+
# ignored. The shapes below match common repo layouts (django, flask).
|
|
122
|
+
"release_notes/*.txt",
|
|
123
|
+
"docs/releases/*.txt",
|
|
124
|
+
"docs/release-notes/*.txt",
|
|
125
|
+
"release-notes/*.txt",
|
|
74
126
|
)
|
|
75
127
|
|
|
76
128
|
_USER_IGNORE_FILE = ".whycodeignore"
|
whycode/signals.py
CHANGED
|
@@ -13,6 +13,7 @@ from enum import StrEnum
|
|
|
13
13
|
from typing import TYPE_CHECKING
|
|
14
14
|
|
|
15
15
|
from whycode import git_facts as gf
|
|
16
|
+
from whycode import ignore as ign
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
18
19
|
from whycode.git_facts import RepoFacts
|
|
@@ -148,7 +149,23 @@ def detect_high_churn(facts: RepoFacts) -> Signal | None:
|
|
|
148
149
|
|
|
149
150
|
|
|
150
151
|
def detect_coupling(facts: RepoFacts) -> Signal | None:
|
|
151
|
-
|
|
152
|
+
"""Files that change together with the target file, ranked by frequency.
|
|
153
|
+
|
|
154
|
+
Co-change candidates are filtered through the same ignore list that
|
|
155
|
+
powers ``whycode scan`` (built-in defaults plus an optional repo-local
|
|
156
|
+
``.whycodeignore``). Without this filter, a per-file coupling signal
|
|
157
|
+
would surface ``CHANGELOG``, ``.github/workflows/*.yml``, ``AUTHORS``
|
|
158
|
+
and similar high-touch metadata as the file's "tight coupling" — the
|
|
159
|
+
field-test report flagged ``flask/app.py``'s top co-changers as 60%
|
|
160
|
+
metadata, leaving only two genuinely informative entries. Applying
|
|
161
|
+
the same filter here keeps the most-shown signal honest.
|
|
162
|
+
"""
|
|
163
|
+
patterns = ign.effective_patterns(facts.repo_root)
|
|
164
|
+
paired = [
|
|
165
|
+
(p, n)
|
|
166
|
+
for p, n in facts.co_changed_files.items()
|
|
167
|
+
if n >= COUPLING_MIN_COCHANGES and not ign.is_ignored(p, patterns)
|
|
168
|
+
]
|
|
152
169
|
if not paired:
|
|
153
170
|
return None
|
|
154
171
|
paired.sort(key=lambda x: (-x[1], x[0]))
|
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
whycode/__init__.py,sha256=
|
|
1
|
+
whycode/__init__.py,sha256=dPQOppaGvPoPBoACrHwxqGykCdDMNZRROtDjOmyRuf8,96
|
|
2
2
|
whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
|
|
3
3
|
whycode/cache.py,sha256=v55KbSlTqmP_ot1FEFqxCNpAApj6vthpHl2l0lGLX3A,17477
|
|
4
|
-
whycode/cli.py,sha256=
|
|
4
|
+
whycode/cli.py,sha256=OTYPhp8ItBXPRrQ1y6zGt0BwKyAYEuHAo3T0hMHqINk,47836
|
|
5
5
|
whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
|
|
6
|
-
whycode/git_facts.py,sha256=
|
|
7
|
-
whycode/ignore.py,sha256=
|
|
6
|
+
whycode/git_facts.py,sha256=vAeyhxZTrqa_6zmVuBV-06JhZ-TFBiRmcaISK1oOQjM,40162
|
|
7
|
+
whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
|
|
8
8
|
whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
|
|
9
9
|
whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
|
|
10
10
|
whycode/risk_card.py,sha256=xOJkHwIkS_6yw_dSowsQ6LHfeD9Mwr2tymL7_wqxs0U,8855
|
|
11
11
|
whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
|
|
12
|
-
whycode/signals.py,sha256=
|
|
12
|
+
whycode/signals.py,sha256=z0kZfXR60nS-j56nchHd1V3aK8A5CGR1BAyHZZAff3s,13899
|
|
13
13
|
whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
|
|
14
14
|
whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
|
|
16
16
|
whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
|
|
17
|
-
whycode_cli-0.4.
|
|
18
|
-
whycode_cli-0.4.
|
|
19
|
-
whycode_cli-0.4.
|
|
20
|
-
whycode_cli-0.4.
|
|
21
|
-
whycode_cli-0.4.
|
|
22
|
-
whycode_cli-0.4.
|
|
17
|
+
whycode_cli-0.4.1.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
|
|
18
|
+
whycode_cli-0.4.1.dist-info/METADATA,sha256=M2XBAL02LMRZtW4Pj4L3Gcuifqh2lIAQa_1Hpt3xfPI,10218
|
|
19
|
+
whycode_cli-0.4.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
whycode_cli-0.4.1.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
|
|
21
|
+
whycode_cli-0.4.1.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
|
|
22
|
+
whycode_cli-0.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|