whycode-cli 0.4.0__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- whycode/__init__.py +1 -1
- whycode/cache.py +33 -7
- whycode/cli.py +120 -34
- whycode/git_facts.py +298 -13
- whycode/ignore.py +53 -1
- whycode/signals.py +18 -1
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/METADATA +1 -1
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/RECORD +12 -12
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/WHEEL +0 -0
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/entry_points.txt +0 -0
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/licenses/LICENSE +0 -0
- {whycode_cli-0.4.0.dist-info → whycode_cli-0.4.2.dist-info}/top_level.txt +0 -0
whycode/__init__.py
CHANGED
whycode/cache.py
CHANGED
|
@@ -112,10 +112,21 @@ class CacheStore:
|
|
|
112
112
|
cache misses; this class never invokes ``git`` itself.
|
|
113
113
|
"""
|
|
114
114
|
|
|
115
|
-
def __init__(self, db_path: Path) -> None:
|
|
115
|
+
def __init__(self, db_path: Path, *, in_memory: bool = False) -> None:
|
|
116
|
+
"""Open (creating if needed) the SQLite cache at ``db_path``.
|
|
117
|
+
|
|
118
|
+
``in_memory=True`` opens a transient ``:memory:`` connection
|
|
119
|
+
instead — the disk file is never created and is never read.
|
|
120
|
+
Used by ``--no-cache`` to retain in-session amortisation
|
|
121
|
+
(matches the cold-fill code path) without persisting anything.
|
|
122
|
+
"""
|
|
116
123
|
self.db_path = db_path
|
|
117
|
-
self.
|
|
118
|
-
|
|
124
|
+
self._in_memory = in_memory
|
|
125
|
+
if in_memory:
|
|
126
|
+
self._conn = sqlite3.connect(":memory:")
|
|
127
|
+
else:
|
|
128
|
+
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
129
|
+
self._conn = sqlite3.connect(self.db_path)
|
|
119
130
|
# row_factory makes column access readable in tests / debug.
|
|
120
131
|
self._conn.row_factory = sqlite3.Row
|
|
121
132
|
self._conn.execute("PRAGMA foreign_keys = ON")
|
|
@@ -402,13 +413,18 @@ class CacheStore:
|
|
|
402
413
|
file_row_count = int(
|
|
403
414
|
self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
|
|
404
415
|
)
|
|
405
|
-
|
|
406
|
-
size_bytes = self.db_path.stat().st_size
|
|
407
|
-
except OSError:
|
|
416
|
+
if self._in_memory:
|
|
408
417
|
size_bytes = 0
|
|
418
|
+
exists = False
|
|
419
|
+
else:
|
|
420
|
+
try:
|
|
421
|
+
size_bytes = self.db_path.stat().st_size
|
|
422
|
+
except OSError:
|
|
423
|
+
size_bytes = 0
|
|
424
|
+
exists = self.db_path.exists()
|
|
409
425
|
return CacheStats(
|
|
410
426
|
path=self.db_path,
|
|
411
|
-
exists=
|
|
427
|
+
exists=exists,
|
|
412
428
|
schema_version=self.schema_version,
|
|
413
429
|
head_sha=self.head_sha,
|
|
414
430
|
commit_count=commit_count,
|
|
@@ -430,6 +446,16 @@ def open_for(repo_root: Path) -> CacheStore:
|
|
|
430
446
|
return CacheStore(cache_path_for(repo_root))
|
|
431
447
|
|
|
432
448
|
|
|
449
|
+
def open_in_memory(repo_root: Path) -> CacheStore:
    """Return a throwaway ``:memory:`` cache store for ``repo_root``.

    This is the ``--no-cache`` counterpart of ``open_for``: the store is
    built with ``in_memory=True``, so work done during one session can
    still be shared through it, while (per the ``CacheStore`` constructor
    contract) no database file is created or read.

    Args:
        repo_root: Repository root; only used to derive the nominal cache
            path that is handed to ``CacheStore``.

    Returns:
        A ``CacheStore`` backed by a transient in-memory SQLite database.
    """
    nominal_path = cache_path_for(repo_root)
    return CacheStore(nominal_path, in_memory=True)
|
|
457
|
+
|
|
458
|
+
|
|
433
459
|
def parse_authored_at(value: str) -> datetime:
|
|
434
460
|
"""Parse the ``authored_at`` string we stored from git.
|
|
435
461
|
|
whycode/cli.py
CHANGED
|
@@ -20,10 +20,12 @@ Commands
|
|
|
20
20
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
+
import functools
|
|
23
24
|
import json
|
|
24
25
|
import sys
|
|
26
|
+
from collections.abc import Callable
|
|
25
27
|
from pathlib import Path
|
|
26
|
-
from typing import Any
|
|
28
|
+
from typing import Any, TypeVar
|
|
27
29
|
|
|
28
30
|
import typer
|
|
29
31
|
from rich.console import Console
|
|
@@ -48,18 +50,27 @@ err = Console(stderr=True)
|
|
|
48
50
|
|
|
49
51
|
|
|
50
52
|
def _open_cache(repo_root: Path, no_cache: bool) -> ch.CacheStore | None:
    """Open the cache for ``repo_root`` according to the no-cache flag.

    Modes:
      * ``no_cache=False`` (the default): persistent on-disk SQLite at
        ``.whycode/cache.db``.
      * ``no_cache=True``: a transient ``:memory:`` SQLite store. The
        same git-walk code path runs as for the cold-fill, but the
        database is destroyed on ``close()`` — nothing lands on disk
        and the next run starts cold. Keeping per-run amortisation
        (one ``git log`` walk shared across files) is what makes
        ``--no-cache`` at most as slow as a cold persistent fill;
        the previous ``cache=None`` short-circuit lost that and so
        ``--no-cache`` re-issued per-file walks every iteration.

    Args:
        repo_root: Root of the repository whose cache should be opened.
        no_cache: ``True`` to use the transient in-memory store.

    Returns:
        The opened store, or ``None`` meaning "do not pass a cache through
        git_facts". ``None`` is returned when opening fails with an
        ``OSError`` (for example the cache directory cannot be created)
        or with a ``sqlite3.Error`` (SQLite cannot open or initialise the
        database). A cache problem must never block the main read path.
    """
    # Function-scope import: keeps this fix independent of the module's
    # import block.
    import sqlite3

    try:
        if no_cache:
            return ch.open_in_memory(repo_root)
        return ch.open_for(repo_root)
    except (OSError, sqlite3.Error):
        # ``sqlite3.connect`` / ``execute`` signal failures through the
        # sqlite3 exception hierarchy, which does not derive from OSError;
        # catching only OSError let a corrupt or unopenable cache file
        # crash the command instead of degrading to "no cache".
        return None
|
|
@@ -115,6 +126,37 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
|
|
|
115
126
|
return repo_root, rel
|
|
116
127
|
|
|
117
128
|
|
|
129
|
+
_F = TypeVar("_F", bound=Callable[..., Any])


def _propagate_failures(func: _F) -> _F:
    """Convert any otherwise-unhandled exception into ``typer.Exit(2)``.

    A read-only field test against psf/requests caught a bug where a single
    bad-timezone commit raised ``ValueError`` deep inside ``_parse_log_records``;
    Rich rendered the traceback to stderr, but the process exited with status
    0. CI integrations could not tell that the run had silently failed
    (a ``whycode diff --fail-on history`` step was reported as green even
    though it had crashed). Each command body is wrapped so an unhandled
    exception still gets a rich traceback on stderr but forces a non-zero
    exit code (``2`` for general failure).

    Exceptions that already carry their own exit semantics are re-raised
    untouched: ``typer.Exit`` and ``typer.Abort`` (explicit exit paths),
    ``typer.BadParameter`` (a Click usage error that Click itself renders
    and maps to an exit code), and ``KeyboardInterrupt`` (Ctrl-C).

    Args:
        func: The command function to wrap.

    Returns:
        A wrapper with ``func``'s metadata (via ``functools.wraps``) that
        applies the exit-code policy described above.
    """

    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            return func(*args, **kwargs)
        except (typer.Exit, typer.Abort, typer.BadParameter, KeyboardInterrupt):
            # Already meaningful to the CLI framework; do not turn a
            # usage error or an explicit exit into a traceback.
            raise
        except Exception as exc:
            err.print_exception(show_locals=False)
            raise typer.Exit(2) from exc

    return wrapper  # type: ignore[return-value]
|
|
158
|
+
|
|
159
|
+
|
|
118
160
|
# --- shared: band threshold parsing ----------------------------------------
|
|
119
161
|
|
|
120
162
|
_BAND_THRESHOLDS_BY_KEY: dict[str, int] = {
|
|
@@ -148,6 +190,7 @@ def _print_brief(card: rc.RiskCard) -> None:
|
|
|
148
190
|
|
|
149
191
|
|
|
150
192
|
@app.command()
|
|
193
|
+
@_propagate_failures
|
|
151
194
|
def why(
|
|
152
195
|
path: str = typer.Argument(..., help="File path to inspect."),
|
|
153
196
|
json_out: bool = typer.Option(
|
|
@@ -317,6 +360,7 @@ def _resolve_base_ref(repo_root: Path, requested: str | None) -> str:
|
|
|
317
360
|
|
|
318
361
|
|
|
319
362
|
@app.command()
|
|
363
|
+
@_propagate_failures
|
|
320
364
|
def diff(
|
|
321
365
|
base: str | None = typer.Option(
|
|
322
366
|
None, "--base", help="Base ref (default: origin/main → main → HEAD~1)."
|
|
@@ -390,7 +434,9 @@ def diff(
|
|
|
390
434
|
cards.append(rc.build(repo_root, f, cache=cache))
|
|
391
435
|
except gf.GitError:
|
|
392
436
|
continue
|
|
393
|
-
|
|
437
|
+
# Stable tie-break: lex smallest path on identical scores so cache
|
|
438
|
+
# and --no-cache truncate the same files at --top N.
|
|
439
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
394
440
|
cards = cards[:top]
|
|
395
441
|
finally:
|
|
396
442
|
if cache is not None:
|
|
@@ -482,6 +528,7 @@ def diff(
|
|
|
482
528
|
|
|
483
529
|
|
|
484
530
|
@app.command()
|
|
531
|
+
@_propagate_failures
|
|
485
532
|
def highlights(
|
|
486
533
|
invariants: int = typer.Option(
|
|
487
534
|
5, "--invariants", help="How many invariant lines to surface."
|
|
@@ -529,16 +576,17 @@ def highlights(
|
|
|
529
576
|
|
|
530
577
|
inv_pairs = gf.extract_invariant_quotes(commits)
|
|
531
578
|
sha_to_commit = {c.sha: c for c in commits}
|
|
532
|
-
|
|
533
|
-
for sha, line in inv_pairs:
|
|
534
|
-
seen_lines.setdefault(line, sha)
|
|
579
|
+
deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
|
|
535
580
|
inv_records: list[tuple[str, str, gf.Commit]] = []
|
|
536
|
-
for
|
|
581
|
+
for sha, line in deduped:
|
|
537
582
|
commit = sha_to_commit.get(sha)
|
|
538
583
|
if commit is None:
|
|
539
584
|
continue
|
|
540
585
|
inv_records.append((line, sha, commit))
|
|
541
|
-
|
|
586
|
+
# Sort newest first; on identical timestamps fall back to lexicographically
|
|
587
|
+
# smallest sha so cache and --no-cache emit byte-identical output.
|
|
588
|
+
inv_records.sort(key=lambda t: t[1]) # secondary: sha asc
|
|
589
|
+
inv_records.sort(key=lambda t: t[2].authored_at, reverse=True) # primary
|
|
542
590
|
inv_records = inv_records[:invariants]
|
|
543
591
|
|
|
544
592
|
incident_records = gf.find_incidents(commits)[:incidents]
|
|
@@ -636,6 +684,7 @@ def _sample_indices(total: int, max_samples: int) -> list[int]:
|
|
|
636
684
|
|
|
637
685
|
|
|
638
686
|
@app.command()
|
|
687
|
+
@_propagate_failures
|
|
639
688
|
def timeline(
|
|
640
689
|
path: str = typer.Argument(..., help="File path to inspect."),
|
|
641
690
|
samples: int = typer.Option(
|
|
@@ -677,6 +726,12 @@ def timeline(
|
|
|
677
726
|
top,
|
|
678
727
|
)
|
|
679
728
|
)
|
|
729
|
+
# Field-test report F14: ``timeline`` used to render rows in whatever
|
|
730
|
+
# non-monotonic order ``_sample_indices`` produced (uniform-across-index
|
|
731
|
+
# selection on a list whose ordering is git's parent traversal). Sort
|
|
732
|
+
# by date ascending before rendering so a reader can scan left-to-right
|
|
733
|
+
# without misreading the trajectory.
|
|
734
|
+
rows.sort(key=lambda r: r[0])
|
|
680
735
|
|
|
681
736
|
if json_out:
|
|
682
737
|
console.print_json(
|
|
@@ -714,6 +769,7 @@ def timeline(
|
|
|
714
769
|
|
|
715
770
|
|
|
716
771
|
@app.command()
|
|
772
|
+
@_propagate_failures
|
|
717
773
|
def scan(
|
|
718
774
|
top: int = typer.Option(10, "--top", help="How many files to list."),
|
|
719
775
|
sample: int = typer.Option(
|
|
@@ -783,7 +839,10 @@ def scan(
|
|
|
783
839
|
if cache is not None:
|
|
784
840
|
cache.close()
|
|
785
841
|
|
|
786
|
-
|
|
842
|
+
# Stable tie-break on identical scores: lexicographically smallest path
|
|
843
|
+
# so cache and --no-cache produce byte-identical text output for the
|
|
844
|
+
# same HEAD. Without this, the truncation at --top N is non-deterministic.
|
|
845
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
787
846
|
top_cards = cards[:top]
|
|
788
847
|
if not top_cards:
|
|
789
848
|
# Be honest about what "no flagged files" actually means. A user who
|
|
@@ -811,6 +870,7 @@ def scan(
|
|
|
811
870
|
|
|
812
871
|
|
|
813
872
|
@app.command()
|
|
873
|
+
@_propagate_failures
|
|
814
874
|
def honest(
|
|
815
875
|
path: str = typer.Argument(..., help="File path to inspect."),
|
|
816
876
|
json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of prose."),
|
|
@@ -874,6 +934,7 @@ def honest(
|
|
|
874
934
|
|
|
875
935
|
|
|
876
936
|
@app.command()
|
|
937
|
+
@_propagate_failures
|
|
877
938
|
def show(
|
|
878
939
|
sha: str = typer.Argument(..., help="Commit SHA (full or short) to inspect."),
|
|
879
940
|
repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
|
|
@@ -903,7 +964,8 @@ def show(
|
|
|
903
964
|
cards.append(rc.build(repo_root, change.path))
|
|
904
965
|
except gf.GitError:
|
|
905
966
|
continue
|
|
906
|
-
|
|
967
|
+
# Stable tie-break on identical scores: lex smallest path.
|
|
968
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
907
969
|
|
|
908
970
|
if json_out:
|
|
909
971
|
console.print_json(
|
|
@@ -981,6 +1043,7 @@ _MCP_SNIPPET = ''' {
|
|
|
981
1043
|
|
|
982
1044
|
|
|
983
1045
|
@app.command()
|
|
1046
|
+
@_propagate_failures
|
|
984
1047
|
def tour(
|
|
985
1048
|
repo: Path = typer.Option(Path("."), "--repo", help="Path inside the repo."),
|
|
986
1049
|
no_cache: bool = typer.Option(
|
|
@@ -1018,29 +1081,50 @@ def tour(
|
|
|
1018
1081
|
|
|
1019
1082
|
inv_pairs = gf.extract_invariant_quotes(commits)
|
|
1020
1083
|
sha_to_commit = {c.sha: c for c in commits}
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1084
|
+
deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
|
|
1085
|
+
# Sort newest first with sha-asc tie-break so cache and --no-cache
|
|
1086
|
+
# surface the same three lines in the same order.
|
|
1087
|
+
deduped_sorted = sorted(
|
|
1088
|
+
(p for p in deduped if p[0] in sha_to_commit),
|
|
1089
|
+
key=lambda p: p[0],
|
|
1090
|
+
)
|
|
1091
|
+
deduped_sorted.sort(
|
|
1092
|
+
key=lambda p: sha_to_commit[p[0]].authored_at, reverse=True
|
|
1093
|
+
)
|
|
1024
1094
|
invariants_top = [
|
|
1025
|
-
(line, sha_to_commit[sha])
|
|
1026
|
-
for line, sha in seen_lines.items()
|
|
1027
|
-
if sha in sha_to_commit
|
|
1095
|
+
(line, sha_to_commit[sha]) for sha, line in deduped_sorted
|
|
1028
1096
|
][:3]
|
|
1029
1097
|
incidents_top = gf.find_incidents(commits)[:3]
|
|
1030
1098
|
|
|
1031
1099
|
if invariants_top or incidents_top:
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1100
|
+
# Field-test report F16: the original tour rendered both classes
|
|
1101
|
+
# under one ``Decisions and incidents`` header, so a parenthetical
|
|
1102
|
+
# invariant prose line was visually indistinguishable from a real
|
|
1103
|
+
# incident commit. Render two subheads matching the layout
|
|
1104
|
+
# ``highlights`` already uses.
|
|
1105
|
+
if invariants_top:
|
|
1035
1106
|
console.print(
|
|
1036
|
-
f"
|
|
1107
|
+
f"[bold yellow]Stated invariants[/bold yellow] "
|
|
1108
|
+
f"[dim]({len(invariants_top)} most recent)[/dim]"
|
|
1037
1109
|
)
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1110
|
+
for line, c in invariants_top:
|
|
1111
|
+
console.print(f" [italic]{line}[/italic]")
|
|
1112
|
+
console.print(
|
|
1113
|
+
f" [dim]{c.sha[:7]} {c.authored_at.date()} "
|
|
1114
|
+
f"{c.author_name}[/dim]\n"
|
|
1115
|
+
)
|
|
1116
|
+
if incidents_top:
|
|
1041
1117
|
console.print(
|
|
1042
|
-
f"
|
|
1118
|
+
f"[bold red]Recent incidents[/bold red] "
|
|
1119
|
+
f"[dim]({len(incidents_top)} most recent)[/dim]"
|
|
1043
1120
|
)
|
|
1121
|
+
for c in incidents_top:
|
|
1122
|
+
subj = c.subject if len(c.subject) <= 70 else c.subject[:69] + "…"
|
|
1123
|
+
console.print(f" [red]{subj}[/red]")
|
|
1124
|
+
console.print(
|
|
1125
|
+
f" [dim]{c.sha[:7]} {c.authored_at.date()} "
|
|
1126
|
+
f"{c.author_name}[/dim]\n"
|
|
1127
|
+
)
|
|
1044
1128
|
else:
|
|
1045
1129
|
console.print(
|
|
1046
1130
|
"[dim]No headline decisions or incidents in recent history.[/dim]"
|
|
@@ -1072,7 +1156,8 @@ def tour(
|
|
|
1072
1156
|
]
|
|
1073
1157
|
if useful:
|
|
1074
1158
|
cards.append(card)
|
|
1075
|
-
|
|
1159
|
+
# Stable tie-break: lex smallest path on identical scores.
|
|
1160
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
1076
1161
|
|
|
1077
1162
|
if cards:
|
|
1078
1163
|
console.print("[bold red]Top 3 risky files[/bold red]")
|
|
@@ -1113,6 +1198,7 @@ def tour(
|
|
|
1113
1198
|
|
|
1114
1199
|
|
|
1115
1200
|
@app.command()
|
|
1201
|
+
@_propagate_failures
|
|
1116
1202
|
def init(
|
|
1117
1203
|
force: bool = typer.Option(
|
|
1118
1204
|
False, "--force", "-f", help="Overwrite existing files instead of skipping."
|
whycode/git_facts.py
CHANGED
|
@@ -18,10 +18,11 @@ from __future__ import annotations
|
|
|
18
18
|
|
|
19
19
|
import re
|
|
20
20
|
import subprocess
|
|
21
|
+
import sys
|
|
21
22
|
from collections import Counter
|
|
22
23
|
from collections.abc import Sequence
|
|
23
24
|
from dataclasses import dataclass, field
|
|
24
|
-
from datetime import datetime
|
|
25
|
+
from datetime import UTC, datetime
|
|
25
26
|
from pathlib import Path
|
|
26
27
|
from typing import TYPE_CHECKING
|
|
27
28
|
|
|
@@ -31,6 +32,17 @@ if TYPE_CHECKING:
|
|
|
31
32
|
UNIT_SEP = "\x1f"
|
|
32
33
|
RECORD_SEP = "\x1e"
|
|
33
34
|
|
|
35
|
+
# Per-process record of commits whose authored timestamp could not be parsed
|
|
36
|
+
# even after defensive normalisation. We surface these once per session via
|
|
37
|
+
# a single stderr line so a single bad record does not spam a per-line warning
|
|
38
|
+
# — never on every read, never to a network.
|
|
39
|
+
_UNPARSEABLE_TIMESTAMPS: set[str] = set()
|
|
40
|
+
_BAD_TZ_WARNING_EMITTED = False
|
|
41
|
+
# The Unix epoch as a tz-aware UTC datetime; used as a safe fallback when a
|
|
42
|
+
# commit's authored_at is irrecoverably malformed. Picked over datetime.min
|
|
43
|
+
# because callers expect a tz-aware value (signal age math compares to UTC).
|
|
44
|
+
_EPOCH_FALLBACK = datetime.fromtimestamp(0, UTC)
|
|
45
|
+
|
|
34
46
|
# A commit subject/body containing one of these markers is treated as evidence
|
|
35
47
|
# that the original author flagged something worth carrying forward.
|
|
36
48
|
INCIDENT_TOKENS: tuple[str, ...] = (
|
|
@@ -69,6 +81,39 @@ _BREAKING_CC_RE = re.compile(
|
|
|
69
81
|
_ISSUE_ID_RE = re.compile(
|
|
70
82
|
r"(?:#\d+|\b[A-Z][A-Z0-9_]+-\d+|\bSEV[- ]?\d\b|\bP[01]\b)",
|
|
71
83
|
)
|
|
84
|
+
# Security-advisory tokens fire as incidents on subject alone — the deliberate
|
|
85
|
+
# act of citing one is unambiguous high-confidence evidence.
|
|
86
|
+
_CVE_RE = re.compile(r"\bCVE-\d{4}-\d+\b")
|
|
87
|
+
_GHSA_RE = re.compile(r"\bGHSA-[a-z0-9-]+\b", re.IGNORECASE)
|
|
88
|
+
# Default ``git revert`` body subject ("Reverted ...") and the human variant
|
|
89
|
+
# ("Reverts <sha>") are both unambiguous incident-class evidence on subject.
|
|
90
|
+
_REVERTED_SUBJECT_RE = re.compile(r'^Reverted\s+"', re.IGNORECASE)
|
|
91
|
+
_REVERTS_SUBJECT_RE = re.compile(r"\bReverts\s+[0-9a-f]{7,}\b", re.IGNORECASE)
|
|
92
|
+
# Subject-level "regression" usage that is descriptive rather than incident:
|
|
93
|
+
# "regression test(s)", "regression suite", "no regression", "regression nature".
|
|
94
|
+
# These prevention/test-housekeeping phrases must NOT fire as incidents.
|
|
95
|
+
_BENIGN_REGRESSION_RE = re.compile(
|
|
96
|
+
r"\b(?:regression\s+(?:tests?|suite|nature)|no\s+regression)\b",
|
|
97
|
+
re.IGNORECASE,
|
|
98
|
+
)
|
|
99
|
+
# Conversely, a subject using "regression" with a corroborating incident id
|
|
100
|
+
# (``#1234``, ``INC-447``, …), or as part of an unambiguous incident phrase
|
|
101
|
+
# like ``regression in <something>`` / ``Fixed: regression`` / ``fix the
|
|
102
|
+
# regression``, IS an incident. These are the patterns that distinguish
|
|
103
|
+
# "split the regression-test files" (housekeeping) from "fix the refund
|
|
104
|
+
# regression" (a real outage marker).
|
|
105
|
+
_INCIDENT_REGRESSION_RE = re.compile(
|
|
106
|
+
# ``regression in <something>`` — "fix the refund regression in admin".
|
|
107
|
+
r"\bregression\s+in\b"
|
|
108
|
+
# ``fix the regression`` / ``fix a regression`` — explicit incident verb.
|
|
109
|
+
r"|\bfix(?:ed|es)?\s+(?:the\s+|a\s+)?regression\b"
|
|
110
|
+
# ``regression — …`` / ``regression: …`` — stated subject category.
|
|
111
|
+
r"|\bregression\s*[:—-]"
|
|
112
|
+
# ``Fixed: regression`` / ``Hotfix: regression`` — pre-colon incident verb
|
|
113
|
+
# explicitly framing the rest as the incident category itself.
|
|
114
|
+
r"|\b(?:fix(?:ed|es)?|hotfix|revert(?:ed)?)\s*:\s*regression\b",
|
|
115
|
+
re.IGNORECASE,
|
|
116
|
+
)
|
|
72
117
|
INVARIANT_TOKENS: tuple[str, ...] = (
|
|
73
118
|
"do not",
|
|
74
119
|
"don't",
|
|
@@ -187,8 +232,94 @@ def is_tracked(repo_root: Path, path: str) -> bool:
|
|
|
187
232
|
return bool(out.strip())
|
|
188
233
|
|
|
189
234
|
|
|
235
|
+
def _normalise_tz_offset(timestamp: str) -> str:
|
|
236
|
+
"""Repair pathological tz offsets that ``datetime.fromisoformat`` rejects.
|
|
237
|
+
|
|
238
|
+
Real-world git history contains commits authored on systems with broken
|
|
239
|
+
timezone configuration (e.g. an offset of ``+518:00`` or ``+51800`` —
|
|
240
|
+
encountered on a 2011 commit in psf/requests, where the underlying object
|
|
241
|
+
really stores ``+51800``). ``fromisoformat`` raises ``ValueError`` on
|
|
242
|
+
those, which would otherwise poison every command that walks history.
|
|
243
|
+
|
|
244
|
+
We coerce the suffix into the canonical ``[+-]HH:MM`` form when we can
|
|
245
|
+
recognise it. Anything else is left untouched and the caller falls back
|
|
246
|
+
to a safe default.
|
|
247
|
+
"""
|
|
248
|
+
stripped = timestamp.strip()
|
|
249
|
+
if "T" not in stripped:
|
|
250
|
+
return stripped
|
|
251
|
+
body, _, after_t = stripped.partition("T")
|
|
252
|
+
sign_idx = -1
|
|
253
|
+
for i, ch in enumerate(after_t):
|
|
254
|
+
if ch in "+-":
|
|
255
|
+
sign_idx = i
|
|
256
|
+
break
|
|
257
|
+
if sign_idx < 0:
|
|
258
|
+
return stripped
|
|
259
|
+
prefix = body + "T" + after_t[:sign_idx]
|
|
260
|
+
sign = after_t[sign_idx]
|
|
261
|
+
rest = after_t[sign_idx + 1 :]
|
|
262
|
+
digits = rest.replace(":", "")
|
|
263
|
+
if not digits.isdigit():
|
|
264
|
+
return stripped
|
|
265
|
+
# Acceptable shapes: 4 digits → HHMM; 5 digits → HHHMM (broken, e.g.
|
|
266
|
+
# ``+51800`` → hours ``5``, minutes ``18``); 6 digits → HHMMSS (rare).
|
|
267
|
+
if len(digits) == 4:
|
|
268
|
+
hh, mm = digits[:2], digits[2:]
|
|
269
|
+
elif len(digits) == 5:
|
|
270
|
+
hh, mm = "0" + digits[0], digits[1:3]
|
|
271
|
+
elif len(digits) == 6:
|
|
272
|
+
hh, mm = digits[:2], digits[2:4]
|
|
273
|
+
elif len(digits) == 2:
|
|
274
|
+
hh, mm = digits, "00"
|
|
275
|
+
else:
|
|
276
|
+
return stripped
|
|
277
|
+
try:
|
|
278
|
+
if int(hh) > 23 or int(mm) > 59:
|
|
279
|
+
return stripped
|
|
280
|
+
except ValueError:
|
|
281
|
+
return stripped
|
|
282
|
+
return f"{prefix}{sign}{hh}:{mm}"
|
|
283
|
+
|
|
284
|
+
|
|
190
285
|
def _parse_iso(timestamp: str) -> datetime:
    """Parse an ISO 8601 timestamp, tolerating malformed tz offsets.

    The stripped string is parsed as-is first. If that fails, the offset
    is passed through ``_normalise_tz_offset`` and, when that changed
    anything, the parse is retried once.

    Args:
        timestamp: Timestamp text; surrounding whitespace is ignored.

    Returns:
        The parsed datetime, or ``_EPOCH_FALLBACK`` when neither attempt
        succeeds. In the fallback case the stripped raw string is added to
        ``_UNPARSEABLE_TIMESTAMPS`` so the session can report it, and no
        exception escapes — one bad record must not abort a whole-repo
        analysis.
    """
    text = timestamp.strip()
    try:
        return datetime.fromisoformat(text)
    except ValueError:
        pass  # fall through to the repair attempt below

    mended = _normalise_tz_offset(text)
    parsed = None
    if mended != text:
        try:
            parsed = datetime.fromisoformat(mended)
        except ValueError:
            parsed = None
    if parsed is not None:
        return parsed

    _UNPARSEABLE_TIMESTAMPS.add(text)
    return _EPOCH_FALLBACK
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _maybe_warn_bad_timestamps() -> None:
    """Print one stderr warning per process about epoch-fallback timestamps.

    Does nothing when the warning was already printed or when
    ``_UNPARSEABLE_TIMESTAMPS`` is empty. Otherwise it sets
    ``_BAD_TZ_WARNING_EMITTED`` and writes a single line giving the number
    of affected records at that moment. Output goes to stderr only — no
    network, no telemetry.
    """
    global _BAD_TZ_WARNING_EMITTED
    if _BAD_TZ_WARNING_EMITTED:
        return
    bad_count = len(_UNPARSEABLE_TIMESTAMPS)
    if bad_count == 0:
        return

    # Latch before printing so the message is emitted at most once.
    _BAD_TZ_WARNING_EMITTED = True
    plural = "" if bad_count == 1 else "s"
    message = (
        f"warning: {bad_count} commit{plural} had an unparseable "
        "authored timestamp; treating those as epoch for date math."
    )
    print(message, file=sys.stderr)
|
|
192
323
|
|
|
193
324
|
|
|
194
325
|
def _log_format() -> str:
|
|
@@ -285,6 +416,7 @@ def commits_for_path(
|
|
|
285
416
|
cache.store_path_log(path, head_sha, [c.sha for c in commits])
|
|
286
417
|
if max_count is not None and len(commits) > max_count:
|
|
287
418
|
commits = commits[:max_count]
|
|
419
|
+
_maybe_warn_bad_timestamps()
|
|
288
420
|
return commits
|
|
289
421
|
|
|
290
422
|
|
|
@@ -342,12 +474,15 @@ def all_commits(
|
|
|
342
474
|
"""
|
|
343
475
|
if cache is not None:
|
|
344
476
|
full = _all_commits_via_cache(repo_root, cache)
|
|
477
|
+
_maybe_warn_bad_timestamps()
|
|
345
478
|
return full if max_count is None else full[:max_count]
|
|
346
479
|
args = ["log", "--no-merges", f"--pretty=format:{_log_format()}"]
|
|
347
480
|
if max_count is not None:
|
|
348
481
|
args.append(f"--max-count={max_count}")
|
|
349
482
|
raw = _run_git(repo_root, *args)
|
|
350
|
-
|
|
483
|
+
out = _parse_log_records(raw)
|
|
484
|
+
_maybe_warn_bad_timestamps()
|
|
485
|
+
return out
|
|
351
486
|
|
|
352
487
|
|
|
353
488
|
def _store_commits(cache: CacheStore, commits: Sequence[Commit]) -> None:
|
|
@@ -624,26 +759,77 @@ def find_revert_pairs(commits: Sequence[Commit]) -> list[tuple[str, str]]:
|
|
|
624
759
|
return pairs
|
|
625
760
|
|
|
626
761
|
|
|
762
|
+
def _is_subject_incident(subject: str, body: str) -> bool:
    """Determine whether a single commit's subject signals incident intent.

    The trickiest case is ``regression``. The 0.4.0 classifier accepted any
    subject that contained the word and so flagged routine bug fixes that
    happened to mention "regression tests" or "regression nature" as
    incidents. The rule applied here:

    - Subjects citing a security advisory (``CVE-…`` / ``GHSA-…``) always
      fire — the act of naming an advisory is unambiguous high-confidence.
    - ``Reverted "…"`` and ``Reverts <sha>`` subjects always fire — both
      are explicit rollback markers.
    - The Conventional Commits breaking marker on the subject always fires.
    - The phrases ``regression test(s)``, ``regression suite``,
      ``no regression``, ``regression nature`` never fire on their own:
      they are blanked out before any further matching, so they neither
      trigger nor veto a verdict based on the rest of the subject.
    - A remaining ``regression`` fires only when corroborated:
        * a pre-marker that anchors the word as an incident reference,
          such as ``regression in <something>`` / ``Fixed: regression`` /
          ``fix the regression`` / ``regression — …``; OR
        * an issue / incident id on the subject or anywhere in the body
          (``#1234``, ``INC-447``, ``SEV-1``, …).
    - Other incident keywords (``hotfix``, ``outage``, ``rollback``, …)
      keep their existing subject-level acceptance, including when the
      subject also contains a benign regression phrase.

    Args:
        subject: The commit subject line.
        body: The commit body; only consulted for a corroborating id.

    Returns:
        ``True`` if the subject counts as incident evidence.
    """
    if _CVE_RE.search(subject) or _GHSA_RE.search(subject):
        return True
    if _REVERTED_SUBJECT_RE.search(subject) or _REVERTS_SUBJECT_RE.search(subject):
        return True
    if _BREAKING_CC_RE.search(subject):
        return True

    # Blank out benign phrases instead of returning False on sight: a
    # subject such as "hotfix: … add regression test" must still be judged
    # on its other words. A space (not "") keeps neighbouring words apart.
    # When no benign phrase is present this leaves the subject unchanged.
    remainder = _BENIGN_REGRESSION_RE.sub(" ", subject)

    if re.search(r"\bregression\b", remainder, re.IGNORECASE):
        # Regression demands corroboration: either it appears as part of a
        # high-confidence phrase, or an issue/incident id is present.
        if _INCIDENT_REGRESSION_RE.search(remainder):
            return True
        if _ISSUE_ID_RE.search(subject) or _ISSUE_ID_RE.search(body):
            return True
        # Strip the word and check whether any other incident keyword
        # carries the subject — a "rollback regression" should still fire
        # on "rollback" alone.
        remainder = re.sub(r"\bregression\b", "", remainder, flags=re.IGNORECASE)
    return bool(_INCIDENT_RE.search(remainder))
|
|
808
|
+
|
|
809
|
+
|
|
627
810
|
def find_incidents(commits: Sequence[Commit]) -> list[Commit]:
|
|
628
811
|
"""Return commits whose evidence-level signals incident-flavored intent.
|
|
629
812
|
|
|
630
813
|
Acceptance ladder (highest to lowest confidence):
|
|
631
|
-
1. Subject
|
|
632
|
-
|
|
633
|
-
2. Subject
|
|
814
|
+
1. Subject cites a security advisory (``CVE-…`` / ``GHSA-…``) — fires
|
|
815
|
+
on subject alone.
|
|
816
|
+
2. Subject is a default ``git revert`` body (``Reverted "…"``) or a
|
|
817
|
+
human revert pointer (``Reverts <sha>``) — fires on subject alone.
|
|
818
|
+
3. Subject carries the Conventional Commits breaking marker
|
|
634
819
|
(``feat!:`` / ``fix!:`` / …).
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
820
|
+
4. Subject contains an incident keyword that is NOT a benign
|
|
821
|
+
"regression test/suite/nature" phrase. ``regression`` requires
|
|
822
|
+
either an issue id (on subject or body) or a pre-marker that
|
|
823
|
+
anchors it as an incident reference.
|
|
824
|
+
5. Body carries the structured ``BREAKING CHANGE:`` footer.
|
|
825
|
+
6. Body contains an incident keyword AND an issue / incident
|
|
826
|
+
identifier nearby. Filters out passing mentions in prose.
|
|
641
827
|
|
|
642
828
|
A bare body keyword with no corroborating ID does NOT fire.
|
|
643
829
|
"""
|
|
644
830
|
out: list[Commit] = []
|
|
645
831
|
for c in commits:
|
|
646
|
-
if
|
|
832
|
+
if _is_subject_incident(c.subject, c.body):
|
|
647
833
|
out.append(c)
|
|
648
834
|
continue
|
|
649
835
|
if _BREAKING_FOOTER_RE.search(c.body):
|
|
@@ -696,6 +882,41 @@ def _all_matches_are_quoted(line: str, regex: re.Pattern[str]) -> bool:
|
|
|
696
882
|
return True
|
|
697
883
|
|
|
698
884
|
|
|
885
|
+
# An ALLCAPS line prefix (e.g. ``WARNING:``, ``ERROR:``, ``DEBUG:``) is the
# canonical signature of pasted compiler / linter / spell-checker output.
# A genuine human invariant statement opens with a normal sentence ("Do
# not...", "Important: ...") rather than with two or more uppercase letters
# followed by an immediate colon.  The pattern also requires whitespace
# after the colon.
_TOOL_OUTPUT_ALLCAPS_RE = re.compile(r"^[A-Z]{2,}:\s")
# A ``path:line:`` or ``path:line:col:`` prefix at the start of a line is
# the unmistakable shape of compiler / aspell output.  We accept any path-
# shaped token (slashes, dots, hyphens, underscores, alnum) followed by
# ``:<digits>:``.  Because ``.`` and ``/`` are part of the character class,
# relative forms such as ``./foo/bar.py:50:`` are caught as well.
_TOOL_OUTPUT_PATH_RE = re.compile(r"^[\w./-]+:\d+:")
# Per-(commit, file) cap on invariant lines pulled from one body.  A real
# author rarely states more than two crisp invariants in a single message;
# anything beyond is almost certainly a paste.  Set deliberately low so a
# single noisy commit can no longer dominate the "highlights" view.
_PER_COMMIT_INVARIANT_CAP = 2


def _is_tool_output_line(line: str, prev_line: str) -> bool:
    """Return True if ``line`` looks like quoted compiler / linter / aspell output.

    Args:
        line: The candidate line.  Both prefix patterns are matched at the
            very first character, so the caller is expected to pass the line
            with leading whitespace already removed.
        prev_line: The line immediately before ``line`` in the same body.
            It may be passed raw (unstripped); leading whitespace is ignored
            here.

    Heuristics (any one is sufficient):
    - ALLCAPS followed immediately by a colon (``WARNING:``, ``ERROR:``,
      ``DEBUG:``…) — pasted tool output.
    - ``path/to/file:line:`` prefix at the start — clang / mypy / aspell.
    - Preceded by a ``> `` block-quote line — markdown-style "this is
      what the tool said" framing.
    """
    if _TOOL_OUTPUT_ALLCAPS_RE.match(line) or _TOOL_OUTPUT_PATH_RE.match(line):
        return True
    # ``line`` arrives stripped but ``prev_line`` may still carry its original
    # indentation; strip it so an indented block-quote is recognised too.
    return prev_line.lstrip().startswith("> ")
|
|
918
|
+
|
|
919
|
+
|
|
699
920
|
def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]:
|
|
700
921
|
"""Pull lines from commit *bodies* that match invariant tokens.
|
|
701
922
|
|
|
@@ -706,23 +927,87 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
|
|
|
706
927
|
eliminates the meta-mention failure mode where a commit *about* an
|
|
707
928
|
invariant token (e.g. "fix invariant matcher") would self-flag.
|
|
708
929
|
|
|
930
|
+
Two filters keep pasted tool output out of the "stated invariants"
|
|
931
|
+
surface:
|
|
932
|
+
|
|
933
|
+
1. Lines that look like quoted compiler / linter / aspell output are
|
|
934
|
+
dropped (``WARNING: …``, ``foo/bar.py:50: …``, lines preceded by a
|
|
935
|
+
``> `` block-quote). One noisy spell-check commit on django used to
|
|
936
|
+
supply 15 of the top-20 highlights; this rule kills it at the
|
|
937
|
+
source.
|
|
938
|
+
2. A per-commit cap of two invariants. Real authors rarely state more
|
|
939
|
+
than two crisp constraints in one message; anything beyond is
|
|
940
|
+
almost certainly a paste. The first two matches are preserved
|
|
941
|
+
(in body order); any later matches in the same commit are dropped.
|
|
942
|
+
|
|
709
943
|
Lines where every matching token is wrapped in quotes (``"do not"``) are
|
|
710
944
|
treated as references rather than statements and are skipped.
|
|
711
945
|
"""
|
|
712
946
|
out: list[tuple[str, str]] = []
|
|
713
947
|
for commit in commits:
|
|
948
|
+
per_commit = 0
|
|
949
|
+
prev_line = ""
|
|
714
950
|
for raw_line in commit.body.splitlines():
|
|
715
951
|
line = raw_line.strip()
|
|
716
952
|
if not line:
|
|
953
|
+
prev_line = raw_line
|
|
717
954
|
continue
|
|
955
|
+
if _is_tool_output_line(line, prev_line):
|
|
956
|
+
prev_line = raw_line
|
|
957
|
+
continue
|
|
958
|
+
prev_line = raw_line
|
|
718
959
|
if not _INVARIANT_RE.search(line):
|
|
719
960
|
continue
|
|
720
961
|
if _all_matches_are_quoted(line, _INVARIANT_RE):
|
|
721
962
|
continue
|
|
963
|
+
if per_commit >= _PER_COMMIT_INVARIANT_CAP:
|
|
964
|
+
continue
|
|
722
965
|
out.append((commit.sha, line[:200]))
|
|
966
|
+
per_commit += 1
|
|
723
967
|
return out
|
|
724
968
|
|
|
725
969
|
|
|
970
|
+
def dedupe_invariant_lines(
    pairs: Sequence[tuple[str, str]],
    sha_to_commit: dict[str, Commit],
) -> list[tuple[str, str]]:
    """Reduce repeated invariant lines to a single ``(sha, line)`` pair each.

    The same invariant text can appear in several commits (a cherry-pick onto
    a maintenance branch, a rebase that duplicated the message).  Exactly one
    of them is surfaced, chosen by a deterministic rule so that different
    walk orders do not produce different output.

    Selection rule for two commits stating the same line:

    1. The one with the earliest ``authored_at`` wins.
    2. On identical timestamps, the lexicographically smallest ``sha`` wins.

    Args:
        pairs: ``(sha, line)`` tuples, possibly containing repeated lines.
        sha_to_commit: Lookup from sha to its commit record; only the
            ``authored_at`` attribute of each record is read here.

    Returns:
        One ``(sha, line)`` tuple per distinct line, in the order each line
        was first encountered in ``pairs``.  If either the currently held sha
        or the challenger is missing from ``sha_to_commit``, the held sha is
        kept, because there is no metadata to compare on.
    """
    winner_by_line: dict[str, str] = {}
    for candidate_sha, text in pairs:
        if text not in winner_by_line:
            # First sighting of this line: it is the provisional winner.
            winner_by_line[text] = candidate_sha
            continue

        holder_sha = winner_by_line[text]
        holder = sha_to_commit.get(holder_sha)
        challenger = sha_to_commit.get(candidate_sha)
        if holder is not None and challenger is not None:
            # Tuple comparison: timestamp first, sha as the tie-breaker.
            challenger_rank = (challenger.authored_at, candidate_sha)
            holder_rank = (holder.authored_at, holder_sha)
            if challenger_rank < holder_rank:
                # Re-assigning an existing key keeps its original position,
                # so first-encounter order of lines is preserved.
                winner_by_line[text] = candidate_sha

    return [(kept_sha, text) for text, kept_sha in winner_by_line.items()]
|
|
1009
|
+
|
|
1010
|
+
|
|
726
1011
|
def author_last_activity(repo_root: Path, email: str) -> datetime | None:
|
|
727
1012
|
"""Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
|
|
728
1013
|
raw = _run_git(
|
whycode/ignore.py
CHANGED
|
@@ -3,7 +3,16 @@
|
|
|
3
3
|
These are paths/files that almost always pollute risk analysis without
|
|
4
4
|
adding signal: changelogs (touched on every release, so they look "tightly
|
|
5
5
|
coupled to everything"), lockfiles (regenerated on every dependency bump),
|
|
6
|
-
vendored third-party code,
|
|
6
|
+
vendored third-party code, machine-generated stubs, CI / packaging
|
|
7
|
+
metadata, project-membership files (``AUTHORS``, ``LICENSE``), and
|
|
8
|
+
translation catalogues (``*.po`` / ``*.mo``).
|
|
9
|
+
|
|
10
|
+
A field test against django (10,000 commits, 7,043 files) showed the
|
|
11
|
+
top-10 risk list was dominated by these high-touch metadata files —
|
|
12
|
+
``AUTHORS``, ``.github/workflows/*.yml``, locale ``.po``, ``.gitignore``
|
|
13
|
+
— and no application code at all reached the top 10. A scan-top list
|
|
14
|
+
that surfaces zero source files is unactionable; demoting these
|
|
15
|
+
metadata files lets real source code rank.
|
|
7
16
|
|
|
8
17
|
Users can extend this list with a ``.whycodeignore`` file at repo root,
|
|
9
18
|
one ``fnmatch``-style pattern per line. Comments start with ``#``.
|
|
@@ -71,6 +80,49 @@ DEFAULT_IGNORE_PATTERNS: tuple[str, ...] = (
|
|
|
71
80
|
"*.ttf",
|
|
72
81
|
"*.otf",
|
|
73
82
|
"*.eot",
|
|
83
|
+
# CI / repo metadata — high-touch but never the source of risk in code.
|
|
84
|
+
".github/**",
|
|
85
|
+
".gitlab/**",
|
|
86
|
+
".circleci/**",
|
|
87
|
+
".gitignore",
|
|
88
|
+
".gitattributes",
|
|
89
|
+
".editorconfig",
|
|
90
|
+
".pre-commit-config.yaml",
|
|
91
|
+
".readthedocs.yaml",
|
|
92
|
+
".readthedocs.yml",
|
|
93
|
+
".flake8",
|
|
94
|
+
".coveragerc",
|
|
95
|
+
"tox.ini",
|
|
96
|
+
"pytest.ini",
|
|
97
|
+
"Makefile",
|
|
98
|
+
# Project-membership / licensing files — touched on every contributor add.
|
|
99
|
+
"AUTHORS",
|
|
100
|
+
"AUTHORS.*",
|
|
101
|
+
"CONTRIBUTORS",
|
|
102
|
+
"CONTRIBUTORS.*",
|
|
103
|
+
"LICENSE",
|
|
104
|
+
"LICENSE.*",
|
|
105
|
+
"LICENSES/**",
|
|
106
|
+
"COPYING",
|
|
107
|
+
"COPYING.*",
|
|
108
|
+
"NOTICE",
|
|
109
|
+
"NOTICE.*",
|
|
110
|
+
# Python packaging metadata — low-signal-per-touch.
|
|
111
|
+
"setup.py",
|
|
112
|
+
"setup.cfg",
|
|
113
|
+
"MANIFEST.in",
|
|
114
|
+
# Translation catalogues — bulk-edited every release, never an indicator
|
|
115
|
+
# of code risk.
|
|
116
|
+
"*.po",
|
|
117
|
+
"*.mo",
|
|
118
|
+
"*.pot",
|
|
119
|
+
# Release-notes-style ``*.txt`` files only — narrow patterns; we are
|
|
120
|
+
# deliberately conservative here so a random ``requirements.txt`` is not
|
|
121
|
+
# ignored. The shapes below match common repo layouts (django, flask).
|
|
122
|
+
"release_notes/*.txt",
|
|
123
|
+
"docs/releases/*.txt",
|
|
124
|
+
"docs/release-notes/*.txt",
|
|
125
|
+
"release-notes/*.txt",
|
|
74
126
|
)
|
|
75
127
|
|
|
76
128
|
_USER_IGNORE_FILE = ".whycodeignore"
|
whycode/signals.py
CHANGED
|
@@ -13,6 +13,7 @@ from enum import StrEnum
|
|
|
13
13
|
from typing import TYPE_CHECKING
|
|
14
14
|
|
|
15
15
|
from whycode import git_facts as gf
|
|
16
|
+
from whycode import ignore as ign
|
|
16
17
|
|
|
17
18
|
if TYPE_CHECKING:
|
|
18
19
|
from whycode.git_facts import RepoFacts
|
|
@@ -148,7 +149,23 @@ def detect_high_churn(facts: RepoFacts) -> Signal | None:
|
|
|
148
149
|
|
|
149
150
|
|
|
150
151
|
def detect_coupling(facts: RepoFacts) -> Signal | None:
|
|
151
|
-
|
|
152
|
+
"""Files that change together with the target file, ranked by frequency.
|
|
153
|
+
|
|
154
|
+
Co-change candidates are filtered through the same ignore list that
|
|
155
|
+
powers ``whycode scan`` (built-in defaults plus an optional repo-local
|
|
156
|
+
``.whycodeignore``). Without this filter, a per-file coupling signal
|
|
157
|
+
would surface ``CHANGELOG``, ``.github/workflows/*.yml``, ``AUTHORS``
|
|
158
|
+
and similar high-touch metadata as the file's "tight coupling" — the
|
|
159
|
+
field-test report flagged ``flask/app.py``'s top co-changers as 60%
|
|
160
|
+
metadata, leaving only two genuinely informative entries. Applying
|
|
161
|
+
the same filter here keeps the most-shown signal honest.
|
|
162
|
+
"""
|
|
163
|
+
patterns = ign.effective_patterns(facts.repo_root)
|
|
164
|
+
paired = [
|
|
165
|
+
(p, n)
|
|
166
|
+
for p, n in facts.co_changed_files.items()
|
|
167
|
+
if n >= COUPLING_MIN_COCHANGES and not ign.is_ignored(p, patterns)
|
|
168
|
+
]
|
|
152
169
|
if not paired:
|
|
153
170
|
return None
|
|
154
171
|
paired.sort(key=lambda x: (-x[1], x[0]))
|
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
whycode/__init__.py,sha256=
|
|
1
|
+
whycode/__init__.py,sha256=YXMeIO9f86OJ3_EonP3wlcLW6Qv9sIHQQZqr-Ja4HV8,96
|
|
2
2
|
whycode/__main__.py,sha256=dqAk6746YpuM-FTIH4TBOULegGc5WweojiZjce0VYgQ,105
|
|
3
|
-
whycode/cache.py,sha256=
|
|
4
|
-
whycode/cli.py,sha256=
|
|
3
|
+
whycode/cache.py,sha256=0cEPZHdolQbSiBLAOnMu20tobIrc7G0MNycpldHRpkk,18536
|
|
4
|
+
whycode/cli.py,sha256=uRW5aysC2ufYvs_qPC1gzZcjQTFUZHdXxAmF25d4oY8,49328
|
|
5
5
|
whycode/decisions.py,sha256=oCVhEF7QfHeci0LAWNtEjV2mUAEBJloL1rT3I4XXbkw,7570
|
|
6
|
-
whycode/git_facts.py,sha256=
|
|
7
|
-
whycode/ignore.py,sha256=
|
|
6
|
+
whycode/git_facts.py,sha256=MLp8e4nGaam6lBGCHY5-sftHj71lyg_HmmBOBx3g-kg,41829
|
|
7
|
+
whycode/ignore.py,sha256=O_8bHIt0d1U-sYrBajBa7oEqpnHWU3f6Zf-8PU8CpO0,4748
|
|
8
8
|
whycode/llm.py,sha256=leB94pBg8kUCq_BujZq5ixny0urGtKskjdaKoum_eCA,4092
|
|
9
9
|
whycode/mcp_server.py,sha256=ht1tStAkOwmQzNIRkm1eA8Tnc59fzDRSGkgyIprft-0,18503
|
|
10
10
|
whycode/risk_card.py,sha256=xOJkHwIkS_6yw_dSowsQ6LHfeD9Mwr2tymL7_wqxs0U,8855
|
|
11
11
|
whycode/scorer.py,sha256=4pBejunfxzYhGUzMeL8uGEMQzC6DWiqwcTeMdo3eras,1444
|
|
12
|
-
whycode/signals.py,sha256=
|
|
12
|
+
whycode/signals.py,sha256=z0kZfXR60nS-j56nchHd1V3aK8A5CGR1BAyHZZAff3s,13899
|
|
13
13
|
whycode/suppressions.py,sha256=1lKSs-kCgpnJbcxozcgiSP8ZAfjEDMHXuM3sw4FaY78,3836
|
|
14
14
|
whycode/templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
15
|
whycode/templates/github-workflow.yml,sha256=LAfHMDG2TkAwi4vCNinHk-4zOt-mCWErBpmpaqlW5oA,2251
|
|
16
16
|
whycode/templates/pre-commit,sha256=IhU11CvoDwqRAAsvHwUo-BwaNbdgy1cpXc54Z_phrmQ,316
|
|
17
|
-
whycode_cli-0.4.
|
|
18
|
-
whycode_cli-0.4.
|
|
19
|
-
whycode_cli-0.4.
|
|
20
|
-
whycode_cli-0.4.
|
|
21
|
-
whycode_cli-0.4.
|
|
22
|
-
whycode_cli-0.4.
|
|
17
|
+
whycode_cli-0.4.2.dist-info/licenses/LICENSE,sha256=U6LN5qg5kJXSJf7KFPm9KJhmiGn3qK_GsTVWXdt1DFA,1062
|
|
18
|
+
whycode_cli-0.4.2.dist-info/METADATA,sha256=GD3cP18eEcHePHEXxroFuuZ-2pysLn51biNROQKDBXw,10218
|
|
19
|
+
whycode_cli-0.4.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
20
|
+
whycode_cli-0.4.2.dist-info/entry_points.txt,sha256=xrNWc4CQn3ZhQFJxsGIPiTqpN19K4pRpgaj6qGaEzSQ,44
|
|
21
|
+
whycode_cli-0.4.2.dist-info/top_level.txt,sha256=6yIL5rxW-4DbARHQYrPlGQVqKddZ88sjvmNosDh1w3A,8
|
|
22
|
+
whycode_cli-0.4.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|