whycode-cli 0.4.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/PKG-INFO +1 -1
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/pyproject.toml +1 -1
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/__init__.py +1 -1
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/cache.py +33 -7
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/cli.py +133 -33
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/git_facts.py +277 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/risk_card.py +92 -2
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode_cli.egg-info/PKG-INFO +1 -1
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_cache.py +51 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_cli.py +113 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_git_facts.py +89 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/LICENSE +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/README.md +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/setup.cfg +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/__main__.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/decisions.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/ignore.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/llm.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/mcp_server.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/scorer.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/signals.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/suppressions.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/templates/__init__.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/templates/github-workflow.yml +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode/templates/pre-commit +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode_cli.egg-info/SOURCES.txt +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode_cli.egg-info/dependency_links.txt +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode_cli.egg-info/entry_points.txt +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode_cli.egg-info/requires.txt +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/src/whycode_cli.egg-info/top_level.txt +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_decisions.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_ignore.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_mcp_prompts.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_scorer.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_signals.py +0 -0
- {whycode_cli-0.4.1 → whycode_cli-0.5.0}/tests/test_suppressions.py +0 -0
|
@@ -112,10 +112,21 @@ class CacheStore:
|
|
|
112
112
|
cache misses; this class never invokes ``git`` itself.
|
|
113
113
|
"""
|
|
114
114
|
|
|
115
|
-
def __init__(self, db_path: Path) -> None:
|
|
115
|
+
def __init__(self, db_path: Path, *, in_memory: bool = False) -> None:
|
|
116
|
+
"""Open (creating if needed) the SQLite cache at ``db_path``.
|
|
117
|
+
|
|
118
|
+
``in_memory=True`` opens a transient ``:memory:`` connection
|
|
119
|
+
instead — the disk file is never created and is never read.
|
|
120
|
+
Used by ``--no-cache`` to retain in-session amortisation
|
|
121
|
+
(matches the cold-fill code path) without persisting anything.
|
|
122
|
+
"""
|
|
116
123
|
self.db_path = db_path
|
|
117
|
-
self.
|
|
118
|
-
|
|
124
|
+
self._in_memory = in_memory
|
|
125
|
+
if in_memory:
|
|
126
|
+
self._conn = sqlite3.connect(":memory:")
|
|
127
|
+
else:
|
|
128
|
+
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
|
129
|
+
self._conn = sqlite3.connect(self.db_path)
|
|
119
130
|
# row_factory makes column access readable in tests / debug.
|
|
120
131
|
self._conn.row_factory = sqlite3.Row
|
|
121
132
|
self._conn.execute("PRAGMA foreign_keys = ON")
|
|
@@ -402,13 +413,18 @@ class CacheStore:
|
|
|
402
413
|
file_row_count = int(
|
|
403
414
|
self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
|
|
404
415
|
)
|
|
405
|
-
|
|
406
|
-
size_bytes = self.db_path.stat().st_size
|
|
407
|
-
except OSError:
|
|
416
|
+
if self._in_memory:
|
|
408
417
|
size_bytes = 0
|
|
418
|
+
exists = False
|
|
419
|
+
else:
|
|
420
|
+
try:
|
|
421
|
+
size_bytes = self.db_path.stat().st_size
|
|
422
|
+
except OSError:
|
|
423
|
+
size_bytes = 0
|
|
424
|
+
exists = self.db_path.exists()
|
|
409
425
|
return CacheStats(
|
|
410
426
|
path=self.db_path,
|
|
411
|
-
exists=
|
|
427
|
+
exists=exists,
|
|
412
428
|
schema_version=self.schema_version,
|
|
413
429
|
head_sha=self.head_sha,
|
|
414
430
|
commit_count=commit_count,
|
|
@@ -430,6 +446,16 @@ def open_for(repo_root: Path) -> CacheStore:
|
|
|
430
446
|
return CacheStore(cache_path_for(repo_root))
|
|
431
447
|
|
|
432
448
|
|
|
449
|
+
def open_in_memory(repo_root: Path) -> CacheStore:
|
|
450
|
+
"""Open a transient in-memory cache for ``repo_root``.
|
|
451
|
+
|
|
452
|
+
Used by ``--no-cache`` to keep within-session amortisation (the same
|
|
453
|
+
cold-fill code path everything else uses) while never touching disk.
|
|
454
|
+
The store is destroyed on ``close()`` and has no after-effects.
|
|
455
|
+
"""
|
|
456
|
+
return CacheStore(cache_path_for(repo_root), in_memory=True)
|
|
457
|
+
|
|
458
|
+
|
|
433
459
|
def parse_authored_at(value: str) -> datetime:
|
|
434
460
|
"""Parse the ``authored_at`` string we stored from git.
|
|
435
461
|
|
|
@@ -20,10 +20,11 @@ Commands
|
|
|
20
20
|
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
23
|
+
import contextlib
|
|
23
24
|
import functools
|
|
24
25
|
import json
|
|
25
26
|
import sys
|
|
26
|
-
from collections.abc import Callable
|
|
27
|
+
from collections.abc import Callable, Iterator
|
|
27
28
|
from pathlib import Path
|
|
28
29
|
from typing import Any, TypeVar
|
|
29
30
|
|
|
@@ -50,18 +51,27 @@ err = Console(stderr=True)
|
|
|
50
51
|
|
|
51
52
|
|
|
52
53
|
def _open_cache(repo_root: Path, no_cache: bool) -> ch.CacheStore | None:
|
|
53
|
-
"""Open the
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
54
|
+
"""Open the cache for ``repo_root`` according to the no-cache flag.
|
|
55
|
+
|
|
56
|
+
Modes:
|
|
57
|
+
* ``no_cache=False`` (the default): persistent on-disk SQLite at
|
|
58
|
+
``.whycode/cache.db``.
|
|
59
|
+
* ``no_cache=True``: a transient ``:memory:`` SQLite store. The
|
|
60
|
+
same git-walk code path runs as for the cold-fill, but the
|
|
61
|
+
database is destroyed on ``close()`` — nothing lands on disk
|
|
62
|
+
and the next run starts cold. Keeping per-run amortisation
|
|
63
|
+
(one ``git log`` walk shared across files) is what makes
|
|
64
|
+
``--no-cache`` at most as slow as a cold persistent fill;
|
|
65
|
+
the previous ``cache=None`` short-circuit lost that and so
|
|
66
|
+
``--no-cache`` re-issued per-file walks every iteration.
|
|
67
|
+
|
|
68
|
+
A ``None`` return means "do not pass a cache through git_facts".
|
|
69
|
+
Happens only when even an in-memory open fails — very rare and
|
|
70
|
+
we never want a cache problem to block the main read path.
|
|
61
71
|
"""
|
|
62
|
-
if no_cache:
|
|
63
|
-
return None
|
|
64
72
|
try:
|
|
73
|
+
if no_cache:
|
|
74
|
+
return ch.open_in_memory(repo_root)
|
|
65
75
|
return ch.open_for(repo_root)
|
|
66
76
|
except OSError:
|
|
67
77
|
return None
|
|
@@ -117,6 +127,42 @@ def _require_tracked(path_arg: str) -> tuple[Path, str]:
|
|
|
117
127
|
return repo_root, rel
|
|
118
128
|
|
|
119
129
|
|
|
130
|
+
@contextlib.contextmanager
|
|
131
|
+
def _memoised_is_ignored(repo_root: Path) -> Iterator[None]:
|
|
132
|
+
"""Memoise ``ign.is_ignored`` for the duration of the ``with`` block.
|
|
133
|
+
|
|
134
|
+
The diff command's evaluation re-applies the same ``is_ignored`` test
|
|
135
|
+
against thousands of co-change candidates per file. Each call resolves
|
|
136
|
+
fnmatch over ~83 patterns; uncached, that is ~100 CPU-seconds across
|
|
137
|
+
a 1,927-file diff on django.
|
|
138
|
+
|
|
139
|
+
A path's verdict is fully determined by the path string and the
|
|
140
|
+
repo's effective ignore-pattern tuple, so we cache by ``(path,
|
|
141
|
+
patterns)`` for the duration of the diff and restore the original
|
|
142
|
+
function on exit. The cache is process-local; the rest of the CLI
|
|
143
|
+
(``why``, ``scan``, …) sees the un-memoised function. ``ign`` itself
|
|
144
|
+
is unchanged.
|
|
145
|
+
"""
|
|
146
|
+
patterns = ign.effective_patterns(repo_root)
|
|
147
|
+
cache: dict[str, bool] = {}
|
|
148
|
+
original = ign.is_ignored
|
|
149
|
+
|
|
150
|
+
def memoised(path: str, patterns_arg: object = patterns) -> bool:
|
|
151
|
+
if patterns_arg is patterns:
|
|
152
|
+
cached = cache.get(path)
|
|
153
|
+
if cached is None:
|
|
154
|
+
cached = original(path, patterns)
|
|
155
|
+
cache[path] = cached
|
|
156
|
+
return cached
|
|
157
|
+
return original(path, patterns_arg) # type: ignore[arg-type]
|
|
158
|
+
|
|
159
|
+
ign.is_ignored = memoised # type: ignore[assignment]
|
|
160
|
+
try:
|
|
161
|
+
yield
|
|
162
|
+
finally:
|
|
163
|
+
ign.is_ignored = original
|
|
164
|
+
|
|
165
|
+
|
|
120
166
|
_F = TypeVar("_F", bound=Callable[..., Any])
|
|
121
167
|
|
|
122
168
|
|
|
@@ -419,14 +465,57 @@ def diff(
|
|
|
419
465
|
|
|
420
466
|
cache = _open_cache(repo_root, no_cache)
|
|
421
467
|
try:
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
468
|
+
# One git log walk feeds every changed file's scoring. Without this
|
|
469
|
+
# batched load, diff against an old base on a large repo runs N
|
|
470
|
+
# `git log --follow` calls (one per changed file): on django at 1,927
|
|
471
|
+
# changed files the legacy path measured 6+ minutes, with the
|
|
472
|
+
# 12+ minute variant timing out outright. ``load_diff_facts`` parses
|
|
473
|
+
# one un-pathed walk into a path -> [Commit] map; per-file scoring
|
|
474
|
+
# then does dict lookups instead of re-shelling-out.
|
|
475
|
+
try:
|
|
476
|
+
diff_facts = gf.load_diff_facts(repo_root, cache=cache)
|
|
477
|
+
except gf.GitError as exc:
|
|
478
|
+
err.print(f"[red]error:[/red] {exc}")
|
|
479
|
+
raise typer.Exit(2) from exc
|
|
480
|
+
# Pre-compute the ignore-pattern set ONCE and a verdict-per-path
|
|
481
|
+
# memo. ``signals.detect_coupling`` (re-introduced in 0.4.1 as F10)
|
|
482
|
+
# filters every coupling candidate through ``ign.is_ignored`` —
|
|
483
|
+
# without memoisation that's 83 patterns x 700 candidates x 1,927
|
|
484
|
+
# files = ~100 CPU-seconds across the diff. The memo cache turns
|
|
485
|
+
# each path's verdict into a dict lookup after the first hit.
|
|
486
|
+
with _memoised_is_ignored(repo_root):
|
|
487
|
+
# First pass: every changed file is scored without the
|
|
488
|
+
# ghost-keeper detector, which would otherwise fire ``git
|
|
489
|
+
# blame`` per file. With 1,927 changed files on django this
|
|
490
|
+
# single deferral saves ~5 minutes. We then sort and
|
|
491
|
+
# re-evaluate only the top-N with full signals — at most
|
|
492
|
+
# ``top`` blame calls instead of ``len(files)``.
|
|
493
|
+
prelim: list[rc.RiskCard] = []
|
|
494
|
+
for f in files:
|
|
495
|
+
try:
|
|
496
|
+
prelim.append(
|
|
497
|
+
rc.build_from_diff_facts(diff_facts, f, skip_ghost_keeper=True)
|
|
498
|
+
)
|
|
499
|
+
except gf.GitError:
|
|
500
|
+
continue
|
|
501
|
+
# Stable tie-break (from 0.4.2): lex smallest path on identical
|
|
502
|
+
# scores so cache and --no-cache truncate the same files at --top N.
|
|
503
|
+
prelim.sort(key=lambda c: (-c.score.value, c.path))
|
|
504
|
+
# Second pass: re-score the top-N with the full detector ladder
|
|
505
|
+
# so the rendered table includes ghost-keeper findings where
|
|
506
|
+
# they apply. Files outside the top-N keep their first-pass
|
|
507
|
+
# score; they were not going to appear in the user's view
|
|
508
|
+
# anyway.
|
|
509
|
+
refined_top: list[rc.RiskCard] = []
|
|
510
|
+
for prelim_card in prelim[:top]:
|
|
511
|
+
try:
|
|
512
|
+
refined_top.append(
|
|
513
|
+
rc.build_from_diff_facts(diff_facts, prelim_card.path)
|
|
514
|
+
)
|
|
515
|
+
except gf.GitError:
|
|
516
|
+
refined_top.append(prelim_card)
|
|
517
|
+
cards = refined_top
|
|
518
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
430
519
|
finally:
|
|
431
520
|
if cache is not None:
|
|
432
521
|
cache.close()
|
|
@@ -565,16 +654,17 @@ def highlights(
|
|
|
565
654
|
|
|
566
655
|
inv_pairs = gf.extract_invariant_quotes(commits)
|
|
567
656
|
sha_to_commit = {c.sha: c for c in commits}
|
|
568
|
-
|
|
569
|
-
for sha, line in inv_pairs:
|
|
570
|
-
seen_lines.setdefault(line, sha)
|
|
657
|
+
deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
|
|
571
658
|
inv_records: list[tuple[str, str, gf.Commit]] = []
|
|
572
|
-
for
|
|
659
|
+
for sha, line in deduped:
|
|
573
660
|
commit = sha_to_commit.get(sha)
|
|
574
661
|
if commit is None:
|
|
575
662
|
continue
|
|
576
663
|
inv_records.append((line, sha, commit))
|
|
577
|
-
|
|
664
|
+
# Sort newest first; on identical timestamps fall back to lexicographically
|
|
665
|
+
# smallest sha so cache and --no-cache emit byte-identical output.
|
|
666
|
+
inv_records.sort(key=lambda t: t[1]) # secondary: sha asc
|
|
667
|
+
inv_records.sort(key=lambda t: t[2].authored_at, reverse=True) # primary
|
|
578
668
|
inv_records = inv_records[:invariants]
|
|
579
669
|
|
|
580
670
|
incident_records = gf.find_incidents(commits)[:incidents]
|
|
@@ -827,7 +917,10 @@ def scan(
|
|
|
827
917
|
if cache is not None:
|
|
828
918
|
cache.close()
|
|
829
919
|
|
|
830
|
-
|
|
920
|
+
# Stable tie-break on identical scores: lexicographically smallest path
|
|
921
|
+
# so cache and --no-cache produce byte-identical text output for the
|
|
922
|
+
# same HEAD. Without this, the truncation at --top N is non-deterministic.
|
|
923
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
831
924
|
top_cards = cards[:top]
|
|
832
925
|
if not top_cards:
|
|
833
926
|
# Be honest about what "no flagged files" actually means. A user who
|
|
@@ -949,7 +1042,8 @@ def show(
|
|
|
949
1042
|
cards.append(rc.build(repo_root, change.path))
|
|
950
1043
|
except gf.GitError:
|
|
951
1044
|
continue
|
|
952
|
-
|
|
1045
|
+
# Stable tie-break on identical scores: lex smallest path.
|
|
1046
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
953
1047
|
|
|
954
1048
|
if json_out:
|
|
955
1049
|
console.print_json(
|
|
@@ -1065,13 +1159,18 @@ def tour(
|
|
|
1065
1159
|
|
|
1066
1160
|
inv_pairs = gf.extract_invariant_quotes(commits)
|
|
1067
1161
|
sha_to_commit = {c.sha: c for c in commits}
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1162
|
+
deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
|
|
1163
|
+
# Sort newest first with sha-asc tie-break so cache and --no-cache
|
|
1164
|
+
# surface the same three lines in the same order.
|
|
1165
|
+
deduped_sorted = sorted(
|
|
1166
|
+
(p for p in deduped if p[0] in sha_to_commit),
|
|
1167
|
+
key=lambda p: p[0],
|
|
1168
|
+
)
|
|
1169
|
+
deduped_sorted.sort(
|
|
1170
|
+
key=lambda p: sha_to_commit[p[0]].authored_at, reverse=True
|
|
1171
|
+
)
|
|
1071
1172
|
invariants_top = [
|
|
1072
|
-
(line, sha_to_commit[sha])
|
|
1073
|
-
for line, sha in seen_lines.items()
|
|
1074
|
-
if sha in sha_to_commit
|
|
1173
|
+
(line, sha_to_commit[sha]) for sha, line in deduped_sorted
|
|
1075
1174
|
][:3]
|
|
1076
1175
|
incidents_top = gf.find_incidents(commits)[:3]
|
|
1077
1176
|
|
|
@@ -1135,7 +1234,8 @@ def tour(
|
|
|
1135
1234
|
]
|
|
1136
1235
|
if useful:
|
|
1137
1236
|
cards.append(card)
|
|
1138
|
-
|
|
1237
|
+
# Stable tie-break: lex smallest path on identical scores.
|
|
1238
|
+
cards.sort(key=lambda c: (-c.score.value, c.path))
|
|
1139
1239
|
|
|
1140
1240
|
if cards:
|
|
1141
1241
|
console.print("[bold red]Top 3 risky files[/bold red]")
|
|
@@ -735,6 +735,242 @@ def _populate_diffstat_cache(
|
|
|
735
735
|
cache.upsert_commit_files(rows)
|
|
736
736
|
|
|
737
737
|
|
|
738
|
+
# ---- batch loading for whycode diff ---------------------------------------
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
@dataclass(frozen=True)
|
|
742
|
+
class DiffFacts:
|
|
743
|
+
"""A whole-repo snapshot built once for a single ``whycode diff`` evaluation.
|
|
744
|
+
|
|
745
|
+
The diff command scores N changed files; previously each file fired its
|
|
746
|
+
own ``git log --follow`` plus a co-change diffstat pass, so wall-clock
|
|
747
|
+
cost scaled with N. ``DiffFacts`` replaces N path-restricted log walks
|
|
748
|
+
with a single un-pathed walk: one ``git log --no-merges --numstat`` over
|
|
749
|
+
the repo, parsed once into ``commits_by_path`` (every commit that named
|
|
750
|
+
each path) and ``co_change_index`` (each commit's full file-set, used
|
|
751
|
+
for in-memory coupling counts). Per-file scoring then reads from this
|
|
752
|
+
map rather than re-shelling-out.
|
|
753
|
+
|
|
754
|
+
The map deliberately does NOT follow renames: the diff command only
|
|
755
|
+
scores files present in HEAD's working tree, so the tradeoff is "lose
|
|
756
|
+
rename-resolved history pre-rename" against "scoring 1,927 files in
|
|
757
|
+
seconds rather than minutes". Coupling against pre-rename names still
|
|
758
|
+
surfaces under those names in the map; the surface diff in practice is
|
|
759
|
+
a stable-tie-break difference, not a structural one.
|
|
760
|
+
"""
|
|
761
|
+
|
|
762
|
+
repo_root: Path
|
|
763
|
+
commits_by_path: dict[str, list[Commit]]
|
|
764
|
+
"""``path -> [Commit]``, newest-first, capped per path during load.
|
|
765
|
+
|
|
766
|
+
A missing key — i.e. ``commits_by_path.get(path)`` returns ``None`` —
|
|
767
|
+
means the loader walk did not see this path. ``gather_for_diff`` treats
|
|
768
|
+
that the same as an empty list: a path that the un-pathed walk did not
|
|
769
|
+
touch has no history to score from.
|
|
770
|
+
"""
|
|
771
|
+
|
|
772
|
+
co_change_index: dict[str, tuple[str, ...]]
|
|
773
|
+
"""``commit_sha -> tuple of paths touched by that commit``.
|
|
774
|
+
|
|
775
|
+
Snapshot of the same numstat parse used to build ``commits_by_path``.
|
|
776
|
+
Per-file ``co_changes`` reads this for in-memory coupling counts so
|
|
777
|
+
the diff pipeline never re-issues ``git log --no-walk`` per file.
|
|
778
|
+
"""
|
|
779
|
+
|
|
780
|
+
cache: CacheStore | None = None
|
|
781
|
+
"""Optional cache, threaded through so signal detectors (specifically
|
|
782
|
+
``detect_ghost_keeper``) reuse it for ``git blame`` line ownership."""
|
|
783
|
+
|
|
784
|
+
|
|
785
|
+
_NUMSTAT_LINE_RE = re.compile(r"^(\d+|-)\t(\d+|-)\t(.+)$")
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def load_diff_facts(
|
|
789
|
+
repo_root: Path,
|
|
790
|
+
*,
|
|
791
|
+
max_commits: int | None = None,
|
|
792
|
+
cache: CacheStore | None = None,
|
|
793
|
+
) -> DiffFacts:
|
|
794
|
+
"""Build a :class:`DiffFacts` snapshot from one ``git log`` invocation.
|
|
795
|
+
|
|
796
|
+
Strategy:
|
|
797
|
+
1. Walk HEAD with ``git log --no-merges --numstat --pretty=...`` once.
|
|
798
|
+
2. Parse each commit + its full file-set into a single in-memory map.
|
|
799
|
+
3. Return the snapshot for the diff command's per-file scorer to drive.
|
|
800
|
+
|
|
801
|
+
With a ``cache`` supplied, the walked commits are persisted to
|
|
802
|
+
``commits``; per-file diffstat presence rows are persisted to
|
|
803
|
+
``commit_files`` so a subsequent ``why`` / ``scan`` / ``diff`` invocation
|
|
804
|
+
on the same HEAD reuses what we just paid for.
|
|
805
|
+
|
|
806
|
+
The walk is intentionally un-pathed: the diff command scores files
|
|
807
|
+
that appear in ``git diff --name-only base...HEAD``, all of which exist
|
|
808
|
+
at HEAD by definition. A single un-pathed walk that captures every
|
|
809
|
+
commit's diffstat is strictly cheaper than N path-restricted walks
|
|
810
|
+
that each re-walk the full graph. ``max_commits`` is applied per-path
|
|
811
|
+
*after* the walk so callers can cap per-file depth without changing
|
|
812
|
+
the cost of the walk itself.
|
|
813
|
+
"""
|
|
814
|
+
# Pretty format: RECORD_SEP starts each commit; metadata fields are
|
|
815
|
+
# UNIT_SEP-delimited; the body is the last metadata field. Numstat
|
|
816
|
+
# output git appends after the body needs no further separator —
|
|
817
|
+
# the next commit's leading RECORD_SEP marks the boundary.
|
|
818
|
+
pretty_format = (
|
|
819
|
+
f"{RECORD_SEP}%H{UNIT_SEP}%an{UNIT_SEP}%ae{UNIT_SEP}"
|
|
820
|
+
f"%aI{UNIT_SEP}%s{UNIT_SEP}%b"
|
|
821
|
+
)
|
|
822
|
+
raw = _run_git(
|
|
823
|
+
repo_root,
|
|
824
|
+
"log",
|
|
825
|
+
"--no-merges",
|
|
826
|
+
"--numstat",
|
|
827
|
+
f"--pretty=format:{pretty_format}",
|
|
828
|
+
)
|
|
829
|
+
all_commits, commits_by_path, co_change_index = _parse_log_with_files(raw)
|
|
830
|
+
if max_commits is not None:
|
|
831
|
+
commits_by_path = {p: cs[:max_commits] for p, cs in commits_by_path.items()}
|
|
832
|
+
if cache is not None and all_commits:
|
|
833
|
+
_store_commits(cache, all_commits)
|
|
834
|
+
# Persist diffstat presence rows so a subsequent `why` against the
|
|
835
|
+
# same HEAD does not re-shell-out per file. Insertion/deletion
|
|
836
|
+
# widths are not captured by this walk (the diff command's
|
|
837
|
+
# detectors only depend on the *path set* of each commit), so they
|
|
838
|
+
# are stored as zero — see the paragraph in ``DiffFacts``.
|
|
839
|
+
files_rows: list[tuple[str, str, int, int]] = []
|
|
840
|
+
for sha, paths in co_change_index.items():
|
|
841
|
+
for p in paths:
|
|
842
|
+
files_rows.append((sha, p, 0, 0))
|
|
843
|
+
if files_rows:
|
|
844
|
+
cache.upsert_commit_files(files_rows)
|
|
845
|
+
try:
|
|
846
|
+
head_sha = _run_git(repo_root, "rev-parse", "HEAD").strip()
|
|
847
|
+
except GitError:
|
|
848
|
+
head_sha = ""
|
|
849
|
+
if head_sha and not cache.head_sha:
|
|
850
|
+
cache.set_head_sha(head_sha)
|
|
851
|
+
return DiffFacts(
|
|
852
|
+
repo_root=repo_root,
|
|
853
|
+
commits_by_path=commits_by_path,
|
|
854
|
+
co_change_index=co_change_index,
|
|
855
|
+
cache=cache,
|
|
856
|
+
)
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def _parse_log_with_files(
|
|
860
|
+
raw: str,
|
|
861
|
+
) -> tuple[list[Commit], dict[str, list[Commit]], dict[str, tuple[str, ...]]]:
|
|
862
|
+
"""Parse ``git log --no-merges --numstat --pretty=<sep><commit>`` output.
|
|
863
|
+
|
|
864
|
+
Returns ``(all_commits, commits_by_path, co_change_index)``:
|
|
865
|
+
- ``all_commits`` is every parsed commit, newest first.
|
|
866
|
+
- ``commits_by_path[path]`` is the subset whose numstat block named
|
|
867
|
+
``path``, preserving the newest-first order of the walk.
|
|
868
|
+
- ``co_change_index[sha]`` is the full path tuple from the same
|
|
869
|
+
numstat block, used by the diff command's in-memory coupling.
|
|
870
|
+
|
|
871
|
+
Within one record the format is
|
|
872
|
+
``<sha>\\x1f<an>\\x1f<ae>\\x1f<aI>\\x1f<subject>\\x1f<body...>``
|
|
873
|
+
followed by zero or more numstat lines (``ins\\tdel\\tpath``). The body
|
|
874
|
+
is free-form prose; numstat is tab-delimited 3-column. We walk lines
|
|
875
|
+
forward, holding the first line as the metadata + start-of-body, and
|
|
876
|
+
accumulate further lines as either body (free-form) or numstat
|
|
877
|
+
(matches :data:`_NUMSTAT_LINE_RE`). Once a numstat line fires, the
|
|
878
|
+
remaining lines for that record are taken to be more numstat lines.
|
|
879
|
+
"""
|
|
880
|
+
all_commits: list[Commit] = []
|
|
881
|
+
commits_by_path: dict[str, list[Commit]] = {}
|
|
882
|
+
co_change_index: dict[str, tuple[str, ...]] = {}
|
|
883
|
+
for record in raw.split(RECORD_SEP):
|
|
884
|
+
record = record.strip("\n")
|
|
885
|
+
if not record:
|
|
886
|
+
continue
|
|
887
|
+
lines = record.split("\n")
|
|
888
|
+
# The first line carries every metadata field plus the first body
|
|
889
|
+
# line (the body itself was emitted verbatim by ``%b``).
|
|
890
|
+
head_parts = lines[0].split(UNIT_SEP)
|
|
891
|
+
if len(head_parts) < 6:
|
|
892
|
+
continue
|
|
893
|
+
sha = head_parts[0].strip()
|
|
894
|
+
if not sha:
|
|
895
|
+
continue
|
|
896
|
+
author_name = head_parts[1]
|
|
897
|
+
author_email = head_parts[2]
|
|
898
|
+
authored_at = head_parts[3]
|
|
899
|
+
subject = head_parts[4]
|
|
900
|
+
first_body = UNIT_SEP.join(head_parts[5:])
|
|
901
|
+
body_lines: list[str] = [first_body] if first_body else []
|
|
902
|
+
files: list[str] = []
|
|
903
|
+
in_numstat = False
|
|
904
|
+
for line in lines[1:]:
|
|
905
|
+
m = _NUMSTAT_LINE_RE.match(line)
|
|
906
|
+
if in_numstat:
|
|
907
|
+
if m is not None:
|
|
908
|
+
files.append(m.group(3))
|
|
909
|
+
continue
|
|
910
|
+
if m is not None:
|
|
911
|
+
in_numstat = True
|
|
912
|
+
files.append(m.group(3))
|
|
913
|
+
continue
|
|
914
|
+
body_lines.append(line)
|
|
915
|
+
try:
|
|
916
|
+
authored = _parse_iso(authored_at)
|
|
917
|
+
except ValueError:
|
|
918
|
+
# Bad timestamps from a single 15-year-old commit shouldn't kill
|
|
919
|
+
# the diff command. F1 (full timezone-tolerant parser) is owned
|
|
920
|
+
# by another branch; we degrade locally rather than crash.
|
|
921
|
+
continue
|
|
922
|
+
body = "\n".join(body_lines).strip("\n")
|
|
923
|
+
commit = Commit(
|
|
924
|
+
sha=sha,
|
|
925
|
+
author_name=author_name,
|
|
926
|
+
author_email=author_email,
|
|
927
|
+
authored_at=authored,
|
|
928
|
+
subject=subject,
|
|
929
|
+
body=body,
|
|
930
|
+
files=tuple(files),
|
|
931
|
+
)
|
|
932
|
+
all_commits.append(commit)
|
|
933
|
+
co_change_index[sha] = commit.files
|
|
934
|
+
for path in files:
|
|
935
|
+
commits_by_path.setdefault(path, []).append(commit)
|
|
936
|
+
return all_commits, commits_by_path, co_change_index
|
|
937
|
+
|
|
938
|
+
|
|
939
|
+
def gather_for_diff(
|
|
940
|
+
diff_facts: DiffFacts,
|
|
941
|
+
path: str,
|
|
942
|
+
*,
|
|
943
|
+
max_commits: int | None = None,
|
|
944
|
+
) -> RepoFacts:
|
|
945
|
+
"""Build a :class:`RepoFacts` for ``path`` using only the in-memory map.
|
|
946
|
+
|
|
947
|
+
The diff command calls this once per changed file, replacing the per-file
|
|
948
|
+
``gather()`` (and its embedded ``git log --follow`` + co-change shell-out)
|
|
949
|
+
with O(1) dict lookups. All higher-layer detectors run unchanged on the
|
|
950
|
+
returned ``RepoFacts``.
|
|
951
|
+
"""
|
|
952
|
+
commits = diff_facts.commits_by_path.get(path, [])
|
|
953
|
+
if max_commits is not None:
|
|
954
|
+
commits = commits[:max_commits]
|
|
955
|
+
co_changed: Counter[str] = Counter()
|
|
956
|
+
for commit in commits:
|
|
957
|
+
touched = diff_facts.co_change_index.get(commit.sha, ())
|
|
958
|
+
for other in touched:
|
|
959
|
+
if other == path:
|
|
960
|
+
continue
|
|
961
|
+
co_changed[other] += 1
|
|
962
|
+
return RepoFacts(
|
|
963
|
+
repo_root=diff_facts.repo_root,
|
|
964
|
+
path=path,
|
|
965
|
+
commits=commits,
|
|
966
|
+
co_changed_files=co_changed,
|
|
967
|
+
revert_pairs=find_revert_pairs(commits),
|
|
968
|
+
incident_commits=find_incidents(commits),
|
|
969
|
+
invariant_quotes=extract_invariant_quotes(commits),
|
|
970
|
+
cache=diff_facts.cache,
|
|
971
|
+
)
|
|
972
|
+
|
|
973
|
+
|
|
738
974
|
_REVERT_PREFIX = 'this reverts commit '
|
|
739
975
|
|
|
740
976
|
|
|
@@ -967,6 +1203,47 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
|
|
|
967
1203
|
return out
|
|
968
1204
|
|
|
969
1205
|
|
|
1206
|
+
def dedupe_invariant_lines(
|
|
1207
|
+
pairs: Sequence[tuple[str, str]],
|
|
1208
|
+
sha_to_commit: dict[str, Commit],
|
|
1209
|
+
) -> list[tuple[str, str]]:
|
|
1210
|
+
"""Collapse identical invariant lines to one canonical (sha, line) pair.
|
|
1211
|
+
|
|
1212
|
+
When two commits state the same invariant line — typically a cherry-pick
|
|
1213
|
+
onto a maintenance branch, or a rebase that duplicated the message — we
|
|
1214
|
+
must pick exactly one to surface. Without a deterministic rule the cache
|
|
1215
|
+
and ``--no-cache`` paths can disagree (their walk orders differ when
|
|
1216
|
+
timestamps tie), and downstream JSON consumers see flaky output across
|
|
1217
|
+
runs.
|
|
1218
|
+
|
|
1219
|
+
The rule:
|
|
1220
|
+
|
|
1221
|
+
1. Earliest ``authored_at`` wins. The original statement is canonical;
|
|
1222
|
+
cherry-picks and rebases are derivatives.
|
|
1223
|
+
2. Lexicographically smallest ``sha`` breaks ties on identical timestamps.
|
|
1224
|
+
|
|
1225
|
+
The returned list preserves first-encounter order of the (now-unique)
|
|
1226
|
+
lines so downstream code that sorts by date sees a stable input.
|
|
1227
|
+
Pairs whose ``sha`` is not in ``sha_to_commit`` keep their first-seen
|
|
1228
|
+
record (no metadata to compare on).
|
|
1229
|
+
"""
|
|
1230
|
+
canonical: dict[str, str] = {}
|
|
1231
|
+
for sha, line in pairs:
|
|
1232
|
+
existing = canonical.get(line)
|
|
1233
|
+
if existing is None:
|
|
1234
|
+
canonical[line] = sha
|
|
1235
|
+
continue
|
|
1236
|
+
old_commit = sha_to_commit.get(existing)
|
|
1237
|
+
new_commit = sha_to_commit.get(sha)
|
|
1238
|
+
if old_commit is None or new_commit is None:
|
|
1239
|
+
continue
|
|
1240
|
+
old_key = (old_commit.authored_at, existing)
|
|
1241
|
+
new_key = (new_commit.authored_at, sha)
|
|
1242
|
+
if new_key < old_key:
|
|
1243
|
+
canonical[line] = sha
|
|
1244
|
+
return [(sha, line) for line, sha in canonical.items()]
|
|
1245
|
+
|
|
1246
|
+
|
|
970
1247
|
def author_last_activity(repo_root: Path, email: str) -> datetime | None:
|
|
971
1248
|
"""Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
|
|
972
1249
|
raw = _run_git(
|
|
@@ -101,7 +101,63 @@ def build(
|
|
|
101
101
|
the same repo (e.g. inside ``scan`` or ``diff``) share a warm cache.
|
|
102
102
|
"""
|
|
103
103
|
facts = gf.gather(repo_root, path, max_commits=max_commits, ref=ref, cache=cache)
|
|
104
|
-
|
|
104
|
+
return _from_facts(
|
|
105
|
+
path=path,
|
|
106
|
+
facts=facts,
|
|
107
|
+
repo_root=repo_root,
|
|
108
|
+
ref=ref,
|
|
109
|
+
apply_suppressions=apply_suppressions,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def build_from_diff_facts(
|
|
114
|
+
diff_facts: gf.DiffFacts,
|
|
115
|
+
path: str,
|
|
116
|
+
*,
|
|
117
|
+
max_commits: int | None = None,
|
|
118
|
+
apply_suppressions: bool = True,
|
|
119
|
+
skip_ghost_keeper: bool = False,
|
|
120
|
+
) -> RiskCard:
|
|
121
|
+
"""Build a Risk Card from an in-memory :class:`DiffFacts` map.
|
|
122
|
+
|
|
123
|
+
The diff command pre-loads one ``DiffFacts`` for the whole evaluation
|
|
124
|
+
via :func:`whycode.git_facts.load_diff_facts`, then calls this helper
|
|
125
|
+
once per changed file. The card's signals, score, and ``most_recent_*``
|
|
126
|
+
fields all derive from the same in-memory map, so per-file cost is
|
|
127
|
+
O(1) rather than the per-file ``git log --follow`` it replaces.
|
|
128
|
+
|
|
129
|
+
With ``skip_ghost_keeper=True`` the per-file ``git blame`` call is
|
|
130
|
+
deferred — the diff command uses this for its first pass over every
|
|
131
|
+
changed file, then re-evaluates only the top-N with full signals.
|
|
132
|
+
Without this skip, scoring 1,927 files spends ~4-5 minutes inside
|
|
133
|
+
``git blame`` even though > 95% of those files never reach the table
|
|
134
|
+
the user sees.
|
|
135
|
+
"""
|
|
136
|
+
facts = gf.gather_for_diff(diff_facts, path, max_commits=max_commits)
|
|
137
|
+
return _from_facts(
|
|
138
|
+
path=path,
|
|
139
|
+
facts=facts,
|
|
140
|
+
repo_root=diff_facts.repo_root,
|
|
141
|
+
ref=None,
|
|
142
|
+
apply_suppressions=apply_suppressions,
|
|
143
|
+
skip_ghost_keeper=skip_ghost_keeper,
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _from_facts(
|
|
148
|
+
*,
|
|
149
|
+
path: str,
|
|
150
|
+
facts: gf.RepoFacts,
|
|
151
|
+
repo_root: Path,
|
|
152
|
+
ref: str | None,
|
|
153
|
+
apply_suppressions: bool,
|
|
154
|
+
skip_ghost_keeper: bool = False,
|
|
155
|
+
) -> RiskCard:
|
|
156
|
+
"""Common tail of :func:`build` and :func:`build_from_diff_facts`."""
|
|
157
|
+
if skip_ghost_keeper:
|
|
158
|
+
signals = _all_signals_without_ghost_keeper(facts)
|
|
159
|
+
else:
|
|
160
|
+
signals = sig.all_signals(facts)
|
|
105
161
|
if apply_suppressions:
|
|
106
162
|
suppressions = supp.load(repo_root)
|
|
107
163
|
signals = supp.filter_signals(signals, suppressions, path)
|
|
@@ -120,6 +176,40 @@ def build(
|
|
|
120
176
|
)
|
|
121
177
|
|
|
122
178
|
|
|
179
|
+
# Detectors whose evidence is already in :class:`RepoFacts` (no git blame, no
# follow-up shell-out). The ghost-keeper detector is the only one missing
# here — it calls ``git blame`` per-file, which is the diff command's
# remaining bottleneck after the log walk is shared.
# NOTE(review): tuple order feeds the pre-sort signal order in
# _all_signals_without_ghost_keeper; the final (-severity, kind) sort appears
# to make the order immaterial, but confirm each detector emits a distinct
# SignalKind before reordering.
_FAST_DETECTORS = (
    sig.detect_revert_chain,
    sig.detect_incident_history,
    sig.detect_invariant_quotes,
    sig.detect_coupling,
    sig.detect_high_churn,
    sig.detect_silence,
    sig.detect_newborn,
)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _all_signals_without_ghost_keeper(facts: gf.RepoFacts) -> list[sig.Signal]:
    """Run the fast-detector ladder only — everything except ghost-keeper.

    Applies the same NEWBORN-suppression rule as the canonical
    ``signals.all_signals``: NEWBORN survives only when every other detector
    stayed silent, so a quiet file still collapses cleanly to a NEWBORN-only
    list, and any other firing drops NEWBORN exactly like the public helper.
    """
    fired = [
        found
        for found in (detect(facts) for detect in _FAST_DETECTORS)
        if found is not None
    ]
    # If anything besides NEWBORN fired, NEWBORN is suppressed.
    non_newborn = [s for s in fired if s.kind is not sig.SignalKind.NEWBORN]
    if non_newborn:
        fired = non_newborn
    return sorted(fired, key=lambda s: (-s.severity, s.kind.value))
|
|
211
|
+
|
|
212
|
+
|
|
123
213
|
# ----- rendering ------------------------------------------------------------
|
|
124
214
|
|
|
125
215
|
_BAND_STYLE: dict[Band, str] = {
|
|
@@ -247,4 +337,4 @@ def render_text(card: RiskCard) -> Group:
|
|
|
247
337
|
return Group(*pieces)
|
|
248
338
|
|
|
249
339
|
|
|
250
|
-
__all__ = ["RiskCard", "build", "render_text"]
|
|
340
|
+
__all__ = ["RiskCard", "build", "build_from_diff_facts", "render_text"]
|
|
@@ -306,6 +306,57 @@ def test_open_for_idempotent_open_close(tmp_path: Path) -> None:
|
|
|
306
306
|
store_b.close()
|
|
307
307
|
|
|
308
308
|
|
|
309
|
+
# ---- F7: in-memory cache for --no-cache amortisation ---------------------
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def test_open_in_memory_does_not_touch_disk(tmp_path: Path) -> None:
    """`:memory:` stores must never create the on-disk cache directory."""
    cache_dir = tmp_path / ch.CACHE_DIRNAME
    mem_store = ch.open_in_memory(tmp_path)
    try:
        # Populate the store; every write must stay in RAM.
        mem_store.upsert_commits([_commit(sha="a" * 40)])
        mem_store.upsert_commit_files([("a" * 40, "x.py", 1, 0)])
        mem_store.set_head_sha("deadbeef")
        assert not cache_dir.exists()
        # The in-memory rows round-trip through the read API.
        assert len(mem_store.fetch_all_commit_rows()) == 1
        assert mem_store.head_sha == "deadbeef"
    finally:
        mem_store.close()
    # Closing must not flush anything to disk either.
    assert not cache_dir.exists()
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def test_in_memory_cache_amortises_across_files(repo) -> None:  # type: ignore[no-untyped-def]
    """Diffstat rows cached by one file's walk are reused by the next file's.

    The persistent cache's big win over --no-cache was that a single batched
    ``git log --no-walk --numstat`` for file A left file B's overlapping
    shas already resident. The `:memory:` store must give --no-cache that
    same in-process amortisation.
    """
    repo.commit("init", {"a.txt": "1", "b.txt": "1"})
    repo.commit("touch a and b", {"a.txt": "2", "b.txt": "2"})
    repo.commit("touch only b", {"b.txt": "3"})
    with ch.open_in_memory(repo.root) as mem:
        # Walking a.txt first seeds diffstat rows for every sha it touched.
        commits_a = gf.commits_for_path(repo.root, "a.txt", cache=mem)
        gf.co_changes(repo.root, commits_a, "a.txt", cache=mem)
        shas_a = [c.sha for c in commits_a]
        assert mem.shas_missing_files(shas_a) == []
        # b.txt's walk then finds its shared shas already cached; only the
        # commit that touched b alone should still be missing.
        commits_b = gf.commits_for_path(repo.root, "b.txt", cache=mem)
        shas_b = [c.sha for c in commits_b]
        still_missing = set(mem.shas_missing_files(shas_b))
        assert still_missing == set(shas_b) - set(shas_a)
|
|
358
|
+
|
|
359
|
+
|
|
309
360
|
def test_fetch_co_changes_chunked_query_handles_many_shas(tmp_path: Path) -> None:
|
|
310
361
|
"""SQLite limits host parameters per statement; we chunk above 500."""
|
|
311
362
|
with ch.open_for(tmp_path) as store:
|
|
@@ -755,3 +755,116 @@ def test_repeat_scan_produces_identical_top_files(repo, days_ago) -> None: # ty
|
|
|
755
755
|
assert "refund.py" in cold
|
|
756
756
|
assert "refund.py" in warm_first
|
|
757
757
|
assert "refund.py" in warm_second
|
|
758
|
+
|
|
759
|
+
|
|
760
|
+
# ---- F4: highlights determinism across cache state ------------------------
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
def test_highlights_json_is_byte_identical_across_cache_state(
    repo, days_ago
) -> None:  # type: ignore[no-untyped-def]
    """A cherry-pick pair — two commits sharing body and timestamp but
    touching different files — must not change which SHA the dedup keeps
    when the same HEAD is read via the cache versus --no-cache.

    Absent a stable tie-breaker, the cache's authored_at-DESC walk and git
    log's own ordering can disagree on identical-timestamp commits, so a
    JSON consumer would see the SHA on a field flip between runs.
    """
    same_time = days_ago(30)
    repo.commit(
        "init",
        {"a.txt": "1", "b.txt": "1"},
        when=days_ago(60),
    )
    # Identical timestamps and bodies; only the SHAs and the touched-file
    # sets differ — the flask cherry-pick shape the field test surfaced.
    repo.commit(
        "use global contributing guide on master",
        {"a.txt": "2"},
        body="Do not duplicate the contributing guide between branches.",
        when=same_time,
    )
    repo.commit(
        "use global contributing guide on stable",
        {"b.txt": "2"},
        body="Do not duplicate the contributing guide between branches.",
        when=same_time,
    )
    cold_out = _invoke(repo.root, "highlights", "--no-cache", "--json").output
    warm_out = _invoke(repo.root, "highlights", "--json").output
    warm_repeat = _invoke(repo.root, "highlights", "--json").output
    assert cold_out == warm_out
    assert warm_out == warm_repeat
    # The dedup keeps exactly one of the two identical statements — the
    # other commit's invariant must not show up twice.
    assert len(json.loads(cold_out)["invariants"]) == 1
|
|
804
|
+
|
|
805
|
+
|
|
806
|
+
# ---- F5: scan determinism across cache state ------------------------------
|
|
807
|
+
|
|
808
|
+
|
|
809
|
+
def test_scan_text_is_byte_identical_across_cache_state(
    repo, days_ago
) -> None:  # type: ignore[no-untyped-def]
    """Equal-score files must not swap places in the --top N truncation
    between cache and --no-cache reads; the lexicographic path tie-break
    keeps cold and warm output byte-identical.
    """
    # Files that are always touched together share histories, signals, and
    # scores, so only the path tie-break decides their relative order.
    first_sha = repo.commit(
        "feature: introduce zeta and alpha",
        {"zeta.py": "1", "alpha.py": "1"},
        when=days_ago(50),
    )
    repo.revert(first_sha, when=days_ago(45))
    repo.commit(
        "hotfix: regression",
        {"zeta.py": "2", "alpha.py": "2"},
        body="incident #INC-1",
        when=days_ago(20),
    )
    cold_out = _invoke(repo.root, "scan", "--top", "10", "--no-cache").output
    warm_out = _invoke(repo.root, "scan", "--top", "10").output
    warm_repeat = _invoke(repo.root, "scan", "--top", "10").output
    assert cold_out == warm_out
    assert warm_out == warm_repeat
    # Despite equal scores, alpha.py must precede zeta.py lexicographically.
    pos_alpha = cold_out.find("alpha.py")
    pos_zeta = cold_out.find("zeta.py")
    assert pos_alpha != -1
    assert pos_zeta != -1
    assert pos_alpha < pos_zeta
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
# ---- F7: --no-cache uses an in-memory cache for amortisation -------------
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
def test_no_cache_scan_matches_warm_scan_byte_for_byte(
    repo, days_ago
) -> None:  # type: ignore[no-untyped-def]
    """Cache-correctness contract: --no-cache and the persistent cache
    must agree on the same HEAD. The ``:memory:`` store behind --no-cache
    shares the git-walk and dedup code with the on-disk store, so the
    rendered output has to be byte-identical.
    """
    feature_sha = repo.commit("feature", {"a.py": "1", "b.py": "1"}, when=days_ago(50))
    repo.revert(feature_sha, when=days_ago(45))
    repo.commit(
        "hotfix: regression",
        {"a.py": "2", "b.py": "2"},
        body="incident #INC-1",
        when=days_ago(10),
    )
    # Run the warm path first so the on-disk cache exists before comparing.
    warm_out = _invoke(repo.root, "scan", "--top", "5").output
    no_cache_out = _invoke(repo.root, "scan", "--top", "5", "--no-cache").output
    warm_repeat = _invoke(repo.root, "scan", "--top", "5").output
    assert warm_out == no_cache_out
    assert warm_out == warm_repeat
|
|
@@ -392,3 +392,92 @@ def test_parse_log_records_irrecoverable_falls_back_to_epoch() -> None:
|
|
|
392
392
|
assert len(commits) == 1
|
|
393
393
|
# Still a tz-aware datetime so callers can compare it.
|
|
394
394
|
assert commits[0].authored_at.tzinfo is not None
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
# ---- DiffFacts batch loader (perf/diff-batched) ----------------------------
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def test_load_diff_facts_indexes_commits_by_path(repo) -> None:  # type: ignore[no-untyped-def]
    """Each path maps to its own commit list, ordered newest-first."""
    repo.commit("init", {"a.py": "1", "b.py": "1"})
    repo.commit("update a", {"a.py": "2"})
    repo.commit("update b", {"b.py": "2"})
    repo.commit("update both", {"a.py": "3", "b.py": "3"})

    loaded = gf.load_diff_facts(repo.root)

    subjects_a = [c.subject for c in loaded.commits_by_path["a.py"]]
    subjects_b = [c.subject for c in loaded.commits_by_path["b.py"]]
    assert subjects_a == ["update both", "update a", "init"]
    assert subjects_b == ["update both", "update b", "init"]
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def test_load_diff_facts_co_change_index_lists_full_file_set(repo) -> None:  # type: ignore[no-untyped-def]
    """The per-sha index records exactly the files that commit touched."""
    repo.commit("init", {"a.py": "1", "b.py": "1", "c.py": "1"})
    pair_sha = repo.commit("change a and b", {"a.py": "2", "b.py": "2"})

    loaded = gf.load_diff_facts(repo.root)

    assert set(loaded.co_change_index[pair_sha]) == {"a.py", "b.py"}
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def test_gather_for_diff_returns_repo_facts_from_in_memory_map(repo, days_ago) -> None:  # type: ignore[no-untyped-def]
    """gather_for_diff derives full per-path facts (reverts, incidents)."""
    feature_sha = repo.commit("feature: A", {"a.py": "1"}, when=days_ago(40))
    repo.revert(feature_sha, when=days_ago(35))
    repo.commit(
        "hotfix: regression in a",
        {"a.py": "2"},
        body="incident #42",
        when=days_ago(5),
    )

    loaded = gf.load_diff_facts(repo.root)
    per_file = gf.gather_for_diff(loaded, "a.py")

    assert len(per_file.commits) == 3
    assert len(per_file.revert_pairs) == 1
    assert any("hotfix" in c.subject for c in per_file.incident_commits)
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def test_gather_for_diff_co_changes_match_per_file_pipeline(repo) -> None:  # type: ignore[no-untyped-def]
    """The batched loader and the legacy per-file walk agree on co-changes."""
    repo.commit("init", {"a.py": "1", "b.py": "1", "c.py": "1"})
    repo.commit("change a and b", {"a.py": "2", "b.py": "2"})
    repo.commit("change a and b again", {"a.py": "3", "b.py": "3"})
    repo.commit("change a alone", {"a.py": "4"})

    loaded = gf.load_diff_facts(repo.root)
    from_batch = gf.gather_for_diff(loaded, "a.py")
    from_walk = gf.gather(repo.root, "a.py")

    assert dict(from_batch.co_changed_files) == dict(from_walk.co_changed_files)
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def test_gather_for_diff_for_unseen_path_returns_empty_facts(repo) -> None:  # type: ignore[no-untyped-def]
    """A path absent from the map yields empty facts, not a crash."""
    repo.commit("init", {"a.py": "1"})

    loaded = gf.load_diff_facts(repo.root)
    per_file = gf.gather_for_diff(loaded, "never-touched.py")

    assert per_file.commits == []
    assert dict(per_file.co_changed_files) == {}
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def test_load_diff_facts_handles_multiline_body_then_numstat(repo) -> None:  # type: ignore[no-untyped-def]
    """Multi-paragraph bodies must not confuse the log/numstat parser."""
    tricky_body = "first paragraph\n\nsecond paragraph with: colons, commas, etc."
    repo.commit("subject line", {"x.txt": "1"}, body=tricky_body)

    loaded = gf.load_diff_facts(repo.root)
    [only] = loaded.commits_by_path["x.txt"]
    assert only.subject == "subject line"
    assert "second paragraph" in only.body
    assert only.files == ("x.txt",)
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def test_load_diff_facts_max_commits_caps_per_path(repo) -> None:  # type: ignore[no-untyped-def]
    """``max_commits`` bounds how many commits each path retains."""
    repo.commit("init", {"a.py": "1"})
    for n in range(5):
        repo.commit(f"tweak {n}", {"a.py": str(n + 2)})

    loaded = gf.load_diff_facts(repo.root, max_commits=2)
    assert len(loaded.commits_by_path["a.py"]) == 2
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|