PyPI - whycode-cli - Versions diffs - 0.4.1__tar.gz → 0.4.2__tar.gz - Mend

whycode-cli 0.4.1.tar.gz → 0.4.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: whycode-cli
-Version: 0.4.1
+Version: 0.4.2
 Summary: Tells you what to be afraid of before you touch a file.
 Author: Kevin
 License-Expression: MIT

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "whycode-cli"
-version = "0.4.1"
+version = "0.4.2"
 description = "Tells you what to be afraid of before you touch a file."
 readme = "README.md"
 license = "MIT"

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/src/whycode/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """WhyCode — tells you what to be afraid of before touching a file."""
-__version__ = "0.4.1"
+__version__ = "0.4.2"

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/src/whycode/cache.py RENAMED Viewed

@@ -112,10 +112,21 @@ class CacheStore:
     cache misses; this class never invokes ``git`` itself.
     """
-    def __init__(self, db_path: Path) -> None:
+    def __init__(self, db_path: Path, *, in_memory: bool = False) -> None:
+        """Open (creating if needed) the SQLite cache at ``db_path``.
+        ``in_memory=True`` opens a transient ``:memory:`` connection
+        instead — the disk file is never created and is never read.
+        Used by ``--no-cache`` to retain in-session amortisation
+        (matches the cold-fill code path) without persisting anything.
+        """
         self.db_path = db_path
-        self.db_path.parent.mkdir(parents=True, exist_ok=True)
-        self._conn = sqlite3.connect(self.db_path)
+        self._in_memory = in_memory
+        if in_memory:
+            self._conn = sqlite3.connect(":memory:")
+        else:
+            self.db_path.parent.mkdir(parents=True, exist_ok=True)
+            self._conn = sqlite3.connect(self.db_path)
         # row_factory makes column access readable in tests / debug.
         self._conn.row_factory = sqlite3.Row
         self._conn.execute("PRAGMA foreign_keys = ON")
@@ -402,13 +413,18 @@ class CacheStore:
         file_row_count = int(
             self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
         )
-        try:
-            size_bytes = self.db_path.stat().st_size
-        except OSError:
+        if self._in_memory:
             size_bytes = 0
+            exists = False
+        else:
+            try:
+                size_bytes = self.db_path.stat().st_size
+            except OSError:
+                size_bytes = 0
+            exists = self.db_path.exists()
         return CacheStats(
             path=self.db_path,
-            exists=self.db_path.exists(),
+            exists=exists,
             schema_version=self.schema_version,
             head_sha=self.head_sha,
             commit_count=commit_count,
@@ -430,6 +446,16 @@ def open_for(repo_root: Path) -> CacheStore:
     return CacheStore(cache_path_for(repo_root))
+def open_in_memory(repo_root: Path) -> CacheStore:
+    """Open a transient in-memory cache for ``repo_root``.
+    Used by ``--no-cache`` to keep within-session amortisation (the same
+    cold-fill code path everything else uses) while never touching disk.
+    The store is destroyed on ``close()`` and has no after-effects.
+    """
+    return CacheStore(cache_path_for(repo_root), in_memory=True)
 def parse_authored_at(value: str) -> datetime:
     """Parse the ``authored_at`` string we stored from git.

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/src/whycode/cli.py RENAMED Viewed

@@ -50,18 +50,27 @@ err = Console(stderr=True)
 def _open_cache(repo_root: Path, no_cache: bool) -> ch.CacheStore | None:
-    """Open the on-disk cache for ``repo_root`` unless suppressed.
-    A None return means "do not pass a cache through git_facts" — every
-    git-side helper falls back to its original network-free, cache-free
-    implementation. This is the escape hatch behind ``--no-cache`` and
-    is also the default when the cache cannot be initialised at all
-    (read-only filesystem, etc.); we never want a cache failure to
-    block the main read path.
+    """Open the cache for ``repo_root`` according to the no-cache flag.
+    Modes:
+      * ``no_cache=False`` (the default): persistent on-disk SQLite at
+        ``.whycode/cache.db``.
+      * ``no_cache=True``: a transient ``:memory:`` SQLite store. The
+        same git-walk code path runs as for the cold-fill, but the
+        database is destroyed on ``close()`` — nothing lands on disk
+        and the next run starts cold. Keeping per-run amortisation
+        (one ``git log`` walk shared across files) is what makes
+        ``--no-cache`` at most as slow as a cold persistent fill;
+        the previous ``cache=None`` short-circuit lost that and so
+        ``--no-cache`` re-issued per-file walks every iteration.
+    A ``None`` return means "do not pass a cache through git_facts".
+    Happens only when even an in-memory open fails — very rare and
+    we never want a cache problem to block the main read path.
     """
-    if no_cache:
-        return None
     try:
+        if no_cache:
+            return ch.open_in_memory(repo_root)
         return ch.open_for(repo_root)
     except OSError:
         return None
@@ -425,7 +434,9 @@ def diff(
                 cards.append(rc.build(repo_root, f, cache=cache))
             except gf.GitError:
                 continue
-        cards.sort(key=lambda c: -c.score.value)
+        # Stable tie-break: lex smallest path on identical scores so cache
+        # and --no-cache truncate the same files at --top N.
+        cards.sort(key=lambda c: (-c.score.value, c.path))
         cards = cards[:top]
     finally:
         if cache is not None:
@@ -565,16 +576,17 @@ def highlights(
     inv_pairs = gf.extract_invariant_quotes(commits)
     sha_to_commit = {c.sha: c for c in commits}
-    seen_lines: dict[str, str] = {}
-    for sha, line in inv_pairs:
-        seen_lines.setdefault(line, sha)
+    deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
     inv_records: list[tuple[str, str, gf.Commit]] = []
-    for line, sha in seen_lines.items():
+    for sha, line in deduped:
         commit = sha_to_commit.get(sha)
         if commit is None:
             continue
         inv_records.append((line, sha, commit))
-    inv_records.sort(key=lambda t: t[2].authored_at, reverse=True)
+    # Sort newest first; on identical timestamps fall back to lexicographically
+    # smallest sha so cache and --no-cache emit byte-identical output.
+    inv_records.sort(key=lambda t: t[1])  # secondary: sha asc
+    inv_records.sort(key=lambda t: t[2].authored_at, reverse=True)  # primary
     inv_records = inv_records[:invariants]
     incident_records = gf.find_incidents(commits)[:incidents]
@@ -827,7 +839,10 @@ def scan(
         if cache is not None:
             cache.close()
-    cards.sort(key=lambda c: -c.score.value)
+    # Stable tie-break on identical scores: lexicographically smallest path
+    # so cache and --no-cache produce byte-identical text output for the
+    # same HEAD. Without this, the truncation at --top N is non-deterministic.
+    cards.sort(key=lambda c: (-c.score.value, c.path))
     top_cards = cards[:top]
     if not top_cards:
         # Be honest about what "no flagged files" actually means. A user who
@@ -949,7 +964,8 @@ def show(
             cards.append(rc.build(repo_root, change.path))
         except gf.GitError:
             continue
-    cards.sort(key=lambda c: -c.score.value)
+    # Stable tie-break on identical scores: lex smallest path.
+    cards.sort(key=lambda c: (-c.score.value, c.path))
     if json_out:
         console.print_json(
@@ -1065,13 +1081,18 @@ def tour(
         inv_pairs = gf.extract_invariant_quotes(commits)
         sha_to_commit = {c.sha: c for c in commits}
-        seen_lines: dict[str, str] = {}
-        for sha, line in inv_pairs:
-            seen_lines.setdefault(line, sha)
+        deduped = gf.dedupe_invariant_lines(inv_pairs, sha_to_commit)
+        # Sort newest first with sha-asc tie-break so cache and --no-cache
+        # surface the same three lines in the same order.
+        deduped_sorted = sorted(
+            (p for p in deduped if p[0] in sha_to_commit),
+            key=lambda p: p[0],
+        )
+        deduped_sorted.sort(
+            key=lambda p: sha_to_commit[p[0]].authored_at, reverse=True
+        )
         invariants_top = [
-            (line, sha_to_commit[sha])
-            for line, sha in seen_lines.items()
-            if sha in sha_to_commit
+            (line, sha_to_commit[sha]) for sha, line in deduped_sorted
         ][:3]
         incidents_top = gf.find_incidents(commits)[:3]
@@ -1135,7 +1156,8 @@ def tour(
                     ]
                     if useful:
                         cards.append(card)
-            cards.sort(key=lambda c: -c.score.value)
+            # Stable tie-break: lex smallest path on identical scores.
+            cards.sort(key=lambda c: (-c.score.value, c.path))
         if cards:
             console.print("[bold red]Top 3 risky files[/bold red]")

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/src/whycode/git_facts.py RENAMED Viewed

@@ -967,6 +967,47 @@ def extract_invariant_quotes(commits: Sequence[Commit]) -> list[tuple[str, str]]
     return out
+def dedupe_invariant_lines(
+    pairs: Sequence[tuple[str, str]],
+    sha_to_commit: dict[str, Commit],
+) -> list[tuple[str, str]]:
+    """Collapse identical invariant lines to one canonical (sha, line) pair.
+    When two commits state the same invariant line — typically a cherry-pick
+    onto a maintenance branch, or a rebase that duplicated the message — we
+    must pick exactly one to surface. Without a deterministic rule the cache
+    and ``--no-cache`` paths can disagree (their walk orders differ when
+    timestamps tie), and downstream JSON consumers see flaky output across
+    runs.
+    The rule:
+    1. Earliest ``authored_at`` wins. The original statement is canonical;
+       cherry-picks and rebases are derivatives.
+    2. Lexicographically smallest ``sha`` breaks ties on identical timestamps.
+    The returned list preserves first-encounter order of the (now-unique)
+    lines so downstream code that sorts by date sees a stable input.
+    Pairs whose ``sha`` is not in ``sha_to_commit`` keep their first-seen
+    record (no metadata to compare on).
+    """
+    canonical: dict[str, str] = {}
+    for sha, line in pairs:
+        existing = canonical.get(line)
+        if existing is None:
+            canonical[line] = sha
+            continue
+        old_commit = sha_to_commit.get(existing)
+        new_commit = sha_to_commit.get(sha)
+        if old_commit is None or new_commit is None:
+            continue
+        old_key = (old_commit.authored_at, existing)
+        new_key = (new_commit.authored_at, sha)
+        if new_key < old_key:
+            canonical[line] = sha
+    return [(sha, line) for line, sha in canonical.items()]
 def author_last_activity(repo_root: Path, email: str) -> datetime | None:
     """Most recent commit timestamp by ``email`` anywhere in the repo, or None."""
     raw = _run_git(

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/src/whycode_cli.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: whycode-cli
-Version: 0.4.1
+Version: 0.4.2
 Summary: Tells you what to be afraid of before you touch a file.
 Author: Kevin
 License-Expression: MIT

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/tests/test_cache.py RENAMED Viewed

@@ -306,6 +306,57 @@ def test_open_for_idempotent_open_close(tmp_path: Path) -> None:
         store_b.close()
+# ---- F7: in-memory cache for --no-cache amortisation ---------------------
+def test_open_in_memory_does_not_touch_disk(tmp_path: Path) -> None:
+    """The `:memory:` mode must leave the filesystem completely untouched."""
+    store = ch.open_in_memory(tmp_path)
+    try:
+        # Write a few rows; nothing should land on disk.
+        store.upsert_commits([_commit(sha="a" * 40)])
+        store.upsert_commit_files([("a" * 40, "x.py", 1, 0)])
+        store.set_head_sha("deadbeef")
+        assert not (tmp_path / ch.CACHE_DIRNAME).exists()
+        # Reads still return what we wrote.
+        rows = store.fetch_all_commit_rows()
+        assert len(rows) == 1
+        assert store.head_sha == "deadbeef"
+    finally:
+        store.close()
+    # And after close there's still nothing on disk.
+    assert not (tmp_path / ch.CACHE_DIRNAME).exists()
+def test_in_memory_cache_amortises_across_files(repo) -> None:  # type: ignore[no-untyped-def]
+    """The in-memory store reuses cached diffstat rows across calls.
+    The cold (persistent) path's main perf advantage over a no-cache call
+    was that, after a single batched ``git log --no-walk --numstat`` for
+    file A, file B's overlapping shas were already in the cache. The
+    `:memory:` store must give --no-cache the same amortisation in-process.
+    """
+    repo.commit("init", {"a.txt": "1", "b.txt": "1"})
+    repo.commit("touch a and b", {"a.txt": "2", "b.txt": "2"})
+    repo.commit("touch only b", {"b.txt": "3"})
+    with ch.open_in_memory(repo.root) as store:
+        # First call on a.txt populates diffstat rows for both shared shas.
+        a_commits = gf.commits_for_path(repo.root, "a.txt", cache=store)
+        gf.co_changes(repo.root, a_commits, "a.txt", cache=store)
+        # All shas a.txt touched are now present.
+        a_shas = [c.sha for c in a_commits]
+        assert store.shas_missing_files(a_shas) == []
+        # When b.txt's call runs, the two shas it shares with a.txt are
+        # served from the cache; only the b-only sha is missing.
+        b_commits = gf.commits_for_path(repo.root, "b.txt", cache=store)
+        b_shas = [c.sha for c in b_commits]
+        missing_for_b = set(store.shas_missing_files(b_shas))
+        # Exactly the shas that b.txt touched but a.txt did not are missing.
+        a_set = set(a_shas)
+        expected_missing = {s for s in b_shas if s not in a_set}
+        assert missing_for_b == expected_missing
 def test_fetch_co_changes_chunked_query_handles_many_shas(tmp_path: Path) -> None:
     """SQLite limits host parameters per statement; we chunk above 500."""
     with ch.open_for(tmp_path) as store:

{whycode_cli-0.4.1 → whycode_cli-0.4.2}/tests/test_cli.py RENAMED Viewed

@@ -755,3 +755,116 @@ def test_repeat_scan_produces_identical_top_files(repo, days_ago) -> None:  # ty
     assert "refund.py" in cold
     assert "refund.py" in warm_first
     assert "refund.py" in warm_second
+# ---- F4: highlights determinism across cache state ------------------------
+def test_highlights_json_is_byte_identical_across_cache_state(
+    repo, days_ago
+) -> None:  # type: ignore[no-untyped-def]
+    """Two commits with identical bodies and timestamps (a cherry-pick on a
+    different branch) must not flip which SHA the dedup picks across cache
+    versus --no-cache reads of the same HEAD.
+    Without a stable tie-breaker, the cache's authored_at-DESC walk and git
+    log's walk can disagree on the order of identical-timestamp commits, and
+    the JSON consumer sees a different SHA on the same field across runs.
+    """
+    same_time = days_ago(30)
+    repo.commit(
+        "init",
+        {"a.txt": "1", "b.txt": "1"},
+        when=days_ago(60),
+    )
+    # Two commits, identical timestamps, identical bodies — only the SHAs
+    # and the touched-file set differ. Mirrors the flask cherry-pick pattern
+    # the field test surfaced.
+    repo.commit(
+        "use global contributing guide on master",
+        {"a.txt": "2"},
+        body="Do not duplicate the contributing guide between branches.",
+        when=same_time,
+    )
+    repo.commit(
+        "use global contributing guide on stable",
+        {"b.txt": "2"},
+        body="Do not duplicate the contributing guide between branches.",
+        when=same_time,
+    )
+    cold = _invoke(repo.root, "highlights", "--no-cache", "--json").output
+    warm = _invoke(repo.root, "highlights", "--json").output
+    second_warm = _invoke(repo.root, "highlights", "--json").output
+    assert cold == warm
+    assert warm == second_warm
+    payload = json.loads(cold)
+    # Exactly one invariant should survive the dedup; the other commit's
+    # statement is identical and must not appear twice.
+    assert len(payload["invariants"]) == 1
+# ---- F5: scan determinism across cache state ------------------------------
+def test_scan_text_is_byte_identical_across_cache_state(
+    repo, days_ago
+) -> None:  # type: ignore[no-untyped-def]
+    """Two files that earn the same score from the same signals must not
+    swap positions in the --top N truncation across cache versus --no-cache
+    reads. Stable tie-break on the lexicographically smallest path keeps
+    cold and warm output byte-identical.
+    """
+    # Two files always touched together → identical histories, identical
+    # signals, identical scores. The ordering between them is settled
+    # only by the path tie-break.
+    sha = repo.commit(
+        "feature: introduce zeta and alpha",
+        {"zeta.py": "1", "alpha.py": "1"},
+        when=days_ago(50),
+    )
+    repo.revert(sha, when=days_ago(45))
+    repo.commit(
+        "hotfix: regression",
+        {"zeta.py": "2", "alpha.py": "2"},
+        body="incident #INC-1",
+        when=days_ago(20),
+    )
+    cold = _invoke(repo.root, "scan", "--top", "10", "--no-cache").output
+    warm = _invoke(repo.root, "scan", "--top", "10").output
+    second_warm = _invoke(repo.root, "scan", "--top", "10").output
+    assert cold == warm
+    assert warm == second_warm
+    # Lexicographic tie-break: alpha.py is listed before zeta.py despite
+    # equal scores.
+    alpha_pos = cold.find("alpha.py")
+    zeta_pos = cold.find("zeta.py")
+    assert alpha_pos != -1
+    assert zeta_pos != -1
+    assert alpha_pos < zeta_pos
+# ---- F7: --no-cache uses an in-memory cache for amortisation -------------
+def test_no_cache_scan_matches_warm_scan_byte_for_byte(
+    repo, days_ago
+) -> None:  # type: ignore[no-untyped-def]
+    """Cache-correctness contract: ``--no-cache`` must agree with the
+    persistent cache on the same HEAD. The in-memory ``:memory:`` store
+    backing ``--no-cache`` shares the same git-walk and dedup code paths
+    as the on-disk store; output must be byte-identical.
+    """
+    sha = repo.commit("feature", {"a.py": "1", "b.py": "1"}, when=days_ago(50))
+    repo.revert(sha, when=days_ago(45))
+    repo.commit(
+        "hotfix: regression",
+        {"a.py": "2", "b.py": "2"},
+        body="incident #INC-1",
+        when=days_ago(10),
+    )
+    # Warm path first (writes the on-disk cache).
+    warm = _invoke(repo.root, "scan", "--top", "5").output
+    no_cache = _invoke(repo.root, "scan", "--top", "5", "--no-cache").output
+    warm_again = _invoke(repo.root, "scan", "--top", "5").output
+    assert warm == no_cache
+    assert warm == warm_again