whycode-cli 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/PKG-INFO +1 -1
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/pyproject.toml +1 -1
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/__init__.py +1 -1
- whycode_cli-0.4.0/src/whycode/cache.py +456 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/cli.py +304 -159
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/git_facts.py +307 -10
- whycode_cli-0.4.0/src/whycode/mcp_server.py +509 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/risk_card.py +6 -1
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/signals.py +1 -1
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode_cli.egg-info/PKG-INFO +1 -1
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode_cli.egg-info/SOURCES.txt +3 -0
- whycode_cli-0.4.0/tests/test_cache.py +339 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_cli.py +55 -0
- whycode_cli-0.4.0/tests/test_mcp_prompts.py +315 -0
- whycode_cli-0.3.0/src/whycode/mcp_server.py +0 -204
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/LICENSE +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/README.md +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/setup.cfg +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/__main__.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/decisions.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/ignore.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/llm.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/scorer.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/suppressions.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/templates/__init__.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/templates/github-workflow.yml +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode/templates/pre-commit +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode_cli.egg-info/dependency_links.txt +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode_cli.egg-info/entry_points.txt +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode_cli.egg-info/requires.txt +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/src/whycode_cli.egg-info/top_level.txt +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_decisions.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_git_facts.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_ignore.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_scorer.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_signals.py +0 -0
- {whycode_cli-0.3.0 → whycode_cli-0.4.0}/tests/test_suppressions.py +0 -0
|
@@ -0,0 +1,456 @@
|
|
|
1
|
+
"""Layer 1 cache: persist git-derived facts in a per-repo SQLite database.
|
|
2
|
+
|
|
3
|
+
Why a cache exists at all
|
|
4
|
+
-------------------------
|
|
5
|
+
``whycode scan``, ``highlights`` and friends drive the same handful of
|
|
6
|
+
``git log`` invocations on every run. On a 3000-commit repo each run
|
|
7
|
+
re-parses the same output; on a 50000-commit repo it scales to minutes.
|
|
8
|
+
The repo's history is append-only, so the obvious win is to remember
|
|
9
|
+
what we've already parsed and only ask git for what's new.
|
|
10
|
+
|
|
11
|
+
Design notes
|
|
12
|
+
------------
|
|
13
|
+
- One database per repository, at ``<repo>/.whycode/cache.db``.
|
|
14
|
+
- Schema is intentionally tiny and hand-editable. ``schema_version`` lives
|
|
15
|
+
in ``meta`` so we can grow it without breaking existing caches.
|
|
16
|
+
- Invalidation key is ``git rev-parse HEAD``. When unchanged, every read
|
|
17
|
+
is served from SQLite. When changed, we ask git only for commits in
|
|
18
|
+
``<last_head>..HEAD`` and append them; if ``last_head`` is unreachable
|
|
19
|
+
(force-push, branch switch) we fall back to a full rebuild.
|
|
20
|
+
- The cache is local-only: never uploaded, never queried by the network.
|
|
21
|
+
``.whycode/`` is gitignored by default, the same as every other on-disk
|
|
22
|
+
state this project keeps.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import contextlib
|
|
28
|
+
import sqlite3
|
|
29
|
+
from collections import Counter
|
|
30
|
+
from collections.abc import Iterable, Sequence
|
|
31
|
+
from contextlib import closing
|
|
32
|
+
from dataclasses import dataclass
|
|
33
|
+
from datetime import datetime
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
|
|
36
|
+
# Bumped whenever the on-disk layout changes; a mismatch makes
# CacheStore._initialise drop and rebuild the tables.
SCHEMA_VERSION = 1
# Per-repo state directory the cache database lives in (gitignored by default).
CACHE_DIRNAME = ".whycode"
# File name of the SQLite database inside CACHE_DIRNAME.
CACHE_FILENAME = "cache.db"


# Executed via Connection.executescript(); every CREATE uses IF NOT EXISTS so
# re-running the script against an existing database is a no-op.
_SCHEMA_SQL = """
CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS commits (
    sha TEXT PRIMARY KEY,
    author_name TEXT NOT NULL,
    author_email TEXT NOT NULL,
    authored_at TEXT NOT NULL,
    subject TEXT NOT NULL,
    body TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS commit_files (
    sha TEXT NOT NULL,
    path TEXT NOT NULL,
    insertions INTEGER NOT NULL,
    deletions INTEGER NOT NULL,
    PRIMARY KEY (sha, path)
);

-- Per-path rename-resolved sha lists, populated by the first git log --follow
-- call for that path. The (path, head_sha) pair is the validity key: if the
-- HEAD a row was resolved against differs from the current HEAD, the row is
-- ignored. Position preserves git's newest-first ordering.
CREATE TABLE IF NOT EXISTS path_log (
    path TEXT NOT NULL,
    head_sha TEXT NOT NULL,
    position INTEGER NOT NULL,
    sha TEXT NOT NULL,
    PRIMARY KEY (path, head_sha, position)
);

-- Per-path blame ownership: author_email -> line count, scoped to a HEAD.
-- Cached so the ghost-keeper detector does not re-shell-out to git blame
-- on every scan of the same repo.
CREATE TABLE IF NOT EXISTS line_ownership (
    path TEXT NOT NULL,
    head_sha TEXT NOT NULL,
    author_email TEXT NOT NULL,
    line_count INTEGER NOT NULL,
    PRIMARY KEY (path, head_sha, author_email)
);

CREATE INDEX IF NOT EXISTS idx_commit_files_path ON commit_files(path);
CREATE INDEX IF NOT EXISTS idx_commits_authored_at ON commits(authored_at);
CREATE INDEX IF NOT EXISTS idx_path_log_path_head ON path_log(path, head_sha);
"""
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass(frozen=True)
class CacheStats:
    """Summary returned by ``whycode cache stats`` and friends."""

    # Location of the cache database file on disk.
    path: Path
    # Whether the database file existed when the snapshot was taken.
    exists: bool
    # Schema version reported by the store; None if unknown.
    schema_version: int | None
    # Last HEAD sha recorded via set_head_sha(); None for a fresh cache.
    head_sha: str | None
    # Number of rows in the ``commits`` table.
    commit_count: int
    # Number of rows in the ``commit_files`` table.
    file_row_count: int
    # On-disk size of the database file in bytes (0 if stat() failed).
    size_bytes: int
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class CacheStore:
    """A thin wrapper over a per-repo SQLite database.

    The store is responsible only for read/write of cached commits and the
    head-pointer used to drive incremental updates. Higher layers
    (``git_facts``) decide *what* to cache and how to fall back when the
    cache misses; this class never invokes ``git`` itself.
    """

    def __init__(self, db_path: Path) -> None:
        self.db_path = db_path
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(self.db_path)
        # row_factory makes column access readable in tests / debug.
        self._conn.row_factory = sqlite3.Row
        self._conn.execute("PRAGMA foreign_keys = ON")
        self._initialise()

    # ---- lifecycle --------------------------------------------------------

    def close(self) -> None:
        self._conn.close()

    def __enter__(self) -> CacheStore:
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()

    # ---- schema -----------------------------------------------------------

    def _initialise(self) -> None:
        """Create the schema if absent; rebuild from scratch on a version bump."""
        with self._conn:
            self._conn.executescript(_SCHEMA_SQL)
        existing = self._get_meta("schema_version")
        if existing is None:
            self._set_meta("schema_version", str(SCHEMA_VERSION))
        elif int(existing) != SCHEMA_VERSION:
            # Future-proofing: when we bump the schema, drop tables and
            # rebuild rather than try to migrate. The cache is a derived
            # artefact — losing it is never destructive.
            #
            # Every table we own must be dropped here: previously
            # path_log and line_ownership were skipped, so a bump that
            # changed their shape left the stale definition in place
            # (CREATE TABLE IF NOT EXISTS keeps the old one) while their
            # validity sentinels vanished with the dropped meta table.
            self._conn.executescript(
                "DROP TABLE IF EXISTS commit_files;"
                "DROP TABLE IF EXISTS commits;"
                "DROP TABLE IF EXISTS path_log;"
                "DROP TABLE IF EXISTS line_ownership;"
                "DROP TABLE IF EXISTS meta;"
            )
            self._conn.executescript(_SCHEMA_SQL)
            self._set_meta("schema_version", str(SCHEMA_VERSION))

    # ---- meta -------------------------------------------------------------

    def _get_meta(self, key: str) -> str | None:
        cur = self._conn.execute("SELECT value FROM meta WHERE key = ?", (key,))
        row = cur.fetchone()
        return None if row is None else str(row["value"])

    def _set_meta(self, key: str, value: str) -> None:
        with self._conn:
            self._conn.execute(
                "INSERT INTO meta(key, value) VALUES(?, ?) "
                "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
                (key, value),
            )

    @property
    def head_sha(self) -> str | None:
        return self._get_meta("head_sha")

    def set_head_sha(self, sha: str) -> None:
        self._set_meta("head_sha", sha)

    @property
    def schema_version(self) -> int:
        v = self._get_meta("schema_version")
        return int(v) if v is not None else 0

    # ---- writes -----------------------------------------------------------

    def upsert_commits(self, rows: Iterable[tuple[str, str, str, str, str, str]]) -> None:
        """Bulk-upsert commit rows.

        Each row is ``(sha, author_name, author_email, authored_at, subject, body)``.
        ``authored_at`` is stored as the ISO-8601 string git produces — we never
        re-parse it on the way in, only on the way out.
        """
        with self._conn:
            self._conn.executemany(
                "INSERT INTO commits(sha, author_name, author_email, authored_at, subject, body) "
                "VALUES(?, ?, ?, ?, ?, ?) "
                "ON CONFLICT(sha) DO UPDATE SET "
                "author_name = excluded.author_name, "
                "author_email = excluded.author_email, "
                "authored_at = excluded.authored_at, "
                "subject = excluded.subject, "
                "body = excluded.body",
                rows,
            )

    def upsert_commit_files(self, rows: Iterable[tuple[str, str, int, int]]) -> None:
        """Bulk-upsert per-file diffstat rows.

        Each row is ``(sha, path, insertions, deletions)``. We treat the
        diffstat for a given commit as the source of truth: re-uploading
        for an existing sha wholly replaces the previous rows for that sha.
        """
        rows = list(rows)
        if not rows:
            return
        seen_shas = {sha for sha, _, _, _ in rows}
        with self._conn:
            # Delete-then-insert so the new diffstat wholly replaces the old.
            self._conn.executemany(
                "DELETE FROM commit_files WHERE sha = ?",
                [(s,) for s in seen_shas],
            )
            self._conn.executemany(
                "INSERT INTO commit_files(sha, path, insertions, deletions) "
                "VALUES(?, ?, ?, ?)",
                rows,
            )

    def has_commit_files(self, sha: str) -> bool:
        """True iff ``commit_files`` already has rows for ``sha``."""
        cur = self._conn.execute(
            "SELECT 1 FROM commit_files WHERE sha = ? LIMIT 1", (sha,)
        )
        return cur.fetchone() is not None

    def clear(self) -> None:
        """Wipe every row, keeping the schema in place."""
        with self._conn:
            self._conn.execute("DELETE FROM commit_files")
            self._conn.execute("DELETE FROM commits")
            self._conn.execute("DELETE FROM path_log")
            self._conn.execute("DELETE FROM line_ownership")
            # Keep schema_version so a cleared cache is still versioned.
            self._conn.execute("DELETE FROM meta WHERE key != 'schema_version'")

    # ---- path log (rename-resolved per-path SHA list) --------------------

    def fetch_path_log(self, path: str, head_sha: str) -> list[str] | None:
        """Return the cached rename-resolved sha list for ``path`` at ``head_sha``.

        Returns ``None`` if no row was stored at that HEAD; an empty list
        means we know there is no history (e.g. the path was never touched).
        """
        cur = self._conn.execute(
            "SELECT sha FROM path_log WHERE path = ? AND head_sha = ? "
            "ORDER BY position ASC",
            (path, head_sha),
        )
        rows = cur.fetchall()
        if not rows:
            # Distinguish "absent" from "empty": check the meta marker.
            sentinel = self._get_meta(f"path_log_known:{head_sha}:{path}")
            if sentinel == "1":
                return []
            return None
        return [str(r["sha"]) for r in rows]

    def store_path_log(self, path: str, head_sha: str, shas: Sequence[str]) -> None:
        """Persist the rename-resolved sha list for ``path`` at ``head_sha``."""
        with self._conn:
            self._conn.execute(
                "DELETE FROM path_log WHERE path = ? AND head_sha = ?",
                (path, head_sha),
            )
            self._conn.executemany(
                "INSERT INTO path_log(path, head_sha, position, sha) "
                "VALUES(?, ?, ?, ?)",
                [(path, head_sha, i, sha) for i, sha in enumerate(shas)],
            )
            # Mark the (path, head) as resolved even if shas is empty.
            self._set_meta(f"path_log_known:{head_sha}:{path}", "1")

    def fetch_line_ownership(
        self, path: str, head_sha: str
    ) -> dict[str, int] | None:
        """Return cached line-ownership counts for ``path`` at ``head_sha``.

        ``None`` means never resolved at that HEAD; ``{}`` means resolved
        with no owners (the sentinel meta row disambiguates the two).
        """
        cur = self._conn.execute(
            "SELECT author_email, line_count FROM line_ownership "
            "WHERE path = ? AND head_sha = ?",
            (path, head_sha),
        )
        rows = cur.fetchall()
        if not rows:
            sentinel = self._get_meta(f"blame_known:{head_sha}:{path}")
            if sentinel == "1":
                return {}
            return None
        return {str(r["author_email"]): int(r["line_count"]) for r in rows}

    def store_line_ownership(
        self, path: str, head_sha: str, counts: dict[str, int]
    ) -> None:
        """Persist line-ownership counts for ``path`` at ``head_sha``."""
        with self._conn:
            self._conn.execute(
                "DELETE FROM line_ownership WHERE path = ? AND head_sha = ?",
                (path, head_sha),
            )
            self._conn.executemany(
                "INSERT INTO line_ownership(path, head_sha, author_email, line_count) "
                "VALUES(?, ?, ?, ?)",
                [(path, head_sha, email, count) for email, count in counts.items()],
            )
            self._set_meta(f"blame_known:{head_sha}:{path}", "1")

    # ---- reads ------------------------------------------------------------

    def has_commit(self, sha: str) -> bool:
        cur = self._conn.execute("SELECT 1 FROM commits WHERE sha = ? LIMIT 1", (sha,))
        return cur.fetchone() is not None

    def fetch_all_commit_rows(self) -> list[sqlite3.Row]:
        """Return every cached commit row, newest first by authored_at."""
        # NOTE(review): authored_at is compared as TEXT; ISO-8601 strings
        # with mixed UTC offsets do not sort strictly chronologically.
        cur = self._conn.execute(
            "SELECT sha, author_name, author_email, authored_at, subject, body "
            "FROM commits ORDER BY authored_at DESC"
        )
        return cur.fetchall()

    def fetch_commits_for_path(self, path: str) -> list[sqlite3.Row]:
        """Return every cached commit that touches ``path`` (newest first).

        Note: the cache stores diffstat rows under the path that git emitted
        at the time. Rename history that ``git log --follow`` would expand is
        NOT pre-resolved here — callers that need rename-following must fall
        back to git when the cache returns nothing.
        """
        cur = self._conn.execute(
            "SELECT c.sha, c.author_name, c.author_email, c.authored_at, c.subject, c.body "
            "FROM commits c "
            "JOIN commit_files f ON f.sha = c.sha "
            "WHERE f.path = ? "
            "ORDER BY c.authored_at DESC",
            (path,),
        )
        return cur.fetchall()

    def fetch_co_changes(self, shas: Sequence[str], target_path: str) -> Counter[str]:
        """Count co-changed files for a fixed list of commit SHAs.

        Replaces the ``git log --no-walk --numstat`` pass when every input
        sha is already cached. Callers that detect a miss must fall back.
        """
        if not shas:
            return Counter()
        # SQLite caps the number of host parameters in a single statement at
        # 999 by default. Chunk so we never exceed that, no matter how
        # large the file's history is.
        counter: Counter[str] = Counter()
        chunk_size = 500
        for i in range(0, len(shas), chunk_size):
            chunk = shas[i : i + chunk_size]
            placeholders = ",".join("?" for _ in chunk)
            sql = (
                f"SELECT path, COUNT(*) AS n FROM commit_files "
                f"WHERE sha IN ({placeholders}) AND path != ? "
                f"GROUP BY path"
            )
            params: list[str] = [*chunk, target_path]
            cur = self._conn.execute(sql, params)
            for row in cur.fetchall():
                counter[str(row["path"])] += int(row["n"])
        return counter

    def fetch_files_for_commit(self, sha: str) -> list[sqlite3.Row]:
        """Return diffstat rows for a single commit (used by ``files_changed_in``)."""
        cur = self._conn.execute(
            "SELECT sha, path, insertions, deletions "
            "FROM commit_files WHERE sha = ?",
            (sha,),
        )
        return cur.fetchall()

    def shas_missing_files(self, shas: Sequence[str]) -> list[str]:
        """Return the subset of ``shas`` for which we have no diffstat rows."""
        if not shas:
            return []
        present: set[str] = set()
        chunk_size = 500
        for i in range(0, len(shas), chunk_size):
            chunk = shas[i : i + chunk_size]
            placeholders = ",".join("?" for _ in chunk)
            cur = self._conn.execute(
                f"SELECT DISTINCT sha FROM commit_files WHERE sha IN ({placeholders})",
                list(chunk),
            )
            present.update(str(row["sha"]) for row in cur.fetchall())
        # Preserve caller ordering of the missing subset.
        return [s for s in shas if s not in present]

    # ---- stats ------------------------------------------------------------

    def stats(self) -> CacheStats:
        """Snapshot row counts and file size for ``whycode cache stats``."""
        commit_count = int(
            self._conn.execute("SELECT COUNT(*) FROM commits").fetchone()[0]
        )
        file_row_count = int(
            self._conn.execute("SELECT COUNT(*) FROM commit_files").fetchone()[0]
        )
        try:
            size_bytes = self.db_path.stat().st_size
        except OSError:
            size_bytes = 0
        return CacheStats(
            path=self.db_path,
            exists=self.db_path.exists(),
            schema_version=self.schema_version,
            head_sha=self.head_sha,
            commit_count=commit_count,
            file_row_count=file_row_count,
            size_bytes=size_bytes,
        )
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
# ---------- module-level helpers --------------------------------------------
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def cache_path_for(repo_root: Path) -> Path:
    """Return the canonical cache file location for ``repo_root``."""
    cache_dir = repo_root / CACHE_DIRNAME
    return cache_dir / CACHE_FILENAME
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def open_for(repo_root: Path) -> CacheStore:
    """Open (creating if absent) the cache store for ``repo_root``."""
    db_path = cache_path_for(repo_root)
    return CacheStore(db_path)
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def parse_authored_at(value: str) -> datetime:
    """Turn a stored ``authored_at`` string back into a ``datetime``.

    One shared implementation so callers never invent their own parsing
    dance — ``datetime.fromisoformat`` accepts what ``%aI`` emits.
    """
    cleaned = value.strip()
    return datetime.fromisoformat(cleaned)
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def remove(repo_root: Path) -> bool:
    """Delete the cache file. Returns True if a file was removed.

    Uses EAFP (``unlink`` + ``FileNotFoundError``) rather than an
    ``exists()`` check followed by ``unlink()``: the check-then-act form
    raises if another process deletes the file between the two calls.
    """
    p = cache_path_for(repo_root)
    try:
        p.unlink()
    except FileNotFoundError:
        return False
    # Try to drop the parent dir too, but only if we left it empty.
    with contextlib.suppress(OSError):
        p.parent.rmdir()
    return True
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def with_closing(store: CacheStore) -> closing[CacheStore]:
    """Wrap ``store`` so a ``with`` statement closes it on exit.

    Convenience for callers that prefer context-managed cleanup over
    calling ``close()`` by hand.
    """
    wrapped = closing(store)
    return wrapped
|