git2xml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- git2xml/__init__.py +43 -0
- git2xml/__main__.py +8 -0
- git2xml/api.py +95 -0
- git2xml/cli.py +158 -0
- git2xml/constants.py +31 -0
- git2xml/core.py +633 -0
- git2xml/git_scanner.py +708 -0
- git2xml/models.py +273 -0
- git2xml/py.typed +0 -0
- git2xml/utils.py +251 -0
- git2xml-0.1.0.dist-info/METADATA +349 -0
- git2xml-0.1.0.dist-info/RECORD +16 -0
- git2xml-0.1.0.dist-info/WHEEL +5 -0
- git2xml-0.1.0.dist-info/entry_points.txt +2 -0
- git2xml-0.1.0.dist-info/licenses/LICENSE +21 -0
- git2xml-0.1.0.dist-info/top_level.txt +1 -0
git2xml/git_scanner.py
ADDED
|
@@ -0,0 +1,708 @@
|
|
|
1
|
+
"""GitScanner: the only place that talks to the git binary.
|
|
2
|
+
|
|
3
|
+
Every git invocation goes through ``run_git`` (async, no shell, ``--`` before
|
|
4
|
+
paths), so this module is the project's entire subprocess surface. Methods are
|
|
5
|
+
stateless per call - they read configuration from ``self.config`` and take the
|
|
6
|
+
data they need as arguments - and return plain data (ScanResult, ChangedFile,
|
|
7
|
+
metadata maps, blobs), never XML. Concurrency is imposed by the caller in
|
|
8
|
+
``core``; the scanner knows nothing about the orchestrator's budget.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
import logging
|
|
13
|
+
import subprocess
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, overload
|
|
16
|
+
|
|
17
|
+
from .constants import LS_TREE_PATH_BUDGET
|
|
18
|
+
from .models import (
|
|
19
|
+
ChangedFile,
|
|
20
|
+
FileStatus,
|
|
21
|
+
Git2xmlConfig,
|
|
22
|
+
GitCommandError,
|
|
23
|
+
GitNotInstalledError,
|
|
24
|
+
NameStatusEntry,
|
|
25
|
+
NotAGitRepositoryError,
|
|
26
|
+
PRCommit,
|
|
27
|
+
ScanResult,
|
|
28
|
+
StagingState,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Suppress console flash on Windows. ``getattr`` falls back to 0 (no extra
# creation flags) wherever ``subprocess`` does not define CREATE_NO_WINDOW.
_SUBPROCESS_FLAGS = getattr(subprocess, "CREATE_NO_WINDOW", 0)

# Fixed prefix on every git invocation: no pager, no color, no path quoting,
# so output is stable and machine-parseable.
_GIT_PREFIX = ["git", "--no-pager", "-c", "color.ui=false", "-c", "core.quotepath=false"]

# Chunk size (in bytes) for streaming reads of a subprocess pipe.
_STREAM_CHUNK = 64 * 1024

# Read this many bytes beyond max_diff_size before giving up on a diff. The
# slack guarantees the returned (truncated) text still measures strictly over
# max_diff_size once decoded - a cut trailing multi-byte sequence collapses to a
# single U+FFFD, shrinking the re-encoded length by at most a few bytes - so the
# engine's diff_exceeds_limit check reliably drops it. Tiny next to the diffs
# this guards against, so it costs nothing in the common case.
_DIFF_CAP_SLACK = 8
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
async def _read_capped(stream: asyncio.StreamReader, ceiling: int) -> Tuple[bytes, bool]:
    """Accumulate bytes from ``stream`` until it ends or ``ceiling`` is reached.

    The stream is consumed in reads of at most ``_STREAM_CHUNK`` bytes. The
    result is ``(data, truncated)``: ``truncated`` is False when an empty read
    (EOF) was seen, and True when reading stopped because at least ``ceiling``
    bytes had been collected. The last read may overshoot, so the buffered
    data stays below ``ceiling + _STREAM_CHUNK`` bytes.
    """
    pieces: List[bytes] = []
    received = 0
    saw_eof = False
    while received < ceiling:
        piece = await stream.read(_STREAM_CHUNK)
        if not piece:
            saw_eof = True
            break
        pieces.append(piece)
        received += len(piece)
    return b"".join(pieces), not saw_eof
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def _drain(stream: asyncio.StreamReader) -> None:
    """Consume ``stream`` up to EOF, throwing away everything that is read.

    Meant to run alongside the capped stdout read: with nobody reading stderr,
    a talkative git could fill that pipe and stall while we wait on stdout.
    """
    exhausted = False
    while not exhausted:
        # An empty read is the EOF signal.
        exhausted = not await stream.read(_STREAM_CHUNK)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _parse_name_status(raw: str) -> Dict[str, NameStatusEntry]:
    """Parse NUL-delimited output from ``git diff --name-status -z``.

    Token sequence per entry:
      - normal (M/A/D/T/...):  STATUS NUL PATH NUL
      - rename/copy (R/C):     STATUS NUL OLD_PATH NUL NEW_PATH NUL

    Args:
        raw: Decoded stdout of the git command.

    Returns:
        A map keyed by path (the *new* path for renames/copies). Each value
        carries ``"status"`` and, for renames/copies only, ``"old_path"``.
        An entry cut short at the end of the input is dropped and parsing stops.
    """
    files: Dict[str, NameStatusEntry] = {}
    # Empty tokens (e.g. the one after the trailing NUL) are discarded up front,
    # so every token consumed below is guaranteed to be non-empty.
    tokens = iter([token for token in raw.split("\0") if token])
    for status_code in tokens:
        kind = status_code[0]
        if kind in ("R", "C"):
            # Rename/copy codes carry a similarity score (e.g. "R100"); only
            # the leading letter selects the status.
            old_path = next(tokens, None)
            new_path = next(tokens, None)
            if old_path is None or new_path is None:
                break  # truncated record: ignore it
            files[new_path] = {
                "status": FileStatus.RENAMED if kind == "R" else FileStatus.COPIED,
                "old_path": old_path,
            }
        else:
            path = next(tokens, None)
            if path is None:
                break  # status code without a path: ignore it
            files[path] = {"status": FileStatus.from_code(status_code)}
    return files
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _normalize_git_path(path: str) -> str:
|
|
114
|
+
"""Converts a path to a POSIX-style relative path suitable for git tree lookups."""
|
|
115
|
+
return path.replace("\\", "/")
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _chunk_paths(paths: List[str], budget: int) -> Iterator[List[str]]:
    """Yield batches of POSIX-normalized paths whose command-line cost fits ``budget``.

    Each path costs its length in UTF-16 code units plus 3 (quote wrapping and
    the argument separator); the git prefix and executable path are assumed to
    be covered by the reserve already baked into ``budget``. A path that alone
    exceeds the budget still gets a batch of its own instead of being dropped -
    acceptable, since POSIX MAX_ARG_STRLEN (128 KiB) dwarfs any real path.
    """
    pending: List[str] = []
    used = 0
    for original in paths:
        candidate = _normalize_git_path(original)
        # Measured in UTF-16 units to match LS_TREE_PATH_BUDGET's
        # Windows-derived cap (astral chars = 2 units). See constants.py before
        # changing the budget per-platform.
        cost = len(candidate.encode("utf-16-le")) // 2 + 3
        if pending and used + cost > budget:
            yield pending
            pending = []
            used = 0
        pending.append(candidate)
        used += cost
    if pending:
        yield pending
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
class GitScanner:
|
|
143
|
+
config: Git2xmlConfig
|
|
144
|
+
cwd: str
|
|
145
|
+
|
|
146
|
+
def __init__(self, config: Git2xmlConfig):
    """Bind the scanner to ``config`` and pin the repository directory.

    ``config.repo`` is resolved to an absolute path once and stored, as a
    string, in ``self.cwd``.
    """
    self.config = config
    repo_root = Path(config.repo).resolve()
    self.cwd = str(repo_root)
|
|
149
|
+
|
|
150
|
+
@overload
async def run_git(
    self, args: List[str], binary: Literal[False] = False
) -> "subprocess.CompletedProcess[str]": ...
@overload
async def run_git(
    self, args: List[str], binary: Literal[True]
) -> "subprocess.CompletedProcess[bytes]": ...

async def run_git(
    self, args: List[str], binary: bool = False
) -> "subprocess.CompletedProcess[Any]":
    """Run a git command asynchronously using instance-scoped configuration.

    Prepends ``--no-pager`` and disables color/quotepath so output is stable
    and machine-parseable. Decodes stdout/stderr as UTF-8 (undecodable bytes
    are replaced) unless ``binary`` is set, in which case raw bytes are
    returned (used for blob fetches).

    Typed via ``@overload`` so the return is ``CompletedProcess[bytes]`` when
    ``binary=True`` and ``CompletedProcess[str]`` otherwise - callers get the
    right ``stdout``/``stderr`` type without a cast.

    Args:
        args: git arguments, without the leading ``git`` or the fixed prefix.
        binary: return ``stdout``/``stderr`` as bytes instead of text.

    Returns:
        A ``CompletedProcess`` whose ``args`` is the full command line. A
        non-zero exit is reported through ``returncode``, not raised.

    Raises:
        GitNotInstalledError: if the git executable is not on PATH.
        GitCommandError: if the command exceeds the configured git timeout.
    """
    base_cmd = _GIT_PREFIX + args
    process = await self._spawn(base_cmd)

    async def _kill_and_reap() -> None:
        # Process.kill() raises ProcessLookupError once asyncio has torn down
        # the transport of a child that already exited. Without the guard that
        # error would replace the timeout/cancellation being reported.
        if process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass
        # Reap the child; avoids a zombie and "event loop closed" warnings.
        await process.wait()

    # communicate() manages stream consumption in the background, so no manual
    # pipe draining is done here before waiting on the killed child.
    try:
        stdout_bytes, stderr_bytes = await asyncio.wait_for(
            process.communicate(), timeout=self.config.git_timeout
        )
    except asyncio.TimeoutError:
        await _kill_and_reap()
        raise GitCommandError(
            command=" ".join(base_cmd),
            returncode=-1,
            stderr=f"Command timed out after {self.config.git_timeout}s.",
        ) from None
    except asyncio.CancelledError:
        # Task aborted mid-await (agent/host cancelled the coroutine). A
        # cancelled communicate() does not kill the child, so kill+reap to
        # avoid an orphan, then let the cancellation propagate.
        await _kill_and_reap()
        raise

    # communicate() guarantees the process has terminated; satisfy type checkers.
    return_code = process.returncode if process.returncode is not None else -1

    stdout = stdout_bytes if binary else stdout_bytes.decode("utf-8", errors="replace")
    stderr = stderr_bytes if binary else stderr_bytes.decode("utf-8", errors="replace")

    return subprocess.CompletedProcess(
        args=base_cmd, returncode=return_code, stdout=stdout, stderr=stderr
    )
|
|
214
|
+
|
|
215
|
+
async def _spawn(self, cmd: List[str]) -> "asyncio.subprocess.Process":
    """Spawn a git process with the standard pipes/flags (no shell, no stdin).

    The child runs in ``self.cwd`` with stdin attached to the null device and
    stdout/stderr captured through pipes.

    Args:
        cmd: Full argument vector, executable first.

    Returns:
        The started asyncio subprocess.

    Raises:
        NotAGitRepositoryError: if the spawn failed and ``self.cwd`` is not an
            existing directory (e.g. the repository was removed mid-run).
        GitNotInstalledError: if the executable could not be found.
    """
    try:
        return await asyncio.create_subprocess_exec(
            *cmd,
            cwd=self.cwd,
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            creationflags=_SUBPROCESS_FLAGS,
        )
    except FileNotFoundError:
        # A missing working directory surfaces as the same exception type as a
        # missing executable; tell them apart so a vanished repository is not
        # reported as "git is not installed".
        if not Path(self.cwd).is_dir():
            raise NotAGitRepositoryError(self.cwd) from None
        raise GitNotInstalledError() from None
|
|
228
|
+
|
|
229
|
+
async def _run_git_capped(self, args: List[str]) -> Tuple[int, str, bool]:
    """Run a diff-producing git command, bounding buffered stdout in memory.

    Unlike content (whose size git reports before we load it), a diff has no
    size until git computes it, and ``communicate()`` would buffer the whole
    thing - a modified-but-tracked multi-megabyte file produces a diff just as
    large, and these fetches run ``diff_semaphore_limit``-wide, so the naive
    path can balloon to (limit x diff size) of RAM. Here stdout is streamed and
    abandoned once it passes ``max_diff_size`` (there's no point buffering bytes
    the engine will drop), while stderr is drained concurrently to avoid a
    pipe-buffer deadlock.

    Args:
        args: git arguments, without the leading ``git`` or the fixed prefix.

    Returns:
        ``(returncode, stdout_text, truncated)``. When ``truncated`` is True
        the read stopped at the cap, the child was killed if still running,
        and ``returncode`` is meaningless (the text is already over-limit and
        the engine drops it). ``max_diff_size <= 0`` means uncapped (honors
        ``--max-diff-size 0``) and defers to ``run_git``.

    Raises:
        GitCommandError: if reading stdout exceeds the configured git timeout.
    """
    max_diff_size = self.config.max_diff_size
    if max_diff_size <= 0:
        res = await self.run_git(args)
        return res.returncode, res.stdout, False

    cmd = _GIT_PREFIX + args
    process = await self._spawn(cmd)
    assert process.stdout is not None
    assert process.stderr is not None
    ceiling = max_diff_size + _DIFF_CAP_SLACK
    stderr_task = asyncio.create_task(_drain(process.stderr))

    def _kill_if_running() -> None:
        # A small over-limit diff is often fully written before we look at it,
        # so the child may already have exited. asyncio's Process.kill() raises
        # ProcessLookupError once the exited child's transport is torn down,
        # which must not replace the result (or the error) being reported.
        if process.returncode is None:
            try:
                process.kill()
            except ProcessLookupError:
                pass

    try:
        try:
            stdout_bytes, truncated = await asyncio.wait_for(
                _read_capped(process.stdout, ceiling),
                timeout=self.config.git_timeout,
            )
        except asyncio.TimeoutError:
            # Child is still running and blocked; kill so the finally can reap it.
            _kill_if_running()
            raise GitCommandError(
                command=" ".join(cmd),
                returncode=-1,
                stderr=f"Command timed out after {self.config.git_timeout}s.",
            ) from None
        if truncated:
            # Cap reached mid-diff: kill to stop further output and to EOF the
            # child's stderr so the concurrent _drain can finish.
            _kill_if_running()
    except asyncio.CancelledError:
        # Coroutine cancelled mid-read; the child outlives the await unless killed.
        _kill_if_running()
        raise
    except Exception:
        # Unexpected stream/transport error: don't leave the child running.
        _kill_if_running()
        raise
    finally:
        # On every path the child must be reaped, and on Windows' Proactor loop
        # process.wait() hangs while a pipe holds unread bytes - so drain both
        # pipes to EOF first. A clean read leaves the child exiting on its own;
        # it is reaped here without a kill.
        await asyncio.gather(_drain(process.stdout), stderr_task, return_exceptions=True)
        await process.wait()

    return_code = process.returncode if process.returncode is not None else -1
    return return_code, stdout_bytes.decode("utf-8", errors="replace"), truncated
|
|
293
|
+
|
|
294
|
+
def check_git_result(self, result: subprocess.CompletedProcess, command: str) -> None:
    """Turn a failed git result into a GitCommandError; succeed silently otherwise.

    ``command`` is the human-readable label put into the error. Intended for
    callers that treat any non-zero exit as fatal (unlike the lookup helpers,
    which interpret specific codes themselves).

    Raises:
        GitCommandError: carrying ``command``, the exit code and the captured
            stderr, when ``result.returncode`` is non-zero.
    """
    if result.returncode == 0:
        return
    raise GitCommandError(
        command=command, returncode=result.returncode, stderr=result.stderr
    )
|
|
307
|
+
|
|
308
|
+
async def validate_repository(self) -> None:
    """Ensure ``self.cwd`` is an existing directory inside a git work tree.

    The work-tree check is ``git rev-parse --is-inside-work-tree``, which must
    exit 0 and print ``true`` (so a non-repo directory and a bare repository
    are both rejected).

    Raises:
        NotAGitRepositoryError: if the directory is missing or the probe does
            not confirm a work tree.
    """
    if Path(self.cwd).is_dir():
        probe = await self.run_git(["rev-parse", "--is-inside-work-tree"])
        if probe.returncode == 0 and probe.stdout.strip() == "true":
            return
    raise NotAGitRepositoryError(self.cwd)
|
|
321
|
+
|
|
322
|
+
async def resolve_base_ref(self) -> str:
    """Resolve ``self.config.base`` to a ref that ``git rev-parse --verify`` accepts.

    The value is tried verbatim first (covers remote refs such as origin/main,
    tags, commit SHAs and bare branch names); if that fails and it is not
    already under ``refs/``, ``refs/heads/<base>`` is tried. The form that
    verified is returned.

    Raises:
        GitCommandError: if the value starts with '-' or neither form resolves.
    """
    base = self.config.base

    # A leading '-' would be parsed by git as an option, not a ref (option
    # injection). No legitimate ref a user passes starts with '-': branch
    # names forbid it, and refs/tags/SHAs/remote refs never lead with it.
    # Only the first character matters - feature/-x or my-branch are fine -
    # and a dash-named branch stays reachable as refs/heads/-name.
    if base.startswith("-"):
        raise GitCommandError(
            command=f"rev-parse --verify {base}",
            returncode=-1,
            stderr=(
                f"Invalid base ref '{base}': cannot start with '-'. "
                "If you have a branch named this way, pass its full ref "
                "(e.g. refs/heads/...)."
            ),
        )

    verbatim = await self.run_git(["rev-parse", "--verify", base])
    if verbatim.returncode == 0:
        return base

    if not base.startswith("refs/"):
        qualified = f"refs/heads/{base}"
        retry = await self.run_git(["rev-parse", "--verify", qualified])
        if retry.returncode == 0:
            return qualified

    # The exit code reported is the one from the verbatim attempt.
    raise GitCommandError(
        command=f"rev-parse --verify {base}",
        returncode=verbatim.returncode,
        stderr=f"Base branch '{base}' not found in repository.",
    )
|
|
359
|
+
|
|
360
|
+
async def scan_commit_changes(self) -> ScanResult:
    """Scan the working tree for changes against HEAD, returning a ScanResult.

    Runs git queries concurrently: staged changes
    (``diff --name-status --staged``) and, unless ``self.config.staged`` is set,
    also unstaged changes (``diff --name-status``) and untracked files
    (``ls-files --others --exclude-standard``). Each changed file becomes a
    ChangedFile with a derived StagingState - a path present in both the staged
    and unstaged sets is STAGED_AND_MODIFIED, staged-only is STAGED,
    unstaged-only is UNSTAGED, and untracked files are UNTRACKED. Renames and
    copies keep their ``old_path`` whichever set they come from.

    Under ``self.config.staged`` the unstaged and untracked scans are skipped
    (their results would be discarded by the caller's ``--staged`` filter anyway),
    so every staged file is reported as plain STAGED. The STAGED_AND_MODIFIED
    label is not derivable without the unstaged scan, but it is unused in
    ``--staged`` mode - it does not affect the filter, the XML output, or
    ``has_staged``.

    ``has_staged`` reports whether anything was staged, so the caller can give a
    precise message when ``--staged`` finds nothing.

    Raises:
        GitCommandError: if any of the git queries exits non-zero.
    """
    staged_query = self.run_git(["diff", "--name-status", "-z", "--staged"])

    if self.config.staged:
        staged_res = await staged_query
        self.check_git_result(staged_res, "diff staged")
        staged = _parse_name_status(staged_res.stdout)
        all_files = [
            ChangedFile(path, info["status"], StagingState.STAGED, info.get("old_path"))
            for path, info in staged.items()
        ]
        return ScanResult(files=all_files, has_staged=len(staged) > 0)

    staged_res, unstaged_res, untracked_res = await asyncio.gather(
        staged_query,
        self.run_git(["diff", "--name-status", "-z"]),
        self.run_git(["ls-files", "-z", "--others", "--exclude-standard"]),
    )

    self.check_git_result(staged_res, "diff staged")
    self.check_git_result(unstaged_res, "diff unstaged")
    self.check_git_result(untracked_res, "ls-files untracked")

    staged = _parse_name_status(staged_res.stdout)
    unstaged = _parse_name_status(unstaged_res.stdout)
    untracked = [p for p in untracked_res.stdout.split("\0") if p]

    all_files = []
    for path, info in staged.items():
        staging = StagingState.STAGED_AND_MODIFIED if path in unstaged else StagingState.STAGED
        all_files.append(ChangedFile(path, info["status"], staging, info.get("old_path")))
    for path, info in unstaged.items():
        if path not in staged:
            # Carry old_path here too, so an unstaged rename/copy is described
            # the same way as a staged one instead of losing its source path.
            all_files.append(
                ChangedFile(path, info["status"], StagingState.UNSTAGED, info.get("old_path"))
            )
    for path in untracked:
        all_files.append(ChangedFile(path, FileStatus.UNTRACKED, StagingState.UNTRACKED))

    return ScanResult(files=all_files, has_staged=len(staged) > 0)
|
|
419
|
+
|
|
420
|
+
async def scan_pr_changes(self, base_branch: str) -> ScanResult:
    """List what the current branch changed relative to ``base_branch``.

    Runs ``diff --name-status -z <base>...HEAD`` - the three-dot (merge-base)
    form - so the result is what the branch added since it forked, not the raw
    tip-to-tip difference.

    Raises:
        GitCommandError: if the diff command exits non-zero.
    """
    revision_range = f"{base_branch}...HEAD"
    listing = await self.run_git(["diff", "--name-status", "-z", revision_range])
    self.check_git_result(listing, f"diff {revision_range}")

    # Staging state is meaningless in PR mode (these are committed branch
    # changes, not working-tree/index states). StagingState.STAGED on each file
    # and has_staged=True are benign placeholders - the commit-mode code paths
    # that read them (staging filter, --staged guard) never run in PR mode.
    changed: List[ChangedFile] = []
    for path, info in _parse_name_status(listing.stdout).items():
        changed.append(
            ChangedFile(path, info["status"], StagingState.STAGED, info.get("old_path"))
        )
    return ScanResult(files=changed, has_staged=True)
|
|
444
|
+
|
|
445
|
+
async def get_single_diff(
    self,
    file: ChangedFile,
    base_ref: str,
    untracked: bool = False,
) -> Tuple[str, str]:
    """Produce ``(path, diff_text)`` for a single changed file.

    Args:
        file: The file whose diff is wanted.
        base_ref: What to diff against (e.g. ``"HEAD"`` or ``"main...HEAD"``).
            Ignored for untracked files.
        untracked: When True, build an add-diff with
            ``git diff --no-index -- /dev/null <path>``. That form exits 1
            whenever the inputs differ (the normal case), so exit codes 0 and
            1 both count as success.

    Whether ``--cached`` is added is decided by ``self.config.staged``.

    Every invocation carries ``--no-ext-diff`` and ``--no-textconv`` so that
    repo-configured external diff / textconv drivers of an untrusted repo are
    not run (``--no-index`` honors textconv too, hence the guard on the
    untracked path as well).

    Returns:
        The file path plus the diff with surrounding whitespace stripped; the
        capped text as-is when the diff was truncated; an empty string when
        git exited with an unaccepted code.
    """
    if untracked:
        # Plain `git diff` cannot see untracked files; diffing against
        # "/dev/null" yields git's canonical creation header on every platform.
        diff_args = [
            "diff",
            "--no-ext-diff",
            "--no-textconv",
            "--no-index",
            "--",
            "/dev/null",
            _normalize_git_path(file.path),
        ]
        accepted_codes: Tuple[int, ...] = (0, 1)
        failure_format = "no-index diff for %s exited %d"
    else:
        diff_args = ["diff", "--no-ext-diff", "--no-textconv", "-M"]
        if self.config.staged:
            diff_args.append("--cached")
        diff_args.extend([base_ref, "--", file.path])
        if file.old_path:
            # Include the rename source so -M can pair the two sides.
            diff_args.append(file.old_path)
        accepted_codes = (0,)
        failure_format = "diff for %s exited %d"

    rc, out, truncated = await self._run_git_capped(diff_args)
    if truncated:
        return file.path, out  # over-limit; the engine drops it
    if rc not in accepted_codes:
        logger.debug(failure_format, file.path, rc)
        return file.path, ""
    return file.path, out.strip()
|
|
501
|
+
|
|
502
|
+
async def get_current_branch(self) -> str:
    """Return what ``git rev-parse --abbrev-ref HEAD`` prints, or ``"HEAD"`` on failure."""
    lookup = await self.run_git(["rev-parse", "--abbrev-ref", "HEAD"])
    if lookup.returncode != 0:
        return "HEAD"
    return lookup.stdout.strip()
|
|
506
|
+
|
|
507
|
+
async def get_pr_commits(self, base_branch: str) -> List[PRCommit]:
    """Return the commits in ``<base_branch>..HEAD`` as structured records.

    ``git log -z`` separates records with NUL (which cannot appear in commit
    content, so record splitting is unambiguous). Inside a record the five
    fields are joined by %x1e (ASCII Record Separator). A literal RS byte in
    the author or subject would misalign that one record - accepted as
    negligible, since RS does not occur in organic commit metadata.

    Returns an empty list when git fails or prints nothing; records that do
    not split into exactly five fields are skipped.
    """
    # %h short hash, %an author, %aI ISO-8601 date, %s subject, %b body
    format_str = "%x1e".join(["%h", "%an", "%aI", "%s", "%b"])
    log = await self.run_git(
        ["log", "-z", f"{base_branch}..HEAD", f"--format={format_str}"]
    )
    if log.returncode != 0 or not log.stdout:
        return []

    commits: List[PRCommit] = []
    for record in log.stdout.split("\x00"):
        if not record.strip():
            continue
        # maxsplit=4 keeps any further RS bytes inside the body field.
        fields = record.split("\x1e", 4)
        if len(fields) != 5:
            continue
        short_hash, author, date, subject, body = (field.strip() for field in fields)
        commits.append(
            {
                "hash": short_hash,
                "author": author,
                "date": date,
                "subject": subject,
                "body": body,
            }
        )
    return commits
|
|
540
|
+
|
|
541
|
+
async def has_uncommitted_changes(self) -> bool:
    """Report whether ``git status --porcelain`` printed anything but whitespace.

    The exit code is not inspected; only the captured stdout decides.
    """
    status = await self.run_git(["status", "--porcelain"])
    return status.stdout.strip() != ""
|
|
545
|
+
|
|
546
|
+
async def get_blob_size_at_index(self, path: str) -> int:
    """Look up the byte size of ``path``'s blob in the index; -1 when unavailable.

    Asks ``git cat-file -s :<path>``, so a caller can enforce a size limit
    *before* pulling the blob into memory. -1 is returned when git exits
    non-zero (e.g. the path is not in the index) or prints something that is
    not an integer.
    """
    index_spec = ":" + _normalize_git_path(path)
    probe = await self.run_git(["cat-file", "-s", index_spec])
    if probe.returncode == 0:
        try:
            return int(probe.stdout.strip())
        except ValueError:
            pass
    return -1
|
|
562
|
+
|
|
563
|
+
async def _ls_tree_meta(
|
|
564
|
+
self, ref: str, paths: Optional[List[str]] = None
|
|
565
|
+
) -> Dict[str, Tuple[int, bool]]:
|
|
566
|
+
"""Run one ``ls-tree -r -l -z`` (optionally scoped to ``paths``) and parse it.
|
|
567
|
+
|
|
568
|
+
``paths`` are assumed already POSIX-normalized and small enough for one
|
|
569
|
+
command line (chunking is the caller's job). The ``-l`` record format is::
|
|
570
|
+
|
|
571
|
+
<mode> SP <type> SP <object> SP <size> TAB <path>
|
|
572
|
+
|
|
573
|
+
so we partition once on TAB to isolate the path (which may contain spaces),
|
|
574
|
+
then whitespace-split the metadata half. Size is -1 for entries git reports
|
|
575
|
+
without one (the '-' placeholder for non-blobs). Empty map on non-zero exit.
|
|
576
|
+
"""
|
|
577
|
+
args = ["ls-tree", "-r", "-l", "-z", ref]
|
|
578
|
+
if paths:
|
|
579
|
+
args += ["--", *paths]
|
|
580
|
+
|
|
581
|
+
result = await self.run_git(args)
|
|
582
|
+
if result.returncode != 0:
|
|
583
|
+
logger.debug(
|
|
584
|
+
"ls-tree for %s exited %d: %s", ref, result.returncode, result.stderr.strip()
|
|
585
|
+
)
|
|
586
|
+
return {}
|
|
587
|
+
|
|
588
|
+
meta: Dict[str, Tuple[int, bool]] = {}
|
|
589
|
+
for record in result.stdout.split("\0"):
|
|
590
|
+
if not record:
|
|
591
|
+
continue
|
|
592
|
+
head, tab, path = record.partition("\t")
|
|
593
|
+
if not tab:
|
|
594
|
+
continue
|
|
595
|
+
fields = head.split()
|
|
596
|
+
if len(fields) < 4:
|
|
597
|
+
continue
|
|
598
|
+
mode, size_str = fields[0], fields[3]
|
|
599
|
+
try:
|
|
600
|
+
size = int(size_str)
|
|
601
|
+
except ValueError:
|
|
602
|
+
size = -1 # '-' placeholder for non-blob entries
|
|
603
|
+
meta[path] = (size, mode == "120000")
|
|
604
|
+
return meta
|
|
605
|
+
|
|
606
|
+
async def get_tree_metadata(
    self, ref: str, paths: Optional[List[str]] = None
) -> Dict[str, Tuple[int, bool]]:
    """Build ``{path: (size, is_symlink)}`` for blobs in the tree of ``ref``.

    When ``paths`` is given the listing is always scoped to them; they are
    split into batches only so one argument list never exceeds the platform
    command-line limit (see LS_TREE_PATH_BUDGET). Batches run concurrently,
    at most ``self.config.diff_semaphore_limit`` at a time, and since they
    partition the paths into disjoint sets the merge order does not matter.
    There is deliberately no full-tree fallback: scoping fetches P entries,
    whereas the whole tree fetches every entry (N >= P) just to save process
    spawns - a bad trade in the large repos where scoping matters most.
    ``paths=None`` (or an empty list) lists the entire tree.

    Callers only look up changed paths and never iterate the map, so a scoped
    map serves them exactly like the full tree would. The map is empty when
    ``ref`` cannot be listed; callers treat a missing path as size -1.
    """
    if not paths:
        return await self._ls_tree_meta(ref)

    limiter = asyncio.Semaphore(self.config.diff_semaphore_limit)

    async def _list_batch(batch: List[str]) -> Dict[str, Tuple[int, bool]]:
        async with limiter:
            return await self._ls_tree_meta(ref, batch)

    pending = [_list_batch(batch) for batch in _chunk_paths(paths, LS_TREE_PATH_BUDGET)]
    partial_maps = await asyncio.gather(*pending)  # concurrent

    merged: Dict[str, Tuple[int, bool]] = {}
    for partial in partial_maps:  # disjoint key sets, so order is moot
        merged.update(partial)
    return merged
|
|
638
|
+
|
|
639
|
+
async def get_blob_at_head(self, path: str) -> Optional[bytes]:
    """Read ``path`` as stored at HEAD via ``git show HEAD:<path>``.

    Returns the raw bytes (``b''`` for an empty file), or None when git exits
    non-zero.
    """
    spec = "HEAD:" + _normalize_git_path(path)
    shown = await self.run_git(["show", spec], binary=True)
    if shown.returncode != 0:
        return None
    return shown.stdout
|
|
644
|
+
|
|
645
|
+
async def get_blob_at_index(self, path: str) -> Optional[bytes]:
    """Read ``path`` as stored in the git index via ``git show :<path>``.

    Returns the raw bytes (``b''`` for an empty file), or None when git exits
    non-zero.
    """
    spec = ":" + _normalize_git_path(path)
    shown = await self.run_git(["show", spec], binary=True)
    if shown.returncode != 0:
        return None
    return shown.stdout
|
|
650
|
+
|
|
651
|
+
async def is_symlink_at_index(self, path: str) -> bool:
    """Tell whether the index stores ``path`` with mode 120000 (a symlink).

    Reads the first whitespace-separated field of ``git ls-files -s -- <path>``.
    False when git exits non-zero or prints nothing.
    """
    listing = await self.run_git(["ls-files", "-s", "--", _normalize_git_path(path)])
    if listing.returncode != 0:
        return False
    columns = listing.stdout.split()
    # Blank output yields no columns at all.
    return bool(columns) and columns[0] == "120000"
|
|
661
|
+
|
|
662
|
+
async def get_index_metadata(
    self, paths: Optional[List[str]] = None
) -> Dict[str, Tuple[int, bool]]:
    """Build ``{path: (size, is_symlink)}`` for blobs in the git index.

    Index counterpart of ``get_tree_metadata``: ``git write-tree`` stores the
    current index as a tree object (read-only w.r.t. index/working tree/HEAD -
    it only adds loose objects, reclaimed later by ``git gc``), and that tree
    is then listed through ``get_tree_metadata``, inheriting its scoping and
    chunking. This stands in for the per-file ``cat-file -s`` + ``ls-files -s``
    probes that --staged rendering would otherwise issue.

    Returns an empty map when the index cannot be written as a tree (e.g. an
    unmerged index) or no tree id is printed; callers treat a missing path as
    size -1 and fall back to per-file probes.
    """
    written = await self.run_git(["write-tree"])
    if written.returncode != 0:
        logger.debug("write-tree failed (%d): %s", written.returncode, written.stderr.strip())
        return {}

    tree_sha = written.stdout.strip()
    if tree_sha:
        return await self.get_tree_metadata(tree_sha, paths)
    return {}
|
|
685
|
+
|
|
686
|
+
async def new_file_size(self, f: ChangedFile, tree_meta: Dict[str, Tuple[int, bool]]) -> int:
    """Report a new file's byte size from the source the render path reads.

    The size comes from one of three places:

    - the working tree (``stat``) for untracked files and for commit mode
      without ``--staged``;
    - the batched ``tree_meta`` map in PR mode;
    - ``tree_meta`` and then, if the path is missing there, the index
      (``cat-file`` on ``:<path>``) in commit mode with ``--staged``.

    This is meant to mirror ``_render_file_xml`` so the diff-side and
    content-side size guards agree: a large new file is never fully read as a
    diff under --no-content when it would be omitted in normal mode.

    Returns -1 when the size cannot be determined.
    """
    mode = self.config.command
    staged_only = self.config.staged
    reads_worktree = f.status == FileStatus.UNTRACKED or (mode == "commit" and not staged_only)

    if reads_worktree:
        try:
            return (Path(self.cwd) / f.path).stat().st_size
        except OSError:
            return -1

    recorded = tree_meta.get(f.path, (-1, False))[0]
    if mode == "pr" or recorded >= 0:
        return recorded
    # commit + --staged with no entry in the map: probe the index directly.
    return await self.get_blob_size_at_index(f.path)
|