git2xml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
git2xml/git_scanner.py ADDED
@@ -0,0 +1,708 @@
1
+ """GitScanner: the only place that talks to the git binary.
2
+
3
+ Every git invocation goes through ``run_git`` (async, no shell, ``--`` before
4
+ paths), so this module is the project's entire subprocess surface. Methods are
5
+ stateless per call - they read configuration from ``self.config`` and take the
6
+ data they need as arguments - and return plain data (ScanResult, ChangedFile,
7
+ metadata maps, blobs), never XML. Concurrency is imposed by the caller in
8
+ ``core``; the scanner knows nothing about the orchestrator's budget.
9
+ """
10
+
11
+ import asyncio
12
+ import logging
13
+ import subprocess
14
+ from pathlib import Path
15
+ from typing import Any, Dict, Iterator, List, Literal, Optional, Tuple, overload
16
+
17
+ from .constants import LS_TREE_PATH_BUDGET
18
+ from .models import (
19
+ ChangedFile,
20
+ FileStatus,
21
+ Git2xmlConfig,
22
+ GitCommandError,
23
+ GitNotInstalledError,
24
+ NameStatusEntry,
25
+ NotAGitRepositoryError,
26
+ PRCommit,
27
+ ScanResult,
28
+ StagingState,
29
+ )
30
+
31
logger = logging.getLogger(__name__)

# Suppress console flash on Windows. ``subprocess.CREATE_NO_WINDOW`` is looked up
# with a default of 0, so where the attribute is absent the flag is a no-op.
_SUBPROCESS_FLAGS = getattr(subprocess, "CREATE_NO_WINDOW", 0)

# Fixed prefix on every git invocation: no pager, no color, no path quoting,
# so output is stable and machine-parseable.
_GIT_PREFIX = ["git", "--no-pager", "-c", "color.ui=false", "-c", "core.quotepath=false"]

# Chunk size (bytes) for streaming reads of a subprocess pipe.
_STREAM_CHUNK = 64 * 1024

# Read this many bytes beyond max_diff_size before giving up on a diff. The
# slack guarantees the returned (truncated) text still measures strictly over
# max_diff_size once decoded - a cut trailing multi-byte sequence collapses to a
# single U+FFFD, shrinking the re-encoded length by at most a few bytes - so the
# engine's diff_exceeds_limit check reliably drops it. Tiny next to the diffs
# this guards against, so it costs nothing in the common case.
_DIFF_CAP_SLACK = 8
50
+
51
+
52
async def _read_capped(stream: asyncio.StreamReader, ceiling: int) -> Tuple[bytes, bool]:
    """Collect bytes from ``stream`` until EOF or until ``ceiling`` bytes are held.

    Returns ``(data, truncated)``. ``truncated`` is False when EOF was observed
    before ``ceiling`` bytes had accumulated, and True when the loop stopped
    because at least ``ceiling`` bytes were collected (the stream is then left
    unread from that point on). Reads are issued in ``_STREAM_CHUNK`` pieces, so
    ``data`` can overshoot ``ceiling`` by less than one chunk; buffered memory is
    bounded by ``ceiling + _STREAM_CHUNK``.
    """
    received: List[bytes] = []
    remaining = ceiling
    while remaining > 0:
        piece = await stream.read(_STREAM_CHUNK)
        if not piece:
            # EOF arrived first: the whole output fits under the ceiling.
            return b"".join(received), False
        received.append(piece)
        remaining -= len(piece)
    # Budget exhausted without seeing EOF.
    return b"".join(received), True
68
+
69
+
70
async def _drain(stream: asyncio.StreamReader) -> None:
    """Consume ``stream`` up to EOF, throwing every byte away.

    Meant to run alongside the stdout read so that a git process writing a lot
    to stderr cannot stall on a full stderr pipe while stdout reading is capped.
    """
    # read() yields b"" only at EOF, which ends the loop.
    while await stream.read(_STREAM_CHUNK):
        pass
77
+
78
+
79
def _parse_name_status(raw: str) -> Dict[str, NameStatusEntry]:
    """Turn ``git diff --name-status -z`` output into ``{path: entry}``.

    The stream is a flat run of NUL-terminated tokens:
      - ordinary change (M/A/D/T/...): STATUS, PATH
      - rename or copy (R/C, usually with a score suffix): STATUS, OLD_PATH, NEW_PATH

    Renames and copies are keyed by the *new* path and carry ``old_path``.
    A record cut short at the end of the stream is ignored.
    """
    entries: Dict[str, NameStatusEntry] = {}
    # Empty tokens (e.g. after the final NUL) are dropped up front, so every
    # token pulled from the iterator is non-empty.
    stream = iter([token for token in raw.split("\0") if token])
    for code in stream:
        kind = code[0]
        if kind in ("R", "C"):
            source = next(stream, None)
            target = next(stream, None)
            if source is None or target is None:
                break  # truncated rename/copy record
            entries[target] = {
                "status": FileStatus.RENAMED if kind == "R" else FileStatus.COPIED,
                "old_path": source,
            }
            continue
        changed = next(stream, None)
        if changed is None:
            break  # status code with no path after it
        entries[changed] = {"status": FileStatus.from_code(code)}
    return entries
111
+
112
+
113
def _normalize_git_path(path: str) -> str:
    """Return ``path`` in the forward-slash form git uses for tree/index lookups.

    Backslashes are translated to ``/`` only on platforms where the backslash is
    the OS path separator (Windows). Elsewhere a backslash is an ordinary
    filename character that git stores verbatim, so rewriting it would point the
    lookup at a different, usually non-existent, path.
    """
    import os  # local import: keeps this helper self-contained

    if os.sep == "\\":
        return path.replace("\\", "/")
    return path
116
+
117
+
118
def _chunk_paths(paths: List[str], budget: int) -> Iterator[List[str]]:
    """Yield batches of normalized paths whose command-line cost stays within ``budget``.

    Each path is charged its length in UTF-16 code units plus 3 (quote wrapping
    and the argument separator); the fixed git prefix and executable path are
    assumed to be covered by a reserve already baked into ``budget``. A path
    that on its own costs more than ``budget`` is still yielded, alone, rather
    than dropped -- safe because POSIX MAX_ARG_STRLEN (128 KiB) dwarfs any real
    path.
    """
    pending: List[str] = []
    used = 0
    for original in paths:
        posix = _normalize_git_path(original)
        # UTF-16 units, matching LS_TREE_PATH_BUDGET's Windows-derived cap (astral
        # chars = 2 units). See constants.py before changing the budget per-platform.
        cost = len(posix.encode("utf-16-le")) // 2 + 3
        overflows = used + cost > budget
        if overflows and pending:
            # Flush the current batch; the path that did not fit starts the next.
            yield pending
            pending = []
            used = 0
        pending.append(posix)
        used += cost
    if pending:
        yield pending
140
+
141
+
142
class GitScanner:
    """Runs git subprocesses for one repository and returns plain data.

    Every command goes through ``run_git`` (or ``_run_git_capped`` for diffs),
    both of which spawn via ``_spawn``: no shell, no stdin, working directory
    fixed to the resolved ``config.repo``.
    """

    # Configuration object; the methods below read ``repo``, ``git_timeout``,
    # ``max_diff_size``, ``diff_semaphore_limit``, ``base``, ``staged`` and
    # ``command`` from it.
    config: Git2xmlConfig
    # Absolute, resolved form of ``config.repo``; used as cwd for every git call.
    cwd: str

    def __init__(self, config: Git2xmlConfig):
        self.config = config
        self.cwd = str(Path(config.repo).resolve())

    @staticmethod
    def _kill_quietly(process: "asyncio.subprocess.Process") -> None:
        """Kill ``process``, tolerating a child that has already exited.

        asyncio raises ``ProcessLookupError`` from ``kill()`` once its transport
        has finished with an exited child. "Already gone" is exactly the state a
        kill is meant to reach, so that error is swallowed here.
        """
        try:
            process.kill()
        except ProcessLookupError:
            pass

    @overload
    async def run_git(
        self, args: List[str], binary: Literal[False] = False
    ) -> "subprocess.CompletedProcess[str]": ...
    @overload
    async def run_git(
        self, args: List[str], binary: Literal[True]
    ) -> "subprocess.CompletedProcess[bytes]": ...

    async def run_git(
        self, args: List[str], binary: bool = False
    ) -> "subprocess.CompletedProcess[Any]":
        """Run a git command asynchronously using instance-scoped configuration.

        Prepends ``--no-pager`` and disables color/quotepath so output is stable
        and machine-parseable. Decodes stdout/stderr as UTF-8 unless ``binary`` is
        set, in which case raw bytes are returned (used for blob fetches).

        Typed via ``@overload`` so the return is ``CompletedProcess[bytes]`` when
        ``binary=True`` and ``CompletedProcess[str]`` otherwise - callers get the
        right ``stdout``/``stderr`` type without a cast.

        A non-zero exit is NOT an error here: it is reported through
        ``returncode`` and left to the caller to interpret.

        Raises:
            GitNotInstalledError: if the git executable is not on PATH.
            GitCommandError: if the command exceeds the configured git timeout.
        """

        base_cmd = _GIT_PREFIX + args
        process = await self._spawn(base_cmd)

        # communicate() manages stream consumption in the background. If cancelled or
        # timed out, it is safe to wait() after kill() without manual pipe draining.
        try:
            stdout_bytes, stderr_bytes = await asyncio.wait_for(
                process.communicate(), timeout=self.config.git_timeout
            )
        except asyncio.TimeoutError:
            # The child may have exited in the instant the timeout fired; a plain
            # kill() would then raise ProcessLookupError and mask the timeout.
            self._kill_quietly(process)
            # reap the killed child; avoids zombie + "event loop closed" warnings
            await process.wait()
            raise GitCommandError(
                command=" ".join(base_cmd),
                returncode=-1,
                stderr=f"Command timed out after {self.config.git_timeout}s.",
            ) from None
        except asyncio.CancelledError:
            # Task aborted mid-await (agent/host cancelled the coroutine). communicate()
            # being cancelled does not kill the child, so kill+reap to avoid an orphan,
            # then let the cancellation propagate.
            if process.returncode is None:
                self._kill_quietly(process)
                await process.wait()
            raise

        # communicate() guarantees the process has terminated; satisfy type checkers.
        return_code = process.returncode if process.returncode is not None else -1

        stdout = stdout_bytes if binary else stdout_bytes.decode("utf-8", errors="replace")
        stderr = stderr_bytes if binary else stderr_bytes.decode("utf-8", errors="replace")

        return subprocess.CompletedProcess(
            args=base_cmd, returncode=return_code, stdout=stdout, stderr=stderr
        )

    async def _spawn(self, cmd: List[str]) -> "asyncio.subprocess.Process":
        """Spawn a git process with the standard pipes/flags (no shell, no stdin).

        Raises:
            GitNotInstalledError: if the executable cannot be found.
        """
        try:
            return await asyncio.create_subprocess_exec(
                *cmd,
                cwd=self.cwd,
                stdin=asyncio.subprocess.DEVNULL,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
                creationflags=_SUBPROCESS_FLAGS,
            )
        except FileNotFoundError:
            raise GitNotInstalledError() from None

    async def _run_git_capped(self, args: List[str]) -> Tuple[int, str, bool]:
        """Run a diff-producing git command, bounding buffered stdout in memory.

        Unlike content (whose size git reports before we load it), a diff has no
        size until git computes it, and ``communicate()`` would buffer the whole
        thing - a modified-but-tracked multi-megabyte file produces a diff just as
        large, and these fetches run ``diff_semaphore_limit``-wide, so the naive
        path can balloon to (limit x diff size) of RAM. Here stdout is streamed and
        abandoned once it passes ``max_diff_size`` (there's no point buffering bytes
        the engine will drop), while stderr is drained concurrently to avoid a
        pipe-buffer deadlock.

        Returns ``(returncode, stdout_text, truncated)``. When ``truncated`` is
        True the child was killed mid-diff (or had just finished on its own) and
        ``returncode`` is meaningless (the text is already over-limit and the
        engine drops it). ``max_diff_size <= 0`` means uncapped (honors
        ``--max-diff-size 0``) and defers to ``run_git``.

        Raises:
            GitNotInstalledError: if the git executable is not on PATH.
            GitCommandError: if reading stdout exceeds the configured git timeout.
        """
        max_diff_size = self.config.max_diff_size
        if max_diff_size <= 0:
            res = await self.run_git(args)
            return res.returncode, res.stdout, False

        process = await self._spawn(_GIT_PREFIX + args)
        assert process.stdout is not None
        assert process.stderr is not None
        ceiling = max_diff_size + _DIFF_CAP_SLACK
        stderr_task = asyncio.create_task(_drain(process.stderr))

        # Every kill below goes through _kill_quietly: a diff only slightly over
        # the cap has usually been fully written (and git has exited) by the time
        # we notice, and a bare kill() would then raise ProcessLookupError instead
        # of letting the over-limit text be returned.
        try:
            try:
                stdout_bytes, truncated = await asyncio.wait_for(
                    _read_capped(process.stdout, ceiling),
                    timeout=self.config.git_timeout,
                )
            except asyncio.TimeoutError:
                # Child is still running and blocked; kill so the finally can reap it.
                self._kill_quietly(process)
                raise GitCommandError(
                    command=" ".join(_GIT_PREFIX + args),
                    returncode=-1,
                    stderr=f"Command timed out after {self.config.git_timeout}s.",
                ) from None
            if truncated:
                # Cap reached mid-diff: kill to stop further output and to EOF the
                # child's stderr so the concurrent _drain can finish.
                self._kill_quietly(process)
        except asyncio.CancelledError:
            # Coroutine cancelled mid-read; the child outlives the await unless killed.
            self._kill_quietly(process)
            raise
        except Exception:
            # Unexpected stream/transport error: don't leave the child running.
            self._kill_quietly(process)
            raise
        finally:
            # On every path the child must be reaped, and on Windows' Proactor loop
            # process.wait() hangs while a pipe holds unread bytes - so drain both
            # pipes to EOF first. A clean read leaves the child exiting on its own;
            # it is reaped here without a kill.
            await asyncio.gather(_drain(process.stdout), stderr_task, return_exceptions=True)
            await process.wait()

        return_code = process.returncode if process.returncode is not None else -1
        return return_code, stdout_bytes.decode("utf-8", errors="replace"), truncated

    def check_git_result(self, result: subprocess.CompletedProcess, command: str) -> None:
        """Raise GitCommandError if ``result`` is a non-zero exit, else do nothing.

        ``command`` is a human-readable label for the failing call, used in the
        error message. A convenience guard for callers that treat any non-zero
        git exit as fatal (unlike the lookup helpers, which interpret specific
        codes themselves).
        """

        if result.returncode != 0:
            raise GitCommandError(
                command=command, returncode=result.returncode, stderr=result.stderr
            )

    async def validate_repository(self) -> None:
        """Verify ``self.cwd`` exists and is inside a git work tree.

        Raises NotAGitRepositoryError if the path is not a directory, or if
        ``git rev-parse --is-inside-work-tree`` does not confirm a work tree
        (covers a non-repo directory and a bare repository alike).
        """

        if not Path(self.cwd).is_dir():
            raise NotAGitRepositoryError(self.cwd)
        result = await self.run_git(["rev-parse", "--is-inside-work-tree"])
        if result.returncode != 0 or result.stdout.strip() != "true":
            raise NotAGitRepositoryError(self.cwd)

    async def resolve_base_ref(self) -> str:
        """Resolve the configured base ref, trying it as-is before falling back to refs/heads/.

        Reads ``self.config.base`` as the user-supplied input and returns the form
        that resolved successfully. Supports remote refs (origin/main), tags,
        commit SHAs, and bare branch names.

        Raises:
            GitCommandError: if the ref starts with ``-`` or resolves in neither form.
        """
        # A leading '-' would be parsed by git as an option, not a ref (option
        # injection). No legitimate ref a user passes starts with '-': branch
        # names forbid it, and refs/tags/SHAs/remote refs never lead with it.
        # A dash mid-string (feature/-x, my-branch) is fine - only the first
        # character is rejected. A pathologically dash-named branch is still
        # reachable via its fully-qualified refs/heads/-name form.
        if self.config.base.startswith("-"):
            raise GitCommandError(
                command=f"rev-parse --verify {self.config.base}",
                returncode=-1,
                stderr=(
                    f"Invalid base ref '{self.config.base}': cannot start with '-'. "
                    "If you have a branch named this way, pass its full ref "
                    "(e.g. refs/heads/...)."
                ),
            )

        result = await self.run_git(["rev-parse", "--verify", self.config.base])
        if result.returncode == 0:
            return self.config.base
        if not self.config.base.startswith("refs/"):
            qualified = f"refs/heads/{self.config.base}"
            result2 = await self.run_git(["rev-parse", "--verify", qualified])
            if result2.returncode == 0:
                return qualified
        raise GitCommandError(
            command=f"rev-parse --verify {self.config.base}",
            returncode=result.returncode,
            stderr=f"Base branch '{self.config.base}' not found in repository.",
        )

    async def scan_commit_changes(self) -> ScanResult:
        """Scan the working tree for changes against HEAD, returning a ScanResult.

        Runs git queries concurrently: staged changes
        (``diff --name-status --staged``) and, unless ``self.config.staged`` is set,
        also unstaged changes (``diff --name-status``) and untracked files
        (``ls-files --others --exclude-standard``). Each changed file becomes a
        ChangedFile with a derived StagingState - a path present in both the staged
        and unstaged sets is STAGED_AND_MODIFIED, staged-only is STAGED,
        unstaged-only is UNSTAGED, and untracked files are UNTRACKED.

        Under ``self.config.staged`` the unstaged and untracked scans are skipped
        (their results would be discarded by the caller's ``--staged`` filter anyway),
        so every staged file is reported as plain STAGED. The STAGED_AND_MODIFIED
        label is not derivable without the unstaged scan, but it is unused in
        ``--staged`` mode - it does not affect the filter, the XML output, or
        ``has_staged``.

        ``has_staged`` reports whether anything was staged, so the caller can give a
        precise message when ``--staged`` finds nothing.

        Raises:
            GitCommandError: if any of the underlying git queries exits non-zero.
        """
        # A bare coroutine, not yet scheduled: it starts when awaited or gathered.
        staged_task = self.run_git(["diff", "--name-status", "-z", "--staged"])

        if self.config.staged:
            staged_res = await staged_task
            self.check_git_result(staged_res, "diff staged")
            staged = _parse_name_status(staged_res.stdout)
            all_files = [
                ChangedFile(path, info["status"], StagingState.STAGED, info.get("old_path"))
                for path, info in staged.items()
            ]
            return ScanResult(files=all_files, has_staged=len(staged) > 0)

        unstaged_task = self.run_git(["diff", "--name-status", "-z"])
        untracked_task = self.run_git(["ls-files", "-z", "--others", "--exclude-standard"])

        staged_res, unstaged_res, untracked_res = await asyncio.gather(
            staged_task, unstaged_task, untracked_task
        )

        self.check_git_result(staged_res, "diff staged")
        self.check_git_result(unstaged_res, "diff unstaged")
        self.check_git_result(untracked_res, "ls-files untracked")

        staged = _parse_name_status(staged_res.stdout)
        unstaged = _parse_name_status(unstaged_res.stdout)
        untracked = [p for p in untracked_res.stdout.split("\0") if p]

        all_files = []
        for path, info in staged.items():
            staging = StagingState.STAGED_AND_MODIFIED if path in unstaged else StagingState.STAGED
            all_files.append(ChangedFile(path, info["status"], staging, info.get("old_path")))
        for path, info in unstaged.items():
            # Paths also present in ``staged`` were emitted above as STAGED_AND_MODIFIED.
            if path not in staged:
                all_files.append(ChangedFile(path, info["status"], StagingState.UNSTAGED))
        for path in untracked:
            all_files.append(ChangedFile(path, FileStatus.UNTRACKED, StagingState.UNTRACKED))

        return ScanResult(files=all_files, has_staged=len(staged) > 0)

    async def scan_pr_changes(self, base_branch: str) -> ScanResult:
        """Scan the branch's changes against ``base_branch``, returning a ScanResult.

        Uses ``diff --name-status <base>...HEAD`` (the merge-base diff), so the
        result reflects what the branch added relative to where it forked from, not
        the raw tip-to-tip diff. These are committed changes, so per-file
        StagingState and ``has_staged`` are benign placeholders, never read in PR
        mode (see the inline note).

        Raises:
            GitCommandError: if the diff exits non-zero.
        """

        diff_result = await self.run_git(["diff", "--name-status", "-z", f"{base_branch}...HEAD"])
        self.check_git_result(diff_result, f"diff {base_branch}...HEAD")

        files_dict = _parse_name_status(diff_result.stdout)
        # Staging state is meaningless in PR mode (these are committed branch changes,
        # not working-tree/index states). Both the per-file StagingState.STAGED and
        # has_staged=True below are benign placeholders - the commit-mode code paths
        # that read these fields (staging filter, --staged guard) never run in PR
        # mode - not real claims about staging.
        all_files = [
            ChangedFile(path, info["status"], StagingState.STAGED, info.get("old_path"))
            for path, info in files_dict.items()
        ]
        return ScanResult(files=all_files, has_staged=True)

    async def get_single_diff(
        self,
        file: ChangedFile,
        base_ref: str,
        untracked: bool = False,
    ) -> Tuple[str, str]:
        """Fetch the diff for one changed file and return ``(path, diff_text)``.

        Args:
            file: The file whose diff is to be fetched.
            base_ref: Base ref for the diff (e.g. ``"HEAD"`` or ``"main...HEAD"``).
            untracked: If True, synthesize an add-diff for a not-yet-tracked file
                via ``git diff --no-index -- /dev/null <path>``. ``--no-index``
                exits 1 when the files differ (the normal case), so rc 0 and 1
                are both accepted.

        The staged/unstaged distinction is read from ``self.config.staged``.

        Git commands are called with ``--no-ext-diff`` and ``--no-textconv`` to
        neutralize repo-configured external diff and textconv drivers on
        untrusted repos (``--no-index`` honors textconv too, so the untracked
        path needs the guard as well).

        Returns:
            ``(file.path, text)``. ``text`` is the stripped diff on success, an
            empty string when git exits with an unexpected code (logged at debug
            level), or the raw over-limit prefix when the diff was truncated.
        """
        if untracked:
            # Untracked files are invisible to plain `git diff`; synthesize an
            # add-diff against an empty input. "/dev/null" gives git's canonical
            # creation header identically across platforms.
            diff_args = [
                "diff",
                "--no-ext-diff",
                "--no-textconv",
                "--no-index",
                "--",
                "/dev/null",
                _normalize_git_path(file.path),
            ]
            rc, out, truncated = await self._run_git_capped(diff_args)
            if truncated:
                return file.path, out  # over-limit; the engine drops it
            if rc not in (0, 1):
                logger.debug("no-index diff for %s exited %d", file.path, rc)
                return file.path, ""
            return file.path, out.strip()

        # For a rename/copy both sides are passed so -M can pair them up.
        paths = [file.path] if not file.old_path else [file.path, file.old_path]
        diff_args = ["diff", "--no-ext-diff", "--no-textconv", "-M"]
        if self.config.staged:
            diff_args.append("--cached")
        diff_args += [base_ref, "--"] + paths
        rc, out, truncated = await self._run_git_capped(diff_args)
        if truncated:
            return file.path, out  # over-limit; the engine drops it
        if rc != 0:
            logger.debug("diff for %s exited %d", file.path, rc)
            return file.path, ""
        return file.path, out.strip()

    async def get_current_branch(self) -> str:
        """Return the current branch name, or ``"HEAD"`` if git cannot report one."""
        result = await self.run_git(["rev-parse", "--abbrev-ref", "HEAD"])
        return result.stdout.strip() if result.returncode == 0 else "HEAD"

    async def get_pr_commits(self, base_branch: str) -> List[PRCommit]:
        """Fetches structured commit history for a PR.

        Records are NUL-separated via ``git log -z`` (NUL cannot appear in commit
        content, making record splitting unambiguous). Fields within a record are
        separated by %x1e (ASCII Record Separator). A literal RS byte in a field
        (author/subject) would misalign that one record - accepted as negligible,
        since RS does not occur in organic commit metadata.

        Returns an empty list when ``git log`` fails or prints nothing; records
        that do not split into exactly five fields are skipped.
        """
        # %h short hash, %an author, %aI ISO-8601 date, %s subject, %b body
        format_str = "%h%x1e%an%x1e%aI%x1e%s%x1e%b"
        args = ["log", "-z", f"{base_branch}..HEAD", f"--format={format_str}"]

        result = await self.run_git(args)
        commits: List[PRCommit] = []

        if result.returncode == 0 and result.stdout:
            for raw_commit in result.stdout.split("\x00"):
                if not raw_commit.strip():
                    continue
                # maxsplit=4 keeps any further RS bytes inside the body field.
                parts = raw_commit.split("\x1e", 4)
                if len(parts) == 5:
                    commits.append(
                        {
                            "hash": parts[0].strip(),
                            "author": parts[1].strip(),
                            "date": parts[2].strip(),
                            "subject": parts[3].strip(),
                            "body": parts[4].strip(),
                        }
                    )

        return commits

    async def has_uncommitted_changes(self) -> bool:
        """Return True if ``git status --porcelain`` prints anything.

        The exit code is not inspected; only stdout decides the answer.
        """
        result = await self.run_git(["status", "--porcelain"])
        return bool(result.stdout.strip())

    async def get_blob_size_at_index(self, path: str) -> int:
        """Return the byte size of a staged blob in the index, or -1 if unavailable.

        Reads the index entry via ``git cat-file -s :<path>``. Lets the caller
        enforce the size limit *before* fetching the blob into memory, so an
        oversized staged file is never fully buffered. Returns -1 if the path is not
        in the index or the size can't be parsed.
        """
        posix_path = _normalize_git_path(path)
        result = await self.run_git(["cat-file", "-s", f":{posix_path}"])
        if result.returncode != 0:
            return -1
        try:
            return int(result.stdout.strip())
        except ValueError:
            return -1

    async def _ls_tree_meta(
        self, ref: str, paths: Optional[List[str]] = None
    ) -> Dict[str, Tuple[int, bool]]:
        """Run one ``ls-tree -r -l -z`` (optionally scoped to ``paths``) and parse it.

        ``paths`` are assumed already POSIX-normalized and small enough for one
        command line (chunking is the caller's job). The ``-l`` record format is::

            <mode> SP <type> SP <object> SP <size> TAB <path>

        so we partition once on TAB to isolate the path (which may contain spaces),
        then whitespace-split the metadata half. Size is -1 for entries git reports
        without one (the '-' placeholder for non-blobs). Empty map on non-zero exit.
        """
        args = ["ls-tree", "-r", "-l", "-z", ref]
        if paths:
            args += ["--", *paths]

        result = await self.run_git(args)
        if result.returncode != 0:
            logger.debug(
                "ls-tree for %s exited %d: %s", ref, result.returncode, result.stderr.strip()
            )
            return {}

        meta: Dict[str, Tuple[int, bool]] = {}
        for record in result.stdout.split("\0"):
            if not record:
                continue
            head, tab, path = record.partition("\t")
            if not tab:
                continue  # malformed record: no TAB separating metadata from path
            fields = head.split()
            if len(fields) < 4:
                continue
            mode, size_str = fields[0], fields[3]
            try:
                size = int(size_str)
            except ValueError:
                size = -1  # '-' placeholder for non-blob entries
            # Mode 120000 is how git records a symbolic link.
            meta[path] = (size, mode == "120000")
        return meta

    async def get_tree_metadata(
        self, ref: str, paths: Optional[List[str]] = None
    ) -> Dict[str, Tuple[int, bool]]:
        """Return ``{path: (size, is_symlink)}`` for blobs in ``ref``'s tree.

        With ``paths``, the listing is always scoped to them, chunked only so the
        argument list never exceeds the platform command-line limit (see
        LS_TREE_PATH_BUDGET). Batches run concurrently; because they partition the
        paths into disjoint sets, merging their maps is order-independent. There is
        no full-tree performance fallback: scoping fetches P entries, full-tree
        fetches every entry (N >= P) to save only spawns -- a bad trade in exactly
        the large repos where scoping matters most. ``paths=None`` lists the whole
        tree, for callers that explicitly want it. An empty ``paths`` list means
        there is nothing to look up and yields an empty map without spawning git.

        Callers only ever look up changed paths and never iterate the map, so a
        scoped map behaves identically to the full tree. Empty map if ``ref`` can't
        be listed; callers treat a missing path as size -1.
        """
        if paths is None:
            return await self._ls_tree_meta(ref)
        if not paths:
            # Zero paths requested: an unscoped ls-tree here would walk the whole
            # tree only to produce entries nobody asked for.
            return {}

        sem = asyncio.Semaphore(self.config.diff_semaphore_limit)

        async def _scoped(batch: List[str]) -> Dict[str, Tuple[int, bool]]:
            async with sem:
                return await self._ls_tree_meta(ref, batch)

        coros = [_scoped(batch) for batch in _chunk_paths(paths, LS_TREE_PATH_BUDGET)]
        meta: Dict[str, Tuple[int, bool]] = {}
        for partial in await asyncio.gather(*coros):  # concurrent; disjoint, so order is moot
            meta.update(partial)
        return meta

    async def get_blob_at_head(self, path: str) -> Optional[bytes]:
        """Fetches the raw bytes of a file at HEAD. Returns None if not found, b'' if empty."""
        posix_path = _normalize_git_path(path)
        result = await self.run_git(["show", f"HEAD:{posix_path}"], binary=True)
        return result.stdout if result.returncode == 0 else None

    async def get_blob_at_index(self, path: str) -> Optional[bytes]:
        """Fetches the raw bytes of a file from the Git index. Returns None if not found, b'' if empty."""
        posix_path = _normalize_git_path(path)
        result = await self.run_git(["show", f":{posix_path}"], binary=True)
        return result.stdout if result.returncode == 0 else None

    async def is_symlink_at_index(self, path: str) -> bool:
        """Returns True iff the path is stored as a symlink (mode 120000) in the Git index.

        False when ``git ls-files -s`` fails or prints nothing for the path.
        """
        posix_path = _normalize_git_path(path)
        result = await self.run_git(["ls-files", "-s", "--", posix_path])
        if result.returncode != 0:
            return False
        # The first whitespace-separated field of ``ls-files -s`` output is the mode.
        fields = result.stdout.split()
        return bool(fields) and fields[0] == "120000"

    async def get_index_metadata(
        self, paths: Optional[List[str]] = None
    ) -> Dict[str, Tuple[int, bool]]:
        """Return ``{path: (size, is_symlink)}`` for blobs in the git index.

        The index analogue of ``get_tree_metadata``: ``git write-tree`` writes the
        current index to a tree object (read-only w.r.t. index/working tree/HEAD --
        it only adds loose objects, reclaimed later by ``git gc``), which is then
        listed via ``get_tree_metadata``, inheriting its scoping/chunking. Replaces
        the per-file ``cat-file -s`` + ``ls-files -s`` probe that --staged rendering
        would otherwise issue per file.

        Empty map if the index can't be written as a tree (e.g. an unmerged index);
        callers treat a missing path as size -1 and fall back to per-file probes.
        An empty ``paths`` list returns an empty map without touching the repo.
        """
        if paths is not None and not paths:
            # Nothing to look up: skip write-tree so no loose object is created.
            return {}
        result = await self.run_git(["write-tree"])
        if result.returncode != 0:
            logger.debug("write-tree failed (%d): %s", result.returncode, result.stderr.strip())
            return {}
        tree_sha = result.stdout.strip()
        if not tree_sha:
            return {}
        return await self.get_tree_metadata(tree_sha, paths)

    async def new_file_size(self, f: ChangedFile, tree_meta: Dict[str, Tuple[int, bool]]) -> int:
        """Return the byte size of a new file from the same source the render path uses.

        Reads size from the working tree (stat), the git index (cat-file on
        ``:<path>``), or - in PR mode - the batched HEAD tree map, mirroring
        exactly what ``_render_file_xml`` does. This keeps the diff-side guard
        and the content-side guard in agreement so a large new file is never
        fully read as a diff under --no-content when it would be omitted in
        normal mode.

        Returns -1 when the size cannot be determined.
        """
        mode = self.config.command
        staged_only = self.config.staged
        if f.status == FileStatus.UNTRACKED or (mode == "commit" and not staged_only):
            try:
                return (Path(self.cwd) / f.path).stat().st_size
            except OSError:
                return -1
        if mode == "pr":
            return tree_meta.get(f.path, (-1, False))[0]
        size = tree_meta.get(f.path, (-1, False))[0]  # commit + --staged
        if size < 0:
            # Not in the batched map: fall back to a per-file index probe.
            size = await self.get_blob_size_at_index(f.path)
        return size