git2xml 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- git2xml/__init__.py +43 -0
- git2xml/__main__.py +8 -0
- git2xml/api.py +95 -0
- git2xml/cli.py +158 -0
- git2xml/constants.py +31 -0
- git2xml/core.py +633 -0
- git2xml/git_scanner.py +708 -0
- git2xml/models.py +273 -0
- git2xml/py.typed +0 -0
- git2xml/utils.py +251 -0
- git2xml-0.1.0.dist-info/METADATA +349 -0
- git2xml-0.1.0.dist-info/RECORD +16 -0
- git2xml-0.1.0.dist-info/WHEEL +5 -0
- git2xml-0.1.0.dist-info/entry_points.txt +2 -0
- git2xml-0.1.0.dist-info/licenses/LICENSE +21 -0
- git2xml-0.1.0.dist-info/top_level.txt +1 -0
git2xml/core.py
ADDED
|
@@ -0,0 +1,633 @@
|
|
|
1
|
+
"""Brief-assembly engine: scan -> fetch diffs -> render -> XML.
|
|
2
|
+
|
|
3
|
+
The orchestration layer. ``build_brief`` drives a GitScanner through the
|
|
4
|
+
phases (validate, scan, fetch diffs concurrently, render each file, emit XML)
|
|
5
|
+
and returns the brief as a string; ``save_brief`` wraps it with an atomic
|
|
6
|
+
write to disk. Concurrency policy (the diff/render semaphores) and
|
|
7
|
+
diff-eligibility rules live here, not in the scanner. The per-file render
|
|
8
|
+
helpers resolve a file's content source (working tree, index, or HEAD blob)
|
|
9
|
+
and delegate formatting to ``utils``.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
import tempfile
|
|
16
|
+
import time
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Dict, Optional, Tuple
|
|
19
|
+
|
|
20
|
+
from .git_scanner import GitScanner
|
|
21
|
+
from .models import (
|
|
22
|
+
NO_DIFF,
|
|
23
|
+
ChangedFile,
|
|
24
|
+
DiffOmission,
|
|
25
|
+
DiffResult,
|
|
26
|
+
FileStatus,
|
|
27
|
+
Git2xmlCliConfig,
|
|
28
|
+
Git2xmlConfig,
|
|
29
|
+
Git2xmlError,
|
|
30
|
+
StagingState,
|
|
31
|
+
)
|
|
32
|
+
from .utils import (
|
|
33
|
+
build_commit_log_xml,
|
|
34
|
+
decode_bytes_bom_aware,
|
|
35
|
+
diff_exceeds_limit,
|
|
36
|
+
escape_xml_attr,
|
|
37
|
+
format_file_xml,
|
|
38
|
+
is_binary_bytes,
|
|
39
|
+
is_binary_file,
|
|
40
|
+
read_text_bom_aware,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
logger = logging.getLogger(__name__)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _render_worktree_file_sync(
    cwd_path: Path,
    file_obj: ChangedFile,
    status_label: str,
    diff: DiffResult,
    no_content: bool,
    max_size: int,
    strict_xml: bool,
) -> str:
    """Build the ``<file>`` XML for one path read from the working tree.

    Blocking helper: it probes and reads the filesystem, so the async caller
    hands it to a worker thread. The checks run in a fixed order and the first
    one that matches decides the output: symlink, missing path, binary
    content, size above ``max_size``, then plain text content.

    Args:
        cwd_path: Repository root that ``file_obj.path`` is joined onto.
        file_obj: The changed file being rendered.
        status_label: Status text placed on the ``<file>`` element.
        diff: Diff result attached to the element where a diff is rendered.
        no_content: When true, the element is emitted without file content.
        max_size: Byte threshold above which content is omitted.
        strict_xml: Forwarded to ``format_file_xml``.

    Returns:
        The formatted ``<file>`` element.
    """
    rel_path = file_obj.path
    on_disk = cwd_path / rel_path

    def emit(body: Optional[str], **attrs) -> str:
        # Every outcome shares the same path, indent and strictness settings;
        # only the body and the per-case attributes vary.
        return format_file_xml(rel_path, body, indent=" ", strict_xml=strict_xml, **attrs)

    # Symlinks are described by their target instead of being followed.
    if on_disk.is_symlink():
        link_note = f"Symlink pointing to: {on_disk.readlink()}"
        return emit(None if no_content else link_note, status=status_label, diff=diff)

    # Nothing on disk to read: report the file as omitted.
    if not on_disk.exists():
        return emit(None, status="omitted", reason="deleted or unavailable")

    # Binary files are omitted whole, so no diff is attached here.
    if is_binary_file(on_disk):
        return emit(None, status=status_label, reason="omitted - binary file detected")

    # Oversized text keeps its diff but drops the content.
    if on_disk.stat().st_size > max_size:
        return emit(
            None,
            status=status_label,
            reason=f"omitted - file exceeds {max_size} bytes",
            diff=diff,
        )

    text = read_text_bom_aware(on_disk)
    return emit(None if no_content else text, status=status_label, diff=diff)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _render_blob_sync(
    path: str,
    blob: bytes,
    is_symlink: bool,
    status_label: str,
    diff: DiffResult,
    no_content: bool,
    strict_xml: bool,
) -> str:
    """Turn already-fetched blob bytes into a formatted ``<file>`` element.

    The async caller fetches the bytes and runs this function through
    ``asyncio.to_thread``; only the binary sniff, the BOM-aware decode and the
    formatting happen here.

    Args:
        path: Repository-relative path of the file.
        blob: Raw blob bytes.
        is_symlink: Whether the blob is a symlink entry; its decoded bytes are
            then reported as the link target.
        status_label: Status text placed on the ``<file>`` element.
        diff: Diff result attached to the element (not for binary files).
        no_content: When true, the element is emitted without content.
        strict_xml: Forwarded to ``format_file_xml``.

    Returns:
        The formatted ``<file>`` element.
    """
    # A symlink blob is never sniffed for binary-ness; a binary regular file
    # is omitted whole, without a diff.
    if not is_symlink and is_binary_bytes(blob):
        return format_file_xml(
            path,
            None,
            status=status_label,
            reason="omitted - binary file detected",
            indent=" ",
            strict_xml=strict_xml,
        )

    decoded = decode_bytes_bom_aware(blob)
    if is_symlink:
        body = f"Symlink pointing to: {decoded}"
    else:
        body = decoded

    return format_file_xml(
        path,
        None if no_content else body,
        status=status_label,
        diff=diff,
        indent=" ",
        strict_xml=strict_xml,
    )
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
async def _render_file_xml(
    scanner: GitScanner,
    cwd_path: Path,
    file_obj: ChangedFile,
    mode: str,
    diff: DiffResult,
    staged_only: bool,
    no_content: bool,
    max_size: int,
    strict_xml: bool,
    sem: asyncio.Semaphore,
    tree_meta: Dict[str, Tuple[int, bool]],
) -> str:
    """Pick the content source for one file and return its ``<file>`` XML.

    Content comes from the HEAD blob in ``"pr"`` mode, from the index when
    ``staged_only`` is set in commit mode, and from the working tree
    otherwise. All work happens while holding ``sem``, which bounds how many
    files are rendered at once.

    Args:
        scanner: Git access object used for blob and index lookups.
        cwd_path: Repository root, used by the working-tree path.
        file_obj: The changed file being rendered.
        mode: ``"pr"`` selects HEAD content; any other value is treated as a
            commit brief.
        diff: Diff result to attach to the element.
        staged_only: In commit mode, read content from the index.
        no_content: When true, emit the element without content.
        max_size: Byte threshold above which content is omitted.
        strict_xml: Forwarded to ``format_file_xml``.
        sem: Semaphore limiting concurrent renders.
        tree_meta: Batched ``path -> (size, is_symlink)`` map; may be empty.

    Returns:
        The formatted ``<file>`` element.
    """
    rel_path = file_obj.path
    status_label = file_obj.status.label

    def emit(**attrs) -> str:
        # Shared shape of every content-less outcome produced in this function.
        return format_file_xml(rel_path, None, indent=" ", strict_xml=strict_xml, **attrs)

    async with sem:
        # A deleted file has no content in any mode; only its diff is shown.
        if file_obj.status == FileStatus.DELETED:
            return emit(status=status_label, diff=diff)

        from_head = mode == "pr"

        if not from_head and not staged_only:
            # Working tree: blocking filesystem work goes to a thread so the
            # event loop stays free for the other files.
            return await asyncio.to_thread(
                _render_worktree_file_sync,
                cwd_path,
                file_obj,
                status_label,
                diff,
                no_content,
                max_size,
                strict_xml,
            )

        # Size and symlink-ness come from the batched map. A path missing from
        # it yields size -1, which is deliberately not "> max_size".
        blob_size, is_symlink = tree_meta.get(rel_path, (-1, False))
        if not from_head and blob_size < 0:
            # --staged only: fall back to per-file index probes when the map
            # has no entry. PR mode instead falls through to the blob fetch.
            blob_size = await scanner.get_blob_size_at_index(rel_path)
            is_symlink = await scanner.is_symlink_at_index(rel_path)

        if blob_size > max_size:
            return emit(
                status=status_label,
                reason=f"omitted - file exceeds {max_size} bytes",
                diff=diff,
            )

        if from_head:
            blob = await scanner.get_blob_at_head(rel_path)
            missing_reason = "deleted or unavailable at HEAD"
        else:
            blob = await scanner.get_blob_at_index(rel_path)
            missing_reason = "deleted or unavailable in index"

        if blob is None:
            return emit(status="omitted", reason=missing_reason)

        # Decoding and formatting a large blob is CPU work; keep it off the loop.
        return await asyncio.to_thread(
            _render_blob_sync,
            rel_path,
            blob,
            is_symlink,
            status_label,
            diff,
            no_content,
            strict_xml,
        )
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def _log_no_changes(mode, staged_only, scan, cwd_path, base_branch) -> None:
    """Log why there is nothing to put in the brief.

    ``scan.has_staged`` is read only for a ``--staged`` commit brief, so the
    PR scan result is never asked for that attribute.
    """
    if mode != "commit":
        logger.info(f"No commits between {base_branch} and HEAD in {cwd_path}.")
    elif staged_only and not scan.has_staged:
        logger.info(
            f"No staged changes in {cwd_path}. "
            "Stage files with 'git add' first, or omit --staged to include "
            "unstaged and untracked changes."
        )
    else:
        logger.info(f"No staged or unstaged changes to commit in {cwd_path}.")


def _render_failure_xml(file_obj: ChangedFile, error: Exception, strict_xml: bool) -> str:
    """Return an ``omitted`` ``<file>`` element describing a failed render."""
    # Collapse all whitespace runs so the reason stays on a single line, and
    # cap it at 200 characters so one noisy error cannot bloat the brief.
    detail = " ".join(str(error).split())
    if len(detail) > 200:
        detail = detail[:197] + "..."
    reason = f"error rendering file: {type(error).__name__}"
    if detail:
        reason += f": {detail}"
    return format_file_xml(
        file_obj.path,
        None,
        status="omitted",
        reason=reason,
        indent=" ",
        strict_xml=strict_xml,
    )


async def build_brief(config: Git2xmlConfig) -> str:
    """Generate an XML brief of a repository's changes and return it as a string.

    Scans the repository for changes, fetches their diffs concurrently, and
    assembles an XML document mapping each changed file to its status, diff,
    and (optionally) full content - formatted for consumption by an LLM.

    Two modes determine what is summarized and where content is read from:

    - ``"commit"``: changes in the working tree relative to HEAD. Content is
      read from the working tree, or from the git index when ``staged_only``
      is set. Includes staged, unstaged, and untracked files by default.
    - ``"pr"``: all changes on the current branch relative to the configuration
      ``base`` field (a ``base...HEAD`` diff). Content is read from the committed
      blob at HEAD, and a structured ``<commit_log>`` of the branch's commits is
      included. Uncommitted changes are not part of a PR brief.

    This function never touches the output file; writing is left to the caller.

    Args:
        config: ``Git2xmlConfig`` configuration object.

    Returns:
        The XML brief as a string, or an empty string when there are no
        changes to summarize (an informational message is logged instead).

    Raises:
        Git2xmlError: If the repository is invalid, git is not installed, the
            base ref cannot be resolved, or a git command fails or times out.
    """
    scanner = GitScanner(config)

    cwd_path = Path(config.repo).resolve()
    mode = config.command
    base_branch = config.base
    verbose = config.verbose
    staged_only = config.staged
    strict_xml = config.strict_xml
    no_untracked = config.no_untracked
    max_size = config.max_size
    max_diff_size = config.max_diff_size
    no_content = config.no_content
    diff_semaphore_limit = config.diff_semaphore_limit
    hide_repo_path = config.hide_repo_path

    logger.info(f"Validating repository at {cwd_path}...")
    await scanner.validate_repository()

    if mode == "commit":
        logger.info("Scanning commit changes...")
        scan = await scanner.scan_commit_changes()
        base_ref = "HEAD"

        # Default to all changed files; narrow only when --staged is requested.
        active_files = scan.files
        if staged_only:
            active_files = [
                f
                for f in active_files
                if f.staging in (StagingState.STAGED, StagingState.STAGED_AND_MODIFIED)
            ]

        # --no-untracked: drop untracked files (no-op under --staged, which
        # already excludes them).
        if no_untracked:
            active_files = [f for f in active_files if f.status != FileStatus.UNTRACKED]
    else:
        base_branch = await scanner.resolve_base_ref()
        logger.info("Scanning pr changes...")

        # Uncommitted work is not part of base...HEAD, so tell the user.
        if await scanner.has_uncommitted_changes():
            logger.warning(
                "You have uncommitted changes! These will NOT be included in the PR brief."
            )

        scan = await scanner.scan_pr_changes(base_branch)
        base_ref = f"{base_branch}...HEAD"
        active_files = scan.files

    if not active_files:
        _log_no_changes(mode, staged_only, scan, cwd_path, base_branch)
        return ""

    # Files that always get a diff fetched.
    #
    # MODIFIED/TYPE_CHANGED include binary files, whose diff the render path then
    # discards (a binary file is omitted with no <diff>). We fetch it anyway:
    # binary-ness isn't known until the bytes are sniffed at render time, and
    # hoisting that sniff ahead of the fetch isn't worth the plumbing. The wasted
    # fetch is bounded to ~max_diff_size by the streaming cap in
    # GitScanner._run_git_capped, so the cost is one spawn + a capped read - not
    # memory, not correctness.
    diff_eligible_statuses = {
        FileStatus.MODIFIED,
        FileStatus.RENAMED,
        FileStatus.COPIED,
        FileStatus.DELETED,
        FileStatus.TYPE_CHANGED,
    }
    # Under --no-content, a new file's diff is the only carrier of its change,
    # so added/untracked files become diff-eligible too. In normal mode their
    # content carries the change and no diff is needed.
    if no_content:
        diff_eligible_statuses |= {FileStatus.ADDED, FileStatus.UNTRACKED}

    diff_eligible = [f for f in active_files if f.status in diff_eligible_statuses]
    sem = asyncio.Semaphore(diff_semaphore_limit)

    # PR rendering reads size + symlink mode from one batched ls-tree call
    # instead of two git spawns per file; --staged does the same against the
    # index. Default commit mode reads from the working tree per file, so the
    # map stays empty there.
    scoped = [f.path for f in active_files]
    tree_meta: Dict[str, Tuple[int, bool]]
    if mode == "pr":
        tree_meta = await scanner.get_tree_metadata("HEAD", scoped)
    elif staged_only:
        tree_meta = await scanner.get_index_metadata(scoped)
    else:
        tree_meta = {}

    async def _fetch_diff(f: ChangedFile) -> Tuple[str, DiffResult]:
        """Fetch one file's diff; any failure becomes a FETCH_ERROR omission."""
        try:
            if f.status in (FileStatus.ADDED, FileStatus.UNTRACKED):
                # Size pre-check stays OUTSIDE the semaphore: it is a metadata
                # lookup, not a diff fetch, and it must run even when the diff
                # ends up being skipped.
                if await scanner.new_file_size(f, tree_meta) > max_size:
                    return f.path, DiffResult(omission=DiffOmission.SIZE_EXCEEDED)
                untracked = f.status == FileStatus.UNTRACKED
            else:
                untracked = False

            # max_diff_size shapes the output: a diff has no known size until
            # git computes it, so the limit is checked after the fetch and an
            # oversized diff is replaced by a DIFF_SIZE_EXCEEDED omission.
            # NOTE(review): the eligibility note above describes the fetch as
            # capped by GitScanner._run_git_capped, while the original comment
            # here said peak memory is not bounded - confirm which one holds.
            async with sem:
                path, diff_text = await scanner.get_single_diff(f, base_ref, untracked=untracked)
                if diff_exceeds_limit(diff_text, max_diff_size):
                    return path, DiffResult(
                        omission=DiffOmission.DIFF_SIZE_EXCEEDED, limit=max_diff_size
                    )
                return path, DiffResult(text=diff_text)
        except Exception as e:
            logger.warning("Failed to read diff for %s: %s", f.path, e)
            return f.path, DiffResult(omission=DiffOmission.FETCH_ERROR)

    diff_tasks = [_fetch_diff(f) for f in diff_eligible]
    total_diffs = len(diff_tasks)
    logger.info(f"Reading diffs for {total_diffs} file(s)...")
    diffs: Dict[str, DiffResult] = {}
    for i, coro in enumerate(asyncio.as_completed(diff_tasks), 1):
        path, result = await coro
        diffs[path] = result
        if verbose:
            logger.info(f" Read diff {i}/{total_diffs}")

    logger.info("Generating XML...")
    tag_name = f"{mode}_brief"
    repo_name = cwd_path.name

    out_lines = []
    # Only --strict-xml is guaranteed well-formed XML 1.0 (CDATA terminators are
    # split, control chars escaped), so only strict mode gets an XML prologue.
    # Default mode prioritizes byte-fidelity of code content and may use the
    # markdown-fenced fallback, which is intentionally not well-formed XML - so
    # it must NOT carry a declaration that promises conformance it doesn't meet.
    if strict_xml:
        out_lines.append('<?xml version="1.0" encoding="UTF-8"?>')

    repo_path = cwd_path.name if hide_repo_path else cwd_path.as_posix()
    out_lines.append(f'<{tag_name} repo="{escape_xml_attr(repo_path, strict_xml=strict_xml)}">')
    out_lines.append(f" <name>{escape_xml_attr(repo_name, strict_xml=strict_xml)}</name>")

    # Inject the structured commit log for PRs.
    if mode == "pr":
        current_branch = await scanner.get_current_branch()
        logger.info("Fetching commit history...")
        commits = await scanner.get_pr_commits(base_branch)

        if commits:
            if verbose:
                for j, c in enumerate(commits, 1):
                    logger.info(f" Read commit {j}/{len(commits)} [{c['hash']}]: {c['subject']}")
            commit_log_xml = build_commit_log_xml(
                commits, current_branch, base_branch, strict_xml=strict_xml
            )
            out_lines.append(commit_log_xml)
        elif verbose:
            logger.info(" No commits found for this PR.")

    content_sem = asyncio.Semaphore(diff_semaphore_limit)
    render_tasks = [
        _render_file_xml(
            scanner,
            cwd_path,
            file_obj,
            mode,
            diffs.get(file_obj.path, NO_DIFF),
            staged_only,
            no_content,
            max_size,
            strict_xml,
            content_sem,
            tree_meta,
        )
        for file_obj in active_files
    ]
    # return_exceptions=True so a single file that raises (e.g. a transient
    # PermissionError or a file lock while reading the working tree) degrades to
    # an omitted <file> element instead of aborting the whole brief. Results are
    # positional, so they zip back to active_files in order.
    rendered = await asyncio.gather(*render_tasks, return_exceptions=True)
    for file_obj, result in zip(active_files, rendered):
        if not isinstance(result, BaseException):
            out_lines.append(result)
            continue
        # Never swallow cancellation / interrupts (KeyboardInterrupt,
        # SystemExit, CancelledError) - those are not Exception subclasses.
        if not isinstance(result, Exception):
            raise result
        logger.warning("Failed to render %s: %s", file_obj.path, result)
        out_lines.append(_render_failure_xml(file_obj, result, strict_xml))

    out_lines.append(f"</{tag_name}>")

    # The brief is returned as text; nothing is written to disk here.
    return "\n".join(out_lines) + "\n"
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def _swap_temp(temp_path: Path, out_path: Path) -> None:
|
|
570
|
+
"""Atomically move ``temp_path`` onto ``out_path``, retrying briefly.
|
|
571
|
+
|
|
572
|
+
``os.replace`` is atomic, but on Windows antivirus and search-indexer hooks
|
|
573
|
+
can hold a transient lock on a just-closed file, making the replace fail with
|
|
574
|
+
a spurious ``PermissionError``. The lock clears in milliseconds, so we retry a
|
|
575
|
+
few times before giving up and re-raising the last error.
|
|
576
|
+
"""
|
|
577
|
+
for attempt in range(3):
|
|
578
|
+
try:
|
|
579
|
+
os.replace(temp_path, out_path)
|
|
580
|
+
return
|
|
581
|
+
except PermissionError:
|
|
582
|
+
if attempt == 2:
|
|
583
|
+
raise
|
|
584
|
+
time.sleep(0.1)
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
async def save_brief(config: Git2xmlCliConfig) -> Optional[Path]:
    """Generate a brief and atomically write it to disk.

    The destination is the configuration ``output`` field resolved against
    the process's current working directory (the directory the command was
    run from), not against the repository being scanned - so pointing the tool
    at a repository elsewhere still writes the brief relative to where you
    are. Missing parent directories of the destination are created.

    The text is first written to a temporary file in the destination directory
    and then moved into place, so a partially written brief never appears at
    the destination path.

    Args:
        config: CLI configuration; this function reads ``output`` and
            ``command`` and passes the object on to ``build_brief``.

    Returns:
        The resolved path of the written brief, or ``None`` when
        ``build_brief`` returned an empty string (nothing is written).

    Raises:
        Git2xmlError: If creating the directory, writing the temporary file,
            or moving it into place fails with an ``OSError``. Errors raised
            by ``build_brief`` propagate unchanged.
    """
    xml_content = await build_brief(config)
    if not xml_content:
        return None

    out_path = (Path.cwd() / config.output).resolve()
    temp_path: Optional[Path] = None

    try:
        # Create any missing parent directories (e.g. --output reports/brief.xml).
        out_path.parent.mkdir(parents=True, exist_ok=True)

        # Write to a collision-free temporary file next to the destination.
        # Note: NamedTemporaryFile creates at 0600; os.replace preserves it, so the
        # brief is owner-readable only - intentional, since it can contain source.
        with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", newline="", dir=out_path.parent, delete=False
        ) as f:
            temp_path = Path(f.name)
            f.write(xml_content)

        # Atomically swap the temporary file into place (retries on transient
        # Windows AV/indexer locks; see _swap_temp).
        _swap_temp(temp_path, out_path)
        # The temp file is now the output; clearing this skips cleanup below.
        temp_path = None
    except OSError as e:
        raise Git2xmlError(f"Could not write brief to {out_path}: {e}") from e
    finally:
        if temp_path is not None:
            # Best-effort removal of a leftover temp file. missing_ok avoids
            # the check-then-delete race; other OS errors are deliberately
            # ignored so they cannot mask the original failure.
            try:
                temp_path.unlink(missing_ok=True)
            except OSError:
                pass

    # out_path is already absolute after resolve(), so it is logged as-is.
    logger.info(f"Successfully generated {config.command} brief: {out_path}")
    return out_path
|