codetool-shell 0.1.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. codetool_shell/__init__.py +11 -0
  2. codetool_shell/api.py +59 -0
  3. codetool_shell/bin/windows-x86_64/codetool-shell-rust.exe +0 -0
  4. codetool_shell/filters/__init__.py +14 -0
  5. codetool_shell/filters/build_compiler/__init__.py +7 -0
  6. codetool_shell/filters/build_compiler/detector.py +412 -0
  7. codetool_shell/filters/build_compiler/reducer.py +166 -0
  8. codetool_shell/filters/build_compiler/summary.py +617 -0
  9. codetool_shell/filters/ci_job_log/__init__.py +7 -0
  10. codetool_shell/filters/ci_job_log/detector.py +64 -0
  11. codetool_shell/filters/ci_job_log/reducer.py +99 -0
  12. codetool_shell/filters/ci_job_log/summary.py +243 -0
  13. codetool_shell/filters/diff/__init__.py +7 -0
  14. codetool_shell/filters/diff/detector.py +136 -0
  15. codetool_shell/filters/diff/reducer.py +308 -0
  16. codetool_shell/filters/generic_log/__init__.py +7 -0
  17. codetool_shell/filters/generic_log/detector.py +175 -0
  18. codetool_shell/filters/generic_log/reducer.py +99 -0
  19. codetool_shell/filters/generic_log/summary.py +161 -0
  20. codetool_shell/filters/git.py +514 -0
  21. codetool_shell/filters/html_cleanup/__init__.py +7 -0
  22. codetool_shell/filters/html_cleanup/detector.py +136 -0
  23. codetool_shell/filters/html_cleanup/reducer.py +27 -0
  24. codetool_shell/filters/html_cleanup/summary.py +422 -0
  25. codetool_shell/filters/json_payload/__init__.py +7 -0
  26. codetool_shell/filters/json_payload/detector.py +62 -0
  27. codetool_shell/filters/json_payload/reducer.py +81 -0
  28. codetool_shell/filters/json_payload/summary.py +233 -0
  29. codetool_shell/filters/listing/__init__.py +7 -0
  30. codetool_shell/filters/listing/detector.py +294 -0
  31. codetool_shell/filters/listing/reducer.py +30 -0
  32. codetool_shell/filters/log_template/__init__.py +7 -0
  33. codetool_shell/filters/log_template/constants.py +76 -0
  34. codetool_shell/filters/log_template/detector.py +331 -0
  35. codetool_shell/filters/log_template/reducer.py +78 -0
  36. codetool_shell/filters/log_template/template.py +280 -0
  37. codetool_shell/filters/log_template/types.py +21 -0
  38. codetool_shell/filters/opaque_payload/__init__.py +7 -0
  39. codetool_shell/filters/opaque_payload/detector.py +563 -0
  40. codetool_shell/filters/opaque_payload/reducer.py +142 -0
  41. codetool_shell/filters/opaque_payload/summary.py +61 -0
  42. codetool_shell/filters/package_manager/__init__.py +7 -0
  43. codetool_shell/filters/package_manager/detector.py +220 -0
  44. codetool_shell/filters/package_manager/reducer.py +110 -0
  45. codetool_shell/filters/package_manager/summary.py +172 -0
  46. codetool_shell/filters/pipeline.py +65 -0
  47. codetool_shell/filters/rg.py +250 -0
  48. codetool_shell/filters/system_output/__init__.py +7 -0
  49. codetool_shell/filters/system_output/detector.py +600 -0
  50. codetool_shell/filters/system_output/reducer.py +331 -0
  51. codetool_shell/filters/system_output/summary.py +164 -0
  52. codetool_shell/filters/table/__init__.py +7 -0
  53. codetool_shell/filters/table/detector.py +244 -0
  54. codetool_shell/filters/table/reducer.py +57 -0
  55. codetool_shell/filters/table/summary.py +37 -0
  56. codetool_shell/filters/test_runner/__init__.py +7 -0
  57. codetool_shell/filters/test_runner/ansi.py +80 -0
  58. codetool_shell/filters/test_runner/detector.py +409 -0
  59. codetool_shell/filters/test_runner/reducer.py +288 -0
  60. codetool_shell/filters/test_runner/summary.py +449 -0
  61. codetool_shell/filters/text.py +38 -0
  62. codetool_shell/filters/traceback/__init__.py +7 -0
  63. codetool_shell/filters/traceback/detector.py +209 -0
  64. codetool_shell/filters/traceback/reducer.py +141 -0
  65. codetool_shell/filters/traceback/summary.py +122 -0
  66. codetool_shell/filters/tree.py +59 -0
  67. codetool_shell/py.typed +0 -0
  68. codetool_shell/python_backend.py +38 -0
  69. codetool_shell/rust_backend.py +254 -0
  70. codetool_shell-0.1.1.dist-info/METADATA +152 -0
  71. codetool_shell-0.1.1.dist-info/RECORD +72 -0
  72. codetool_shell-0.1.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,161 @@
1
+ """Line parsing and classification for generic application logs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+
8
+
9
@dataclass(frozen=True)
class ParsedLogLine:
    """Immutable record for a log line whose level prefix was recognized."""

    # Normalized severity name (upper-case; "WARNING" is folded into "WARN").
    level: str
    # Text following the level prefix, with surrounding whitespace removed.
    message: str
    # True when a date/time stamp preceded the level on the line.
    had_timestamp: bool
16
+
17
+
18
# Levels that ``is_important_line`` does not treat as important on their own.
LOW_SEVERITY_LEVELS = frozenset(("INFO", "DEBUG", "TRACE"))

# Every level name ``normalize_level`` accepts (after upper-casing).
KNOWN_LEVELS = frozenset(
    (
        "TRACE",
        "DEBUG",
        "INFO",
        "NOTICE",
        "WARN",
        "WARNING",
        "ERROR",
        "FATAL",
        "PANIC",
        "CRITICAL",
        "ALERT",
        "EMERG",
    )
)
35
+
36
# Leading timestamp, optionally wrapped in square brackets:
#   YYYY-MM-DD, then "T" or a space, then HH:MM:SS,
#   optional fractional seconds introduced by "." or ",",
#   optional zone written as "Z" or a +/-HH:MM / +/-HHMM offset.
# ``rest`` captures everything after the whitespace that follows the stamp.
_TIMESTAMP_PREFIX_RE = re.compile(
    r"^\s*\[?(?P<timestamp>\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}"
    r"(?:[.,]\d+)?(?:Z|[+-]\d{2}:?\d{2})?)\]?\s+(?P<rest>.+)$"
)
40
# Level prefix expected directly after a timestamp.  Three spellings are
# recognized: "[LEVEL]", "level=LEVEL" and a bare "LEVEL" word, each optionally
# followed by one of ":", "]" or "-" before the message.
#
# The ``level=`` alternative has to be tried BEFORE the bare-word alternative.
# Regex alternation is ordered: with the bare word first, the input
# "level=info ..." matched the word "level" as ``plain`` (the rest of the
# pattern accepts anything), so the ``kv`` branch was unreachable and such
# lines were discarded because "LEVEL" is not a known level name.
_LEVEL_WITH_TIMESTAMP_RE = re.compile(
    r"^\s*(?:\[(?P<bracket>[A-Za-z]{3,8})\]|"
    r"level=(?P<kv>[A-Za-z]{3,8})\b|"
    r"(?P<plain>[A-Za-z]{3,8})\b)\s*[:\]-]?\s*(?P<message>.*)$",
    re.IGNORECASE,
)
46
# Level prefix for lines WITHOUT a timestamp.  Stricter than the timestamped
# form: a bare word only counts when it is followed by ":" or "-".
_STANDALONE_LEVEL_RE = re.compile(
    r"^\s*(?:\[(?P<bracket>[A-Za-z]{3,8})\]\s*|"
    r"(?P<plain>[A-Za-z]{3,8})\s*[:-]\s*|"
    r"level=(?P<kv>[A-Za-z]{3,8})\b\s*)"
    r"(?P<message>.*)$",
    re.I,
)

# Integer or decimal literal; replaced by a placeholder when building patterns.
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")
# Any run of whitespace; collapsed to a single space when building patterns.
_WHITESPACE_RE = re.compile(r"\s+")

# Phrases that mark a line as exception/crash related (case-insensitive).
_EXCEPTION_RE = re.compile(
    r"\b(?:exception|traceback|stack trace|caused by|panic|segmentation fault)\b",
    re.I,
)
# Phrases that mark a line as reporting a failure (case-insensitive).
_FAILURE_RE = re.compile(
    r"\b(?:failed|failure|exit code|exited with|status: failed|fatal error)\b",
    re.I,
)

# scheme://something
_URL_RE = re.compile(r"\b[A-Za-z][A-Za-z0-9+.-]*://\S+")

# Something that looks like a file reference: a path-like head (relative,
# absolute, home-relative, drive-letter or "dir/") followed by either one of
# the listed file extensions or a ":<line number>" suffix.
_FILE_PATH_RE = re.compile(
    r"(^|[\s=:\"'(\[])"
    r"(?:\.{0,2}/|/|~/|[A-Za-z]:[\\/]|[A-Za-z0-9_.@+-]+/)"
    r"[A-Za-z0-9_.@+/\-\\]+"
    r"(?:\.(?:py|rs|ts|tsx|js|jsx|json|ya?ml|toml|log|txt|html|xml|zip|png|jpg|jpeg|csv|md)"
    r"|:\d+)"
)
71
+
72
+
73
def parse_log_line(line: str) -> ParsedLogLine | None:
    """Recognize the timestamp/level prefix of one application log line.

    Returns ``None`` for blank lines and for lines whose prefix does not yield
    a known level.  A line that starts with a timestamp may carry a bare level
    word; a line without one is matched with the stricter standalone pattern.
    """

    text = line.strip()
    if not text:
        return None

    stamped = _TIMESTAMP_PREFIX_RE.match(text)
    has_stamp = stamped is not None
    # With a timestamp, only the text after it is inspected for a level.
    remainder = stamped.group("rest") if has_stamp else text
    return _parse_level_prefix(
        remainder,
        had_timestamp=has_stamp,
        allow_bare_level=has_stamp,
    )
93
+
94
+
95
def is_low_severity(parsed: ParsedLogLine) -> bool:
    """Return True when the parsed line's level is in ``LOW_SEVERITY_LEVELS``."""

    level = parsed.level
    return level in LOW_SEVERITY_LEVELS
97
+
98
+
99
def is_important_line(line: str, parsed: ParsedLogLine | None) -> bool:
    """Decide whether *line* has to be kept when reducing a generic log.

    A line is important when it parsed to a level outside
    ``LOW_SEVERITY_LEVELS``, or when its raw text mentions an exception, a
    failure, a URL or something that looks like a file path.
    """

    if parsed is not None and parsed.level not in LOW_SEVERITY_LEVELS:
        return True

    # Checked in this order; the first pattern that hits settles the answer.
    signals = (_EXCEPTION_RE, _FAILURE_RE, _URL_RE, _FILE_PATH_RE)
    return any(pattern.search(line) is not None for pattern in signals)
110
+
111
+
112
def normalize_selected_line(line: str) -> str:
    """Return *line* with leading and trailing whitespace removed."""

    trimmed = line.strip()
    return trimmed
114
+
115
+
116
def normalized_message_pattern(parsed: ParsedLogLine) -> str:
    """Build a compact grouping key for repeated low-severity messages.

    Numbers become ``<num>``, whitespace runs collapse to one space, an empty
    result becomes ``<empty message>``, and keys longer than 140 characters
    are cut to 137 characters plus ``...``.
    """

    without_numbers = _NUMBER_RE.sub("<num>", parsed.message.strip())
    key = _WHITESPACE_RE.sub(" ", without_numbers).strip()
    if not key:
        key = "<empty message>"
    return key if len(key) <= 140 else f"{key[:137]}..."
126
+
127
+
128
def normalize_level(value: str | None) -> str | None:
    """Map a raw level token to its canonical upper-case name.

    ``None`` stays ``None``; "WARNING" (any case) becomes "WARN"; any token
    that is not in ``KNOWN_LEVELS`` after upper-casing yields ``None``.
    """

    if value is None:
        return None

    upper = value.upper()
    if upper == "WARNING":
        return "WARN"
    return upper if upper in KNOWN_LEVELS else None
137
+
138
+
139
def _parse_level_prefix(
    text: str,
    *,
    had_timestamp: bool,
    allow_bare_level: bool,
) -> ParsedLogLine | None:
    """Extract level and message from the start of *text*.

    ``allow_bare_level`` selects the pattern: the timestamped form when true,
    the stricter standalone form otherwise.  Returns ``None`` when the pattern
    does not match or the captured token is not a known level.
    """

    pattern = _STANDALONE_LEVEL_RE
    if allow_bare_level:
        pattern = _LEVEL_WITH_TIMESTAMP_RE

    found = pattern.match(text)
    if found is None:
        return None

    # Exactly one of the three named groups carries the level token.
    raw_level = found.group("bracket") or found.group("plain") or found.group("kv")
    level = normalize_level(raw_level)
    if level is None:
        return None

    return ParsedLogLine(
        level=level,
        message=found.group("message").strip(),
        had_timestamp=had_timestamp,
    )
@@ -0,0 +1,514 @@
1
+ """Compression for common git command output."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass
7
+
8
+ from .text import (
9
+ choose_smaller_lines,
10
+ indent_lines,
11
+ join_preserving_final_newline,
12
+ score,
13
+ split_preserving_final_newline,
14
+ )
15
+ from .tree import TreeRow, format_tree
16
+
17
+
18
def compress_git_output(text: str) -> str:
    """Run each git-specific compression pass over *text* in a fixed order.

    A pass's result replaces the running text only when ``score`` rates it
    strictly lower than the text it was derived from.
    """

    passes = (
        _compress_git_diff_blocks,
        _compress_git_stat_blocks,
        _compress_git_name_status_blocks,
        _compress_git_long_status_blocks,
        _compress_git_status_blocks,
    )
    best = text
    for apply_pass in passes:
        attempt = apply_pass(best)
        best = attempt if score(attempt) < score(best) else best
    return best
33
+
34
+
35
@dataclass(frozen=True)
class _GitStatusEntry:
    """One entry of short-format ``git status`` output."""

    # Two-character status code with surrounding spaces stripped.
    code: str
    # Everything after the code and its separating space.
    path: str


# Characters allowed in either column of the two-character status code.
_GIT_STATUS_CHARS = frozenset(" MADRCTU!?")


def _parse_git_status_line(line: str) -> _GitStatusEntry | None:
    """Parse ``XY path`` as printed by short-format git status.

    Returns ``None`` unless the line is at least four characters long, has a
    space in the third column, and carries a non-blank code made only of
    ``_GIT_STATUS_CHARS``.
    """

    if len(line) < 4 or line[2] != " ":
        return None

    status, path = line[:2], line[3:]
    if status != "??":
        blank_code = status == "  "
        if blank_code or not _GIT_STATUS_CHARS.issuperset(status):
            return None
    if not path:
        return None
    return _GitStatusEntry(status.strip() or status, path)


def _is_git_status_block_line(line: str) -> bool:
    """Return True for a ``## `` branch header or a parseable status entry."""

    if line.startswith("## "):
        return True
    return _parse_git_status_line(line) is not None
61
+
62
+
63
def _compress_git_status_blocks(text: str) -> str:
    """Compress every run of consecutive short-status lines found in *text*.

    Lines outside such runs are passed through unchanged and keep their
    position relative to the compressed runs.
    """

    lines, final_newline = split_preserving_final_newline(text)
    result: list[str] = []
    pending: list[str] = []  # status lines of the run currently being read

    def flush_run() -> None:
        # ``extend`` copies the returned items before the buffer is cleared.
        if pending:
            result.extend(_compress_git_status_block(pending))
            pending.clear()

    for line in lines:
        if _is_git_status_block_line(line):
            pending.append(line)
        else:
            flush_run()
            result.append(line)
    flush_run()

    return join_preserving_final_newline(result, final_newline)
81
+
82
+
83
def _compress_git_status_block(block: list[str]) -> list[str]:
    """Rewrite one run of short-status lines as a per-code summary plus trees.

    ``## `` header lines are kept first, followed by a ``git status: CODE=N``
    summary and, for each code in first-seen order, a ``CODE:`` heading with
    the formatted paths.  The block is returned untouched when it holds no
    status entries; otherwise ``choose_smaller_lines`` picks between the
    original and the rewritten form.
    """

    headers = [line for line in block if line.startswith("## ")]

    # dict preserves insertion order, so codes stay in first-seen order.
    rows_by_code: dict[str, list[TreeRow]] = {}
    for line in block:
        entry = _parse_git_status_line(line)
        if entry is not None:
            rows_by_code.setdefault(entry.code, []).append(TreeRow(entry.path))
    if not rows_by_code:
        return block

    counts = " ".join(f"{code}={len(rows)}" for code, rows in rows_by_code.items())
    compact: list[str] = [*headers, "git status: " + counts]
    for code, rows in rows_by_code.items():
        compact.append(f"{code}:")
        compact.extend(indent_lines(format_tree(rows)))

    return choose_smaller_lines(block, compact)
111
+
112
+
113
@dataclass(frozen=True)
class _GitNameStatusEntry:
    """One row of tab-separated ``--name-status`` style output."""

    # Status code as printed, e.g. "M" or "R100".
    code: str
    # The path, or "old -> new" for rename/copy rows.
    path: str


# Single-letter codes, or R/C optionally followed by up to three digits.
_GIT_NAME_STATUS_CODE_RE = re.compile(r"^(?:[MADTUXB]|R\d{0,3}|C\d{0,3})$")
# Heuristic for "this token is a path": contains a separator, is a dotfile,
# ends in a short extension, or is one of a few well-known bare file names.
_GIT_PATHLIKE_RE = re.compile(
    r"[/\\]|^\.\.?/|(?:^|/)\.[^/]+|\.[A-Za-z0-9]{1,12}$|"
    r"^(?:README|Makefile|Dockerfile|Cargo\.lock|uv\.lock)$"
)


def _looks_git_path_like(value: str) -> bool:
    """Return True when *value* plausibly names a file path.

    Surrounding whitespace and double quotes are ignored.  Empty values,
    values longer than 300 characters and anything containing ``://`` are
    rejected before the path heuristic is consulted.
    """

    candidate = value.strip().strip('"')
    if not candidate:
        return False
    if len(candidate) > 300 or "://" in candidate:
        return False
    return _GIT_PATHLIKE_RE.search(candidate) is not None


def _parse_git_name_status_line(line: str) -> _GitNameStatusEntry | None:
    """Parse ``CODE<TAB>path`` or ``Rnnn/Cnnn<TAB>old<TAB>new``.

    Returns ``None`` when the code is unknown, the number of tab-separated
    paths does not fit the code, or any path fails the path heuristic.
    """

    code, *paths = line.split("\t")
    if not paths or _GIT_NAME_STATUS_CODE_RE.match(code) is None:
        return None

    # Rename/copy rows carry two paths; every other code carries exactly one.
    expected = 2 if code.startswith(("R", "C")) else 1
    if len(paths) != expected or not all(map(_looks_git_path_like, paths)):
        return None
    return _GitNameStatusEntry(code, " -> ".join(paths))
149
+
150
+
151
def _compress_git_name_status_blocks(text: str) -> str:
    """Compress runs of three or more consecutive name-status rows in *text*.

    Shorter runs and all other lines are copied through unchanged.
    """

    lines, final_newline = split_preserving_final_newline(text)
    total = len(lines)
    result: list[str] = []
    start = 0

    while start < total:
        # Extend ``stop`` over as many consecutive parseable rows as possible.
        entries: list[_GitNameStatusEntry] = []
        stop = start
        while (
            stop < total
            and (entry := _parse_git_name_status_line(lines[stop])) is not None
        ):
            entries.append(entry)
            stop += 1

        if not entries:
            result.append(lines[start])
            start += 1
            continue

        run = lines[start:stop]
        if len(entries) >= 3:
            run = choose_smaller_lines(run, _compress_git_name_status_block(entries))
        result.extend(run)
        start = stop

    return join_preserving_final_newline(result, final_newline)
180
+
181
+
182
def _compress_git_name_status_block(entries: list[_GitNameStatusEntry]) -> list[str]:
    """Render name-status entries as a count summary plus one group per code.

    Codes appear in first-seen order, both in the ``CODE=N`` summary line and
    in the ``CODE:`` groups that follow it.
    """

    paths_by_code: dict[str, list[str]] = {}
    for entry in entries:
        paths_by_code.setdefault(entry.code, []).append(entry.path)

    tally = " ".join(f"{code}={len(paths)}" for code, paths in paths_by_code.items())
    rendered = ["git name-status: " + tally]
    for code, paths in paths_by_code.items():
        rendered.append(f"{code}:")
        rendered.extend(_format_git_paths(paths))
    return rendered
200
+
201
+
202
@dataclass
class _GitLongStatusSection:
    """One category of long-format ``git status`` output and its entries."""

    # Category heading without the trailing colon, e.g. "Untracked files".
    title: str
    # (label, path) pairs in the order they were read.
    entries: list[tuple[str, str]]


# The category headings that open a section; must start in the first column.
_GIT_LONG_STATUS_CATEGORY_RE = re.compile(
    r"^(Changes to be committed|Changes not staged for commit|Unmerged paths|"
    r"Untracked files|Ignored files):$"
)
# An entry line after stripping: optional lower-case "label:" then the path.
_GIT_LONG_STATUS_ENTRY_RE = re.compile(
    r"^(?:(?P<label>[a-z][a-z ]+):\s+)?(?P<path>\S.*)$"
)
215
+
216
+
217
def _compress_git_long_status_blocks(text: str) -> str:
    """Compress *text* when, as a whole, it looks like long-format git status.

    Text that does not pass ``_looks_like_git_long_status`` is returned as is.
    """

    lines, final_newline = split_preserving_final_newline(text)
    if _looks_like_git_long_status(lines):
        compact = choose_smaller_lines(lines, _compress_git_long_status_block(lines))
        return join_preserving_final_newline(compact, final_newline)
    return text
225
+
226
+
227
def _looks_like_git_long_status(lines: list[str]) -> bool:
    """Return True when *lines* resemble long-format ``git status`` output.

    The first non-blank line must start with "On branch " or "HEAD detached ",
    and at least one line must be a recognized category heading.
    """

    first = next((line for line in lines if line.strip()), None)
    if first is None or not first.startswith(("On branch ", "HEAD detached ")):
        return False
    return any(
        _GIT_LONG_STATUS_CATEGORY_RE.match(line) for line in lines if line.strip()
    )
234
+
235
+
236
def _compress_git_long_status_block(block: list[str]) -> list[str]:
    """Rewrite long-format git status as compact per-section summaries.

    Blank lines and parenthesized hint lines are dropped.  Lines before the
    first category heading become the preamble; after a heading, lines that
    parse as entries join that section and the rest are collected as footer.
    The block is returned untouched when no section has any entry.
    """

    head: list[str] = []
    tail: list[str] = []
    sections: list[_GitLongStatusSection] = []

    for raw in block:
        if not raw.strip() or _is_git_long_status_hint(raw):
            continue

        heading = _GIT_LONG_STATUS_CATEGORY_RE.match(raw)
        if heading is not None:
            sections.append(_GitLongStatusSection(heading.group(1), []))
        elif not sections:
            # Nothing opened yet: branch/tracking information and the like.
            head.append(raw.strip())
        else:
            parsed = _parse_git_long_status_entry(raw)
            if parsed is None:
                tail.append(raw.strip())
            else:
                sections[-1].entries.append(parsed)

    if all(not section.entries for section in sections):
        return block

    rendered = ["git status:", *head]
    for section in sections:
        if not section.entries:
            rendered.append(f"{section.title}: 0")
            continue

        # Group paths by label, keeping labels in first-seen order.
        paths_by_label: dict[str, list[str]] = {}
        for label, path in section.entries:
            paths_by_label.setdefault(label, []).append(path)

        # Entries without an explicit label are stored as "path", shown as "files".
        shown = {
            label: ("files" if label == "path" else label) for label in paths_by_label
        }
        counts = " ".join(
            f"{shown[label]}={len(paths)}" for label, paths in paths_by_label.items()
        )
        rendered.append(f"{section.title}: {counts}")
        for label, paths in paths_by_label.items():
            rendered.append(f" {shown[label]}:")
            rendered.extend(_format_git_paths(paths))

    rendered.extend(tail)
    return rendered
291
+
292
+
293
def _is_git_long_status_hint(line: str) -> bool:
    """Return True when *line*, once stripped, is wrapped in parentheses."""

    text = line.strip()
    opens = text[:1] == "("
    closes = text[-1:] == ")"
    return opens and closes
296
+
297
+
298
def _parse_git_long_status_entry(line: str) -> tuple[str, str] | None:
    """Parse an indented long-status entry into ``(label, path)``.

    Only lines that start with a space or tab qualify.  Hint lines and lines
    that are blank after stripping yield ``None``.  When the entry has no
    ``label:`` prefix the label defaults to ``"path"``.
    """

    if line[:1] not in (" ", "\t"):
        return None

    body = line.strip()
    if not body or _is_git_long_status_hint(body):
        return None

    found = _GIT_LONG_STATUS_ENTRY_RE.match(body)
    if found is None:
        return None

    path = found.group("path").strip()
    if not path:
        return None
    label = (found.group("label") or "path").strip()
    return label, path
312
+
313
+
314
def _format_git_paths(paths: list[str]) -> list[str]:
    """Format *paths* as indented tree lines followed by literal lines.

    Paths containing `` -> `` or a tab are not placed in the tree; they are
    emitted verbatim, each behind a leading space, after the tree lines.
    """

    def is_literal(path: str) -> bool:
        return " -> " in path or "\t" in path

    literal_paths = [path for path in paths if is_literal(path)]
    tree_rows = [TreeRow(path) for path in paths if not is_literal(path)]

    # The tree helpers are only invoked when there is something to put in it.
    rendered: list[str] = list(indent_lines(format_tree(tree_rows))) if tree_rows else []
    rendered.extend(f" {path}" for path in literal_paths)
    return rendered
328
+
329
+
330
# Right-hand side of a ``--stat`` row: a change count (or "-"), optionally
# followed by a +/- histogram.  Compiled once at import time, like every other
# pattern in this module, instead of being passed to ``re.match`` as a string
# on each call.
_GIT_STAT_COUNT_RE = re.compile(r"^(?:\d+|-)\s*(?:[+\-]+)?$")


def _parse_git_stat_line(line: str) -> TreeRow | None:
    """Parse one ``path | summary`` row of diff-stat output.

    The line is split at its LAST ``|``.  ``None`` is returned when there is
    no ``|``, when either side is empty after stripping, when the path starts
    with ``---`` or ``+++`` (unified-diff file headers), or when the summary
    is neither a count/histogram nor a ``Bin ...`` description.

    Returns:
        A ``TreeRow`` built from the path and the summary re-attached as
        `` | <summary>``.
    """

    if "|" not in line:
        return None

    path_part, summary_part = line.rsplit("|", 1)
    path = path_part.strip()
    summary = summary_part.strip()
    if not path or not summary:
        return None
    if path.startswith(("---", "+++")):
        return None
    if not (_GIT_STAT_COUNT_RE.match(summary) or summary.startswith("Bin ")):
        return None
    return TreeRow(path, f" | {summary}")


# Trailing "N file(s) changed, ..." totals line of diff-stat output.
_GIT_STAT_SUMMARY_RE = re.compile(r"^\s*\d+ files? changed(?:, .*)?$")
349
+
350
+
351
def _compress_git_stat_blocks(text: str) -> str:
    """Compress each run of diff-stat rows, plus its trailing totals, in *text*.

    A run is a maximal sequence of consecutive lines accepted by
    ``_parse_git_stat_line``; any "N files changed" lines directly after it
    belong to the same block.  When no totals line is present a synthetic
    ``git diff --stat: files=N`` header is used instead.
    """

    lines, final_newline = split_preserving_final_newline(text)
    total = len(lines)
    result: list[str] = []
    start = 0

    while start < total:
        rows: list[TreeRow] = []
        stop = start
        while stop < total and (row := _parse_git_stat_line(lines[stop])) is not None:
            rows.append(row)
            stop += 1

        if not rows:
            result.append(lines[start])
            start += 1
            continue

        # Absorb the totals line(s) that directly follow the stat rows.
        rows_end = stop
        while stop < total and _GIT_STAT_SUMMARY_RE.match(lines[stop]):
            stop += 1
        totals = [line.strip() for line in lines[rows_end:stop]]

        header = totals if totals else [f"git diff --stat: files={len(rows)}"]
        compact = [*header, *format_tree(rows).splitlines()]
        result.extend(choose_smaller_lines(lines[start:stop], compact))
        start = stop

    return join_preserving_final_newline(result, final_newline)
385
+
386
+
387
def _compress_git_diff_blocks(text: str) -> str:
    """Compress every per-file ``diff --git`` section found in *text*.

    A section runs from a ``diff --git `` line up to, but not including, the
    next such line or the end of the text.  Lines before the first section
    are copied through.  Text without any section is returned unchanged.
    """

    marker = "diff --git "
    if marker not in text:
        return text

    lines, final_newline = split_preserving_final_newline(text)
    total = len(lines)
    result: list[str] = []
    start = 0

    while start < total:
        if not lines[start].startswith(marker):
            result.append(lines[start])
            start += 1
            continue

        # Find where this file's section ends.
        stop = start + 1
        while stop < total and not lines[stop].startswith(marker):
            stop += 1

        section = lines[start:stop]
        result.extend(choose_smaller_lines(section, _compress_git_diff_block(section)))
        start = stop

    return join_preserving_final_newline(result, final_newline)
412
+
413
+
414
def _compress_git_diff_block(block: list[str]) -> list[str]:
    """Shrink one per-file diff section.

    Kept: hunk headers, diff metadata lines, added/removed lines inside a
    hunk, ``\\ No newline`` markers and any line not otherwise classified.
    Dropped: ``index `` lines.  Replaced by a one-line count: runs of context
    lines inside a hunk and runs of binary patch payload lines.
    """

    kept: list[str] = []
    hidden_context = 0
    hidden_payload = 0
    inside_hunk = False
    inside_payload = False

    def emit_context_count() -> None:
        nonlocal hidden_context
        if hidden_context:
            plural = "" if hidden_context == 1 else "s"
            kept.append(f"… {hidden_context} context line{plural}")
            hidden_context = 0

    def emit_payload_count() -> None:
        nonlocal hidden_payload
        if hidden_payload:
            plural = "" if hidden_payload == 1 else "s"
            kept.append(f"… {hidden_payload} binary patch payload line{plural}")
            hidden_payload = 0

    def emit_counts() -> None:
        emit_context_count()
        emit_payload_count()

    # The branch order below is significant: earlier tests take precedence.
    for line in block:
        if line.startswith("@@ "):
            emit_counts()
            kept.append(line)
            inside_hunk, inside_payload = True, False
        elif line.startswith(("literal ", "delta ")):
            # Header of a binary patch chunk; payload lines follow it.
            emit_counts()
            kept.append(line)
            inside_payload, inside_hunk = True, False
        elif inside_payload:
            if _is_diff_metadata_line(line):
                emit_payload_count()
                kept.append(line)
            else:
                hidden_payload += 1
        elif inside_hunk and line.startswith(" "):
            hidden_context += 1
        elif _is_diff_metadata_line(line):
            emit_counts()
            kept.append(line)
            inside_payload = line.startswith("GIT binary patch")
        elif (inside_hunk and line.startswith(("+", "-"))) or line.startswith(
            "\\ No newline"
        ):
            emit_context_count()
            kept.append(line)
        elif line.startswith("index "):
            pass  # blob hashes are dropped from the compressed form
        else:
            emit_counts()
            kept.append(line)

    emit_counts()
    return kept


# Line prefixes that identify diff header/metadata lines.
_DIFF_METADATA_PREFIXES = (
    "diff --git ",
    "--- ",
    "+++ ",
    "new file mode ",
    "deleted file mode ",
    "old mode ",
    "new mode ",
    "similarity index ",
    "dissimilarity index ",
    "rename from ",
    "rename to ",
    "copy from ",
    "copy to ",
    "Binary files ",
    "GIT binary patch",
    "literal ",
    "delta ",
)


def _is_diff_metadata_line(line: str) -> bool:
    """Return True when *line* starts with a known diff metadata prefix."""

    return line.startswith(_DIFF_METADATA_PREFIXES)
@@ -0,0 +1,7 @@
1
+ """Conservative full-document HTML cleanup."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .reducer import compress_html_cleanup_output
6
+
7
+ __all__ = ["compress_html_cleanup_output"]