git2xml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
git2xml/models.py ADDED
@@ -0,0 +1,273 @@
1
+ """Shared types: config, the error hierarchy, and the data passed between layers.
2
+
3
+ Pure declarations with no I/O and no dependency on other git2xml modules
4
+ (except ``constants`` for defaults), so every layer can import it freely. Holds
5
+ the user-facing ``Git2xmlError`` hierarchy, the immutable ``Git2xmlConfig``,
6
+ the ``FileStatus`` / ``StagingState`` enums (``FileStatus`` being the single
7
+ source of truth for git status codes and their labels), and the
8
+ ``ChangedFile`` / ``ScanResult`` / ``DiffResult`` records the scanner and
9
+ engine exchange.
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+ from enum import Enum
14
+ from typing import Dict, List, Literal, Optional, TypedDict
15
+
16
+ from .constants import DIFF_SEMAPHORE_LIMIT, GIT_TIMEOUT, MAX_DIFF_SIZE, MAX_TEXT_FILE_SIZE
17
+
18
+
19
class Git2xmlError(Exception):
    """Root of the user-facing git2xml exception hierarchy.

    The other git2xml error classes in this module derive from it, so a caller
    can handle all of them with a single ``except Git2xmlError``.
    """
21
+
22
+
23
class GitNotInstalledError(Git2xmlError):
    """Signals that no ``git`` executable was found on ``PATH``."""

    def __init__(self) -> None:
        # The message is fixed: there is nothing caller-specific to report.
        message = "git executable not found on PATH. Install git and try again."
        super().__init__(message)
26
+
27
+
28
class NotAGitRepositoryError(Git2xmlError):
    """Signals that ``path`` is not a git repository.

    Attributes:
        path: The path that was rejected, kept for programmatic inspection.
    """

    def __init__(self, path: str) -> None:
        message = f"Not a git repository: {path}"
        super().__init__(message)
        self.path = path
32
+
33
+
34
class GitCommandError(Git2xmlError):
    """Signals that a git command failed.

    Attributes:
        command: The command, exactly as supplied by the caller.
        returncode: The exit code reported for the command.
        stderr: The raw stderr text, untrimmed.
    """

    def __init__(self, command: str, returncode: int, stderr: str) -> None:
        # Blank or whitespace-only stderr would leave the message dangling
        # after the colon, so fall back to a placeholder.
        detail = stderr.strip()
        if not detail:
            detail = "unknown error"
        super().__init__(f"Git command failed (code {returncode}): {detail}")
        self.command = command
        self.returncode = returncode
        self.stderr = stderr
42
+
43
+
44
class FileStatus(str, Enum):
    """Change status of a file; each member's value is git's raw status code.

    Human-readable labels live in ``_STATUS_LABELS`` (defined right after the
    class) and are exposed through the ``label`` property.
    """

    MODIFIED = "M"
    ADDED = "A"
    DELETED = "D"
    RENAMED = "R"
    COPIED = "C"
    TYPE_CHANGED = "T"
    UNTRACKED = "??"

    @property
    def label(self) -> str:
        """Human-readable status for XML output (e.g. MODIFIED -> 'modified')."""
        return _STATUS_LABELS[self]

    @classmethod
    def from_code(cls, code: str) -> "FileStatus":
        """Map a raw ``git diff --name-status`` code to a FileStatus.

        Recognises the single-letter codes that reach the non-rename/copy branch
        of the parser (M/A/D/T); anything else (an unmerged 'U', an 'X' bug
        marker) defaults to MODIFIED, matching git's "treat as a change" posture.
        R/C are handled separately by the parser (they carry a score + paths) and
        never reach here.

        Args:
            code: The raw status code.

        Returns:
            The member whose value equals ``code``, or ``MODIFIED`` when no
            member matches.
        """
        try:
            return cls(code)
        except ValueError:
            return cls.MODIFIED


_STATUS_LABELS: Dict[FileStatus, str] = {
    FileStatus.MODIFIED: "modified",
    FileStatus.ADDED: "added",
    FileStatus.DELETED: "deleted",
    FileStatus.RENAMED: "renamed",
    FileStatus.COPIED: "copied",
    FileStatus.TYPE_CHANGED: "type changed",
    FileStatus.UNTRACKED: "new file",
}
# Completeness check: a missing label is a hard import-time error, so
# _STATUS_LABELS can't silently drift from the enum. Written as an explicit
# raise rather than an ``assert`` statement because ``python -O`` strips
# asserts, which would silently disable the guarantee. The exception type is
# kept as AssertionError so the failure looks the same as before.
if set(_STATUS_LABELS) != set(FileStatus):
    raise AssertionError("every FileStatus needs a label")
86
+
87
+
88
class StagingState(str, Enum):
    """Staging state recorded for a changed file.

    Members are ``str`` subclasses, so each compares equal to its string value.
    """

    STAGED = "staged"
    UNSTAGED = "unstaged"
    STAGED_AND_MODIFIED = "staged+modified"
    UNTRACKED = "untracked"
93
+
94
+
95
class PRCommit(TypedDict):
    """A single commit record, as parsed from ``git log`` by ``get_pr_commits``.

    All five keys are required and all values are strings.
    """

    hash: str
    author: str
    date: str
    subject: str
    body: str
103
+
104
+
105
# ``NotRequired`` only exists from Python 3.11 and the project supports 3.9, so
# the required/optional split is expressed with two classes instead: the base
# holds the always-present ``status`` key, and the ``total=False`` subclass adds
# ``old_path``, which appears only for renames/copies.
class _NameStatusBase(TypedDict):
    """Required part of a name-status entry."""

    status: FileStatus


class NameStatusEntry(_NameStatusBase, total=False):
    """A single entry produced by ``parse_name_status``.

    ``status`` is always present; ``old_path`` is set only for R/C entries.
    """

    old_path: str
116
+
117
+
118
@dataclass
class ChangedFile:
    """One changed file: its path, change status and staging state."""

    path: str
    status: FileStatus
    staging: StagingState
    # Optional; defaults to None when not supplied.
    # NOTE(review): presumably the pre-rename/copy path - confirm against the scanner.
    old_path: Optional[str] = None
124
+
125
+
126
@dataclass
class ScanResult:
    """Result of a scan: the changed files plus a staged-changes flag."""

    files: List[ChangedFile]
    # NOTE(review): presumably True when at least one file is staged - confirm
    # against the code that builds this record.
    has_staged: bool
130
+
131
+
132
@dataclass(frozen=True)
class Git2xmlConfig:
    """Immutable settings that control how a git2xml brief is generated.

    Instances are frozen; field bounds are checked once, in ``__post_init__``.

    Attributes:
        command: Which brief to build - ``"commit"`` (the default) or ``"pr"``.
        repo: Path to the git repository (resolved to an absolute path).
        base: Base ref used in ``pr`` mode; ignored in ``commit`` mode. It is
            resolved flexibly, so a bare branch name, a remote ref
            (``origin/main``), a tag, or a SHA all work. Defaults to ``"main"``.
        verbose: When True, log per-file and per-commit progress plus debug logs.
        staged: ``commit`` mode only. When True, limit output to staged files
            and read their content from the index instead of the working
            tree. Has no effect in ``pr`` mode.
        strict_xml: When True, emit strict XML 1.0 by escaping control
            characters and splitting CDATA terminators. When False (the
            default), favor exact file fidelity and fall back to markdown
            fencing whenever a CDATA terminator is present.
        no_untracked: ``commit`` mode only. When True, leave untracked files
            out. Has no effect in ``pr`` mode or together with ``staged``
            (untracked files are already excluded there).
        max_size: Per-file *content* size limit in bytes. Content above the
            limit is omitted with an explanatory reason, while the file's
            ``<file>`` element and ``<diff>`` are still emitted, so the change
            remains visible even though the full content is too large to
            include. Binary omission is different: it drops the ``<diff>`` as
            well, because git produces no useful textual diff for binary
            files. The content size is read from git metadata or the
            filesystem before the blob is loaded, so oversized content is
            never buffered. Diffs are not covered by this limit - see
            ``max_diff_size``: git cannot report a diff's size before
            computing it, so a diff is always fetched in full and then dropped
            from the output by ``max_diff_size`` if it is too large. Defaults
            to ``MAX_TEXT_FILE_SIZE``.
        max_diff_size: Per-file *diff* size limit in bytes (UTF-8), the same
            unit as ``max_size``. A larger diff is omitted entirely: its
            ``<diff>`` slot renders ``status="omitted"`` with a reason, while
            the ``<file>`` element and any ``<content>`` remain - the mirror
            image of ``max_size``, which drops oversized content but keeps the
            diff. Unlike ``max_size`` this shapes the output rather than
            guarding memory, since the diff must be fully produced before it
            can be measured. The cap keeps a pathological diff (e.g. a deleted
            multi-megabyte file, whose brief carries only a diff) from
            overflowing the LLM context window. Defaults to ``MAX_DIFF_SIZE``;
            pass ``0`` to disable the cap.
        no_content: When True, omit the ``<content>`` body of every file,
            producing a diff-only brief. Diffs are unaffected.
        git_timeout: Git command execution timeout. Defaults to ``GIT_TIMEOUT``.
        diff_semaphore_limit: Maximum number of concurrent diff fetch actions.
            Defaults to ``DIFF_SEMAPHORE_LIMIT``.
        hide_repo_path: When True, put only the repository's directory name in
            the root ``repo`` attribute instead of its absolute filesystem
            path. Useful when pasting briefs into third-party tools, to avoid
            leaking the local path (username, directory layout). File ``path``
            attributes are always repo-relative and unaffected. Defaults to
            False.
    """

    command: Literal["commit", "pr"] = "commit"
    repo: str = "."
    base: str = "main"
    verbose: bool = False
    staged: bool = False
    strict_xml: bool = False
    no_untracked: bool = False
    max_size: int = MAX_TEXT_FILE_SIZE
    max_diff_size: int = MAX_DIFF_SIZE
    no_content: bool = False
    git_timeout: int = GIT_TIMEOUT
    diff_semaphore_limit: int = DIFF_SEMAPHORE_LIMIT
    hide_repo_path: bool = False

    def __post_init__(self) -> None:
        """Check field bounds once, at construction time.

        Keeping the checks here gives the CLI and the programmatic API a single
        definition of "valid".

        Raises:
            ValueError: If ``command`` is not ``"commit"``/``"pr"``, if
                ``max_size``, ``git_timeout`` or ``diff_semaphore_limit`` is
                not positive, or if ``max_diff_size`` is negative.
        """
        allowed_commands = ("commit", "pr")
        if self.command not in allowed_commands:
            raise ValueError(f"command must be 'commit' or 'pr', got {self.command!r}")

        # These three have no "disabled" sentinel, so zero is rejected too.
        strictly_positive = ("max_size", "git_timeout", "diff_semaphore_limit")
        for field_name in strictly_positive:
            current = getattr(self, field_name)
            if current <= 0:
                raise ValueError(f"{field_name} must be a positive integer, got {current}")

        # Zero is legal here: it switches the diff cap off.
        if self.max_diff_size < 0:
            raise ValueError(
                f"max_diff_size must be >= 0 (0 disables the cap), got {self.max_diff_size}"
            )
216
+
217
+
218
@dataclass(frozen=True)
class Git2xmlCliConfig(Git2xmlConfig):
    """``Git2xmlConfig`` extended with an output target, for the file-writing (CLI) path.

    ``output`` is declared here rather than on the base because only the
    file-writing path (``save_brief`` / the ``git2xml`` console script) consumes
    it. The programmatic API returns the brief as a string and never writes a
    file, so it takes the base ``Git2xmlConfig`` and has no ``output`` field to
    ignore or to leave stale when ``command`` is coerced.

    Attributes:
        output: Name of the XML file to write, resolved against the process's
            current working directory (not ``repo``). When empty, it is
            derived from ``command`` as ``"{command}_brief.xml"``.
    """

    output: str = ""

    def __post_init__(self) -> None:
        """Run the base bound checks, then fill in ``output`` if it was left empty."""
        super().__post_init__()
        if self.output:
            return
        # The dataclass is frozen, so normal attribute assignment is blocked;
        # object.__setattr__ is how a frozen instance sets a derived field.
        derived_name = f"{self.command}_brief.xml"
        object.__setattr__(self, "output", derived_name)
242
+
243
+
244
class DiffOmission(str, Enum):
    """Reason a file's <diff> slot is empty.

    One vocabulary shared by the diff producer (_fetch_diff) and the XML
    renderer (format_file_xml).
    """

    # No diff applies (e.g. a new file's content carries the change).
    NONE = ""
    # New file too large to render as an add-diff.
    SIZE_EXCEEDED = "size-exceeded"
    # The diff text itself exceeded max_diff_size; dropped with a notice.
    DIFF_SIZE_EXCEEDED = "diff-size-exceeded"
    # git failed to produce the diff (see logs for detail).
    FETCH_ERROR = "fetch-error"
254
+
255
+
256
@dataclass(frozen=True)
class DiffResult:
    """A file's fetched diff, together with the reason its <diff> slot is empty (if it is).

    Attributes:
        text: The literal diff, or "" when none was produced.
        omission: Why the slot is empty, so producer (_fetch_diff) and
            renderer (format_file_xml) share one vocabulary. A non-empty
            ``text`` always pairs with ``DiffOmission.NONE``.
        limit: The ``max_diff_size`` in effect, so a ``DIFF_SIZE_EXCEEDED``
            omission can name the byte cap it crossed (0 otherwise).
    """

    text: str = ""
    omission: DiffOmission = DiffOmission.NONE
    limit: int = 0


# One shared "nothing here" instance. DiffResult is frozen, so reusing it as a
# default argument or as a .get() fallback is safe.
NO_DIFF = DiffResult()
git2xml/py.typed ADDED
File without changes
git2xml/utils.py ADDED
@@ -0,0 +1,251 @@
1
+ """Pure helpers for XML assembly and content classification.
2
+
3
+ The leaf layer: no git, no async, no orchestration - just deterministic,
4
+ unit-testable functions the engine composes (XML escaping, the hybrid
5
+ CDATA/fenced body wrapping, ``<file>`` formatting, binary detection, BOM-aware
6
+ decoding). The only disk I/O is ``is_binary_file`` and ``read_text_bom_aware``.
7
+ """
8
+
9
+ import re
10
+ from pathlib import Path
11
+ from typing import List, Optional
12
+ from xml.sax.saxutils import escape as xml_escape
13
+
14
+ from .models import NO_DIFF, DiffOmission, DiffResult, PRCommit
15
+
16
# Matches every character outside the XML 1.0 ``Char`` production:
#   - C0 controls other than tab (0x09), LF (0x0a) and CR (0x0d)
#   - lone surrogates U+D800-U+DFFF
#   - the non-characters U+FFFE and U+FFFF
# The last two groups used to slip through, so strict mode could still emit a
# document that conforming parsers reject. For code points above 0xFF the
# ``:02x`` formatting in the escape callback simply yields more hex digits
# (e.g. ``\xfffe``).
_ILLEGAL_XML_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]")

# Bytes that count as "text" for binary detection: printable ASCII, the common
# whitespace controls, backspace, and every high byte (128-255).
# NOTE: \b (0x08) counts as text here (so files with occasional backspace
# bytes aren't misclassified as binary), even though it is an *illegal* XML
# 1.0 char that --strict-xml escapes. Different questions: "omit as binary?"
# vs. "legal in strict XML?" - both answers are intentional.
_TEXT_CHARACTERS = bytes(range(32, 127)) + b"\n\r\t\b" + bytes(range(128, 256))
28
+
29
+
30
def _escape_illegal_xml_chars(match):
    """Turn the single matched character into a literal ``\\xNN`` escape.

    This is the ``re.sub`` callback paired with ``_ILLEGAL_XML_CHARS`` in
    strict-XML mode; for instance an ESC byte (0x1b) comes out as the literal
    text ``\\x1b``.

    Args:
        match: The regex match; its whole-match group must be one character.

    Returns:
        A backslash, ``x``, and the code point in lowercase hex, zero-padded
        to at least two digits.
    """
    code_point = ord(match.group(0))
    return "\\x" + format(code_point, "02x")
38
+
39
+
40
def escape_xml_attr(value: Optional[str], strict_xml: bool = False) -> str:
    """Make a string safe to embed in an XML attribute value (or element text).

    ``&``, ``<``, ``>`` and both quote characters are always escaped
    (``"`` -> ``&quot;``, ``'`` -> ``&apos;``), so the result works inside
    single- or double-quoted attributes.

    With ``strict_xml`` set, characters matched by ``_ILLEGAL_XML_CHARS`` - the
    C0 controls that XML 1.0 forbids, i.e. everything except tab/LF/CR, which
    are legal and parser-normalized - are first rewritten to their ``\\xNN``
    form, the same treatment ``wrap_in_hybrid_tag`` applies. That keeps a
    pathological path or author name from breaking the well-formedness
    ``--strict-xml`` guarantees. In default (fidelity) mode those characters
    pass through, matching the body-text policy.

    Args:
        value: The raw text; ``None`` is accepted.
        strict_xml: Whether to escape XML-illegal control characters as well.

    Returns:
        The escaped text, or "" when ``value`` is ``None`` or empty.
    """
    if not value:
        return ""

    raw = str(value)
    cleaned = _ILLEGAL_XML_CHARS.sub(_escape_illegal_xml_chars, raw) if strict_xml else raw
    quote_entities = {'"': "&quot;", "'": "&apos;"}
    return xml_escape(cleaned, quote_entities)
61
+
62
+
63
def wrap_in_hybrid_tag(tag_name: str, text: str, strict_xml: bool = False) -> str:
    """Wrap ``text`` in a ``<tag_name>`` element, as CDATA or as a fenced body.

    The two modes trade raw fidelity against XML 1.0 validity:

    - Strict (``strict_xml=True``): produce valid XML 1.0. Illegal control
      characters become their string form (e.g. ``\\x1b``) and every ``]]>``
      is split safely (``]]]]><![CDATA[>``) so the CDATA section stays
      well-formed.
    - Default (``strict_xml=False``): exact fidelity for LLM consumption.
      Control characters are left untouched. Text containing a CDATA
      terminator (``]]>``) cannot sit in a CDATA section unaltered, so the
      element switches to a Markdown-fenced body (``format="fenced"``) whose
      backtick fence is longer than any backtick run already in the text.

    Empty ``text`` still renders as an explicit empty CDATA element in both
    modes, so an empty file stays distinguishable from omitted content.

    Args:
        tag_name: Element name; inserted as-is.
        text: The body to wrap.
        strict_xml: Selects the strict mode described above.

    Returns:
        The complete element as a single string.
    """
    if strict_xml:
        # Escape first, then split terminators, so the result is one or more
        # back-to-back CDATA sections.
        sanitized = _ILLEGAL_XML_CHARS.sub(_escape_illegal_xml_chars, text).replace(
            "]]>", "]]]]><![CDATA[>"
        )
        return f'<{tag_name} format="cdata"><![CDATA[{sanitized}]]></{tag_name}>'

    if "]]>" in text:
        # Fidelity mode with a terminator present: fence instead of CDATA.
        # The fence is at least three backticks and always one longer than
        # the longest backtick run found in the text.
        longest_run = max((len(run) for run in re.findall(r"`+", text)), default=0)
        fence = "`" * max(3, longest_run + 1)
        return (
            f'<{tag_name} format="fenced" fence="{fence}">\n{fence}\n{text}\n{fence}\n</{tag_name}>'
        )

    return f'<{tag_name} format="cdata"><![CDATA[{text}]]></{tag_name}>'
99
+
100
+
101
def format_file_xml(
    path: str,
    content: Optional[str],
    status: str = "included",
    reason: str = "",
    diff: DiffResult = NO_DIFF,
    indent: str = "",
    strict_xml: bool = False,
) -> str:
    """Render a single ``<file>`` element from its resolved parts.

    Emits a self-closing ``<file ... />`` when there is no body - no content, no
    diff text, and no diff omission to report. Otherwise opens a ``<file>``
    around a ``<content>`` (when ``content`` is not None, including "") and/or a
    diff slot.

    The diff slot reflects ``diff``: real ``text`` renders a ``<diff>`` body; a
    ``FETCH_ERROR`` renders a self-closing ``<diff status="unavailable">`` so a
    reader can tell "the diff failed" from "no diff applies." A new-file diff
    omitted for content size (``SIZE_EXCEEDED``) adds nothing here - the
    file-level ``reason`` already announces it - whereas a diff dropped for its
    own size (``DIFF_SIZE_EXCEEDED``) renders a self-closing
    ``<diff status="omitted">`` so the omission stays visible even with content
    present.

    Args:
        path: File path for the ``path`` attribute (escaped).
        content: File body, or None to emit no ``<content>`` element.
        status: File status; the attribute is left out when this is empty or
            ``"included"``.
        reason: Optional explanation for the ``reason`` attribute (escaped).
        diff: The fetched diff and/or the reason it is missing.
        indent: Prefix for the ``<file>`` lines; children get one step more.
        strict_xml: Passed through to the escaping/wrapping helpers.

    Returns:
        The element as a newline-joined string.
    """
    safe_path = escape_xml_attr(path, strict_xml=strict_xml)

    # ``status`` goes through the same escaping as ``path`` and ``reason``, so
    # a value containing a quote, ``<`` or ``&`` cannot break the attribute.
    # Ordinary status words are unchanged by it.
    status_str = ""
    if status and status != "included":
        status_str = f' status="{escape_xml_attr(status, strict_xml=strict_xml)}"'
    reason_str = f' reason="{escape_xml_attr(reason, strict_xml=strict_xml)}"' if reason else ""

    diff_failed = diff.omission is DiffOmission.FETCH_ERROR
    diff_too_large = diff.omission is DiffOmission.DIFF_SIZE_EXCEEDED
    if content is None and not diff.text and not diff_failed and not diff_too_large:
        return f'{indent}<file path="{safe_path}"{status_str}{reason_str} />'

    # NOTE(review): the whitespace in this literal may have been collapsed by
    # the view this block was reviewed in - confirm its width against the file.
    child = indent + " "
    out = [f'{indent}<file path="{safe_path}"{status_str}{reason_str}>']

    if content is not None:
        out.append(child + wrap_in_hybrid_tag("content", content, strict_xml=strict_xml))

    if diff.text:
        out.append(child + wrap_in_hybrid_tag("diff", diff.text, strict_xml=strict_xml))
    elif diff_failed:
        out.append(f'{child}<diff status="unavailable" reason="failed to fetch diff" />')
    elif diff_too_large:
        out.append(f'{child}<diff status="omitted" reason="diff exceeds {diff.limit} bytes" />')

    out.append(f"{indent}</file>")
    # Every entry is non-empty by construction, so no filtering is needed.
    return "\n".join(out)
149
+
150
+
151
def build_commit_log_xml(
    commits: List[PRCommit],
    branch: str,
    base: str,
    strict_xml: bool = False,
    # NOTE(review): the whitespace in the indent literals of this function may
    # have been collapsed by the view it was reviewed in - confirm the widths.
    indent: str = " ",
) -> str:
    """Build the PR ``<commit_log>`` block from parsed commit records.

    Attribute values go through ``escape_xml_attr``; each commit's ``subject``
    and ``body`` is wrapped with ``wrap_in_hybrid_tag``, the same way
    ``format_file_xml`` treats ``<content>``/``<diff>``. A commit with an empty
    ``body`` gets no ``<body>`` element.

    Only the opening line of each body element is indented. Interior newlines
    stay flush-left so the CDATA payload is byte-faithful; re-indenting would
    inject leading spaces into every body line and corrupt the commit text.

    Args:
        commits: Parsed commit records, in output order.
        branch: Value for the ``branch`` attribute.
        base: Value for the ``base`` attribute.
        strict_xml: Passed through to the escaping/wrapping helpers.
        indent: Prefix for the ``<commit_log>`` lines.

    Returns:
        The whole block as one newline-joined string, or ``""`` when
        ``commits`` is empty (the caller then appends nothing).
    """
    if not commits:
        return ""

    def attr(raw: str) -> str:
        """Escape one attribute value under the caller's strictness setting."""
        return escape_xml_attr(raw, strict_xml=strict_xml)

    commit_indent = indent + " "  # <commit>
    body_indent = commit_indent + " "  # <subject> / <body>

    safe_branch = attr(branch)
    safe_base = attr(base)
    lines = [
        f'{indent}<commit_log branch="{safe_branch}" base="{safe_base}" commits="{len(commits)}">'
    ]

    for commit in commits:
        safe_hash = attr(commit["hash"])
        safe_author = attr(commit["author"])
        safe_date = attr(commit["date"])
        lines.append(
            f'{commit_indent}<commit hash="{safe_hash}" author="{safe_author}" date="{safe_date}">'
        )

        lines.append(
            body_indent + wrap_in_hybrid_tag("subject", commit["subject"], strict_xml=strict_xml)
        )
        message_body = commit["body"]
        if message_body:
            lines.append(
                body_indent + wrap_in_hybrid_tag("body", message_body, strict_xml=strict_xml)
            )
        lines.append(f"{commit_indent}</commit>")

    lines.append(f"{indent}</commit_log>")
    return "\n".join(lines)
198
+
199
+
200
def is_binary_bytes(data: bytes) -> bool:
    """Guess whether ``data`` is binary by inspecting its first 4096 bytes.

    Decision order:
      1. An empty sample is treated as text.
      2. A sample that opens with a UTF-16 BOM is treated as text.
      3. Any NUL byte means binary. BOM-less UTF-16 is intentionally treated
         as binary here (it's all NUL bytes).
      4. Otherwise the sample is binary when more than 30% of its bytes fall
         outside ``_TEXT_CHARACTERS``.

    Args:
        data: The raw bytes; only the first 4096 are examined.

    Returns:
        True if the sample looks like binary content.
    """
    sample = data[:4096]
    if not sample:
        return False
    if sample.startswith((b"\xff\xfe", b"\xfe\xff")):
        return False
    if b"\x00" in sample:
        return True

    # translate(None, delete) strips every text byte, leaving the suspicious ones.
    suspicious = sample.translate(None, _TEXT_CHARACTERS)
    return len(suspicious) / len(sample) > 0.30
214
+
215
+
216
def is_binary_file(path: Path) -> bool:
    """Classify a file on disk by running ``is_binary_bytes`` on its first 4096 bytes.

    Args:
        path: The file to inspect; it is opened in binary mode.

    Returns:
        Whatever ``is_binary_bytes`` reports for the leading chunk.
    """
    with open(path, "rb") as handle:
        head = handle.read(4096)
    return is_binary_bytes(head)
220
+
221
+
222
def decode_bytes_bom_aware(raw: bytes) -> str:
    """Decode ``raw`` to text, honoring a leading byte-order mark.

    A UTF-16 BOM (either endianness) selects UTF-16. Everything else is
    decoded as UTF-8, with a leading UTF-8 BOM dropped if present. Undecodable
    bytes are replaced rather than raised. Pure function: no I/O.

    Args:
        raw: The bytes to decode.

    Returns:
        The decoded text, without the BOM.
    """
    starts_with_utf16_bom = raw.startswith((b"\xff\xfe", b"\xfe\xff"))
    # "utf-8-sig" strips a leading UTF-8 BOM when there is one and otherwise
    # behaves exactly like "utf-8", so it covers both remaining cases.
    encoding = "utf-16" if starts_with_utf16_bom else "utf-8-sig"
    return raw.decode(encoding, errors="replace")
229
+
230
+
231
def read_text_bom_aware(path: Path) -> str:
    """Read the whole file at ``path`` and decode it with ``decode_bytes_bom_aware``.

    Args:
        path: The file to read; its full contents are loaded as bytes.

    Returns:
        The decoded text.
    """
    raw = path.read_bytes()
    return decode_bytes_bom_aware(raw)
234
+
235
+
236
def diff_exceeds_limit(text: str, max_bytes: int) -> bool:
    """Tell whether ``text`` is larger than ``max_bytes`` once encoded as UTF-8.

    The size is measured in bytes so that it uses the same unit as ``max_size``
    (content). A ``max_bytes`` of zero or less disables the check.

    A str of N codepoints encodes to between N and 4N UTF-8 bytes, which gives
    two cheap answers without encoding: ``N > max_bytes`` is certainly over,
    and ``4N <= max_bytes`` is certainly under. Only the band in between is
    actually encoded and measured.

    Args:
        text: The text to measure.
        max_bytes: The byte cap; ``<= 0`` means "no cap".

    Returns:
        True if the UTF-8 size of ``text`` is strictly greater than
        ``max_bytes``; always False when the cap is disabled.
    """
    if max_bytes <= 0:
        return False

    codepoints = len(text)
    if codepoints > max_bytes:
        # Lower bound already over the cap.
        return True
    if codepoints * 4 <= max_bytes:
        # Upper bound still within the cap.
        return False
    return len(text.encode("utf-8")) > max_bytes