PyPI - git2xml - Versions diffs - 0.1.0__py3-none-any.whl - Mend

git2xml 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

git2xml/__init__.py +43 -0
git2xml/__main__.py +8 -0
git2xml/api.py +95 -0
git2xml/cli.py +158 -0
git2xml/constants.py +31 -0
git2xml/core.py +633 -0
git2xml/git_scanner.py +708 -0
git2xml/models.py +273 -0
git2xml/py.typed +0 -0
git2xml/utils.py +251 -0
git2xml-0.1.0.dist-info/METADATA +349 -0
git2xml-0.1.0.dist-info/RECORD +16 -0
git2xml-0.1.0.dist-info/WHEEL +5 -0
git2xml-0.1.0.dist-info/entry_points.txt +2 -0
git2xml-0.1.0.dist-info/licenses/LICENSE +21 -0
git2xml-0.1.0.dist-info/top_level.txt +1 -0

git2xml/models.py ADDED Viewed

@@ -0,0 +1,273 @@
+"""Shared types: config, the error hierarchy, and the data passed between layers.
+Pure declarations with no I/O and no dependency on other git2xml modules
+(except ``constants`` for defaults), so every layer can import it freely. Holds
+the user-facing ``Git2xmlError`` hierarchy, the immutable ``Git2xmlConfig``,
+the ``FileStatus`` / ``StagingState`` enums (``FileStatus`` being the single
+source of truth for git status codes and their labels), and the
+``ChangedFile`` / ``ScanResult`` / ``DiffResult`` records the scanner and
+engine exchange.
+"""
+from dataclasses import dataclass
+from enum import Enum
+from typing import Dict, List, Literal, Optional, TypedDict
+from .constants import DIFF_SEMAPHORE_LIMIT, GIT_TIMEOUT, MAX_DIFF_SIZE, MAX_TEXT_FILE_SIZE
+class Git2xmlError(Exception):
+    """Base class for user-facing git2xml errors.
+    ``GitNotInstalledError``, ``NotAGitRepositoryError`` and ``GitCommandError``
+    all derive from it, so ``except Git2xmlError`` covers every one of them.
+    """
+class GitNotInstalledError(Git2xmlError):
+    def __init__(self):
+        super().__init__("git executable not found on PATH. Install git and try again.")
+class NotAGitRepositoryError(Git2xmlError):
+    def __init__(self, path: str):
+        self.path = path
+        super().__init__(f"Not a git repository: {path}")
+class GitCommandError(Git2xmlError):
+    def __init__(self, command: str, returncode: int, stderr: str):
+        self.command = command
+        self.returncode = returncode
+        self.stderr = stderr
+        super().__init__(
+            f"Git command failed (code {returncode}): {stderr.strip() or 'unknown error'}"
+        )
+class FileStatus(str, Enum):
+    MODIFIED = "M"
+    ADDED = "A"
+    DELETED = "D"
+    RENAMED = "R"
+    COPIED = "C"
+    TYPE_CHANGED = "T"
+    UNTRACKED = "??"
+    @property
+    def label(self) -> str:
+        """Human-readable status for XML output (e.g. MODIFIED -> 'modified')."""
+        return _STATUS_LABELS[self]
+    @classmethod
+    def from_code(cls, code: str) -> "FileStatus":
+        """Map a raw ``git diff --name-status`` code to a FileStatus.
+        Recognises the single-letter codes that reach the non-rename/copy branch
+        of the parser (M/A/D/T); anything else (an unmerged 'U', an 'X' bug
+        marker) defaults to MODIFIED, matching git's "treat as a change" posture.
+        R/C are handled separately by the parser (they carry a score + paths) and
+        never reach here.
+        """
+        try:
+            return cls(code)
+        except ValueError:
+            return cls.MODIFIED
+_STATUS_LABELS: Dict[FileStatus, str] = {
+    FileStatus.MODIFIED: "modified",
+    FileStatus.ADDED: "added",
+    FileStatus.DELETED: "deleted",
+    FileStatus.RENAMED: "renamed",
+    FileStatus.COPIED: "copied",
+    FileStatus.TYPE_CHANGED: "type changed",
+    FileStatus.UNTRACKED: "new file",
+}
+# The completeness assertion makes a missing label a hard import-time error,
+# so _STATUS_LABELS can't silently drift from the enum.
+assert set(_STATUS_LABELS) == set(FileStatus), "every FileStatus needs a label"
+class StagingState(str, Enum):
+    """Staging state of a changed file; the type of ``ChangedFile.staging``.
+    Members are ``str`` subclasses, so each compares equal to its value.
+    """
+    STAGED = "staged"
+    UNSTAGED = "unstaged"
+    # NOTE(review): presumably a file with staged changes plus further
+    # unstaged edits - confirm against the scanner.
+    STAGED_AND_MODIFIED = "staged+modified"
+    UNTRACKED = "untracked"
+class PRCommit(TypedDict):
+    """One commit record parsed from ``git log`` in ``get_pr_commits``.
+    ``build_commit_log_xml`` renders ``hash``, ``author`` and ``date`` as
+    attributes of a ``<commit>`` element and ``subject`` / ``body`` as child
+    elements, skipping ``body`` when it is empty.
+    """
+    hash: str
+    author: str
+    date: str
+    subject: str
+    body: str
+# Required vs optional keys are split across two classes because NotRequired is
+# 3.11+ and the project supports 3.9: ``status`` is always present, ``old_path``
+# only for renames/copies.
+class _NameStatusBase(TypedDict):
+    """Required part of ``NameStatusEntry``: the ``status`` key."""
+    status: FileStatus
+class NameStatusEntry(_NameStatusBase, total=False):
+    """One entry from ``parse_name_status``; ``old_path`` set only for R/C."""
+    # Optional key (``total=False``): absent unless the entry is a rename/copy.
+    old_path: str
+@dataclass
+class ChangedFile:
+    """One changed file: its path, git status and staging state.
+    Attributes:
+        path: Path of the file (presumably repo-relative - confirm against the
+            scanner that builds these records).
+        status: The file's ``FileStatus``.
+        staging: The file's ``StagingState``.
+        old_path: Defaults to None. Presumably the previous path for
+            renames/copies, mirroring ``NameStatusEntry.old_path`` - confirm.
+    """
+    path: str
+    status: FileStatus
+    staging: StagingState
+    old_path: Optional[str] = None
+@dataclass
+class ScanResult:
+    """Result of a scan: the changed files plus a staged-changes flag.
+    Attributes:
+        files: The ``ChangedFile`` records found.
+        has_staged: Presumably True when at least one change is staged -
+            confirm against the scanner, which sets it.
+    """
+    files: List[ChangedFile]
+    has_staged: bool
+@dataclass(frozen=True)
+class Git2xmlConfig:
+    """Configuration settings for generating a git2xml brief.
+    The dataclass is immutable.
+    Attributes:
+        command: Either ``"commit"`` or ``"pr"`` (Default is ``commit``).
+        repo: Path to the git repository (resolved to an absolute path).
+        base: Base ref for ``pr`` mode. Resolved flexibly - a bare
+            branch name, remote ref (``origin/main``), tag, or SHA all work.
+            Ignored in ``commit`` mode. Defaults to ``"main"``.
+        verbose: If True, log per-file and per-commit progress, as well as debug logs.
+        staged: ``commit`` mode only. If True, restrict output to staged
+            files and read their content from the index rather than the
+            working tree. No effect in ``pr`` mode.
+        strict_xml: If True, emit strict XML 1.0 - escape control characters
+            and split CDATA terminators. If False (default), prioritize exact
+            file fidelity, falling back to markdown fencing when a CDATA
+            terminator is present.
+        no_untracked: ``commit`` mode only. If True, exclude untracked files.
+            No effect in ``pr`` mode or when ``staged`` is set (untracked
+            files are already excluded there).
+        max_size: Per-file *content* size limit in bytes. Files whose content
+            exceeds this have their content omitted with an explanatory reason,
+            but their ``<file>`` element and ``<diff>`` are still emitted - so the
+            change stays visible even when the full content is too large to
+            include. This differs from binary omission, which drops the ``<diff>``
+            too, since git produces no useful textual diff for binary files.
+            Content size is read from git metadata or the filesystem before the blob is loaded,
+            so oversized content is never buffered. This limit does not apply to diffs
+            - that is ``max_diff_size``'s job: a diff has no size git can report
+            before computing it, so it is always fetched in full and
+            ``max_diff_size`` then drops it from the output if oversized. Defaults
+            to ``MAX_TEXT_FILE_SIZE``.
+        max_diff_size: Per-file *diff* size limit in bytes (UTF-8), the same
+            unit as ``max_size``. A diff larger than this is omitted entirely -
+            its ``<diff>`` slot renders ``status="omitted"`` with a reason, while
+            the ``<file>`` element and any ``<content>`` stay - mirroring how
+            ``max_size`` drops oversized content but keeps the diff. Unlike
+            ``max_size``, this is output-shaping, not a memory guard:
+            a diff has no size git can report up front, so it is fully produced
+            before it can be measured. The cap keeps a pathological diff
+            (e.g. a deleted multi-megabyte file, whose brief carries only a diff)
+            from overflowing the LLM context window. Defaults to ``MAX_DIFF_SIZE``;
+            pass ``0`` to disable the cap.
+        no_content: If True, omit the ``<content>`` body for every file,
+            producing a diff-only brief. Diffs are unaffected.
+        git_timeout: Git command execution timeout. Defaults to ``GIT_TIMEOUT``.
+        diff_semaphore_limit: Maximum number of concurrent diff fetch actions. Defaults to ``DIFF_SEMAPHORE_LIMIT``.
+        hide_repo_path: If True, emit only the repository's directory name in the
+            root ``repo`` attribute instead of its absolute filesystem path. Use
+            when pasting briefs into third-party tools to avoid leaking your local
+            path (username, directory layout). File ``path`` attributes are always
+            repo-relative and unaffected. Defaults to False.
+    """
+    command: Literal["commit", "pr"] = "commit"
+    repo: str = "."
+    base: str = "main"
+    verbose: bool = False
+    staged: bool = False
+    strict_xml: bool = False
+    no_untracked: bool = False
+    max_size: int = MAX_TEXT_FILE_SIZE
+    max_diff_size: int = MAX_DIFF_SIZE
+    no_content: bool = False
+    git_timeout: int = GIT_TIMEOUT
+    diff_semaphore_limit: int = DIFF_SEMAPHORE_LIMIT
+    hide_repo_path: bool = False
+    def __post_init__(self) -> None:
+        """Validate field bounds at construction so the CLI and the programmatic
+        API share one definition of "valid". Raises ``ValueError`` on bad input.
+        """
+        if self.command not in ("commit", "pr"):
+            raise ValueError(f"command must be 'commit' or 'pr', got {self.command!r}")
+        for name in ("max_size", "git_timeout", "diff_semaphore_limit"):
+            value = getattr(self, name)
+            if value <= 0:
+                raise ValueError(f"{name} must be a positive integer, got {value}")
+        if self.max_diff_size < 0:
+            raise ValueError(
+                f"max_diff_size must be >= 0 (0 disables the cap), got {self.max_diff_size}"
+            )
+@dataclass(frozen=True)
+class Git2xmlCliConfig(Git2xmlConfig):
+    """Configuration for the file-writing (CLI) path: a ``Git2xmlConfig`` plus an
+    output target.
+    ``output`` lives here, not on the base, because it is consumed only by the
+    file-writing path (``save_brief`` / the ``git2xml`` console script). The
+    programmatic API returns the brief as a string and never writes a file, so it
+    takes the base ``Git2xmlConfig`` and has no ``output`` field to ignore or to
+    leave stale when ``command`` is coerced.
+    Attributes:
+        output: Name of the XML file to write, resolved against the process's
+            current working directory (not ``repo``). If empty, it is derived
+            from ``command`` as ``"{command}_brief.xml"``.
+    """
+    output: str = ""
+    def __post_init__(self):
+        """Derive the default output filename from ``command`` when unset."""
+        super().__post_init__()  # runs the base bound checks
+        if not self.output:
+            object.__setattr__(self, "output", f"{self.command}_brief.xml")
+class DiffOmission(str, Enum):
+    """Why a file's <diff> slot is empty. Single source of truth shared by the
+    diff producer (_fetch_diff) and the XML renderer (format_file_xml)."""
+    NONE = ""  # no diff applies (e.g. a new file's content carries the change)
+    SIZE_EXCEEDED = "size-exceeded"  # new file too large to render as an add-diff
+    DIFF_SIZE_EXCEEDED = (
+        "diff-size-exceeded"  # the diff text itself exceeded max_diff_size; dropped with a notice
+    )
+    FETCH_ERROR = "fetch-error"  # git failed to produce the diff (see logs for detail)
+@dataclass(frozen=True)
+class DiffResult:
+    """A file's fetched diff plus why its <diff> slot is empty, if it is.
+    Attributes:
+        text: The literal diff ("" when none was produced).
+        omission: The reason the slot is empty, so producer (_fetch_diff) and
+            renderer (format_file_xml) share one vocabulary. A non-empty
+            ``text`` always pairs with ``DiffOmission.NONE``.
+        limit: The max_diff_size in effect, so a DIFF_SIZE_EXCEEDED omission
+            can name the byte cap it crossed (0 otherwise).
+    """
+    text: str = ""
+    omission: DiffOmission = DiffOmission.NONE
+    limit: int = 0
+# Shared empty default (no text, DiffOmission.NONE, limit 0) - frozen, so safe
+# to reuse as a default arg and .get() fallback.
+NO_DIFF = DiffResult()

git2xml/py.typed ADDED Viewed

File without changes

git2xml/utils.py ADDED Viewed

@@ -0,0 +1,251 @@
+"""Pure helpers for XML assembly and content classification.
+The leaf layer: no git, no async, no orchestration - just deterministic,
+unit-testable functions the engine composes (XML escaping, the hybrid
+CDATA/fenced body wrapping, ``<file>`` formatting, binary detection, BOM-aware
+decoding). The only disk I/O is ``is_binary_file`` and ``read_text_bom_aware``.
+"""
+import re
+from pathlib import Path
+from typing import List, Optional
+from xml.sax.saxutils import escape as xml_escape
+from .models import NO_DIFF, DiffOmission, DiffResult, PRCommit
+# Matches XML 1.0 illegal characters
+_ILLEGAL_XML_CHARS = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f]")
+# NOTE: \b (0x08) counts as text here (so files with occasional backspace
+# bytes aren't misclassified as binary), even though it is an *illegal* XML
+# 1.0 char that --strict-xml escapes. Different questions: "omit as binary?"
+# vs. "legal in strict XML?" - both answers are intentional.
+_TEXT_CHARACTERS = b"".join(
+    [bytes([i]) for i in range(32, 127)]
+    + [b"\n", b"\r", b"\t", b"\b"]
+    + [bytes([i]) for i in range(128, 256)]
+)
+def _escape_illegal_xml_chars(match):
+    """Replace one XML-illegal control char with its ``\\xNN`` string form.
+    Used as the ``re.sub`` callback for ``_ILLEGAL_XML_CHARS`` in strict-XML
+    mode; e.g. an ESC byte (0x1b) becomes the literal text ``\\x1b``.
+    """
+    return f"\\x{ord(match.group(0)):02x}"
+def escape_xml_attr(value: Optional[str], strict_xml: bool = False) -> str:
+    """Escape a string for safe use inside an XML attribute value (or element text).
+    Always escapes ``&``, ``<``, ``>`` and both quotes (``"`` -> ``&quot;``,
+    ``'`` -> ``&apos;``) so the result is safe in single- or double-quoted
+    attributes. ``None`` or empty input returns "".
+    Under ``strict_xml``, also escapes the C0 control characters that are illegal
+    in XML 1.0 - everything except tab/LF/CR, which are legal and parser-normalized -
+    to their ``\\xNN`` form, mirroring ``wrap_in_hybrid_tag``. This keeps a
+    pathological path or author name from breaking the well-formedness that
+    ``--strict-xml`` guarantees. In default (fidelity) mode they pass through,
+    matching the body-text policy.
+    """
+    if not value:
+        return ""
+    text = str(value)
+    if strict_xml:
+        text = _ILLEGAL_XML_CHARS.sub(_escape_illegal_xml_chars, text)
+    return xml_escape(text, {'"': "&quot;", "'": "&apos;"})
+def wrap_in_hybrid_tag(tag_name: str, text: str, strict_xml: bool = False) -> str:
+    """Wrap ``text`` in a CDATA element, choosing fidelity or strict compliance.
+    Two modes trade off raw-byte fidelity against XML 1.0 validity:
+    - Default (``strict_xml=False``): prioritize exact fidelity for LLM
+      consumption. Control characters are passed through untouched. If ``text``
+      contains a CDATA terminator (``]]>``), the CDATA section can't hold it
+      without alteration, so the element falls back to a Markdown-fenced body
+      (``format="fenced"``) using a backtick fence long enough to not collide
+      with any backtick run already in the text.
+    - Strict (``strict_xml=True``): produce valid XML 1.0. Illegal control
+      characters are escaped to their string form (e.g. ``\\x1b``) and any
+      ``]]>`` is split safely (``]]]]><![CDATA[>``) so the CDATA section stays
+      well-formed.
+    Empty ``text`` renders as an explicit empty CDATA element in both modes,
+    keeping an empty file distinguishable from omitted content.
+    """
+    if strict_xml:
+        # Strict XML mode: Escape control chars and safely split CDATA terminators
+        safe_text = _ILLEGAL_XML_CHARS.sub(_escape_illegal_xml_chars, text)
+        safe_text = safe_text.replace("]]>", "]]]]><![CDATA[>")
+        return f'<{tag_name} format="cdata"><![CDATA[{safe_text}]]></{tag_name}>'
+    # Default LLM mode: Fidelity over specification
+    if "]]>" not in text:
+        return f'<{tag_name} format="cdata"><![CDATA[{text}]]></{tag_name}>'
+    backtick_matches = re.findall(r"`+", text)
+    max_backticks = len(max(backtick_matches, key=len)) if backtick_matches else 0
+    fence_len = max(3, max_backticks + 1)
+    fence = "`" * fence_len
+    return f'<{tag_name} format="fenced" fence="{fence}">\n{fence}\n{text}\n{fence}\n</{tag_name}>'
+def format_file_xml(
+    path: str,
+    content: Optional[str],
+    status: str = "included",
+    reason: str = "",
+    diff: DiffResult = NO_DIFF,
+    indent: str = "",
+    strict_xml: bool = False,
+) -> str:
+    """Render a single ``<file>`` element from its resolved parts.
+    Emits a self-closing ``<file ... />`` when there is no body - no content, no
+    diff text, and no fetch failure to report. Otherwise opens a ``<file>`` around
+    a ``<content>`` (when ``content`` is not None, including "") and/or a diff slot.
+    The diff slot reflects ``diff``: real ``text`` renders a ``<diff>`` body; a
+    ``FETCH_ERROR`` renders a self-closing ``<diff status="unavailable">`` so a
+    reader can tell "the diff failed" from "no diff applies." A new-file diff omitted
+    for content size (``SIZE_EXCEEDED``) adds nothing here - the file-level ``reason``
+    already announces it - whereas a diff dropped for its own size (``DIFF_SIZE_EXCEEDED``)
+    renders a self-closing ``<diff status="omitted">`` so the omission stays visible
+    even with content present.
+    """
+    safe_path = escape_xml_attr(path, strict_xml=strict_xml)
+    status_str = f' status="{status}"' if status and status != "included" else ""
+    reason_str = f' reason="{escape_xml_attr(reason, strict_xml=strict_xml)}"' if reason else ""
+    diff_failed = diff.omission is DiffOmission.FETCH_ERROR
+    diff_too_large = diff.omission is DiffOmission.DIFF_SIZE_EXCEEDED
+    if content is None and not diff.text and not diff_failed and not diff_too_large:
+        return f'{indent}<file path="{safe_path}"{status_str}{reason_str} />'
+    child = indent + "  "
+    out = [f'{indent}<file path="{safe_path}"{status_str}{reason_str}>']
+    if content is not None:
+        out.append(child + wrap_in_hybrid_tag("content", content, strict_xml=strict_xml))
+    if diff.text:
+        out.append(child + wrap_in_hybrid_tag("diff", diff.text, strict_xml=strict_xml))
+    elif diff_failed:
+        out.append(f'{child}<diff status="unavailable" reason="failed to fetch diff" />')
+    elif diff_too_large:
+        out.append(f'{child}<diff status="omitted" reason="diff exceeds {diff.limit} bytes" />')
+    out.append(f"{indent}</file>")
+    return "\n".join(line for line in out if line)
+def build_commit_log_xml(
+    commits: List[PRCommit],
+    branch: str,
+    base: str,
+    strict_xml: bool = False,
+    indent: str = "  ",
+) -> str:
+    """Render the PR ``<commit_log>`` block from parsed commit records.
+    Returns the whole block as one newline-joined string, or ``""`` when
+    ``commits`` is empty (the caller appends nothing). Attributes are escaped
+    via ``escape_xml_attr``; each commit's ``subject``/``body`` is wrapped with
+    ``wrap_in_hybrid_tag``, matching how ``format_file_xml`` handles
+    ``<content>``/``<diff>``.
+    Only the opening line of each body element is indented; interior newlines
+    stay flush-left so the CDATA payload is byte-faithful. Re-indenting would
+    inject leading spaces into every body line and corrupt the commit text.
+    """
+    if not commits:
+        return ""
+    commit_indent = indent + "  "  # <commit>
+    body_indent = commit_indent + "  "  # <subject> / <body>
+    branch_attr = escape_xml_attr(branch, strict_xml=strict_xml)
+    base_attr = escape_xml_attr(base, strict_xml=strict_xml)
+    lines = [
+        f'{indent}<commit_log branch="{branch_attr}" base="{base_attr}" commits="{len(commits)}">'
+    ]
+    for c in commits:
+        hash_attr = escape_xml_attr(c["hash"], strict_xml=strict_xml)
+        author_attr = escape_xml_attr(c["author"], strict_xml=strict_xml)
+        date_attr = escape_xml_attr(c["date"], strict_xml=strict_xml)
+        lines.append(
+            f'{commit_indent}<commit hash="{hash_attr}" author="{author_attr}" date="{date_attr}">'
+        )
+        subject_xml = wrap_in_hybrid_tag("subject", c["subject"], strict_xml=strict_xml)
+        lines.append(body_indent + subject_xml)
+        if c["body"]:
+            body_xml = wrap_in_hybrid_tag("body", c["body"], strict_xml=strict_xml)
+            lines.append(body_indent + body_xml)
+        lines.append(f"{commit_indent}</commit>")
+    lines.append(f"{indent}</commit_log>")
+    return "\n".join(lines)
+def is_binary_bytes(data: bytes) -> bool:
+    """Returns True if the given bytes look like binary content."""
+    chunk = data[:4096]
+    if not chunk:
+        return False
+    if chunk[:2] in (b"\xff\xfe", b"\xfe\xff"):
+        return False
+    # BOM-less UTF-16 is intentionally treated as binary here (it's all NUL bytes).
+    if b"\x00" in chunk:
+        return True
+    non_text = chunk.translate(None, _TEXT_CHARACTERS)
+    return len(non_text) / len(chunk) > 0.30
+def is_binary_file(path: Path) -> bool:
+    """Robust check reading from disk."""
+    with open(path, "rb") as f:
+        return is_binary_bytes(f.read(4096))
+def decode_bytes_bom_aware(raw: bytes) -> str:
+    """Pure function to safely decode bytes to a string."""
+    if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
+        return raw.decode("utf-16", errors="replace")
+    if raw[:3] == b"\xef\xbb\xbf":
+        return raw[3:].decode("utf-8", errors="replace")
+    return raw.decode("utf-8", errors="replace")
+def read_text_bom_aware(path: Path) -> str:
+    """Reads a text file from disk and decodes it."""
+    return decode_bytes_bom_aware(path.read_bytes())
+def diff_exceeds_limit(text: str, max_bytes: int) -> bool:
+    """Return True if ``text`` exceeds ``max_bytes`` when encoded as UTF-8.
+    Measured in bytes to match ``max_size`` (content), so both limits speak the
+    same unit. ``max_bytes <= 0`` disables the check (never exceeds). Two cheap
+    bounds avoid encoding in the common cases: a str of N codepoints is between N
+    and 4N UTF-8 bytes, so ``4N <= max`` is always under and ``N > max`` is always
+    over; only the ambiguous middle band is actually encoded.
+    """
+    if max_bytes <= 0:
+        return False
+    if len(text) * 4 <= max_bytes:
+        return False
+    if len(text) > max_bytes:
+        return True
+    return len(text.encode("utf-8")) > max_bytes