codecrate-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,388 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+
+
+ @dataclass(frozen=True)
+ class Part:
+     path: Path
+     content: str
+
+
+ def split_by_max_chars(markdown: str, out_path: Path, max_chars: int) -> list[Part]:
+     """Split markdown into multiple files limited by *approximate* character count.
+
+     For normal markdown, this splits on paragraph boundaries ("\n\n"). For Codecrate
+     context packs, it performs a semantic split that avoids breaking code fences and
+     rewrites intra-pack links so that an LLM can navigate across the generated parts.
+
+     Notes:
+         - The returned parts are intended for LLM consumption. The original,
+           unsplit markdown should still be written to ``out_path``
+           for machine parsing (unpack/validate).
+         - When splitting a Codecrate pack, markdown line ranges like ``(L123-150)`` are
+           removed because they are unstable across parts.
+     """
+     if max_chars <= 0 or len(markdown) <= max_chars:
+         return [Part(path=out_path, content=markdown)]
+
+     if _looks_like_codecrate_pack(markdown):
+         return _split_codecrate_pack(markdown, out_path, max_chars)
+
+     return _split_paragraphs(markdown, out_path, max_chars)
+
+
+ def _split_paragraphs(markdown: str, out_path: Path, max_chars: int) -> list[Part]:
+     parts: list[Part] = []
+     chunk: list[str] = []
+     chunk_len = 0
+     idx = 1
+
+     for block in markdown.split("\n\n"):
+         add = block + "\n\n"
+         if chunk_len + len(add) > max_chars and chunk:
+             part_path = out_path.with_name(
+                 f"{out_path.stem}.part{idx}{out_path.suffix}"
+             )
+             parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+             idx += 1
+             chunk = []
+             chunk_len = 0
+         chunk.append(add)
+         chunk_len += len(add)
+
+     if chunk:
+         part_path = out_path.with_name(f"{out_path.stem}.part{idx}{out_path.suffix}")
+         parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+
+     return parts
+
+
+ _FENCE_RE = re.compile(r"^```")
+ _FUNC_ANCHOR_RE = re.compile(r'^<a id="func-([0-9a-f]{8})"></a>\s*$')
+ _FILE_HEADING_RE = re.compile(r"^### `([^`]+)`")
+
+
+ def _looks_like_codecrate_pack(markdown: str) -> bool:
+     head = markdown.lstrip()[:200]
+     return head.startswith("# Codecrate Context Pack") and "## Files" in markdown
+
+
+ def _find_heading_line_index(lines: list[str], heading: str) -> int | None:
+     in_fence = False
+     for i, line in enumerate(lines):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+         if not in_fence and line.startswith(heading):
+             return i
+     return None
+
+
+ def _drop_section(text: str, heading: str) -> str:
+     """Drop a top-level '## ...' section from a Codecrate prefix (fence-safe)."""
+     lines = text.splitlines(keepends=True)
+     start = _find_heading_line_index(lines, heading)
+     if start is None:
+         return text
+
+     in_fence = False
+     end = len(lines)
+     for i in range(start + 1, len(lines)):
+         line = lines[i]
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             continue
+         if not in_fence and line.startswith("## "):
+             end = i
+             break
+     return "".join(lines[:start] + lines[end:])
+
+
+ def _split_codecrate_pack(markdown: str, out_path: Path, max_chars: int) -> list[Part]:
+     """Semantic split for Codecrate packs.
+
+     Strategy:
+     - Keep the "index" prefix (everything before the first content section: Function
+       Library or Files) in part1.
+     - Split the remaining content only at safe boundaries:
+         * function library entry anchors (<a id="func-..."></a>)
+         * file blocks (### `path`) inside the Files section
+         * section headings (## Function Library / ## Files)
+       while never splitting inside a fenced code block.
+     - Rewrite links across parts:
+         * Symbol Index links target the part that contains the relevant anchor.
+         * "jump to index" links in file blocks point back to part1.
+         * func jump links inside file symbol lists point to the part containing the
+           function library entry.
+     - Strip markdown line-range decorations like (L10-20) because they don't survive
+       splitting.
+     """
+     lines = markdown.splitlines(keepends=True)
+
+     idx_files = _find_heading_line_index(lines, "## Files")
+     idx_funcs = _find_heading_line_index(lines, "## Function Library")
+     if idx_files is None and idx_funcs is None:
+         return _split_paragraphs(markdown, out_path, max_chars)
+
+     content_start = min(i for i in [idx_files, idx_funcs] if i is not None)
+
+     # Parts are intended for LLM consumption; drop the Manifest to save tokens
+     # while keeping the unsplit output (written by the CLI) fully machine-readable.
+     prefix = "".join(lines[:content_start])
+     prefix = _drop_section(prefix, "## Manifest")
+     prefix = prefix.rstrip() + "\n"
+     content_lines = lines[content_start:]
+
+     breakpoints: list[int] = [0]
+     in_fence = False
+     in_files = False
+     for i, line in enumerate(content_lines):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+
+         if in_fence:
+             continue
+
+         if line.startswith("## Files"):
+             in_files = True
+             breakpoints.append(i)
+             continue
+         if line.startswith("## Function Library"):
+             breakpoints.append(i)
+             continue
+
+         if line.startswith('<a id="func-'):
+             breakpoints.append(i)
+             continue
+
+         if in_files and line.startswith("### `"):
+             breakpoints.append(i)
+
+     breakpoints = sorted(set(bp for bp in breakpoints if 0 <= bp < len(content_lines)))
+     if not breakpoints or breakpoints[0] != 0:
+         breakpoints = [0] + breakpoints
+     breakpoints.append(len(content_lines))
+
+     blocks: list[str] = []
+     for a, b in zip(breakpoints, breakpoints[1:], strict=False):
+         if a == b:
+             continue
+         blocks.append("".join(content_lines[a:b]))
+
+     parts: list[Part] = []
+     idx = 1
+     part1_path = out_path.with_name(f"{out_path.stem}.part{idx}{out_path.suffix}")
+     parts.append(Part(path=part1_path, content=prefix))
+     idx += 1
+
+     chunk: list[str] = []
+     chunk_len = 0
+     for block in blocks:
+         if chunk and chunk_len + len(block) > max_chars:
+             part_path = out_path.with_name(
+                 f"{out_path.stem}.part{idx}{out_path.suffix}"
+             )
+             parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+             idx += 1
+             chunk = []
+             chunk_len = 0
+         chunk.append(block)
+         chunk_len += len(block)
+
+     if chunk:
+         part_path = out_path.with_name(f"{out_path.stem}.part{idx}{out_path.suffix}")
+         parts.append(Part(path=part_path, content="".join(chunk).rstrip() + "\n"))
+
+     file_to_part: dict[str, str] = {}
+     func_to_part: dict[str, str] = {}
+     for part in parts[1:]:
+         _scan_part_for_anchors(part.content, part.path.name, file_to_part, func_to_part)
+
+     index_name = parts[0].path.name
+     parts[0] = Part(
+         path=parts[0].path,
+         content=_rewrite_part1(parts[0].content, file_to_part, func_to_part),
+     )
+
+     rewritten_parts: list[Part] = [parts[0]]
+     for part in parts[1:]:
+         text = part.content
+         text = _strip_markdown_line_ranges(text)
+         text = _rewrite_jump_to_index(text, index_name)
+         text = _rewrite_func_links(text, func_to_part)
+         rewritten_parts.append(Part(path=part.path, content=text))
+
+     return rewritten_parts
+
+
+ def _scan_part_for_anchors(
+     text: str,
+     part_filename: str,
+     file_to_part: dict[str, str],
+     func_to_part: dict[str, str],
+ ) -> None:
+     in_fence = False
+     for line in text.splitlines():
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             continue
+         if in_fence:
+             continue
+
+         m = _FUNC_ANCHOR_RE.match(line.strip())
+         if m:
+             func_to_part[m.group(1).upper()] = part_filename
+             continue
+
+         m2 = _FILE_HEADING_RE.match(line)
+         if m2:
+             file_to_part[m2.group(1)] = part_filename
+
+
+ def _strip_markdown_line_ranges(text: str) -> str:
+     out: list[str] = []
+     in_fence = False
+     for line in text.splitlines(keepends=True):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             out.append(line)
+             continue
+         if not in_fence:
+             line = re.sub(r"\s*\(L\d+-\d+\)", "", line)
+         out.append(line)
+     return "".join(out)
+
+
+ def _rewrite_jump_to_index(text: str, index_filename: str) -> str:
+     out: list[str] = []
+     in_fence = False
+     pat = re.compile(r"\[jump to index\]\(\#(file-[^)]+)\)")
+     for line in text.splitlines(keepends=True):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             out.append(line)
+             continue
+         if not in_fence:
+             line = pat.sub(rf"[jump to index]({index_filename}#\1)", line)
+         out.append(line)
+     return "".join(out)
+
+
+ def _rewrite_func_links(text: str, func_to_part: dict[str, str]) -> str:
+     out: list[str] = []
+     in_fence = False
+     pat = re.compile(r"\(\#(func-[0-9a-f]{8})\)")
+     for line in text.splitlines(keepends=True):
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+             out.append(line)
+             continue
+         if not in_fence and "(#func-" in line:
+
+             def repl(m: re.Match[str]) -> str:
+                 anchor = m.group(1)
+                 fid = anchor.split("-")[1].upper()
+                 part = func_to_part.get(fid)
+                 if not part:
+                     return m.group(0)
+                 return f"({part}#{anchor})"
+
+             line = pat.sub(repl, line)
+         out.append(line)
+     return "".join(out)
+
+
+ def _rewrite_part1(
+     text: str, file_to_part: dict[str, str], func_to_part: dict[str, str]
+ ) -> str:
+     lines = text.splitlines(keepends=True)
+     out: list[str] = []
+     in_fence = False
+     in_index = False
+     current_file_part: str | None = None
+
+     for line in lines:
+         if _FENCE_RE.match(line):
+             if not in_fence:
+                 in_fence = True
+             elif line.strip() == "```":
+                 in_fence = False
+
+         if not in_fence and line.startswith("## Symbol Index"):
+             in_index = True
+             out.append(line)
+             continue
+
+         if (
+             in_index
+             and not in_fence
+             and line.startswith("## ")
+             and not line.startswith("## Symbol Index")
+         ):
+             in_index = False
+             current_file_part = None
+             out.append(line)
+             continue
+
+         if not in_index or in_fence:
+             out.append(line)
+             continue
+
+         m_file = _FILE_HEADING_RE.match(line)
+         if m_file:
+             rel = m_file.group(1)
+             current_file_part = file_to_part.get(rel)
+             empty = " (empty)" if "(empty)" in line else ""
+             m_jump = re.search(r"\[jump\]\(\#([^)]+)\)", line)
+             anchor = m_jump.group(1) if m_jump else None
+             if current_file_part and anchor:
+                 out.append(
+                     f"### `{rel}`{empty} (in {current_file_part}) — "
+                     f"[jump]({current_file_part}#{anchor})\n"
+                 )
+             else:
+                 out.append(re.sub(r"\s*\(L\d+-\d+\)", "", line))
+             continue
+
+         if line.lstrip().startswith("- "):
+             ln = re.sub(r"\s*\(L\d+-\d+\)", "", line)
+             m = re.search(r"\[jump\]\(\#func-([0-9a-f]{8})\)", ln)
+             if m:
+                 fid = m.group(1).upper()
+                 part = func_to_part.get(fid) or current_file_part
+                 if part:
+                     ln = ln.replace("— [jump]", f"(in {part}) — [jump]")
+                     ln = re.sub(r"\(\#(func-[0-9a-f]{8})\)", rf"({part}#\1)", ln)
+                 out.append(ln)
+                 continue
+             if current_file_part:
+                 ln = ln.rstrip("\n") + f" (in {current_file_part})\n"
+             out.append(ln)
+             continue
+
+         out.append(re.sub(r"\s*\(L\d+-\d+\)", "", line))
+
+     return _strip_markdown_line_ranges("".join(out))
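
Taken together, split_by_max_chars falls back to paragraph splitting for ordinary markdown and only engages the semantic pack splitter when the Codecrate header is present. A minimal usage sketch follows; the diff above does not name the module that holds these functions inside the wheel, so the codecrate.split import path below is an assumption, while the function signature is exactly the one defined above.

from pathlib import Path

from codecrate.split import split_by_max_chars  # assumed module path

markdown = "\n\n".join(f"Paragraph {i}: " + "x" * 80 for i in range(50))
out_path = Path("notes.md")

# Keep the unsplit markdown on disk for machine parsing (unpack/validate),
# as the docstring above recommends; the parts are for LLM consumption.
out_path.write_text(markdown, encoding="utf-8")

# Plain markdown takes the paragraph-splitting path; each returned Part
# carries the path it should be written to (notes.part1.md, notes.part2.md, ...).
for part in split_by_max_chars(markdown, out_path, max_chars=1_000):
    part.path.write_text(part.content, encoding="utf-8")
    print(part.path.name, len(part.content))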
codecrate/udiff.py ADDED
@@ -0,0 +1,187 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Literal
+
+ _HUNK_RE = re.compile(r"^@@\s+-(\d+),?(\d*)\s+\+(\d+),?(\d*)\s+@@")
+
+
+ def normalize_newlines(s: str) -> str:
+     return s.replace("\r\n", "\n").replace("\r", "\n")
+
+
+ def ensure_parent_dir(path: Path) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+
+ @dataclass
+ class FileDiff:
+     path: str
+     hunks: list[list[str]]  # raw hunk lines including @@ header and +/-/space lines
+     op: Literal["add", "modify", "delete"]
+
+
+ def parse_unified_diff(diff_text: str) -> list[FileDiff]:
+     lines = normalize_newlines(diff_text).splitlines()
+     i = 0
+     out: list[FileDiff] = []
+
+     while i < len(lines):
+         if not lines[i].startswith("--- "):
+             i += 1
+             continue
+         if i + 1 >= len(lines):
+             break
+         if not lines[i + 1].startswith("+++ "):
+             i += 1
+             continue
+         from_raw = lines[i][4:].strip()
+         to_raw = lines[i + 1][4:].strip()
+
+         def _side(raw: str, prefix: str) -> str | None:
+             if raw == "/dev/null":
+                 return None
+             return raw[len(prefix) :] if raw.startswith(prefix) else raw
+
+         from_path = _side(from_raw, "a/")
+         to_path = _side(to_raw, "b/")
+
+         if from_path is None and to_path is None:
+             i += 2
+             continue
+
+         if from_path is None:
+             op: Literal["add", "modify", "delete"] = "add"
+             path = to_path or ""
+         elif to_path is None:
+             op = "delete"
+             path = from_path
+         else:
+             op = "modify"
+             path = to_path
+         i += 2
+
+         hunks: list[list[str]] = []
+         while i < len(lines):
+             if lines[i].startswith("--- "):
+                 break
+             if lines[i].startswith("@@"):
+                 h = [lines[i]]
+                 i += 1
+                 while (
+                     i < len(lines)
+                     and not lines[i].startswith("@@")
+                     and not lines[i].startswith("--- ")
+                 ):
+                     if lines[i].startswith((" ", "+", "-")):
+                         h.append(lines[i])
+                     i += 1
+                 hunks.append(h)
+             else:
+                 i += 1
+
+         out.append(FileDiff(path=path, hunks=hunks, op=op))
+
+     return out
+
+
+ def apply_hunks_to_text(old_text: str, hunks: list[list[str]]) -> str:
+     """
+     Minimal unified-diff applier.
+     - Expects hunks in order and matching context lines.
+     - Raises ValueError on mismatch.
+     """
+     old_lines = normalize_newlines(old_text).splitlines()
+     new_lines: list[str] = []
+     old_i = 0
+
+     for hunk in hunks:
+         m = _HUNK_RE.match(hunk[0])
+         if not m:
+             raise ValueError(f"Bad hunk header: {hunk[0]}")
+         old_start = int(m.group(1)) - 1  # 0-based
+
+         # copy unchanged prefix
+         if old_start < old_i and not (old_i == 0 and len(old_lines) == 0):
+             raise ValueError("Overlapping hunks")
+         new_lines.extend(old_lines[old_i:old_start])
+         old_i = old_start
+
+         # apply hunk lines
+         for line in hunk[1:]:
+             tag = line[:1]
+             payload = line[1:]
+             if tag == " ":
+                 if old_i >= len(old_lines) or old_lines[old_i] != payload:
+                     raise ValueError("Context mismatch while applying patch")
+                 new_lines.append(payload)
+                 old_i += 1
+             elif tag == "-":
+                 # Check if current line matches what we'd add (file already modified)
+                 current_line_matches_target = False
+                 if old_i < len(old_lines) and old_lines[old_i] != payload:
+                     # Look ahead to find what we're adding
+                     add_line = None
+                     for next_line in hunk[1:]:
+                         if next_line.startswith("+"):
+                             add_line = next_line[1:]  # Full add line with indentation
+                             break
+                     # If current line matches what we'd add (with/without stripping),
+                     # skip the delete operation
+                     if add_line is not None:
+                         current_matches_add = old_lines[old_i] == add_line
+                         current_matches_add_stripped = (
+                             old_lines[old_i].strip() == add_line.strip()
+                         )
+                         if current_matches_add or current_matches_add_stripped:
+                             current_line_matches_target = True
+
+                 if not current_line_matches_target:
+                     # Use original payload for comparison, but also try stripped version
+                     if old_i < len(old_lines) and old_lines[old_i] != payload:
+                         # Try stripped versions for fuzzy matching
+                         old_stripped = old_lines[old_i].strip()
+                         payload_stripped = payload.strip()
+                         if old_stripped != payload_stripped:
+                             raise ValueError("Delete mismatch while applying patch")
+                 old_i += 1
+             elif tag == "+":
+                 new_lines.append(payload)
+             else:
+                 raise ValueError(f"Unexpected diff tag: {tag}")
+
+     # copy remainder
+     new_lines.extend(old_lines[old_i:])
+     out = "\n".join(new_lines)
+     if old_text.endswith("\n") or (not old_text and out):
+         out += "\n"
+     return out
+
+
+ def apply_file_diffs(diffs: list[FileDiff], root: Path) -> list[Path]:
+     """
+     Applies diffs to files under root. Returns list of modified paths.
+     """
+     root = root.resolve()
+     changed: list[Path] = []
+
+     for fd in diffs:
+         path = root / fd.path
+
+         if fd.op == "delete":
+             if path.exists():
+                 path.unlink()
+             changed.append(path)
+             continue
+
+         old = ""
+         if path.exists():
+             old = path.read_text(encoding="utf-8", errors="replace")
+         new = apply_hunks_to_text(old, fd.hunks)
+         ensure_parent_dir(path)
+         path.write_text(new, encoding="utf-8")
+         changed.append(path)
+
+     return changed
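
A minimal round-trip sketch for the parser and applier above, importing from the codecrate/udiff.py module path shown in this diff. The file name hello.txt and the sandbox directory are illustrative only.

from pathlib import Path

from codecrate.udiff import apply_file_diffs, parse_unified_diff

diff_text = """\
--- a/hello.txt
+++ b/hello.txt
@@ -1,2 +1,2 @@
 hello
-world
+there
"""

# Seed a file, then apply a one-hunk "modify" diff against it.
root = Path("sandbox")
root.mkdir(exist_ok=True)
(root / "hello.txt").write_text("hello\nworld\n", encoding="utf-8")

changed = apply_file_diffs(parse_unified_diff(diff_text), root)
print([p.name for p in changed])         # ['hello.txt']
print((root / "hello.txt").read_text())  # "hello\nthere\n"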