codebatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebatch/snapshot.py ADDED
@@ -0,0 +1,340 @@
1
+ """Snapshot builder for creating immutable snapshots of directory sources.
2
+
3
+ A snapshot represents a frozen view of an input source at a specific point in time.
4
+ Snapshots are immutable once written.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import uuid
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+ from typing import Iterator, Optional
13
+
14
+ from .cas import ObjectStore
15
+ from .common import SCHEMA_VERSION, PRODUCER, utc_now_z, SnapshotExistsError, object_shard_prefix
16
+ from .paths import canonicalize_path, compute_path_key, PathEscapeError, InvalidPathError, detect_case_collision
17
+
18
+
19
# Language detection by extension.
# Defined as an ordered (extension, language) pair list and materialized into
# a dict, preserving the original insertion order. Note ".R" is kept uppercase
# for callers that look up extensions without lowercasing first.
_EXT_LANG_PAIRS = [
    (".py", "python"),
    (".js", "javascript"),
    (".ts", "typescript"),
    (".tsx", "typescript"),
    (".jsx", "javascript"),
    (".cs", "csharp"),
    (".java", "java"),
    (".go", "go"),
    (".rs", "rust"),
    (".c", "c"),
    (".cpp", "cpp"),
    (".cc", "cpp"),
    (".h", "c"),
    (".hpp", "cpp"),
    (".rb", "ruby"),
    (".php", "php"),
    (".swift", "swift"),
    (".kt", "kotlin"),
    (".scala", "scala"),
    (".r", "r"),
    (".R", "r"),
    (".sql", "sql"),
    (".sh", "shell"),
    (".bash", "shell"),
    (".zsh", "shell"),
    (".ps1", "powershell"),
    (".md", "markdown"),
    (".json", "json"),
    (".yaml", "yaml"),
    (".yml", "yaml"),
    (".xml", "xml"),
    (".html", "html"),
    (".css", "css"),
    (".scss", "scss"),
    (".sass", "sass"),
    (".less", "less"),
]
LANG_HINTS = dict(_EXT_LANG_PAIRS)
58
+
59
+
60
def detect_lang_hint(path: str) -> Optional[str]:
    """Detect language hint from file extension.

    Args:
        path: File path.

    Returns:
        Language hint string, or None if unknown.
    """
    # Lowercase the extension so lookups are case-insensitive
    # (e.g. "FOO.PY" still maps to python).
    _, extension = os.path.splitext(path)
    return LANG_HINTS.get(extension.lower())
71
+
72
+
73
def generate_snapshot_id() -> str:
    """Generate a unique snapshot ID.

    Returns:
        Snapshot ID in format: snap-YYYYMMDD-HHMMSS-XXXX
    """
    # A UTC timestamp gives rough chronological ordering; the random hex
    # suffix avoids collisions between snapshots created in the same second.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    suffix = uuid.uuid4().hex[:8]
    return "snap-{}-{}".format(stamp, suffix)
83
+
84
+
85
class SnapshotBuilder:
    """Builds immutable snapshots from directory sources.

    A snapshot pairs a metadata document (``snapshot.json``) with a
    deterministic, path_key-sorted file index (``files.index.jsonl``);
    file contents are deduplicated into the content-addressed object store.
    """

    def __init__(self, store_root: Path):
        """Initialize the snapshot builder.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.object_store = ObjectStore(store_root)
        self.snapshots_dir = self.store_root / "snapshots"

    def _walk_directory(
        self,
        source_dir: Path,
        include_hidden: bool = False,
    ) -> Iterator[tuple[Path, str]]:
        """Walk a directory and yield (file_path, relative_path) pairs.

        Args:
            source_dir: Directory to walk.
            include_hidden: If True, include hidden (dot-prefixed) files/dirs.

        Yields:
            Tuples of (absolute_path, relative_path).
        """
        source_dir = source_dir.resolve()

        for root, dirs, files in os.walk(source_dir):
            if not include_hidden:
                # Prune hidden directories in place so os.walk never
                # descends into them.
                dirs[:] = [d for d in dirs if not d.startswith(".")]

            root_path = Path(root)
            for file in files:
                # Skip hidden files unless configured to include them.
                if not include_hidden and file.startswith("."):
                    continue

                file_path = root_path / file
                try:
                    rel_path = file_path.relative_to(source_dir)
                    yield file_path, str(rel_path)
                except ValueError:
                    # File not under source_dir (shouldn't happen).
                    continue

    def build(
        self,
        source_dir: Path,
        snapshot_id: Optional[str] = None,
        metadata: Optional[dict] = None,
        include_hidden: bool = False,
        allow_overwrite: bool = False,
    ) -> str:
        """Build a snapshot from a directory.

        Args:
            source_dir: Directory to snapshot.
            snapshot_id: Optional snapshot ID (auto-generated if not provided).
            metadata: Optional user metadata to include.
            include_hidden: If True, include hidden files/dirs.
            allow_overwrite: If True, allow overwriting existing snapshot
                (default False).

        Returns:
            The snapshot ID.

        Raises:
            SnapshotExistsError: If snapshot already exists and
                allow_overwrite=False.
            ValueError: If source is not a directory.
        """
        source_dir = Path(source_dir).resolve()

        if not source_dir.is_dir():
            raise ValueError(f"Source is not a directory: {source_dir}")

        if snapshot_id is None:
            snapshot_id = generate_snapshot_id()

        # Immutability enforcement: fail if the directory exists at all --
        # even an empty dir indicates a prior attempt.
        snapshot_dir = self.snapshots_dir / snapshot_id
        if snapshot_dir.exists() and not allow_overwrite:
            raise SnapshotExistsError(snapshot_id)

        snapshot_dir.mkdir(parents=True, exist_ok=True)

        # Collect file records and track per-file diagnostics.
        file_records = []
        skipped_files = []
        total_bytes = 0

        for file_path, rel_path in self._walk_directory(source_dir, include_hidden):
            try:
                canonical_path = canonicalize_path(rel_path)
                path_key = compute_path_key(canonical_path)

                # Store content in the CAS; the returned ref identifies it.
                data = file_path.read_bytes()
                object_ref = self.object_store.put_bytes(data)
                size = len(data)
                total_bytes += size

                record = {
                    "schema_version": SCHEMA_VERSION,
                    "path": canonical_path,
                    "path_key": path_key,
                    "object": object_ref,
                    "size": size,
                }

                # Optional language hint derived from the file extension.
                lang_hint = detect_lang_hint(canonical_path)
                if lang_hint:
                    record["lang_hint"] = lang_hint

                file_records.append(record)

            except (PathEscapeError, InvalidPathError) as e:
                skipped_files.append({
                    "path": rel_path,
                    "reason": "invalid_path",
                    "message": str(e),
                })
            except OSError as e:
                skipped_files.append({
                    "path": rel_path,
                    "reason": "unreadable",
                    "message": str(e),
                })

        # Warn about paths differing only by case (problematic on
        # case-insensitive filesystems).
        all_paths = [r["path"] for r in file_records]
        collision_warnings = [
            {
                "paths": [p1, p2],
                "reason": "case_collision",
                "message": f"Paths differ only by case: {p1} vs {p2}",
            }
            for p1, p2 in detect_case_collision(all_paths)
        ]

        # Sort records by path_key for deterministic output.
        file_records.sort(key=lambda r: r["path_key"])

        # Write files.index.jsonl: one compact JSON record per line.
        index_path = snapshot_dir / "files.index.jsonl"
        with open(index_path, "w", encoding="utf-8") as f:
            for record in file_records:
                f.write(json.dumps(record, ensure_ascii=False, separators=(",", ":")))
                f.write("\n")

        # Write snapshot.json (top-level snapshot metadata).
        snapshot_meta = {
            "schema_name": "codebatch.snapshot",
            "schema_version": SCHEMA_VERSION,
            "producer": PRODUCER,
            "snapshot_id": snapshot_id,
            "created_at": utc_now_z(),
            "source": {
                "type": "directory",
                "path": str(source_dir),
            },
            "file_count": len(file_records),
            "total_bytes": total_bytes,
            "config": {
                "include_hidden": include_hidden,
            },
        }

        if metadata:
            snapshot_meta["metadata"] = metadata

        # Surface skips and collisions as warnings in the metadata.
        if skipped_files or collision_warnings:
            snapshot_meta["warnings"] = []
            snapshot_meta["warnings"].extend(skipped_files)
            snapshot_meta["warnings"].extend(collision_warnings)

        snapshot_json_path = snapshot_dir / "snapshot.json"
        with open(snapshot_json_path, "w", encoding="utf-8") as f:
            json.dump(snapshot_meta, f, indent=2)
            # Trailing newline for consistency with store.json.
            f.write("\n")

        return snapshot_id

    def load_snapshot(self, snapshot_id: str) -> dict:
        """Load snapshot metadata.

        Args:
            snapshot_id: Snapshot ID to load.

        Returns:
            Snapshot metadata dict.

        Raises:
            FileNotFoundError: If snapshot doesn't exist.
        """
        snapshot_path = self.snapshots_dir / snapshot_id / "snapshot.json"
        with open(snapshot_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def load_file_index(self, snapshot_id: str) -> list[dict]:
        """Load file index records.

        Args:
            snapshot_id: Snapshot ID to load.

        Returns:
            List of file index records.

        Raises:
            FileNotFoundError: If snapshot doesn't exist.
        """
        # Delegate to the streaming reader so the JSONL parsing logic
        # lives in exactly one place.
        return list(self.iter_file_index(snapshot_id))

    def iter_file_index(self, snapshot_id: str) -> Iterator[dict]:
        """Stream file index records without loading all into memory.

        Args:
            snapshot_id: Snapshot ID.

        Yields:
            File index record dicts.

        Raises:
            FileNotFoundError: If snapshot doesn't exist.
        """
        index_path = self.snapshots_dir / snapshot_id / "files.index.jsonl"
        with open(index_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def list_snapshots(self) -> list[str]:
        """List all snapshot IDs.

        Returns:
            Sorted list of snapshot IDs (directories containing a
            snapshot.json). Sorted for deterministic output across
            filesystems.
        """
        if not self.snapshots_dir.exists():
            return []

        return sorted(
            d.name
            for d in self.snapshots_dir.iterdir()
            if d.is_dir() and (d / "snapshot.json").exists()
        )
codebatch/store.py ADDED
@@ -0,0 +1,162 @@
1
+ """Store initialization and validation.
2
+
3
+ A CodeBatch store is a directory with a specific layout:
4
+ <store_root>/
5
+ store.json # Store metadata
6
+ objects/ # Content-addressed objects
7
+ snapshots/ # Frozen input state
8
+ batches/ # Execution attempts
9
+ indexes/ # Optional acceleration (not required for correctness)
10
+ """
11
+
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ from .common import SCHEMA_VERSION, PRODUCER, utc_now_z
17
+
18
+
19
class StoreExistsError(Exception):
    """Raised when attempting to initialize a store that already exists."""

    def __init__(self, store_root: Path):
        # Keep the offending root on the exception so callers can
        # handle it programmatically.
        self.store_root = store_root
        message = f"Store already exists: {store_root}"
        super().__init__(message)
25
+
26
+
27
class InvalidStoreError(Exception):
    """Raised when a store is missing or invalid."""

    def __init__(self, store_root: Path, reason: str):
        # Retain both pieces so callers can report or branch on them.
        message = f"Invalid store at {store_root}: {reason}"
        super().__init__(message)
        self.store_root = store_root
        self.reason = reason
34
+
35
+
36
def init_store(store_root: Path, *, allow_reinit: bool = False) -> dict:
    """Initialize a new CodeBatch store.

    Creates the directory structure and store.json file.

    Args:
        store_root: Root directory for the store.
        allow_reinit: If True, proceed even when the directory already
            contains files (as long as no store.json exists yet).

    Returns:
        The store metadata dict.

    Raises:
        StoreExistsError: If store.json already exists, or if the directory
            is non-empty and allow_reinit is False.
    """
    root = Path(store_root)
    marker = root / "store.json"

    # An existing store.json always wins: stores are never re-created
    # in place, regardless of allow_reinit.
    if marker.exists():
        raise StoreExistsError(root)

    # A non-empty directory without store.json is suspicious; only proceed
    # when the caller explicitly opted in.
    if root.exists() and not allow_reinit and any(root.iterdir()):
        raise StoreExistsError(root)

    # Create the directory layout (objects/sha256, snapshots, batches).
    for sub in ("objects/sha256", "snapshots", "batches"):
        (root / sub).mkdir(parents=True, exist_ok=True)

    store_meta = {
        "schema_name": "codebatch.store",
        "schema_version": SCHEMA_VERSION,
        "producer": PRODUCER.copy(),
        "created_at": utc_now_z(),
    }

    with open(marker, "w", encoding="utf-8") as f:
        json.dump(store_meta, f, indent=2)
        f.write("\n")

    return store_meta
83
+
84
+
85
def load_store(store_root: Path) -> dict:
    """Load and validate store metadata.

    Args:
        store_root: Root directory of the store.

    Returns:
        The store metadata dict.

    Raises:
        InvalidStoreError: If store is missing or invalid.
    """
    store_root = Path(store_root)
    store_json_path = store_root / "store.json"

    if not store_root.exists():
        raise InvalidStoreError(store_root, "directory does not exist")

    if not store_json_path.exists():
        raise InvalidStoreError(store_root, "missing store.json")

    try:
        with open(store_json_path, "r", encoding="utf-8") as f:
            store_meta = json.load(f)
    except json.JSONDecodeError as e:
        # Chain the decode error so the original line/column info from the
        # JSON parser is preserved in the traceback.
        raise InvalidStoreError(store_root, f"invalid JSON in store.json: {e}") from e

    # Validate required fields before callers rely on them.
    if store_meta.get("schema_name") != "codebatch.store":
        raise InvalidStoreError(
            store_root,
            f"invalid schema_name: {store_meta.get('schema_name')}"
        )

    # NOTE(review): isinstance(True, int) is True in Python, so a boolean
    # schema_version would pass this check -- confirm whether that matters.
    if not isinstance(store_meta.get("schema_version"), int):
        raise InvalidStoreError(
            store_root,
            f"invalid schema_version: {store_meta.get('schema_version')}"
        )

    return store_meta
126
+
127
+
128
def ensure_store(store_root: Path) -> dict:
    """Ensure a store exists, initializing if necessary.

    This is the recommended way to get a store reference when you
    don't care if it's new or existing.

    Args:
        store_root: Root directory for the store.

    Returns:
        The store metadata dict.
    """
    root = Path(store_root)
    # store.json is the marker file that distinguishes an initialized store.
    if (root / "store.json").exists():
        return load_store(root)
    return init_store(root)
147
+
148
+
149
def is_valid_store(store_root: Path) -> bool:
    """Check if a directory is a valid CodeBatch store.

    Args:
        store_root: Root directory to check.

    Returns:
        True if valid store, False otherwise.
    """
    # FileNotFoundError covers races where store.json vanishes between
    # load_store's existence check and its open().
    try:
        load_store(store_root)
    except (InvalidStoreError, FileNotFoundError):
        return False
    return True
@@ -0,0 +1,37 @@
1
+ """Task executors registry."""
2
+
3
+ from typing import Callable
4
+
5
+ from ..runner import ShardRunner
6
+
7
+
8
+ # Task executor type: (config, files, runner) -> list[output_records]
9
+ TaskExecutor = Callable[[dict, list[dict], ShardRunner], list[dict]]
10
+
11
+
12
def get_executor(task_id: str) -> TaskExecutor:
    """Get the executor function for a task.

    Imports are deferred to the matching branch so loading one task's
    executor does not pull in the dependencies of the others.

    Args:
        task_id: Task ID (e.g., '01_parse').

    Returns:
        Executor function.

    Raises:
        ValueError: If task executor not found.
    """
    if task_id == "01_parse":
        from .parse import parse_executor
        return parse_executor
    if task_id == "02_analyze":
        from .analyze import analyze_executor
        return analyze_executor
    if task_id == "03_symbols":
        from .symbols import symbols_executor
        return symbols_executor
    if task_id == "04_lint":
        from .lint import lint_executor
        return lint_executor
    raise ValueError(f"Unknown task: {task_id}")
@@ -0,0 +1,109 @@
1
+ """Analyze task executor - produces file-level metrics.
2
+
3
+ Emits:
4
+ - kind=metric: File-level metrics (bytes, loc, lang, parse_status)
5
+
6
+ Inputs:
7
+ - Snapshot file records for this shard
8
+ - Optionally parse outputs to determine parse_status
9
+
10
+ Metrics are stable and cheap - no deep analysis.
11
+ """
12
+
13
+ from typing import Iterable, Optional
14
+
15
+ from ..runner import ShardRunner
16
+
17
+
18
+ def count_lines(content: str) -> int:
19
+ """Count lines of code (non-empty lines)."""
20
+ lines = content.split('\n')
21
+ return sum(1 for line in lines if line.strip())
22
+
23
+
24
def analyze_executor(config: dict, files: Iterable[dict], runner: ShardRunner) -> list[dict]:
    """Execute the analyze task.

    Produces file-level metrics for each file in the shard:
    - bytes: File size (length of the stored object)
    - loc: Lines of code (non-empty lines, text files only)
    - lang: Language hint from the snapshot record ('unknown' if absent)
    - error: Emitted instead of the above when a file cannot be read

    NOTE(review): an earlier draft sketched a parse_status metric gated on
    config["check_parse_status"], but the status map was never populated or
    emitted; that dead scaffolding has been removed. Re-add once parse
    outputs are actually available to this executor.

    Args:
        config: Task configuration (currently unused).
        files: Iterable of file records for this shard.
        runner: ShardRunner for CAS access.

    Returns:
        List of metric output records.
    """
    outputs = []

    # Single pass over the records; no need to materialize the iterable.
    for file_record in files:
        path = file_record["path"]
        object_ref = file_record["object"]
        lang_hint = file_record.get("lang_hint", "unknown")

        try:
            # Fetch file content from the CAS.
            data = runner.object_store.get_bytes(object_ref)

            # LOC is only meaningful for text; binary files get no loc metric.
            loc: Optional[int] = None
            try:
                loc = count_lines(data.decode("utf-8"))
            except UnicodeDecodeError:
                pass

            outputs.append({
                "kind": "metric",
                "path": path,
                "metric": "bytes",
                "value": len(data),
            })

            if loc is not None:
                outputs.append({
                    "kind": "metric",
                    "path": path,
                    "metric": "loc",
                    "value": loc,
                })

            outputs.append({
                "kind": "metric",
                "path": path,
                "metric": "lang",
                "value": lang_hint,
            })

        except Exception as e:
            # Best-effort per-file handling: a broken file becomes an error
            # metric rather than aborting the whole shard.
            outputs.append({
                "kind": "metric",
                "path": path,
                "metric": "error",
                "value": str(e),
            })

    return outputs