codebatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebatch/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """CodeBatch - Content-addressed batch execution engine."""
2
+
3
+ __version__ = "0.1.0"
codebatch/batch.py ADDED
@@ -0,0 +1,366 @@
1
+ """Batch and task scaffolding generator.
2
+
3
+ A batch represents one execution attempt over a snapshot.
4
+ Batches are isolated, repeatable, and discardable.
5
+ """
6
+
7
+ import json
8
+ import uuid
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ from .common import SCHEMA_VERSION, PRODUCER, utc_now_z, BatchExistsError
14
+ from .snapshot import SnapshotBuilder
15
+
16
+
17
def generate_batch_id() -> str:
    """Generate a unique batch ID.

    Combines a UTC timestamp (for human-readable ordering) with a short
    random suffix (for uniqueness within the same second).

    Returns:
        Batch ID in format: batch-YYYYMMDD-HHMMSS-XXXX
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    nonce = uuid.uuid4().hex[:8]
    return "batch-{}-{}".format(stamp, nonce)
27
+
28
+
29
# Pipeline definitions.
# Each pipeline is a named, ordered list of task definitions. Every task
# carries an explicit "depends_on" list (empty for root tasks) so the
# emitted plan.json is uniform across pipelines and consumers never need
# to special-case a missing key.
PIPELINES = {
    "parse": {
        "description": "Parse source files and emit AST + diagnostics",
        "tasks": [
            {
                "task_id": "01_parse",
                "type": "parse",
                "depends_on": [],
                "config": {
                    "languages": ["python", "javascript", "typescript"],
                    "emit_ast": True,
                    "emit_diagnostics": True,
                },
            }
        ],
    },
    "analyze": {
        "description": "Parse and analyze source files",
        "tasks": [
            {
                "task_id": "01_parse",
                "type": "parse",
                "depends_on": [],
                "config": {
                    "languages": ["python", "javascript", "typescript"],
                    "emit_ast": True,
                    "emit_diagnostics": True,
                },
            },
            {
                "task_id": "02_analyze",
                "type": "analyze",
                "depends_on": ["01_parse"],
                "config": {},
            },
        ],
    },
    "full": {
        "description": "Complete Phase 2 pipeline: parse -> analyze -> symbols -> lint",
        "tasks": [
            {
                "task_id": "01_parse",
                "type": "parse",
                "depends_on": [],
                "config": {
                    "languages": ["python", "javascript", "typescript"],
                    "emit_ast": True,
                    "emit_diagnostics": True,
                },
            },
            {
                "task_id": "02_analyze",
                "type": "analyze",
                "depends_on": ["01_parse"],
                "config": {},
            },
            {
                "task_id": "03_symbols",
                "type": "symbols",
                "depends_on": ["01_parse"],
                "config": {},
            },
            {
                "task_id": "04_lint",
                "type": "lint",
                "depends_on": ["01_parse"],
                "config": {},
            },
        ],
    },
}
99
+
100
+
101
class BatchManager:
    """Manages batch creation and execution scaffolding.

    A batch is one execution attempt over a snapshot. All scaffolding
    (batch.json, plan.json, per-task and per-shard state files) is
    written up front by init_batch.
    """

    SHARD_COUNT = 256  # shards keyed by the first hex byte: 00-ff

    def __init__(self, store_root: Path):
        """Initialize the batch manager.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.batches_dir = self.store_root / "batches"
        self.snapshot_builder = SnapshotBuilder(store_root)

    @staticmethod
    def _write_json(path: Path, payload: dict) -> None:
        """Write *payload* to *path* as pretty-printed UTF-8 JSON."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

    @staticmethod
    def _read_json(path: Path) -> dict:
        """Read and parse the UTF-8 JSON file at *path*."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _generate_shard_ids(self) -> list[str]:
        """Generate all shard IDs (00-ff).

        Returns:
            List of SHARD_COUNT two-hex-digit shard IDs.
        """
        # Derive from SHARD_COUNT rather than hard-coding 256 so the
        # constant and the generated list can never drift apart.
        return [f"{i:02x}" for i in range(self.SHARD_COUNT)]

    def init_batch(
        self,
        snapshot_id: str,
        pipeline: str,
        batch_id: Optional[str] = None,
        metadata: Optional[dict] = None,
        allow_overwrite: bool = False,
    ) -> str:
        """Initialize a new batch with complete skeleton.

        Args:
            snapshot_id: Snapshot ID to execute against.
            pipeline: Pipeline name (e.g., 'parse', 'analyze').
            batch_id: Optional batch ID (auto-generated if not provided).
            metadata: Optional user metadata.
            allow_overwrite: If True, allow overwriting existing batch.

        Returns:
            The batch ID.

        Raises:
            ValueError: If snapshot or pipeline doesn't exist.
            BatchExistsError: If batch already exists and allow_overwrite=False.
        """
        # Verify snapshot exists; surface a domain-level error and drop
        # the low-level FileNotFoundError context.
        try:
            self.snapshot_builder.load_snapshot(snapshot_id)
        except FileNotFoundError:
            raise ValueError(f"Snapshot not found: {snapshot_id}") from None

        # Verify pipeline exists
        if pipeline not in PIPELINES:
            raise ValueError(f"Unknown pipeline: {pipeline}. Available: {list(PIPELINES.keys())}")

        if batch_id is None:
            batch_id = generate_batch_id()

        pipeline_def = PIPELINES[pipeline]
        shard_ids = self._generate_shard_ids()

        # Immutability enforcement: an existing directory - even an empty
        # one - indicates a prior attempt. Let mkdir itself detect the
        # collision so check-and-create is race-free (no TOCTOU gap
        # between an exists() probe and the directory creation).
        batch_dir = self.batches_dir / batch_id
        try:
            batch_dir.mkdir(parents=True, exist_ok=allow_overwrite)
        except FileExistsError:
            raise BatchExistsError(batch_id) from None

        created_at = utc_now_z()

        # Write batch.json
        batch_meta = {
            "schema_name": "codebatch.batch",
            "schema_version": SCHEMA_VERSION,
            "producer": PRODUCER,
            "batch_id": batch_id,
            "snapshot_id": snapshot_id,
            "created_at": created_at,
            "pipeline": pipeline,
            "status": "pending",
        }
        if metadata:
            batch_meta["metadata"] = metadata
        self._write_json(batch_dir / "batch.json", batch_meta)

        # Write plan.json
        self._write_json(
            batch_dir / "plan.json",
            {
                "schema_name": "codebatch.plan",
                "schema_version": SCHEMA_VERSION,
                "producer": PRODUCER,
                "batch_id": batch_id,
                "tasks": pipeline_def["tasks"],
            },
        )

        # Create empty events.jsonl
        (batch_dir / "events.jsonl").touch()

        # Create tasks directory and per-task scaffolding
        tasks_dir = batch_dir / "tasks"
        tasks_dir.mkdir(exist_ok=True)
        for task_def in pipeline_def["tasks"]:
            self._scaffold_task(tasks_dir, batch_id, task_def, shard_ids)

        return batch_id

    def _scaffold_task(
        self,
        tasks_dir: Path,
        batch_id: str,
        task_def: dict,
        shard_ids: list[str],
    ) -> None:
        """Create the directory skeleton and state files for one task."""
        task_id = task_def["task_id"]
        task_dir = tasks_dir / task_id
        task_dir.mkdir(exist_ok=True)

        # Write task.json
        self._write_json(
            task_dir / "task.json",
            {
                "schema_name": "codebatch.task",
                "schema_version": SCHEMA_VERSION,
                "producer": PRODUCER,
                "task_id": task_id,
                "batch_id": batch_id,
                "type": task_def["type"],
                "sharding": {
                    "strategy": "hash_prefix",
                    "shard_count": self.SHARD_COUNT,
                    "shard_ids": shard_ids,
                },
                "inputs": {
                    "snapshot": True,
                    # Root tasks may omit depends_on; default to no deps.
                    "tasks": task_def.get("depends_on", []),
                },
                "config": task_def.get("config", {}),
                "status": "pending",
            },
        )

        # Create empty events.jsonl
        (task_dir / "events.jsonl").touch()

        # Create shards directory with all shard subdirectories
        shards_dir = task_dir / "shards"
        shards_dir.mkdir(exist_ok=True)
        for shard_id in shard_ids:
            self._scaffold_shard(shards_dir, batch_id, task_id, shard_id)

    def _scaffold_shard(
        self,
        shards_dir: Path,
        batch_id: str,
        task_id: str,
        shard_id: str,
    ) -> None:
        """Create one shard directory with its initial state files."""
        shard_dir = shards_dir / shard_id
        shard_dir.mkdir(exist_ok=True)

        # Write initial state.json: every shard starts "ready" at attempt 0.
        self._write_json(
            shard_dir / "state.json",
            {
                "schema_name": "codebatch.shard_state",
                "schema_version": SCHEMA_VERSION,
                "producer": PRODUCER,
                "shard_id": shard_id,
                "task_id": task_id,
                "batch_id": batch_id,
                "status": "ready",
                "attempt": 0,
            },
        )

        # Create empty outputs.index.jsonl
        (shard_dir / "outputs.index.jsonl").touch()

    def load_batch(self, batch_id: str) -> dict:
        """Load batch metadata.

        Args:
            batch_id: Batch ID to load.

        Returns:
            Batch metadata dict.

        Raises:
            FileNotFoundError: If batch doesn't exist.
        """
        return self._read_json(self.batches_dir / batch_id / "batch.json")

    def load_plan(self, batch_id: str) -> dict:
        """Load batch execution plan.

        Args:
            batch_id: Batch ID to load.

        Returns:
            Plan dict.

        Raises:
            FileNotFoundError: If batch doesn't exist.
        """
        return self._read_json(self.batches_dir / batch_id / "plan.json")

    def load_task(self, batch_id: str, task_id: str) -> dict:
        """Load task metadata.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Returns:
            Task metadata dict.
        """
        return self._read_json(
            self.batches_dir / batch_id / "tasks" / task_id / "task.json"
        )

    def load_shard_state(self, batch_id: str, task_id: str, shard_id: str) -> dict:
        """Load shard state.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            shard_id: Shard ID.

        Returns:
            Shard state dict.
        """
        return self._read_json(
            self.batches_dir / batch_id / "tasks" / task_id / "shards" / shard_id / "state.json"
        )

    @staticmethod
    def _list_marked_dirs(parent: Path, marker: str) -> list[str]:
        """List names of subdirectories of *parent* containing *marker*.

        Returns [] when *parent* does not exist; directories without the
        marker file (incomplete scaffolding) are skipped.
        """
        if not parent.exists():
            return []
        return [
            d.name
            for d in parent.iterdir()
            if d.is_dir() and (d / marker).exists()
        ]

    def list_batches(self) -> list[str]:
        """List all batch IDs.

        Returns:
            List of batch IDs.
        """
        return self._list_marked_dirs(self.batches_dir, "batch.json")

    def get_task_ids(self, batch_id: str) -> list[str]:
        """Get task IDs for a batch.

        Args:
            batch_id: Batch ID.

        Returns:
            List of task IDs.
        """
        return self._list_marked_dirs(
            self.batches_dir / batch_id / "tasks", "task.json"
        )
codebatch/cas.py ADDED
@@ -0,0 +1,170 @@
1
+ """Content-Addressed Storage (CAS) object store.
2
+
3
+ Objects are stored at: objects/sha256/<aa>/<bb>/<full_hash>
4
+ Where <aa> and <bb> are the first two byte pairs of the hex hash.
5
+
6
+ Object references use canonical format: sha256:<hex>
7
+ """
8
+
9
+ import hashlib
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+ from .common import parse_object_ref, make_object_ref
15
+
16
+
17
class ObjectNotFoundError(Exception):
    """Raised when an object is not found in the store."""

    def __init__(self, object_ref: str):
        # Keep the ref on the exception so callers can report exactly
        # which object was missing.
        self.object_ref = object_ref
        message = f"Object not found: {object_ref}"
        super().__init__(message)
23
+
24
+
25
class ObjectStore:
    """Content-addressed object store using SHA-256.

    Objects live at objects/sha256/<aa>/<bb>/<full_hash>, where <aa> and
    <bb> are the first two byte pairs of the hex hash.
    """

    def __init__(self, store_root: Path):
        """Initialize the object store.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.objects_dir = self.store_root / "objects" / "sha256"

    def _hex_to_path(self, hex_hash: str) -> Path:
        """Get the filesystem path for a hex hash.

        Args:
            hex_hash: SHA-256 hex hash (64 characters).

        Returns:
            Path to the object file (objects/sha256/<aa>/<bb>/<hash>).
        """
        aa = hex_hash[:2]
        bb = hex_hash[2:4]
        return self.objects_dir / aa / bb / hex_hash

    def _object_path(self, object_ref: str) -> Path:
        """Get the filesystem path for an object reference.

        Args:
            object_ref: Object reference (sha256:<hex> or legacy bare hex).

        Returns:
            Path to the object file.

        Raises:
            ValueError: If object reference is invalid.
        """
        _, hex_hash = parse_object_ref(object_ref)
        return self._hex_to_path(hex_hash)

    def put_bytes(self, data: bytes) -> str:
        """Store bytes and return the canonical object reference.

        Safe under concurrent writers (threads and processes): each call
        stages into a unique temp file and promotes it atomically.

        Args:
            data: Raw bytes to store.

        Returns:
            Canonical object reference in format sha256:<hex>.
        """
        hex_hash = hashlib.sha256(data).hexdigest()
        object_path = self._hex_to_path(hex_hash)

        # Dedupe: if object already exists, skip write
        if object_path.exists():
            return make_object_ref(hex_hash)

        # Atomic write: write to temp file, then replace
        object_path.parent.mkdir(parents=True, exist_ok=True)

        # Unique temp name per call. PID alone is not enough: threads in
        # the same process share a PID and could clobber each other's
        # temp file, so add random bytes for per-call uniqueness.
        temp_name = f"{object_path.name}.tmp.{os.getpid()}.{os.urandom(8).hex()}"
        temp_path = object_path.parent / temp_name
        try:
            temp_path.write_bytes(data)
            try:
                # Use replace() for atomic overwrite (works on Windows)
                temp_path.replace(object_path)
            except OSError:
                # Race condition: another process wrote the same object.
                # CAS is content-addressed, so the result is identical.
                if object_path.exists():
                    # Object was written by another process, clean up our temp
                    if temp_path.exists():
                        temp_path.unlink()
                else:
                    # Actual error, re-raise
                    raise
        except Exception:
            # Clean up temp file on failure
            if temp_path.exists():
                try:
                    temp_path.unlink()
                except OSError:
                    pass
            raise

        return make_object_ref(hex_hash)

    def has(self, object_ref: str) -> bool:
        """Check if an object exists in the store.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            True if object exists, False otherwise.
        """
        try:
            return self._object_path(object_ref).exists()
        except ValueError:
            # Malformed references simply don't exist.
            return False

    def get_bytes(self, object_ref: str) -> bytes:
        """Retrieve bytes for an object reference.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            Raw bytes of the object.

        Raises:
            ObjectNotFoundError: If object does not exist.
        """
        object_path = self._object_path(object_ref)
        if not object_path.exists():
            raise ObjectNotFoundError(object_ref)
        return object_path.read_bytes()

    def get_path(self, object_ref: str) -> Optional[Path]:
        """Get the filesystem path for an object if it exists.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            Path to object file, or None if not found.
        """
        try:
            object_path = self._object_path(object_ref)
            return object_path if object_path.exists() else None
        except ValueError:
            # Malformed reference: treat the same as "not found".
            return None

    def get_hex(self, object_ref: str) -> str:
        """Extract the hex hash from an object reference.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            64-character hex hash.
        """
        _, hex_hash = parse_object_ref(object_ref)
        return hex_hash