codebatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebatch/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """CodeBatch - Content-addressed batch execution engine."""
2
+
3
+ __version__ = "0.1.0"
codebatch/batch.py ADDED
@@ -0,0 +1,366 @@
1
+ """Batch and task scaffolding generator.
2
+
3
+ A batch represents one execution attempt over a snapshot.
4
+ Batches are isolated, repeatable, and discardable.
5
+ """
6
+
7
+ import json
8
+ import uuid
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+ from typing import Optional
12
+
13
+ from .common import SCHEMA_VERSION, PRODUCER, utc_now_z, BatchExistsError
14
+ from .snapshot import SnapshotBuilder
15
+
16
+
17
def generate_batch_id() -> str:
    """Generate a unique batch ID.

    Combines a UTC timestamp (for human-readable ordering) with a short
    random suffix (for uniqueness within the same second).

    Returns:
        Batch ID in format: batch-YYYYMMDD-HHMMSS-XXXX
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    nonce = uuid.uuid4().hex[:8]
    return "batch-{}-{}".format(stamp, nonce)
27
+
28
+
29
# Pipeline definitions.
# Each pipeline is a named, ordered list of task definitions. Every task
# carries an explicit "depends_on" list (empty for root tasks) so the
# emitted plan.json is uniform across pipelines and consumers never need
# to special-case a missing key.
PIPELINES = {
    "parse": {
        "description": "Parse source files and emit AST + diagnostics",
        "tasks": [
            {
                "task_id": "01_parse",
                "type": "parse",
                "depends_on": [],
                "config": {
                    "languages": ["python", "javascript", "typescript"],
                    "emit_ast": True,
                    "emit_diagnostics": True,
                },
            }
        ],
    },
    "analyze": {
        "description": "Parse and analyze source files",
        "tasks": [
            {
                "task_id": "01_parse",
                "type": "parse",
                "depends_on": [],
                "config": {
                    "languages": ["python", "javascript", "typescript"],
                    "emit_ast": True,
                    "emit_diagnostics": True,
                },
            },
            {
                "task_id": "02_analyze",
                "type": "analyze",
                "depends_on": ["01_parse"],
                "config": {},
            },
        ],
    },
    "full": {
        "description": "Complete Phase 2 pipeline: parse -> analyze -> symbols -> lint",
        "tasks": [
            {
                "task_id": "01_parse",
                "type": "parse",
                "depends_on": [],
                "config": {
                    "languages": ["python", "javascript", "typescript"],
                    "emit_ast": True,
                    "emit_diagnostics": True,
                },
            },
            {
                "task_id": "02_analyze",
                "type": "analyze",
                "depends_on": ["01_parse"],
                "config": {},
            },
            {
                "task_id": "03_symbols",
                "type": "symbols",
                "depends_on": ["01_parse"],
                "config": {},
            },
            {
                "task_id": "04_lint",
                "type": "lint",
                "depends_on": ["01_parse"],
                "config": {},
            },
        ],
    },
}
99
+
100
+
101
class BatchManager:
    """Manages batch creation and execution scaffolding.

    A batch is one execution attempt over a snapshot. All scaffolding
    (batch.json, plan.json, per-task and per-shard state files) is
    written up front by init_batch.
    """

    SHARD_COUNT = 256  # shards keyed by the first hex byte: 00-ff

    def __init__(self, store_root: Path):
        """Initialize the batch manager.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.batches_dir = self.store_root / "batches"
        self.snapshot_builder = SnapshotBuilder(store_root)

    @staticmethod
    def _write_json(path: Path, payload: dict) -> None:
        """Write *payload* to *path* as pretty-printed UTF-8 JSON."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

    @staticmethod
    def _read_json(path: Path) -> dict:
        """Read and parse the UTF-8 JSON file at *path*."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    def _generate_shard_ids(self) -> list[str]:
        """Generate all shard IDs (00-ff).

        Returns:
            List of SHARD_COUNT two-hex-digit shard IDs.
        """
        # Derive from SHARD_COUNT rather than hard-coding 256 so the
        # constant and the generated list can never drift apart.
        return [f"{i:02x}" for i in range(self.SHARD_COUNT)]

    def init_batch(
        self,
        snapshot_id: str,
        pipeline: str,
        batch_id: Optional[str] = None,
        metadata: Optional[dict] = None,
        allow_overwrite: bool = False,
    ) -> str:
        """Initialize a new batch with complete skeleton.

        Args:
            snapshot_id: Snapshot ID to execute against.
            pipeline: Pipeline name (e.g., 'parse', 'analyze').
            batch_id: Optional batch ID (auto-generated if not provided).
            metadata: Optional user metadata.
            allow_overwrite: If True, allow overwriting existing batch.

        Returns:
            The batch ID.

        Raises:
            ValueError: If snapshot or pipeline doesn't exist.
            BatchExistsError: If batch already exists and allow_overwrite=False.
        """
        # Verify snapshot exists; surface a domain-level error and drop
        # the low-level FileNotFoundError context.
        try:
            self.snapshot_builder.load_snapshot(snapshot_id)
        except FileNotFoundError:
            raise ValueError(f"Snapshot not found: {snapshot_id}") from None

        # Verify pipeline exists
        if pipeline not in PIPELINES:
            raise ValueError(f"Unknown pipeline: {pipeline}. Available: {list(PIPELINES.keys())}")

        if batch_id is None:
            batch_id = generate_batch_id()

        pipeline_def = PIPELINES[pipeline]
        shard_ids = self._generate_shard_ids()

        # Immutability enforcement: an existing directory - even an empty
        # one - indicates a prior attempt. Let mkdir itself detect the
        # collision so check-and-create is race-free (no TOCTOU gap
        # between an exists() probe and the directory creation).
        batch_dir = self.batches_dir / batch_id
        try:
            batch_dir.mkdir(parents=True, exist_ok=allow_overwrite)
        except FileExistsError:
            raise BatchExistsError(batch_id) from None

        created_at = utc_now_z()

        # Write batch.json
        batch_meta = {
            "schema_name": "codebatch.batch",
            "schema_version": SCHEMA_VERSION,
            "producer": PRODUCER,
            "batch_id": batch_id,
            "snapshot_id": snapshot_id,
            "created_at": created_at,
            "pipeline": pipeline,
            "status": "pending",
        }
        if metadata:
            batch_meta["metadata"] = metadata
        self._write_json(batch_dir / "batch.json", batch_meta)

        # Write plan.json
        self._write_json(
            batch_dir / "plan.json",
            {
                "schema_name": "codebatch.plan",
                "schema_version": SCHEMA_VERSION,
                "producer": PRODUCER,
                "batch_id": batch_id,
                "tasks": pipeline_def["tasks"],
            },
        )

        # Create empty events.jsonl
        (batch_dir / "events.jsonl").touch()

        # Create tasks directory and per-task scaffolding
        tasks_dir = batch_dir / "tasks"
        tasks_dir.mkdir(exist_ok=True)
        for task_def in pipeline_def["tasks"]:
            self._scaffold_task(tasks_dir, batch_id, task_def, shard_ids)

        return batch_id

    def _scaffold_task(
        self,
        tasks_dir: Path,
        batch_id: str,
        task_def: dict,
        shard_ids: list[str],
    ) -> None:
        """Create the directory skeleton and state files for one task."""
        task_id = task_def["task_id"]
        task_dir = tasks_dir / task_id
        task_dir.mkdir(exist_ok=True)

        # Write task.json
        self._write_json(
            task_dir / "task.json",
            {
                "schema_name": "codebatch.task",
                "schema_version": SCHEMA_VERSION,
                "producer": PRODUCER,
                "task_id": task_id,
                "batch_id": batch_id,
                "type": task_def["type"],
                "sharding": {
                    "strategy": "hash_prefix",
                    "shard_count": self.SHARD_COUNT,
                    "shard_ids": shard_ids,
                },
                "inputs": {
                    "snapshot": True,
                    # Root tasks may omit depends_on; default to no deps.
                    "tasks": task_def.get("depends_on", []),
                },
                "config": task_def.get("config", {}),
                "status": "pending",
            },
        )

        # Create empty events.jsonl
        (task_dir / "events.jsonl").touch()

        # Create shards directory with all shard subdirectories
        shards_dir = task_dir / "shards"
        shards_dir.mkdir(exist_ok=True)
        for shard_id in shard_ids:
            self._scaffold_shard(shards_dir, batch_id, task_id, shard_id)

    def _scaffold_shard(
        self,
        shards_dir: Path,
        batch_id: str,
        task_id: str,
        shard_id: str,
    ) -> None:
        """Create one shard directory with its initial state files."""
        shard_dir = shards_dir / shard_id
        shard_dir.mkdir(exist_ok=True)

        # Write initial state.json: every shard starts "ready" at attempt 0.
        self._write_json(
            shard_dir / "state.json",
            {
                "schema_name": "codebatch.shard_state",
                "schema_version": SCHEMA_VERSION,
                "producer": PRODUCER,
                "shard_id": shard_id,
                "task_id": task_id,
                "batch_id": batch_id,
                "status": "ready",
                "attempt": 0,
            },
        )

        # Create empty outputs.index.jsonl
        (shard_dir / "outputs.index.jsonl").touch()

    def load_batch(self, batch_id: str) -> dict:
        """Load batch metadata.

        Args:
            batch_id: Batch ID to load.

        Returns:
            Batch metadata dict.

        Raises:
            FileNotFoundError: If batch doesn't exist.
        """
        return self._read_json(self.batches_dir / batch_id / "batch.json")

    def load_plan(self, batch_id: str) -> dict:
        """Load batch execution plan.

        Args:
            batch_id: Batch ID to load.

        Returns:
            Plan dict.

        Raises:
            FileNotFoundError: If batch doesn't exist.
        """
        return self._read_json(self.batches_dir / batch_id / "plan.json")

    def load_task(self, batch_id: str, task_id: str) -> dict:
        """Load task metadata.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Returns:
            Task metadata dict.
        """
        return self._read_json(
            self.batches_dir / batch_id / "tasks" / task_id / "task.json"
        )

    def load_shard_state(self, batch_id: str, task_id: str, shard_id: str) -> dict:
        """Load shard state.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            shard_id: Shard ID.

        Returns:
            Shard state dict.
        """
        return self._read_json(
            self.batches_dir / batch_id / "tasks" / task_id / "shards" / shard_id / "state.json"
        )

    @staticmethod
    def _list_marked_dirs(parent: Path, marker: str) -> list[str]:
        """List names of subdirectories of *parent* containing *marker*.

        Returns [] when *parent* does not exist; directories without the
        marker file (incomplete scaffolding) are skipped.
        """
        if not parent.exists():
            return []
        return [
            d.name
            for d in parent.iterdir()
            if d.is_dir() and (d / marker).exists()
        ]

    def list_batches(self) -> list[str]:
        """List all batch IDs.

        Returns:
            List of batch IDs.
        """
        return self._list_marked_dirs(self.batches_dir, "batch.json")

    def get_task_ids(self, batch_id: str) -> list[str]:
        """Get task IDs for a batch.

        Args:
            batch_id: Batch ID.

        Returns:
            List of task IDs.
        """
        return self._list_marked_dirs(
            self.batches_dir / batch_id / "tasks", "task.json"
        )
codebatch/cas.py ADDED
@@ -0,0 +1,170 @@
1
+ """Content-Addressed Storage (CAS) object store.
2
+
3
+ Objects are stored at: objects/sha256/<aa>/<bb>/<full_hash>
4
+ Where <aa> and <bb> are the first two byte pairs of the hex hash.
5
+
6
+ Object references use canonical format: sha256:<hex>
7
+ """
8
+
9
+ import hashlib
10
+ import os
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+ from .common import parse_object_ref, make_object_ref
15
+
16
+
17
class ObjectNotFoundError(Exception):
    """Raised when an object is not found in the store."""

    def __init__(self, object_ref: str):
        # Keep the ref on the exception so callers can report exactly
        # which object was missing.
        self.object_ref = object_ref
        message = f"Object not found: {object_ref}"
        super().__init__(message)
23
+
24
+
25
class ObjectStore:
    """Content-addressed object store using SHA-256.

    Objects live at objects/sha256/<aa>/<bb>/<full_hash>, where <aa> and
    <bb> are the first two byte pairs of the hex hash.
    """

    def __init__(self, store_root: Path):
        """Initialize the object store.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.objects_dir = self.store_root / "objects" / "sha256"

    def _hex_to_path(self, hex_hash: str) -> Path:
        """Get the filesystem path for a hex hash.

        Args:
            hex_hash: SHA-256 hex hash (64 characters).

        Returns:
            Path to the object file (objects/sha256/<aa>/<bb>/<hash>).
        """
        aa = hex_hash[:2]
        bb = hex_hash[2:4]
        return self.objects_dir / aa / bb / hex_hash

    def _object_path(self, object_ref: str) -> Path:
        """Get the filesystem path for an object reference.

        Args:
            object_ref: Object reference (sha256:<hex> or legacy bare hex).

        Returns:
            Path to the object file.

        Raises:
            ValueError: If object reference is invalid.
        """
        _, hex_hash = parse_object_ref(object_ref)
        return self._hex_to_path(hex_hash)

    def put_bytes(self, data: bytes) -> str:
        """Store bytes and return the canonical object reference.

        Safe under concurrent writers (threads and processes): each call
        stages into a unique temp file and promotes it atomically.

        Args:
            data: Raw bytes to store.

        Returns:
            Canonical object reference in format sha256:<hex>.
        """
        hex_hash = hashlib.sha256(data).hexdigest()
        object_path = self._hex_to_path(hex_hash)

        # Dedupe: if object already exists, skip write
        if object_path.exists():
            return make_object_ref(hex_hash)

        # Atomic write: write to temp file, then replace
        object_path.parent.mkdir(parents=True, exist_ok=True)

        # Unique temp name per call. PID alone is not enough: threads in
        # the same process share a PID and could clobber each other's
        # temp file, so add random bytes for per-call uniqueness.
        temp_name = f"{object_path.name}.tmp.{os.getpid()}.{os.urandom(8).hex()}"
        temp_path = object_path.parent / temp_name
        try:
            temp_path.write_bytes(data)
            try:
                # Use replace() for atomic overwrite (works on Windows)
                temp_path.replace(object_path)
            except OSError:
                # Race condition: another process wrote the same object.
                # CAS is content-addressed, so the result is identical.
                if object_path.exists():
                    # Object was written by another process, clean up our temp
                    if temp_path.exists():
                        temp_path.unlink()
                else:
                    # Actual error, re-raise
                    raise
        except Exception:
            # Clean up temp file on failure
            if temp_path.exists():
                try:
                    temp_path.unlink()
                except OSError:
                    pass
            raise

        return make_object_ref(hex_hash)

    def has(self, object_ref: str) -> bool:
        """Check if an object exists in the store.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            True if object exists, False otherwise.
        """
        try:
            return self._object_path(object_ref).exists()
        except ValueError:
            # Malformed references simply don't exist.
            return False

    def get_bytes(self, object_ref: str) -> bytes:
        """Retrieve bytes for an object reference.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            Raw bytes of the object.

        Raises:
            ObjectNotFoundError: If object does not exist.
        """
        object_path = self._object_path(object_ref)
        if not object_path.exists():
            raise ObjectNotFoundError(object_ref)
        return object_path.read_bytes()

    def get_path(self, object_ref: str) -> Optional[Path]:
        """Get the filesystem path for an object if it exists.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            Path to object file, or None if not found.
        """
        try:
            object_path = self._object_path(object_ref)
            return object_path if object_path.exists() else None
        except ValueError:
            # Malformed reference: treat the same as "not found".
            return None

    def get_hex(self, object_ref: str) -> str:
        """Extract the hex hash from an object reference.

        Args:
            object_ref: Object reference (sha256:<hex> or bare hex).

        Returns:
            64-character hex hash.
        """
        _, hex_hash = parse_object_ref(object_ref)
        return hex_hash