codebatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebatch/cli.py ADDED
@@ -0,0 +1,432 @@
1
+ """Command-line interface for CodeBatch."""
2
+
3
+ import argparse
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from .batch import BatchManager, PIPELINES
9
+ from .query import QueryEngine
10
+ from .runner import ShardRunner
11
+ from .snapshot import SnapshotBuilder
12
+ from .store import init_store, ensure_store, StoreExistsError, InvalidStoreError
13
+
14
+
15
def cmd_init(args: argparse.Namespace) -> int:
    """Create a new store at ``args.store`` and report the outcome.

    Args:
        args: Parsed CLI arguments; reads ``store`` and ``verbose``.

    Returns:
        0 when the store was initialized, 1 when ``init_store`` raised
        ``StoreExistsError`` (the message goes to stderr).
    """
    root = Path(args.store)

    try:
        meta = init_store(root)
    except StoreExistsError:
        print(f"Error: Store already exists: {root}", file=sys.stderr)
        return 1

    print(f"Initialized store: {root}")
    if args.verbose:
        # Extra detail comes straight from the metadata init_store returned.
        print(f" Schema version: {meta['schema_version']}")
        print(f" Created: {meta['created_at']}")
    return 0
29
+
30
+
31
def cmd_snapshot(args: argparse.Namespace) -> int:
    """Handle the snapshot command.

    Validates the source directory and the store, parses the optional JSON
    metadata, builds the snapshot and prints its ID.

    Args:
        args: Parsed CLI arguments; reads ``source``, ``store``, ``metadata``,
            ``id`` and ``verbose``.

    Returns:
        0 on success; 1 when the source is not a directory, the store is
        invalid, the metadata is not valid JSON, or the snapshot already
        exists. Every error message is written to stderr.
    """
    # Imported here so the handler is self-contained; the exception class
    # lives in the package's common module.
    from .common import SnapshotExistsError

    source_dir = Path(args.source)
    store_root = Path(args.store)

    if not source_dir.is_dir():
        print(f"Error: Source is not a directory: {source_dir}", file=sys.stderr)
        return 1

    # Ensure store is initialized
    try:
        ensure_store(store_root)
    except InvalidStoreError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    builder = SnapshotBuilder(store_root)

    metadata = None
    if args.metadata:
        try:
            metadata = json.loads(args.metadata)
        except json.JSONDecodeError as e:
            print(f"Error: Invalid JSON metadata: {e}", file=sys.stderr)
            return 1

    try:
        snapshot_id = builder.build(
            source_dir,
            snapshot_id=args.id,
            metadata=metadata,
        )
    except SnapshotExistsError as e:
        # NOTE(review): presumably raised by build() when --id names an
        # existing snapshot - confirm against SnapshotBuilder. Reported like
        # every other CLI error instead of surfacing as a traceback.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Created snapshot: {snapshot_id}")

    if args.verbose:
        snapshot = builder.load_snapshot(snapshot_id)
        print(f" Files: {snapshot['file_count']}")
        print(f" Total bytes: {snapshot['total_bytes']}")

    return 0
71
+
72
+
73
def cmd_snapshot_list(args: argparse.Namespace) -> int:
    """Print the IDs of all snapshots in the store, sorted.

    Args:
        args: Parsed CLI arguments; reads ``store`` and ``verbose``.

    Returns:
        1 when the store path does not exist (message on stderr), else 0.
    """
    root = Path(args.store)
    if not root.exists():
        print(f"Error: Store does not exist: {root}", file=sys.stderr)
        return 1

    builder = SnapshotBuilder(root)
    snapshot_ids = builder.list_snapshots()
    if not snapshot_ids:
        print("No snapshots found.")
        return 0

    for sid in sorted(snapshot_ids):
        if not args.verbose:
            print(sid)
            continue
        # Verbose mode loads each snapshot record to show its size figures.
        info = builder.load_snapshot(sid)
        print(f"{sid} files={info['file_count']} bytes={info['total_bytes']}")

    return 0
96
+
97
+
98
def cmd_snapshot_show(args: argparse.Namespace) -> int:
    """Show one snapshot, either as JSON or as a short text summary.

    Args:
        args: Parsed CLI arguments; reads ``store``, ``id``, ``json`` and
            ``files``.

    Returns:
        1 when loading the snapshot raises ``FileNotFoundError`` (message on
        stderr), else 0.
    """
    builder = SnapshotBuilder(Path(args.store))
    snapshot_id = args.id

    try:
        snapshot = builder.load_snapshot(snapshot_id)
    except FileNotFoundError:
        print(f"Error: Snapshot not found: {snapshot_id}", file=sys.stderr)
        return 1

    if args.json:
        # JSON mode emits the raw record and nothing else.
        print(json.dumps(snapshot, indent=2))
        return 0

    summary = (
        f"Snapshot: {snapshot_id}",
        f" Created: {snapshot['created_at']}",
        f" Source: {snapshot['source']['path']}",
        f" Files: {snapshot['file_count']}",
        f" Total bytes: {snapshot['total_bytes']}",
    )
    for text in summary:
        print(text)

    if args.files:
        print("\nFiles:")
        for entry in builder.load_file_index(snapshot_id):
            # The language hint is optional; show it only when present.
            hint = f" [{entry['lang_hint']}]" if entry.get("lang_hint") else ""
            print(f" {entry['path']} ({entry['size']} bytes){hint}")

    return 0
128
+
129
+
130
def main(argv: list[str] = None) -> int:
    """Main entry point.

    Builds the ``codebatch`` argument parser, registers one sub-parser per
    command and dispatches to the handler stored in ``func``.

    Args:
        argv: Argument list to parse. ``None`` is passed through to
            ``parse_args``, which then reads the process arguments.

    Returns:
        The handler's integer exit code, or 0 after printing help when no
        command was given.
    """
    parser = argparse.ArgumentParser(
        prog="codebatch",
        description="Content-addressed batch execution engine",
    )
    parser.add_argument("--version", action="version", version="%(prog)s 0.1.0")

    # dest="command" lets the code below detect that no subcommand was given.
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # init command
    init_parser = subparsers.add_parser("init", help="Initialize a new store")
    init_parser.add_argument("store", help="Store root directory to initialize")
    init_parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    init_parser.set_defaults(func=cmd_init)

    # snapshot command
    snapshot_parser = subparsers.add_parser("snapshot", help="Create a snapshot")
    snapshot_parser.add_argument("source", help="Source directory to snapshot")
    snapshot_parser.add_argument("--store", required=True, help="Store root directory")
    snapshot_parser.add_argument("--id", help="Snapshot ID (auto-generated if not provided)")
    snapshot_parser.add_argument("--metadata", help="JSON metadata to include")
    snapshot_parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    snapshot_parser.set_defaults(func=cmd_snapshot)

    # snapshot list command
    list_parser = subparsers.add_parser("snapshot-list", help="List snapshots")
    list_parser.add_argument("--store", required=True, help="Store root directory")
    list_parser.add_argument("-v", "--verbose", action="store_true", help="Show details")
    list_parser.set_defaults(func=cmd_snapshot_list)

    # snapshot show command
    show_parser = subparsers.add_parser("snapshot-show", help="Show snapshot details")
    show_parser.add_argument("id", help="Snapshot ID")
    show_parser.add_argument("--store", required=True, help="Store root directory")
    show_parser.add_argument("--json", action="store_true", help="Output as JSON")
    show_parser.add_argument("--files", action="store_true", help="List files in snapshot")
    show_parser.set_defaults(func=cmd_snapshot_show)

    # batch init command ("init" is the only action the "batch" command accepts)
    batch_init_parser = subparsers.add_parser("batch", help="Initialize a batch")
    batch_init_parser.add_argument("action", choices=["init"], help="Batch action")
    batch_init_parser.add_argument("--snapshot", required=True, help="Snapshot ID to execute")
    batch_init_parser.add_argument("--pipeline", required=True, help="Pipeline name (e.g., 'parse')")
    batch_init_parser.add_argument("--store", required=True, help="Store root directory")
    batch_init_parser.add_argument("--id", help="Batch ID (auto-generated if not provided)")
    batch_init_parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    batch_init_parser.set_defaults(func=cmd_batch_init)

    # batch list command
    batch_list_parser = subparsers.add_parser("batch-list", help="List batches")
    batch_list_parser.add_argument("--store", required=True, help="Store root directory")
    batch_list_parser.add_argument("-v", "--verbose", action="store_true", help="Show details")
    batch_list_parser.set_defaults(func=cmd_batch_list)

    # batch show command
    batch_show_parser = subparsers.add_parser("batch-show", help="Show batch details")
    batch_show_parser.add_argument("id", help="Batch ID")
    batch_show_parser.add_argument("--store", required=True, help="Store root directory")
    batch_show_parser.add_argument("--json", action="store_true", help="Output as JSON")
    batch_show_parser.set_defaults(func=cmd_batch_show)

    # run-shard command
    run_shard_parser = subparsers.add_parser("run-shard", help="Run a shard")
    run_shard_parser.add_argument("--batch", required=True, help="Batch ID")
    run_shard_parser.add_argument("--task", required=True, help="Task ID")
    run_shard_parser.add_argument("--shard", required=True, help="Shard ID (e.g., 'ab')")
    run_shard_parser.add_argument("--store", required=True, help="Store root directory")
    run_shard_parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
    run_shard_parser.set_defaults(func=cmd_run_shard)

    # query command (covers the diagnostics, outputs and stats query types)
    query_diag_parser = subparsers.add_parser("query", help="Query outputs")
    query_diag_parser.add_argument("query_type", choices=["diagnostics", "outputs", "stats"], help="Query type")
    query_diag_parser.add_argument("--batch", required=True, help="Batch ID")
    query_diag_parser.add_argument("--task", required=True, help="Task ID")
    query_diag_parser.add_argument("--store", required=True, help="Store root directory")
    query_diag_parser.add_argument("--severity", help="Filter by severity (error, warning, info, hint)")
    query_diag_parser.add_argument("--kind", help="Filter by output kind")
    query_diag_parser.add_argument("--code", help="Filter by diagnostic code")
    query_diag_parser.add_argument("--path", help="Filter by path substring")
    query_diag_parser.add_argument("--group-by", choices=["kind", "severity", "code", "lang"], default="kind", help="Group stats by field")
    query_diag_parser.add_argument("--json", action="store_true", help="Output as JSON")
    query_diag_parser.set_defaults(func=cmd_query)

    args = parser.parse_args(argv)

    # No subcommand given: show usage and exit successfully.
    if not args.command:
        parser.print_help()
        return 0

    # Each sub-parser stored its handler in "func" via set_defaults.
    return args.func(args)
222
+
223
+
224
def cmd_batch_init(args: argparse.Namespace) -> int:
    """Handle the batch init command.

    Creates a batch for the given snapshot and pipeline and prints its ID.

    Args:
        args: Parsed CLI arguments; reads ``store``, ``snapshot``,
            ``pipeline``, ``id`` and ``verbose``.

    Returns:
        0 on success; 1 when the store path does not exist, when
        ``init_batch`` raises ``ValueError``, or when the batch already
        exists. Every error message is written to stderr.
    """
    # Imported here so the handler is self-contained; the exception class
    # lives in the package's common module.
    from .common import BatchExistsError

    store_root = Path(args.store)

    if not store_root.exists():
        print(f"Error: Store does not exist: {store_root}", file=sys.stderr)
        return 1

    manager = BatchManager(store_root)

    try:
        batch_id = manager.init_batch(
            snapshot_id=args.snapshot,
            pipeline=args.pipeline,
            batch_id=args.id,
        )
    except (ValueError, BatchExistsError) as e:
        # NOTE(review): BatchExistsError is presumably raised by init_batch()
        # when --id names an existing batch - confirm against BatchManager.
        # It does not derive from ValueError, so it must be listed explicitly
        # to avoid a traceback.
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Created batch: {batch_id}")

    if args.verbose:
        batch = manager.load_batch(batch_id)
        plan = manager.load_plan(batch_id)
        print(f" Snapshot: {batch['snapshot_id']}")
        print(f" Pipeline: {batch['pipeline']}")
        print(f" Tasks: {len(plan['tasks'])}")
        for task in plan["tasks"]:
            print(f" - {task['task_id']} ({task['type']})")

    return 0
256
+
257
+
258
def cmd_batch_list(args: argparse.Namespace) -> int:
    """Print the IDs of all batches in the store, sorted.

    Args:
        args: Parsed CLI arguments; reads ``store`` and ``verbose``.

    Returns:
        1 when the store path does not exist (message on stderr), else 0.
    """
    root = Path(args.store)
    if not root.exists():
        print(f"Error: Store does not exist: {root}", file=sys.stderr)
        return 1

    manager = BatchManager(root)
    batch_ids = manager.list_batches()
    if not batch_ids:
        print("No batches found.")
        return 0

    for bid in sorted(batch_ids):
        if not args.verbose:
            print(bid)
            continue
        # Verbose mode loads each batch record to show where it came from.
        info = manager.load_batch(bid)
        print(f"{bid} snapshot={info['snapshot_id']} pipeline={info['pipeline']} status={info['status']}")

    return 0
281
+
282
+
283
def cmd_batch_show(args: argparse.Namespace) -> int:
    """Show one batch, either as JSON or as a text summary with its tasks.

    Args:
        args: Parsed CLI arguments; reads ``store``, ``id`` and ``json``.

    Returns:
        1 when loading the batch raises ``FileNotFoundError`` (message on
        stderr), else 0.
    """
    manager = BatchManager(Path(args.store))
    batch_id = args.id

    try:
        batch = manager.load_batch(batch_id)
    except FileNotFoundError:
        print(f"Error: Batch not found: {batch_id}", file=sys.stderr)
        return 1

    if args.json:
        # JSON mode emits the raw record and nothing else.
        print(json.dumps(batch, indent=2))
        return 0

    header = (
        f"Batch: {batch_id}",
        f" Snapshot: {batch['snapshot_id']}",
        f" Pipeline: {batch['pipeline']}",
        f" Status: {batch['status']}",
        f" Created: {batch['created_at']}",
    )
    for text in header:
        print(text)

    task_defs = manager.load_plan(batch_id)["tasks"]
    print(f"\nTasks ({len(task_defs)}):")
    for definition in task_defs:
        # The plan only names the task; its type and status come from the task record.
        record = manager.load_task(batch_id, definition["task_id"])
        print(f" {record['task_id']}: {record['type']} [{record['status']}]")

    return 0
312
+
313
+
314
def cmd_run_shard(args: argparse.Namespace) -> int:
    """Handle the run-shard command.

    Looks up the shard's current state, skips it when already done, and
    otherwise runs it with the executor registered for the task.

    Args:
        args: Parsed CLI arguments; reads ``store``, ``batch``, ``task`` and
            ``shard``.

    Returns:
        0 when the shard is (or already was) done; 1 when the store or shard
        is missing, no executor is available, or the run did not end in the
        "done" state. Every error message is written to stderr.
    """
    store_root = Path(args.store)
    batch_id = args.batch
    task_id = args.task
    shard_id = args.shard

    if not store_root.exists():
        print(f"Error: Store does not exist: {store_root}", file=sys.stderr)
        return 1

    runner = ShardRunner(store_root)

    # Check current state
    try:
        state = runner._load_state(batch_id, task_id, shard_id)
    except FileNotFoundError:
        print(f"Error: Shard not found: {batch_id}/{task_id}/{shard_id}", file=sys.stderr)
        return 1

    if state["status"] == "done":
        print(f"Shard {shard_id} already done, skipping.")
        return 0

    # Import the task executor
    from .tasks import get_executor

    try:
        executor = get_executor(task_id)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1

    print(f"Running shard {shard_id} for task {task_id}...")

    final_state = runner.run_shard(batch_id, task_id, shard_id, executor)

    if final_state["status"] == "done":
        stats = final_state["stats"]
        print(f"Shard completed: {stats['files_processed']} files, {stats['outputs_written']} outputs")
        return 0

    # "error" may be absent or explicitly None; treat both as "no details".
    error = final_state.get("error") or {}
    # Failures go to stderr like every other error this CLI reports.
    print(f"Shard failed: {error.get('message', 'Unknown error')}", file=sys.stderr)
    return 1
357
+
358
+
359
def cmd_query(args: argparse.Namespace) -> int:
    """Run a diagnostics, outputs or stats query and print the results.

    Args:
        args: Parsed CLI arguments; reads ``store``, ``batch``, ``task``,
            ``query_type``, ``json`` and the filter options relevant to the
            chosen query type.

    Returns:
        1 when the store path does not exist (message on stderr), else 0.
    """
    root = Path(args.store)
    batch_id, task_id = args.batch, args.task
    query_type = args.query_type

    if not root.exists():
        print(f"Error: Store does not exist: {root}", file=sys.stderr)
        return 1

    engine = QueryEngine(root)

    if query_type == "diagnostics":
        diagnostics = engine.query_diagnostics(
            batch_id,
            task_id,
            severity=args.severity,
            code=args.code,
            path_pattern=args.path,
        )
        if args.json:
            print(json.dumps(diagnostics, indent=2))
        elif not diagnostics:
            print("No diagnostics found.")
        else:
            for entry in diagnostics:
                # Missing fields are rendered as "?" (or an empty message).
                sev = entry.get("severity", "?")
                code = entry.get("code", "?")
                path = entry.get("path", "?")
                line = entry.get("line", "?")
                msg = entry.get("message", "")
                print(f"[{sev.upper()}] {path}:{line} {code}: {msg}")

    elif query_type == "outputs":
        outputs = engine.query_outputs(
            batch_id,
            task_id,
            kind=args.kind,
            path_pattern=args.path,
        )
        if args.json:
            print(json.dumps(outputs, indent=2))
        elif not outputs:
            print("No outputs found.")
        else:
            for entry in outputs:
                kind = entry.get("kind", "?")
                path = entry.get("path", "?")
                # Show only the first 12 characters of the object reference.
                ref = entry.get("object")
                short_ref = ref[:12] + "..." if ref else ""
                print(f"{kind:15} {path} {short_ref}")

    elif query_type == "stats":
        stats = engine.query_stats(batch_id, task_id, group_by=args.group_by)
        if args.json:
            print(json.dumps(stats, indent=2))
        elif not stats:
            print("No outputs found.")
        else:
            print(f"Stats grouped by {args.group_by}:")
            # Largest count first; ties keep the mapping's own order.
            ranked = sorted(stats.items(), key=lambda pair: -pair[1])
            for key, count in ranked:
                print(f" {key}: {count}")

    return 0
429
+
430
+
431
# When executed as a script, run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
codebatch/common.py ADDED
@@ -0,0 +1,104 @@
1
+ """Common utilities and constants for CodeBatch.
2
+
3
+ This module defines contract-level constants and helpers used across all components.
4
+ """
5
+
6
+ from datetime import datetime, timezone
7
+ from typing import Tuple
8
+
9
# Schema version as integer per contract
SCHEMA_VERSION = 1

# Producer info - identifies the implementation that created records
# NOTE(review): "0.1.0" is also hard-coded in the CLI's --version flag;
# presumably the two must be bumped together - confirm.
PRODUCER = {
    "name": "codebatch",
    "version": "0.1.0",
}
17
+
18
+
19
def utc_now_z() -> str:
    """Return the current UTC time as an RFC3339 string with a Z suffix.

    Returns:
        Second-resolution timestamp such as "2025-02-02T12:00:00Z".
    """
    now = datetime.now(tz=timezone.utc)
    # The clock is already UTC, so the literal "Z" designator is correct.
    return f"{now:%Y-%m-%dT%H:%M:%S}Z"
26
+
27
+
28
def parse_object_ref(object_ref: str) -> Tuple[str, str]:
    """Parse an object reference into algorithm and hex hash.

    Args:
        object_ref: Object reference in format "sha256:<hex>" or bare hex.

    Returns:
        Tuple of (algorithm, hex_hash). The hash is returned exactly as
        given; its letter case is not normalized.

    Raises:
        ValueError: If the algorithm is not "sha256", the hash is not 64
            characters long, or it contains anything but ASCII hex digits.
    """
    algo, separator, hex_hash = object_ref.partition(":")
    if not separator:
        # Legacy bare hex format
        algo, hex_hash = "sha256", object_ref
    elif algo != "sha256":
        raise ValueError(f"Unsupported algorithm: {algo}")

    if len(hex_hash) != 64:
        raise ValueError(f"Invalid hash length: {len(hex_hash)} (expected 64)")

    # Check characters explicitly: int(value, 16) would also accept a "0x"
    # prefix, a sign, surrounding whitespace, underscores and non-ASCII digits.
    hex_digits = "0123456789abcdefABCDEF"
    if not all(ch in hex_digits for ch in hex_hash):
        raise ValueError(f"Invalid hex characters in hash: {hex_hash}")

    return algo, hex_hash
62
+
63
+
64
def make_object_ref(hex_hash: str) -> str:
    """Create a canonical object reference from a hex hash.

    Args:
        hex_hash: SHA-256 hex hash (64 characters).

    Returns:
        Canonical object reference in format "sha256:<hex>". The hash is
        embedded exactly as given; its letter case is not changed.

    Raises:
        ValueError: If the hash is not 64 characters long or contains
            anything but ASCII hex digits.
    """
    if len(hex_hash) != 64:
        raise ValueError(f"Invalid hash length: {len(hex_hash)}")

    # Reject non-hex input here so a malformed reference is never produced.
    hex_digits = "0123456789abcdefABCDEF"
    if not all(ch in hex_digits for ch in hex_hash):
        raise ValueError(f"Invalid hex characters in hash: {hex_hash}")

    return f"sha256:{hex_hash}"
76
+
77
+
78
def object_shard_prefix(object_ref: str) -> str:
    """Return the shard prefix of an object reference.

    Args:
        object_ref: Object reference (sha256:<hex> or bare hex).

    Returns:
        The first two characters of the hex hash (e.g., "ab").
    """
    # parse_object_ref returns (algorithm, hex_hash); only the hash is needed.
    digest = parse_object_ref(object_ref)[1]
    return digest[:2]
89
+
90
+
91
class SnapshotExistsError(Exception):
    """Raised when attempting to create a snapshot that already exists."""

    def __init__(self, snapshot_id: str):
        # ID of the snapshot that already exists.
        self.snapshot_id = snapshot_id
        super().__init__(f"Snapshot already exists: {snapshot_id}")

    def __reduce__(self):
        # Default exception pickling/copying re-calls __init__ with the
        # formatted message, which would prefix the message a second time.
        # Rebuild from the ID instead.
        return (type(self), (self.snapshot_id,))
97
+
98
+
99
class BatchExistsError(Exception):
    """Raised when attempting to create a batch that already exists."""

    def __init__(self, batch_id: str):
        # ID of the batch that already exists.
        self.batch_id = batch_id
        super().__init__(f"Batch already exists: {batch_id}")

    def __reduce__(self):
        # Default exception pickling/copying re-calls __init__ with the
        # formatted message, which would prefix the message a second time.
        # Rebuild from the ID instead.
        return (type(self), (self.batch_id,))