codebatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebatch/snapshot.py ADDED
@@ -0,0 +1,340 @@
1
+ """Snapshot builder for creating immutable snapshots of directory sources.
2
+
3
+ A snapshot represents a frozen view of an input source at a specific point in time.
4
+ Snapshots are immutable once written.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import uuid
10
+ from datetime import datetime, timezone
11
+ from pathlib import Path
12
+ from typing import Iterator, Optional
13
+
14
+ from .cas import ObjectStore
15
+ from .common import SCHEMA_VERSION, PRODUCER, utc_now_z, SnapshotExistsError, object_shard_prefix
16
+ from .paths import canonicalize_path, compute_path_key, PathEscapeError, InvalidPathError, detect_case_collision
17
+
18
+
19
# Language detection by extension.
# Defined as an ordered (extension, language) pair list and materialized into
# a dict, preserving the original insertion order. Note ".R" is kept uppercase
# for callers that look up extensions without lowercasing first.
_EXT_LANG_PAIRS = [
    (".py", "python"),
    (".js", "javascript"),
    (".ts", "typescript"),
    (".tsx", "typescript"),
    (".jsx", "javascript"),
    (".cs", "csharp"),
    (".java", "java"),
    (".go", "go"),
    (".rs", "rust"),
    (".c", "c"),
    (".cpp", "cpp"),
    (".cc", "cpp"),
    (".h", "c"),
    (".hpp", "cpp"),
    (".rb", "ruby"),
    (".php", "php"),
    (".swift", "swift"),
    (".kt", "kotlin"),
    (".scala", "scala"),
    (".r", "r"),
    (".R", "r"),
    (".sql", "sql"),
    (".sh", "shell"),
    (".bash", "shell"),
    (".zsh", "shell"),
    (".ps1", "powershell"),
    (".md", "markdown"),
    (".json", "json"),
    (".yaml", "yaml"),
    (".yml", "yaml"),
    (".xml", "xml"),
    (".html", "html"),
    (".css", "css"),
    (".scss", "scss"),
    (".sass", "sass"),
    (".less", "less"),
]
LANG_HINTS = dict(_EXT_LANG_PAIRS)
58
+
59
+
60
def detect_lang_hint(path: str) -> Optional[str]:
    """Detect language hint from file extension.

    Args:
        path: File path.

    Returns:
        Language hint string, or None if unknown.
    """
    # Lowercase the extension so lookups are case-insensitive
    # (e.g. "FOO.PY" still maps to python).
    _, extension = os.path.splitext(path)
    return LANG_HINTS.get(extension.lower())
71
+
72
+
73
def generate_snapshot_id() -> str:
    """Generate a unique snapshot ID.

    Returns:
        Snapshot ID in format: snap-YYYYMMDD-HHMMSS-XXXX
    """
    # A UTC timestamp gives rough chronological ordering; the random hex
    # suffix avoids collisions between snapshots created in the same second.
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
    suffix = uuid.uuid4().hex[:8]
    return "snap-{}-{}".format(stamp, suffix)
83
+
84
+
85
class SnapshotBuilder:
    """Builds immutable snapshots from directory sources.

    A snapshot pairs a metadata document (``snapshot.json``) with a
    deterministic, path_key-sorted file index (``files.index.jsonl``);
    file contents are deduplicated into the content-addressed object store.
    """

    def __init__(self, store_root: Path):
        """Initialize the snapshot builder.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.object_store = ObjectStore(store_root)
        self.snapshots_dir = self.store_root / "snapshots"

    def _walk_directory(
        self,
        source_dir: Path,
        include_hidden: bool = False,
    ) -> Iterator[tuple[Path, str]]:
        """Walk a directory and yield (file_path, relative_path) pairs.

        Args:
            source_dir: Directory to walk.
            include_hidden: If True, include hidden (dot-prefixed) files/dirs.

        Yields:
            Tuples of (absolute_path, relative_path).
        """
        source_dir = source_dir.resolve()

        for root, dirs, files in os.walk(source_dir):
            if not include_hidden:
                # Prune hidden directories in place so os.walk never
                # descends into them.
                dirs[:] = [d for d in dirs if not d.startswith(".")]

            root_path = Path(root)
            for file in files:
                # Skip hidden files unless configured to include them.
                if not include_hidden and file.startswith("."):
                    continue

                file_path = root_path / file
                try:
                    rel_path = file_path.relative_to(source_dir)
                    yield file_path, str(rel_path)
                except ValueError:
                    # File not under source_dir (shouldn't happen).
                    continue

    def build(
        self,
        source_dir: Path,
        snapshot_id: Optional[str] = None,
        metadata: Optional[dict] = None,
        include_hidden: bool = False,
        allow_overwrite: bool = False,
    ) -> str:
        """Build a snapshot from a directory.

        Args:
            source_dir: Directory to snapshot.
            snapshot_id: Optional snapshot ID (auto-generated if not provided).
            metadata: Optional user metadata to include.
            include_hidden: If True, include hidden files/dirs.
            allow_overwrite: If True, allow overwriting existing snapshot
                (default False).

        Returns:
            The snapshot ID.

        Raises:
            SnapshotExistsError: If snapshot already exists and
                allow_overwrite=False.
            ValueError: If source is not a directory.
        """
        source_dir = Path(source_dir).resolve()

        if not source_dir.is_dir():
            raise ValueError(f"Source is not a directory: {source_dir}")

        if snapshot_id is None:
            snapshot_id = generate_snapshot_id()

        # Immutability enforcement: fail if the directory exists at all --
        # even an empty dir indicates a prior attempt.
        snapshot_dir = self.snapshots_dir / snapshot_id
        if snapshot_dir.exists() and not allow_overwrite:
            raise SnapshotExistsError(snapshot_id)

        snapshot_dir.mkdir(parents=True, exist_ok=True)

        # Collect file records and track per-file diagnostics.
        file_records = []
        skipped_files = []
        total_bytes = 0

        for file_path, rel_path in self._walk_directory(source_dir, include_hidden):
            try:
                canonical_path = canonicalize_path(rel_path)
                path_key = compute_path_key(canonical_path)

                # Store content in the CAS; the returned ref identifies it.
                data = file_path.read_bytes()
                object_ref = self.object_store.put_bytes(data)
                size = len(data)
                total_bytes += size

                record = {
                    "schema_version": SCHEMA_VERSION,
                    "path": canonical_path,
                    "path_key": path_key,
                    "object": object_ref,
                    "size": size,
                }

                # Optional language hint derived from the file extension.
                lang_hint = detect_lang_hint(canonical_path)
                if lang_hint:
                    record["lang_hint"] = lang_hint

                file_records.append(record)

            except (PathEscapeError, InvalidPathError) as e:
                skipped_files.append({
                    "path": rel_path,
                    "reason": "invalid_path",
                    "message": str(e),
                })
            except OSError as e:
                skipped_files.append({
                    "path": rel_path,
                    "reason": "unreadable",
                    "message": str(e),
                })

        # Warn about paths differing only by case (problematic on
        # case-insensitive filesystems).
        all_paths = [r["path"] for r in file_records]
        collision_warnings = [
            {
                "paths": [p1, p2],
                "reason": "case_collision",
                "message": f"Paths differ only by case: {p1} vs {p2}",
            }
            for p1, p2 in detect_case_collision(all_paths)
        ]

        # Sort records by path_key for deterministic output.
        file_records.sort(key=lambda r: r["path_key"])

        # Write files.index.jsonl: one compact JSON record per line.
        index_path = snapshot_dir / "files.index.jsonl"
        with open(index_path, "w", encoding="utf-8") as f:
            for record in file_records:
                f.write(json.dumps(record, ensure_ascii=False, separators=(",", ":")))
                f.write("\n")

        # Write snapshot.json (top-level snapshot metadata).
        snapshot_meta = {
            "schema_name": "codebatch.snapshot",
            "schema_version": SCHEMA_VERSION,
            "producer": PRODUCER,
            "snapshot_id": snapshot_id,
            "created_at": utc_now_z(),
            "source": {
                "type": "directory",
                "path": str(source_dir),
            },
            "file_count": len(file_records),
            "total_bytes": total_bytes,
            "config": {
                "include_hidden": include_hidden,
            },
        }

        if metadata:
            snapshot_meta["metadata"] = metadata

        # Surface skips and collisions as warnings in the metadata.
        if skipped_files or collision_warnings:
            snapshot_meta["warnings"] = []
            snapshot_meta["warnings"].extend(skipped_files)
            snapshot_meta["warnings"].extend(collision_warnings)

        snapshot_json_path = snapshot_dir / "snapshot.json"
        with open(snapshot_json_path, "w", encoding="utf-8") as f:
            json.dump(snapshot_meta, f, indent=2)
            # Trailing newline for consistency with store.json.
            f.write("\n")

        return snapshot_id

    def load_snapshot(self, snapshot_id: str) -> dict:
        """Load snapshot metadata.

        Args:
            snapshot_id: Snapshot ID to load.

        Returns:
            Snapshot metadata dict.

        Raises:
            FileNotFoundError: If snapshot doesn't exist.
        """
        snapshot_path = self.snapshots_dir / snapshot_id / "snapshot.json"
        with open(snapshot_path, "r", encoding="utf-8") as f:
            return json.load(f)

    def load_file_index(self, snapshot_id: str) -> list[dict]:
        """Load file index records.

        Args:
            snapshot_id: Snapshot ID to load.

        Returns:
            List of file index records.

        Raises:
            FileNotFoundError: If snapshot doesn't exist.
        """
        # Delegate to the streaming reader so the JSONL parsing logic
        # lives in exactly one place.
        return list(self.iter_file_index(snapshot_id))

    def iter_file_index(self, snapshot_id: str) -> Iterator[dict]:
        """Stream file index records without loading all into memory.

        Args:
            snapshot_id: Snapshot ID.

        Yields:
            File index record dicts.

        Raises:
            FileNotFoundError: If snapshot doesn't exist.
        """
        index_path = self.snapshots_dir / snapshot_id / "files.index.jsonl"
        with open(index_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    yield json.loads(line)

    def list_snapshots(self) -> list[str]:
        """List all snapshot IDs.

        Returns:
            Sorted list of snapshot IDs (directories containing a
            snapshot.json). Sorted for deterministic output across
            filesystems.
        """
        if not self.snapshots_dir.exists():
            return []

        return sorted(
            d.name
            for d in self.snapshots_dir.iterdir()
            if d.is_dir() and (d / "snapshot.json").exists()
        )
codebatch/store.py ADDED
@@ -0,0 +1,162 @@
1
+ """Store initialization and validation.
2
+
3
+ A CodeBatch store is a directory with a specific layout:
4
+ <store_root>/
5
+ store.json # Store metadata
6
+ objects/ # Content-addressed objects
7
+ snapshots/ # Frozen input state
8
+ batches/ # Execution attempts
9
+ indexes/ # Optional acceleration (not required for correctness)
10
+ """
11
+
12
+ import json
13
+ from pathlib import Path
14
+ from typing import Optional
15
+
16
+ from .common import SCHEMA_VERSION, PRODUCER, utc_now_z
17
+
18
+
19
class StoreExistsError(Exception):
    """Raised when attempting to initialize a store that already exists."""

    def __init__(self, store_root: Path):
        # Keep the offending root on the exception so callers can
        # handle it programmatically.
        self.store_root = store_root
        message = f"Store already exists: {store_root}"
        super().__init__(message)
25
+
26
+
27
class InvalidStoreError(Exception):
    """Raised when a store is missing or invalid."""

    def __init__(self, store_root: Path, reason: str):
        # Retain both pieces so callers can report or branch on them.
        message = f"Invalid store at {store_root}: {reason}"
        super().__init__(message)
        self.store_root = store_root
        self.reason = reason
34
+
35
+
36
def init_store(store_root: Path, *, allow_reinit: bool = False) -> dict:
    """Initialize a new CodeBatch store.

    Creates the directory structure and store.json file.

    Args:
        store_root: Root directory for the store.
        allow_reinit: If True, proceed even when the directory already
            contains files (as long as no store.json exists yet).

    Returns:
        The store metadata dict.

    Raises:
        StoreExistsError: If store.json already exists, or if the directory
            is non-empty and allow_reinit is False.
    """
    root = Path(store_root)
    marker = root / "store.json"

    # An existing store.json always wins: stores are never re-created
    # in place, regardless of allow_reinit.
    if marker.exists():
        raise StoreExistsError(root)

    # A non-empty directory without store.json is suspicious; only proceed
    # when the caller explicitly opted in.
    if root.exists() and not allow_reinit and any(root.iterdir()):
        raise StoreExistsError(root)

    # Create the directory layout (objects/sha256, snapshots, batches).
    for sub in ("objects/sha256", "snapshots", "batches"):
        (root / sub).mkdir(parents=True, exist_ok=True)

    store_meta = {
        "schema_name": "codebatch.store",
        "schema_version": SCHEMA_VERSION,
        "producer": PRODUCER.copy(),
        "created_at": utc_now_z(),
    }

    with open(marker, "w", encoding="utf-8") as f:
        json.dump(store_meta, f, indent=2)
        f.write("\n")

    return store_meta
83
+
84
+
85
def load_store(store_root: Path) -> dict:
    """Load and validate store metadata.

    Args:
        store_root: Root directory of the store.

    Returns:
        The store metadata dict.

    Raises:
        InvalidStoreError: If store is missing or invalid.
    """
    store_root = Path(store_root)
    store_json_path = store_root / "store.json"

    if not store_root.exists():
        raise InvalidStoreError(store_root, "directory does not exist")

    if not store_json_path.exists():
        raise InvalidStoreError(store_root, "missing store.json")

    try:
        with open(store_json_path, "r", encoding="utf-8") as f:
            store_meta = json.load(f)
    except json.JSONDecodeError as e:
        # Chain the decode error so the original line/column info from the
        # JSON parser is preserved in the traceback.
        raise InvalidStoreError(store_root, f"invalid JSON in store.json: {e}") from e

    # Validate required fields before callers rely on them.
    if store_meta.get("schema_name") != "codebatch.store":
        raise InvalidStoreError(
            store_root,
            f"invalid schema_name: {store_meta.get('schema_name')}"
        )

    # NOTE(review): isinstance(True, int) is True in Python, so a boolean
    # schema_version would pass this check -- confirm whether that matters.
    if not isinstance(store_meta.get("schema_version"), int):
        raise InvalidStoreError(
            store_root,
            f"invalid schema_version: {store_meta.get('schema_version')}"
        )

    return store_meta
126
+
127
+
128
def ensure_store(store_root: Path) -> dict:
    """Ensure a store exists, initializing if necessary.

    This is the recommended way to get a store reference when you
    don't care if it's new or existing.

    Args:
        store_root: Root directory for the store.

    Returns:
        The store metadata dict.
    """
    root = Path(store_root)
    # store.json is the marker file that distinguishes an initialized store.
    if (root / "store.json").exists():
        return load_store(root)
    return init_store(root)
147
+
148
+
149
def is_valid_store(store_root: Path) -> bool:
    """Check if a directory is a valid CodeBatch store.

    Args:
        store_root: Root directory to check.

    Returns:
        True if valid store, False otherwise.
    """
    # FileNotFoundError covers races where store.json vanishes between
    # load_store's existence check and its open().
    try:
        load_store(store_root)
    except (InvalidStoreError, FileNotFoundError):
        return False
    return True
@@ -0,0 +1,37 @@
1
+ """Task executors registry."""
2
+
3
+ from typing import Callable
4
+
5
+ from ..runner import ShardRunner
6
+
7
+
8
+ # Task executor type: (config, files, runner) -> list[output_records]
9
+ TaskExecutor = Callable[[dict, list[dict], ShardRunner], list[dict]]
10
+
11
+
12
def get_executor(task_id: str) -> TaskExecutor:
    """Get the executor function for a task.

    Imports are deferred to the matching branch so loading one task's
    executor does not pull in the dependencies of the others.

    Args:
        task_id: Task ID (e.g., '01_parse').

    Returns:
        Executor function.

    Raises:
        ValueError: If task executor not found.
    """
    if task_id == "01_parse":
        from .parse import parse_executor
        return parse_executor
    if task_id == "02_analyze":
        from .analyze import analyze_executor
        return analyze_executor
    if task_id == "03_symbols":
        from .symbols import symbols_executor
        return symbols_executor
    if task_id == "04_lint":
        from .lint import lint_executor
        return lint_executor
    raise ValueError(f"Unknown task: {task_id}")
@@ -0,0 +1,109 @@
1
+ """Analyze task executor - produces file-level metrics.
2
+
3
+ Emits:
4
+ - kind=metric: File-level metrics (bytes, loc, lang, parse_status)
5
+
6
+ Inputs:
7
+ - Snapshot file records for this shard
8
+ - Optionally parse outputs to determine parse_status
9
+
10
+ Metrics are stable and cheap - no deep analysis.
11
+ """
12
+
13
+ from typing import Iterable, Optional
14
+
15
+ from ..runner import ShardRunner
16
+
17
+
18
+ def count_lines(content: str) -> int:
19
+ """Count lines of code (non-empty lines)."""
20
+ lines = content.split('\n')
21
+ return sum(1 for line in lines if line.strip())
22
+
23
+
24
def analyze_executor(config: dict, files: Iterable[dict], runner: ShardRunner) -> list[dict]:
    """Execute the analyze task.

    Produces file-level metrics for each file in the shard:
    - bytes: File size (length of the stored object)
    - loc: Lines of code (non-empty lines, text files only)
    - lang: Language hint from the snapshot record ('unknown' if absent)
    - error: Emitted instead of the above when a file cannot be read

    NOTE(review): an earlier draft sketched a parse_status metric gated on
    config["check_parse_status"], but the status map was never populated or
    emitted; that dead scaffolding has been removed. Re-add once parse
    outputs are actually available to this executor.

    Args:
        config: Task configuration (currently unused).
        files: Iterable of file records for this shard.
        runner: ShardRunner for CAS access.

    Returns:
        List of metric output records.
    """
    outputs = []

    # Single pass over the records; no need to materialize the iterable.
    for file_record in files:
        path = file_record["path"]
        object_ref = file_record["object"]
        lang_hint = file_record.get("lang_hint", "unknown")

        try:
            # Fetch file content from the CAS.
            data = runner.object_store.get_bytes(object_ref)

            # LOC is only meaningful for text; binary files get no loc metric.
            loc: Optional[int] = None
            try:
                loc = count_lines(data.decode("utf-8"))
            except UnicodeDecodeError:
                pass

            outputs.append({
                "kind": "metric",
                "path": path,
                "metric": "bytes",
                "value": len(data),
            })

            if loc is not None:
                outputs.append({
                    "kind": "metric",
                    "path": path,
                    "metric": "loc",
                    "value": loc,
                })

            outputs.append({
                "kind": "metric",
                "path": path,
                "metric": "lang",
                "value": lang_hint,
            })

        except Exception as e:
            # Best-effort per-file handling: a broken file becomes an error
            # metric rather than aborting the whole shard.
            outputs.append({
                "kind": "metric",
                "path": path,
                "metric": "error",
                "value": str(e),
            })

    return outputs