codebatch 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
codebatch/paths.py ADDED
@@ -0,0 +1,196 @@
1
+ """Path canonicalization and safety utilities.
2
+
3
+ All file paths in a snapshot are canonicalized:
4
+ - UTF-8 encoded
5
+ - / as separator
6
+ - No . or .. segments
7
+ - No trailing slash
8
+ - Stable casing preserved
9
+
10
+ A path_key is included for normalized comparison (lowercase).
11
+ """
12
+
13
import os
import re
from pathlib import Path, PurePosixPath
from typing import Optional, Tuple
17
+
18
+
19
class PathEscapeError(Exception):
    """Signal that a path tried to break out of the root directory."""

    def __init__(self, path: str, reason: str):
        # Keep the offending path and the explanation available to callers
        # that want to report or log the rejected input.
        self.path = path
        self.reason = reason
        message = f"Path escape attempt: {path} - {reason}"
        super().__init__(message)
26
+
27
+
28
class InvalidPathError(Exception):
    """Signal a path with invalid characters or invalid structure."""

    def __init__(self, path: str, reason: str):
        # Stored so callers can inspect what was rejected and why.
        self.path = path
        self.reason = reason
        message = f"Invalid path: {path} - {reason}"
        super().__init__(message)
35
+
36
+
37
# Characters rejected in any path: ASCII control characters plus the
# punctuation Windows forbids in filenames.
INVALID_CHARS = re.compile(r'[\x00-\x1f<>:"|?*]')

# Filenames reserved by Windows regardless of extension: the four device
# names plus the nine numbered COM and LPT ports each.
RESERVED_NAMES = frozenset(
    ["CON", "PRN", "AUX", "NUL"]
    + [f"COM{n}" for n in range(1, 10)]
    + [f"LPT{n}" for n in range(1, 10)]
)
46
+
47
+
48
def canonicalize_path(path: str, root: Optional[Path] = None) -> str:
    """Canonicalize a file path for storage.

    The result uses "/" separators, contains no "." or ".." segments,
    has no trailing slash, and preserves the original casing.

    Args:
        path: Input path (may use "/" or "\\" separators).
        root: Optional root directory; when given, the canonical path is
            also resolved on the filesystem to detect symlink escapes.

    Returns:
        Canonicalized path with "/" separators, no "." or ".." segments.

    Raises:
        PathEscapeError: If the path escapes the root (lexically via ".."
            or, when *root* is given, via filesystem resolution).
        InvalidPathError: If the path is empty, contains invalid
            characters, or uses a reserved Windows filename.
    """
    if not path:
        raise InvalidPathError(path, "empty path")

    # Reject control characters and Windows-forbidden punctuation up front.
    if INVALID_CHARS.search(path):
        raise InvalidPathError(path, "contains invalid characters")

    # Normalize to forward slashes, then trim surrounding whitespace and a
    # trailing slash so "a/b/" and "a/b" canonicalize identically.
    normalized = path.replace("\\", "/").strip().rstrip("/")

    if not normalized:
        raise InvalidPathError(path, "path is empty after normalization")

    # Resolve "." and ".." lexically, detecting attempts to climb above
    # the (implicit) root.
    resolved: list[str] = []
    for part in normalized.split("/"):
        if not part or part == ".":
            # Empty segments (from "//") and "." are no-ops.
            continue
        if part == "..":
            if not resolved:
                raise PathEscapeError(path, "attempts to go above root")
            resolved.pop()
        else:
            # Windows reserves names like CON or COM1 regardless of the
            # extension, so compare on the portion before the first dot.
            base_name = part.split(".")[0].upper()
            if base_name in RESERVED_NAMES:
                raise InvalidPathError(path, f"contains reserved name: {part}")
            resolved.append(part)

    if not resolved:
        raise InvalidPathError(path, "path resolves to root")

    canonical = "/".join(resolved)

    # Optional filesystem check: symlinks can still escape even after the
    # lexical resolution above, so verify against the resolved root.
    if root is not None:
        try:
            full_path = (root / canonical).resolve()
            root_resolved = root.resolve()
            try:
                full_path.relative_to(root_resolved)
            except ValueError:
                # Deliberate translation of the ValueError; suppress the
                # uninformative chained context.
                raise PathEscapeError(path, "resolved path escapes root") from None
        except OSError:
            # Resolution failure (e.g. path too long or invalid for this
            # OS) is treated as best-effort: fall back to the lexical result.
            pass

    return canonical
121
+
122
+
123
def compute_path_key(path: str) -> str:
    """Return the normalized comparison key for a canonicalized path.

    Keys are lowercase and are used for case-insensitive comparison and
    collision detection. The input is expected to already be canonical
    ("/" separators, no "." or ".." segments).

    Args:
        path: Canonicalized path.

    Returns:
        Lowercase path key.
    """
    key = path.lower()
    return key
137
+
138
+
139
def canonicalize_with_key(path: str, root: Optional[Path] = None) -> Tuple[str, str]:
    """Canonicalize a path and compute its comparison key in one call.

    Args:
        path: Input path.
        root: Optional root directory for escape detection.

    Returns:
        Tuple of (canonical_path, path_key), where path_key is the
        lowercase form used for case-insensitive comparison.

    Raises:
        PathEscapeError: If the path escapes the root.
        InvalidPathError: If the path is invalid.
    """
    canonical = canonicalize_path(path, root)
    key = compute_path_key(canonical)
    return canonical, key
152
+
153
+
154
def is_safe_path(path: str, root: Optional[Path] = None) -> bool:
    """Check whether a path is safe (valid characters, no root escape).

    Args:
        path: Path to check.
        root: Optional root directory for escape detection.

    Returns:
        True if the path canonicalizes cleanly, False otherwise.
    """
    # Keep the try body minimal: only the call that can raise.
    try:
        canonicalize_path(path, root)
    except (PathEscapeError, InvalidPathError):
        return False
    return True
169
+
170
+
171
def detect_case_collision(paths: list[str]) -> list[Tuple[str, str]]:
    """Detect case collisions in a list of canonicalized paths.

    Two paths collide when they differ only by letter case (their
    lowercase keys are equal), which would clash on a case-insensitive
    filesystem.

    Args:
        paths: List of canonicalized paths.

    Returns:
        List of colliding path pairs, one tuple per unordered pair.
    """
    # Group paths by their lowercase key (the path_key of compute_path_key).
    grouped: dict[str, list[str]] = {}
    for candidate in paths:
        grouped.setdefault(candidate.lower(), []).append(candidate)

    # Emit every pair within each bucket that holds more than one path.
    pairs: list[Tuple[str, str]] = []
    for bucket in grouped.values():
        if len(bucket) < 2:
            continue
        for left_index, first in enumerate(bucket):
            for second in bucket[left_index + 1:]:
                pairs.append((first, second))

    return pairs
codebatch/query.py ADDED
@@ -0,0 +1,242 @@
1
+ """Query engine for output indexes.
2
+
3
+ Answers questions from JSONL scans without requiring a database:
4
+ - Which files produced diagnostics?
5
+ - Which outputs exist for a given task?
6
+ - Which files failed a given task?
7
+ - Aggregate counts by kind, severity, or language
8
+ """
9
+
10
+ import json
11
+ from collections import Counter
12
+ from pathlib import Path
13
+ from typing import Iterator, Optional
14
+
15
+
16
class QueryEngine:
    """Query engine for batch output indexes.

    Scans per-shard ``outputs.index.jsonl`` files laid out as
    ``batches/<batch_id>/tasks/<task_id>/shards/<shard>/`` under the
    store root, and answers filter/aggregate questions without a database.
    """

    def __init__(self, store_root: Path):
        """Initialize the query engine.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.batches_dir = self.store_root / "batches"

    def _iter_shard_outputs(
        self, batch_id: str, task_id: str
    ) -> Iterator[dict]:
        """Iterate over all output records for a task.

        Shard directories are visited in sorted order so iteration is
        deterministic across runs.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Yields:
            Output record dicts parsed from JSONL (blank lines skipped).
        """
        shards_dir = self.batches_dir / batch_id / "tasks" / task_id / "shards"

        # Missing task/shards directory simply yields nothing.
        if not shards_dir.exists():
            return

        for shard_dir in sorted(shards_dir.iterdir()):
            if not shard_dir.is_dir():
                continue

            outputs_path = shard_dir / "outputs.index.jsonl"
            if not outputs_path.exists():
                continue

            with open(outputs_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        yield json.loads(line)

    def query_diagnostics(
        self,
        batch_id: str,
        task_id: str,
        severity: Optional[str] = None,
        code: Optional[str] = None,
        path_pattern: Optional[str] = None,
    ) -> list[dict]:
        """Query diagnostic outputs.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            severity: Filter by severity (error, warning, info, hint).
            code: Filter by diagnostic code (exact match).
            path_pattern: Filter by case-insensitive path substring.

        Returns:
            List of diagnostic records matching all given filters.
        """
        results = []

        for record in self._iter_shard_outputs(batch_id, task_id):
            if record.get("kind") != "diagnostic":
                continue

            if severity and record.get("severity") != severity:
                continue

            if code and record.get("code") != code:
                continue

            if path_pattern and path_pattern.lower() not in record.get("path", "").lower():
                continue

            results.append(record)

        return results

    def query_outputs(
        self,
        batch_id: str,
        task_id: str,
        kind: Optional[str] = None,
        path_pattern: Optional[str] = None,
    ) -> list[dict]:
        """Query output records of any kind.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            kind: Filter by output kind (ast, diagnostic, metric, etc.).
            path_pattern: Filter by case-insensitive path substring.

        Returns:
            List of output records matching all given filters.
        """
        results = []

        for record in self._iter_shard_outputs(batch_id, task_id):
            if kind and record.get("kind") != kind:
                continue

            if path_pattern and path_pattern.lower() not in record.get("path", "").lower():
                continue

            results.append(record)

        return results

    def query_stats(
        self,
        batch_id: str,
        task_id: str,
        group_by: str = "kind",
    ) -> dict[str, int]:
        """Get aggregate statistics.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            group_by: Field to group by (kind, severity, code, lang).
                Any other value is treated as a literal record field.

        Returns:
            Dict mapping group values to record counts.
        """
        counter: Counter[str] = Counter()

        for record in self._iter_shard_outputs(batch_id, task_id):
            if group_by == "kind":
                value = record.get("kind", "unknown")
            elif group_by == "severity":
                value = record.get("severity", "none")
            elif group_by == "code":
                value = record.get("code", "none")
            elif group_by == "lang":
                # Derive language from the file extension. Split off the
                # final path component first so a dotted directory name
                # (e.g. "src.old/main") does not leak into the extension.
                filename = record.get("path", "").rsplit("/", 1)[-1]
                value = filename.rsplit(".", 1)[-1] if "." in filename else "none"
            else:
                value = record.get(group_by, "unknown")

            counter[value] += 1

        return dict(counter)

    def query_failed_files(
        self, batch_id: str, task_id: str
    ) -> list[str]:
        """Get paths of files that produced error diagnostics.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Returns:
            Sorted list of unique file paths with error diagnostics.
        """
        failed_paths = set()

        for record in self._iter_shard_outputs(batch_id, task_id):
            if record.get("kind") == "diagnostic" and record.get("severity") == "error":
                failed_paths.add(record.get("path", ""))

        return sorted(failed_paths)

    def query_files_with_outputs(
        self, batch_id: str, task_id: str, kind: str
    ) -> list[str]:
        """Get paths of files that produced outputs of a given kind.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            kind: Output kind to filter by.

        Returns:
            Sorted list of unique file paths.
        """
        paths = set()

        for record in self._iter_shard_outputs(batch_id, task_id):
            if record.get("kind") == kind:
                paths.add(record.get("path", ""))

        return sorted(paths)

    def get_task_summary(self, batch_id: str, task_id: str) -> dict:
        """Get a summary of task outputs in a single pass.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Returns:
            Summary dict with total output count, per-kind and
            per-severity counts, and distinct file counts.
        """
        total = 0
        by_kind: Counter[str] = Counter()
        by_severity: Counter[str] = Counter()
        files_with_outputs: set[str] = set()
        files_with_errors: set[str] = set()

        for record in self._iter_shard_outputs(batch_id, task_id):
            total += 1
            kind = record.get("kind", "unknown")
            by_kind[kind] += 1

            # Severity buckets and error-file tracking apply only to
            # diagnostic records.
            if kind == "diagnostic":
                severity = record.get("severity", "unknown")
                by_severity[severity] += 1
                if severity == "error":
                    files_with_errors.add(record.get("path", ""))

            files_with_outputs.add(record.get("path", ""))

        return {
            "total_outputs": total,
            "by_kind": dict(by_kind),
            "by_severity": dict(by_severity),
            "files_with_outputs": len(files_with_outputs),
            "files_with_errors": len(files_with_errors),
        }