codebatch 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebatch/__init__.py +3 -0
- codebatch/batch.py +366 -0
- codebatch/cas.py +170 -0
- codebatch/cli.py +432 -0
- codebatch/common.py +104 -0
- codebatch/paths.py +196 -0
- codebatch/query.py +242 -0
- codebatch/runner.py +495 -0
- codebatch/snapshot.py +340 -0
- codebatch/store.py +162 -0
- codebatch/tasks/__init__.py +37 -0
- codebatch/tasks/analyze.py +109 -0
- codebatch/tasks/lint.py +244 -0
- codebatch/tasks/parse.py +304 -0
- codebatch/tasks/symbols.py +223 -0
- codebatch-0.1.0.dist-info/METADATA +66 -0
- codebatch-0.1.0.dist-info/RECORD +19 -0
- codebatch-0.1.0.dist-info/WHEEL +4 -0
- codebatch-0.1.0.dist-info/entry_points.txt +2 -0
codebatch/paths.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Path canonicalization and safety utilities.
|
|
2
|
+
|
|
3
|
+
All file paths in a snapshot are canonicalized:
|
|
4
|
+
- UTF-8 encoded
|
|
5
|
+
- / as separator
|
|
6
|
+
- No . or .. segments
|
|
7
|
+
- No trailing slash
|
|
8
|
+
- Stable casing preserved
|
|
9
|
+
|
|
10
|
+
A path_key is included for normalized comparison (lowercase).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
from pathlib import Path, PurePosixPath
|
|
16
|
+
from typing import Tuple
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class PathEscapeError(Exception):
    """Signal that a path tried to leave the root directory.

    The offending input and a short explanation are kept on the instance
    as ``path`` and ``reason`` so callers do not have to parse the message.
    """

    def __init__(self, path: str, reason: str):
        message = f"Path escape attempt: {path} - {reason}"
        super().__init__(message)
        # Raw pieces of the message, exposed for programmatic inspection.
        self.path = path
        self.reason = reason
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class InvalidPathError(Exception):
    """Signal that a path has forbidden characters or a bad structure.

    ``path`` holds the rejected input and ``reason`` a short explanation;
    both are also embedded in the exception message.
    """

    def __init__(self, path: str, reason: str):
        # Store the parts first, then build the message from the stored values.
        self.path, self.reason = path, reason
        Exception.__init__(self, f"Invalid path: {self.path} - {self.reason}")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# Characters rejected anywhere in a path: ASCII control characters
# (0x00-0x1f) plus the punctuation Windows forbids in file names.
INVALID_CHARS = re.compile(r'[\x00-\x1f<>:"|?*]')

# Reserved Windows device names: CON/PRN/AUX/NUL plus COM1-COM9 and LPT1-LPT9.
# Stored upper-case; callers are expected to upper-case before the lookup.
RESERVED_NAMES = frozenset(
    ["CON", "PRN", "AUX", "NUL"]
    + [f"{prefix}{digit}" for prefix in ("COM", "LPT") for digit in range(1, 10)]
)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def canonicalize_path(path: str, root: Path = None) -> str:
    """Canonicalize a file path for storage.

    The input is validated, converted to ``/`` separators, stripped of
    surrounding whitespace and trailing slashes, and its ``.`` / ``..``
    segments are resolved lexically. Empty segments are dropped, so a
    leading slash or a doubled slash does not survive into the result.

    Args:
        path: Input path (may use ``/`` or ``\\`` as separator).
        root: Optional root directory. When given, the canonical path is
            joined to it and resolved with ``Path.resolve()`` to verify
            that the result is still located under the resolved root.

    Returns:
        Canonicalized path with / separators, no . or .. segments.

    Raises:
        PathEscapeError: If a ``..`` segment climbs above the start of the
            path, or the resolved path is not under ``root``.
        InvalidPathError: If the path is empty, contains invalid
            characters or a reserved Windows name, resolves to the root
            itself, or cannot be resolved against ``root``.
    """
    if not path:
        raise InvalidPathError(path, "empty path")

    # Reject control characters and Windows-forbidden punctuation up front.
    if INVALID_CHARS.search(path):
        raise InvalidPathError(path, "contains invalid characters")

    # Forward slashes only, then drop surrounding whitespace and any
    # trailing slashes (in that order, matching the stored format).
    normalized = path.replace("\\", "/").strip().rstrip("/")
    if not normalized:
        raise InvalidPathError(path, "path is empty after normalization")

    resolved = []
    for part in normalized.split("/"):
        if not part or part == ".":
            # Empty segments and "current directory" carry no information.
            continue
        if part == "..":
            # Going up with nothing left to pop would leave the root.
            if not resolved:
                raise PathEscapeError(path, "attempts to go above root")
            resolved.pop()
            continue
        # Only the text before the first dot is compared, so "con.txt"
        # is rejected the same way as "CON".
        base_name = part.split(".")[0].upper()
        if base_name in RESERVED_NAMES:
            raise InvalidPathError(path, f"contains reserved name: {part}")
        resolved.append(part)

    if not resolved:
        raise InvalidPathError(path, "path resolves to root")

    canonical = "/".join(resolved)

    if root is not None:
        try:
            full_path = (root / canonical).resolve()
            root_resolved = root.resolve()
        except (OSError, RuntimeError) as exc:
            # Fail closed: if the path cannot be resolved, containment
            # cannot be verified, so it must not be reported as safe.
            # RuntimeError is what older Python versions raise for
            # symlink loops.
            raise InvalidPathError(
                path, "cannot be resolved against root"
            ) from exc
        try:
            full_path.relative_to(root_resolved)
        except ValueError:
            raise PathEscapeError(path, "resolved path escapes root") from None

    return canonical
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def compute_path_key(path: str) -> str:
    """Derive the comparison key for a canonical path.

    The key is simply the lowercase form of the path. It is meant for
    case-insensitive comparison and for spotting paths that differ only
    in letter case.

    Args:
        path: Canonicalized path.

    Returns:
        Lowercase path key.
    """
    # No separator or dot-segment handling happens here; the caller is
    # expected to pass an already canonical path.
    path_key = path.lower()
    return path_key
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def canonicalize_with_key(path: str, root: Path = None) -> Tuple[str, str]:
    """Return a path's canonical form together with its comparison key.

    Convenience wrapper: the path is canonicalized first and the key is
    then computed from the canonical result. Errors raised during
    canonicalization are not caught here.

    Args:
        path: Input path.
        root: Optional root directory for escape detection.

    Returns:
        Tuple of (canonical_path, path_key).
    """
    canonical_path = canonicalize_path(path, root)
    return canonical_path, compute_path_key(canonical_path)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def is_safe_path(path: str, root: Path = None) -> bool:
    """Report whether a path passes canonicalization.

    The path is run through ``canonicalize_path`` and the result is
    discarded; only success or failure matters. Exceptions other than
    ``PathEscapeError`` and ``InvalidPathError`` are not handled here.

    Args:
        path: Path to check.
        root: Optional root directory.

    Returns:
        True if path is safe, False otherwise.
    """
    try:
        canonicalize_path(path, root)
    except (PathEscapeError, InvalidPathError):
        # Either kind of rejection means the path must not be used.
        return False
    else:
        return True
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def detect_case_collision(paths: list[str]) -> list[Tuple[str, str]]:
    """Find paths that share the same path key.

    Paths are grouped by ``compute_path_key``; every group with more than
    one member contributes all of its unordered pairs, in input order.

    Args:
        paths: List of canonicalized paths.

    Returns:
        List of colliding path pairs.
    """
    grouped: dict[str, list[str]] = {}
    for candidate in paths:
        grouped.setdefault(compute_path_key(candidate), []).append(candidate)

    pairs = []
    for members in grouped.values():
        if len(members) < 2:
            continue
        # Pair each member with every member that follows it in the group.
        for next_index, first in enumerate(members, start=1):
            pairs.extend((first, second) for second in members[next_index:])

    return pairs
|
codebatch/query.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Query engine for output indexes.
|
|
2
|
+
|
|
3
|
+
Answers questions from JSONL scans without requiring a database:
|
|
4
|
+
- Which files produced diagnostics?
|
|
5
|
+
- Which outputs exist for a given task?
|
|
6
|
+
- Which files failed a given task?
|
|
7
|
+
- Aggregate counts by kind, severity, or language
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from collections import Counter
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Iterator, Optional
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class QueryEngine:
    """Query engine for batch output indexes.

    Every query scans the ``outputs.index.jsonl`` files found under
    ``<store_root>/batches/<batch_id>/tasks/<task_id>/shards/<shard>/``.
    Each non-blank line is parsed as one JSON record; the queries read
    the record keys ``kind``, ``severity``, ``code`` and ``path``.
    """

    def __init__(self, store_root: Path):
        """Initialize the query engine.

        Args:
            store_root: Root directory of the CodeBatch store.
        """
        self.store_root = Path(store_root)
        self.batches_dir = self.store_root / "batches"

    @staticmethod
    def _record_path(record: dict) -> str:
        """Return the record's path, treating a missing or null path as ""."""
        return record.get("path") or ""

    @staticmethod
    def _path_matches(path: str, path_pattern: Optional[str]) -> bool:
        """Case-insensitive substring match; an empty pattern matches all."""
        if not path_pattern:
            return True
        return path_pattern.lower() in path.lower()

    @staticmethod
    def _language_of(path: str) -> str:
        """Return the extension of the final path segment, or "none"."""
        # Look only at the file name so a dot in a directory name
        # ("pkg.d/Makefile") is not mistaken for an extension.
        file_name = path.rsplit("/", 1)[-1]
        if "." not in file_name:
            return "none"
        return file_name.rsplit(".", 1)[-1]

    def _iter_shard_outputs(
        self, batch_id: str, task_id: str
    ) -> Iterator[dict]:
        """Iterate over all output records for a task.

        Shard directories are visited in sorted order. Entries that are
        not directories, and shards without an index file, are skipped.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Yields:
            Output record dicts.
        """
        shards_dir = self.batches_dir / batch_id / "tasks" / task_id / "shards"

        # A missing (or non-directory) shards location means no outputs.
        if not shards_dir.is_dir():
            return

        for shard_dir in sorted(shards_dir.iterdir()):
            if not shard_dir.is_dir():
                continue

            outputs_path = shard_dir / "outputs.index.jsonl"
            if not outputs_path.is_file():
                continue

            with open(outputs_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    # Blank lines are tolerated between records.
                    if line:
                        yield json.loads(line)

    def query_diagnostics(
        self,
        batch_id: str,
        task_id: str,
        severity: Optional[str] = None,
        code: Optional[str] = None,
        path_pattern: Optional[str] = None,
    ) -> list[dict]:
        """Query diagnostic outputs.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            severity: Filter by severity (error, warning, info, hint).
            code: Filter by diagnostic code.
            path_pattern: Filter by case-insensitive path substring.

        Returns:
            List of diagnostic records.
        """
        results = []

        for record in self._iter_shard_outputs(batch_id, task_id):
            if record.get("kind") != "diagnostic":
                continue
            if severity and record.get("severity") != severity:
                continue
            if code and record.get("code") != code:
                continue
            if not self._path_matches(self._record_path(record), path_pattern):
                continue
            results.append(record)

        return results

    def query_outputs(
        self,
        batch_id: str,
        task_id: str,
        kind: Optional[str] = None,
        path_pattern: Optional[str] = None,
    ) -> list[dict]:
        """Query output records.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            kind: Filter by output kind (ast, diagnostic, metric, etc.).
            path_pattern: Filter by case-insensitive path substring.

        Returns:
            List of output records.
        """
        results = []

        for record in self._iter_shard_outputs(batch_id, task_id):
            if kind and record.get("kind") != kind:
                continue
            if not self._path_matches(self._record_path(record), path_pattern):
                continue
            results.append(record)

        return results

    def query_stats(
        self,
        batch_id: str,
        task_id: str,
        group_by: str = "kind",
    ) -> dict[str, int]:
        """Get aggregate statistics.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            group_by: Field to group by (kind, severity, code, lang).
                ``lang`` groups by the file extension of the record path;
                any other value is looked up as a record key.

        Returns:
            Dict mapping group values to counts.
        """
        # Placeholder used when a record lacks the grouped field.
        missing_defaults = {"kind": "unknown", "severity": "none", "code": "none"}
        fallback = missing_defaults.get(group_by, "unknown")
        counter: Counter[str] = Counter()

        for record in self._iter_shard_outputs(batch_id, task_id):
            if group_by == "lang":
                value = self._language_of(self._record_path(record))
            else:
                value = record.get(group_by, fallback)
            counter[value] += 1

        return dict(counter)

    def query_failed_files(
        self, batch_id: str, task_id: str
    ) -> list[str]:
        """Get paths of files that produced error diagnostics.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Returns:
            Sorted list of distinct file paths with errors.
        """
        failed_paths = {
            self._record_path(record)
            for record in self._iter_shard_outputs(batch_id, task_id)
            if record.get("kind") == "diagnostic"
            and record.get("severity") == "error"
        }
        return sorted(failed_paths)

    def query_files_with_outputs(
        self, batch_id: str, task_id: str, kind: str
    ) -> list[str]:
        """Get paths of files that produced outputs of a given kind.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.
            kind: Output kind to filter by.

        Returns:
            Sorted list of distinct file paths.
        """
        paths = {
            self._record_path(record)
            for record in self._iter_shard_outputs(batch_id, task_id)
            if record.get("kind") == kind
        }
        return sorted(paths)

    def get_task_summary(self, batch_id: str, task_id: str) -> dict:
        """Get a summary of task outputs.

        Args:
            batch_id: Batch ID.
            task_id: Task ID.

        Returns:
            Summary dict with the keys ``total_outputs``, ``by_kind``,
            ``by_severity`` (diagnostics only), ``files_with_outputs`` and
            ``files_with_errors`` (both distinct-path counts).
        """
        total = 0
        by_kind: Counter[str] = Counter()
        by_severity: Counter[str] = Counter()
        files_with_outputs: set[str] = set()
        files_with_errors: set[str] = set()

        for record in self._iter_shard_outputs(batch_id, task_id):
            total += 1
            kind = record.get("kind", "unknown")
            by_kind[kind] += 1
            path = self._record_path(record)

            # Severity is only tallied for diagnostic records.
            if kind == "diagnostic":
                severity = record.get("severity", "unknown")
                by_severity[severity] += 1
                if severity == "error":
                    files_with_errors.add(path)

            files_with_outputs.add(path)

        return {
            "total_outputs": total,
            "by_kind": dict(by_kind),
            "by_severity": dict(by_severity),
            "files_with_outputs": len(files_with_outputs),
            "files_with_errors": len(files_with_errors),
        }
|