codebatch-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,231 @@
+ """Lint task executor - rule-based diagnostics.
+
+ Emits:
+ - kind=diagnostic: Lint warnings/errors with severity, code, message, location
+
+ Inputs:
+ - Parse outputs (kind=ast) via iter_prior_outputs (preferred; no AST rules yet)
+ - Falls back to raw file content for simple text rules
+
+ Rules (Phase 2 minimal set):
+ - L001: Trailing whitespace
+ - L002: Line too long (>120 chars)
+ - L003: TODO/FIXME presence
+ - L004: Tab indentation (prefer spaces)
+ - L005: Missing newline at end of file
+ """
+
+ from typing import Iterable
+
+ from ..runner import ShardRunner
+
+
+ # Rule configuration
+ DEFAULT_MAX_LINE_LENGTH = 120
+ TODO_PATTERNS = ["TODO", "FIXME", "XXX", "HACK"]
+
+
+ def lint_trailing_whitespace(lines: list[str], path: str) -> list[dict]:
+     """L001: Detect trailing whitespace."""
+     diagnostics = []
+     for i, line in enumerate(lines, 1):
+         # Don't strip the newline; check for trailing spaces/tabs before it
+         stripped = line.rstrip('\n\r')
+         if stripped != stripped.rstrip():
+             diagnostics.append({
+                 "kind": "diagnostic",
+                 "path": path,
+                 "severity": "warning",
+                 "code": "L001",
+                 "message": "Trailing whitespace",
+                 "line": i,
+                 "col": len(stripped.rstrip()) + 1,
+             })
+     return diagnostics
+
+
+ def lint_line_too_long(lines: list[str], path: str, max_length: int = DEFAULT_MAX_LINE_LENGTH) -> list[dict]:
+     """L002: Detect lines exceeding max length."""
+     diagnostics = []
+     for i, line in enumerate(lines, 1):
+         stripped = line.rstrip('\n\r')
+         if len(stripped) > max_length:
+             diagnostics.append({
+                 "kind": "diagnostic",
+                 "path": path,
+                 "severity": "warning",
+                 "code": "L002",
+                 "message": f"Line too long ({len(stripped)} > {max_length})",
+                 "line": i,
+                 "col": max_length + 1,
+             })
+     return diagnostics
+
+
+ def lint_todo_fixme(lines: list[str], path: str) -> list[dict]:
+     """L003: Detect TODO/FIXME/XXX/HACK comments."""
+     diagnostics = []
+     for i, line in enumerate(lines, 1):
+         upper_line = line.upper()
+         for pattern in TODO_PATTERNS:
+             if pattern in upper_line:
+                 col = upper_line.find(pattern) + 1
+                 diagnostics.append({
+                     "kind": "diagnostic",
+                     "path": path,
+                     "severity": "info",
+                     "code": "L003",
+                     "message": f"Found {pattern} comment",
+                     "line": i,
+                     "col": col,
+                 })
+                 break  # Only report once per line
+     return diagnostics
+
+
+ def lint_tab_indentation(lines: list[str], path: str) -> list[dict]:
+     """L004: Detect tab indentation (prefer spaces)."""
+     diagnostics = []
+     for i, line in enumerate(lines, 1):
+         if line.startswith('\t'):
+             diagnostics.append({
+                 "kind": "diagnostic",
+                 "path": path,
+                 "severity": "warning",
+                 "code": "L004",
+                 "message": "Tab indentation (prefer spaces)",
+                 "line": i,
+                 "col": 1,
+             })
+     return diagnostics
+
+
+ def lint_missing_final_newline(content: str, path: str) -> list[dict]:
+     """L005: Detect missing newline at end of file."""
+     diagnostics = []
+     if content and not content.endswith('\n'):
+         lines = content.split('\n')
+         diagnostics.append({
+             "kind": "diagnostic",
+             "path": path,
+             "severity": "warning",
+             "code": "L005",
+             "message": "Missing newline at end of file",
+             "line": len(lines),
+             "col": len(lines[-1]) + 1,
+         })
+     return diagnostics
+
+
+ def lint_content(content: str, path: str, config: dict) -> list[dict]:
+     """Run all lint rules on content.
+
+     Args:
+         content: File content as string.
+         path: File path.
+         config: Lint configuration.
+
+     Returns:
+         List of diagnostic records.
+     """
+     diagnostics = []
+     lines = content.split('\n')
+
+     # Get config options
+     max_line_length = config.get("max_line_length", DEFAULT_MAX_LINE_LENGTH)
+     check_trailing = config.get("check_trailing_whitespace", True)
+     check_line_length = config.get("check_line_length", True)
+     check_todo = config.get("check_todo", True)
+     check_tabs = config.get("check_tab_indentation", True)
+     check_final_newline = config.get("check_final_newline", True)
+
+     if check_trailing:
+         diagnostics.extend(lint_trailing_whitespace(lines, path))
+
+     if check_line_length:
+         diagnostics.extend(lint_line_too_long(lines, path, max_line_length))
+
+     if check_todo:
+         diagnostics.extend(lint_todo_fixme(lines, path))
+
+     if check_tabs:
+         diagnostics.extend(lint_tab_indentation(lines, path))
+
+     if check_final_newline:
+         diagnostics.extend(lint_missing_final_newline(content, path))
+
+     return diagnostics
+
+
+ def lint_executor(config: dict, files: Iterable[dict], runner: ShardRunner) -> list[dict]:
+     """Execute the lint task.
+
+     Runs text-based lint rules on every file in the shard. An AST-based
+     pass over parse outputs exists as a placeholder for future rules.
+
+     Args:
+         config: Task configuration.
+         files: Iterable of file records for this shard.
+         runner: ShardRunner for CAS access.
+
+     Returns:
+         List of diagnostic output records.
+     """
+     outputs = []
+
+     # Get execution context
+     batch_id = config.get("_batch_id")
+     shard_id = config.get("_shard_id")
+
+     # Track which files we've linted (to avoid duplicates)
+     linted_paths = set()
+
+     # First pass: AST outputs from the parse task. No AST-based rules are
+     # implemented yet, so this pass emits nothing; it is the hook where
+     # such rules will plug in.
+     if batch_id and shard_id:
+         for ast_output in runner.iter_prior_outputs(batch_id, "01_parse", shard_id, kind="ast"):
+             path = ast_output.get("path")
+             if not path or path in linted_paths:
+                 continue
+             # Skip chunked ASTs
+             if ast_output.get("format") == "json+chunks":
+                 continue
+             # Future: load the AST from CAS and apply AST-based lint rules
+
+     # Second pass: lint all files in the shard by reading from CAS
+     for file_record in files:
+         path = file_record["path"]
+         object_ref = file_record["object"]
+
+         if path in linted_paths:
+             continue
+
+         try:
+             # Get file content from CAS
+             data = runner.object_store.get_bytes(object_ref)
+
+             # Try to decode as text
+             try:
+                 content = data.decode("utf-8")
+             except UnicodeDecodeError:
+                 # Binary file - skip
+                 continue
+
+             # Run lint rules
+             diagnostics = lint_content(content, path, config)
+             outputs.extend(diagnostics)
+             linted_paths.add(path)
+
+         except Exception as e:
+             outputs.append({
+                 "kind": "diagnostic",
+                 "path": path,
+                 "severity": "error",
+                 "code": "L999",
+                 "message": f"Lint error: {e}",
+                 "line": 1,
+                 "col": 1,
+             })
+
+     return outputs
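
For reference, the text rules above can be exercised without a shard runner. A minimal sketch, assuming lint_content and its helpers from the module above are importable (the module's path inside the package is not visible in this diff):

    source = "def f():\n\tx = 1  \n\t# TODO: refactor"
    for diag in lint_content(source, "example.py", config={}):
        print(diag["code"], diag["severity"], f'{diag["line"]}:{diag["col"]}', diag["message"])
    # Expected: L001 (trailing whitespace, line 2), L003 (TODO, line 3),
    # L004 (tab indentation, lines 2 and 3), L005 (missing final newline)

Each rule operates on plain strings, so the same calls apply to content decoded from CAS inside lint_executor.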
@@ -0,0 +1,297 @@
+ """Parse task executor - produces AST and diagnostic outputs.
+
+ For Phase 1, supports:
+ - Python (via ast module)
+ - JavaScript/TypeScript (simple tokenization)
+ - Text files (line-based tokenization)
+
+ Emits:
+ - kind=ast: AST objects stored in CAS (format=json or format=json+chunks)
+ - kind=diagnostic: Parse errors/warnings
+
+ Enforces a chunking threshold (default 16MB) via chunk manifest objects.
+ """
+
+ import ast
+ import json
+ import re
+ from typing import Iterable, Optional
+
+ from ..common import SCHEMA_VERSION, PRODUCER
+ from ..runner import ShardRunner
+
+
+ # Default chunk size: 16MB
+ DEFAULT_CHUNK_SIZE = 16 * 1024 * 1024
+
+
+ def parse_python(content: str, path: str) -> tuple[Optional[dict], list[dict]]:
+     """Parse Python source code.
+
+     Args:
+         content: Python source code.
+         path: File path for error reporting.
+
+     Returns:
+         Tuple of (AST dict or None, list of diagnostics).
+     """
+     diagnostics = []
+
+     try:
+         tree = ast.parse(content, filename=path)
+         # Convert the AST to a summarized dict to keep output size reasonable
+         nodes = list(ast.walk(tree))
+         ast_dict = {
+             "type": "Module",
+             "ast_mode": "summary",  # Explicit about summarization
+             "body": [
+                 {
+                     "type": node.__class__.__name__,
+                     "lineno": getattr(node, "lineno", None),
+                     "col_offset": getattr(node, "col_offset", None),
+                 }
+                 for node in nodes
+                 if hasattr(node, "lineno")
+             ][:100],  # Limit for reasonable size
+             "stats": {
+                 "total_nodes": len(nodes),
+             },
+         }
+         return ast_dict, diagnostics
+
+     except SyntaxError as e:
+         diagnostics.append({
+             "severity": "error",
+             "code": "E0001",
+             "message": str(e.msg) if e.msg else "Syntax error",
+             "line": e.lineno or 1,
+             "column": e.offset or 1,
+         })
+         return None, diagnostics
+
+
+ def parse_javascript(content: str, path: str) -> tuple[Optional[dict], list[dict]]:
+     """Simple JavaScript/TypeScript tokenization.
+
+     This is a basic tokenizer, not a full parser.
+     For Phase 1, we just identify tokens and structure.
+
+     Args:
+         content: JS/TS source code.
+         path: File path.
+
+     Returns:
+         Tuple of (token info dict or None, list of diagnostics).
+     """
+     diagnostics = []
+
+     # Simple token patterns
+     patterns = {
+         "keyword": r'\b(function|const|let|var|if|else|for|while|return|class|import|export|async|await)\b',
+         "string": r'(["\'])(?:(?!\1)[^\\]|\\.)*\1',
+         "number": r'\b\d+(?:\.\d+)?\b',
+         "comment": r'//.*|/\*[\s\S]*?\*/',
+         "identifier": r'\b[a-zA-Z_$][a-zA-Z0-9_$]*\b',
+     }
+
+     token_counts = {}
+     for token_type, pattern in patterns.items():
+         # re.findall returns tuples when a pattern has capture groups
+         # (e.g. the string pattern); the count is len(matches) either way
+         token_counts[token_type] = len(re.findall(pattern, content))
+
+     # Check for a common issue: unbalanced braces
+     open_braces = content.count('{')
+     close_braces = content.count('}')
+     if open_braces != close_braces:
+         diagnostics.append({
+             "severity": "warning",
+             "code": "W0001",
+             "message": f"Unbalanced braces: {open_braces} open, {close_braces} close",
+             "line": 1,
+             "column": 1,
+         })
+
+     ast_dict = {
+         "type": "TokenInfo",
+         "ast_mode": "tokens",
+         "tokens": token_counts,
+         "stats": {
+             "lines": content.count('\n') + 1,
+             "characters": len(content),
+         },
+     }
+
+     return ast_dict, diagnostics
+
+
+ def parse_text(content: str, path: str) -> tuple[Optional[dict], list[dict]]:
+     """Simple text file tokenization.
+
+     Args:
+         content: Text content.
+         path: File path.
+
+     Returns:
+         Tuple of (token info dict, empty diagnostics).
+     """
+     lines = content.split('\n')
+     words = content.split()
+
+     ast_dict = {
+         "type": "TextInfo",
+         "ast_mode": "text_stats",
+         "stats": {
+             "lines": len(lines),
+             "words": len(words),
+             "characters": len(content),
+             "non_empty_lines": sum(1 for line in lines if line.strip()),
+         },
+     }
+
+     return ast_dict, []
+
+
+ def create_chunk_manifest(
+     data: bytes,
+     kind: str,
+     fmt: str,
+     runner: ShardRunner,
+     chunk_size: int = DEFAULT_CHUNK_SIZE,
+ ) -> tuple[str, dict]:
+     """Create a chunk manifest for large data.
+
+     Args:
+         data: Raw bytes to chunk.
+         kind: Output kind.
+         fmt: Base format identifier; stored as-is in the manifest, while
+             the emitting task records the output's format as fmt+chunks.
+         runner: ShardRunner for CAS access.
+         chunk_size: Target chunk size.
+
+     Returns:
+         Tuple of (manifest object ref, manifest dict).
+     """
+     chunks = []
+     total_bytes = len(data)
+
+     for i in range(0, total_bytes, chunk_size):
+         chunk_data = data[i:i + chunk_size]
+         chunk_ref = runner.object_store.put_bytes(chunk_data)
+         chunks.append({
+             "object": chunk_ref,
+             "size": len(chunk_data),
+             "index": len(chunks),
+         })
+
+     manifest = {
+         "schema_name": "codebatch.chunk_manifest",
+         "schema_version": SCHEMA_VERSION,
+         "producer": PRODUCER,
+         "kind": kind,
+         "format": fmt,
+         "chunks": chunks,
+         "total_bytes": total_bytes,
+         "chunk_size": chunk_size,
+     }
+
+     manifest_bytes = json.dumps(manifest, separators=(",", ":")).encode("utf-8")
+     manifest_ref = runner.object_store.put_bytes(manifest_bytes)
+
+     return manifest_ref, manifest
+
+
+ def parse_executor(config: dict, files: Iterable[dict], runner: ShardRunner) -> list[dict]:
+     """Execute the parse task.
+
+     Args:
+         config: Task configuration.
+         files: Iterable of file records for this shard (may be an iterator).
+         runner: ShardRunner for CAS access.
+
+     Returns:
+         List of output records.
+     """
+     outputs = []
+     chunk_threshold = config.get("chunk_threshold", DEFAULT_CHUNK_SIZE)
+     emit_ast = config.get("emit_ast", True)
+     emit_diagnostics = config.get("emit_diagnostics", True)
+
+     for file_record in files:
+         path = file_record["path"]
+         object_ref = file_record["object"]
+         lang_hint = file_record.get("lang_hint")
+
+         try:
+             # Get file content from CAS
+             data = runner.object_store.get_bytes(object_ref)
+
+             # Try to decode as text
+             try:
+                 content = data.decode("utf-8")
+             except UnicodeDecodeError:
+                 # Binary file - skip
+                 continue
+
+             # Parse based on language
+             if lang_hint == "python":
+                 ast_dict, diagnostics = parse_python(content, path)
+             elif lang_hint in ("javascript", "typescript"):
+                 ast_dict, diagnostics = parse_javascript(content, path)
+             else:
+                 # Text-based formats (markdown, json, yaml, xml, html, css)
+                 # and unknown types get line-based text tokenization
+                 ast_dict, diagnostics = parse_text(content, path)
+
+             # Emit AST output
+             if emit_ast and ast_dict is not None:
+                 ast_bytes = json.dumps(ast_dict, separators=(",", ":")).encode("utf-8")
+
+                 if len(ast_bytes) > chunk_threshold:
+                     # Create chunk manifest - kind stays "ast", format becomes "json+chunks"
+                     manifest_ref, _ = create_chunk_manifest(
+                         ast_bytes, "ast", "json", runner, chunk_threshold
+                     )
+                     outputs.append({
+                         "path": path,
+                         "kind": "ast",  # Semantic kind stays ast
+                         "object": manifest_ref,
+                         "format": "json+chunks",  # Format indicates chunking
+                     })
+                 else:
+                     # Store directly
+                     ast_ref = runner.object_store.put_bytes(ast_bytes)
+                     outputs.append({
+                         "path": path,
+                         "kind": "ast",
+                         "object": ast_ref,
+                         "format": "json",
+                     })
+
+             # Emit diagnostics
+             if emit_diagnostics:
+                 for diag in diagnostics:
+                     outputs.append({
+                         "path": path,
+                         "kind": "diagnostic",
+                         "severity": diag["severity"],
+                         "code": diag["code"],
+                         "message": diag["message"],
+                         "line": diag.get("line"),
+                         "column": diag.get("column"),
+                     })
+
+         except Exception as e:
+             # Emit error diagnostic
+             if emit_diagnostics:
+                 outputs.append({
+                     "path": path,
+                     "kind": "diagnostic",
+                     "severity": "error",
+                     "code": "E9999",
+                     "message": f"Parse error: {e}",
+                     "line": 1,
+                     "column": 1,
+                 })
+
+     return outputs
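
The per-language parsers are similarly self-contained. A minimal sketch of the Python path, again assuming parse_python from the module above is importable:

    ast_dict, diags = parse_python("x = 1\ndef f(y):\n    return y * 2\n", "ok.py")
    print(ast_dict["ast_mode"], ast_dict["stats"]["total_nodes"])  # "summary" plus the node count

    ast_dict, diags = parse_python("def broken(:\n", "bad.py")
    print(ast_dict, diags[0]["code"], diags[0]["line"])  # None E0001 1

parse_executor wraps these calls with CAS reads, the chunking threshold for oversized ASTs, and E9999 error diagnostics for unexpected failures; binary files that fail UTF-8 decoding are simply skipped.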