codebatch-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebatch/__init__.py +3 -0
- codebatch/batch.py +366 -0
- codebatch/cas.py +170 -0
- codebatch/cli.py +432 -0
- codebatch/common.py +104 -0
- codebatch/paths.py +196 -0
- codebatch/query.py +242 -0
- codebatch/runner.py +495 -0
- codebatch/snapshot.py +340 -0
- codebatch/store.py +162 -0
- codebatch/tasks/__init__.py +37 -0
- codebatch/tasks/analyze.py +109 -0
- codebatch/tasks/lint.py +244 -0
- codebatch/tasks/parse.py +304 -0
- codebatch/tasks/symbols.py +223 -0
- codebatch-0.1.0.dist-info/METADATA +66 -0
- codebatch-0.1.0.dist-info/RECORD +19 -0
- codebatch-0.1.0.dist-info/WHEEL +4 -0
- codebatch-0.1.0.dist-info/entry_points.txt +2 -0
codebatch/tasks/lint.py
ADDED
@@ -0,0 +1,244 @@
"""Lint task executor - rule-based diagnostics.

Emits:
- kind=diagnostic: Lint warnings/errors with severity, code, message, location

Inputs:
- Parse outputs (kind=ast) via iter_prior_outputs (preferred)
- Falls back to raw file content for simple text rules

Rules (Phase 2 minimal set):
- L001: Trailing whitespace
- L002: Line too long (>120 chars)
- L003: TODO/FIXME presence
- L004: Tab indentation (prefer spaces)
- L005: Missing newline at end of file
"""

from typing import Iterable

from ..runner import ShardRunner


# Rule configuration
DEFAULT_MAX_LINE_LENGTH = 120
TODO_PATTERNS = ["TODO", "FIXME", "XXX", "HACK"]

def lint_trailing_whitespace(lines: list[str], path: str) -> list[dict]:
    """L001: Detect trailing whitespace."""
    diagnostics = []
    for i, line in enumerate(lines, 1):
        # Don't strip newline, just check for trailing spaces/tabs before it
        stripped = line.rstrip('\n\r')
        if stripped != stripped.rstrip():
            diagnostics.append({
                "kind": "diagnostic",
                "path": path,
                "severity": "warning",
                "code": "L001",
                "message": "Trailing whitespace",
                "line": i,
                "col": len(stripped.rstrip()) + 1,
            })
    return diagnostics


def lint_line_too_long(lines: list[str], path: str, max_length: int = DEFAULT_MAX_LINE_LENGTH) -> list[dict]:
    """L002: Detect lines exceeding max length."""
    diagnostics = []
    for i, line in enumerate(lines, 1):
        stripped = line.rstrip('\n\r')
        if len(stripped) > max_length:
            diagnostics.append({
                "kind": "diagnostic",
                "path": path,
                "severity": "warning",
                "code": "L002",
                "message": f"Line too long ({len(stripped)} > {max_length})",
                "line": i,
                "col": max_length + 1,
            })
    return diagnostics

def lint_todo_fixme(lines: list[str], path: str) -> list[dict]:
    """L003: Detect TODO/FIXME/XXX/HACK comments."""
    diagnostics = []
    for i, line in enumerate(lines, 1):
        upper_line = line.upper()
        for pattern in TODO_PATTERNS:
            if pattern in upper_line:
                col = upper_line.find(pattern) + 1
                diagnostics.append({
                    "kind": "diagnostic",
                    "path": path,
                    "severity": "info",
                    "code": "L003",
                    "message": f"Found {pattern} comment",
                    "line": i,
                    "col": col,
                })
                break  # Only report once per line
    return diagnostics

def lint_tab_indentation(lines: list[str], path: str) -> list[dict]:
    """L004: Detect tab indentation (prefer spaces)."""
    diagnostics = []
    for i, line in enumerate(lines, 1):
        if line.startswith('\t'):
            diagnostics.append({
                "kind": "diagnostic",
                "path": path,
                "severity": "warning",
                "code": "L004",
                "message": "Tab indentation (prefer spaces)",
                "line": i,
                "col": 1,
            })
    return diagnostics


def lint_missing_final_newline(content: str, path: str) -> list[dict]:
    """L005: Detect missing newline at end of file."""
    diagnostics = []
    if content and not content.endswith('\n'):
        lines = content.split('\n')
        diagnostics.append({
            "kind": "diagnostic",
            "path": path,
            "severity": "warning",
            "code": "L005",
            "message": "Missing newline at end of file",
            "line": len(lines),
            "col": len(lines[-1]) + 1 if lines else 1,
        })
    return diagnostics

def lint_content(content: str, path: str, config: dict) -> list[dict]:
    """Run all lint rules on content.

    Args:
        content: File content as string.
        path: File path.
        config: Lint configuration.

    Returns:
        List of diagnostic records.
    """
    diagnostics = []
    lines = content.split('\n')

    # Get config options
    max_line_length = config.get("max_line_length", DEFAULT_MAX_LINE_LENGTH)
    check_trailing = config.get("check_trailing_whitespace", True)
    check_line_length = config.get("check_line_length", True)
    check_todo = config.get("check_todo", True)
    check_tabs = config.get("check_tab_indentation", True)
    check_final_newline = config.get("check_final_newline", True)

    if check_trailing:
        diagnostics.extend(lint_trailing_whitespace(lines, path))

    if check_line_length:
        diagnostics.extend(lint_line_too_long(lines, path, max_line_length))

    if check_todo:
        diagnostics.extend(lint_todo_fixme(lines, path))

    if check_tabs:
        diagnostics.extend(lint_tab_indentation(lines, path))

    if check_final_newline:
        diagnostics.extend(lint_missing_final_newline(content, path))

    return diagnostics

def lint_executor(config: dict, files: Iterable[dict], runner: ShardRunner) -> list[dict]:
    """Execute the lint task.

    Runs lint rules on files in the shard. Prefers AST-based linting but
    falls back to text-based rules for all files.

    Args:
        config: Task configuration.
        files: Iterable of file records for this shard.
        runner: ShardRunner for CAS access.

    Returns:
        List of diagnostic output records.
    """
    outputs = []

    # Get execution context
    batch_id = config.get("_batch_id")
    shard_id = config.get("_shard_id")

    # Track which files we've linted (to avoid duplicates)
    linted_paths = set()

    # First pass: lint files that have AST (from parse task)
    if batch_id and shard_id:
        for ast_output in runner.iter_prior_outputs(batch_id, "01_parse", shard_id, kind="ast"):
            path = ast_output.get("path")
            object_ref = ast_output.get("object")

            if not path or not object_ref or path in linted_paths:
                continue

            # Skip chunked ASTs
            if ast_output.get("format") == "json+chunks":
                continue

            # Placeholder: AST-based lint rules could run here. For now this
            # loop emits nothing; every file is linted by the text rules in
            # the second pass, which reads content directly from CAS.
            pass

    # Second pass: lint all files in shard by reading from CAS
    file_list = list(files)
    for file_record in file_list:
        path = file_record["path"]
        object_ref = file_record["object"]

        if path in linted_paths:
            continue

        try:
            # Get file content from CAS
            data = runner.object_store.get_bytes(object_ref)

            # Try to decode as text
            try:
                content = data.decode("utf-8")
            except UnicodeDecodeError:
                # Binary file - skip
                continue

            # Run lint rules
            diagnostics = lint_content(content, path, config)
            outputs.extend(diagnostics)
            linted_paths.add(path)

        except Exception as e:
            outputs.append({
                "kind": "diagnostic",
                "path": path,
                "severity": "error",
                "code": "L999",
                "message": f"Lint error: {e}",
                "line": 1,
                "col": 1,
            })

    return outputs
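The text rules above operate on plain strings, so lint_content can be exercised without a batch, a shard, or a CAS store. A minimal sketch, assuming the wheel is installed; the sample content and config values below are made up for illustration:

from codebatch.tasks.lint import lint_content

# Sample with a tab indent, trailing spaces, a TODO, and no final newline
sample = "def f():\n\treturn 1  \n# TODO: tidy this up"
config = {"max_line_length": 80}  # unset check_* options default to True

for diag in lint_content(sample, "example.py", config):
    print(diag["code"], f"{diag['line']}:{diag['col']}", diag["message"])
# Emits L001 (trailing whitespace), L003 (TODO), L004 (tab indentation),
# and L005 (missing final newline)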
codebatch/tasks/parse.py
ADDED
@@ -0,0 +1,304 @@
"""Parse task executor - produces AST and diagnostic outputs.

For Phase 1, supports:
- Python (via ast module)
- JavaScript/TypeScript (simple tokenization)
- Text files (line-based tokenization)

Emits:
- kind=ast: AST objects stored in CAS (format=json or format=json+chunks)
- kind=diagnostic: Parse errors/warnings

Enforces chunking threshold (default 16MB) with chunk manifest objects.
"""

import ast
import json
import re
from typing import Iterable, Optional

from ..common import SCHEMA_VERSION, PRODUCER
from ..runner import ShardRunner


# Default chunk size: 16MB
DEFAULT_CHUNK_SIZE = 16 * 1024 * 1024

def parse_python(content: str, path: str) -> tuple[Optional[dict], list[dict]]:
    """Parse Python source code.

    Args:
        content: Python source code.
        path: File path for error reporting.

    Returns:
        Tuple of (AST dict or None, list of diagnostics).
    """
    diagnostics = []

    try:
        tree = ast.parse(content, filename=path)
        # Convert AST to dict - summarized mode for reasonable size.
        # Walk the tree once and reuse the node list for body and stats.
        nodes = list(ast.walk(tree))
        ast_dict = {
            "type": "Module",
            "ast_mode": "summary",  # Explicit about summarization
            "body": [
                {
                    "type": node.__class__.__name__,
                    "lineno": getattr(node, "lineno", None),
                    "col_offset": getattr(node, "col_offset", None),
                }
                for node in nodes
                if hasattr(node, "lineno")
            ][:100],  # Limit for reasonable size
            "stats": {
                "total_nodes": len(nodes),
            },
        }
        return ast_dict, diagnostics

    except SyntaxError as e:
        diagnostics.append({
            "severity": "error",
            "code": "E0001",
            "message": str(e.msg) if e.msg else "Syntax error",
            "line": e.lineno or 1,
            "column": e.offset or 1,
        })
        return None, diagnostics

def parse_javascript(content: str, path: str) -> tuple[Optional[dict], list[dict]]:
    """Simple JavaScript/TypeScript tokenization.

    This is a basic tokenizer, not a full parser.
    For Phase 1, we just identify tokens and structure.

    Args:
        content: JS/TS source code.
        path: File path.

    Returns:
        Tuple of (token info dict or None, list of diagnostics).
    """
    diagnostics = []

    # Simple token patterns
    patterns = {
        "keyword": r'\b(function|const|let|var|if|else|for|while|return|class|import|export|async|await)\b',
        "string": r'(["\'])(?:(?!\1)[^\\]|\\.)*\1',
        "number": r'\b\d+(?:\.\d+)?\b',
        "comment": r'//.*|/\*[\s\S]*?\*/',
        "identifier": r'\b[a-zA-Z_$][a-zA-Z0-9_$]*\b',
    }

    token_counts = {}
    for token_type, pattern in patterns.items():
        # re.findall returns tuples for patterns with capture groups,
        # but either way we only need the match count
        token_counts[token_type] = len(re.findall(pattern, content))

    # Check for common issues: unbalanced braces
    open_braces = content.count('{')
    close_braces = content.count('}')
    if open_braces != close_braces:
        diagnostics.append({
            "severity": "warning",
            "code": "W0001",
            "message": f"Unbalanced braces: {open_braces} open, {close_braces} close",
            "line": 1,
            "column": 1,
        })

    ast_dict = {
        "type": "TokenInfo",
        "ast_mode": "tokens",
        "tokens": token_counts,
        "stats": {
            "lines": content.count('\n') + 1,
            "characters": len(content),
        },
    }

    return ast_dict, diagnostics

def parse_text(content: str, path: str) -> tuple[Optional[dict], list[dict]]:
    """Simple text file tokenization.

    Args:
        content: Text content.
        path: File path.

    Returns:
        Tuple of (token info dict, empty diagnostics).
    """
    lines = content.split('\n')
    words = content.split()

    ast_dict = {
        "type": "TextInfo",
        "ast_mode": "text_stats",
        "stats": {
            "lines": len(lines),
            "words": len(words),
            "characters": len(content),
            "non_empty_lines": sum(1 for line in lines if line.strip()),
        },
    }

    return ast_dict, []

def create_chunk_manifest(
    data: bytes,
    kind: str,
    fmt: str,
    runner: ShardRunner,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
) -> tuple[str, dict]:
    """Create a chunk manifest for large data.

    Args:
        data: Raw bytes to chunk.
        kind: Output kind.
        fmt: Base format identifier. The manifest stores the base format;
            the emitting output record reports it with a "+chunks" suffix.
        runner: ShardRunner for CAS access.
        chunk_size: Target chunk size.

    Returns:
        Tuple of (manifest object ref, manifest dict).
    """
    chunks = []
    total_bytes = len(data)

    for i in range(0, total_bytes, chunk_size):
        chunk_data = data[i:i + chunk_size]
        chunk_ref = runner.object_store.put_bytes(chunk_data)
        chunks.append({
            "object": chunk_ref,
            "size": len(chunk_data),
            "index": len(chunks),
        })

    manifest = {
        "schema_name": "codebatch.chunk_manifest",
        "schema_version": SCHEMA_VERSION,
        "producer": PRODUCER,
        "kind": kind,
        "format": fmt,
        "chunks": chunks,
        "total_bytes": total_bytes,
        "chunk_size": chunk_size,
    }

    manifest_bytes = json.dumps(manifest, separators=(",", ":")).encode("utf-8")
    manifest_ref = runner.object_store.put_bytes(manifest_bytes)

    return manifest_ref, manifest

def parse_executor(config: dict, files: Iterable[dict], runner: ShardRunner) -> list[dict]:
    """Execute the parse task.

    Args:
        config: Task configuration.
        files: Iterable of file records for this shard (may be iterator).
        runner: ShardRunner for CAS access.

    Returns:
        List of output records.
    """
    outputs = []
    chunk_threshold = config.get("chunk_threshold", DEFAULT_CHUNK_SIZE)
    emit_ast = config.get("emit_ast", True)
    emit_diagnostics = config.get("emit_diagnostics", True)

    for file_record in files:
        path = file_record["path"]
        object_ref = file_record["object"]
        lang_hint = file_record.get("lang_hint")

        try:
            # Get file content from CAS
            data = runner.object_store.get_bytes(object_ref)

            # Try to decode as text
            try:
                content = data.decode("utf-8")
            except UnicodeDecodeError:
                # Binary file - skip
                continue

            # Parse based on language
            ast_dict = None
            diagnostics = []

            if lang_hint == "python":
                ast_dict, diagnostics = parse_python(content, path)
            elif lang_hint in ("javascript", "typescript"):
                ast_dict, diagnostics = parse_javascript(content, path)
            elif lang_hint in ("markdown", "json", "yaml", "xml", "html", "css"):
                # Text-based formats
                ast_dict, diagnostics = parse_text(content, path)
            else:
                # Default text tokenization for unknown types
                ast_dict, diagnostics = parse_text(content, path)

            # Emit AST output
            if emit_ast and ast_dict is not None:
                ast_bytes = json.dumps(ast_dict, separators=(",", ":")).encode("utf-8")

                if len(ast_bytes) > chunk_threshold:
                    # Create chunk manifest - kind stays "ast", format becomes "json+chunks"
                    manifest_ref, _ = create_chunk_manifest(
                        ast_bytes, "ast", "json", runner, chunk_threshold
                    )
                    outputs.append({
                        "path": path,
                        "kind": "ast",  # Semantic kind stays ast
                        "object": manifest_ref,
                        "format": "json+chunks",  # Format indicates chunking
                    })
                else:
                    # Store directly
                    ast_ref = runner.object_store.put_bytes(ast_bytes)
                    outputs.append({
                        "path": path,
                        "kind": "ast",
                        "object": ast_ref,
                        "format": "json",
                    })

            # Emit diagnostics
            if emit_diagnostics:
                for diag in diagnostics:
                    outputs.append({
                        "path": path,
                        "kind": "diagnostic",
                        "severity": diag["severity"],
                        "code": diag["code"],
                        "message": diag["message"],
                        "line": diag.get("line"),
                        "column": diag.get("column"),
                    })

        except Exception as e:
            # Emit error diagnostic
            if emit_diagnostics:
                outputs.append({
                    "path": path,
                    "kind": "diagnostic",
                    "severity": "error",
                    "code": "E9999",
                    "message": f"Parse error: {str(e)}",
                    "line": 1,
                    "column": 1,
                })

    return outputs
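Reading a chunked output back is the inverse of create_chunk_manifest: load the manifest JSON, fetch each chunk in index order, and concatenate. A minimal consumer sketch, assuming only the object-store get_bytes call already used above; load_chunked_object itself is not part of the package:

import json

def load_chunked_object(manifest_ref: str, object_store) -> bytes:
    """Reassemble the bytes behind a codebatch.chunk_manifest object."""
    manifest = json.loads(object_store.get_bytes(manifest_ref))
    chunks = sorted(manifest["chunks"], key=lambda c: c["index"])
    data = b"".join(object_store.get_bytes(c["object"]) for c in chunks)
    assert len(data) == manifest["total_bytes"], "chunk manifest size mismatch"
    return data

# For a parse output record with format == "json+chunks", the AST is then:
# ast_dict = json.loads(load_chunked_object(record["object"], runner.object_store))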