recursive-cleaner 0.8.0__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- backends/__init__.py +2 -1
- backends/openai_backend.py +71 -0
- recursive_cleaner/__init__.py +4 -1
- recursive_cleaner/__main__.py +8 -0
- recursive_cleaner/apply.py +483 -0
- recursive_cleaner/cleaner.py +27 -5
- recursive_cleaner/cli.py +395 -0
- recursive_cleaner/prompt.py +8 -4
- recursive_cleaner/tui.py +43 -24
- recursive_cleaner/validation.py +40 -1
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/METADATA +100 -4
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/RECORD +15 -10
- recursive_cleaner-1.0.1.dist-info/entry_points.txt +2 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.8.0.dist-info → recursive_cleaner-1.0.1.dist-info}/licenses/LICENSE +0 -0
recursive_cleaner/cli.py
ADDED
|
@@ -0,0 +1,395 @@
|
|
|
1
|
+
"""CLI interface for Recursive Data Cleaner."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def create_backend(provider: str, model: str, base_url: str | None, api_key: str | None):
|
|
9
|
+
"""
|
|
10
|
+
Factory function to create the appropriate backend.
|
|
11
|
+
|
|
12
|
+
Args:
|
|
13
|
+
provider: Backend provider ("mlx" or "openai")
|
|
14
|
+
model: Model name/path
|
|
15
|
+
base_url: Optional API base URL (for openai-compatible servers)
|
|
16
|
+
api_key: Optional API key
|
|
17
|
+
|
|
18
|
+
Returns:
|
|
19
|
+
LLMBackend instance
|
|
20
|
+
|
|
21
|
+
Raises:
|
|
22
|
+
SystemExit: With code 2 if provider is invalid or import fails
|
|
23
|
+
"""
|
|
24
|
+
if provider == "mlx":
|
|
25
|
+
try:
|
|
26
|
+
from backends import MLXBackend
|
|
27
|
+
return MLXBackend(model_path=model)
|
|
28
|
+
except ImportError:
|
|
29
|
+
print("Error: MLX backend requires mlx-lm. Install with: pip install mlx-lm", file=sys.stderr)
|
|
30
|
+
sys.exit(2)
|
|
31
|
+
elif provider == "openai":
|
|
32
|
+
try:
|
|
33
|
+
from backends import OpenAIBackend
|
|
34
|
+
return OpenAIBackend(model=model, api_key=api_key, base_url=base_url)
|
|
35
|
+
except ImportError as e:
|
|
36
|
+
print(f"Error: {e}", file=sys.stderr)
|
|
37
|
+
sys.exit(2)
|
|
38
|
+
else:
|
|
39
|
+
print(f"Error: Unknown provider '{provider}'. Use 'mlx' or 'openai'.", file=sys.stderr)
|
|
40
|
+
sys.exit(2)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def read_instructions(value: str) -> str:
    """
    Read cleaning instructions from inline text, a file, or stdin.

    Args:
        value: Instructions string, "@path/to/file.txt" to read from a
            file, or "-" to read from stdin.

    Returns:
        Instructions text (file/stdin contents are whitespace-stripped).

    Raises:
        SystemExit: With code 1 if the referenced file is missing or
            unreadable.
    """
    if value.startswith("@"):
        file_path = value[1:]
        try:
            # Explicit encoding: instruction files are read as UTF-8
            # regardless of the platform's default locale encoding.
            with open(file_path, "r", encoding="utf-8") as f:
                return f.read().strip()
        except FileNotFoundError:
            print(f"Error: Instructions file not found: {file_path}", file=sys.stderr)
            sys.exit(1)
        except IOError as e:
            print(f"Error reading instructions file: {e}", file=sys.stderr)
            sys.exit(1)
    elif value == "-":
        return sys.stdin.read().strip()
    return value
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def cmd_generate(args) -> int:
    """
    Handle the generate command.

    Args:
        args: Parsed argparse namespace for the ``generate`` subcommand.

    Returns:
        0 on success, 1 for file errors, 3 for unexpected failures.
        (Backend creation may exit with code 2 via create_backend.)
    """
    # Fail fast on a missing input file BEFORE importing the heavy
    # cleaning pipeline or constructing an LLM backend.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    # Deferred import keeps the cheap error path above dependency-free.
    from recursive_cleaner import DataCleaner

    # Create backend (may sys.exit(2) on its own)
    backend = create_backend(args.provider, args.model, args.base_url, args.api_key)

    # Read instructions (inline text, @file, or "-")
    instructions = read_instructions(args.instructions) if args.instructions else ""

    # Progress callback for non-TUI mode; the TUI renders its own view.
    def on_progress(event):
        if not args.tui:
            event_type = event.get("type", "")
            if event_type == "function_generated":
                print(f" Generated: {event.get('function_name', '')}")

    try:
        cleaner = DataCleaner(
            llm_backend=backend,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            state_file=args.state_file,
            report_path=args.report if args.report else None,
            tui=args.tui,
            optimize=args.optimize,
            track_metrics=args.track_metrics,
            early_termination=args.early_termination,
            on_progress=on_progress if not args.tui else None,
            output_path=args.output,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def cmd_analyze(args) -> int:
    """
    Handle the analyze command (dry-run mode).

    Args:
        args: Parsed argparse namespace for the ``analyze`` subcommand.

    Returns:
        0 on success, 1 for file errors, 3 for unexpected failures.
        (Backend creation may exit with code 2 via create_backend.)
    """
    # Fail fast on a missing input file BEFORE importing the heavy
    # cleaning pipeline or constructing an LLM backend.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    # Deferred import keeps the cheap error path above dependency-free.
    from recursive_cleaner import DataCleaner

    # Create backend (may sys.exit(2) on its own)
    backend = create_backend(args.provider, args.model, args.base_url, args.api_key)

    # Read instructions (inline text, @file, or "-")
    instructions = read_instructions(args.instructions) if args.instructions else ""

    # Progress callback prints per-chunk issue counts in non-TUI mode.
    def on_progress(event):
        if not args.tui:
            event_type = event.get("type", "")
            if event_type == "issues_detected":
                issues = event.get("issues", [])
                chunk_idx = event.get("chunk_index", 0)
                unsolved = [i for i in issues if not i.get("solved", False)]
                print(f"Chunk {chunk_idx + 1}: {len(issues)} issues ({len(unsolved)} unsolved)")

    try:
        cleaner = DataCleaner(
            llm_backend=backend,
            file_path=args.file,
            chunk_size=args.chunk_size,
            instructions=instructions,
            max_iterations=args.max_iterations,
            mode=args.mode,
            dry_run=True,
            tui=args.tui,
            on_progress=on_progress if not args.tui else None,
        )
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def cmd_apply(args) -> int:
    """
    Handle the apply command.

    Args:
        args: Parsed argparse namespace for the ``apply`` subcommand.

    Returns:
        0 on success, 1 for file errors, 2 for import errors raised by
        apply_cleaning, 3 for unexpected failures.
    """
    # Validate both paths BEFORE importing the apply machinery so the
    # cheap error paths work without the package's runtime dependencies.
    if not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        return 1

    if not os.path.exists(args.functions):
        print(f"Error: Functions file not found: {args.functions}", file=sys.stderr)
        return 1

    from recursive_cleaner.apply import apply_cleaning

    try:
        output_path = apply_cleaning(
            input_path=args.file,
            functions_path=args.functions,
            output_path=args.output,
        )
        print(f"Cleaned data written to: {output_path}")
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ImportError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 2
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def cmd_resume(args) -> int:
    """
    Handle the resume command.

    Args:
        args: Parsed argparse namespace for the ``resume`` subcommand.

    Returns:
        0 on success, 1 for file/state errors, 3 for unexpected failures.
        (Backend creation may exit with code 2 via create_backend.)
    """
    # Fail fast on a missing checkpoint BEFORE importing the heavy
    # cleaning pipeline or constructing an LLM backend.
    if not os.path.exists(args.state_file):
        print(f"Error: State file not found: {args.state_file}", file=sys.stderr)
        return 1

    # Deferred import keeps the cheap error path above dependency-free.
    from recursive_cleaner import DataCleaner

    # Create backend (may sys.exit(2) on its own)
    backend = create_backend(args.provider, args.model, args.base_url, args.api_key)

    try:
        cleaner = DataCleaner.resume(args.state_file, backend)
        cleaner.run()
        return 0
    except FileNotFoundError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        # Checkpoint exists but could not be parsed/validated.
        print(f"Error: Invalid state file: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        return 3
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _add_backend_args(sub) -> None:
    """Attach backend-selection flags shared by generate/analyze/resume."""
    sub.add_argument(
        "-p", "--provider", required=True, choices=["mlx", "openai"],
        help="LLM provider (mlx or openai)"
    )
    sub.add_argument(
        "-m", "--model", required=True, help="Model name/path"
    )
    sub.add_argument(
        "--base-url", help="API base URL (for openai-compatible servers)"
    )
    sub.add_argument(
        "--api-key", help="API key (or use OPENAI_API_KEY env var)"
    )


def _add_chunking_args(sub) -> None:
    """Attach instruction/chunking flags shared by generate/analyze."""
    sub.add_argument(
        "-i", "--instructions", default="",
        help="Cleaning instructions (text or @file.txt)"
    )
    sub.add_argument(
        "--chunk-size", type=int, default=50, help="Items per chunk (default: 50)"
    )
    sub.add_argument(
        "--max-iterations", type=int, default=5,
        help="Max iterations per chunk (default: 5)"
    )
    sub.add_argument(
        "--mode", choices=["auto", "structured", "text"], default="auto",
        help="Processing mode (default: auto)"
    )


def create_parser() -> argparse.ArgumentParser:
    """Create the argument parser with all subcommands.

    Subcommands: generate, analyze, resume, apply. Flag groups shared by
    several subcommands are factored into _add_backend_args and
    _add_chunking_args; flag names, defaults, and semantics are unchanged
    (only the help-listing order of shared flags may differ).
    """
    parser = argparse.ArgumentParser(
        prog="recursive-cleaner",
        description="LLM-powered incremental data cleaning pipeline",
    )

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --- generate command ---
    gen_parser = subparsers.add_parser(
        "generate",
        help="Generate cleaning functions from data file",
    )
    gen_parser.add_argument("file", metavar="FILE", help="Path to input data file")
    _add_backend_args(gen_parser)
    _add_chunking_args(gen_parser)
    gen_parser.add_argument(
        "-o", "--output", default="cleaning_functions.py",
        help="Output file path (default: cleaning_functions.py)"
    )
    gen_parser.add_argument(
        "--report", default="cleaning_report.md",
        help="Report file path (empty to disable, default: cleaning_report.md)"
    )
    gen_parser.add_argument(
        "--state-file", help="Checkpoint file for resume"
    )
    gen_parser.add_argument(
        "--tui", action="store_true", help="Enable Rich terminal dashboard"
    )
    gen_parser.add_argument(
        "--optimize", action="store_true", help="Consolidate redundant functions"
    )
    gen_parser.add_argument(
        "--track-metrics", action="store_true", help="Measure before/after quality"
    )
    gen_parser.add_argument(
        "--early-termination", action="store_true",
        help="Stop on pattern saturation"
    )
    gen_parser.set_defaults(func=cmd_generate)

    # --- analyze command ---
    analyze_parser = subparsers.add_parser(
        "analyze",
        help="Dry-run analysis without generating functions",
    )
    analyze_parser.add_argument("file", metavar="FILE", help="Path to input data file")
    _add_backend_args(analyze_parser)
    _add_chunking_args(analyze_parser)
    analyze_parser.add_argument(
        "--tui", action="store_true", help="Enable Rich terminal dashboard"
    )
    analyze_parser.set_defaults(func=cmd_analyze)

    # --- resume command ---
    resume_parser = subparsers.add_parser(
        "resume",
        help="Resume from checkpoint file",
    )
    resume_parser.add_argument(
        "state_file", metavar="STATE_FILE", help="Path to checkpoint JSON file"
    )
    _add_backend_args(resume_parser)
    resume_parser.set_defaults(func=cmd_resume)

    # --- apply command ---
    apply_parser = subparsers.add_parser(
        "apply",
        help="Apply cleaning functions to data file",
    )
    apply_parser.add_argument("file", metavar="FILE", help="Path to input data file")
    apply_parser.add_argument(
        "-f", "--functions", required=True,
        help="Path to cleaning_functions.py"
    )
    apply_parser.add_argument(
        "-o", "--output", help="Output file path (default: <input>.cleaned.<ext>)"
    )
    apply_parser.set_defaults(func=cmd_apply)

    return parser
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
def main(args: list[str] | None = None) -> int:
    """
    Main entry point for the CLI.

    Args:
        args: Command-line arguments (defaults to sys.argv[1:])

    Returns:
        Exit code (0=success, 1=general error, 2=backend error, 3=validation error)
    """
    parser = create_parser()
    namespace = parser.parse_args(args)

    # No subcommand given: show help and exit successfully.
    if namespace.command is None:
        parser.print_help()
        return 0

    # Dispatch to the handler registered via set_defaults(func=...).
    return namespace.func(namespace)


if __name__ == "__main__":
    sys.exit(main())
|
recursive_cleaner/prompt.py
CHANGED
|
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
|
|
|
52
52
|
</docstring>
|
|
53
53
|
<code>
|
|
54
54
|
```python
|
|
55
|
-
def merged_function_name(record):
|
|
55
|
+
def merged_function_name(record: dict) -> dict:
|
|
56
|
+
# Modify fields, return record
|
|
56
57
|
...
|
|
57
58
|
```
|
|
58
59
|
</code>
|
|
@@ -108,9 +109,10 @@ Tags: domain, action, detail
|
|
|
108
109
|
</docstring>
|
|
109
110
|
<code>
|
|
110
111
|
```python
|
|
111
|
-
def function_name(
|
|
112
|
-
#
|
|
113
|
-
|
|
112
|
+
def function_name(record: dict) -> dict:
|
|
113
|
+
# Modify field(s) in the record
|
|
114
|
+
record["field"] = cleaned_value
|
|
115
|
+
return record
|
|
114
116
|
```
|
|
115
117
|
</code>
|
|
116
118
|
</function_to_generate>
|
|
@@ -120,6 +122,8 @@ def function_name(data):
|
|
|
120
122
|
|
|
121
123
|
RULES:
|
|
122
124
|
- ONE function per response
|
|
125
|
+
- Function signature: takes a dict (one record), returns the modified dict
|
|
126
|
+
- Modify fields directly on the record, then return it
|
|
123
127
|
- If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
|
|
124
128
|
- Include imports inside the function or document needed imports in docstring
|
|
125
129
|
- Function must be idempotent (safe to run multiple times)
|
recursive_cleaner/tui.py
CHANGED
|
@@ -505,19 +505,28 @@ class TUIRenderer:
|
|
|
505
505
|
)
|
|
506
506
|
self._layout["left_panel"].update(left_panel)
|
|
507
507
|
|
|
508
|
-
def
|
|
509
|
-
"""Parse LLM XML response into
|
|
508
|
+
def _colorize_transmission(self, response: str) -> "Text":
|
|
509
|
+
"""Parse LLM XML response into colorized Rich Text for transmission log.
|
|
510
|
+
|
|
511
|
+
Color scheme:
|
|
512
|
+
- Issues (solved): dim
|
|
513
|
+
- Issues (unsolved): bright_white with cycling accent (blue/magenta/cyan/yellow)
|
|
514
|
+
- Function names: green
|
|
515
|
+
- Docstrings: italic
|
|
516
|
+
- Status clean: green
|
|
517
|
+
- Status needs_more_work: yellow
|
|
510
518
|
|
|
511
519
|
Args:
|
|
512
520
|
response: Raw LLM response text (XML format)
|
|
513
521
|
|
|
514
522
|
Returns:
|
|
515
|
-
|
|
516
|
-
generated, and chunk status.
|
|
523
|
+
Rich Text object with colors applied.
|
|
517
524
|
"""
|
|
518
525
|
import re
|
|
519
526
|
|
|
520
|
-
|
|
527
|
+
ISSUE_COLORS = ["blue", "magenta", "cyan", "yellow"]
|
|
528
|
+
text = Text()
|
|
529
|
+
unsolved_index = 0
|
|
521
530
|
|
|
522
531
|
try:
|
|
523
532
|
# Find all issues
|
|
@@ -525,53 +534,63 @@ class TUIRenderer:
|
|
|
525
534
|
issues = re.findall(issue_pattern, response, re.DOTALL)
|
|
526
535
|
|
|
527
536
|
if issues:
|
|
528
|
-
|
|
537
|
+
text.append("ISSUES DETECTED:\n", style="bold cyan")
|
|
529
538
|
for issue_id, solved, desc in issues[:8]: # Limit to 8 issues
|
|
530
|
-
marker = "\u2713" if solved == "true" else "\u2717" # checkmark or X
|
|
531
539
|
desc_clean = desc.strip()[:40] # Truncate description
|
|
532
|
-
|
|
540
|
+
if solved == "true":
|
|
541
|
+
text.append(" \u2713 ", style="green")
|
|
542
|
+
text.append(f"{desc_clean}\n", style="dim")
|
|
543
|
+
else:
|
|
544
|
+
accent = ISSUE_COLORS[unsolved_index % len(ISSUE_COLORS)]
|
|
545
|
+
text.append(" \u2717 ", style=accent)
|
|
546
|
+
text.append(f"{desc_clean}\n", style="bright_white")
|
|
547
|
+
unsolved_index += 1
|
|
533
548
|
if len(issues) > 8:
|
|
534
|
-
|
|
535
|
-
|
|
549
|
+
text.append(f" (+{len(issues) - 8} more)\n", style="dim")
|
|
550
|
+
text.append("\n")
|
|
536
551
|
|
|
537
552
|
# Find function being generated
|
|
538
553
|
name_match = re.search(r'<name>([^<]+)</name>', response)
|
|
539
554
|
docstring_match = re.search(r'<docstring>([^<]+)</docstring>', response, re.DOTALL)
|
|
540
555
|
|
|
541
556
|
if name_match:
|
|
542
|
-
|
|
557
|
+
text.append("GENERATING: ", style="bold cyan")
|
|
558
|
+
text.append(f"{name_match.group(1).strip()}\n", style="green bold")
|
|
543
559
|
if docstring_match:
|
|
544
560
|
doc = docstring_match.group(1).strip()[:60]
|
|
545
|
-
|
|
546
|
-
|
|
561
|
+
text.append(f' "{doc}..."\n', style="italic")
|
|
562
|
+
text.append("\n")
|
|
547
563
|
|
|
548
564
|
# Find chunk status
|
|
549
565
|
status_match = re.search(r'<chunk_status>([^<]+)</chunk_status>', response)
|
|
550
566
|
if status_match:
|
|
551
567
|
status = status_match.group(1).strip()
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
568
|
+
text.append("STATUS: ", style="bold cyan")
|
|
569
|
+
if status == "clean":
|
|
570
|
+
text.append(status.upper(), style="green bold")
|
|
571
|
+
else:
|
|
572
|
+
text.append(status.upper().replace("_", " "), style="yellow bold")
|
|
573
|
+
|
|
574
|
+
if text.plain:
|
|
575
|
+
return text
|
|
556
576
|
except Exception:
|
|
557
577
|
pass
|
|
558
578
|
|
|
559
579
|
# Fallback: show truncated raw response
|
|
560
|
-
|
|
580
|
+
fallback = response[:500] + "..." if len(response) > 500 else response
|
|
581
|
+
return Text(fallback, style="dim cyan")
|
|
561
582
|
|
|
562
583
|
def _refresh_right_panel(self) -> None:
|
|
563
|
-
"""Refresh the right panel with
|
|
584
|
+
"""Refresh the right panel with colorized transmission log."""
|
|
564
585
|
if not HAS_RICH or self._layout is None:
|
|
565
586
|
return
|
|
566
587
|
|
|
567
|
-
# Get last response and
|
|
588
|
+
# Get last response and colorize for display
|
|
568
589
|
response = self._state.last_response
|
|
569
590
|
if not response:
|
|
570
|
-
|
|
591
|
+
log_text = Text("(Awaiting transmission...)", style="dim cyan")
|
|
571
592
|
else:
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
log_text = Text(display_text, style="dim cyan")
|
|
593
|
+
log_text = self._colorize_transmission(response)
|
|
575
594
|
|
|
576
595
|
right_panel = Panel(
|
|
577
596
|
log_text,
|
recursive_cleaner/validation.py
CHANGED
|
@@ -160,7 +160,10 @@ def validate_function(
|
|
|
160
160
|
# Structured mode: sample_data is list[dict]
|
|
161
161
|
for i, record in enumerate(sample_data):
|
|
162
162
|
try:
|
|
163
|
-
func(record)
|
|
163
|
+
result = func(record)
|
|
164
|
+
# Verify function returns a dict (not string, int, etc.)
|
|
165
|
+
if not isinstance(result, dict):
|
|
166
|
+
return False, f"Function must return dict, got {type(result).__name__}"
|
|
164
167
|
except Exception as e:
|
|
165
168
|
return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
|
|
166
169
|
|
|
@@ -200,3 +203,39 @@ def extract_sample_data(
|
|
|
200
203
|
except json.JSONDecodeError:
|
|
201
204
|
continue
|
|
202
205
|
return samples
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def extract_modified_fields(code: str) -> set[str]:
|
|
209
|
+
"""
|
|
210
|
+
Extract field names that are modified via record["field"] = ... pattern.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
code: Python source code of the function
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
Set of field names that are assigned to
|
|
217
|
+
"""
|
|
218
|
+
try:
|
|
219
|
+
tree = ast.parse(code)
|
|
220
|
+
except SyntaxError:
|
|
221
|
+
return set()
|
|
222
|
+
|
|
223
|
+
fields = set()
|
|
224
|
+
# Common parameter names for the data/record argument
|
|
225
|
+
data_names = {"record", "data"}
|
|
226
|
+
|
|
227
|
+
for node in ast.walk(tree):
|
|
228
|
+
# Look for assignment statements
|
|
229
|
+
if isinstance(node, ast.Assign):
|
|
230
|
+
for target in node.targets:
|
|
231
|
+
# Check if target is a subscript: record["field"] or data["field"]
|
|
232
|
+
if isinstance(target, ast.Subscript):
|
|
233
|
+
# The value should be a Name node (record or data)
|
|
234
|
+
if isinstance(target.value, ast.Name):
|
|
235
|
+
if target.value.id in data_names:
|
|
236
|
+
# The slice should be a string constant
|
|
237
|
+
if isinstance(target.slice, ast.Constant):
|
|
238
|
+
if isinstance(target.slice.value, str):
|
|
239
|
+
fields.add(target.slice.value)
|
|
240
|
+
|
|
241
|
+
return fields
|