recursive-cleaner 1.0.0__tar.gz → 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/CLAUDE.md +4 -2
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/PKG-INFO +2 -2
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/README.md +1 -1
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/pyproject.toml +1 -1
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/__init__.py +2 -1
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/cleaner.py +21 -1
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/prompt.py +8 -4
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/validation.py +40 -1
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_holdout.py +3 -2
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_text_mode.py +1 -1
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_validation.py +184 -3
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/.gitignore +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/AGENTS.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/LICENSE +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/TODO.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/backends/__init__.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/backends/mlx_backend.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/backends/openai_backend.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/demo_tui.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/langchain-analysis.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/langgraph-analysis.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/other-frameworks-analysis.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/smolagents-analysis.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/api-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/data-schema.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/success-criteria.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/text-mode-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier2-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier4-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier4-success-criteria.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/two-pass-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v070-success-criteria.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v080-api-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v080-data-schema.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v080-success-criteria.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v090-cli-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v090-success-criteria.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v100-api-contract.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v100-success-criteria.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/handoffs/tier4-handoff.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/handoffs/v090-research-handoff.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/handoffs/v100-research-handoff.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-tier4.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v03.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v04.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v05.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v080.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/mlx-lm-guide.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/data/dependency.json +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/data/stats.json +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/plan.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/report.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/chonkie-extraction.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/chonkie.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/cli-backend-patterns.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/cli-local-research.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/markitdown.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/rich-tui-patterns.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/v100-apply-mode-research.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/v090-implementation-plan.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/v100-implementation-plan.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/workflow-state.md +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/__main__.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/apply.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/cli.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/context.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/dependencies.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/errors.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/metrics.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/optimizer.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/output.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/parser_generator.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/parsers.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/report.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/response.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/schema.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/tui.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/types.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/vendor/__init__.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/vendor/chunker.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/ecommerce_instructions.txt +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/ecommerce_products.jsonl +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/financial_instructions.txt +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/financial_transactions.jsonl +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/healthcare_instructions.txt +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/healthcare_patients.jsonl +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/run_ecommerce_test.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/run_financial_test.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/run_healthcare_test.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/__init__.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_apply.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_callbacks.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_cleaner.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_cli.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_context.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_dependencies.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_dry_run.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_incremental.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_integration.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_latency.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_metrics.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_openai_backend.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_optimizer.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_output.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_parser_generator.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_parsers.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_report.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_sampling.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_schema.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_tui.py +0 -0
- {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_vendor_chunker.py +0 -0
|
@@ -4,7 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
| Version | Status | Date |
|
|
6
6
|
|---------|--------|------|
|
|
7
|
-
| v1.0.0 | **Implemented** | 2025-01-30 |
|
|
7
|
+
| v1.0.1 | **Implemented** | 2025-02-05 |
|
|
8
|
+
| v1.0.0 | Implemented | 2025-01-30 |
|
|
8
9
|
| v0.9.0 | Implemented | 2025-01-19 |
|
|
9
10
|
| v0.8.0 | Implemented | 2025-01-19 |
|
|
10
11
|
| v0.7.0 | Implemented | 2025-01-17 |
|
|
@@ -16,9 +17,10 @@
|
|
|
16
17
|
| v0.2.0 | Implemented | 2025-01-14 |
|
|
17
18
|
| v0.1.0 | Implemented | 2025-01-14 |
|
|
18
19
|
|
|
19
|
-
**Current State**: v1.0.
|
|
20
|
+
**Current State**: v1.0.1 complete. 555 tests passing.
|
|
20
21
|
|
|
21
22
|
### Version History
|
|
23
|
+
- **v1.0.1**: Return type validation, prompt signature clarity, duplicate field detection
|
|
22
24
|
- **v1.0.0**: Apply mode for applying cleaning functions to data, Excel support, TUI color enhancement
|
|
23
25
|
- **v0.9.0**: CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama)
|
|
24
26
|
- **v0.8.0**: Terminal UI with Rich dashboard, mission control aesthetic, transmission log
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: recursive-cleaner
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.0.1
|
|
4
4
|
Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
|
|
5
5
|
Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
6
6
|
Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
|
|
@@ -389,7 +389,7 @@ backends/
|
|
|
389
389
|
pytest tests/ -v
|
|
390
390
|
```
|
|
391
391
|
|
|
392
|
-
|
|
392
|
+
555 tests covering all features. Test datasets in `test_cases/`:
|
|
393
393
|
- E-commerce product catalogs
|
|
394
394
|
- Healthcare patient records
|
|
395
395
|
- Financial transaction data
|
|
@@ -346,7 +346,7 @@ backends/
|
|
|
346
346
|
pytest tests/ -v
|
|
347
347
|
```
|
|
348
348
|
|
|
349
|
-
|
|
349
|
+
555 tests covering all features. Test datasets in `test_cases/`:
|
|
350
350
|
- E-commerce product catalogs
|
|
351
351
|
- Healthcare patient records
|
|
352
352
|
- Financial transaction data
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "recursive-cleaner"
|
|
7
|
-
version = "1.0.0"
|
|
7
|
+
version = "1.0.1"
|
|
8
8
|
description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -22,7 +22,7 @@ from recursive_cleaner.prompt import build_prompt
|
|
|
22
22
|
from recursive_cleaner.response import extract_python_block, parse_response
|
|
23
23
|
from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
|
|
24
24
|
from recursive_cleaner.tui import HAS_RICH, TUIRenderer
|
|
25
|
-
from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
|
|
25
|
+
from recursive_cleaner.validation import check_code_safety, extract_modified_fields, extract_sample_data, validate_function
|
|
26
26
|
|
|
27
27
|
__all__ = [
|
|
28
28
|
"apply_cleaning",
|
|
@@ -43,6 +43,7 @@ __all__ = [
|
|
|
43
43
|
"validate_function",
|
|
44
44
|
"extract_sample_data",
|
|
45
45
|
"check_code_safety",
|
|
46
|
+
"extract_modified_fields",
|
|
46
47
|
"resolve_dependencies",
|
|
47
48
|
"QualityMetrics",
|
|
48
49
|
"measure_quality",
|
|
@@ -17,7 +17,7 @@ from .prompt import build_prompt
|
|
|
17
17
|
from .response import parse_response
|
|
18
18
|
from .schema import format_schema_for_prompt, infer_schema
|
|
19
19
|
from .types import LLMBackend
|
|
20
|
-
from .validation import check_code_safety, extract_sample_data, split_holdout, validate_function
|
|
20
|
+
from .validation import check_code_safety, extract_modified_fields, extract_sample_data, split_holdout, validate_function
|
|
21
21
|
|
|
22
22
|
STATE_VERSION = "0.5.0"
|
|
23
23
|
|
|
@@ -110,6 +110,8 @@ class DataCleaner:
|
|
|
110
110
|
"min_ms": float("inf"),
|
|
111
111
|
"max_ms": 0.0,
|
|
112
112
|
}
|
|
113
|
+
# Track fields already covered by generated functions (per chunk)
|
|
114
|
+
self._fields_covered: set[str] = set()
|
|
113
115
|
|
|
114
116
|
def _emit(self, event_type: str, chunk_index: int = 0, **kwargs) -> None:
|
|
115
117
|
"""Emit a progress event to the callback, if set."""
|
|
@@ -533,6 +535,8 @@ class DataCleaner:
|
|
|
533
535
|
"""Process a single chunk, iterating until clean or max iterations."""
|
|
534
536
|
self._emit("chunk_start", chunk_index=chunk_idx)
|
|
535
537
|
error_feedback = ""
|
|
538
|
+
# Reset fields covered for new chunk
|
|
539
|
+
self._fields_covered = set()
|
|
536
540
|
|
|
537
541
|
# Dry run mode: just detect issues, don't generate functions
|
|
538
542
|
if self.dry_run:
|
|
@@ -594,6 +598,20 @@ class DataCleaner:
|
|
|
594
598
|
print(f" Safety check failed: {safety_error}")
|
|
595
599
|
continue
|
|
596
600
|
|
|
601
|
+
# Check for duplicate field coverage
|
|
602
|
+
new_fields = extract_modified_fields(result["code"])
|
|
603
|
+
overlap = new_fields & self._fields_covered
|
|
604
|
+
if overlap:
|
|
605
|
+
field_list = ", ".join(sorted(overlap))
|
|
606
|
+
error_feedback = f"You already generated a function for field(s): {field_list}. This issue is solved. Move on to the next unsolved issue."
|
|
607
|
+
self._emit(
|
|
608
|
+
"duplicate_field",
|
|
609
|
+
chunk_index=chunk_idx,
|
|
610
|
+
function_name=result["name"],
|
|
611
|
+
fields=list(overlap),
|
|
612
|
+
)
|
|
613
|
+
continue
|
|
614
|
+
|
|
597
615
|
# Runtime validation if enabled
|
|
598
616
|
if self.validate_runtime:
|
|
599
617
|
# Use holdout data if available, else sample from generation chunk
|
|
@@ -628,6 +646,8 @@ class DataCleaner:
|
|
|
628
646
|
"docstring": result["docstring"],
|
|
629
647
|
"code": result["code"],
|
|
630
648
|
})
|
|
649
|
+
# Track fields covered by this function
|
|
650
|
+
self._fields_covered.update(new_fields)
|
|
631
651
|
# Track for saturation check
|
|
632
652
|
self._recent_new_function_count += 1
|
|
633
653
|
|
|
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
|
|
|
52
52
|
</docstring>
|
|
53
53
|
<code>
|
|
54
54
|
```python
|
|
55
|
-
def merged_function_name(record):
|
|
55
|
+
def merged_function_name(record: dict) -> dict:
|
|
56
|
+
# Modify fields, return record
|
|
56
57
|
...
|
|
57
58
|
```
|
|
58
59
|
</code>
|
|
@@ -108,9 +109,10 @@ Tags: domain, action, detail
|
|
|
108
109
|
</docstring>
|
|
109
110
|
<code>
|
|
110
111
|
```python
|
|
111
|
-
def function_name(data):
|
|
112
|
-
#
|
|
113
|
-
|
|
112
|
+
def function_name(record: dict) -> dict:
|
|
113
|
+
# Modify field(s) in the record
|
|
114
|
+
record["field"] = cleaned_value
|
|
115
|
+
return record
|
|
114
116
|
```
|
|
115
117
|
</code>
|
|
116
118
|
</function_to_generate>
|
|
@@ -120,6 +122,8 @@ def function_name(data):
|
|
|
120
122
|
|
|
121
123
|
RULES:
|
|
122
124
|
- ONE function per response
|
|
125
|
+
- Function signature: takes a dict (one record), returns the modified dict
|
|
126
|
+
- Modify fields directly on the record, then return it
|
|
123
127
|
- If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
|
|
124
128
|
- Include imports inside the function or document needed imports in docstring
|
|
125
129
|
- Function must be idempotent (safe to run multiple times)
|
|
@@ -160,7 +160,10 @@ def validate_function(
|
|
|
160
160
|
# Structured mode: sample_data is list[dict]
|
|
161
161
|
for i, record in enumerate(sample_data):
|
|
162
162
|
try:
|
|
163
|
-
func(record)
|
|
163
|
+
result = func(record)
|
|
164
|
+
# Verify function returns a dict (not string, int, etc.)
|
|
165
|
+
if not isinstance(result, dict):
|
|
166
|
+
return False, f"Function must return dict, got {type(result).__name__}"
|
|
164
167
|
except Exception as e:
|
|
165
168
|
return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
|
|
166
169
|
|
|
@@ -200,3 +203,39 @@ def extract_sample_data(
|
|
|
200
203
|
except json.JSONDecodeError:
|
|
201
204
|
continue
|
|
202
205
|
return samples
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def extract_modified_fields(code: str) -> set[str]:
|
|
209
|
+
"""
|
|
210
|
+
Extract field names that are modified via record["field"] = ... pattern.
|
|
211
|
+
|
|
212
|
+
Args:
|
|
213
|
+
code: Python source code of the function
|
|
214
|
+
|
|
215
|
+
Returns:
|
|
216
|
+
Set of field names that are assigned to
|
|
217
|
+
"""
|
|
218
|
+
try:
|
|
219
|
+
tree = ast.parse(code)
|
|
220
|
+
except SyntaxError:
|
|
221
|
+
return set()
|
|
222
|
+
|
|
223
|
+
fields = set()
|
|
224
|
+
# Common parameter names for the data/record argument
|
|
225
|
+
data_names = {"record", "data"}
|
|
226
|
+
|
|
227
|
+
for node in ast.walk(tree):
|
|
228
|
+
# Look for assignment statements
|
|
229
|
+
if isinstance(node, ast.Assign):
|
|
230
|
+
for target in node.targets:
|
|
231
|
+
# Check if target is a subscript: record["field"] or data["field"]
|
|
232
|
+
if isinstance(target, ast.Subscript):
|
|
233
|
+
# The value should be a Name node (record or data)
|
|
234
|
+
if isinstance(target.value, ast.Name):
|
|
235
|
+
if target.value.id in data_names:
|
|
236
|
+
# The slice should be a string constant
|
|
237
|
+
if isinstance(target.slice, ast.Constant):
|
|
238
|
+
if isinstance(target.slice.value, str):
|
|
239
|
+
fields.add(target.slice.value)
|
|
240
|
+
|
|
241
|
+
return fields
|
|
@@ -246,7 +246,7 @@ def test_cleaner_holdout_validation_uses_holdout_data(tmp_path):
|
|
|
246
246
|
# Records where only the holdout has a specific key
|
|
247
247
|
test_file.write_text('{"name":"A"}\n{"name":"B"}\n{"name":"C"}\n{"name":"D"}\n{"special":"E"}\n')
|
|
248
248
|
|
|
249
|
-
# Function that accesses "special" key - should
|
|
249
|
+
# Function that accesses "special" key - should pass on holdout (which has it)
|
|
250
250
|
response_with_special_access = '''
|
|
251
251
|
<cleaning_analysis>
|
|
252
252
|
<issues_detected>
|
|
@@ -258,7 +258,8 @@ def test_cleaner_holdout_validation_uses_holdout_data(tmp_path):
|
|
|
258
258
|
<code>
|
|
259
259
|
```python
|
|
260
260
|
def get_special(data):
|
|
261
|
-
|
|
261
|
+
data["found_special"] = data["special"]
|
|
262
|
+
return data
|
|
262
263
|
```
|
|
263
264
|
</code>
|
|
264
265
|
</function_to_generate>
|
|
@@ -261,7 +261,7 @@ class TestTextPromptTemplate:
|
|
|
261
261
|
)
|
|
262
262
|
|
|
263
263
|
# Should have structured-specific content
|
|
264
|
-
assert "def function_name(data):" in prompt
|
|
264
|
+
assert "def function_name(record: dict) -> dict:" in prompt
|
|
265
265
|
# Should have schema section
|
|
266
266
|
assert "=== DATA SCHEMA ===" in prompt
|
|
267
267
|
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import pytest
|
|
4
4
|
from recursive_cleaner import DataCleaner, validate_function, extract_sample_data, check_code_safety
|
|
5
|
+
from recursive_cleaner.validation import extract_modified_fields
|
|
5
6
|
|
|
6
7
|
|
|
7
8
|
class MockLLM:
|
|
@@ -22,7 +23,8 @@ def test_validate_function_accepts_working_code():
|
|
|
22
23
|
"""Function that works on sample data is accepted."""
|
|
23
24
|
code = '''
|
|
24
25
|
def process_data(data):
|
|
25
|
-
|
|
26
|
+
data["processed"] = True
|
|
27
|
+
return data
|
|
26
28
|
'''
|
|
27
29
|
sample_data = [{"name": "Alice"}, {"name": "Bob"}]
|
|
28
30
|
valid, error = validate_function(code, sample_data, "process_data")
|
|
@@ -30,6 +32,18 @@ def process_data(data):
|
|
|
30
32
|
assert error is None
|
|
31
33
|
|
|
32
34
|
|
|
35
|
+
def test_validate_function_rejects_wrong_return_type():
|
|
36
|
+
"""Function that returns non-dict is rejected."""
|
|
37
|
+
code = '''
|
|
38
|
+
def bad_return(data):
|
|
39
|
+
return data.get("name", "unknown")
|
|
40
|
+
'''
|
|
41
|
+
sample_data = [{"name": "Alice"}]
|
|
42
|
+
valid, error = validate_function(code, sample_data, "bad_return")
|
|
43
|
+
assert valid is False
|
|
44
|
+
assert "must return dict" in error
|
|
45
|
+
|
|
46
|
+
|
|
33
47
|
def test_validate_function_rejects_key_error():
|
|
34
48
|
"""Function with KeyError on nonexistent key is rejected."""
|
|
35
49
|
code = '''
|
|
@@ -157,7 +171,8 @@ RESPONSE_WITH_GOOD_FUNCTION = '''
|
|
|
157
171
|
<code>
|
|
158
172
|
```python
|
|
159
173
|
def good_processor(data):
|
|
160
|
-
|
|
174
|
+
data["processed"] = True
|
|
175
|
+
return data
|
|
161
176
|
```
|
|
162
177
|
</code>
|
|
163
178
|
</function_to_generate>
|
|
@@ -516,7 +531,8 @@ RESPONSE_SAFE_FUNCTION = '''
|
|
|
516
531
|
<code>
|
|
517
532
|
```python
|
|
518
533
|
def safe_processor(data):
|
|
519
|
-
|
|
534
|
+
data["doubled"] = data.get("value", 0) * 2
|
|
535
|
+
return data
|
|
520
536
|
```
|
|
521
537
|
</code>
|
|
522
538
|
</function_to_generate>
|
|
@@ -557,3 +573,168 @@ def test_cleaner_rejects_dangerous_code_and_retries(tmp_path):
|
|
|
557
573
|
safety_events = [e for e in events if e["type"] == "safety_failed"]
|
|
558
574
|
assert len(safety_events) == 1
|
|
559
575
|
assert "os" in safety_events[0]["error"]
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
# Tests for extract_modified_fields
|
|
579
|
+
|
|
580
|
+
|
|
581
|
+
def test_extract_modified_fields_simple():
|
|
582
|
+
"""Extract single field from record assignment."""
|
|
583
|
+
code = '''
|
|
584
|
+
def fix_phone(record: dict) -> dict:
|
|
585
|
+
record["phone"] = normalize(record["phone"])
|
|
586
|
+
return record
|
|
587
|
+
'''
|
|
588
|
+
fields = extract_modified_fields(code)
|
|
589
|
+
assert fields == {"phone"}
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def test_extract_modified_fields_multiple():
|
|
593
|
+
"""Extract multiple fields from function."""
|
|
594
|
+
code = '''
|
|
595
|
+
def clean_record(record: dict) -> dict:
|
|
596
|
+
record["phone"] = normalize(record["phone"])
|
|
597
|
+
record["email"] = record["email"].lower()
|
|
598
|
+
return record
|
|
599
|
+
'''
|
|
600
|
+
fields = extract_modified_fields(code)
|
|
601
|
+
assert fields == {"phone", "email"}
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def test_extract_modified_fields_data_param():
|
|
605
|
+
"""Works with 'data' parameter name too."""
|
|
606
|
+
code = '''
|
|
607
|
+
def fix_status(data: dict) -> dict:
|
|
608
|
+
data["status"] = data["status"].lower()
|
|
609
|
+
return data
|
|
610
|
+
'''
|
|
611
|
+
fields = extract_modified_fields(code)
|
|
612
|
+
assert fields == {"status"}
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def test_extract_modified_fields_no_assignments():
|
|
616
|
+
"""Returns empty set when no field assignments."""
|
|
617
|
+
code = '''
|
|
618
|
+
def passthrough(record: dict) -> dict:
|
|
619
|
+
return record
|
|
620
|
+
'''
|
|
621
|
+
fields = extract_modified_fields(code)
|
|
622
|
+
assert fields == set()
|
|
623
|
+
|
|
624
|
+
|
|
625
|
+
def test_extract_modified_fields_nested_not_extracted():
|
|
626
|
+
"""Only extracts direct record assignments, not nested."""
|
|
627
|
+
code = '''
|
|
628
|
+
def process(record: dict) -> dict:
|
|
629
|
+
temp = {}
|
|
630
|
+
temp["key"] = "value"
|
|
631
|
+
record["result"] = temp
|
|
632
|
+
return record
|
|
633
|
+
'''
|
|
634
|
+
fields = extract_modified_fields(code)
|
|
635
|
+
assert fields == {"result"}
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
# Integration test: duplicate field detection in DataCleaner
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def test_cleaner_rejects_duplicate_field_with_feedback(tmp_path):
|
|
642
|
+
"""Cleaner rejects functions that modify already-covered fields."""
|
|
643
|
+
test_file = tmp_path / "test.jsonl"
|
|
644
|
+
test_file.write_text('{"phone": "555-1234", "status": "active"}\n')
|
|
645
|
+
|
|
646
|
+
# First response: phone function (should be accepted)
|
|
647
|
+
response_phone = '''
|
|
648
|
+
<cleaning_analysis>
|
|
649
|
+
<issues_detected>
|
|
650
|
+
<issue id="1" solved="false">Phone needs normalization</issue>
|
|
651
|
+
</issues_detected>
|
|
652
|
+
<function_to_generate>
|
|
653
|
+
<name>normalize_phone</name>
|
|
654
|
+
<docstring>Normalize phone numbers.</docstring>
|
|
655
|
+
<code>
|
|
656
|
+
```python
|
|
657
|
+
def normalize_phone(record: dict) -> dict:
|
|
658
|
+
record["phone"] = record["phone"].replace("-", "")
|
|
659
|
+
return record
|
|
660
|
+
```
|
|
661
|
+
</code>
|
|
662
|
+
</function_to_generate>
|
|
663
|
+
<chunk_status>needs_more_work</chunk_status>
|
|
664
|
+
</cleaning_analysis>
|
|
665
|
+
'''
|
|
666
|
+
|
|
667
|
+
# Second response: another phone function (should be rejected as duplicate)
|
|
668
|
+
response_phone_again = '''
|
|
669
|
+
<cleaning_analysis>
|
|
670
|
+
<issues_detected>
|
|
671
|
+
<issue id="1" solved="false">Phone still needs work</issue>
|
|
672
|
+
</issues_detected>
|
|
673
|
+
<function_to_generate>
|
|
674
|
+
<name>fix_phone_format</name>
|
|
675
|
+
<docstring>Fix phone format.</docstring>
|
|
676
|
+
<code>
|
|
677
|
+
```python
|
|
678
|
+
def fix_phone_format(record: dict) -> dict:
|
|
679
|
+
record["phone"] = "+1" + record["phone"]
|
|
680
|
+
return record
|
|
681
|
+
```
|
|
682
|
+
</code>
|
|
683
|
+
</function_to_generate>
|
|
684
|
+
<chunk_status>needs_more_work</chunk_status>
|
|
685
|
+
</cleaning_analysis>
|
|
686
|
+
'''
|
|
687
|
+
|
|
688
|
+
# Third response: status function (should be accepted - different field)
|
|
689
|
+
response_status = '''
|
|
690
|
+
<cleaning_analysis>
|
|
691
|
+
<issues_detected>
|
|
692
|
+
<issue id="1" solved="true">Phone handled</issue>
|
|
693
|
+
<issue id="2" solved="false">Status needs fixing</issue>
|
|
694
|
+
</issues_detected>
|
|
695
|
+
<function_to_generate>
|
|
696
|
+
<name>fix_status</name>
|
|
697
|
+
<docstring>Fix status field.</docstring>
|
|
698
|
+
<code>
|
|
699
|
+
```python
|
|
700
|
+
def fix_status(record: dict) -> dict:
|
|
701
|
+
record["status"] = record["status"].lower()
|
|
702
|
+
return record
|
|
703
|
+
```
|
|
704
|
+
</code>
|
|
705
|
+
</function_to_generate>
|
|
706
|
+
<chunk_status>needs_more_work</chunk_status>
|
|
707
|
+
</cleaning_analysis>
|
|
708
|
+
'''
|
|
709
|
+
|
|
710
|
+
response_clean = '''
|
|
711
|
+
<cleaning_analysis>
|
|
712
|
+
<issues_detected>
|
|
713
|
+
<issue id="1" solved="true">Phone handled</issue>
|
|
714
|
+
<issue id="2" solved="true">Status handled</issue>
|
|
715
|
+
</issues_detected>
|
|
716
|
+
<chunk_status>clean</chunk_status>
|
|
717
|
+
</cleaning_analysis>
|
|
718
|
+
'''
|
|
719
|
+
|
|
720
|
+
mock_llm = MockLLM([response_phone, response_phone_again, response_status, response_clean])
|
|
721
|
+
|
|
722
|
+
cleaner = DataCleaner(
|
|
723
|
+
llm_backend=mock_llm,
|
|
724
|
+
file_path=str(test_file),
|
|
725
|
+
chunk_size=10,
|
|
726
|
+
validate_runtime=True,
|
|
727
|
+
)
|
|
728
|
+
cleaner.run()
|
|
729
|
+
|
|
730
|
+
# Should have 2 functions: normalize_phone and fix_status
|
|
731
|
+
# fix_phone_format should have been rejected as duplicate
|
|
732
|
+
assert len(cleaner.functions) == 2
|
|
733
|
+
function_names = [f["name"] for f in cleaner.functions]
|
|
734
|
+
assert "normalize_phone" in function_names
|
|
735
|
+
assert "fix_status" in function_names
|
|
736
|
+
assert "fix_phone_format" not in function_names
|
|
737
|
+
|
|
738
|
+
# The retry prompt should mention "already generated" or "duplicate"
|
|
739
|
+
assert any("phone" in call.lower() and ("already" in call.lower() or "duplicate" in call.lower())
|
|
740
|
+
for call in mock_llm.calls[2:]) # Check prompts after first acceptance
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/other-frameworks-analysis.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier4-success-criteria.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/data/dependency.json
RENAMED
|
File without changes
|
{recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/data/stats.json
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/v100-apply-mode-research.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|