recursive-cleaner 1.0.0__tar.gz → 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/CLAUDE.md +4 -2
  2. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/PKG-INFO +2 -2
  3. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/README.md +1 -1
  4. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/pyproject.toml +1 -1
  5. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/__init__.py +2 -1
  6. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/cleaner.py +21 -1
  7. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/prompt.py +8 -4
  8. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/validation.py +40 -1
  9. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_holdout.py +3 -2
  10. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_text_mode.py +1 -1
  11. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_validation.py +184 -3
  12. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/.gitignore +0 -0
  13. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/AGENTS.md +0 -0
  14. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/LICENSE +0 -0
  15. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/TODO.md +0 -0
  16. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/backends/__init__.py +0 -0
  17. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/backends/mlx_backend.py +0 -0
  18. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/backends/openai_backend.py +0 -0
  19. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/demo_tui.py +0 -0
  20. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/langchain-analysis.md +0 -0
  21. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/langgraph-analysis.md +0 -0
  22. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/other-frameworks-analysis.md +0 -0
  23. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/archive/smolagents-analysis.md +0 -0
  24. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/api-contract.md +0 -0
  25. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/data-schema.md +0 -0
  26. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/success-criteria.md +0 -0
  27. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/text-mode-contract.md +0 -0
  28. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier2-contract.md +0 -0
  29. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier4-contract.md +0 -0
  30. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/tier4-success-criteria.md +0 -0
  31. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/two-pass-contract.md +0 -0
  32. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v070-success-criteria.md +0 -0
  33. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v080-api-contract.md +0 -0
  34. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v080-data-schema.md +0 -0
  35. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v080-success-criteria.md +0 -0
  36. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v090-cli-contract.md +0 -0
  37. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v090-success-criteria.md +0 -0
  38. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v100-api-contract.md +0 -0
  39. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/contracts/v100-success-criteria.md +0 -0
  40. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/handoffs/tier4-handoff.md +0 -0
  41. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/handoffs/v090-research-handoff.md +0 -0
  42. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/handoffs/v100-research-handoff.md +0 -0
  43. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-tier4.md +0 -0
  44. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v03.md +0 -0
  45. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v04.md +0 -0
  46. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v05.md +0 -0
  47. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan-v080.md +0 -0
  48. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/implementation-plan.md +0 -0
  49. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/mlx-lm-guide.md +0 -0
  50. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/data/dependency.json +0 -0
  51. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/data/stats.json +0 -0
  52. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/plan.md +0 -0
  53. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/refactor-assessment/report.md +0 -0
  54. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/chonkie-extraction.md +0 -0
  55. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/chonkie.md +0 -0
  56. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/cli-backend-patterns.md +0 -0
  57. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/cli-local-research.md +0 -0
  58. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/markitdown.md +0 -0
  59. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/rich-tui-patterns.md +0 -0
  60. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/research/v100-apply-mode-research.md +0 -0
  61. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/v090-implementation-plan.md +0 -0
  62. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/v100-implementation-plan.md +0 -0
  63. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/docs/workflow-state.md +0 -0
  64. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/__main__.py +0 -0
  65. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/apply.py +0 -0
  66. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/cli.py +0 -0
  67. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/context.py +0 -0
  68. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/dependencies.py +0 -0
  69. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/errors.py +0 -0
  70. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/metrics.py +0 -0
  71. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/optimizer.py +0 -0
  72. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/output.py +0 -0
  73. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/parser_generator.py +0 -0
  74. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/parsers.py +0 -0
  75. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/report.py +0 -0
  76. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/response.py +0 -0
  77. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/schema.py +0 -0
  78. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/tui.py +0 -0
  79. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/types.py +0 -0
  80. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/vendor/__init__.py +0 -0
  81. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/recursive_cleaner/vendor/chunker.py +0 -0
  82. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/ecommerce_instructions.txt +0 -0
  83. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/ecommerce_products.jsonl +0 -0
  84. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/financial_instructions.txt +0 -0
  85. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/financial_transactions.jsonl +0 -0
  86. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/healthcare_instructions.txt +0 -0
  87. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/healthcare_patients.jsonl +0 -0
  88. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/run_ecommerce_test.py +0 -0
  89. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/run_financial_test.py +0 -0
  90. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/test_cases/run_healthcare_test.py +0 -0
  91. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/__init__.py +0 -0
  92. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_apply.py +0 -0
  93. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_callbacks.py +0 -0
  94. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_cleaner.py +0 -0
  95. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_cli.py +0 -0
  96. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_context.py +0 -0
  97. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_dependencies.py +0 -0
  98. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_dry_run.py +0 -0
  99. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_incremental.py +0 -0
  100. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_integration.py +0 -0
  101. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_latency.py +0 -0
  102. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_metrics.py +0 -0
  103. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_openai_backend.py +0 -0
  104. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_optimizer.py +0 -0
  105. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_output.py +0 -0
  106. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_parser_generator.py +0 -0
  107. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_parsers.py +0 -0
  108. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_report.py +0 -0
  109. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_sampling.py +0 -0
  110. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_schema.py +0 -0
  111. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_tui.py +0 -0
  112. {recursive_cleaner-1.0.0 → recursive_cleaner-1.0.1}/tests/test_vendor_chunker.py +0 -0
@@ -4,7 +4,8 @@
4
4
 
5
5
  | Version | Status | Date |
6
6
  |---------|--------|------|
7
- | v1.0.0 | **Implemented** | 2025-01-30 |
7
+ | v1.0.1 | **Implemented** | 2025-02-05 |
8
+ | v1.0.0 | Implemented | 2025-01-30 |
8
9
  | v0.9.0 | Implemented | 2025-01-19 |
9
10
  | v0.8.0 | Implemented | 2025-01-19 |
10
11
  | v0.7.0 | Implemented | 2025-01-17 |
@@ -16,9 +17,10 @@
16
17
  | v0.2.0 | Implemented | 2025-01-14 |
17
18
  | v0.1.0 | Implemented | 2025-01-14 |
18
19
 
19
- **Current State**: v1.0.0 complete. 548 tests passing.
20
+ **Current State**: v1.0.1 complete. 555 tests passing.
20
21
 
21
22
  ### Version History
23
+ - **v1.0.1**: Return type validation, prompt signature clarity, duplicate field detection
22
24
  - **v1.0.0**: Apply mode for applying cleaning functions to data, Excel support, TUI color enhancement
23
25
  - **v0.9.0**: CLI tool with MLX and OpenAI-compatible backends (LM Studio, Ollama)
24
26
  - **v0.8.0**: Terminal UI with Rich dashboard, mission control aesthetic, transmission log
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: recursive-cleaner
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
5
5
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
6
6
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -389,7 +389,7 @@ backends/
389
389
  pytest tests/ -v
390
390
  ```
391
391
 
392
- 548 tests covering all features. Test datasets in `test_cases/`:
392
+ 555 tests covering all features. Test datasets in `test_cases/`:
393
393
  - E-commerce product catalogs
394
394
  - Healthcare patient records
395
395
  - Financial transaction data
@@ -346,7 +346,7 @@ backends/
346
346
  pytest tests/ -v
347
347
  ```
348
348
 
349
- 548 tests covering all features. Test datasets in `test_cases/`:
349
+ 555 tests covering all features. Test datasets in `test_cases/`:
350
350
  - E-commerce product catalogs
351
351
  - Healthcare patient records
352
352
  - Financial transaction data
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "recursive-cleaner"
7
- version = "1.0.0"
7
+ version = "1.0.1"
8
8
  description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -22,7 +22,7 @@ from recursive_cleaner.prompt import build_prompt
22
22
  from recursive_cleaner.response import extract_python_block, parse_response
23
23
  from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
24
24
  from recursive_cleaner.tui import HAS_RICH, TUIRenderer
25
- from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
25
+ from recursive_cleaner.validation import check_code_safety, extract_modified_fields, extract_sample_data, validate_function
26
26
 
27
27
  __all__ = [
28
28
  "apply_cleaning",
@@ -43,6 +43,7 @@ __all__ = [
43
43
  "validate_function",
44
44
  "extract_sample_data",
45
45
  "check_code_safety",
46
+ "extract_modified_fields",
46
47
  "resolve_dependencies",
47
48
  "QualityMetrics",
48
49
  "measure_quality",
@@ -17,7 +17,7 @@ from .prompt import build_prompt
17
17
  from .response import parse_response
18
18
  from .schema import format_schema_for_prompt, infer_schema
19
19
  from .types import LLMBackend
20
- from .validation import check_code_safety, extract_sample_data, split_holdout, validate_function
20
+ from .validation import check_code_safety, extract_modified_fields, extract_sample_data, split_holdout, validate_function
21
21
 
22
22
  STATE_VERSION = "0.5.0"
23
23
 
@@ -110,6 +110,8 @@ class DataCleaner:
110
110
  "min_ms": float("inf"),
111
111
  "max_ms": 0.0,
112
112
  }
113
+ # Track fields already covered by generated functions (per chunk)
114
+ self._fields_covered: set[str] = set()
113
115
 
114
116
  def _emit(self, event_type: str, chunk_index: int = 0, **kwargs) -> None:
115
117
  """Emit a progress event to the callback, if set."""
@@ -533,6 +535,8 @@ class DataCleaner:
533
535
  """Process a single chunk, iterating until clean or max iterations."""
534
536
  self._emit("chunk_start", chunk_index=chunk_idx)
535
537
  error_feedback = ""
538
+ # Reset fields covered for new chunk
539
+ self._fields_covered = set()
536
540
 
537
541
  # Dry run mode: just detect issues, don't generate functions
538
542
  if self.dry_run:
@@ -594,6 +598,20 @@ class DataCleaner:
594
598
  print(f" Safety check failed: {safety_error}")
595
599
  continue
596
600
 
601
+ # Check for duplicate field coverage
602
+ new_fields = extract_modified_fields(result["code"])
603
+ overlap = new_fields & self._fields_covered
604
+ if overlap:
605
+ field_list = ", ".join(sorted(overlap))
606
+ error_feedback = f"You already generated a function for field(s): {field_list}. This issue is solved. Move on to the next unsolved issue."
607
+ self._emit(
608
+ "duplicate_field",
609
+ chunk_index=chunk_idx,
610
+ function_name=result["name"],
611
+ fields=list(overlap),
612
+ )
613
+ continue
614
+
597
615
  # Runtime validation if enabled
598
616
  if self.validate_runtime:
599
617
  # Use holdout data if available, else sample from generation chunk
@@ -628,6 +646,8 @@ class DataCleaner:
628
646
  "docstring": result["docstring"],
629
647
  "code": result["code"],
630
648
  })
649
+ # Track fields covered by this function
650
+ self._fields_covered.update(new_fields)
631
651
  # Track for saturation check
632
652
  self._recent_new_function_count += 1
633
653
 
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
52
52
  </docstring>
53
53
  <code>
54
54
  ```python
55
- def merged_function_name(record):
55
+ def merged_function_name(record: dict) -> dict:
56
+ # Modify fields, return record
56
57
  ...
57
58
  ```
58
59
  </code>
@@ -108,9 +109,10 @@ Tags: domain, action, detail
108
109
  </docstring>
109
110
  <code>
110
111
  ```python
111
- def function_name(data):
112
- # Complete implementation
113
- pass
112
+ def function_name(record: dict) -> dict:
113
+ # Modify field(s) in the record
114
+ record["field"] = cleaned_value
115
+ return record
114
116
  ```
115
117
  </code>
116
118
  </function_to_generate>
@@ -120,6 +122,8 @@ def function_name(data):
120
122
 
121
123
  RULES:
122
124
  - ONE function per response
125
+ - Function signature: takes a dict (one record), returns the modified dict
126
+ - Modify fields directly on the record, then return it
123
127
  - If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
124
128
  - Include imports inside the function or document needed imports in docstring
125
129
  - Function must be idempotent (safe to run multiple times)
@@ -160,7 +160,10 @@ def validate_function(
160
160
  # Structured mode: sample_data is list[dict]
161
161
  for i, record in enumerate(sample_data):
162
162
  try:
163
- func(record)
163
+ result = func(record)
164
+ # Verify function returns a dict (not string, int, etc.)
165
+ if not isinstance(result, dict):
166
+ return False, f"Function must return dict, got {type(result).__name__}"
164
167
  except Exception as e:
165
168
  return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
166
169
 
@@ -200,3 +203,39 @@ def extract_sample_data(
200
203
  except json.JSONDecodeError:
201
204
  continue
202
205
  return samples
206
+
207
+
208
def extract_modified_fields(code: str) -> set[str]:
    """
    Extract field names that a cleaning function writes on its record.

    A write is any of the following, where ``record`` stands for the record
    variable and the key is a string literal:

    - plain assignment:      ``record["field"] = ...``
    - augmented assignment:  ``record["field"] += ...``
    - annotated assignment:  ``record["field"]: T = ...`` (only with a value)
    - unpacking assignment:  ``record["a"], record["b"] = ...``

    The record variable is recognised as ``record`` or ``data`` (the
    conventional names), plus the first positional parameter of every
    top-level function in ``code``, because the record is passed as the
    first positional argument regardless of what the parameter is called.

    Writes through other routes (``record.update(...)``, ``del``, computed
    keys, aliases of the record) are not detected.

    Args:
        code: Python source code of the function

    Returns:
        Set of field names that are assigned to. Empty when ``code`` cannot
        be parsed or contains no such assignment.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError: some interpreter versions raise it (instead of
        # SyntaxError) for source containing null bytes.
        return set()

    # Conventional parameter names for the data/record argument.
    data_names = {"record", "data"}
    for stmt in tree.body:
        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            params = stmt.args.posonlyargs + stmt.args.args
            if params:
                data_names.add(params[0].arg)

    fields: set[str] = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, ast.AugAssign):
            targets = [node.target]
        elif isinstance(node, ast.AnnAssign) and node.value is not None:
            # A bare annotation (no value) does not write anything.
            targets = [node.target]
        else:
            continue
        for target in targets:
            fields.update(_iter_assigned_fields(target, data_names))

    return fields


def _iter_assigned_fields(target, data_names):
    """Yield string keys written via ``<data_name>["key"]`` inside one assignment target."""
    if isinstance(target, (ast.Tuple, ast.List)):
        # Unpacking target: inspect each element.
        for element in target.elts:
            yield from _iter_assigned_fields(element, data_names)
    elif isinstance(target, ast.Starred):
        yield from _iter_assigned_fields(target.value, data_names)
    elif (
        isinstance(target, ast.Subscript)
        and isinstance(target.value, ast.Name)
        and target.value.id in data_names
        and isinstance(target.slice, ast.Constant)
        and isinstance(target.slice.value, str)
    ):
        yield target.slice.value
@@ -246,7 +246,7 @@ def test_cleaner_holdout_validation_uses_holdout_data(tmp_path):
246
246
  # Records where only the holdout has a specific key
247
247
  test_file.write_text('{"name":"A"}\n{"name":"B"}\n{"name":"C"}\n{"name":"D"}\n{"special":"E"}\n')
248
248
 
249
- # Function that accesses "special" key - should fail on holdout
249
+ # Function that accesses "special" key - should pass on holdout (which has it)
250
250
  response_with_special_access = '''
251
251
  <cleaning_analysis>
252
252
  <issues_detected>
@@ -258,7 +258,8 @@ def test_cleaner_holdout_validation_uses_holdout_data(tmp_path):
258
258
  <code>
259
259
  ```python
260
260
  def get_special(data):
261
- return data["special"]
261
+ data["found_special"] = data["special"]
262
+ return data
262
263
  ```
263
264
  </code>
264
265
  </function_to_generate>
@@ -261,7 +261,7 @@ class TestTextPromptTemplate:
261
261
  )
262
262
 
263
263
  # Should have structured-specific content
264
- assert "def function_name(data):" in prompt
264
+ assert "def function_name(record: dict) -> dict:" in prompt
265
265
  # Should have schema section
266
266
  assert "=== DATA SCHEMA ===" in prompt
267
267
 
@@ -2,6 +2,7 @@
2
2
 
3
3
  import pytest
4
4
  from recursive_cleaner import DataCleaner, validate_function, extract_sample_data, check_code_safety
5
+ from recursive_cleaner.validation import extract_modified_fields
5
6
 
6
7
 
7
8
  class MockLLM:
@@ -22,7 +23,8 @@ def test_validate_function_accepts_working_code():
22
23
  """Function that works on sample data is accepted."""
23
24
  code = '''
24
25
  def process_data(data):
25
- return data.get("name", "unknown")
26
+ data["processed"] = True
27
+ return data
26
28
  '''
27
29
  sample_data = [{"name": "Alice"}, {"name": "Bob"}]
28
30
  valid, error = validate_function(code, sample_data, "process_data")
@@ -30,6 +32,18 @@ def process_data(data):
30
32
  assert error is None
31
33
 
32
34
 
35
+ def test_validate_function_rejects_wrong_return_type():
36
+ """Function that returns non-dict is rejected."""
37
+ code = '''
38
+ def bad_return(data):
39
+ return data.get("name", "unknown")
40
+ '''
41
+ sample_data = [{"name": "Alice"}]
42
+ valid, error = validate_function(code, sample_data, "bad_return")
43
+ assert valid is False
44
+ assert "must return dict" in error
45
+
46
+
33
47
  def test_validate_function_rejects_key_error():
34
48
  """Function with KeyError on nonexistent key is rejected."""
35
49
  code = '''
@@ -157,7 +171,8 @@ RESPONSE_WITH_GOOD_FUNCTION = '''
157
171
  <code>
158
172
  ```python
159
173
  def good_processor(data):
160
- return data.get("name", "unknown")
174
+ data["processed"] = True
175
+ return data
161
176
  ```
162
177
  </code>
163
178
  </function_to_generate>
@@ -516,7 +531,8 @@ RESPONSE_SAFE_FUNCTION = '''
516
531
  <code>
517
532
  ```python
518
533
  def safe_processor(data):
519
- return data.get("value", 0) * 2
534
+ data["doubled"] = data.get("value", 0) * 2
535
+ return data
520
536
  ```
521
537
  </code>
522
538
  </function_to_generate>
@@ -557,3 +573,168 @@ def test_cleaner_rejects_dangerous_code_and_retries(tmp_path):
557
573
  safety_events = [e for e in events if e["type"] == "safety_failed"]
558
574
  assert len(safety_events) == 1
559
575
  assert "os" in safety_events[0]["error"]
576
+
577
+
578
+ # Tests for extract_modified_fields
579
+
580
+
581
+ def test_extract_modified_fields_simple():
582
+ """Extract single field from record assignment."""
583
+ code = '''
584
+ def fix_phone(record: dict) -> dict:
585
+ record["phone"] = normalize(record["phone"])
586
+ return record
587
+ '''
588
+ fields = extract_modified_fields(code)
589
+ assert fields == {"phone"}
590
+
591
+
592
+ def test_extract_modified_fields_multiple():
593
+ """Extract multiple fields from function."""
594
+ code = '''
595
+ def clean_record(record: dict) -> dict:
596
+ record["phone"] = normalize(record["phone"])
597
+ record["email"] = record["email"].lower()
598
+ return record
599
+ '''
600
+ fields = extract_modified_fields(code)
601
+ assert fields == {"phone", "email"}
602
+
603
+
604
+ def test_extract_modified_fields_data_param():
605
+ """Works with 'data' parameter name too."""
606
+ code = '''
607
+ def fix_status(data: dict) -> dict:
608
+ data["status"] = data["status"].lower()
609
+ return data
610
+ '''
611
+ fields = extract_modified_fields(code)
612
+ assert fields == {"status"}
613
+
614
+
615
+ def test_extract_modified_fields_no_assignments():
616
+ """Returns empty set when no field assignments."""
617
+ code = '''
618
+ def passthrough(record: dict) -> dict:
619
+ return record
620
+ '''
621
+ fields = extract_modified_fields(code)
622
+ assert fields == set()
623
+
624
+
625
+ def test_extract_modified_fields_nested_not_extracted():
626
+ """Only extracts direct record assignments, not nested."""
627
+ code = '''
628
+ def process(record: dict) -> dict:
629
+ temp = {}
630
+ temp["key"] = "value"
631
+ record["result"] = temp
632
+ return record
633
+ '''
634
+ fields = extract_modified_fields(code)
635
+ assert fields == {"result"}
636
+
637
+
638
+ # Integration test: duplicate field detection in DataCleaner
639
+
640
+
641
+ def test_cleaner_rejects_duplicate_field_with_feedback(tmp_path):
642
+ """Cleaner rejects functions that modify already-covered fields."""
643
+ test_file = tmp_path / "test.jsonl"
644
+ test_file.write_text('{"phone": "555-1234", "status": "active"}\n')
645
+
646
+ # First response: phone function (should be accepted)
647
+ response_phone = '''
648
+ <cleaning_analysis>
649
+ <issues_detected>
650
+ <issue id="1" solved="false">Phone needs normalization</issue>
651
+ </issues_detected>
652
+ <function_to_generate>
653
+ <name>normalize_phone</name>
654
+ <docstring>Normalize phone numbers.</docstring>
655
+ <code>
656
+ ```python
657
+ def normalize_phone(record: dict) -> dict:
658
+ record["phone"] = record["phone"].replace("-", "")
659
+ return record
660
+ ```
661
+ </code>
662
+ </function_to_generate>
663
+ <chunk_status>needs_more_work</chunk_status>
664
+ </cleaning_analysis>
665
+ '''
666
+
667
+ # Second response: another phone function (should be rejected as duplicate)
668
+ response_phone_again = '''
669
+ <cleaning_analysis>
670
+ <issues_detected>
671
+ <issue id="1" solved="false">Phone still needs work</issue>
672
+ </issues_detected>
673
+ <function_to_generate>
674
+ <name>fix_phone_format</name>
675
+ <docstring>Fix phone format.</docstring>
676
+ <code>
677
+ ```python
678
+ def fix_phone_format(record: dict) -> dict:
679
+ record["phone"] = "+1" + record["phone"]
680
+ return record
681
+ ```
682
+ </code>
683
+ </function_to_generate>
684
+ <chunk_status>needs_more_work</chunk_status>
685
+ </cleaning_analysis>
686
+ '''
687
+
688
+ # Third response: status function (should be accepted - different field)
689
+ response_status = '''
690
+ <cleaning_analysis>
691
+ <issues_detected>
692
+ <issue id="1" solved="true">Phone handled</issue>
693
+ <issue id="2" solved="false">Status needs fixing</issue>
694
+ </issues_detected>
695
+ <function_to_generate>
696
+ <name>fix_status</name>
697
+ <docstring>Fix status field.</docstring>
698
+ <code>
699
+ ```python
700
+ def fix_status(record: dict) -> dict:
701
+ record["status"] = record["status"].lower()
702
+ return record
703
+ ```
704
+ </code>
705
+ </function_to_generate>
706
+ <chunk_status>needs_more_work</chunk_status>
707
+ </cleaning_analysis>
708
+ '''
709
+
710
+ response_clean = '''
711
+ <cleaning_analysis>
712
+ <issues_detected>
713
+ <issue id="1" solved="true">Phone handled</issue>
714
+ <issue id="2" solved="true">Status handled</issue>
715
+ </issues_detected>
716
+ <chunk_status>clean</chunk_status>
717
+ </cleaning_analysis>
718
+ '''
719
+
720
+ mock_llm = MockLLM([response_phone, response_phone_again, response_status, response_clean])
721
+
722
+ cleaner = DataCleaner(
723
+ llm_backend=mock_llm,
724
+ file_path=str(test_file),
725
+ chunk_size=10,
726
+ validate_runtime=True,
727
+ )
728
+ cleaner.run()
729
+
730
+ # Should have 2 functions: normalize_phone and fix_status
731
+ # fix_phone_format should have been rejected as duplicate
732
+ assert len(cleaner.functions) == 2
733
+ function_names = [f["name"] for f in cleaner.functions]
734
+ assert "normalize_phone" in function_names
735
+ assert "fix_status" in function_names
736
+ assert "fix_phone_format" not in function_names
737
+
738
+ # The retry prompt should mention "already generated" or "duplicate"
739
+ assert any("phone" in call.lower() and ("already" in call.lower() or "duplicate" in call.lower())
740
+ for call in mock_llm.calls[2:]) # Check prompts after first acceptance