recursive-cleaner 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,7 +22,7 @@ from recursive_cleaner.prompt import build_prompt
22
22
  from recursive_cleaner.response import extract_python_block, parse_response
23
23
  from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
24
24
  from recursive_cleaner.tui import HAS_RICH, TUIRenderer
25
- from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function
25
+ from recursive_cleaner.validation import check_code_safety, extract_modified_fields, extract_sample_data, validate_function
26
26
 
27
27
  __all__ = [
28
28
  "apply_cleaning",
@@ -43,6 +43,7 @@ __all__ = [
43
43
  "validate_function",
44
44
  "extract_sample_data",
45
45
  "check_code_safety",
46
+ "extract_modified_fields",
46
47
  "resolve_dependencies",
47
48
  "QualityMetrics",
48
49
  "measure_quality",
@@ -17,7 +17,7 @@ from .prompt import build_prompt
17
17
  from .response import parse_response
18
18
  from .schema import format_schema_for_prompt, infer_schema
19
19
  from .types import LLMBackend
20
- from .validation import check_code_safety, extract_sample_data, split_holdout, validate_function
20
+ from .validation import check_code_safety, extract_modified_fields, extract_sample_data, split_holdout, validate_function
21
21
 
22
22
  STATE_VERSION = "0.5.0"
23
23
 
@@ -110,6 +110,8 @@ class DataCleaner:
110
110
  "min_ms": float("inf"),
111
111
  "max_ms": 0.0,
112
112
  }
113
+ # Track fields already covered by generated functions (per chunk)
114
+ self._fields_covered: set[str] = set()
113
115
 
114
116
  def _emit(self, event_type: str, chunk_index: int = 0, **kwargs) -> None:
115
117
  """Emit a progress event to the callback, if set."""
@@ -533,6 +535,8 @@ class DataCleaner:
533
535
  """Process a single chunk, iterating until clean or max iterations."""
534
536
  self._emit("chunk_start", chunk_index=chunk_idx)
535
537
  error_feedback = ""
538
+ # Reset fields covered for new chunk
539
+ self._fields_covered = set()
536
540
 
537
541
  # Dry run mode: just detect issues, don't generate functions
538
542
  if self.dry_run:
@@ -594,6 +598,20 @@ class DataCleaner:
594
598
  print(f" Safety check failed: {safety_error}")
595
599
  continue
596
600
 
601
+ # Check for duplicate field coverage
602
+ new_fields = extract_modified_fields(result["code"])
603
+ overlap = new_fields & self._fields_covered
604
+ if overlap:
605
+ field_list = ", ".join(sorted(overlap))
606
+ error_feedback = f"You already generated a function for field(s): {field_list}. This issue is solved. Move on to the next unsolved issue."
607
+ self._emit(
608
+ "duplicate_field",
609
+ chunk_index=chunk_idx,
610
+ function_name=result["name"],
611
+ fields=list(overlap),
612
+ )
613
+ continue
614
+
597
615
  # Runtime validation if enabled
598
616
  if self.validate_runtime:
599
617
  # Use holdout data if available, else sample from generation chunk
@@ -628,6 +646,8 @@ class DataCleaner:
628
646
  "docstring": result["docstring"],
629
647
  "code": result["code"],
630
648
  })
649
+ # Track fields covered by this function
650
+ self._fields_covered.update(new_fields)
631
651
  # Track for saturation check
632
652
  self._recent_new_function_count += 1
633
653
 
@@ -52,7 +52,8 @@ CONSOLIDATION_TEMPLATE = '''You are reviewing cleaning functions for consolidati
52
52
  </docstring>
53
53
  <code>
54
54
  ```python
55
- def merged_function_name(record):
55
+ def merged_function_name(record: dict) -> dict:
56
+ # Modify fields, return record
56
57
  ...
57
58
  ```
58
59
  </code>
@@ -108,9 +109,10 @@ Tags: domain, action, detail
108
109
  </docstring>
109
110
  <code>
110
111
  ```python
111
- def function_name(data):
112
- # Complete implementation
113
- pass
112
+ def function_name(record: dict) -> dict:
113
+ # Modify field(s) in the record
114
+ record["field"] = cleaned_value
115
+ return record
114
116
  ```
115
117
  </code>
116
118
  </function_to_generate>
@@ -120,6 +122,8 @@ def function_name(data):
120
122
 
121
123
  RULES:
122
124
  - ONE function per response
125
+ - Function signature: takes a dict (one record), returns the modified dict
126
+ - Modify fields directly on the record, then return it
123
127
  - If all issues solved: <chunk_status>clean</chunk_status>, omit <function_to_generate>
124
128
  - Include imports inside the function or document needed imports in docstring
125
129
  - Function must be idempotent (safe to run multiple times)
@@ -160,7 +160,10 @@ def validate_function(
160
160
  # Structured mode: sample_data is list[dict]
161
161
  for i, record in enumerate(sample_data):
162
162
  try:
163
- func(record)
163
+ result = func(record)
164
+ # Verify function returns a dict (not string, int, etc.)
165
+ if not isinstance(result, dict):
166
+ return False, f"Function must return dict, got {type(result).__name__}"
164
167
  except Exception as e:
165
168
  return False, f"Runtime error on sample {i}: {type(e).__name__}: {e}"
166
169
 
@@ -200,3 +203,39 @@ def extract_sample_data(
200
203
  except json.JSONDecodeError:
201
204
  continue
202
205
  return samples
206
+
207
+
208
def extract_modified_fields(code: str) -> set[str]:
    """
    Extract field names that are written via ``record["field"]`` subscripts.

    A field counts as modified when a subscript of the bare name ``record``
    or ``data`` with a string-literal key appears as the target of:

    - a plain assignment (``record["a"] = ...``), including chained
      assignments and tuple/list unpacking targets
      (``record["a"], record["b"] = ...``);
    - an augmented assignment (``record["a"] += ...``);
    - an annotated assignment that carries a value
      (``record["a"]: int = ...``).

    Non-literal keys (``record[key]``), non-string keys (``record[0]``) and
    subscripts of any other name are ignored.

    Args:
        code: Python source code of the function

    Returns:
        Set of field names that are assigned to. Empty if the code cannot
        be parsed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError covers sources ast.parse rejects before tokenizing
        # (e.g. embedded null bytes on older interpreters).
        return set()

    # Common parameter names for the data/record argument
    data_names = {"record", "data"}
    fields: set[str] = set()

    def collect(target: ast.expr) -> None:
        """Record the field named by an assignment target, descending into unpacking."""
        if isinstance(target, (ast.Tuple, ast.List)):
            for element in target.elts:
                collect(element)
        elif isinstance(target, ast.Starred):
            collect(target.value)
        elif (
            isinstance(target, ast.Subscript)
            and isinstance(target.value, ast.Name)
            and target.value.id in data_names
            and isinstance(target.slice, ast.Constant)
            and isinstance(target.slice.value, str)
        ):
            fields.add(target.slice.value)

    for node in ast.walk(tree):
        if isinstance(node, ast.Assign):
            for target in node.targets:
                collect(target)
        elif isinstance(node, ast.AugAssign):
            collect(node.target)
        elif isinstance(node, ast.AnnAssign) and node.value is not None:
            # A bare annotation without a value performs no assignment.
            collect(node.target)

    return fields
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: recursive-cleaner
3
- Version: 1.0.0
3
+ Version: 1.0.1
4
4
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
5
5
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
6
6
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -389,7 +389,7 @@ backends/
389
389
  pytest tests/ -v
390
390
  ```
391
391
 
392
- 548 tests covering all features. Test datasets in `test_cases/`:
392
+ 555 tests covering all features. Test datasets in `test_cases/`:
393
393
  - E-commerce product catalogs
394
394
  - Healthcare patient records
395
395
  - Financial transaction data
@@ -1,10 +1,10 @@
1
1
  backends/__init__.py,sha256=vWcPASV0GGEAydzOSjdrknkSHoGbSs4edtuv9HIzBhI,180
2
2
  backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
3
3
  backends/openai_backend.py,sha256=vKWsXKltBv_tJDoQfQ_7KVMZDfomhFFN2vl1oZ1KGbQ,2057
4
- recursive_cleaner/__init__.py,sha256=xCFlkqmmBoa7ntUZQnRQxVMv9iLeOvmboDS_j2EHfZI,1862
4
+ recursive_cleaner/__init__.py,sha256=-NesTf9deCVOxkadFuyfVl-IjfbEHlYcMNAaAW9kUuw,1918
5
5
  recursive_cleaner/__main__.py,sha256=WXmMaL_myHPsG_qXAhZDufD43Ydsd25RV2IPeW2Kg08,152
6
6
  recursive_cleaner/apply.py,sha256=hjeljhZNiOuwz9m09RYVLl_z_9tet7LwubH6cb_Wy6Y,13855
7
- recursive_cleaner/cleaner.py,sha256=kPOQ44hgiJzABiqdmjg2hqd7Ot9uxKUSOe8_jz0UBQc,29911
7
+ recursive_cleaner/cleaner.py,sha256=lLe7LNaVYwukDhBTxLs8ezsQf7fes9m9OX7g9nGo760,30954
8
8
  recursive_cleaner/cli.py,sha256=Sk_qYKxSn1PiPmMLKkyj9VxsseHaSXmSlGazxfmkTFc,12807
9
9
  recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
10
10
  recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
@@ -14,17 +14,17 @@ recursive_cleaner/optimizer.py,sha256=lnQC9Y1ClkW4po1eYa2bnYYu4smiDuUpMPPX6EN1UQ
14
14
  recursive_cleaner/output.py,sha256=quTlZYtKZm9h37mbnwQmEjg0q8VQSZWEqwaHfhSAd3s,6106
15
15
  recursive_cleaner/parser_generator.py,sha256=enn6_okGWB2ddVkwI7ytndT04S4QEVAk6cbmb7shxcM,3905
16
16
  recursive_cleaner/parsers.py,sha256=HCS2UiVFhboq_go4DyWUygkJTkpfYkFj9_hqWiGIEXo,14572
17
- recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6484
17
+ recursive_cleaner/prompt.py,sha256=yqwUyB6Z51Oqhvxz3mNijZraXr-QEUYQ_ubyiryZSrU,6730
18
18
  recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
19
19
  recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
20
20
  recursive_cleaner/schema.py,sha256=w2hcEdApR15KVI9SFWB3VfumMoHFwn1YJrktdfgPo8M,3925
21
21
  recursive_cleaner/tui.py,sha256=zuiFPtMh3K-sC1CWZoaoUmgZ3rESkl10gYcqMzpVqiM,22598
22
22
  recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,290
23
- recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
23
+ recursive_cleaner/validation.py,sha256=IlXz5EhXaUb0mJlaH0ygFH1ePPWHVfgjL-5ZawyKicY,7910
24
24
  recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
25
25
  recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
26
- recursive_cleaner-1.0.0.dist-info/METADATA,sha256=L86ATNd8JxmPp32HKaO6PPwkmq4sIE3Mdvgx3pmUulE,14285
27
- recursive_cleaner-1.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
- recursive_cleaner-1.0.0.dist-info/entry_points.txt,sha256=S5nbi0rnifpShxdXGExeZnd65UZfp8K7DNyuKPST6nk,65
29
- recursive_cleaner-1.0.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
30
- recursive_cleaner-1.0.0.dist-info/RECORD,,
26
+ recursive_cleaner-1.0.1.dist-info/METADATA,sha256=qEmuiRPtRjuigM29FgjrkUUZm0YV91xNjuc7j16NhKU,14285
27
+ recursive_cleaner-1.0.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
28
+ recursive_cleaner-1.0.1.dist-info/entry_points.txt,sha256=S5nbi0rnifpShxdXGExeZnd65UZfp8K7DNyuKPST6nk,65
29
+ recursive_cleaner-1.0.1.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
30
+ recursive_cleaner-1.0.1.dist-info/RECORD,,