recursive-cleaner 0.6.1-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
recursive_cleaner/__init__.py
@@ -16,9 +16,10 @@ from recursive_cleaner.optimizer import (
      group_by_salience,
  )
  from recursive_cleaner.output import write_cleaning_file
- from recursive_cleaner.parsers import chunk_file
+ from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_parquet, preprocess_with_markitdown
  from recursive_cleaner.prompt import build_prompt
  from recursive_cleaner.response import extract_python_block, parse_response
+ from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
  from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function

  __all__ = [
@@ -27,6 +28,9 @@ __all__ = [
      "MaxIterationsError",
      "OutputValidationError",
      "chunk_file",
+     "MARKITDOWN_EXTENSIONS",
+     "load_parquet",
+     "preprocess_with_markitdown",
      "parse_response",
      "extract_python_block",
      "build_context",
@@ -43,4 +47,6 @@ __all__ = [
      "extract_tags",
      "group_by_salience",
      "consolidate_with_agency",
+     "generate_parser",
+     "check_parser_safety",
  ]
recursive_cleaner/cleaner.py
@@ -12,7 +12,7 @@ from tenacity import retry, stop_after_attempt, wait_exponential
  from .context import build_context
  from .errors import OutputValidationError, ParseError
  from .metrics import QualityMetrics, compare_quality, load_structured_data, measure_quality
- from .parsers import chunk_file
+ from .parsers import MARKITDOWN_EXTENSIONS, chunk_file
  from .prompt import build_prompt
  from .response import parse_response
  from .schema import format_schema_for_prompt, infer_schema
@@ -61,6 +61,7 @@ class DataCleaner:
          saturation_check_interval: int = 20,
          report_path: str | None = "cleaning_report.md",
          dry_run: bool = False,
+         auto_parse: bool = False,
      ):
          self.backend = llm_backend
          self.file_path = file_path
@@ -84,7 +85,9 @@ class DataCleaner:
          self.saturation_check_interval = saturation_check_interval
          self.report_path = report_path
          self.dry_run = dry_run
+         self.auto_parse = auto_parse
          self.functions: list[dict] = []  # List of {name, docstring, code}
+         self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
          # Track recent function generation for saturation check
          self._recent_new_function_count = 0
          self._last_check_function_count = 0
@@ -319,27 +322,72 @@ class DataCleaner:
      def _detect_mode(self) -> Literal["structured", "text"]:
          """Detect mode from file extension."""
          suffix = Path(self.file_path).suffix.lower()
-         structured_extensions = {".jsonl", ".csv", ".json"}
+         # Markitdown formats are processed as text
+         if suffix in MARKITDOWN_EXTENSIONS:
+             return "text"
+         structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
          if suffix in structured_extensions:
              return "structured"
          return "text"

+     def _is_known_extension(self) -> bool:
+         """Check if file extension is natively supported."""
+         suffix = Path(self.file_path).suffix.lower()
+         known = {".jsonl", ".csv", ".json", ".parquet", ".txt"}
+         return suffix in known or suffix in MARKITDOWN_EXTENSIONS
+
+     def _load_with_auto_parser(self) -> list[str]:
+         """Load file using LLM-generated parser, return JSONL chunks."""
+         from .parser_generator import generate_parser
+
+         print(f"Unknown file format, generating parser...")
+         self._emit("parser_generation_start")
+
+         parser = generate_parser(self.backend, self.file_path)
+         self._generated_parser = parser
+
+         self._emit("parser_generation_complete")
+         print("Parser generated successfully.")
+
+         # Parse the file
+         records = parser(self.file_path)
+         if not records:
+             return []
+
+         # Convert to JSONL chunks
+         import json
+         chunks = []
+         for i in range(0, len(records), self.chunk_size):
+             chunk_records = records[i:i + self.chunk_size]
+             chunk_lines = [json.dumps(r) for r in chunk_records]
+             chunks.append("\n".join(chunk_lines))
+
+         return chunks
+
      def run(self) -> None:
          """Run the cleaning pipeline."""
-         # Resolve effective mode
-         if self.mode == "auto":
-             self._effective_mode = self._detect_mode()
+         # Check if we should use auto-parser for unknown formats
+         use_auto_parser = self.auto_parse and not self._is_known_extension()
+
+         if use_auto_parser:
+             # LLM generates parser, always structured mode
+             self._effective_mode = "structured"
+             chunks = self._load_with_auto_parser()
          else:
-             self._effective_mode = self.mode
+             # Resolve effective mode
+             if self.mode == "auto":
+                 self._effective_mode = self._detect_mode()
+             else:
+                 self._effective_mode = self.mode

-         chunks = chunk_file(
-             self.file_path,
-             self.chunk_size,
-             mode=self._effective_mode,
-             chunk_overlap=self.chunk_overlap,
-             sampling_strategy=self.sampling_strategy,
-             stratify_field=self.stratify_field,
-         )
+             chunks = chunk_file(
+                 self.file_path,
+                 self.chunk_size,
+                 mode=self._effective_mode,
+                 chunk_overlap=self.chunk_overlap,
+                 sampling_strategy=self.sampling_strategy,
+                 stratify_field=self.stratify_field,
+             )

          if not chunks:
              print("No data to process.")
recursive_cleaner/parser_generator.py (new file)
@@ -0,0 +1,123 @@
+ """LLM-generated parser for unknown file formats."""
+
+ import ast
+ import re
+ from pathlib import Path
+
+ from .types import LLMBackend
+
+ # Dangerous patterns for parser code (allows 'open' since parsers need file I/O)
+ _DANGEROUS_IMPORTS = frozenset({
+     "os", "subprocess", "sys", "shutil", "socket", "urllib",
+     "requests", "httplib", "ftplib", "smtplib", "pickle",
+ })
+ _DANGEROUS_CALLS = frozenset({"eval", "exec", "compile", "__import__"})
+
+ PARSER_PROMPT = '''You are a data parsing expert. Generate a Python function to parse this file format.
+
+ === SAMPLE (first 4KB) ===
+ {sample}
+
+ === TASK ===
+ Generate a function with this EXACT signature:
+
+ ```python
+ def parse_file(file_path: str) -> list[dict]:
+     """Parse the file into a list of records."""
+     # Your implementation
+ ```
+
+ RULES:
+ - Return list of dicts, one dict per logical record
+ - Use only stdlib (xml.etree, json, re, csv)
+ - Handle the ENTIRE file, not just this sample
+ - Be defensive about malformed data
+ - Include necessary imports inside or before the function
+ '''
+
+
+ def check_parser_safety(code: str) -> list[str]:
+     """Check parser code for dangerous patterns. Returns list of issues."""
+     issues = []
+     try:
+         tree = ast.parse(code)
+     except SyntaxError as e:
+         return [f"Syntax error: {e}"]
+
+     for node in ast.walk(tree):
+         if isinstance(node, ast.Import):
+             for alias in node.names:
+                 module = alias.name.split(".")[0]
+                 if module in _DANGEROUS_IMPORTS:
+                     issues.append(f"Dangerous import: {alias.name}")
+         if isinstance(node, ast.ImportFrom):
+             if node.module:
+                 module = node.module.split(".")[0]
+                 if module in _DANGEROUS_IMPORTS:
+                     issues.append(f"Dangerous import: from {node.module}")
+         if isinstance(node, ast.Call):
+             if isinstance(node.func, ast.Name):
+                 if node.func.id in _DANGEROUS_CALLS:
+                     issues.append(f"Dangerous call: {node.func.id}()")
+     return issues
+
+
+ def extract_python_block(text: str) -> str:
+     """Extract code from ```python ... ``` block."""
+     match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
+     return match.group(1).strip() if match else text.strip()
+
+
+ def generate_parser(llm_backend: LLMBackend, file_path: str) -> callable:
+     """
+     Generate a parser function for an unknown file format.
+
+     Args:
+         llm_backend: LLM backend implementing generate(prompt) -> str
+         file_path: Path to the file to parse
+
+     Returns:
+         A callable parse_file(file_path) -> list[dict]
+
+     Raises:
+         ValueError: If generated code is unsafe, has invalid syntax,
+             or doesn't return list of dicts
+     """
+     path = Path(file_path)
+     with open(path, "r", errors="replace") as f:
+         sample = f.read(4096)
+
+     prompt = PARSER_PROMPT.format(sample=sample)
+     response = llm_backend.generate(prompt)
+     code = extract_python_block(response)
+
+     # Validate syntax
+     try:
+         ast.parse(code)
+     except SyntaxError as e:
+         raise ValueError(f"Generated parser has invalid syntax: {e}")
+
+     # Security check
+     issues = check_parser_safety(code)
+     if issues:
+         raise ValueError(f"Generated parser contains dangerous code: {issues}")
+
+     # Execute to get function
+     namespace: dict = {}
+     exec(code, namespace)
+
+     if "parse_file" not in namespace:
+         raise ValueError("Generated code must define 'parse_file' function")
+
+     parser = namespace["parse_file"]
+
+     # Validate on actual file
+     result = parser(file_path)
+     if not isinstance(result, list):
+         raise ValueError(f"Parser must return list, got {type(result).__name__}")
+     if result and not isinstance(result[0], dict):
+         raise ValueError(
+             f"Parser must return list of dicts, got list of {type(result[0]).__name__}"
+         )
+
+     return parser
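A minimal end-to-end sketch of the new module. The stub backend and the file sample.unknown are hypothetical; its generate() returns a canned parser, whereas a real LLMBackend would derive one from the 4KB sample embedded in PARSER_PROMPT.

    from pathlib import Path
    from recursive_cleaner.parser_generator import check_parser_safety, generate_parser

    Path("sample.unknown").write_text("alpha\nbeta\n")  # hypothetical input file

    class StubBackend:
        """Hypothetical LLMBackend: always returns the same fenced parser."""
        def generate(self, prompt: str) -> str:
            return (
                "```python\n"
                "def parse_file(file_path: str) -> list[dict]:\n"
                "    with open(file_path) as f:\n"
                "        return [{\"line\": ln.strip()} for ln in f if ln.strip()]\n"
                "```"
            )

    parser = generate_parser(StubBackend(), "sample.unknown")
    records = parser("sample.unknown")  # [{"line": "alpha"}, {"line": "beta"}]

    # check_parser_safety() can also be used standalone on arbitrary code:
    issues = check_parser_safety("import os\nos.remove('x')")
    # -> ["Dangerous import: os"]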
recursive_cleaner/parsers.py
@@ -17,6 +17,62 @@ try:
  except ImportError:
      _HAS_SENTENCE_CHUNKER = False

+ # File extensions supported by markitdown for conversion to text
+ MARKITDOWN_EXTENSIONS = {
+     ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
+     ".html", ".htm", ".epub", ".msg", ".rtf", ".odt", ".ods", ".odp"
+ }
+
+
+ def load_parquet(file_path: str) -> list[dict]:
+     """Load parquet file as list of dicts.
+
+     Args:
+         file_path: Path to the parquet file
+
+     Returns:
+         List of dictionaries, one per row
+
+     Raises:
+         ImportError: If pyarrow is not installed
+     """
+     try:
+         import pyarrow.parquet as pq
+     except ImportError:
+         raise ImportError(
+             "pyarrow is required for parquet files. "
+             "Install with: pip install recursive-cleaner[parquet]"
+         )
+
+     table = pq.read_table(file_path)
+     return table.to_pylist()
+
+
+ def preprocess_with_markitdown(file_path: str) -> str:
+     """
+     Convert supported formats to text using markitdown.
+
+     Args:
+         file_path: Path to the file to convert
+
+     Returns:
+         Extracted text content from the file
+
+     Raises:
+         ImportError: If markitdown is not installed
+     """
+     try:
+         from markitdown import MarkItDown
+     except ImportError:
+         raise ImportError(
+             "markitdown is required for this file type. "
+             "Install with: pip install recursive-cleaner[markitdown]"
+         )
+
+     md = MarkItDown()
+     result = md.convert(file_path)
+     return result.text_content
+

  def chunk_file(
      file_path: str,
@@ -50,6 +106,25 @@ def chunk_file(
      if not path.exists():
          raise FileNotFoundError(f"File not found: {file_path}")

+     # Handle markitdown formats: preprocess to text, then chunk as text
+     if suffix in MARKITDOWN_EXTENSIONS:
+         content = preprocess_with_markitdown(file_path)
+         if not content.strip():
+             return []
+         # Markitdown output is always processed as text
+         if sampling_strategy != "sequential":
+             raise ValueError(
+                 f"Text mode only supports 'sequential' sampling, got '{sampling_strategy}'"
+             )
+         return chunk_text_sentences(content, chunk_size, chunk_overlap)
+
+     # Handle parquet files: load as list of dicts, chunk like JSONL
+     if suffix == ".parquet":
+         records = load_parquet(file_path)
+         if not records:
+             return []
+         return _chunk_records(records, chunk_size, sampling_strategy, stratify_field)
+
      content = path.read_text(encoding="utf-8")

      if not content.strip():
@@ -79,7 +154,7 @@ def chunk_file(

  def _detect_mode(suffix: str) -> Literal["structured", "text"]:
      """Detect mode from file extension."""
-     structured_extensions = {".jsonl", ".csv", ".json"}
+     structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
      if suffix in structured_extensions:
          return "structured"
      return "text"
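The new chunk_file branches can be exercised directly; a sketch assuming the parquet and markitdown extras are installed and that the example paths exist (both paths are hypothetical, other arguments keep their defaults):

    from recursive_cleaner.parsers import chunk_file

    # .parquet now resolves to structured mode: rows are loaded via pyarrow and
    # re-emitted as JSONL-formatted chunks of up to 50 records each.
    parquet_chunks = chunk_file("data/events.parquet", 50)

    # Markitdown formats are converted to text first, then sentence-chunked;
    # only sequential sampling is accepted for them.
    pdf_chunks = chunk_file("docs/manual.pdf", 2000, chunk_overlap=200)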
@@ -281,6 +356,61 @@ def _chunk_jsonl(
      return chunks


+ def _chunk_records(
+     records: list[dict],
+     item_count: int,
+     sampling_strategy: Literal["sequential", "random", "stratified"] = "sequential",
+     stratify_field: str | None = None,
+ ) -> list[str]:
+     """Chunk a list of dicts by item count with optional sampling."""
+     if not records:
+         return []
+
+     # For seed computation, use JSON representation
+     seed = _compute_seed(json.dumps(records[0]))
+
+     # Apply sampling strategy
+     if sampling_strategy == "random":
+         records = _shuffle_records(records, seed)
+     elif sampling_strategy == "stratified" and stratify_field:
+         records = _stratified_sample_dicts(records, stratify_field, seed)
+
+     chunks = []
+     for i in range(0, len(records), item_count):
+         chunk_records = records[i:i + item_count]
+         # Convert to JSONL format for LLM context
+         chunk_lines = [json.dumps(r) for r in chunk_records]
+         chunks.append("\n".join(chunk_lines))
+
+     return chunks
+
+
+ def _stratified_sample_dicts(records: list[dict], field: str, seed: int) -> list[dict]:
+     """Group dicts by field, interleave proportionally."""
+     groups: dict[str, list] = {}
+     for record in records:
+         key = str(record.get(field, "_missing_"))
+         if key not in groups:
+             groups[key] = []
+         groups[key].append(record)
+
+     # Shuffle within each group
+     rng = random.Random(seed)
+     for key in groups:
+         rng.shuffle(groups[key])
+
+     # Interleave from groups (round-robin)
+     result = []
+     group_lists = list(groups.values())
+     while any(group_lists):
+         for g in group_lists:
+             if g:
+                 result.append(g.pop(0))
+         group_lists = [g for g in group_lists if g]
+
+     return result
+
+
  def _compute_seed(content: str) -> int:
      """Compute deterministic seed from content hash."""
      return int(hashlib.md5(content.encode("utf-8")).hexdigest()[:8], 16)
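To make the round-robin interleave concrete, a small illustration; these helpers are private (underscore-prefixed), so this is for understanding only, not a public API, and the sample records are made up:

    from recursive_cleaner.parsers import _chunk_records

    records = (
        [{"label": "a", "i": i} for i in range(4)]
        + [{"label": "b", "i": i} for i in range(2)]
    )

    # Stratified sampling groups by "label", shuffles within each group using a
    # seed derived from the first record, then interleaves a/b/a/b/a/a before
    # packing JSONL chunks of up to 3 records each (two chunks here).
    chunks = _chunk_records(records, 3, sampling_strategy="stratified", stratify_field="label")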
recursive_cleaner-0.7.0.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: recursive-cleaner
- Version: 0.6.1
+ Version: 0.7.0
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -26,8 +26,12 @@ Requires-Dist: tenacity>=8.0
  Provides-Extra: dev
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
  Requires-Dist: pytest>=7.0; extra == 'dev'
+ Provides-Extra: markitdown
+ Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
  Provides-Extra: mlx
  Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
+ Provides-Extra: parquet
+ Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
  Description-Content-Type: text/markdown

  # Recursive Data Cleaner
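The two new extras correspond to the optional imports introduced in parsers.py: pip install recursive-cleaner[parquet] provides pyarrow (>=14.0.0) for .parquet loading, and pip install recursive-cleaner[markitdown] provides markitdown (>=0.1.0) for the document formats listed in MARKITDOWN_EXTENSIONS.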
recursive_cleaner-0.7.0.dist-info/RECORD
@@ -1,14 +1,15 @@
  backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
  backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
- recursive_cleaner/__init__.py,sha256=Wm_sQdh1mkJ-DByo4lMiAIiLWFRfj8G2Limr0eSG0s0,1416
- recursive_cleaner/cleaner.py,sha256=vSrogf8T1AquLakmqbpgvuFoTD6_AZp_hrG3vJxx9gk,24340
+ recursive_cleaner/__init__.py,sha256=bG83PcmkxAYMC17FmKuyMJUrMnuukp32JO3rlCLyB-Q,1698
+ recursive_cleaner/cleaner.py,sha256=J2X5bnk2OsWJyOn4BNR-cj0sqeKCylznfs_WEyMGxG8,26280
  recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
  recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
  recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
  recursive_cleaner/metrics.py,sha256=C6RlvZMTtVm7kdRhfMZu4xd-R-i1EQGMT5FCasPOO3A,5003
  recursive_cleaner/optimizer.py,sha256=lnQC9Y1ClkW4po1eYa2bnYYu4smiDuUpMPPX6EN1UQ0,9700
  recursive_cleaner/output.py,sha256=quTlZYtKZm9h37mbnwQmEjg0q8VQSZWEqwaHfhSAd3s,6106
- recursive_cleaner/parsers.py,sha256=39oMg0WGRFV_eRBzOfB7LIGXMP1cIDwdeYk4UOlw140,10595
+ recursive_cleaner/parser_generator.py,sha256=enn6_okGWB2ddVkwI7ytndT04S4QEVAk6cbmb7shxcM,3905
+ recursive_cleaner/parsers.py,sha256=HCS2UiVFhboq_go4DyWUygkJTkpfYkFj9_hqWiGIEXo,14572
  recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6484
  recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
  recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
@@ -17,7 +18,7 @@ recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,29
  recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
  recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
  recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
- recursive_cleaner-0.6.1.dist-info/METADATA,sha256=q7MqUiWUUZ2zlhujiR725ydPOKSIqCEG01ncXkI1NkA,9328
- recursive_cleaner-0.6.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- recursive_cleaner-0.6.1.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
- recursive_cleaner-0.6.1.dist-info/RECORD,,
+ recursive_cleaner-0.7.0.dist-info/METADATA,sha256=bSCS8YBPAYzBufVF41LDYAgpLnYc4JAynA4xkNVuKyo,9486
+ recursive_cleaner-0.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ recursive_cleaner-0.7.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
+ recursive_cleaner-0.7.0.dist-info/RECORD,,