recursive-cleaner 0.6.1-py3-none-any.whl → 0.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recursive_cleaner/__init__.py +7 -1
- recursive_cleaner/cleaner.py +62 -14
- recursive_cleaner/parser_generator.py +123 -0
- recursive_cleaner/parsers.py +131 -1
- {recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/METADATA +5 -1
- {recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/RECORD +8 -7
- {recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/WHEEL +0 -0
- {recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/licenses/LICENSE +0 -0
recursive_cleaner/__init__.py
CHANGED
@@ -16,9 +16,10 @@ from recursive_cleaner.optimizer import (
     group_by_salience,
 )
 from recursive_cleaner.output import write_cleaning_file
-from recursive_cleaner.parsers import chunk_file
+from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_parquet, preprocess_with_markitdown
 from recursive_cleaner.prompt import build_prompt
 from recursive_cleaner.response import extract_python_block, parse_response
+from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
 from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function

 __all__ = [
@@ -27,6 +28,9 @@ __all__ = [
     "MaxIterationsError",
     "OutputValidationError",
     "chunk_file",
+    "MARKITDOWN_EXTENSIONS",
+    "load_parquet",
+    "preprocess_with_markitdown",
     "parse_response",
     "extract_python_block",
     "build_context",
@@ -43,4 +47,6 @@ __all__ = [
     "extract_tags",
     "group_by_salience",
     "consolidate_with_agency",
+    "generate_parser",
+    "check_parser_safety",
 ]
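The import rewrite above widens the package's public surface: the new loaders and the parser-generator helpers all join `__all__`. A minimal sketch of what 0.7.0 now exports at the top level (the file names are hypothetical, and the optional extras must be installed):

```python
# Sketch: using the new top-level exports added to __all__ in 0.7.0.
# "events.parquet" and "report.pdf" are hypothetical example files.
from recursive_cleaner import (
    MARKITDOWN_EXTENSIONS,
    load_parquet,
    preprocess_with_markitdown,
)

print(".pdf" in MARKITDOWN_EXTENSIONS)           # True: PDFs route through markitdown
rows = load_parquet("events.parquet")            # list[dict], one per row (needs the parquet extra)
text = preprocess_with_markitdown("report.pdf")  # extracted text (needs the markitdown extra)
```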
recursive_cleaner/cleaner.py
CHANGED
@@ -12,7 +12,7 @@ from tenacity import retry, stop_after_attempt, wait_exponential
 from .context import build_context
 from .errors import OutputValidationError, ParseError
 from .metrics import QualityMetrics, compare_quality, load_structured_data, measure_quality
-from .parsers import chunk_file
+from .parsers import MARKITDOWN_EXTENSIONS, chunk_file
 from .prompt import build_prompt
 from .response import parse_response
 from .schema import format_schema_for_prompt, infer_schema
@@ -61,6 +61,7 @@ class DataCleaner:
         saturation_check_interval: int = 20,
         report_path: str | None = "cleaning_report.md",
         dry_run: bool = False,
+        auto_parse: bool = False,
     ):
         self.backend = llm_backend
         self.file_path = file_path
@@ -84,7 +85,9 @@ class DataCleaner:
         self.saturation_check_interval = saturation_check_interval
         self.report_path = report_path
         self.dry_run = dry_run
+        self.auto_parse = auto_parse
         self.functions: list[dict] = []  # List of {name, docstring, code}
+        self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
         # Track recent function generation for saturation check
         self._recent_new_function_count = 0
         self._last_check_function_count = 0
@@ -319,27 +322,72 @@ class DataCleaner:
     def _detect_mode(self) -> Literal["structured", "text"]:
         """Detect mode from file extension."""
         suffix = Path(self.file_path).suffix.lower()
-        structured_extensions = {".jsonl", ".csv", ".json"}
+        # Markitdown formats are processed as text
+        if suffix in MARKITDOWN_EXTENSIONS:
+            return "text"
+        structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
         if suffix in structured_extensions:
             return "structured"
         return "text"

+    def _is_known_extension(self) -> bool:
+        """Check if file extension is natively supported."""
+        suffix = Path(self.file_path).suffix.lower()
+        known = {".jsonl", ".csv", ".json", ".parquet", ".txt"}
+        return suffix in known or suffix in MARKITDOWN_EXTENSIONS
+
+    def _load_with_auto_parser(self) -> list[str]:
+        """Load file using LLM-generated parser, return JSONL chunks."""
+        from .parser_generator import generate_parser
+
+        print(f"Unknown file format, generating parser...")
+        self._emit("parser_generation_start")
+
+        parser = generate_parser(self.backend, self.file_path)
+        self._generated_parser = parser
+
+        self._emit("parser_generation_complete")
+        print("Parser generated successfully.")
+
+        # Parse the file
+        records = parser(self.file_path)
+        if not records:
+            return []
+
+        # Convert to JSONL chunks
+        import json
+        chunks = []
+        for i in range(0, len(records), self.chunk_size):
+            chunk_records = records[i:i + self.chunk_size]
+            chunk_lines = [json.dumps(r) for r in chunk_records]
+            chunks.append("\n".join(chunk_lines))
+
+        return chunks
+
     def run(self) -> None:
         """Run the cleaning pipeline."""
-        # Resolve effective mode
-        if self.mode == "auto":
-            self._effective_mode = self._detect_mode()
+        # Check if we should use auto-parser for unknown formats
+        use_auto_parser = self.auto_parse and not self._is_known_extension()
+
+        if use_auto_parser:
+            # LLM generates parser, always structured mode
+            self._effective_mode = "structured"
+            chunks = self._load_with_auto_parser()
         else:
-            self._effective_mode = self.mode
+            # Resolve effective mode
+            if self.mode == "auto":
+                self._effective_mode = self._detect_mode()
+            else:
+                self._effective_mode = self.mode

-        chunks = chunk_file(
-            self.file_path,
-            self.chunk_size,
-            mode=self._effective_mode,
-            chunk_overlap=self.chunk_overlap,
-            sampling_strategy=self.sampling_strategy,
-            stratify_field=self.stratify_field,
-        )
+            chunks = chunk_file(
+                self.file_path,
+                self.chunk_size,
+                mode=self._effective_mode,
+                chunk_overlap=self.chunk_overlap,
+                sampling_strategy=self.sampling_strategy,
+                stratify_field=self.stratify_field,
+            )

         if not chunks:
             print("No data to process.")
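The new `auto_parse` flag is opt-in: `run()` only takes the generated-parser path when the flag is set and the extension is neither natively supported nor a markitdown format, and it then forces structured mode and re-chunks the parsed records as JSONL. A hedged sketch of driving it, where `EchoBackend` is a hypothetical stand-in for any object with a `generate(prompt) -> str` method:

```python
# Sketch: opting into the 0.7.0 auto-parser path for an unrecognized extension.
from recursive_cleaner.cleaner import DataCleaner

class EchoBackend:
    """Hypothetical placeholder; a real backend calls an actual LLM."""
    def generate(self, prompt: str) -> str:
        raise NotImplementedError("wire a real LLM backend in here")

cleaner = DataCleaner(
    llm_backend=EchoBackend(),
    file_path="server_events.log",  # hypothetical .log file: not a known extension
    auto_parse=True,                # new in 0.7.0; the default False keeps 0.6.1 behavior
)
# cleaner.run() would call generate_parser() once, then clean the JSONL chunks.
```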
recursive_cleaner/parser_generator.py
ADDED
@@ -0,0 +1,123 @@
+"""LLM-generated parser for unknown file formats."""
+
+import ast
+import re
+from pathlib import Path
+
+from .types import LLMBackend
+
+# Dangerous patterns for parser code (allows 'open' since parsers need file I/O)
+_DANGEROUS_IMPORTS = frozenset({
+    "os", "subprocess", "sys", "shutil", "socket", "urllib",
+    "requests", "httplib", "ftplib", "smtplib", "pickle",
+})
+_DANGEROUS_CALLS = frozenset({"eval", "exec", "compile", "__import__"})
+
+PARSER_PROMPT = '''You are a data parsing expert. Generate a Python function to parse this file format.
+
+=== SAMPLE (first 4KB) ===
+{sample}
+
+=== TASK ===
+Generate a function with this EXACT signature:
+
+```python
+def parse_file(file_path: str) -> list[dict]:
+    """Parse the file into a list of records."""
+    # Your implementation
+```
+
+RULES:
+- Return list of dicts, one dict per logical record
+- Use only stdlib (xml.etree, json, re, csv)
+- Handle the ENTIRE file, not just this sample
+- Be defensive about malformed data
+- Include necessary imports inside or before the function
+'''
+
+
+def check_parser_safety(code: str) -> list[str]:
+    """Check parser code for dangerous patterns. Returns list of issues."""
+    issues = []
+    try:
+        tree = ast.parse(code)
+    except SyntaxError as e:
+        return [f"Syntax error: {e}"]
+
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                module = alias.name.split(".")[0]
+                if module in _DANGEROUS_IMPORTS:
+                    issues.append(f"Dangerous import: {alias.name}")
+        if isinstance(node, ast.ImportFrom):
+            if node.module:
+                module = node.module.split(".")[0]
+                if module in _DANGEROUS_IMPORTS:
+                    issues.append(f"Dangerous import: from {node.module}")
+        if isinstance(node, ast.Call):
+            if isinstance(node.func, ast.Name):
+                if node.func.id in _DANGEROUS_CALLS:
+                    issues.append(f"Dangerous call: {node.func.id}()")
+    return issues
+
+
+def extract_python_block(text: str) -> str:
+    """Extract code from ```python ... ``` block."""
+    match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
+    return match.group(1).strip() if match else text.strip()
+
+
+def generate_parser(llm_backend: LLMBackend, file_path: str) -> callable:
+    """
+    Generate a parser function for an unknown file format.
+
+    Args:
+        llm_backend: LLM backend implementing generate(prompt) -> str
+        file_path: Path to the file to parse
+
+    Returns:
+        A callable parse_file(file_path) -> list[dict]
+
+    Raises:
+        ValueError: If generated code is unsafe, has invalid syntax,
+            or doesn't return list of dicts
+    """
+    path = Path(file_path)
+    with open(path, "r", errors="replace") as f:
+        sample = f.read(4096)
+
+    prompt = PARSER_PROMPT.format(sample=sample)
+    response = llm_backend.generate(prompt)
+    code = extract_python_block(response)
+
+    # Validate syntax
+    try:
+        ast.parse(code)
+    except SyntaxError as e:
+        raise ValueError(f"Generated parser has invalid syntax: {e}")
+
+    # Security check
+    issues = check_parser_safety(code)
+    if issues:
+        raise ValueError(f"Generated parser contains dangerous code: {issues}")
+
+    # Execute to get function
+    namespace: dict = {}
+    exec(code, namespace)
+
+    if "parse_file" not in namespace:
+        raise ValueError("Generated code must define 'parse_file' function")
+
+    parser = namespace["parse_file"]
+
+    # Validate on actual file
+    result = parser(file_path)
+    if not isinstance(result, list):
+        raise ValueError(f"Parser must return list, got {type(result).__name__}")
+    if result and not isinstance(result[0], dict):
+        raise ValueError(
+            f"Parser must return list of dicts, got list of {type(result[0]).__name__}"
+        )
+
+    return parser
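`check_parser_safety` and `extract_python_block` are pure functions over a code string, so the safety gate can be exercised without any LLM in the loop. A small illustration (the response below is synthetic):

```python
# Sketch: exercising parser_generator's safety gate on a synthetic LLM response.
from recursive_cleaner.parser_generator import check_parser_safety, extract_python_block

fence = "```"  # built dynamically so this example doesn't nest literal fences
response = (
    f"Here is your parser:\n{fence}python\n"
    "import os\n\n"
    "def parse_file(file_path: str) -> list[dict]:\n"
    "    return [{'raw': line} for line in open(file_path)]\n"
    f"{fence}\n"
)
code = extract_python_block(response)
print(check_parser_safety(code))   # ['Dangerous import: os']

safe = "def parse_file(p):\n    return [{'raw': l} for l in open(p)]"
print(check_parser_safety(safe))   # [] -- plain open() is deliberately allowed
```

Note the residual risk: `generate_parser` still runs the vetted code with `exec` and then calls it on the input file, so the AST check is a gate, not a sandbox.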
recursive_cleaner/parsers.py
CHANGED
@@ -17,6 +17,62 @@ try:
 except ImportError:
     _HAS_SENTENCE_CHUNKER = False

+# File extensions supported by markitdown for conversion to text
+MARKITDOWN_EXTENSIONS = {
+    ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
+    ".html", ".htm", ".epub", ".msg", ".rtf", ".odt", ".ods", ".odp"
+}
+
+
+def load_parquet(file_path: str) -> list[dict]:
+    """Load parquet file as list of dicts.
+
+    Args:
+        file_path: Path to the parquet file
+
+    Returns:
+        List of dictionaries, one per row
+
+    Raises:
+        ImportError: If pyarrow is not installed
+    """
+    try:
+        import pyarrow.parquet as pq
+    except ImportError:
+        raise ImportError(
+            "pyarrow is required for parquet files. "
+            "Install with: pip install recursive-cleaner[parquet]"
+        )
+
+    table = pq.read_table(file_path)
+    return table.to_pylist()
+
+
+def preprocess_with_markitdown(file_path: str) -> str:
+    """
+    Convert supported formats to text using markitdown.
+
+    Args:
+        file_path: Path to the file to convert
+
+    Returns:
+        Extracted text content from the file
+
+    Raises:
+        ImportError: If markitdown is not installed
+    """
+    try:
+        from markitdown import MarkItDown
+    except ImportError:
+        raise ImportError(
+            "markitdown is required for this file type. "
+            "Install with: pip install recursive-cleaner[markitdown]"
+        )
+
+    md = MarkItDown()
+    result = md.convert(file_path)
+    return result.text_content
+

 def chunk_file(
     file_path: str,
@@ -50,6 +106,25 @@ def chunk_file(
     if not path.exists():
         raise FileNotFoundError(f"File not found: {file_path}")

+    # Handle markitdown formats: preprocess to text, then chunk as text
+    if suffix in MARKITDOWN_EXTENSIONS:
+        content = preprocess_with_markitdown(file_path)
+        if not content.strip():
+            return []
+        # Markitdown output is always processed as text
+        if sampling_strategy != "sequential":
+            raise ValueError(
+                f"Text mode only supports 'sequential' sampling, got '{sampling_strategy}'"
+            )
+        return chunk_text_sentences(content, chunk_size, chunk_overlap)
+
+    # Handle parquet files: load as list of dicts, chunk like JSONL
+    if suffix == ".parquet":
+        records = load_parquet(file_path)
+        if not records:
+            return []
+        return _chunk_records(records, chunk_size, sampling_strategy, stratify_field)
+
     content = path.read_text(encoding="utf-8")

     if not content.strip():
@@ -79,7 +154,7 @@ def chunk_file(

 def _detect_mode(suffix: str) -> Literal["structured", "text"]:
     """Detect mode from file extension."""
-    structured_extensions = {".jsonl", ".csv", ".json"}
+    structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
     if suffix in structured_extensions:
         return "structured"
     return "text"
@@ -281,6 +356,61 @@ def _chunk_jsonl(
     return chunks


+def _chunk_records(
+    records: list[dict],
+    item_count: int,
+    sampling_strategy: Literal["sequential", "random", "stratified"] = "sequential",
+    stratify_field: str | None = None,
+) -> list[str]:
+    """Chunk a list of dicts by item count with optional sampling."""
+    if not records:
+        return []
+
+    # For seed computation, use JSON representation
+    seed = _compute_seed(json.dumps(records[0]))
+
+    # Apply sampling strategy
+    if sampling_strategy == "random":
+        records = _shuffle_records(records, seed)
+    elif sampling_strategy == "stratified" and stratify_field:
+        records = _stratified_sample_dicts(records, stratify_field, seed)
+
+    chunks = []
+    for i in range(0, len(records), item_count):
+        chunk_records = records[i:i + item_count]
+        # Convert to JSONL format for LLM context
+        chunk_lines = [json.dumps(r) for r in chunk_records]
+        chunks.append("\n".join(chunk_lines))
+
+    return chunks
+
+
+def _stratified_sample_dicts(records: list[dict], field: str, seed: int) -> list[dict]:
+    """Group dicts by field, interleave proportionally."""
+    groups: dict[str, list] = {}
+    for record in records:
+        key = str(record.get(field, "_missing_"))
+        if key not in groups:
+            groups[key] = []
+        groups[key].append(record)
+
+    # Shuffle within each group
+    rng = random.Random(seed)
+    for key in groups:
+        rng.shuffle(groups[key])
+
+    # Interleave from groups (round-robin)
+    result = []
+    group_lists = list(groups.values())
+    while any(group_lists):
+        for g in group_lists:
+            if g:
+                result.append(g.pop(0))
+        group_lists = [g for g in group_lists if g]
+
+    return result
+
+
 def _compute_seed(content: str) -> int:
     """Compute deterministic seed from content hash."""
     return int(hashlib.md5(content.encode("utf-8")).hexdigest()[:8], 16)
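The round-robin interleave in `_stratified_sample_dicts` is what keeps every stratum represented early in the chunk stream. A self-contained sketch of the same logic, reproduced outside the package (the real code derives the seed from an MD5 of the first record rather than using a constant):

```python
# Sketch: the round-robin interleave that _stratified_sample_dicts performs.
import random

records = [{"label": "spam"}] * 4 + [{"label": "ham"}] * 2

groups: dict[str, list] = {}
for record in records:
    groups.setdefault(str(record.get("label", "_missing_")), []).append(record)

rng = random.Random(42)
for key in groups:
    rng.shuffle(groups[key])  # shuffle within each stratum

result = []
group_lists = list(groups.values())
while any(group_lists):
    for g in group_lists:     # take one record from each non-empty stratum per pass
        if g:
            result.append(g.pop(0))
    group_lists = [g for g in group_lists if g]

print([r["label"] for r in result])
# ['spam', 'ham', 'spam', 'ham', 'spam', 'spam'] -- interleaved until a stratum runs dry
```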
{recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: recursive-cleaner
-Version: 0.6.1
+Version: 0.7.0
 Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
 Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
 Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
@@ -26,8 +26,12 @@ Requires-Dist: tenacity>=8.0
 Provides-Extra: dev
 Requires-Dist: pytest-cov>=4.0; extra == 'dev'
 Requires-Dist: pytest>=7.0; extra == 'dev'
+Provides-Extra: markitdown
+Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
 Provides-Extra: mlx
 Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
+Provides-Extra: parquet
+Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
 Description-Content-Type: text/markdown

 # Recursive Data Cleaner
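Both new extras mirror the lazy imports in parsers.py: `pip install "recursive-cleaner[markitdown,parquet]"` pulls the optional dependencies, and without them the loaders fail lazily with a descriptive error instead of breaking package import. A quick sketch of that behavior ("data.parquet" is a hypothetical file):

```python
# Sketch: the optional-dependency behavior behind the new extras.
# Without pyarrow installed, load_parquet raises before touching the file.
from recursive_cleaner import load_parquet

try:
    rows = load_parquet("data.parquet")
except ImportError as err:
    print(err)  # "pyarrow is required for parquet files. Install with: ..."
```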
{recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/RECORD
CHANGED
@@ -1,14 +1,15 @@
 backends/__init__.py,sha256=FUgODeYSGBvT0-z6myVby6YeAHG0nEUgWLITBKobUew,121
 backends/mlx_backend.py,sha256=0U6IqmDHyk4vjKzytvEcQvSUBryQTgFtsNOcpwFNKk8,2945
-recursive_cleaner/__init__.py,sha256=
-recursive_cleaner/cleaner.py,sha256=
+recursive_cleaner/__init__.py,sha256=bG83PcmkxAYMC17FmKuyMJUrMnuukp32JO3rlCLyB-Q,1698
+recursive_cleaner/cleaner.py,sha256=J2X5bnk2OsWJyOn4BNR-cj0sqeKCylznfs_WEyMGxG8,26280
 recursive_cleaner/context.py,sha256=avMXRDxLd7nd8CKWtvPHQy1MFhBKiA0aUVVJIlWoLZ4,824
 recursive_cleaner/dependencies.py,sha256=vlYeoGL517v3yUSWN0wYDuIs9OOuQwM_dCBADrlitW8,2080
 recursive_cleaner/errors.py,sha256=hwRJF8NSmWy_FZHCxcZDZxLQ0zqvo5dX8ImkB9mrOYc,433
 recursive_cleaner/metrics.py,sha256=C6RlvZMTtVm7kdRhfMZu4xd-R-i1EQGMT5FCasPOO3A,5003
 recursive_cleaner/optimizer.py,sha256=lnQC9Y1ClkW4po1eYa2bnYYu4smiDuUpMPPX6EN1UQ0,9700
 recursive_cleaner/output.py,sha256=quTlZYtKZm9h37mbnwQmEjg0q8VQSZWEqwaHfhSAd3s,6106
-recursive_cleaner/parsers.py,sha256=
+recursive_cleaner/parser_generator.py,sha256=enn6_okGWB2ddVkwI7ytndT04S4QEVAk6cbmb7shxcM,3905
+recursive_cleaner/parsers.py,sha256=HCS2UiVFhboq_go4DyWUygkJTkpfYkFj9_hqWiGIEXo,14572
 recursive_cleaner/prompt.py,sha256=ep0eOXz_XbhH3HduJ76LvzVSftonhcv4GLEecIqd3lY,6484
 recursive_cleaner/report.py,sha256=AWWneRjvl76ccLlExdkKJeY3GVFUG_LtmzVIJJT5cFI,4629
 recursive_cleaner/response.py,sha256=3w0mLnqEPdB4daMSF0mtTcG0PTP-utb1HFtKuYA1ljw,9064
@@ -17,7 +18,7 @@ recursive_cleaner/types.py,sha256=-GdCmsfHd3rfdfCi5c-RXqX4TyuCSHgA__3AF3bMhoQ,29
 recursive_cleaner/validation.py,sha256=-KAolhw3GQyhHwmh0clEj8xqPD5O-R2AO5rx7vubIME,6442
 recursive_cleaner/vendor/__init__.py,sha256=E87TjmjRzu8ty39nqThvBwM611yXlLKQZ6KGY_zp3Dk,117
 recursive_cleaner/vendor/chunker.py,sha256=pDDbfF6FoSmUji0-RG4MletPxJ-VybGw0yfnhh0aMSQ,6730
-recursive_cleaner-0.6.1.dist-info/METADATA,sha256=
-recursive_cleaner-0.6.1.dist-info/WHEEL,sha256=
-recursive_cleaner-0.6.1.dist-info/licenses/LICENSE,sha256=
-recursive_cleaner-0.6.1.dist-info/RECORD,,
+recursive_cleaner-0.7.0.dist-info/METADATA,sha256=bSCS8YBPAYzBufVF41LDYAgpLnYc4JAynA4xkNVuKyo,9486
+recursive_cleaner-0.7.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+recursive_cleaner-0.7.0.dist-info/licenses/LICENSE,sha256=P8hRMK-UqRbQDsVN9nr901wpZcqwXEHr28DXhBUheF0,1064
+recursive_cleaner-0.7.0.dist-info/RECORD,,
{recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/WHEEL
File without changes
{recursive_cleaner-0.6.1.dist-info → recursive_cleaner-0.7.0.dist-info}/licenses/LICENSE
File without changes