recursive-cleaner 0.6.0.tar.gz → 0.7.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/PKG-INFO +11 -2
  2. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/README.md +5 -0
  3. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/TODO.md +27 -15
  4. recursive_cleaner-0.7.0/docs/contracts/v070-success-criteria.md +13 -0
  5. recursive_cleaner-0.7.0/docs/workflow-state.md +26 -0
  6. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/pyproject.toml +8 -2
  7. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/__init__.py +7 -1
  8. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/cleaner.py +62 -14
  9. recursive_cleaner-0.7.0/recursive_cleaner/parser_generator.py +123 -0
  10. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/parsers.py +131 -1
  11. recursive_cleaner-0.7.0/tests/test_parser_generator.py +611 -0
  12. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_parsers.py +216 -0
  13. recursive_cleaner-0.6.0/CLAUDE_ADVANCED.md +0 -955
  14. recursive_cleaner-0.6.0/docs/workflow-state.md +0 -45
  15. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/.gitignore +0 -0
  16. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/CLAUDE.md +0 -0
  17. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/LICENSE +0 -0
  18. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/backends/__init__.py +0 -0
  19. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/backends/mlx_backend.py +0 -0
  20. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/api-contract.md +0 -0
  21. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/data-schema.md +0 -0
  22. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/success-criteria.md +0 -0
  23. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/text-mode-contract.md +0 -0
  24. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/tier2-contract.md +0 -0
  25. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/tier4-contract.md +0 -0
  26. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/tier4-success-criteria.md +0 -0
  27. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/contracts/two-pass-contract.md +0 -0
  28. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/handoffs/tier4-handoff.md +0 -0
  29. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/implementation-plan-tier4.md +0 -0
  30. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/implementation-plan-v03.md +0 -0
  31. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/implementation-plan-v04.md +0 -0
  32. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/implementation-plan-v05.md +0 -0
  33. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/implementation-plan.md +0 -0
  34. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/langchain-analysis.md +0 -0
  35. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/langgraph-analysis.md +0 -0
  36. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/mlx-lm-guide.md +0 -0
  37. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/other-frameworks-analysis.md +0 -0
  38. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/refactor-assessment/data/dependency.json +0 -0
  39. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/refactor-assessment/data/stats.json +0 -0
  40. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/refactor-assessment/plan.md +0 -0
  41. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/refactor-assessment/report.md +0 -0
  42. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/research/chonkie-extraction.md +0 -0
  43. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/research/chonkie.md +0 -0
  44. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/research/markitdown.md +0 -0
  45. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/docs/smolagents-analysis.md +0 -0
  46. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/context.py +0 -0
  47. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/dependencies.py +0 -0
  48. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/errors.py +0 -0
  49. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/metrics.py +0 -0
  50. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/optimizer.py +0 -0
  51. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/output.py +0 -0
  52. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/prompt.py +0 -0
  53. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/report.py +0 -0
  54. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/response.py +0 -0
  55. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/schema.py +0 -0
  56. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/types.py +0 -0
  57. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/validation.py +0 -0
  58. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/vendor/__init__.py +0 -0
  59. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/recursive_cleaner/vendor/chunker.py +0 -0
  60. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/ecommerce_instructions.txt +0 -0
  61. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/ecommerce_products.jsonl +0 -0
  62. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/financial_instructions.txt +0 -0
  63. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/financial_transactions.jsonl +0 -0
  64. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/healthcare_instructions.txt +0 -0
  65. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/healthcare_patients.jsonl +0 -0
  66. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/run_ecommerce_test.py +0 -0
  67. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/run_financial_test.py +0 -0
  68. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/test_cases/run_healthcare_test.py +0 -0
  69. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/__init__.py +0 -0
  70. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_callbacks.py +0 -0
  71. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_cleaner.py +0 -0
  72. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_context.py +0 -0
  73. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_dependencies.py +0 -0
  74. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_dry_run.py +0 -0
  75. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_holdout.py +0 -0
  76. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_incremental.py +0 -0
  77. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_integration.py +0 -0
  78. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_latency.py +0 -0
  79. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_metrics.py +0 -0
  80. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_optimizer.py +0 -0
  81. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_output.py +0 -0
  82. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_report.py +0 -0
  83. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_sampling.py +0 -0
  84. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_schema.py +0 -0
  85. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_text_mode.py +0 -0
  86. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_validation.py +0 -0
  87. {recursive_cleaner-0.6.0 → recursive_cleaner-0.7.0}/tests/test_vendor_chunker.py +0 -0
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: recursive-cleaner
- Version: 0.6.0
+ Version: 0.7.0
  Summary: LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions
  Project-URL: Homepage, https://github.com/gaztrabisme/recursive-data-cleaner
  Project-URL: Repository, https://github.com/gaztrabisme/recursive-data-cleaner
  Project-URL: Issues, https://github.com/gaztrabisme/recursive-data-cleaner/issues
- Author: Gary Tou
+ Author: Gary Tran
  License-Expression: MIT
  License-File: LICENSE
  Keywords: automation,data-cleaning,data-quality,etl,llm,machine-learning
@@ -26,8 +26,12 @@ Requires-Dist: tenacity>=8.0
  Provides-Extra: dev
  Requires-Dist: pytest-cov>=4.0; extra == 'dev'
  Requires-Dist: pytest>=7.0; extra == 'dev'
+ Provides-Extra: markitdown
+ Requires-Dist: markitdown>=0.1.0; extra == 'markitdown'
  Provides-Extra: mlx
  Requires-Dist: mlx-lm>=0.10.0; extra == 'mlx'
+ Provides-Extra: parquet
+ Requires-Dist: pyarrow>=14.0.0; extra == 'parquet'
  Description-Content-Type: text/markdown

  # Recursive Data Cleaner
@@ -277,6 +281,11 @@ pytest tests/ -v
  | v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
  | v0.1.0 | Core pipeline, chunking, docstring registry |

+ ## Acknowledgments
+
+ - Sentence-aware text chunking adapted from [Chonkie](https://github.com/chonkie-inc/chonkie) (MIT License)
+ - Development assisted by [Claude Code](https://claude.ai/claude-code)
+
  ## License

  MIT
@@ -245,6 +245,11 @@ pytest tests/ -v
  | v0.2.0 | Runtime validation, schema inference, callbacks, incremental saves |
  | v0.1.0 | Core pipeline, chunking, docstring registry |

+ ## Acknowledgments
+
+ - Sentence-aware text chunking adapted from [Chonkie](https://github.com/chonkie-inc/chonkie) (MIT License)
+ - Development assisted by [Claude Code](https://claude.ai/claude-code)
+
  ## License

  MIT
@@ -60,30 +60,42 @@ These patterns proved high-value with low implementation effort:

  ---

- ## Future Considerations
+ ## Tier 5: Format Expansion & UI (v0.7.0) - PLANNED
+
+ ### Markitdown Integration
+ - [ ] Add markitdown as optional dependency
+ - [ ] Auto-convert 20+ formats: Excel, HTML, Word, PDF, PowerPoint, EPUB, etc.
+ - [ ] Preprocessing step before chunking
+ - **Approach**: `pip install recursive-cleaner[markitdown]`
+
+ ### Parquet Support
+ - [ ] Native parser using pyarrow
+ - [ ] Read as list of dicts (same as JSONL)
+ - **Approach**: Optional dependency, ~10 lines of code
+
+ ### LLM-Generated Parsers
+ - [ ] For XML and unknown formats
+ - [ ] Send sample to LLM: "Generate a function to parse this into list of records"
+ - [ ] Validate generated parser on sample before using
+ - **Approach**: Wu wei - let LLM decide how to parse data it understands
+
+ ### Terminal UI (Textual)
+ - [ ] Optional `[ui]` extra dependency
+ - [ ] Live dashboard showing: chunk progress, function generation, latency sparkline
+ - [ ] Pure terminal, no browser needed
+ - **Approach**: `pip install recursive-cleaner[ui]`

- Ideas that might be valuable but need more thought.
+ ---

- ### Confidence Scoring
- - LLM rates confidence in each generated function (high/medium/low)
- - Low confidence = flag for human review
- - **Question**: Does this actually help users, or just add noise?
+ ## Future Considerations

- ### Before/After Examples
- - User provides expected input→output pairs
- - Validate generated functions match expectations
- - **Question**: How to handle functions that transform data differently but correctly?
+ Ideas that might be valuable but need more thought.

  ### Multi-File Batch Mode
  - Process multiple files with shared function registry
  - Functions learned from file A applied to file B
  - **Question**: How to handle schema differences between files?

- ### Summary Buffer Memory
- - Compress old function docstrings into summaries
- - Keep recent functions verbatim
- - **Question**: Does FIFO eviction already work well enough?
-

  ---
  ## Explicitly Deferred
@@ -0,0 +1,13 @@
+ # Success Criteria - v0.7.0 Format Expansion
+
+ ## Project-Level Success
+ - [ ] Markitdown integration converts 20+ formats to text before chunking
+ - [ ] Parquet files load as list of dicts like JSONL/CSV
+ - [ ] LLM-generated parsers handle XML and unknown formats
+ - [ ] All new formats integrate seamlessly with existing cleaning pipeline
+ - [ ] Optional dependencies don't break core functionality when not installed
+ - [ ] All 392 existing tests still pass
+
+ ## Phase Success Criteria
+
+ [To be populated during planning]
@@ -0,0 +1,26 @@
+ # Workflow State - v0.7.0 Format Expansion
+
+ ## Current Phase
+ Research
+
+ ## Awaiting
+ Subagent Completion (Research)
+
+ ## Blockers
+ None
+
+ ## Progress
+ - [ ] Research complete
+ - [ ] Contracts approved
+ - [ ] Plan approved
+ - [ ] Phase 1: Markitdown integration
+ - [ ] Phase 1 audit
+ - [ ] Phase 2: Parquet support
+ - [ ] Phase 2 audit
+ - [ ] Phase 3: LLM-generated parsers
+ - [ ] Phase 3 audit
+
+ ## Previous Version (v0.6.0)
+ - **Tests**: 392 passing
+ - **Lines**: 2,967 total
+ - **Status**: Released on GitHub + PyPI
@@ -4,13 +4,13 @@ build-backend = "hatchling.build"

  [project]
  name = "recursive-cleaner"
- version = "0.6.0"
+ version = "0.7.0"
  description = "LLM-powered incremental data cleaning pipeline that processes massive datasets in chunks and generates Python cleaning functions"
  readme = "README.md"
  license = "MIT"
  requires-python = ">=3.10"
  authors = [
-     { name = "Gary Tou" },
+     { name = "Gary Tran" },
  ]
  keywords = [
      "data-cleaning",
@@ -46,6 +46,12 @@ dev = [
  mlx = [
      "mlx-lm>=0.10.0",
  ]
+ markitdown = [
+     "markitdown>=0.1.0",
+ ]
+ parquet = [
+     "pyarrow>=14.0.0",
+ ]

  [project.urls]
  Homepage = "https://github.com/gaztrabisme/recursive-data-cleaner"
@@ -16,9 +16,10 @@ from recursive_cleaner.optimizer import (
      group_by_salience,
  )
  from recursive_cleaner.output import write_cleaning_file
- from recursive_cleaner.parsers import chunk_file
+ from recursive_cleaner.parsers import MARKITDOWN_EXTENSIONS, chunk_file, load_parquet, preprocess_with_markitdown
  from recursive_cleaner.prompt import build_prompt
  from recursive_cleaner.response import extract_python_block, parse_response
+ from recursive_cleaner.parser_generator import check_parser_safety, generate_parser
  from recursive_cleaner.validation import check_code_safety, extract_sample_data, validate_function

  __all__ = [
@@ -27,6 +28,9 @@ __all__ = [
      "MaxIterationsError",
      "OutputValidationError",
      "chunk_file",
+     "MARKITDOWN_EXTENSIONS",
+     "load_parquet",
+     "preprocess_with_markitdown",
      "parse_response",
      "extract_python_block",
      "build_context",
@@ -43,4 +47,6 @@ __all__ = [
      "extract_tags",
      "group_by_salience",
      "consolidate_with_agency",
+     "generate_parser",
+     "check_parser_safety",
  ]
@@ -12,7 +12,7 @@ from tenacity import retry, stop_after_attempt, wait_exponential
  from .context import build_context
  from .errors import OutputValidationError, ParseError
  from .metrics import QualityMetrics, compare_quality, load_structured_data, measure_quality
- from .parsers import chunk_file
+ from .parsers import MARKITDOWN_EXTENSIONS, chunk_file
  from .prompt import build_prompt
  from .response import parse_response
  from .schema import format_schema_for_prompt, infer_schema
@@ -61,6 +61,7 @@ class DataCleaner:
          saturation_check_interval: int = 20,
          report_path: str | None = "cleaning_report.md",
          dry_run: bool = False,
+         auto_parse: bool = False,
      ):
          self.backend = llm_backend
          self.file_path = file_path
@@ -84,7 +85,9 @@ class DataCleaner:
          self.saturation_check_interval = saturation_check_interval
          self.report_path = report_path
          self.dry_run = dry_run
+         self.auto_parse = auto_parse
          self.functions: list[dict] = []  # List of {name, docstring, code}
+         self._generated_parser: callable | None = None  # LLM-generated parser for unknown formats
          # Track recent function generation for saturation check
          self._recent_new_function_count = 0
          self._last_check_function_count = 0
@@ -319,27 +322,72 @@ class DataCleaner:
      def _detect_mode(self) -> Literal["structured", "text"]:
          """Detect mode from file extension."""
          suffix = Path(self.file_path).suffix.lower()
-         structured_extensions = {".jsonl", ".csv", ".json"}
+         # Markitdown formats are processed as text
+         if suffix in MARKITDOWN_EXTENSIONS:
+             return "text"
+         structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
          if suffix in structured_extensions:
              return "structured"
          return "text"

+     def _is_known_extension(self) -> bool:
+         """Check if file extension is natively supported."""
+         suffix = Path(self.file_path).suffix.lower()
+         known = {".jsonl", ".csv", ".json", ".parquet", ".txt"}
+         return suffix in known or suffix in MARKITDOWN_EXTENSIONS
+
+     def _load_with_auto_parser(self) -> list[str]:
+         """Load file using LLM-generated parser, return JSONL chunks."""
+         from .parser_generator import generate_parser
+
+         print(f"Unknown file format, generating parser...")
+         self._emit("parser_generation_start")
+
+         parser = generate_parser(self.backend, self.file_path)
+         self._generated_parser = parser
+
+         self._emit("parser_generation_complete")
+         print("Parser generated successfully.")
+
+         # Parse the file
+         records = parser(self.file_path)
+         if not records:
+             return []
+
+         # Convert to JSONL chunks
+         import json
+         chunks = []
+         for i in range(0, len(records), self.chunk_size):
+             chunk_records = records[i:i + self.chunk_size]
+             chunk_lines = [json.dumps(r) for r in chunk_records]
+             chunks.append("\n".join(chunk_lines))
+
+         return chunks
+
      def run(self) -> None:
          """Run the cleaning pipeline."""
-         # Resolve effective mode
-         if self.mode == "auto":
-             self._effective_mode = self._detect_mode()
+         # Check if we should use auto-parser for unknown formats
+         use_auto_parser = self.auto_parse and not self._is_known_extension()
+
+         if use_auto_parser:
+             # LLM generates parser, always structured mode
+             self._effective_mode = "structured"
+             chunks = self._load_with_auto_parser()
          else:
-             self._effective_mode = self.mode
+             # Resolve effective mode
+             if self.mode == "auto":
+                 self._effective_mode = self._detect_mode()
+             else:
+                 self._effective_mode = self.mode

-         chunks = chunk_file(
-             self.file_path,
-             self.chunk_size,
-             mode=self._effective_mode,
-             chunk_overlap=self.chunk_overlap,
-             sampling_strategy=self.sampling_strategy,
-             stratify_field=self.stratify_field,
-         )
+             chunks = chunk_file(
+                 self.file_path,
+                 self.chunk_size,
+                 mode=self._effective_mode,
+                 chunk_overlap=self.chunk_overlap,
+                 sampling_strategy=self.sampling_strategy,
+                 stratify_field=self.stratify_field,
+             )

          if not chunks:
              print("No data to process.")
@@ -0,0 +1,123 @@
+ """LLM-generated parser for unknown file formats."""
+
+ import ast
+ import re
+ from pathlib import Path
+
+ from .types import LLMBackend
+
+ # Dangerous patterns for parser code (allows 'open' since parsers need file I/O)
+ _DANGEROUS_IMPORTS = frozenset({
+     "os", "subprocess", "sys", "shutil", "socket", "urllib",
+     "requests", "httplib", "ftplib", "smtplib", "pickle",
+ })
+ _DANGEROUS_CALLS = frozenset({"eval", "exec", "compile", "__import__"})
+
+ PARSER_PROMPT = '''You are a data parsing expert. Generate a Python function to parse this file format.
+
+ === SAMPLE (first 4KB) ===
+ {sample}
+
+ === TASK ===
+ Generate a function with this EXACT signature:
+
+ ```python
+ def parse_file(file_path: str) -> list[dict]:
+     """Parse the file into a list of records."""
+     # Your implementation
+ ```
+
+ RULES:
+ - Return list of dicts, one dict per logical record
+ - Use only stdlib (xml.etree, json, re, csv)
+ - Handle the ENTIRE file, not just this sample
+ - Be defensive about malformed data
+ - Include necessary imports inside or before the function
+ '''
+
+
+ def check_parser_safety(code: str) -> list[str]:
+     """Check parser code for dangerous patterns. Returns list of issues."""
+     issues = []
+     try:
+         tree = ast.parse(code)
+     except SyntaxError as e:
+         return [f"Syntax error: {e}"]
+
+     for node in ast.walk(tree):
+         if isinstance(node, ast.Import):
+             for alias in node.names:
+                 module = alias.name.split(".")[0]
+                 if module in _DANGEROUS_IMPORTS:
+                     issues.append(f"Dangerous import: {alias.name}")
+         if isinstance(node, ast.ImportFrom):
+             if node.module:
+                 module = node.module.split(".")[0]
+                 if module in _DANGEROUS_IMPORTS:
+                     issues.append(f"Dangerous import: from {node.module}")
+         if isinstance(node, ast.Call):
+             if isinstance(node.func, ast.Name):
+                 if node.func.id in _DANGEROUS_CALLS:
+                     issues.append(f"Dangerous call: {node.func.id}()")
+     return issues
+
+
+ def extract_python_block(text: str) -> str:
+     """Extract code from ```python ... ``` block."""
+     match = re.search(r"```python\s*(.*?)\s*```", text, re.DOTALL)
+     return match.group(1).strip() if match else text.strip()
+
+
+ def generate_parser(llm_backend: LLMBackend, file_path: str) -> callable:
+     """
+     Generate a parser function for an unknown file format.
+
+     Args:
+         llm_backend: LLM backend implementing generate(prompt) -> str
+         file_path: Path to the file to parse
+
+     Returns:
+         A callable parse_file(file_path) -> list[dict]
+
+     Raises:
+         ValueError: If generated code is unsafe, has invalid syntax,
+             or doesn't return list of dicts
+     """
+     path = Path(file_path)
+     with open(path, "r", errors="replace") as f:
+         sample = f.read(4096)
+
+     prompt = PARSER_PROMPT.format(sample=sample)
+     response = llm_backend.generate(prompt)
+     code = extract_python_block(response)
+
+     # Validate syntax
+     try:
+         ast.parse(code)
+     except SyntaxError as e:
+         raise ValueError(f"Generated parser has invalid syntax: {e}")
+
+     # Security check
+     issues = check_parser_safety(code)
+     if issues:
+         raise ValueError(f"Generated parser contains dangerous code: {issues}")
+
+     # Execute to get function
+     namespace: dict = {}
+     exec(code, namespace)
+
+     if "parse_file" not in namespace:
+         raise ValueError("Generated code must define 'parse_file' function")
+
+     parser = namespace["parse_file"]
+
+     # Validate on actual file
+     result = parser(file_path)
+     if not isinstance(result, list):
+         raise ValueError(f"Parser must return list, got {type(result).__name__}")
+     if result and not isinstance(result[0], dict):
+         raise ValueError(
+             f"Parser must return list of dicts, got list of {type(result[0]).__name__}"
+         )
+
+     return parser
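To make the flow above concrete, here is a self-contained sketch that exercises the new module without a real model. `StubBackend` is a stand-in whose `generate()` always returns the same trivial parser, and the temporary file stands in for an unknown format; nothing here beyond `check_parser_safety` and `generate_parser` is part of the package.

```python
import pathlib
import tempfile

from recursive_cleaner import check_parser_safety, generate_parser


class StubBackend:
    """Stand-in backend: generate() ignores the prompt and returns fixed parser code."""

    def generate(self, prompt: str) -> str:
        # extract_python_block() falls back to the raw text when no code fence is present.
        return (
            "def parse_file(file_path: str) -> list[dict]:\n"
            '    """Parse one record per non-empty line."""\n'
            "    with open(file_path) as f:\n"
            '        return [{"line": ln.strip()} for ln in f if ln.strip()]\n'
        )


# The safety check flags disallowed imports and calls before any code is executed.
print(check_parser_safety("import subprocess"))  # ['Dangerous import: subprocess']

# generate_parser() validates syntax, safety, and output shape against the real file.
sample = pathlib.Path(tempfile.mkdtemp()) / "records.unknown"
sample.write_text("alpha\nbeta\n")
parser = generate_parser(StubBackend(), str(sample))
print(parser(str(sample)))  # [{'line': 'alpha'}, {'line': 'beta'}]
```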
@@ -17,6 +17,62 @@ try:
  except ImportError:
      _HAS_SENTENCE_CHUNKER = False

+ # File extensions supported by markitdown for conversion to text
+ MARKITDOWN_EXTENSIONS = {
+     ".pdf", ".docx", ".doc", ".xlsx", ".xls", ".pptx", ".ppt",
+     ".html", ".htm", ".epub", ".msg", ".rtf", ".odt", ".ods", ".odp"
+ }
+
+
+ def load_parquet(file_path: str) -> list[dict]:
+     """Load parquet file as list of dicts.
+
+     Args:
+         file_path: Path to the parquet file
+
+     Returns:
+         List of dictionaries, one per row
+
+     Raises:
+         ImportError: If pyarrow is not installed
+     """
+     try:
+         import pyarrow.parquet as pq
+     except ImportError:
+         raise ImportError(
+             "pyarrow is required for parquet files. "
+             "Install with: pip install recursive-cleaner[parquet]"
+         )
+
+     table = pq.read_table(file_path)
+     return table.to_pylist()
+
+
+ def preprocess_with_markitdown(file_path: str) -> str:
+     """
+     Convert supported formats to text using markitdown.
+
+     Args:
+         file_path: Path to the file to convert
+
+     Returns:
+         Extracted text content from the file
+
+     Raises:
+         ImportError: If markitdown is not installed
+     """
+     try:
+         from markitdown import MarkItDown
+     except ImportError:
+         raise ImportError(
+             "markitdown is required for this file type. "
+             "Install with: pip install recursive-cleaner[markitdown]"
+         )
+
+     md = MarkItDown()
+     result = md.convert(file_path)
+     return result.text_content
+

  def chunk_file(
      file_path: str,
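A short usage sketch for the two helpers added above, assuming the matching extras are installed (`pip install "recursive-cleaner[parquet,markitdown]"`); the file paths and the chunk size of 100 items are placeholders.

```python
from recursive_cleaner import chunk_file, load_parquet, preprocess_with_markitdown

# Parquet rows come back as a list of dicts, the same shape JSONL and CSV produce.
rows = load_parquet("data/events.parquet")

# Office/HTML/PDF-style formats are converted to plain text before chunking.
text = preprocess_with_markitdown("reports/summary.docx")

# chunk_file() routes by extension, so callers rarely need the helpers directly:
# parquet goes through the structured path, markitdown formats through the text path.
chunks = chunk_file("data/events.parquet", 100)
```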
@@ -50,6 +106,25 @@ def chunk_file(
      if not path.exists():
          raise FileNotFoundError(f"File not found: {file_path}")

+     # Handle markitdown formats: preprocess to text, then chunk as text
+     if suffix in MARKITDOWN_EXTENSIONS:
+         content = preprocess_with_markitdown(file_path)
+         if not content.strip():
+             return []
+         # Markitdown output is always processed as text
+         if sampling_strategy != "sequential":
+             raise ValueError(
+                 f"Text mode only supports 'sequential' sampling, got '{sampling_strategy}'"
+             )
+         return chunk_text_sentences(content, chunk_size, chunk_overlap)
+
+     # Handle parquet files: load as list of dicts, chunk like JSONL
+     if suffix == ".parquet":
+         records = load_parquet(file_path)
+         if not records:
+             return []
+         return _chunk_records(records, chunk_size, sampling_strategy, stratify_field)
+
      content = path.read_text(encoding="utf-8")

      if not content.strip():
@@ -79,7 +154,7 @@ def chunk_file(

  def _detect_mode(suffix: str) -> Literal["structured", "text"]:
      """Detect mode from file extension."""
-     structured_extensions = {".jsonl", ".csv", ".json"}
+     structured_extensions = {".jsonl", ".csv", ".json", ".parquet"}
      if suffix in structured_extensions:
          return "structured"
      return "text"
@@ -281,6 +356,61 @@ def _chunk_jsonl(
      return chunks


+ def _chunk_records(
+     records: list[dict],
+     item_count: int,
+     sampling_strategy: Literal["sequential", "random", "stratified"] = "sequential",
+     stratify_field: str | None = None,
+ ) -> list[str]:
+     """Chunk a list of dicts by item count with optional sampling."""
+     if not records:
+         return []
+
+     # For seed computation, use JSON representation
+     seed = _compute_seed(json.dumps(records[0]))
+
+     # Apply sampling strategy
+     if sampling_strategy == "random":
+         records = _shuffle_records(records, seed)
+     elif sampling_strategy == "stratified" and stratify_field:
+         records = _stratified_sample_dicts(records, stratify_field, seed)
+
+     chunks = []
+     for i in range(0, len(records), item_count):
+         chunk_records = records[i:i + item_count]
+         # Convert to JSONL format for LLM context
+         chunk_lines = [json.dumps(r) for r in chunk_records]
+         chunks.append("\n".join(chunk_lines))
+
+     return chunks
+
+
+ def _stratified_sample_dicts(records: list[dict], field: str, seed: int) -> list[dict]:
+     """Group dicts by field, interleave proportionally."""
+     groups: dict[str, list] = {}
+     for record in records:
+         key = str(record.get(field, "_missing_"))
+         if key not in groups:
+             groups[key] = []
+         groups[key].append(record)
+
+     # Shuffle within each group
+     rng = random.Random(seed)
+     for key in groups:
+         rng.shuffle(groups[key])
+
+     # Interleave from groups (round-robin)
+     result = []
+     group_lists = list(groups.values())
+     while any(group_lists):
+         for g in group_lists:
+             if g:
+                 result.append(g.pop(0))
+         group_lists = [g for g in group_lists if g]
+
+     return result
+
+
  def _compute_seed(content: str) -> int:
      """Compute deterministic seed from content hash."""
      return int(hashlib.md5(content.encode("utf-8")).hexdigest()[:8], 16)
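For reference, a small illustration of how the new record chunking behaves with stratified sampling; the records are hypothetical and `_chunk_records` is a private helper, so this is purely for orientation.

```python
import json

from recursive_cleaner.parsers import _chunk_records

# Hypothetical records: two categories, deliberately unbalanced.
records = [{"cat": "a", "i": i} for i in range(4)] + [{"cat": "b", "i": i} for i in range(2)]

# Stratified sampling shuffles within each category, then interleaves round-robin,
# so each JSONL chunk mixes categories instead of running through one value at a time.
chunks = _chunk_records(records, 3, sampling_strategy="stratified", stratify_field="cat")
for chunk in chunks:
    print([json.loads(line)["cat"] for line in chunk.splitlines()])
# e.g. ['a', 'b', 'a'] then ['b', 'a', 'a']; exact order depends on the content-derived seed
```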