cleanllm 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. cleanllm-0.4.0/.gitignore +16 -0
  2. cleanllm-0.4.0/CHANGELOG.md +43 -0
  3. cleanllm-0.4.0/LICENSE +21 -0
  4. cleanllm-0.4.0/PKG-INFO +391 -0
  5. cleanllm-0.4.0/README.md +337 -0
  6. cleanllm-0.4.0/pyproject.toml +67 -0
  7. cleanllm-0.4.0/src/cleanllm/__init__.py +41 -0
  8. cleanllm-0.4.0/src/cleanllm/__main__.py +11 -0
  9. cleanllm-0.4.0/src/cleanllm/audit.py +287 -0
  10. cleanllm-0.4.0/src/cleanllm/cli.py +1171 -0
  11. cleanllm-0.4.0/src/cleanllm/compare.py +290 -0
  12. cleanllm-0.4.0/src/cleanllm/convert.py +131 -0
  13. cleanllm-0.4.0/src/cleanllm/dedup.py +545 -0
  14. cleanllm-0.4.0/src/cleanllm/fix.py +501 -0
  15. cleanllm-0.4.0/src/cleanllm/gate.py +358 -0
  16. cleanllm-0.4.0/src/cleanllm/hf.py +111 -0
  17. cleanllm-0.4.0/src/cleanllm/manifest.py +56 -0
  18. cleanllm-0.4.0/src/cleanllm/merge.py +52 -0
  19. cleanllm-0.4.0/src/cleanllm/presets.py +112 -0
  20. cleanllm-0.4.0/src/cleanllm/recipes.py +391 -0
  21. cleanllm-0.4.0/src/cleanllm/reports.py +302 -0
  22. cleanllm-0.4.0/src/cleanllm/run.py +659 -0
  23. cleanllm-0.4.0/src/cleanllm/sample.py +241 -0
  24. cleanllm-0.4.0/src/cleanllm/scan.py +1140 -0
  25. cleanllm-0.4.0/src/cleanllm/shard.py +73 -0
  26. cleanllm-0.4.0/src/cleanllm/split.py +65 -0
  27. cleanllm-0.4.0/src/cleanllm/stats.py +865 -0
  28. cleanllm-0.4.0/src/cleanllm/util.py +1424 -0
  29. cleanllm-0.4.0/src/cleanllm/validate.py +221 -0
  30. cleanllm-0.4.0/tests/conftest.py +37 -0
  31. cleanllm-0.4.0/tests/test_adversarial.py +776 -0
  32. cleanllm-0.4.0/tests/test_audit.py +152 -0
  33. cleanllm-0.4.0/tests/test_cli_convert_merge_split.py +441 -0
  34. cleanllm-0.4.0/tests/test_cli_error_handling.py +566 -0
  35. cleanllm-0.4.0/tests/test_cli_extended.py +541 -0
  36. cleanllm-0.4.0/tests/test_cli_help.py +65 -0
  37. cleanllm-0.4.0/tests/test_compare.py +194 -0
  38. cleanllm-0.4.0/tests/test_coverage_gaps.py +670 -0
  39. cleanllm-0.4.0/tests/test_dedup.py +308 -0
  40. cleanllm-0.4.0/tests/test_dedup_extended.py +477 -0
  41. cleanllm-0.4.0/tests/test_edge_cases.py +1005 -0
  42. cleanllm-0.4.0/tests/test_encoding_quality.py +277 -0
  43. cleanllm-0.4.0/tests/test_extended_pipeline.py +735 -0
  44. cleanllm-0.4.0/tests/test_fix.py +31 -0
  45. cleanllm-0.4.0/tests/test_fix_extended.py +600 -0
  46. cleanllm-0.4.0/tests/test_gate.py +278 -0
  47. cleanllm-0.4.0/tests/test_haiku_benchmark.py +653 -0
  48. cleanllm-0.4.0/tests/test_hard_benchmark.py +557 -0
  49. cleanllm-0.4.0/tests/test_integration.py +473 -0
  50. cleanllm-0.4.0/tests/test_integration_pipelines.py +674 -0
  51. cleanllm-0.4.0/tests/test_limits_benchmark.py +874 -0
  52. cleanllm-0.4.0/tests/test_new_comprehensive.py +897 -0
  53. cleanllm-0.4.0/tests/test_new_features.py +716 -0
  54. cleanllm-0.4.0/tests/test_new_features_v2.py +480 -0
  55. cleanllm-0.4.0/tests/test_presets_reports.py +105 -0
  56. cleanllm-0.4.0/tests/test_presets_reports_extended.py +470 -0
  57. cleanllm-0.4.0/tests/test_property_invariants.py +737 -0
  58. cleanllm-0.4.0/tests/test_randomized_invariants.py +136 -0
  59. cleanllm-0.4.0/tests/test_recipes.py +86 -0
  60. cleanllm-0.4.0/tests/test_research_benchmark.py +579 -0
  61. cleanllm-0.4.0/tests/test_round2_edge_cases.py +620 -0
  62. cleanllm-0.4.0/tests/test_round2_features.py +764 -0
  63. cleanllm-0.4.0/tests/test_run.py +499 -0
  64. cleanllm-0.4.0/tests/test_sample.py +183 -0
  65. cleanllm-0.4.0/tests/test_scan.py +184 -0
  66. cleanllm-0.4.0/tests/test_scan_extended.py +503 -0
  67. cleanllm-0.4.0/tests/test_secrets_language.py +296 -0
  68. cleanllm-0.4.0/tests/test_security_safety.py +705 -0
  69. cleanllm-0.4.0/tests/test_shard_manifest.py +23 -0
  70. cleanllm-0.4.0/tests/test_stats.py +166 -0
  71. cleanllm-0.4.0/tests/test_stats_extended.py +560 -0
  72. cleanllm-0.4.0/tests/test_stress_load.py +777 -0
  73. cleanllm-0.4.0/tests/test_unicode_nested.py +796 -0
  74. cleanllm-0.4.0/tests/test_util_extended.py +289 -0
  75. cleanllm-0.4.0/tests/test_v3_extended.py +539 -0
  76. cleanllm-0.4.0/tests/test_v3_features.py +352 -0
  77. cleanllm-0.4.0/tests/test_v4_extended.py +1129 -0
  78. cleanllm-0.4.0/tests/test_v4_features.py +455 -0
  79. cleanllm-0.4.0/tests/test_validate.py +104 -0
  80. cleanllm-0.4.0/tests/test_version.py +32 -0
@@ -0,0 +1,16 @@
1
+ __pycache__/
2
+ *.pyc
3
+ dist/
4
+ .pytest_cache/
5
+ .pytest_tmp/
6
+ .pytest_tmp*/
7
+ .run_tmp/
8
+ .smoke_func/
9
+ .smoke_tmp/
10
+ _smoke_final_*/
11
+ .tmp/
12
+ .build_tmp/
13
+ .piptmp*/
14
+ pytest-cache-files-*/
15
+ test_results/
16
+ *.egg-info/
@@ -0,0 +1,43 @@
1
+ # Changelog
2
+
3
+ ## 0.4.0
4
+
5
+ - **Breaking:** `FixRules.forbidden_patterns` now defaults to `[]` (no patterns) instead of the competitive-programming (CP) patterns (`freopen`, `ifstream`, etc.). Use `--preset cpp17_clean` / `--preset usaco_portable` in the CLI, or pass `forbidden_patterns=list(DEFAULT_FORBIDDEN_PATTERNS)` in Python, to restore the previous CP behavior.
6
+ - **Breaking:** `scan` and `fix` CLI commands now default to no forbidden patterns when no `--preset` or `--forbidden-pattern` flag is given. This is the CLI side of the change above; for CP datasets, pass a preset such as `--preset cpp17_clean` or `--preset usaco_portable`.
7
+ - Fix: `FixRules.min_assistant_chars` (default `20`) is now the source of truth for the `empty_assistant` drop threshold, replacing the hardcoded value inside `code_quality_flags`. Set `min_assistant_chars=1` to only drop truly blank responses in text/chat datasets.
8
+ - CI: added GitHub Actions workflow testing Python 3.9–3.13 on push/PR to main.
9
+
10
+ ## 0.3.0
11
+
12
+ - New schemas: `sharegpt` and `alpaca` — validate the two most common fine-tuning formats out of the box.
13
+ - New: HuggingFace Hub integration (`cleanllm hf download`) — stream any HF dataset to JSONL with auto schema detection. Requires `pip install cleanllm[hf]`.
14
+ - New Python API: `download_from_hub()` and `detect_hf_schema()` exported from `cleanllm`.
15
+ - New: `convert` command — convert JSONL files between `sharegpt`, `alpaca`, and `chatml` formats.
16
+ - New: `merge` command — merge multiple JSONL files into one, with optional deduplication.
17
+ - New: `split` command — split a JSONL file into train/val sets with configurable ratio, shuffle, and seed.
18
+
19
+ ## 0.2.0
20
+
21
+ - New schema: `basic_sft` — general-purpose SFT validation requiring only `id` and `messages`.
22
+ - New preset: `general` — URL removal + whitespace normalization with no domain-specific forbidden patterns.
23
+ - Perf: `detect_encoding_issues` ASCII fast-path — skips all regex for pure-ASCII content (~37% scan speedup on code datasets).
24
+ - Perf: pre-compiled patterns for `code_quality_flags` — eliminates per-call `re._compile` overhead.
25
+ - Perf: `_collect_scan_features` consolidates encoding check + token count into a single string pass (no redundant object traversal).
26
+ - Fix: audit `input_line_number` now correctly populated with the original line number from the input file (was always blank in v0.1.0).
27
+ - Fix: `detect_encoding_issues` now correctly flags double BOM (`\ufeff\ufeff`).
28
+ - Fix: parallel `fix_jsonl` no longer constructs an unconsumed tqdm progress bar.
29
+
30
+ ## 0.1.0
31
+
32
+ Initial release:
33
+
34
+ - Streaming JSONL scan/fix/dedup/shard/manifest/validate/sample/audit.
35
+ - Dataset profiling and diffs (`stats`, `compare`).
36
+ - Configurable pipelines (`run`) and CI gating (`gate`).
37
+ - Built-in templates (`recipes`).
38
+ - CLI polish: consistent help text and clearer error messages.
39
+ - Release prep: version plumbing, `--version`, and `python -m cleanllm`.
40
+ - Stabilization: clearer JSON parse errors for run/gate/compare configs.
41
+ - Audit bundle metadata contract made deterministic (manifest includes summary; summary excludes manifest).
42
+ - Bug fix: `detect_encoding_issues` now correctly flags double BOM (`\ufeff\ufeff`).
43
+ - Bug fix: parallel `fix_jsonl` no longer creates an unconsumed tqdm progress bar.
cleanllm-0.4.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 cleanllm contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,391 @@
1
+ Metadata-Version: 2.4
2
+ Name: cleanllm
3
+ Version: 0.4.0
4
+ Summary: Streaming JSONL cleaner for LLM fine-tuning datasets.
5
+ Author: cleanllm contributors
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 cleanllm contributors
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+ License-File: LICENSE
28
+ Keywords: alpaca,audit,chatml,cleaning,dataset,deduplication,fine-tuning,jsonl,llm,sampling,sft,sharegpt
29
+ Classifier: Development Status :: 4 - Beta
30
+ Classifier: Intended Audience :: Developers
31
+ Classifier: Intended Audience :: Science/Research
32
+ Classifier: License :: OSI Approved :: MIT License
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.9
35
+ Classifier: Programming Language :: Python :: 3.10
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: Programming Language :: Python :: 3.12
38
+ Classifier: Programming Language :: Python :: 3.13
39
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
40
+ Classifier: Topic :: Software Development :: Libraries
41
+ Classifier: Topic :: Utilities
42
+ Requires-Python: >=3.9
43
+ Requires-Dist: orjson>=3.9.0
44
+ Requires-Dist: tqdm>=4.66.0
45
+ Requires-Dist: typer>=0.12.0
46
+ Provides-Extra: dev
47
+ Requires-Dist: build>=1.0.0; extra == 'dev'
48
+ Requires-Dist: twine>=4.0.2; extra == 'dev'
49
+ Provides-Extra: hf
50
+ Requires-Dist: datasets>=2.14.0; extra == 'hf'
51
+ Provides-Extra: tiktoken
52
+ Requires-Dist: tiktoken>=0.5.0; extra == 'tiktoken'
53
+ Description-Content-Type: text/markdown
54
+
55
+ # cleanllm
56
+
57
+ **Streaming JSONL cleaner for LLM fine-tuning datasets.** Minimal dependencies, memory-safe, and fast — processes files line-by-line without loading them into memory.
58
+
59
+ [![PyPI](https://img.shields.io/pypi/v/cleanllm)](https://pypi.org/project/cleanllm/)
60
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/)
61
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](https://github.com/verma8076/cleanllm/blob/main/LICENSE)
62
+
63
+ ---
64
+
65
+ ## What it does
66
+
67
+ cleanllm gives you a pipeline for cleaning, validating, and profiling JSONL datasets before fine-tuning:
68
+
69
+ ```
70
+ raw.jsonl → scan → fix → dedup → validate → stats → audit bundle → shards
71
+ ```
72
+
73
+ Every step is streaming (no full-file load), resumable, and produces machine-readable JSON reports for CI gating.
74
+
75
+ ---
76
+
77
+ ## Install
78
+
79
+ ```bash
80
+ pip install cleanllm
81
+ ```
82
+
83
+ Or from source:
84
+
85
+ ```bash
86
+ git clone https://github.com/verma8076/cleanllm
87
+ cd cleanllm
88
+ pip install -e .
89
+ ```
90
+
91
+ ---
92
+
93
+ ## Quickstart
94
+
95
+ ```bash
96
+ # Scan for issues
97
+ cleanllm scan data.jsonl
98
+
99
+ # Fix: remove URLs, normalize whitespace, redact forbidden patterns (none by default; add --preset or --forbidden-pattern)
100
+ cleanllm fix data.jsonl -o data.cleaned.jsonl
101
+
102
+ # Deduplicate by prompt content
103
+ cleanllm dedup data.cleaned.jsonl -o data.dedup.jsonl --by prompt
104
+
105
+ # Profile the cleaned dataset
106
+ cleanllm stats data.dedup.jsonl --report-json stats.json
107
+
108
+ # Gate in CI: fail if invalid rows increased
109
+ cleanllm gate --compare compare.json --rules gate_rules.json
110
+ ```
111
+
112
+ ---
113
+
114
+ ## CLI reference
115
+
116
+ ### `scan`
117
+ Streaming scan for issues — invalid JSON, missing keys, URLs, forbidden patterns, language distribution, duplicate estimate.
118
+
119
+ ```bash
120
+ cleanllm scan data.jsonl
121
+ cleanllm scan data.jsonl --report-json scan_report.json --dup-estimate
122
+ cleanllm scan data.jsonl --preset usaco_portable
123
+ ```
124
+
125
+ ### `fix`
126
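+ Remove URLs, normalize whitespace, and redact forbidden patterns (or drop the rows that contain them). No patterns are forbidden unless you pass `--preset` or `--forbidden-pattern`.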
127
+
128
+ ```bash
129
+ cleanllm fix data.jsonl -o cleaned.jsonl
130
+ cleanllm fix data.jsonl -o cleaned.jsonl --drop-on forbidden_pattern --drop-on invalid_json
131
+ cleanllm fix data.jsonl -o cleaned.jsonl --preset cpp17_clean --report-json fix_report.json
132
+ ```
133
+
134
+ Drop rules: `invalid_json`, `missing_required_keys`, `forbidden_pattern`, `empty_assistant`, `placeholder`, `repetitive_response`, `bad_conversation`.
135
+
136
+ > **Note on `empty_assistant`:** By default this drops assistant responses shorter than 20 characters — calibrated for code datasets where very short responses are almost always errors. For text/chat datasets, set `--min-assistant-chars 1` to only drop truly blank responses.
137
+
138
+ ### `validate`
139
+ Schema validation, line by line. Exit code `0` only if all rows pass.
140
+
141
+ ```bash
142
+ cleanllm validate data.jsonl --schema basic_sft
143
+ cleanllm validate data.jsonl --schema cp_sft_v1
144
+ ```
145
+
146
+ | Schema | Required fields |
147
+ |---|---|
148
+ | `basic_sft` | `id`, `messages` (list of `role`/`content` dicts) |
149
+ | `cp_sft_v1` | `id`, `source`, `problem_id`, `messages`, `tests` (non-empty, with `input`/`output`) |
150
+
151
+ ### `dedup`
152
+ First-occurrence deduplication — by full record, prompt (system+user), or code (assistant).
153
+
154
+ ```bash
155
+ cleanllm dedup data.jsonl -o deduped.jsonl --by record
156
+ cleanllm dedup data.jsonl -o deduped.jsonl --by prompt --normalized
157
+ cleanllm dedup data.jsonl -o deduped.jsonl --by code --report-json dedup_report.json
158
+ ```
159
+
160
+ ### `stats`
161
+ Single-pass profiler: distributions, structural stats, schema counts, response lengths, language distribution.
162
+
163
+ ```bash
164
+ cleanllm stats data.jsonl
165
+ cleanllm stats data.jsonl --schema cp_sft_v1 --keys source,difficulty_bucket --top-k 20
166
+ cleanllm stats data.jsonl --report-json stats.json
167
+ ```
168
+
169
+ ### `compare`
170
+ Diff two stats reports to catch regressions between dataset versions.
171
+
172
+ ```bash
173
+ cleanllm compare old_stats.json new_stats.json
174
+ cleanllm compare old_stats.json new_stats.json --report-json compare.json
175
+ cleanllm compare old.jsonl new.jsonl --from-jsonl --schema cp_sft_v1
176
+ ```
177
+
178
+ ### `gate`
179
+ CI-friendly quality gating. Nonzero exit on failures.
180
+
181
+ ```bash
182
+ cleanllm gate --stats stats.json --rules gate_rules.json
183
+ cleanllm gate --compare compare.json --rules gate_rules.json --strict
184
+ cleanllm gate --compare compare.json --inline-rule "counts_diff.invalid_json_rows.delta<=0"
185
+ ```
186
+
187
+ Gate rules JSON:
188
+
189
+ ```json
190
+ {
191
+ "version": 1,
192
+ "mode": "compare",
193
+ "rules": [
194
+ {"name": "no_new_invalid", "metric": "counts_diff.invalid_json_rows.delta", "op": "<=", "value": 0},
195
+ {"name": "enough_valid", "metric": "counts_diff.valid_json_rows.new", "op": ">=", "value": 1000}
196
+ ]
197
+ }
198
+ ```
199
+
200
+ Supported ops: `==`, `!=`, `<`, `<=`, `>`, `>=`. Severities: `error` (default), `warn`.
201
+
202
+ ### `run`
203
+ Execute a JSON-defined multi-step pipeline with variable substitution.
204
+
205
+ ```bash
206
+ cleanllm run --config pipeline.json
207
+ cleanllm run --config pipeline.json --set input_path=data.jsonl --set outdir=out/v2
208
+ cleanllm run --config pipeline.json --dry-run
209
+ ```
210
+
211
+ Supported step types: `fix`, `validate`, `dedup`, `stats`, `audit`, `sample`, `shard`, `manifest`, `scan`, `compare`.
212
+
213
+ ### `sample`
214
+ Reservoir sampling — random or stratified, deterministic with `--seed`.
215
+
216
+ ```bash
217
+ cleanllm sample data.jsonl -o sample.jsonl -n 500 --seed 42
218
+ cleanllm sample data.jsonl -o sample.jsonl -n 500 --stratify source,difficulty_bucket
219
+ ```
220
+
221
+ ### `audit`
222
+ Build a reproducible audit bundle in one command: sampled JSONL + CSV review index (with original line numbers) + summary + manifest.
223
+
224
+ ```bash
225
+ cleanllm audit data.jsonl --outdir audit_bundle -n 200 --seed 42
226
+ cleanllm audit data.jsonl --outdir audit_bundle -n 200 --stratify source --schema cp_sft_v1
227
+ ```
228
+
229
+ Bundle contents: `audit_sample.jsonl`, `audit_index.csv`, `audit_summary.json`, `AUDIT_README.md`, `manifest.json`.
230
+
231
+ ### `shard` / `manifest`
232
+
233
+ ```bash
234
+ cleanllm shard data.jsonl --outdir shards --size 5000 --gzip
235
+ cleanllm manifest shards -o manifest.json
236
+ ```
237
+
238
+ ### `convert`
239
+ Convert a JSONL file between `sharegpt`, `alpaca`, and `chatml` formats.
240
+
241
+ ```bash
242
+ cleanllm convert data.jsonl -o converted.jsonl --from sharegpt --to chatml
243
+ cleanllm convert data.jsonl -o converted.jsonl --from alpaca --to sharegpt
244
+ ```
245
+
246
+ Supported formats: `sharegpt` (conversations list), `alpaca` (instruction/output), `chatml` (messages list).
247
+
248
+ ### `merge`
249
+ Merge multiple JSONL files into one, with optional deduplication.
250
+
251
+ ```bash
252
+ cleanllm merge a.jsonl b.jsonl c.jsonl -o merged.jsonl
253
+ cleanllm merge a.jsonl b.jsonl -o merged.jsonl --dedup
254
+ ```
255
+
256
+ ### `split`
257
+ Split a JSONL file into train and val sets.
258
+
259
+ ```bash
260
+ cleanllm split data.jsonl --outdir splits/
261
+ cleanllm split data.jsonl --outdir splits/ --ratio 0.95 --seed 42 --no-shuffle
262
+ ```
263
+
264
+ Outputs `<basename>_train.jsonl` and `<basename>_val.jsonl` in the output directory. Default ratio is 0.9 (90% train).
265
+
266
+ ### `recipes`
267
+ Bootstrap pipelines and gate rules from built-in templates.
268
+
269
+ ```bash
270
+ cleanllm recipes list
271
+ cleanllm recipes show cp_pipeline_usaco_portable
272
+ cleanllm recipes write cp_bundle --outdir bootstrap/
273
+ ```
274
+
275
+ Built-in recipes: `cp_pipeline_basic`, `cp_pipeline_usaco_portable`, `cp_pipeline_fast_audit`, `gate_stats_basic`, `gate_compare_basic`, `gate_compare_strict`, `cp_bundle`.
276
+
277
+ ---
278
+
279
+ ## Python API
280
+
281
+ ```python
282
+ from cleanllm import (
283
+ scan_jsonl, fix_jsonl, FixRules,
284
+ dedup_jsonl, validate_jsonl, stats_jsonl,
285
+ sample_jsonl, audit_bundle,
286
+ shard_jsonl, make_manifest,
287
+ download_from_hub, detect_hf_schema,
288
+ )
289
+ from cleanllm.convert import convert_jsonl
290
+ from cleanllm.merge import merge_jsonl
291
+ from cleanllm.split import split_jsonl
292
+
293
+ # Scan
294
+ report = scan_jsonl("data.jsonl")
295
+
296
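+ # Fix (code dataset). forbidden_patterns defaults to [] since 0.4.0, so also pass forbidden_patterns=[...] if rows should be dropped on "forbidden_pattern".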
297
+ rules = FixRules(
298
+ drop_on={"forbidden_pattern", "empty_assistant"},
299
+ max_tokens=4096,
300
+ keep_language="python",
301
+ )
302
+ summary = fix_jsonl("data.jsonl", "cleaned.jsonl", rules)
303
+
304
+ # Fix (text/chat dataset — only drop truly blank responses)
305
+ rules = FixRules(drop_on={"empty_assistant"}, min_assistant_chars=1, forbidden_patterns=[])
306
+
307
+ # Dedup
308
+ result = dedup_jsonl("cleaned.jsonl", "deduped.jsonl", by="prompt", normalized=True)
309
+
310
+ # Stats
311
+ stats = stats_jsonl("deduped.jsonl", schema="cp_sft_v1", keys=["source", "difficulty_bucket"])
312
+
313
+ # Sample + audit
314
+ sample_jsonl("deduped.jsonl", "sample.jsonl", num_rows=200, seed=42)
315
+ audit_bundle("deduped.jsonl", "audit_bundle", num_rows=200, seed=42, stratify=["source"])
316
+
317
+ # Shard + manifest
318
+ shard_jsonl("deduped.jsonl", "shards", shard_size=5000, gzip_output=True)
319
+ make_manifest("shards", "manifest.json")
320
+
321
+ # Convert between formats
322
+ convert_jsonl("data.jsonl", "out.jsonl", from_fmt="sharegpt", to_fmt="chatml")
323
+
324
+ # Merge + split
325
+ merge_jsonl(["a.jsonl", "b.jsonl"], "merged.jsonl", dedup=True)
326
+ split_jsonl("merged.jsonl", "splits/", ratio=0.9, seed=42)
327
+
328
+ # Download from HuggingFace Hub (requires pip install cleanllm[hf])
329
+ result = download_from_hub("HuggingFaceH4/ultrachat_200k", "data.jsonl", split="train_sft")
330
+ ```
331
+
332
+ ---
333
+
334
+ ## Presets
335
+
336
+ | Preset | Description |
337
+ |---|---|
338
+ | `general` | URL removal + whitespace normalization, no domain-specific forbidden patterns |
339
+ | `security_scan` | Redacts secrets: AWS keys, GitHub tokens, API keys, private keys |
340
+ | `pii_scan` | Redacts PII: emails, US phone numbers, SSNs, credit cards, IPv4 addresses |
341
+ | `cpp17_clean` | URL removal + whitespace normalization + redact C++ portability issues |
342
+ | `usaco_portable` | Strict CP portability — drops rows with forbidden patterns |
343
+ | `deterministic_only` | Drops rows with non-deterministic APIs (`rand()`, `random_device`, etc.) |
344
+
345
+ ---
346
+
347
+ ## Defaults
348
+
349
+ - **Required keys:** `id`, `messages`
350
+ - **Forbidden patterns (default):** none — use `--preset cpp17_clean` or `--preset usaco_portable` for CP datasets
351
+ - **`empty_assistant` threshold:** 20 characters (responses shorter than this are flagged as empty)
352
+
353
+ > **CP datasets:** To apply competitive-programming forbidden patterns (`freopen`, `ifstream`, `bits/extc++.h`, etc.) use a preset: `cleanllm fix data.jsonl -o out.jsonl --preset usaco_portable`. In Python, pass `forbidden_patterns=list(DEFAULT_FORBIDDEN_PATTERNS)` explicitly.
354
+
355
+ ---
356
+
357
+ ## Data format
358
+
359
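+ cleanllm expects JSONL where each line is a JSON object. A minimal record carries the default required keys `id` and `messages` (the same fields the `basic_sft` schema requires); the stricter `cp_sft_v1` schema additionally requires `source`, `problem_id`, and a non-empty `tests` list. Minimal example: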
360
+
361
+ ```json
362
+ {
363
+ "id": "unique-id",
364
+ "messages": [
365
+ {"role": "system", "content": "..."},
366
+ {"role": "user", "content": "..."},
367
+ {"role": "assistant", "content": "..."}
368
+ ]
369
+ }
370
+ ```
371
+
372
+ Optional fields: `source`, `difficulty_bucket`, `problem_id`, `tests`.
373
+
374
+ ---
375
+
376
+ ## Development
377
+
378
+ ```bash
379
+ pip install -e .[dev]
380
+ pytest
381
+ python -m build
382
+ twine check dist/*
383
+ ```
384
+
385
+ See `RELEASE_CHECKLIST.md` in the source repository (it is not shipped in the source distribution) for the full release workflow.
386
+
387
+ ---
388
+
389
+ ## License
390
+
391
+ MIT