cleanllm 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cleanllm-0.4.0/.gitignore +16 -0
- cleanllm-0.4.0/CHANGELOG.md +43 -0
- cleanllm-0.4.0/LICENSE +21 -0
- cleanllm-0.4.0/PKG-INFO +391 -0
- cleanllm-0.4.0/README.md +337 -0
- cleanllm-0.4.0/pyproject.toml +67 -0
- cleanllm-0.4.0/src/cleanllm/__init__.py +41 -0
- cleanllm-0.4.0/src/cleanllm/__main__.py +11 -0
- cleanllm-0.4.0/src/cleanllm/audit.py +287 -0
- cleanllm-0.4.0/src/cleanllm/cli.py +1171 -0
- cleanllm-0.4.0/src/cleanllm/compare.py +290 -0
- cleanllm-0.4.0/src/cleanllm/convert.py +131 -0
- cleanllm-0.4.0/src/cleanllm/dedup.py +545 -0
- cleanllm-0.4.0/src/cleanllm/fix.py +501 -0
- cleanllm-0.4.0/src/cleanllm/gate.py +358 -0
- cleanllm-0.4.0/src/cleanllm/hf.py +111 -0
- cleanllm-0.4.0/src/cleanllm/manifest.py +56 -0
- cleanllm-0.4.0/src/cleanllm/merge.py +52 -0
- cleanllm-0.4.0/src/cleanllm/presets.py +112 -0
- cleanllm-0.4.0/src/cleanllm/recipes.py +391 -0
- cleanllm-0.4.0/src/cleanllm/reports.py +302 -0
- cleanllm-0.4.0/src/cleanllm/run.py +659 -0
- cleanllm-0.4.0/src/cleanllm/sample.py +241 -0
- cleanllm-0.4.0/src/cleanllm/scan.py +1140 -0
- cleanllm-0.4.0/src/cleanllm/shard.py +73 -0
- cleanllm-0.4.0/src/cleanllm/split.py +65 -0
- cleanllm-0.4.0/src/cleanllm/stats.py +865 -0
- cleanllm-0.4.0/src/cleanllm/util.py +1424 -0
- cleanllm-0.4.0/src/cleanllm/validate.py +221 -0
- cleanllm-0.4.0/tests/conftest.py +37 -0
- cleanllm-0.4.0/tests/test_adversarial.py +776 -0
- cleanllm-0.4.0/tests/test_audit.py +152 -0
- cleanllm-0.4.0/tests/test_cli_convert_merge_split.py +441 -0
- cleanllm-0.4.0/tests/test_cli_error_handling.py +566 -0
- cleanllm-0.4.0/tests/test_cli_extended.py +541 -0
- cleanllm-0.4.0/tests/test_cli_help.py +65 -0
- cleanllm-0.4.0/tests/test_compare.py +194 -0
- cleanllm-0.4.0/tests/test_coverage_gaps.py +670 -0
- cleanllm-0.4.0/tests/test_dedup.py +308 -0
- cleanllm-0.4.0/tests/test_dedup_extended.py +477 -0
- cleanllm-0.4.0/tests/test_edge_cases.py +1005 -0
- cleanllm-0.4.0/tests/test_encoding_quality.py +277 -0
- cleanllm-0.4.0/tests/test_extended_pipeline.py +735 -0
- cleanllm-0.4.0/tests/test_fix.py +31 -0
- cleanllm-0.4.0/tests/test_fix_extended.py +600 -0
- cleanllm-0.4.0/tests/test_gate.py +278 -0
- cleanllm-0.4.0/tests/test_haiku_benchmark.py +653 -0
- cleanllm-0.4.0/tests/test_hard_benchmark.py +557 -0
- cleanllm-0.4.0/tests/test_integration.py +473 -0
- cleanllm-0.4.0/tests/test_integration_pipelines.py +674 -0
- cleanllm-0.4.0/tests/test_limits_benchmark.py +874 -0
- cleanllm-0.4.0/tests/test_new_comprehensive.py +897 -0
- cleanllm-0.4.0/tests/test_new_features.py +716 -0
- cleanllm-0.4.0/tests/test_new_features_v2.py +480 -0
- cleanllm-0.4.0/tests/test_presets_reports.py +105 -0
- cleanllm-0.4.0/tests/test_presets_reports_extended.py +470 -0
- cleanllm-0.4.0/tests/test_property_invariants.py +737 -0
- cleanllm-0.4.0/tests/test_randomized_invariants.py +136 -0
- cleanllm-0.4.0/tests/test_recipes.py +86 -0
- cleanllm-0.4.0/tests/test_research_benchmark.py +579 -0
- cleanllm-0.4.0/tests/test_round2_edge_cases.py +620 -0
- cleanllm-0.4.0/tests/test_round2_features.py +764 -0
- cleanllm-0.4.0/tests/test_run.py +499 -0
- cleanllm-0.4.0/tests/test_sample.py +183 -0
- cleanllm-0.4.0/tests/test_scan.py +184 -0
- cleanllm-0.4.0/tests/test_scan_extended.py +503 -0
- cleanllm-0.4.0/tests/test_secrets_language.py +296 -0
- cleanllm-0.4.0/tests/test_security_safety.py +705 -0
- cleanllm-0.4.0/tests/test_shard_manifest.py +23 -0
- cleanllm-0.4.0/tests/test_stats.py +166 -0
- cleanllm-0.4.0/tests/test_stats_extended.py +560 -0
- cleanllm-0.4.0/tests/test_stress_load.py +777 -0
- cleanllm-0.4.0/tests/test_unicode_nested.py +796 -0
- cleanllm-0.4.0/tests/test_util_extended.py +289 -0
- cleanllm-0.4.0/tests/test_v3_extended.py +539 -0
- cleanllm-0.4.0/tests/test_v3_features.py +352 -0
- cleanllm-0.4.0/tests/test_v4_extended.py +1129 -0
- cleanllm-0.4.0/tests/test_v4_features.py +455 -0
- cleanllm-0.4.0/tests/test_validate.py +104 -0
- cleanllm-0.4.0/tests/test_version.py +32 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.4.0
|
|
4
|
+
|
|
5
|
+
- **Breaking:** `FixRules.forbidden_patterns` now defaults to `[]` (no patterns) instead of the CP-specific patterns (`freopen`, `ifstream`, etc.). Use `--preset cpp17_clean` / `--preset usaco_portable` in the CLI, or pass `forbidden_patterns=list(DEFAULT_FORBIDDEN_PATTERNS)` in Python, to restore the previous CP behavior.
|
|
6
|
+
- **Breaking:** `scan` and `fix` CLI commands now default to no forbidden patterns when no `--preset` or `--forbidden-pattern` flag is given. This is the CLI counterpart of the `FixRules.forbidden_patterns` change above — use a preset (e.g. `--preset cpp17_clean`) for CP datasets.
|
|
7
|
+
- Fix: `FixRules.min_assistant_chars` (default `20`) is now the source of truth for the `empty_assistant` drop threshold, replacing the hardcoded value inside `code_quality_flags`. Set `min_assistant_chars=1` to only drop truly blank responses in text/chat datasets.
|
|
8
|
+
- CI: added GitHub Actions workflow testing Python 3.9–3.13 on push/PR to main.
|
|
9
|
+
|
|
10
|
+
## 0.3.0
|
|
11
|
+
|
|
12
|
+
- New schemas: `sharegpt` and `alpaca` — validate the two most common fine-tuning formats out of the box.
|
|
13
|
+
- New: HuggingFace Hub integration (`cleanllm hf download`) — stream any HF dataset to JSONL with auto schema detection. Requires `pip install cleanllm[hf]`.
|
|
14
|
+
- New Python API: `download_from_hub()` and `detect_hf_schema()` exported from `cleanllm`.
|
|
15
|
+
- New: `convert` command — convert JSONL files between `sharegpt`, `alpaca`, and `chatml` formats.
|
|
16
|
+
- New: `merge` command — merge multiple JSONL files into one, with optional deduplication.
|
|
17
|
+
- New: `split` command — split a JSONL file into train/val sets with configurable ratio, shuffle, and seed.
|
|
18
|
+
|
|
19
|
+
## 0.2.0
|
|
20
|
+
|
|
21
|
+
- New schema: `basic_sft` — general-purpose SFT validation requiring only `id` and `messages`.
|
|
22
|
+
- New preset: `general` — URL removal + whitespace normalization with no domain-specific forbidden patterns.
|
|
23
|
+
- Perf: `detect_encoding_issues` ASCII fast-path — skips all regex for pure-ASCII content (~37% scan speedup on code datasets).
|
|
24
|
+
- Perf: pre-compiled patterns for `code_quality_flags` — eliminates per-call `re._compile` overhead.
|
|
25
|
+
- Perf: `_collect_scan_features` consolidates encoding check + token count into a single string pass (no redundant object traversal).
|
|
26
|
+
- Fix: audit `input_line_number` now correctly populated with the original line number from the input file (was always blank in v0.1.0).
|
|
27
|
+
- Fix: `detect_encoding_issues` now correctly flags double BOM (`\ufeff\ufeff`).
|
|
28
|
+
- Fix: parallel `fix_jsonl` no longer constructs an unconsumed tqdm progress bar.
|
|
29
|
+
|
|
30
|
+
## 0.1.0
|
|
31
|
+
|
|
32
|
+
Initial release:
|
|
33
|
+
|
|
34
|
+
- Streaming JSONL scan/fix/dedup/shard/manifest/validate/sample/audit.
|
|
35
|
+
- Dataset profiling and diffs (`stats`, `compare`).
|
|
36
|
+
- Configurable pipelines (`run`) and CI gating (`gate`).
|
|
37
|
+
- Built-in templates (`recipes`).
|
|
38
|
+
- CLI polish: consistent help text and clearer error messages.
|
|
39
|
+
- Release prep: version plumbing, `--version`, and `python -m cleanllm`.
|
|
40
|
+
- Stabilization: clearer JSON parse errors for run/gate/compare configs.
|
|
41
|
+
- Audit bundle metadata contract made deterministic (manifest includes summary; summary excludes manifest).
|
|
42
|
+
- Bug fix: `detect_encoding_issues` now correctly flags double BOM (`\ufeff\ufeff`).
|
|
43
|
+
- Bug fix: parallel `fix_jsonl` no longer creates an unconsumed tqdm progress bar.
|
cleanllm-0.4.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 cleanllm contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cleanllm-0.4.0/PKG-INFO
ADDED
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cleanllm
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Streaming JSONL cleaner for LLM fine-tuning datasets.
|
|
5
|
+
Author: cleanllm contributors
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 cleanllm contributors
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Keywords: alpaca,audit,chatml,cleaning,dataset,deduplication,fine-tuning,jsonl,llm,sampling,sft,sharegpt
|
|
29
|
+
Classifier: Development Status :: 4 - Beta
|
|
30
|
+
Classifier: Intended Audience :: Developers
|
|
31
|
+
Classifier: Intended Audience :: Science/Research
|
|
32
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
39
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
40
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
41
|
+
Classifier: Topic :: Utilities
|
|
42
|
+
Requires-Python: >=3.9
|
|
43
|
+
Requires-Dist: orjson>=3.9.0
|
|
44
|
+
Requires-Dist: tqdm>=4.66.0
|
|
45
|
+
Requires-Dist: typer>=0.12.0
|
|
46
|
+
Provides-Extra: dev
|
|
47
|
+
Requires-Dist: build>=1.0.0; extra == 'dev'
|
|
48
|
+
Requires-Dist: twine>=4.0.2; extra == 'dev'
|
|
49
|
+
Provides-Extra: hf
|
|
50
|
+
Requires-Dist: datasets>=2.14.0; extra == 'hf'
|
|
51
|
+
Provides-Extra: tiktoken
|
|
52
|
+
Requires-Dist: tiktoken>=0.5.0; extra == 'tiktoken'
|
|
53
|
+
Description-Content-Type: text/markdown
|
|
54
|
+
|
|
55
|
+
# cleanllm
|
|
56
|
+
|
|
57
|
+
**Streaming JSONL cleaner for LLM fine-tuning datasets.** Minimal dependencies, memory-safe, and fast — processes files line-by-line without loading them into memory.
|
|
58
|
+
|
|
59
|
+
[PyPI package](https://pypi.org/project/cleanllm/)
|
|
60
|
+
[Python 3.9+](https://www.python.org/)
|
|
61
|
+
[MIT License](LICENSE)
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## What it does
|
|
66
|
+
|
|
67
|
+
cleanllm gives you a pipeline for cleaning, validating, and profiling JSONL datasets before fine-tuning:
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
raw.jsonl → scan → fix → dedup → validate → stats → audit bundle → shards
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Every step is streaming (no full-file load), resumable, and produces machine-readable JSON reports for CI gating.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Install
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install cleanllm
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Or from source:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
git clone https://github.com/verma8076/cleanllm
|
|
87
|
+
cd cleanllm
|
|
88
|
+
pip install -e .
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Quickstart
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Scan for issues
|
|
97
|
+
cleanllm scan data.jsonl
|
|
98
|
+
|
|
99
|
+
# Fix: remove URLs, normalize whitespace, redact forbidden patterns (none by default — add --preset or --forbidden-pattern)
|
|
100
|
+
cleanllm fix data.jsonl -o data.cleaned.jsonl
|
|
101
|
+
|
|
102
|
+
# Deduplicate by prompt content
|
|
103
|
+
cleanllm dedup data.cleaned.jsonl -o data.dedup.jsonl --by prompt
|
|
104
|
+
|
|
105
|
+
# Profile the cleaned dataset
|
|
106
|
+
cleanllm stats data.dedup.jsonl --report-json stats.json
|
|
107
|
+
|
|
108
|
+
# Gate in CI: fail if invalid rows increased
|
|
109
|
+
cleanllm gate --compare compare.json --rules gate_rules.json
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## CLI reference
|
|
115
|
+
|
|
116
|
+
### `scan`
|
|
117
|
+
Streaming scan for issues — invalid JSON, missing keys, URLs, forbidden patterns, language distribution, duplicate estimate.
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
cleanllm scan data.jsonl
|
|
121
|
+
cleanllm scan data.jsonl --report-json scan_report.json --dup-estimate
|
|
122
|
+
cleanllm scan data.jsonl --preset usaco_portable
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### `fix`
|
|
126
|
+
Remove URLs, normalize whitespace, redact or drop rows with forbidden patterns.
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
cleanllm fix data.jsonl -o cleaned.jsonl
|
|
130
|
+
cleanllm fix data.jsonl -o cleaned.jsonl --drop-on forbidden_pattern --drop-on invalid_json
|
|
131
|
+
cleanllm fix data.jsonl -o cleaned.jsonl --preset cpp17_clean --report-json fix_report.json
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Drop rules: `invalid_json`, `missing_required_keys`, `forbidden_pattern`, `empty_assistant`, `placeholder`, `repetitive_response`, `bad_conversation`.
|
|
135
|
+
|
|
136
|
+
> **Note on `empty_assistant`:** By default this drops assistant responses shorter than 20 characters — calibrated for code datasets where very short responses are almost always errors. For text/chat datasets, set `--min-assistant-chars 1` to only drop truly blank responses.
|
|
137
|
+
|
|
138
|
+
### `validate`
|
|
139
|
+
Schema validation, line by line. Exit code `0` only if all rows pass.
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
cleanllm validate data.jsonl --schema basic_sft
|
|
143
|
+
cleanllm validate data.jsonl --schema cp_sft_v1
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
| Schema | Required fields |
|
|
147
|
+
|---|---|
|
|
148
|
+
| `basic_sft` | `id`, `messages` (list of `role`/`content` dicts) |
|
|
149
|
+
| `cp_sft_v1` | `id`, `source`, `problem_id`, `messages`, `tests` (non-empty, with `input`/`output`) |
|
|
150
|
+
|
|
151
|
+
### `dedup`
|
|
152
|
+
First-occurrence deduplication — by full record, prompt (system+user), or code (assistant).
|
|
153
|
+
|
|
154
|
+
```bash
|
|
155
|
+
cleanllm dedup data.jsonl -o deduped.jsonl --by record
|
|
156
|
+
cleanllm dedup data.jsonl -o deduped.jsonl --by prompt --normalized
|
|
157
|
+
cleanllm dedup data.jsonl -o deduped.jsonl --by code --report-json dedup_report.json
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### `stats`
|
|
161
|
+
Single-pass profiler: distributions, structural stats, schema counts, response lengths, language distribution.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
cleanllm stats data.jsonl
|
|
165
|
+
cleanllm stats data.jsonl --schema cp_sft_v1 --keys source,difficulty_bucket --top-k 20
|
|
166
|
+
cleanllm stats data.jsonl --report-json stats.json
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### `compare`
|
|
170
|
+
Diff two stats reports to catch regressions between dataset versions.
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
cleanllm compare old_stats.json new_stats.json
|
|
174
|
+
cleanllm compare old_stats.json new_stats.json --report-json compare.json
|
|
175
|
+
cleanllm compare old.jsonl new.jsonl --from-jsonl --schema cp_sft_v1
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### `gate`
|
|
179
|
+
CI-friendly quality gating. Nonzero exit on failures.
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
cleanllm gate --stats stats.json --rules gate_rules.json
|
|
183
|
+
cleanllm gate --compare compare.json --rules gate_rules.json --strict
|
|
184
|
+
cleanllm gate --compare compare.json --inline-rule "counts_diff.invalid_json_rows.delta<=0"
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Gate rules JSON:
|
|
188
|
+
|
|
189
|
+
```json
|
|
190
|
+
{
|
|
191
|
+
"version": 1,
|
|
192
|
+
"mode": "compare",
|
|
193
|
+
"rules": [
|
|
194
|
+
{"name": "no_new_invalid", "metric": "counts_diff.invalid_json_rows.delta", "op": "<=", "value": 0},
|
|
195
|
+
{"name": "enough_valid", "metric": "counts_diff.valid_json_rows.new", "op": ">=", "value": 1000}
|
|
196
|
+
]
|
|
197
|
+
}
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Supported ops: `==`, `!=`, `<`, `<=`, `>`, `>=`. Severities: `error` (default), `warn`.
|
|
201
|
+
|
|
202
|
+
### `run`
|
|
203
|
+
Execute a JSON-defined multi-step pipeline with variable substitution.
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
cleanllm run --config pipeline.json
|
|
207
|
+
cleanllm run --config pipeline.json --set input_path=data.jsonl --set outdir=out/v2
|
|
208
|
+
cleanllm run --config pipeline.json --dry-run
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Supported step types: `fix`, `validate`, `dedup`, `stats`, `audit`, `sample`, `shard`, `manifest`, `scan`, `compare`.
|
|
212
|
+
|
|
213
|
+
### `sample`
|
|
214
|
+
Reservoir sampling — random or stratified, deterministic with `--seed`.
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
cleanllm sample data.jsonl -o sample.jsonl -n 500 --seed 42
|
|
218
|
+
cleanllm sample data.jsonl -o sample.jsonl -n 500 --stratify source,difficulty_bucket
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### `audit`
|
|
222
|
+
Build a reproducible audit bundle in one command: sampled JSONL + CSV review index (with original line numbers) + summary + manifest.
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
cleanllm audit data.jsonl --outdir audit_bundle -n 200 --seed 42
|
|
226
|
+
cleanllm audit data.jsonl --outdir audit_bundle -n 200 --stratify source --schema cp_sft_v1
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Bundle contents: `audit_sample.jsonl`, `audit_index.csv`, `audit_summary.json`, `AUDIT_README.md`, `manifest.json`.
|
|
230
|
+
|
|
231
|
+
### `shard` / `manifest`
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
cleanllm shard data.jsonl --outdir shards --size 5000 --gzip
|
|
235
|
+
cleanllm manifest shards -o manifest.json
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### `convert`
|
|
239
|
+
Convert a JSONL file between `sharegpt`, `alpaca`, and `chatml` formats.
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
cleanllm convert data.jsonl -o converted.jsonl --from sharegpt --to chatml
|
|
243
|
+
cleanllm convert data.jsonl -o converted.jsonl --from alpaca --to sharegpt
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
Supported formats: `sharegpt` (conversations list), `alpaca` (instruction/output), `chatml` (messages list).
|
|
247
|
+
|
|
248
|
+
### `merge`
|
|
249
|
+
Merge multiple JSONL files into one, with optional deduplication.
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
cleanllm merge a.jsonl b.jsonl c.jsonl -o merged.jsonl
|
|
253
|
+
cleanllm merge a.jsonl b.jsonl -o merged.jsonl --dedup
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### `split`
|
|
257
|
+
Split a JSONL file into train and val sets.
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
cleanllm split data.jsonl --outdir splits/
|
|
261
|
+
cleanllm split data.jsonl --outdir splits/ --ratio 0.95 --seed 42 --no-shuffle
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Outputs `<basename>_train.jsonl` and `<basename>_val.jsonl` in the output directory. Default ratio is 0.9 (90% train).
|
|
265
|
+
|
|
266
|
+
### `recipes`
|
|
267
|
+
Bootstrap pipelines and gate rules from built-in templates.
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
cleanllm recipes list
|
|
271
|
+
cleanllm recipes show cp_pipeline_usaco_portable
|
|
272
|
+
cleanllm recipes write cp_bundle --outdir bootstrap/
|
|
273
|
+
```
|
|
274
|
+
|
|
275
|
+
Built-in recipes: `cp_pipeline_basic`, `cp_pipeline_usaco_portable`, `cp_pipeline_fast_audit`, `gate_stats_basic`, `gate_compare_basic`, `gate_compare_strict`, `cp_bundle`.
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Python API
|
|
280
|
+
|
|
281
|
+
```python
|
|
282
|
+
from cleanllm import (
|
|
283
|
+
scan_jsonl, fix_jsonl, FixRules,
|
|
284
|
+
dedup_jsonl, validate_jsonl, stats_jsonl,
|
|
285
|
+
sample_jsonl, audit_bundle,
|
|
286
|
+
shard_jsonl, make_manifest,
|
|
287
|
+
download_from_hub, detect_hf_schema,
|
|
288
|
+
)
|
|
289
|
+
from cleanllm.convert import convert_jsonl
|
|
290
|
+
from cleanllm.merge import merge_jsonl
|
|
291
|
+
from cleanllm.split import split_jsonl
|
|
292
|
+
|
|
293
|
+
# Scan
|
|
294
|
+
report = scan_jsonl("data.jsonl")
|
|
295
|
+
|
|
296
|
+
# Fix (code dataset)
|
|
297
|
+
rules = FixRules(
|
|
298
|
+
drop_on={"forbidden_pattern", "empty_assistant"},
|
|
299
|
+
max_tokens=4096,
|
|
300
|
+
keep_language="python",
|
|
301
|
+
)
|
|
302
|
+
summary = fix_jsonl("data.jsonl", "cleaned.jsonl", rules)
|
|
303
|
+
|
|
304
|
+
# Fix (text/chat dataset — only drop truly blank responses)
|
|
305
|
+
rules = FixRules(drop_on={"empty_assistant"}, min_assistant_chars=1, forbidden_patterns=[])
|
|
306
|
+
|
|
307
|
+
# Dedup
|
|
308
|
+
result = dedup_jsonl("cleaned.jsonl", "deduped.jsonl", by="prompt", normalized=True)
|
|
309
|
+
|
|
310
|
+
# Stats
|
|
311
|
+
stats = stats_jsonl("deduped.jsonl", schema="cp_sft_v1", keys=["source", "difficulty_bucket"])
|
|
312
|
+
|
|
313
|
+
# Sample + audit
|
|
314
|
+
sample_jsonl("deduped.jsonl", "sample.jsonl", num_rows=200, seed=42)
|
|
315
|
+
audit_bundle("deduped.jsonl", "audit_bundle", num_rows=200, seed=42, stratify=["source"])
|
|
316
|
+
|
|
317
|
+
# Shard + manifest
|
|
318
|
+
shard_jsonl("deduped.jsonl", "shards", shard_size=5000, gzip_output=True)
|
|
319
|
+
make_manifest("shards", "manifest.json")
|
|
320
|
+
|
|
321
|
+
# Convert between formats
|
|
322
|
+
convert_jsonl("data.jsonl", "out.jsonl", from_fmt="sharegpt", to_fmt="chatml")
|
|
323
|
+
|
|
324
|
+
# Merge + split
|
|
325
|
+
merge_jsonl(["a.jsonl", "b.jsonl"], "merged.jsonl", dedup=True)
|
|
326
|
+
split_jsonl("merged.jsonl", "splits/", ratio=0.9, seed=42)
|
|
327
|
+
|
|
328
|
+
# Download from HuggingFace Hub (requires pip install cleanllm[hf])
|
|
329
|
+
result = download_from_hub("HuggingFaceH4/ultrachat_200k", "data.jsonl", split="train_sft")
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
---
|
|
333
|
+
|
|
334
|
+
## Presets
|
|
335
|
+
|
|
336
|
+
| Preset | Description |
|
|
337
|
+
|---|---|
|
|
338
|
+
| `general` | URL removal + whitespace normalization, no domain-specific forbidden patterns |
|
|
339
|
+
| `security_scan` | Redacts secrets: AWS keys, GitHub tokens, API keys, private keys |
|
|
340
|
+
| `pii_scan` | Redacts PII: emails, US phone numbers, SSNs, credit cards, IPv4 addresses |
|
|
341
|
+
| `cpp17_clean` | URL removal + whitespace normalization + redact C++ portability issues |
|
|
342
|
+
| `usaco_portable` | Strict CP portability — drops rows with forbidden patterns |
|
|
343
|
+
| `deterministic_only` | Drops rows with non-deterministic APIs (`rand()`, `random_device`, etc.) |
|
|
344
|
+
|
|
345
|
+
---
|
|
346
|
+
|
|
347
|
+
## Defaults
|
|
348
|
+
|
|
349
|
+
- **Required keys:** `id`, `messages`
|
|
350
|
+
- **Forbidden patterns (default):** none — use `--preset cpp17_clean` or `--preset usaco_portable` for CP datasets
|
|
351
|
+
- **`empty_assistant` threshold:** 20 characters (responses shorter than this are flagged as empty)
|
|
352
|
+
|
|
353
|
+
> **CP datasets:** To apply competitive-programming forbidden patterns (`freopen`, `ifstream`, `bits/extc++.h`, etc.) use a preset: `cleanllm fix data.jsonl -o out.jsonl --preset usaco_portable`. In Python, pass `forbidden_patterns=list(DEFAULT_FORBIDDEN_PATTERNS)` explicitly.
|
|
354
|
+
|
|
355
|
+
---
|
|
356
|
+
|
|
357
|
+
## Data format
|
|
358
|
+
|
|
359
|
+
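cleanllm expects JSONL where each line is a JSON object. By default, only the `id` and `messages` keys are required (the stricter `cp_sft_v1` schema additionally requires `source`, `problem_id`, and `tests`):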
|
|
360
|
+
|
|
361
|
+
```json
|
|
362
|
+
{
|
|
363
|
+
"id": "unique-id",
|
|
364
|
+
"messages": [
|
|
365
|
+
{"role": "system", "content": "..."},
|
|
366
|
+
{"role": "user", "content": "..."},
|
|
367
|
+
{"role": "assistant", "content": "..."}
|
|
368
|
+
]
|
|
369
|
+
}
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
Optional fields: `source`, `difficulty_bucket`, `problem_id`, `tests`.
|
|
373
|
+
|
|
374
|
+
---
|
|
375
|
+
|
|
376
|
+
## Development
|
|
377
|
+
|
|
378
|
+
```bash
|
|
379
|
+
pip install -e ".[dev]" pytest
|
|
380
|
+
pytest
|
|
381
|
+
python -m build
|
|
382
|
+
twine check dist/*
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
See `RELEASE_CHECKLIST.md` for the full release workflow.
|
|
386
|
+
|
|
387
|
+
---
|
|
388
|
+
|
|
389
|
+
## License
|
|
390
|
+
|
|
391
|
+
MIT
|