cleanllm 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cleanllm/__init__.py +41 -0
- cleanllm/__main__.py +11 -0
- cleanllm/audit.py +287 -0
- cleanllm/cli.py +1171 -0
- cleanllm/compare.py +290 -0
- cleanllm/convert.py +131 -0
- cleanllm/dedup.py +545 -0
- cleanllm/fix.py +501 -0
- cleanllm/gate.py +358 -0
- cleanllm/hf.py +111 -0
- cleanllm/manifest.py +56 -0
- cleanllm/merge.py +52 -0
- cleanllm/presets.py +112 -0
- cleanllm/recipes.py +391 -0
- cleanllm/reports.py +302 -0
- cleanllm/run.py +659 -0
- cleanllm/sample.py +241 -0
- cleanllm/scan.py +1140 -0
- cleanllm/shard.py +73 -0
- cleanllm/split.py +65 -0
- cleanllm/stats.py +865 -0
- cleanllm/util.py +1424 -0
- cleanllm/validate.py +221 -0
- cleanllm-0.4.0.dist-info/METADATA +391 -0
- cleanllm-0.4.0.dist-info/RECORD +28 -0
- cleanllm-0.4.0.dist-info/WHEEL +4 -0
- cleanllm-0.4.0.dist-info/entry_points.txt +2 -0
- cleanllm-0.4.0.dist-info/licenses/LICENSE +21 -0
cleanllm/__init__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""cleanllm: streaming JSONL cleaner for LLM fine-tuning datasets."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
5
|
+
|
|
6
|
+
from .scan import scan_jsonl
|
|
7
|
+
from .fix import fix_jsonl, FixRules
|
|
8
|
+
from .shard import shard_jsonl
|
|
9
|
+
from .manifest import make_manifest
|
|
10
|
+
from .validate import validate_jsonl
|
|
11
|
+
from .dedup import dedup_jsonl
|
|
12
|
+
from .sample import sample_jsonl
|
|
13
|
+
from .audit import audit_bundle
|
|
14
|
+
from .stats import stats_jsonl
|
|
15
|
+
from .hf import download_from_hub, detect_hf_schema
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _resolve_version() -> str:
    """Return the version of the installed ``cleanllm`` distribution.

    Returns:
        The version string reported by ``importlib.metadata``, or ``"0.0.0"``
        when no ``cleanllm`` distribution metadata can be found.
    """
    try:
        installed = version("cleanllm")
    except PackageNotFoundError:
        # No distribution metadata available: report a placeholder version.
        installed = "0.0.0"
    return installed
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Resolved once, when the package is first imported.
__version__ = _resolve_version()

# Public names re-exported from the package root.
__all__ = [
    # Core JSONL operations.
    "scan_jsonl", "fix_jsonl", "FixRules", "shard_jsonl", "make_manifest",
    "validate_jsonl", "dedup_jsonl", "sample_jsonl", "audit_bundle", "stats_jsonl",
    # Hub helpers.
    "download_from_hub", "detect_hf_schema",
    # Package metadata.
    "__version__",
]
|
cleanllm/__main__.py
ADDED
cleanllm/audit.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import os
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from .sample import sample_jsonl
|
|
9
|
+
from .util import parse_json, sha256_file
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class AuditPaths:
    """Immutable record of the file locations that make up one audit bundle."""

    # Directory the bundle is written into.
    outdir: str
    # JSONL file holding the sampled rows.
    sample_path: str
    # CSV review index, one row per sampled record.
    index_path: str
    # Machine-readable JSON summary of the audit run.
    summary_path: str
    # Human-readable description of the bundle.
    readme_path: str
    # JSON manifest listing size and SHA-256 of the bundle files.
    manifest_path: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def audit_bundle(
    inp: str,
    outdir: str,
    num_rows: int,
    seed: int | None = None,
    stratify: list[str] | None = None,
    schema: str | None = None,
    overwrite: bool = False,
    show_progress: bool = False,
    started_at: str | None = None,
    ended_at: str | None = None,
    duration_seconds: float | None = None,
    version: str | None = None,
) -> dict[str, Any]:
    """Sample ``inp`` and write a reviewable audit bundle into ``outdir``.

    The bundle is produced in this order: sampled JSONL, CSV review index,
    README, JSON summary, and finally the manifest of file hashes.

    Args:
        inp: Path of the JSONL file to sample from.
        outdir: Directory that receives the bundle files.
        num_rows: Number of rows requested from the sampler.
        seed: Forwarded to the sampler and recorded in the README and summary.
        stratify: Forwarded to the sampler and recorded in the README and summary.
        schema: Forwarded to the sampler and recorded in the README and summary.
        overwrite: Allow writing into an existing, non-empty ``outdir``.
        show_progress: Forwarded to the sampler.
        started_at: Recorded verbatim in the summary.
        ended_at: Recorded verbatim in the summary.
        duration_seconds: Recorded verbatim in the summary.
        version: Recorded verbatim in the summary.

    Returns:
        The summary dictionary that was also written to the summary file.

    Raises:
        ValueError: If ``outdir`` exists, is not empty and ``overwrite`` is false.
    """
    _prepare_outdir(outdir, overwrite=overwrite)
    paths = _bundle_paths(outdir)

    # Options that the sampler, the README and the summary all receive unchanged.
    sampling_options = {"seed": seed, "stratify": stratify, "schema": schema}

    sample_result = sample_jsonl(
        inp,
        paths.sample_path,
        num_rows=num_rows,
        show_progress=show_progress,
        **sampling_options,
    )

    index_rows = _write_index_csv(
        paths.sample_path,
        paths.index_path,
        sample_result.get("sampled_line_numbers", []),
    )
    _write_readme(
        paths.readme_path,
        inp=inp,
        requested_rows=num_rows,
        sampled_rows=sample_result["sampled_rows"],
        paths=paths,
        **sampling_options,
    )

    # The summary lists neither itself nor the manifest, which avoids hash loops.
    hashed_so_far = _bundle_files(paths, include_summary=False, include_manifest=False)
    summary = _build_summary(
        inp=inp,
        outdir=outdir,
        num_rows=num_rows,
        sample_result=sample_result,
        index_rows=index_rows,
        bundle_files=hashed_so_far,
        started_at=started_at,
        ended_at=ended_at,
        duration_seconds=duration_seconds,
        version=version,
        **sampling_options,
    )
    _write_summary(paths.summary_path, summary)

    # The manifest covers the summary but not itself (no self-referential hash).
    _write_manifest(paths.manifest_path, paths)

    return summary
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _prepare_outdir(outdir: str, overwrite: bool) -> None:
    """Make sure ``outdir`` exists, refusing to reuse a populated directory.

    Existing contents are left in place when ``overwrite`` is true; nothing is
    deleted here.

    Args:
        outdir: Directory to create (including missing parents) if absent.
        overwrite: When false, an existing non-empty ``outdir`` is rejected.

    Raises:
        ValueError: If ``outdir`` exists, has entries and ``overwrite`` is false.
    """
    # Short-circuit order matters: the directory is only listed when it exists
    # and overwriting has not been allowed.
    already_populated = os.path.exists(outdir) and not overwrite and bool(os.listdir(outdir))
    if already_populated:
        raise ValueError(f"Outdir exists and is not empty: {outdir}")
    os.makedirs(outdir, exist_ok=True)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _bundle_paths(outdir: str) -> AuditPaths:
    """Return the fixed file layout of an audit bundle rooted at ``outdir``."""

    def inside(filename: str) -> str:
        # Every bundle file lives directly under the output directory.
        return os.path.join(outdir, filename)

    return AuditPaths(
        outdir=outdir,
        sample_path=inside("audit_sample.jsonl"),
        index_path=inside("audit_index.csv"),
        summary_path=inside("audit_summary.json"),
        readme_path=inside("AUDIT_README.md"),
        manifest_path=inside("manifest.json"),
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _write_index_csv(sample_path: str, index_path: str, sampled_line_numbers: list[int]) -> int:
    """Write the reviewer index CSV for the sampled rows.

    One CSV row is written per non-blank line of ``sample_path`` that parses to
    a JSON object. Lines that fail to parse, or that parse to something other
    than an object, are skipped instead of aborting the whole audit.

    Args:
        sample_path: JSONL file containing the sampled rows.
        index_path: Destination CSV file (created or truncated).
        sampled_line_numbers: Input line numbers of the sampled rows, matched
            by position to the non-blank lines of ``sample_path``. Positions
            beyond the end of this list get an empty ``input_line_number``.

    Returns:
        The number of data rows actually written to the CSV (header excluded).
    """
    columns = [
        "sample_idx",
        "input_line_number",
        "id",
        "source",
        "problem_id",
        "difficulty_bucket",
        "has_tests",
        "num_messages",
        "num_tests",
        "review_status",
        "review_label",
        "notes",
    ]
    written = 0
    with open(sample_path, "r", encoding="utf-8", errors="replace") as f_in, open(
        index_path, "w", encoding="utf-8", newline=""
    ) as f_out:
        writer = csv.DictWriter(f_out, fieldnames=columns)
        writer.writeheader()
        non_blank_lines = (line for line in f_in if line.strip())
        # sample_idx is the 1-based position among non-blank sample lines; it keeps
        # advancing over skipped rows so the line-number lookup stays aligned.
        for sample_idx, line in enumerate(non_blank_lines, start=1):
            position = sample_idx - 1
            line_number = (
                sampled_line_numbers[position] if position < len(sampled_line_numbers) else ""
            )
            try:
                obj = parse_json(line)
            except Exception:
                # Deliberate best-effort: an unreadable sample row is left out of
                # the index rather than failing the audit.
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object has no fields to index.
                continue

            messages = obj.get("messages")
            tests = obj.get("tests")
            writer.writerow(
                {
                    "sample_idx": sample_idx,
                    "input_line_number": line_number,
                    "id": _as_str(obj.get("id")),
                    "source": _as_str(obj.get("source")),
                    "problem_id": _as_str(obj.get("problem_id")),
                    "difficulty_bucket": _as_str(obj.get("difficulty_bucket")),
                    "has_tests": _bool_str(isinstance(tests, list) and len(tests) > 0),
                    "num_messages": str(len(messages)) if isinstance(messages, list) else "",
                    "num_tests": str(len(tests)) if isinstance(tests, list) else "",
                    # Review columns are left blank for the human reviewer.
                    "review_status": "",
                    "review_label": "",
                    "notes": "",
                }
            )
            written += 1
    return written
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _build_summary(
    inp: str,
    outdir: str,
    num_rows: int,
    seed: int | None,
    stratify: list[str] | None,
    schema: str | None,
    sample_result: dict[str, Any],
    index_rows: int,
    bundle_files: list[dict[str, Any]],
    started_at: str | None,
    ended_at: str | None,
    duration_seconds: float | None,
    version: str | None,
) -> dict[str, Any]:
    """Assemble the machine-readable summary of an audit run.

    Combines the run configuration, the counters reported in ``sample_result``,
    up to five example issues, the hashed bundle files and the timing/version
    metadata into a single dictionary.

    Raises:
        KeyError: If ``sample_result`` lacks one of the keys read here.
    """
    summary: dict[str, Any] = {
        "command": "audit",
        "input_path": inp,
        "outdir": outdir,
        "requested_rows": num_rows,
    }
    summary["sampled_rows"] = sample_result["sampled_rows"]
    summary["seed"] = seed
    # An empty stratify list is recorded as None.
    summary["stratify"] = stratify or None
    summary["schema"] = schema

    # Counters copied verbatim from the sampler result.
    for counter in (
        "valid_rows_seen",
        "invalid_json_rows",
        "invalid_schema_rows",
        "missing_strata_field_rows",
        "error_counts_by_reason",
    ):
        summary[counter] = sample_result[counter]

    # Only the first five issues are kept as examples.
    examples = []
    for issue in sample_result["issues"][:5]:
        examples.append({"line": issue.line, "reason": issue.reason})
    summary["issue_examples"] = examples

    summary["strata_counts_seen"] = sample_result["strata_counts_seen"]
    summary["strata_counts_sampled"] = sample_result["strata_counts_sampled"]
    summary["bundle_files"] = bundle_files
    summary["index_rows"] = index_rows
    summary["started_at"] = started_at
    summary["ended_at"] = ended_at
    summary["duration_seconds"] = duration_seconds
    summary["version"] = version
    return summary
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _bundle_files(
    paths: AuditPaths,
    include_summary: bool,
    include_manifest: bool,
) -> list[dict[str, Any]]:
    """Describe the bundle files that currently exist on disk.

    The sample, index and README files are always considered; the summary and
    manifest only when the corresponding flag is set. Files that do not exist
    are silently left out.

    Returns:
        One dict per existing file with its base name (``path``), its size in
        bytes (``size_bytes``) and its ``sha256`` digest.
    """
    candidates = [paths.sample_path, paths.index_path, paths.readme_path]
    optional = (
        (include_summary, paths.summary_path),
        (include_manifest, paths.manifest_path),
    )
    candidates.extend(location for wanted, location in optional if wanted)

    return [
        {
            "path": os.path.basename(location),
            "size_bytes": os.path.getsize(location),
            "sha256": sha256_file(location),
        }
        for location in candidates
        if os.path.exists(location)
    ]
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _write_summary(path: str, summary: dict[str, Any]) -> None:
    """Write ``summary`` to ``path`` as indented, key-sorted JSON plus a newline."""
    import orjson

    with open(path, "wb") as out:
        encoded = orjson.dumps(summary, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
        out.write(encoded + b"\n")
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _write_readme(
    path: str,
    inp: str,
    requested_rows: int,
    sampled_rows: int,
    seed: int | None,
    stratify: list[str] | None,
    schema: str | None,
    paths: AuditPaths,
) -> None:
    """Write the Markdown README that describes an audit bundle.

    The document has three sections: the sampling configuration, the bundle
    contents (file base names taken from ``paths``) and the review workflow.

    Args:
        path: Destination file (created or truncated).
        inp: Input path shown in the configuration section.
        requested_rows: Requested sample size shown in the configuration section.
        sampled_rows: Actual sample size shown in the configuration section.
        seed: Shown as-is in the configuration section.
        stratify: Shown as-is in the configuration section.
        schema: Shown as-is in the configuration section.
        paths: Bundle file locations; only their base names are printed.
    """
    file_name = os.path.basename

    configuration = [
        f"- input: {inp}",
        f"- requested_rows: {requested_rows}",
        f"- sampled_rows: {sampled_rows}",
        f"- seed: {seed}",
        f"- stratify: {stratify}",
        f"- schema: {schema}",
    ]
    contents = [
        f"- {file_name(paths.sample_path)}: sampled JSONL rows",
        f"- {file_name(paths.index_path)}: review index (edit review fields)",
        f"- {file_name(paths.summary_path)}: machine-readable summary",
        f"- {file_name(paths.manifest_path)}: file hashes",
    ]
    workflow = [
        "1. Open audit_index.csv",
        "2. Fill in review_status / review_label / notes",
        "3. Use id to cross-reference audit_sample.jsonl",
    ]
    document = [
        "# Audit Bundle",
        "",
        "## Configuration",
        *configuration,
        "",
        "## Bundle Contents",
        *contents,
        "",
        "## Review Workflow",
        *workflow,
    ]

    with open(path, "w", encoding="utf-8") as out:
        # Every line, including the last one, ends with a newline.
        out.writelines(f"{row}\n" for row in document)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def _write_manifest(path: str, paths: AuditPaths) -> None:
    """Write the bundle manifest (name, size and SHA-256 per file) to ``path``.

    The manifest covers the summary file but not the manifest itself, so it
    never has to contain its own hash.
    """
    import orjson

    manifest = {"files": _bundle_files(paths, include_summary=True, include_manifest=False)}
    with open(path, "wb") as out:
        encoded = orjson.dumps(manifest, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
        out.write(encoded + b"\n")
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _as_str(value: Any) -> str:
    """Return ``str(value)``, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _bool_str(value: bool) -> str:
    """Render a truthy value as ``"true"`` and a falsy one as ``"false"``."""
    if value:
        return "true"
    return "false"
|