cleanllm 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cleanllm/__init__.py ADDED
@@ -0,0 +1,41 @@
1
+ """cleanllm: streaming JSONL cleaner for LLM fine-tuning datasets."""
2
+ from __future__ import annotations
3
+
4
+ from importlib.metadata import PackageNotFoundError, version
5
+
6
+ from .scan import scan_jsonl
7
+ from .fix import fix_jsonl, FixRules
8
+ from .shard import shard_jsonl
9
+ from .manifest import make_manifest
10
+ from .validate import validate_jsonl
11
+ from .dedup import dedup_jsonl
12
+ from .sample import sample_jsonl
13
+ from .audit import audit_bundle
14
+ from .stats import stats_jsonl
15
+ from .hf import download_from_hub, detect_hf_schema
16
+
17
+
18
+ def _resolve_version() -> str:
19
+ try:
20
+ return version("cleanllm")
21
+ except PackageNotFoundError:
22
+ return "0.0.0"
23
+
24
+
25
# Resolved once at import time; "0.0.0" when distribution metadata is missing.
__version__ = _resolve_version()

# Names re-exported from the submodules as the package's public API.
__all__ = [
    "scan_jsonl",
    "fix_jsonl",
    "FixRules",
    "shard_jsonl",
    "make_manifest",
    "validate_jsonl",
    "dedup_jsonl",
    "sample_jsonl",
    "audit_bundle",
    "stats_jsonl",
    "download_from_hub",
    "detect_hf_schema",
    "__version__",
]
cleanllm/__main__.py ADDED
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from .cli import app
4
+
5
+
6
def main() -> None:
    """Invoke the ``app`` callable imported from ``.cli``."""
    app()
8
+
9
+
10
# Entry point when the package is executed as a script/module.
if __name__ == "__main__":
    main()
cleanllm/audit.py ADDED
@@ -0,0 +1,287 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import os
5
+ from dataclasses import dataclass
6
+ from typing import Any
7
+
8
+ from .sample import sample_jsonl
9
+ from .util import parse_json, sha256_file
10
+
11
+
12
@dataclass(frozen=True)
class AuditPaths:
    """Immutable set of file locations that make up one audit bundle."""

    # Directory that holds the bundle.
    outdir: str
    # Sampled JSONL rows.
    sample_path: str
    # Review index CSV.
    index_path: str
    # Machine-readable summary JSON.
    summary_path: str
    # Human-readable README for the bundle.
    readme_path: str
    # Manifest of bundle file hashes.
    manifest_path: str
20
+
21
+
22
def audit_bundle(
    inp: str,
    outdir: str,
    num_rows: int,
    seed: int | None = None,
    stratify: list[str] | None = None,
    schema: str | None = None,
    overwrite: bool = False,
    show_progress: bool = False,
    started_at: str | None = None,
    ended_at: str | None = None,
    duration_seconds: float | None = None,
    version: str | None = None,
) -> dict[str, Any]:
    """Sample *inp* and write a review bundle into *outdir*.

    The steps run in a fixed order because later files hash earlier ones:
    sample -> index CSV -> README -> summary -> manifest.

    Args:
        inp: Path of the JSONL file to sample from.
        outdir: Directory that receives the bundle files.
        num_rows: Number of rows requested from the sampler.
        seed: Seed forwarded to the sampler.
        stratify: Field names forwarded to the sampler for stratification.
        schema: Schema name forwarded to the sampler.
        overwrite: Allow writing into an existing, non-empty *outdir*.
        show_progress: Forwarded to the sampler.
        started_at: Recorded verbatim in the summary.
        ended_at: Recorded verbatim in the summary.
        duration_seconds: Recorded verbatim in the summary.
        version: Recorded verbatim in the summary.

    Returns:
        The summary mapping that was also written to the summary file.

    Raises:
        ValueError: If *outdir* exists, is not empty, and *overwrite* is false.
    """
    _prepare_outdir(outdir, overwrite=overwrite)
    bundle = _bundle_paths(outdir)

    sampler_options = {
        "num_rows": num_rows,
        "seed": seed,
        "stratify": stratify,
        "schema": schema,
        "show_progress": show_progress,
    }
    sampled = sample_jsonl(inp, bundle.sample_path, **sampler_options)

    line_numbers = sampled.get("sampled_line_numbers", [])
    rows_indexed = _write_index_csv(bundle.sample_path, bundle.index_path, line_numbers)

    _write_readme(
        bundle.readme_path,
        inp=inp,
        requested_rows=num_rows,
        sampled_rows=sampled["sampled_rows"],
        seed=seed,
        stratify=stratify,
        schema=schema,
        paths=bundle,
    )

    # The summary lists hashes of the other files; it leaves out itself and
    # the manifest so that no file has to contain its own hash.
    hashed_files = _bundle_files(bundle, include_summary=False, include_manifest=False)
    report = _build_summary(
        inp=inp,
        outdir=outdir,
        num_rows=num_rows,
        seed=seed,
        stratify=stratify,
        schema=schema,
        sample_result=sampled,
        index_rows=rows_indexed,
        bundle_files=hashed_files,
        started_at=started_at,
        ended_at=ended_at,
        duration_seconds=duration_seconds,
        version=version,
    )
    _write_summary(bundle.summary_path, report)

    # The manifest covers the summary as well, but never itself.
    _write_manifest(bundle.manifest_path, bundle)

    return report
84
+
85
+
86
+ def _prepare_outdir(outdir: str, overwrite: bool) -> None:
87
+ if os.path.exists(outdir):
88
+ if not overwrite and os.listdir(outdir):
89
+ raise ValueError(f"Outdir exists and is not empty: {outdir}")
90
+ os.makedirs(outdir, exist_ok=True)
91
+
92
+
93
def _bundle_paths(outdir: str) -> AuditPaths:
    """Return the fixed file layout of an audit bundle rooted at *outdir*."""

    def inside(filename: str) -> str:
        return os.path.join(outdir, filename)

    return AuditPaths(
        outdir=outdir,
        sample_path=inside("audit_sample.jsonl"),
        index_path=inside("audit_index.csv"),
        summary_path=inside("audit_summary.json"),
        readme_path=inside("AUDIT_README.md"),
        manifest_path=inside("manifest.json"),
    )
102
+
103
+
104
def _write_index_csv(sample_path: str, index_path: str, sampled_line_numbers: list[int]) -> int:
    """Write the reviewer index CSV for the sampled JSONL file.

    One CSV row is produced per non-blank line of *sample_path* that parses
    to a JSON object. Lines that fail to parse, or parse to something other
    than an object, are skipped. ``sample_idx`` is the 1-based position of
    the line among the non-blank lines of the sample file, so it stays
    aligned with *sampled_line_numbers* even when a line is skipped.

    Args:
        sample_path: JSONL file holding the sampled rows.
        index_path: Destination CSV path (overwritten).
        sampled_line_numbers: Input line number for each sampled row, in
            sample order. Rows beyond its length get an empty
            ``input_line_number``.

    Returns:
        The number of data rows actually written to the CSV (header
        excluded).
    """
    columns = [
        "sample_idx",
        "input_line_number",
        "id",
        "source",
        "problem_id",
        "difficulty_bucket",
        "has_tests",
        "num_messages",
        "num_tests",
        "review_status",
        "review_label",
        "notes",
    ]
    position = 0  # non-blank sample lines seen so far
    written = 0  # rows emitted to the CSV
    with open(sample_path, "r", encoding="utf-8", errors="replace") as f_in, open(
        index_path, "w", encoding="utf-8", newline=""
    ) as f_out:
        writer = csv.DictWriter(f_out, fieldnames=columns)
        writer.writeheader()
        for line in f_in:
            if not line.strip():
                continue
            if position < len(sampled_line_numbers):
                line_number = sampled_line_numbers[position]
            else:
                line_number = ""
            position += 1
            try:
                obj = parse_json(line)
            except Exception:
                # Best effort: the exception type raised by parse_json is not
                # visible here, so any parse failure skips the line.
                continue
            if not isinstance(obj, dict):
                # Valid JSON that is not an object has no fields to index.
                continue

            messages = obj.get("messages")
            tests = obj.get("tests")
            writer.writerow(
                {
                    "sample_idx": position,
                    "input_line_number": line_number,
                    "id": _as_str(obj.get("id")),
                    "source": _as_str(obj.get("source")),
                    "problem_id": _as_str(obj.get("problem_id")),
                    "difficulty_bucket": _as_str(obj.get("difficulty_bucket")),
                    "has_tests": _bool_str(isinstance(tests, list) and len(tests) > 0),
                    "num_messages": str(len(messages)) if isinstance(messages, list) else "",
                    "num_tests": str(len(tests)) if isinstance(tests, list) else "",
                    # Left blank for the human reviewer to fill in.
                    "review_status": "",
                    "review_label": "",
                    "notes": "",
                }
            )
            written += 1
    return written
154
+
155
+
156
+ def _build_summary(
157
+ inp: str,
158
+ outdir: str,
159
+ num_rows: int,
160
+ seed: int | None,
161
+ stratify: list[str] | None,
162
+ schema: str | None,
163
+ sample_result: dict[str, Any],
164
+ index_rows: int,
165
+ bundle_files: list[dict[str, Any]],
166
+ started_at: str | None,
167
+ ended_at: str | None,
168
+ duration_seconds: float | None,
169
+ version: str | None,
170
+ ) -> dict[str, Any]:
171
+ return {
172
+ "command": "audit",
173
+ "input_path": inp,
174
+ "outdir": outdir,
175
+ "requested_rows": num_rows,
176
+ "sampled_rows": sample_result["sampled_rows"],
177
+ "seed": seed,
178
+ "stratify": stratify or None,
179
+ "schema": schema,
180
+ "valid_rows_seen": sample_result["valid_rows_seen"],
181
+ "invalid_json_rows": sample_result["invalid_json_rows"],
182
+ "invalid_schema_rows": sample_result["invalid_schema_rows"],
183
+ "missing_strata_field_rows": sample_result["missing_strata_field_rows"],
184
+ "error_counts_by_reason": sample_result["error_counts_by_reason"],
185
+ "issue_examples": [
186
+ {"line": issue.line, "reason": issue.reason} for issue in sample_result["issues"][:5]
187
+ ],
188
+ "strata_counts_seen": sample_result["strata_counts_seen"],
189
+ "strata_counts_sampled": sample_result["strata_counts_sampled"],
190
+ "bundle_files": bundle_files,
191
+ "index_rows": index_rows,
192
+ "started_at": started_at,
193
+ "ended_at": ended_at,
194
+ "duration_seconds": duration_seconds,
195
+ "version": version,
196
+ }
197
+
198
+
199
+ def _bundle_files(
200
+ paths: AuditPaths,
201
+ include_summary: bool,
202
+ include_manifest: bool,
203
+ ) -> list[dict[str, Any]]:
204
+ files = []
205
+ candidates = [
206
+ paths.sample_path,
207
+ paths.index_path,
208
+ paths.readme_path,
209
+ ]
210
+ if include_summary:
211
+ candidates.append(paths.summary_path)
212
+ if include_manifest:
213
+ candidates.append(paths.manifest_path)
214
+ for path in candidates:
215
+ if not os.path.exists(path):
216
+ continue
217
+ files.append(
218
+ {
219
+ "path": os.path.basename(path),
220
+ "size_bytes": os.path.getsize(path),
221
+ "sha256": sha256_file(path),
222
+ }
223
+ )
224
+ return files
225
+
226
+
227
def _write_summary(path: str, summary: dict[str, Any]) -> None:
    """Write *summary* to *path* as indented, key-sorted JSON plus a newline."""
    import orjson

    json_options = orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
    with open(path, "wb") as out:
        out.write(orjson.dumps(summary, option=json_options) + b"\n")
233
+
234
+
235
+ def _write_readme(
236
+ path: str,
237
+ inp: str,
238
+ requested_rows: int,
239
+ sampled_rows: int,
240
+ seed: int | None,
241
+ stratify: list[str] | None,
242
+ schema: str | None,
243
+ paths: AuditPaths,
244
+ ) -> None:
245
+ lines = [
246
+ "# Audit Bundle",
247
+ "",
248
+ "## Configuration",
249
+ f"- input: {inp}",
250
+ f"- requested_rows: {requested_rows}",
251
+ f"- sampled_rows: {sampled_rows}",
252
+ f"- seed: {seed}",
253
+ f"- stratify: {stratify}",
254
+ f"- schema: {schema}",
255
+ "",
256
+ "## Bundle Contents",
257
+ f"- {os.path.basename(paths.sample_path)}: sampled JSONL rows",
258
+ f"- {os.path.basename(paths.index_path)}: review index (edit review fields)",
259
+ f"- {os.path.basename(paths.summary_path)}: machine-readable summary",
260
+ f"- {os.path.basename(paths.manifest_path)}: file hashes",
261
+ "",
262
+ "## Review Workflow",
263
+ "1. Open audit_index.csv",
264
+ "2. Fill in review_status / review_label / notes",
265
+ "3. Use id to cross-reference audit_sample.jsonl",
266
+ ]
267
+ with open(path, "w", encoding="utf-8") as f:
268
+ f.write("\n".join(lines) + "\n")
269
+
270
+
271
def _write_manifest(path: str, paths: AuditPaths) -> None:
    """Write the bundle manifest to *path*.

    The manifest is a JSON object with a single ``files`` list describing
    every existing bundle file, the summary included. The manifest itself
    is left out so it never has to contain its own hash.
    """
    import orjson

    listing = {"files": _bundle_files(paths, include_summary=True, include_manifest=False)}
    json_options = orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
    with open(path, "wb") as out:
        out.write(orjson.dumps(listing, option=json_options) + b"\n")
278
+
279
+
280
+ def _as_str(value: Any) -> str:
281
+ if value is None:
282
+ return ""
283
+ return str(value)
284
+
285
+
286
+ def _bool_str(value: bool) -> str:
287
+ return "true" if value else "false"