parallelogram 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
"""parallelogram — strict validator for fine-tuning datasets."""
# Version string for this release of the package.
__version__ = "0.2.0"
parallelogram/cli.py ADDED
@@ -0,0 +1,308 @@
1
+ """parallelogram CLI.
2
+
3
+ Single command: `parallelogram check <path>`. Designed to be the
4
+ fastest possible local pre-flight before kicking off a training run.
5
+
6
+ Exit codes are deliberately minimal:
7
+ 0 — clean (no issues)
8
+ 1 — warnings only
9
+ 2 — errors
10
+
11
+ These map cleanly to CI gates without any extra wiring.
12
+
13
+ `--fix` (Phase 2 mechanical tier) attempts to mechanically repair fixable
14
+ issues — strip BOM, replace mojibake, drop empty turns, truncate
15
+ context-window overflow, deduplicate. SLM-tier fixes (rewrite broken role
16
+ sequences, fill in incomplete assistant turns) are not yet implemented and
17
+ will land in a future release as a paid hosted tier.
18
+ """
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import sys
23
+ from pathlib import Path
24
+ from typing import Optional
25
+
26
+ import typer
27
+ from rich.console import Console
28
+
29
+ from .core.fixer import Fixer, Disposition
30
+ from .core.io import atomic_write_jsonl
31
+ from .core.runner import Runner
32
+ from .core.rules import registry
33
+
34
+ # Importing the rule modules triggers self-registration via @registry.register.
35
+ from .rules import ( # noqa: F401
36
+ schema,
37
+ roles,
38
+ empty_content,
39
+ context_window,
40
+ duplicates,
41
+ encoding,
42
+ )
43
+ from .output.terminal import render_report
44
+ from .output.json_output import render_json
45
+
46
+
47
# The Typer application object; commands and the callback below attach to it.
app = typer.Typer(
    no_args_is_help=True,
    add_completion=False,
    name="parallelogram",
    help="Strict validator for fine-tuning datasets. Run before you train.",
)
53
+
54
+
55
@app.callback()
def main() -> None:
    """Strict validator for fine-tuning datasets. Run before you train."""
    # Intentionally has no body: the function only exists to be registered as
    # the app-level callback.
    # NOTE(review): presumably this keeps `check` an explicit subcommand even
    # though it is the only command — confirm against Typer's callback docs.
58
+
59
+
60
def _resolve_disabled_rules(disable: list[str]) -> set[str]:
    """Validate the ``--disable`` values and return them as a set.

    Three layers of safety:

    1. Reject unknown rule ids outright. Typos here are a foot-gun: the
       user thinks they disabled something but the run uses the full set.
    2. Refuse to disable schema. Every other rule assumes structurally
       valid records; disabling schema means the others silently no-op
       on malformed input, producing deceptively clean output.
    3. Loud stderr warning naming exactly what got disabled, plus an
       explicit note that the exit-0 guarantee no longer applies.

    Args:
        disable: rule ids collected from repeated ``--disable`` options.

    Returns:
        The de-duplicated set of disabled rule ids (possibly empty).

    Raises:
        typer.Exit: with code 2 for an unknown id or an attempt to
            disable ``schema``.
    """
    known_rule_ids = {rc.id for rc in registry.all()}
    disabled = set(disable)

    unknown = disabled - known_rule_ids
    if unknown:
        typer.echo(
            f"unknown rule id(s) in --disable: {sorted(unknown)}. "
            f"Valid rule ids: {sorted(known_rule_ids)}",
            err=True,
        )
        raise typer.Exit(2)

    if "schema" in disabled:
        typer.echo(
            "the 'schema' rule cannot be disabled — every other rule "
            "depends on its guarantees about record structure.",
            err=True,
        )
        raise typer.Exit(2)

    if disabled:
        typer.echo(
            " ! WARNING: --disable in effect for: "
            + ", ".join(sorted(disabled)),
            err=True,
        )
        typer.echo(
            " The 'if it exits 0, your run won't fail' guarantee "
            "applies only with all rules enabled.",
            err=True,
        )

    return disabled


def _build_rules(disabled: set[str], tokenizer: Optional[str], max_seq_len: int) -> list:
    """Instantiate every registered rule that is not in ``disabled``.

    Only the ``context-window`` rule receives a config dict (tokenizer name
    and max sequence length); every other rule is built with no arguments.
    """
    rules = []
    for rc in registry.all():
        if rc.id in disabled:
            continue
        if rc.id == "context-window":
            rules.append(rc({"tokenizer": tokenizer, "max_seq_len": max_seq_len}))
        else:
            rules.append(rc())
    return rules


def _run_fix_mode(
    runner: Runner,
    rules: list,
    console: Console,
    path: Path,
    output: Optional[Path],
    dry_run: bool,
    json_output: bool,
    disabled: set[str],
) -> int:
    """Run the ``--fix`` pipeline, report it, and return the process exit code.

    Exit code semantics for fix mode:
        0 if everything is now clean (no dropped records)
        1 if some records were dropped (partial fix)
        2 if nothing was fixable (no records emitted)

    A file is written only when ``output`` is given and ``dry_run`` is off.
    """
    report, parsed, _, unparseable = runner.run_with_records(str(path))
    fix_report = Fixer(rules).fix(parsed, report.issues, unparseable)

    # Format and emit results.
    if json_output:
        payload = {
            "file": str(path),
            "mode": "fix",
            "dry_run": dry_run,
            "disabled_rules": sorted(disabled),
            "summary": {
                "total": fix_report.total_records,
                "unchanged": fix_report.unchanged,
                "fixed": fix_report.fixed,
                "dropped": fix_report.dropped,
                "unparseable": fix_report.unparseable,
                "emitted": len(fix_report.clean_records),
            },
            "fixes_by_rule": fix_report.fixes_by_rule,
            "outcomes": [
                {
                    "line": o.line_no,
                    "disposition": o.disposition.value,
                    "rules_fixed": o.rules_fixed,
                    "rules_unfixable": o.rules_unfixable,
                }
                for o in fix_report.outcomes
            ],
        }
        json.dump(payload, sys.stdout, indent=2)
        sys.stdout.write("\n")
    else:
        _render_fix_report(
            console, fix_report, str(path), dry_run,
            has_output=output is not None,
            disabled_rules=sorted(disabled),
        )

    if not dry_run:
        if output:
            lines = [
                json.dumps(rec, ensure_ascii=False)
                for _, rec in fix_report.clean_records
            ]
            atomic_write_jsonl(output, lines)
            if not json_output:
                console.print(f" [green]→[/green] Wrote {len(lines)} records to {output}")
        elif not json_output:
            # Without --output, --fix does nothing destructive but we should
            # tell the user that no file was written.
            console.print(
                " [yellow]![/yellow] --fix was requested without --output. "
                "Re-run with --output PATH to write the repaired dataset."
            )

    if not fix_report.clean_records:
        return 2
    if fix_report.dropped or fix_report.unparseable:
        return 1
    return 0


@app.command()
def check(
    path: Path = typer.Argument(..., exists=True, readable=True, help="Path to JSONL dataset."),
    format: str = typer.Option(
        "openai-chat",
        "--format", "-f",
        help="Dataset format. Only 'openai-chat' is currently supported.",
    ),
    tokenizer: Optional[str] = typer.Option(
        None,
        "--tokenizer", "-t",
        help="HuggingFace tokenizer name (e.g. meta-llama/Llama-3-8B). Required for context-window check.",
    ),
    max_seq_len: int = typer.Option(
        4096,
        "--max-seq-len",
        help="Max tokens per record. Records over this are flagged because frameworks truncate them silently.",
    ),
    output: Optional[Path] = typer.Option(
        None,
        "--output", "-o",
        help="Write only error-free records to this file. With --fix, writes the repaired dataset.",
    ),
    fix: bool = typer.Option(
        False,
        "--fix",
        help="Attempt mechanical fixes (dedupe, truncate, BOM strip, etc.). "
        "SLM-tier fixes are not yet available.",
    ),
    dry_run: bool = typer.Option(
        False,
        "--dry-run",
        help="With --fix, report what would change without writing any files.",
    ),
    json_output: bool = typer.Option(
        False,
        "--json",
        help="Emit a JSON report to stdout (suppresses pretty output).",
    ),
    disable: list[str] = typer.Option(
        [],
        "--disable",
        help="Disable a rule by id. Repeat to disable multiple. "
        "Disabling rules invalidates the exit-0 guarantee — see warnings.",
    ),
    no_color: bool = typer.Option(False, "--no-color", help="Disable colored output."),
) -> None:
    """Check a fine-tuning dataset for problems that would silently corrupt training."""
    # The docstring above is kept to one line on purpose: it is left exactly
    # as the original so any help output derived from it does not change.
    #
    # Every path through this function ends in typer.Exit:
    #   check mode: 0 clean, 1 warnings only, 2 errors
    #   fix mode:   see _run_fix_mode
    #   bad usage:  2
    if format != "openai-chat":
        # Version-neutral wording: the message used to say "v0.1", which no
        # longer matches the released package version.
        typer.echo(
            f"format {format!r} is not supported (only 'openai-chat').",
            err=True,
        )
        raise typer.Exit(2)

    if dry_run and not fix:
        typer.echo("--dry-run is only meaningful with --fix.", err=True)
        raise typer.Exit(2)

    disabled = _resolve_disabled_rules(disable)
    rules = _build_rules(disabled, tokenizer, max_seq_len)

    runner = Runner(rules)
    console = Console(no_color=no_color)

    if fix:
        raise typer.Exit(
            _run_fix_mode(
                runner, rules, console, path, output,
                dry_run, json_output, disabled,
            )
        )

    # ── Check-only mode ────────────────────────────────────────────────
    report, clean = runner.run(str(path))

    if json_output:
        render_json(report, str(path), disabled_rules=sorted(disabled))
    else:
        render_report(report, console, str(path), disabled_rules=sorted(disabled))

    if output:
        # atomic_write_jsonl re-adds the line terminator itself, so hand it
        # the raw lines with any trailing newline removed.
        atomic_write_jsonl(output, [raw.rstrip("\n") for _, raw in clean])
        if not json_output:
            console.print(
                f" [green]→[/green] Wrote {len(clean)} clean records to {output}"
            )

    if report.has_errors:
        raise typer.Exit(2)
    if report.has_warnings:
        raise typer.Exit(1)
    raise typer.Exit(0)
257
+
258
+
259
def _render_fix_report(console, fr, path: str, dry_run: bool,
                       has_output: bool,
                       disabled_rules: list[str] | None = None) -> None:
    """Print a human-readable summary of a fix run to ``console``.

    Output, in order: one line per rule that applied fixes, up to five
    dropped records with the reason each was dropped, and a bordered panel
    with the aggregate counts. ``has_output`` is accepted but not used by
    the rendering.
    """
    from rich.panel import Panel
    from rich.text import Text

    panel_title = "parallelogram --fix"
    if dry_run:
        panel_title += " (dry run)"

    # Per-rule fix counts, alphabetical by rule id.
    if fr.fixes_by_rule:
        console.print()
        for rule_id in sorted(fr.fixes_by_rule):
            n_fixes = fr.fixes_by_rule[rule_id]
            plural = "" if n_fixes == 1 else "es"
            console.print(f" [green]✓[/green] [cyan]{rule_id}[/cyan] [dim]·[/dim] {n_fixes} fix{plural}")

    # Show the first few dropped records so the user knows what was lost.
    dropped = [
        outcome for outcome in fr.outcomes
        if outcome.disposition == Disposition.DROPPED
    ]
    if dropped:
        console.print()
        console.print(" [red]✗[/red] [bold]dropped:[/bold]")
        for outcome in dropped[:5]:
            if outcome.rules_unfixable:
                why = ", ".join(outcome.rules_unfixable) + " (unfixable)"
            elif outcome.rules_fixed:
                why = ", ".join(outcome.rules_fixed) + " (dropped as fix)"
            else:
                why = "—"
            console.print(f" [dim]{path}:[/dim]{outcome.line_no} [dim]→[/dim] {why}")
        hidden = len(dropped) - 5
        if hidden > 0:
            console.print(f" [dim]… and {hidden} more[/dim]")

    # Summary panel. A note is appended when rules were disabled, so a clean
    # run with disabled rules can never be mistaken for a clean full run.
    pieces = [
        (f"{fr.total_records} records ", "bold"),
        (f"{fr.unchanged} unchanged ", "dim"),
        (f"{fr.fixed} fixed ", "green bold" if fr.fixed else "dim"),
        (f"{fr.dropped} dropped ", "red bold" if fr.dropped else "dim"),
        (f"{fr.unparseable} unparseable", "red bold" if fr.unparseable else "dim"),
    ]
    if disabled_rules:
        pieces.append((f" ({len(disabled_rules)} rule(s) disabled)", "yellow"))

    if fr.dropped or fr.unparseable or disabled_rules:
        border_colour = "yellow"
    else:
        border_colour = "green"

    console.print()
    console.print(
        Panel.fit(Text.assemble(*pieces), title=panel_title, border_style=border_colour)
    )
305
+
306
+
307
# Invoke the Typer app when this module is executed directly as a script.
if __name__ == "__main__":
    app()
File without changes
@@ -0,0 +1,242 @@
1
+ """Fixer — orchestrates mechanical-tier fixes and re-validates.
2
+
3
+ The flow:
4
+ 1. Run a check pass to find issues (already done by Runner).
5
+ 2. For each record with fixable issues, ask the rules to fix it.
6
+ Per-record fixes (empty-content, encoding, context-window) operate
7
+ on individual records.
8
+ 3. Apply cross-record fixes (duplicates) to the surviving records.
9
+ 4. Re-run check on the fixed records to confirm cleanliness. Anything
10
+ still erroring is by definition unfixable at the mechanical tier
11
+ and is dropped from the clean output.
12
+ 5. Return a FixReport with per-record disposition and aggregate counts.
13
+
14
+ The fixer is deliberately conservative: when in doubt, drop the record
15
+ rather than emit something we're not sure about. Better to lose 5% of
16
+ records than to emit broken ones.
17
+ """
18
+ from __future__ import annotations
19
+
20
+ from dataclasses import dataclass, field
21
+ from enum import Enum
22
+ from typing import Any
23
+
24
+ from .report import Issue, Report, Severity
25
+ from .rules import Rule
26
+
27
+
28
class Disposition(str, Enum):
    """Final state of one input line after a fix attempt.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    # was already clean
    UNCHANGED = "unchanged"
    # had issues, fixed, now clean
    FIXED = "fixed"
    # had issues, couldn't fix, dropped
    DROPPED = "dropped"
    # JSON parse error — never fixable
    UNPARSEABLE = "unparseable"
33
+
34
+
35
@dataclass
class RecordOutcome:
    """What happened to a single input line during the fix pass."""

    # Line number of the record in the input file.
    line_no: int
    # Final state of the record (see Disposition).
    disposition: Disposition
    # Ids of rules whose fix was applied to this record.
    rules_fixed: list[str] = field(default_factory=list)
    # Ids of rules that reported something that could not be fixed.
    rules_unfixable: list[str] = field(default_factory=list)
42
+
43
+
44
@dataclass
class FixReport:
    """Aggregate result of a fix run: counters, per-line outcomes, survivors."""

    # Counters, one per disposition, plus the overall line count.
    total_records: int = 0
    unchanged: int = 0
    fixed: int = 0
    dropped: int = 0
    unparseable: int = 0
    # One RecordOutcome per input line.
    outcomes: list[RecordOutcome] = field(default_factory=list)
    # Aggregate count of fixes by rule id, useful for usage analytics
    # and for surfacing what got fixed in the terminal output.
    fixes_by_rule: dict[str, int] = field(default_factory=dict)
    # Records that survived the fix pass (line_no, fixed_record).
    clean_records: list[tuple[int, Any]] = field(default_factory=list)
57
+
58
+
59
class Fixer:
    """Applies fix methods on rules and re-validates the output.

    The rule objects are expected to expose ``id``, ``fixable``,
    ``fix_record``, ``fix_dataset``, ``reset``, ``check_record`` and
    ``finalize`` — those are the members this class uses.
    """

    def __init__(self, rules: list[Rule]):
        self.rules = rules
        # Maps Issue.rule_id back to the rule instance that can fix it.
        self._rules_by_id = {r.id: r for r in rules}

    def fix(
        self,
        records: list[tuple[int, Any]],
        initial_issues: list[Issue],
        unparseable_lines: set[int],
    ) -> FixReport:
        """Run the fix pipeline.

        Stages:
            1. Per-record fixes for every record that has only fixable issues.
            2. Cross-record fixes (``fix_dataset``) over the stage-1 survivors.
            3. Re-validation; any record that still has an ERROR is dropped.

        Args:
            records: parsed records as (line_no, record) — the runner's
                non-unparseable lines.
            initial_issues: issues found by the original check pass.
            unparseable_lines: lines that failed JSON parsing.

        Returns:
            A FixReport with one outcome per input line, the aggregate
            counters, per-rule fix counts, and the surviving records.
        """
        fr = FixReport()
        fr.total_records = len(records) + len(unparseable_lines)
        fr.unparseable = len(unparseable_lines)

        # Bucket issues by line_no so we can decide per record what to fix.
        # Issues without a line number cannot be tied to a record; skip them.
        issues_by_line: dict[int, list[Issue]] = {}
        for issue in initial_issues:
            if issue.line_no is None:
                continue
            issues_by_line.setdefault(issue.line_no, []).append(issue)

        # ── Stage 1: per-record fixes ───────────────────────────────────
        # Records with no issues sail through unchanged. Records with
        # only fixable issues get patched. Records with any unfixable
        # issue are dropped immediately — partial fixes aren't worth it.
        stage1: list[tuple[int, Any]] = []
        outcomes_partial: dict[int, RecordOutcome] = {}

        for line_no, record in records:
            issues = issues_by_line.get(line_no, [])
            if not issues:
                stage1.append((line_no, record))
                outcomes_partial[line_no] = RecordOutcome(
                    line_no=line_no,
                    disposition=Disposition.UNCHANGED,
                )
                continue

            # Decide if we can fix this record. Skip duplicates issues
            # here — they're handled in stage 2 (cross-record).
            non_dup_issues = [i for i in issues if i.rule_id != "duplicates"]
            unfixable_rules = [
                i.rule_id for i in non_dup_issues
                if not (self._rules_by_id.get(i.rule_id)
                        and self._rules_by_id[i.rule_id].fixable)
            ]

            if unfixable_rules:
                outcomes_partial[line_no] = RecordOutcome(
                    line_no=line_no,
                    disposition=Disposition.DROPPED,
                    rules_unfixable=unfixable_rules,
                )
                continue

            # All non-duplicate issues are fixable. Apply each rule's
            # fix in turn, threading the record through.
            current = record
            applied: list[str] = []
            failed = False
            for issue in non_dup_issues:
                rule = self._rules_by_id.get(issue.rule_id)
                if rule is None or not rule.fixable:
                    continue
                try:
                    new = rule.fix_record(current, issue)
                except Exception:  # noqa: BLE001 — defensive
                    failed = True
                    break
                if new is None:
                    # The rule declined to produce a record: treat as a failure.
                    failed = True
                    break
                if new is not current:
                    # Only a different object counts as an applied fix.
                    applied.append(rule.id)
                    current = new

            if failed:
                outcomes_partial[line_no] = RecordOutcome(
                    line_no=line_no,
                    disposition=Disposition.DROPPED,
                    rules_unfixable=[i.rule_id for i in non_dup_issues],
                )
                continue

            stage1.append((line_no, current))
            for rid in applied:
                fr.fixes_by_rule[rid] = fr.fixes_by_rule.get(rid, 0) + 1
            outcomes_partial[line_no] = RecordOutcome(
                line_no=line_no,
                disposition=Disposition.FIXED if applied else Disposition.UNCHANGED,
                rules_fixed=applied,
            )

        # ── Stage 2: cross-record fixes ────────────────────────────────
        # Each fixable rule gets a chance to drop or transform the
        # surviving stage-1 records. duplicates is the canonical example.
        # Drops are measured around each rule's own call so they are
        # credited to the rule that removed the line, not to a fixed id.
        stage2 = stage1
        for rule in self.rules:
            if not rule.fixable:
                continue
            lines_before = {ln for ln, _ in stage2}
            try:
                candidate = list(rule.fix_dataset(stage2))
            except Exception:  # noqa: BLE001 — defensive
                # A failing rule leaves the dataset exactly as it was.
                continue
            stage2 = candidate
            lines_after = {ln for ln, _ in stage2}
            for ln in sorted(lines_before - lines_after):
                # Dropped by a cross-record rule — count as a fix (the
                # dataset is now correct) but record the line as dropped
                # so the user sees what happened.
                fr.fixes_by_rule[rule.id] = fr.fixes_by_rule.get(rule.id, 0) + 1
                outcomes_partial[ln] = RecordOutcome(
                    line_no=ln,
                    disposition=Disposition.DROPPED,
                    rules_fixed=[rule.id],
                )

        # ── Stage 3: re-validate fixed records ─────────────────────────
        # Anything still erroring is unfixable at the mechanical tier.
        for rule in self.rules:
            rule.reset()

        final: list[tuple[int, Any]] = []
        rechecked_issues: dict[int, list[Issue]] = {}
        for line_no, record in stage2:
            for rule in self.rules:
                for issue in rule.check_record(record, line_no):
                    if issue.severity == Severity.ERROR:
                        rechecked_issues.setdefault(line_no, []).append(issue)
        for rule in self.rules:
            for issue in rule.finalize():
                if (issue.severity == Severity.ERROR
                        and issue.line_no is not None):
                    rechecked_issues.setdefault(issue.line_no, []).append(issue)

        for line_no, record in stage2:
            if line_no in rechecked_issues:
                # Survived stage 1 + 2 but still has errors → drop, keeping
                # the record of which fixes had already been applied.
                still_bad = [i.rule_id for i in rechecked_issues[line_no]]
                prior = outcomes_partial.get(line_no)
                outcomes_partial[line_no] = RecordOutcome(
                    line_no=line_no,
                    disposition=Disposition.DROPPED,
                    rules_fixed=prior.rules_fixed if prior else [],
                    rules_unfixable=still_bad,
                )
                continue
            final.append((line_no, record))

        # ── Tally and return ────────────────────────────────────────────
        # Add unparseable lines to outcomes for reporting completeness.
        for ln in unparseable_lines:
            outcomes_partial[ln] = RecordOutcome(
                line_no=ln,
                disposition=Disposition.UNPARSEABLE,
            )

        # Sort outcomes by line number for stable, readable reports.
        fr.outcomes = sorted(outcomes_partial.values(), key=lambda o: o.line_no)
        for o in fr.outcomes:
            if o.disposition == Disposition.UNCHANGED:
                fr.unchanged += 1
            elif o.disposition == Disposition.FIXED:
                fr.fixed += 1
            elif o.disposition == Disposition.DROPPED:
                fr.dropped += 1
            # unparseable already tallied

        fr.clean_records = final
        return fr
@@ -0,0 +1,40 @@
1
+ """I/O utilities — atomic writes for JSONL output.
2
+
3
+ Kept separate from cli.py so it has no dependency on typer/rich and can
4
+ be imported by tests, the fixer, or any future module without dragging
5
+ the CLI surface into the import graph.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+
14
def atomic_write_jsonl(path: Path, lines: list[str]) -> None:
    """Write JSONL to path atomically — write to a temp sibling and rename.

    Guarantees that path either contains the complete output or remains
    untouched. Critical for --fix since users will overwrite their input.

    Args:
        path: destination file; missing parent directories are created.
        lines: one serialized record per item. A trailing newline is added
            to any item that does not already end with one.

    Raises:
        OSError: if the temp file cannot be created, written, or renamed.
            Any exception raised while producing ``lines`` propagates too.
            In every failure case the temp file is removed first.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    # mkstemp creates its file readable/writable by the owner only. When we
    # are overwriting an existing file, remember its permission bits so the
    # rename does not silently tighten them.
    try:
        existing_mode = os.stat(path).st_mode & 0o7777
    except OSError:
        existing_mode = None  # target does not exist (or cannot be stat'ed)

    fd, tmp_path = tempfile.mkstemp(
        prefix=f".{path.name}.",
        suffix=".tmp",
        dir=str(path.parent),
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            for line in lines:
                f.write(line if line.endswith("\n") else line + "\n")
            # Push the data to disk before the rename makes it visible, so
            # a crash cannot leave a renamed-but-empty file behind.
            f.flush()
            os.fsync(f.fileno())
        if existing_mode is not None:
            try:
                os.chmod(tmp_path, existing_mode)
            except OSError:
                # Best-effort: keeping the data matters more than the mode.
                pass
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup; never leave .tmp files behind on failure.
        # BaseException so Ctrl-C mid-write is cleaned up as well.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
@@ -0,0 +1,55 @@
1
+ """Issue and Report types — the structured output shape of a validation run."""
2
+ from __future__ import annotations
3
+
4
+ from dataclasses import dataclass, field
5
+ from enum import Enum
6
+ from typing import Optional, Any
7
+
8
+
9
class Severity(str, Enum):
    """Severity level attached to an Issue.

    Members are also ``str`` instances, so they compare equal to their
    string values.
    """

    ERROR = "error"
    WARNING = "warning"
    INFO = "info"
13
+
14
+
15
@dataclass
class Issue:
    """One problem reported against the dataset."""

    # Id of the rule that raised the issue.
    rule_id: str
    # How serious the problem is.
    severity: Severity
    # Line the issue refers to; None when it is not tied to a single line.
    line_no: Optional[int]
    # Short human-readable description.
    message: str
    # Optional longer explanation.
    detail: Optional[str] = None
    # Fixability flag; defaults to False.
    fixable: bool = False
    # Free-form extra data attached by the rule.
    context: dict[str, Any] = field(default_factory=dict)
25
+
26
+
27
@dataclass
class Report:
    """Everything a validation run produced: the issues plus record counts."""

    issues: list[Issue] = field(default_factory=list)
    total_records: int = 0
    valid_records: int = 0

    @property
    def errors(self) -> list[Issue]:
        """Issues whose severity is ERROR, in reporting order."""
        found = []
        for issue in self.issues:
            if issue.severity == Severity.ERROR:
                found.append(issue)
        return found

    @property
    def warnings(self) -> list[Issue]:
        """Issues whose severity is WARNING, in reporting order."""
        found = []
        for issue in self.issues:
            if issue.severity == Severity.WARNING:
                found.append(issue)
        return found

    @property
    def has_errors(self) -> bool:
        """True when at least one issue is an ERROR."""
        for issue in self.issues:
            if issue.severity == Severity.ERROR:
                return True
        return False

    @property
    def has_warnings(self) -> bool:
        """True when at least one issue is a WARNING."""
        for issue in self.issues:
            if issue.severity == Severity.WARNING:
                return True
        return False

    @property
    def is_clean(self) -> bool:
        """True only when no issue of any severity was recorded."""
        return False if self.issues else True

    def add(self, issue: Issue) -> None:
        """Append ``issue`` to the report."""
        self.issues.append(issue)