PyPI - parallelogram - Versions diffs - 0.2.0__py3-none-any.whl - Mend

parallelogram 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

parallelogram/__init__.py +2 -0
parallelogram/cli.py +308 -0
parallelogram/core/__init__.py +0 -0
parallelogram/core/fixer.py +242 -0
parallelogram/core/io.py +40 -0
parallelogram/core/report.py +55 -0
parallelogram/core/rules.py +101 -0
parallelogram/core/runner.py +90 -0
parallelogram/formats/__init__.py +0 -0
parallelogram/formats/openai_chat.py +41 -0
parallelogram/output/__init__.py +0 -0
parallelogram/output/json_output.py +41 -0
parallelogram/output/terminal.py +89 -0
parallelogram/rules/__init__.py +0 -0
parallelogram/rules/context_window.py +163 -0
parallelogram/rules/duplicates.py +95 -0
parallelogram/rules/empty_content.py +73 -0
parallelogram/rules/encoding.py +114 -0
parallelogram/rules/roles.py +95 -0
parallelogram/rules/schema.py +106 -0
parallelogram-0.2.0.dist-info/METADATA +183 -0
parallelogram-0.2.0.dist-info/RECORD +25 -0
parallelogram-0.2.0.dist-info/WHEEL +4 -0
parallelogram-0.2.0.dist-info/entry_points.txt +2 -0
parallelogram-0.2.0.dist-info/licenses/LICENSE +189 -0

parallelogram/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ """parallelogram — strict validator for fine-tuning datasets."""
2	+ __version__ = "0.2.0"

parallelogram/cli.py ADDED Viewed

@@ -0,0 +1,308 @@
+"""parallelogram CLI.
+Single command: `parallelogram check <path>`. Designed to be the
+fastest possible local pre-flight before kicking off a training run.
+Exit codes are deliberately minimal:
+  0 — clean (no issues)
+  1 — warnings only
+  2 — errors
+These map cleanly to CI gates without any extra wiring.
+`--fix` (Phase 2 mechanical tier) attempts to mechanically repair fixable
+issues — strip BOM, replace mojibake, drop empty turns, truncate
+context-window overflow, deduplicate. SLM-tier fixes (rewrite broken role
+sequences, fill in incomplete assistant turns) are not yet implemented and
+will land in a future release as a paid hosted tier.
+"""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+import typer
+from rich.console import Console
+from .core.fixer import Fixer, Disposition
+from .core.io import atomic_write_jsonl
+from .core.runner import Runner
+from .core.rules import registry
+# Importing the rule modules triggers self-registration via @registry.register.
+from .rules import (  # noqa: F401
+    schema,
+    roles,
+    empty_content,
+    context_window,
+    duplicates,
+    encoding,
+)
+from .output.terminal import render_report
+from .output.json_output import render_json
+# Root typer application object; commands and the callback below attach to it.
+# Per the keyword names: invoking with no arguments shows help, and the
+# shell-completion options are not added to the CLI.
+app = typer.Typer(
+    name="parallelogram",
+    help="Strict validator for fine-tuning datasets. Run before you train.",
+    add_completion=False,
+    no_args_is_help=True,
+)
+# No-op application callback: the body is only a docstring, which typer
+# surfaces as help text (so it is runtime text — edit with care).
+# NOTE(review): presumably registered so typer keeps `check` as an explicit
+# subcommand (`parallelogram check <path>`) even though it is the only
+# command — confirm against typer's single-command behaviour.
+@app.callback()
+def main() -> None:
+    """Strict validator for fine-tuning datasets. Run before you train."""
+# Flow of the command: validate flag combinations -> validate --disable ->
+# instantiate the enabled rules -> either fix mode or check-only mode.
+# Every path ends in `typer.Exit(code)`; the function never returns normally.
+@app.command()
+def check(
+    path: Path = typer.Argument(..., exists=True, readable=True, help="Path to JSONL dataset."),
+    format: str = typer.Option(
+        "openai-chat",
+        "--format", "-f",
+        help="Dataset format. Only 'openai-chat' is supported in v0.1.",
+    ),
+    tokenizer: Optional[str] = typer.Option(
+        None,
+        "--tokenizer", "-t",
+        help="HuggingFace tokenizer name (e.g. meta-llama/Llama-3-8B). Required for context-window check.",
+    ),
+    max_seq_len: int = typer.Option(
+        4096,
+        "--max-seq-len",
+        help="Max tokens per record. Records over this are flagged because frameworks truncate them silently.",
+    ),
+    output: Optional[Path] = typer.Option(
+        None,
+        "--output", "-o",
+        help="Write only error-free records to this file. With --fix, writes the repaired dataset.",
+    ),
+    fix: bool = typer.Option(
+        False,
+        "--fix",
+        help="Attempt mechanical fixes (dedupe, truncate, BOM strip, etc.). "
+             "SLM-tier fixes are not yet available.",
+    ),
+    dry_run: bool = typer.Option(
+        False,
+        "--dry-run",
+        help="With --fix, report what would change without writing any files.",
+    ),
+    json_output: bool = typer.Option(
+        False,
+        "--json",
+        help="Emit a JSON report to stdout (suppresses pretty output).",
+    ),
+    disable: list[str] = typer.Option(
+        [],
+        "--disable",
+        help="Disable a rule by id. Repeat to disable multiple. "
+             "Disabling rules invalidates the exit-0 guarantee — see warnings.",
+    ),
+    no_color: bool = typer.Option(False, "--no-color", help="Disable colored output."),
+) -> None:
+    """Check a fine-tuning dataset for problems that would silently corrupt training."""
+    # The docstring above is kept to one line on purpose: typer shows it as
+    # the command's help text, so implementation notes live in comments.
+    #
+    # Usage errors (bad --format, --dry-run without --fix, bad --disable)
+    # all go to stderr and exit with code 2, the same code as "errors found".
+    # NOTE(review): the messages and the --format help still say "v0.1"
+    # although the package declares __version__ = "0.2.0".
+    if format != "openai-chat":
+        typer.echo(
+            f"format {format!r} is not supported in v0.1 (only 'openai-chat').",
+            err=True,
+        )
+        raise typer.Exit(2)
+    if dry_run and not fix:
+        typer.echo("--dry-run is only meaningful with --fix.", err=True)
+        raise typer.Exit(2)
+    # ── --disable handling — three layers of safety ───────────────────────
+    #
+    # 1. Reject unknown rule ids outright. Typos here are a foot-gun: the
+    #    user thinks they disabled something but the run uses the full set.
+    # 2. Refuse to disable schema. Every other rule assumes structurally
+    #    valid records; disabling schema means the others silently no-op
+    #    on malformed input, producing deceptively clean output.
+    # 3. Loud stderr warning naming exactly what got disabled, plus an
+    #    explicit note that the exit-0 guarantee no longer applies.
+    known_rule_ids = {rc.id for rc in registry.all()}
+    disabled = set(disable)
+    unknown = disabled - known_rule_ids
+    if unknown:
+        typer.echo(
+            f"unknown rule id(s) in --disable: {sorted(unknown)}. "
+            f"Valid rule ids: {sorted(known_rule_ids)}",
+            err=True,
+        )
+        raise typer.Exit(2)
+    if "schema" in disabled:
+        typer.echo(
+            "the 'schema' rule cannot be disabled — every other rule "
+            "depends on its guarantees about record structure.",
+            err=True,
+        )
+        raise typer.Exit(2)
+    if disabled:
+        sorted_disabled = sorted(disabled)
+        typer.echo(
+            "  ! WARNING: --disable in effect for: "
+            + ", ".join(sorted_disabled),
+            err=True,
+        )
+        typer.echo(
+            "    The 'if it exits 0, your run won't fail' guarantee "
+            "applies only with all rules enabled.",
+            err=True,
+        )
+    # Instantiate every registered rule that was not disabled. Only the
+    # context-window rule receives a config dict (tokenizer + token limit);
+    # all other rule classes are constructed with no arguments.
+    rule_classes = [rc for rc in registry.all() if rc.id not in disabled]
+    rules = []
+    for rc in rule_classes:
+        if rc.id == "context-window":
+            rules.append(rc({"tokenizer": tokenizer, "max_seq_len": max_seq_len}))
+        else:
+            rules.append(rc())
+    runner = Runner(rules)
+    console = Console(no_color=no_color)
+    if fix:
+        # ── Fix mode ────────────────────────────────────────────────────
+        # The third value returned by run_with_records is not used here.
+        # The same rule instances are handed to the Fixer, which re-runs
+        # them to re-validate the repaired records.
+        report, parsed, _, unparseable = runner.run_with_records(str(path))
+        fixer = Fixer(rules)
+        fix_report = fixer.fix(parsed, report.issues, unparseable)
+        # Format and emit results.
+        if json_output:
+            # Machine-readable report on stdout: aggregate counts, per-rule
+            # fix counts and one entry per input line with its disposition.
+            payload = {
+                "file": str(path),
+                "mode": "fix",
+                "dry_run": dry_run,
+                "disabled_rules": sorted(disabled),
+                "summary": {
+                    "total": fix_report.total_records,
+                    "unchanged": fix_report.unchanged,
+                    "fixed": fix_report.fixed,
+                    "dropped": fix_report.dropped,
+                    "unparseable": fix_report.unparseable,
+                    "emitted": len(fix_report.clean_records),
+                },
+                "fixes_by_rule": fix_report.fixes_by_rule,
+                "outcomes": [
+                    {
+                        "line": o.line_no,
+                        "disposition": o.disposition.value,
+                        "rules_fixed": o.rules_fixed,
+                        "rules_unfixable": o.rules_unfixable,
+                    }
+                    for o in fix_report.outcomes
+                ],
+            }
+            json.dump(payload, sys.stdout, indent=2)
+            sys.stdout.write("\n")
+        else:
+            _render_fix_report(console, fix_report, str(path), dry_run,
+                               has_output=output is not None,
+                               disabled_rules=sorted(disabled))
+        # Files are written only when --output is given and this is not a
+        # dry run. Records are re-serialised from the parsed objects
+        # (ensure_ascii=False keeps non-ASCII text readable), so the output
+        # is not guaranteed to be byte-identical to the input lines.
+        if not dry_run and output:
+            lines = [json.dumps(rec, ensure_ascii=False)
+                     for _, rec in fix_report.clean_records]
+            atomic_write_jsonl(output, lines)
+            if not json_output:
+                console.print(f"  [green]→[/green] Wrote {len(lines)} records to {output}")
+        elif not dry_run and not output:
+            # Without --output, --fix does nothing destructive but we should
+            # tell the user that no file was written.
+            if not json_output:
+                console.print(
+                    "  [yellow]![/yellow] --fix was requested without --output. "
+                    "Re-run with --output PATH to write the repaired dataset."
+                )
+        # Exit code semantics for fix mode:
+        #   0 if everything is now clean (no dropped records)
+        #   1 if some records were dropped (partial fix)
+        #   2 if nothing was fixable (no records emitted)
+        # Unparseable lines count like dropped ones for the exit code, and
+        # an input with zero surviving records (including an empty file)
+        # takes the exit-2 branch.
+        if not fix_report.clean_records:
+            raise typer.Exit(2)
+        if fix_report.dropped or fix_report.unparseable:
+            raise typer.Exit(1)
+        raise typer.Exit(0)
+    # ── Check-only mode (existing behavior) ────────────────────────────
+    # `clean` holds (line_no, raw_line) pairs; only the raw text is written.
+    report, clean = runner.run(str(path))
+    if json_output:
+        render_json(report, str(path), disabled_rules=sorted(disabled))
+    else:
+        render_report(report, console, str(path), disabled_rules=sorted(disabled))
+    if output:
+        # Raw lines are written back as read (no re-serialisation). The
+        # add-newline-then-rstrip round trip nets out to stripping trailing
+        # "\n" characters; atomic_write_jsonl appends exactly one per line.
+        lines = [raw if raw.endswith("\n") else raw + "\n" for _, raw in clean]
+        atomic_write_jsonl(output, [l.rstrip("\n") for l in lines])
+        if not json_output:
+            console.print(
+                f"  [green]→[/green] Wrote {len(clean)} clean records to {output}"
+            )
+    # Check-mode exit codes: 2 = at least one error, 1 = warnings only,
+    # 0 = neither (info-level issues do not affect the exit code).
+    if report.has_errors:
+        raise typer.Exit(2)
+    if report.has_warnings:
+        raise typer.Exit(1)
+    raise typer.Exit(0)
+def _render_fix_report(console, fr, path: str, dry_run: bool,
+                       has_output: bool,
+                       disabled_rules: list[str] | None = None) -> None:
+    """Pretty-print the fix report for a human."""
+    from rich.panel import Panel
+    from rich.text import Text
+    title = "parallelogram --fix" + (" (dry run)" if dry_run else "")
+    # Per-rule fix counts
+    if fr.fixes_by_rule:
+        console.print()
+        for rid, count in sorted(fr.fixes_by_rule.items()):
+            console.print(f"  [green]✓[/green] [cyan]{rid}[/cyan] [dim]·[/dim] {count} fix{'es' if count != 1 else ''}")
+    # Show first few dropped records so the user knows what was lost
+    dropped_outcomes = [o for o in fr.outcomes if o.disposition == Disposition.DROPPED]
+    if dropped_outcomes:
+        console.print()
+        console.print("  [red]✗[/red] [bold]dropped:[/bold]")
+        for o in dropped_outcomes[:5]:
+            if o.rules_unfixable:
+                reason = ", ".join(o.rules_unfixable) + " (unfixable)"
+            elif o.rules_fixed:
+                reason = ", ".join(o.rules_fixed) + " (dropped as fix)"
+            else:
+                reason = "—"
+            console.print(f"      [dim]{path}:[/dim]{o.line_no} [dim]→[/dim] {reason}")
+        if len(dropped_outcomes) > 5:
+            console.print(f"      [dim]… and {len(dropped_outcomes) - 5} more[/dim]")
+    # Summary — appended note when rules were disabled, so a clean run
+    # with disabled rules can never be mistaken for a clean full run.
+    summary_parts = [
+        (f"{fr.total_records} records  ", "bold"),
+        (f"{fr.unchanged} unchanged  ", "dim"),
+        (f"{fr.fixed} fixed  ", "green bold" if fr.fixed else "dim"),
+        (f"{fr.dropped} dropped  ", "red bold" if fr.dropped else "dim"),
+        (f"{fr.unparseable} unparseable", "red bold" if fr.unparseable else "dim"),
+    ]
+    if disabled_rules:
+        summary_parts.append((f"  ({len(disabled_rules)} rule(s) disabled)", "yellow"))
+    summary = Text.assemble(*summary_parts)
+    border = "green" if not (fr.dropped or fr.unparseable or disabled_rules) else "yellow"
+    console.print()
+    console.print(Panel.fit(summary, title=title, border_style=border))
+# Run the typer application when this module is executed as a script.
+if __name__ == "__main__":
+    app()

parallelogram/core/__init__.py ADDED Viewed

File without changes

parallelogram/core/fixer.py ADDED Viewed

@@ -0,0 +1,242 @@
+"""Fixer — orchestrates mechanical-tier fixes and re-validates.
+The flow:
+  1. Run a check pass to find issues (already done by Runner).
+  2. For each record with fixable issues, ask the rules to fix it.
+     Per-record fixes (empty-content, encoding, context-window) operate
+     on individual records.
+  3. Apply cross-record fixes (duplicates) to the surviving records.
+  4. Re-run check on the fixed records to confirm cleanliness. Anything
+     still erroring is by definition unfixable at the mechanical tier
+     and is dropped from the clean output.
+  5. Return a FixReport with per-record disposition and aggregate counts.
+The fixer is deliberately conservative: when in doubt, drop the record
+rather than emit something we're not sure about. Better to lose 5% of
+records than to emit broken ones.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+from .report import Issue, Report, Severity
+from .rules import Rule
+class Disposition(str, Enum):
+    """Final state of one input line after the fix pipeline.
+    Mixes in ``str``; the CLI emits ``.value`` in its JSON report.
+    """
+    UNCHANGED = "unchanged"   # kept as-is: no issues, or no fix altered it
+    FIXED = "fixed"           # had issues, fixed, now clean
+    DROPPED = "dropped"       # removed from output: unfixable, failed re-validation, or dropped by a cross-record fix
+    UNPARSEABLE = "unparseable"  # JSON parse error — never fixable
+@dataclass
+class RecordOutcome:
+    """Per-record disposition after fix attempt."""
+    # Line number of the record in the input file (as reported by the runner).
+    line_no: int
+    # What finally happened to the record.
+    disposition: Disposition
+    # Ids of rules whose fix was applied to (or, for cross-record fixes,
+    # caused the removal of) this record.
+    rules_fixed: list[str] = field(default_factory=list)
+    # Ids of rules that still flag the record, or that had no usable fix.
+    rules_unfixable: list[str] = field(default_factory=list)
+@dataclass
+class FixReport:
+    """Aggregate result of one fix run: counters, per-line outcomes, survivors."""
+    # Parsed records plus unparseable lines.
+    total_records: int = 0
+    # Number of outcomes per disposition (see Disposition).
+    unchanged: int = 0
+    fixed: int = 0
+    dropped: int = 0
+    unparseable: int = 0
+    # One entry per input line, sorted by line number.
+    outcomes: list[RecordOutcome] = field(default_factory=list)
+    # Aggregate count of fixes by rule id, useful for usage analytics
+    # and for surfacing what got fixed in the terminal output.
+    fixes_by_rule: dict[str, int] = field(default_factory=dict)
+    # Records that survived the fix pass (line_no, fixed_record).
+    clean_records: list[tuple[int, Any]] = field(default_factory=list)
+class Fixer:
+    """Applies fix methods on rules and re-validates the output."""
+    def __init__(self, rules: list[Rule]):
+        self.rules = rules
+        self._rules_by_id = {r.id: r for r in rules}
+    def fix(
+        self,
+        records: list[tuple[int, Any]],
+        initial_issues: list[Issue],
+        unparseable_lines: set[int],
+    ) -> FixReport:
+        """Run the fix pipeline.
+        Args:
+            records: parsed records as (line_no, record) — the runner's
+                non-unparseable lines.
+            initial_issues: issues found by the original check pass.
+            unparseable_lines: lines that failed JSON parsing.
+        """
+        fr = FixReport()
+        fr.total_records = len(records) + len(unparseable_lines)
+        fr.unparseable = len(unparseable_lines)
+        # Bucket issues by line_no so we can decide per record what to fix.
+        issues_by_line: dict[int, list[Issue]] = {}
+        for issue in initial_issues:
+            if issue.line_no is None:
+                continue
+            issues_by_line.setdefault(issue.line_no, []).append(issue)
+        # ── Stage 1: per-record fixes ───────────────────────────────────
+        # Records with no issues sail through unchanged. Records with
+        # only fixable issues get patched. Records with any unfixable
+        # issue are dropped immediately — partial fixes aren't worth it.
+        stage1: list[tuple[int, Any]] = []
+        outcomes_partial: dict[int, RecordOutcome] = {}
+        for line_no, record in records:
+            issues = issues_by_line.get(line_no, [])
+            if not issues:
+                stage1.append((line_no, record))
+                outcomes_partial[line_no] = RecordOutcome(
+                    line_no=line_no,
+                    disposition=Disposition.UNCHANGED,
+                )
+                continue
+            # Decide if we can fix this record. Skip duplicates issues
+            # here — they're handled in stage 2 (cross-record).
+            non_dup_issues = [i for i in issues if i.rule_id != "duplicates"]
+            unfixable_rules = [
+                i.rule_id for i in non_dup_issues
+                if not (self._rules_by_id.get(i.rule_id)
+                        and self._rules_by_id[i.rule_id].fixable)
+            ]
+            if unfixable_rules:
+                outcomes_partial[line_no] = RecordOutcome(
+                    line_no=line_no,
+                    disposition=Disposition.DROPPED,
+                    rules_unfixable=unfixable_rules,
+                )
+                continue
+            # All non-duplicate issues are fixable. Apply each rule's
+            # fix in turn, threading the record through.
+            current = record
+            applied: list[str] = []
+            failed = False
+            for issue in non_dup_issues:
+                rule = self._rules_by_id.get(issue.rule_id)
+                if rule is None or not rule.fixable:
+                    continue
+                try:
+                    new = rule.fix_record(current, issue)
+                except Exception:  # noqa: BLE001 — defensive
+                    failed = True
+                    break
+                if new is None:
+                    failed = True
+                    break
+                if new is not current:
+                    applied.append(rule.id)
+                current = new
+            if failed:
+                outcomes_partial[line_no] = RecordOutcome(
+                    line_no=line_no,
+                    disposition=Disposition.DROPPED,
+                    rules_unfixable=[i.rule_id for i in non_dup_issues],
+                )
+                continue
+            stage1.append((line_no, current))
+            for rid in applied:
+                fr.fixes_by_rule[rid] = fr.fixes_by_rule.get(rid, 0) + 1
+            outcomes_partial[line_no] = RecordOutcome(
+                line_no=line_no,
+                disposition=Disposition.FIXED if applied else Disposition.UNCHANGED,
+                rules_fixed=applied,
+            )
+        # ── Stage 2: cross-record fixes ────────────────────────────────
+        # Each rule with a fix_dataset method gets a chance to drop or
+        # transform the surviving stage-1 records. duplicates is the
+        # canonical example.
+        stage2 = stage1
+        kept_lines_before = {ln for ln, _ in stage2}
+        for rule in self.rules:
+            if not rule.fixable:
+                continue
+            try:
+                stage2 = list(rule.fix_dataset(stage2))
+            except Exception:  # noqa: BLE001 — defensive
+                continue
+        kept_lines_after = {ln for ln, _ in stage2}
+        dropped_by_dataset = kept_lines_before - kept_lines_after
+        for ln in dropped_by_dataset:
+            # These were dropped by a cross-record rule — count as fixed
+            # (the dataset is now correct) but record the line as dropped
+            # so the user sees what happened.
+            fr.fixes_by_rule["duplicates"] = fr.fixes_by_rule.get("duplicates", 0) + 1
+            outcomes_partial[ln] = RecordOutcome(
+                line_no=ln,
+                disposition=Disposition.DROPPED,
+                rules_fixed=["duplicates"],
+            )
+        # ── Stage 3: re-validate fixed records ─────────────────────────
+        # Anything still erroring is unfixable at the mechanical tier.
+        for rule in self.rules:
+            rule.reset()
+        final: list[tuple[int, Any]] = []
+        rechecked_issues: dict[int, list[Issue]] = {}
+        for line_no, record in stage2:
+            for rule in self.rules:
+                for issue in rule.check_record(record, line_no):
+                    if issue.severity == Severity.ERROR:
+                        rechecked_issues.setdefault(line_no, []).append(issue)
+        for rule in self.rules:
+            for issue in rule.finalize():
+                if (issue.severity == Severity.ERROR
+                        and issue.line_no is not None):
+                    rechecked_issues.setdefault(issue.line_no, []).append(issue)
+        for line_no, record in stage2:
+            if line_no in rechecked_issues:
+                # Survived stage 1 + 2 but still has errors → drop.
+                still_bad = [i.rule_id for i in rechecked_issues[line_no]]
+                prior = outcomes_partial.get(line_no)
+                outcomes_partial[line_no] = RecordOutcome(
+                    line_no=line_no,
+                    disposition=Disposition.DROPPED,
+                    rules_fixed=prior.rules_fixed if prior else [],
+                    rules_unfixable=still_bad,
+                )
+                continue
+            final.append((line_no, record))
+        # ── Tally and return ────────────────────────────────────────────
+        # Add unparseable lines to outcomes for reporting completeness.
+        for ln in unparseable_lines:
+            outcomes_partial[ln] = RecordOutcome(
+                line_no=ln,
+                disposition=Disposition.UNPARSEABLE,
+            )
+        # Sort outcomes by line number for stable, readable reports.
+        fr.outcomes = sorted(outcomes_partial.values(), key=lambda o: o.line_no)
+        for o in fr.outcomes:
+            if o.disposition == Disposition.UNCHANGED:
+                fr.unchanged += 1
+            elif o.disposition == Disposition.FIXED:
+                fr.fixed += 1
+            elif o.disposition == Disposition.DROPPED:
+                fr.dropped += 1
+            # unparseable already tallied
+        fr.clean_records = final
+        return fr

parallelogram/core/io.py ADDED Viewed

@@ -0,0 +1,40 @@
+"""I/O utilities — atomic writes for JSONL output.
+Kept separate from cli.py so it has no dependency on typer/rich and can
+be imported by tests, the fixer, or any future module without dragging
+the CLI surface into the import graph.
+"""
+from __future__ import annotations
+import os
+import tempfile
+from pathlib import Path
+def atomic_write_jsonl(path: Path, lines: list[str]) -> None:
+    """Write JSONL to path atomically — write to a temp sibling and rename.
+    Guarantees that path either contains the complete output or remains
+    untouched. Critical for --fix since users will overwrite their input.
+    """
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp_path = tempfile.mkstemp(
+        prefix=f".{path.name}.",
+        suffix=".tmp",
+        dir=str(path.parent),
+    )
+    try:
+        with os.fdopen(fd, "w", encoding="utf-8") as f:
+            for line in lines:
+                if not line.endswith("\n"):
+                    line += "\n"
+                f.write(line)
+        os.replace(tmp_path, path)
+    except Exception:
+        # Best-effort cleanup; never leave .tmp files behind on failure
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+        raise

parallelogram/core/report.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""Issue and Report types — the structured output shape of a validation run."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional, Any
+class Severity(str, Enum):
+    """How serious an Issue is; mixes in ``str`` so members compare equal to their text."""
+    ERROR = "error"
+    WARNING = "warning"
+    INFO = "info"
+@dataclass
+class Issue:
+    """A single problem found in the dataset."""
+    # Id of the rule that raised the issue.
+    rule_id: str
+    severity: Severity
+    # Line the issue refers to; None when it is not tied to a single line.
+    line_no: Optional[int]
+    # Short human-readable description.
+    message: str
+    # Optional longer explanation — presumably shown by the renderers; confirm.
+    detail: Optional[str] = None
+    # Whether the issue is flagged as mechanically fixable.
+    fixable: bool = False
+    # Free-form extra data attached by the rule (schema is rule-specific).
+    context: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class Report:
+    """Aggregated result of a validation run."""
+    issues: list[Issue] = field(default_factory=list)
+    total_records: int = 0
+    valid_records: int = 0
+    @property
+    def errors(self) -> list[Issue]:
+        return [i for i in self.issues if i.severity == Severity.ERROR]
+    @property
+    def warnings(self) -> list[Issue]:
+        return [i for i in self.issues if i.severity == Severity.WARNING]
+    @property
+    def has_errors(self) -> bool:
+        return any(i.severity == Severity.ERROR for i in self.issues)
+    @property
+    def has_warnings(self) -> bool:
+        return any(i.severity == Severity.WARNING for i in self.issues)
+    @property
+    def is_clean(self) -> bool:
+        return not self.issues
+    def add(self, issue: Issue) -> None:
+        self.issues.append(issue)