PyPI - tablecodec - Versions diffs - 0.0.18__py3-none-any.whl - Mend

tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

tablecodec/__init__.py +29 -0
tablecodec/_invariants.py +311 -0
tablecodec/cli.py +314 -0
tablecodec/codecs/__init__.py +111 -0
tablecodec/codecs/_base.py +79 -0
tablecodec/codecs/_htmltable.py +510 -0
tablecodec/codecs/_otslgrid.py +318 -0
tablecodec/codecs/builtins.py +36 -0
tablecodec/codecs/doctags.py +278 -0
tablecodec/codecs/fintabnet.py +84 -0
tablecodec/codecs/fintabnet_otsl.py +141 -0
tablecodec/codecs/otsl.py +138 -0
tablecodec/codecs/pubtables1m.py +161 -0
tablecodec/codecs/pubtabnet.py +128 -0
tablecodec/codecs/tablebank.py +76 -0
tablecodec/codecs/tableformer.py +80 -0
tablecodec/io.py +91 -0
tablecodec/ir.py +101 -0
tablecodec/loss.py +105 -0
tablecodec/py.typed +0 -0
tablecodec/teds.py +243 -0
tablecodec/validate.py +185 -0
tablecodec-0.0.18.dist-info/METADATA +200 -0
tablecodec-0.0.18.dist-info/RECORD +27 -0
tablecodec-0.0.18.dist-info/WHEEL +4 -0
tablecodec-0.0.18.dist-info/entry_points.txt +2 -0
tablecodec-0.0.18.dist-info/licenses/LICENSE +21 -0

tablecodec/__init__.py ADDED Viewed

@@ -0,0 +1,29 @@
+"""tablecodec — neutral IR + codec registry for image-based table datasets.
+Public API (M1):
+- :class:`tablecodec.ir.BBox`, :class:`GridCell`, :class:`TableSample`
+- :func:`validate`, ``Profile`` and :data:`profiles`
+- :class:`ValidationError`
+- ``LossReport`` and :func:`analyze_loss` (re-exported from ``tablecodec.loss``)
+"""
+from __future__ import annotations
+from tablecodec.ir import BBox, GridCell, TableSample
+from tablecodec.loss import LossReport, analyze_loss
+from tablecodec.validate import Profile, ValidationError, profiles, validate
+# Names re-exported at package level; keep in sync with the imports above.
+__all__ = [
+    "BBox",
+    "GridCell",
+    "LossReport",
+    "Profile",
+    "TableSample",
+    "ValidationError",
+    "__version__",
+    "analyze_loss",
+    "profiles",
+    "validate",
+]
+__version__: str = "0.0.18"

tablecodec/_invariants.py ADDED Viewed

@@ -0,0 +1,311 @@
+"""SPEC §5.2 invariants I-01..I-07 as independent check functions.
+Each ``check_iXX`` returns a list of :class:`ValidationError` describing every
+violation it found (empty list = pass). Functions never raise on data; they
+raise only on programmer error (e.g. wrong type passed in).
+Functions are pure and read-only. They never mutate the input.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from tablecodec.ir import TableSample
+# Public surface of this module: the violation record plus one checker per
+# invariant (I-01 .. I-07).
+__all__ = [
+    "ValidationError",
+    "check_i01_nrows_ncols_positive",
+    "check_i02_cell_in_bounds",
+    "check_i03_span_in_bounds",
+    "check_i04_grid_exact_cover",
+    "check_i05_bbox_well_formed",
+    "check_i06_header_contiguous_top",
+    "check_i07_tokens_is_tuple",
+]
+# Cap how many gap coordinates I-04 enumerates in its error message,
+# so a totally empty grid does not produce a multi-megabyte string.
+# Coordinates beyond the cap are only counted ("(+N more)").
+_GAP_PREVIEW_LIMIT = 5
+@dataclass(frozen=True, slots=True)
+class ValidationError:
+    """A single invariant violation.
+    Despite the name this is a plain frozen dataclass, not an ``Exception``
+    subclass: the ``check_iXX`` functions collect instances in a list and
+    return them instead of raising.
+    Attributes:
+        invariant: The SPEC §5.2 id (``"I-01"`` ... ``"I-07"``).
+        message: Human-readable description of the violation.
+        cell_index: Index into ``TableSample.cells`` if applicable, else
+            ``None`` (e.g. grid-level invariants like I-01, I-04).
+    """
+    invariant: str
+    message: str
+    # Defaults to None so grid-level checks can omit it.
+    cell_index: int | None = None
+# ---------- I-01: nrows >= 1 and ncols >= 1 ----------
+def check_i01_nrows_ncols_positive(sample: TableSample) -> list[ValidationError]:
+    """Verify I-01: the grid has at least one row and at least one column.
+    Args:
+        sample: Table whose ``nrows`` / ``ncols`` are inspected.
+    Returns:
+        One I-01 error per dimension below 1 (``nrows`` first, then
+        ``ncols``); an empty list when both are >= 1.
+    """
+    dimensions = (("nrows", sample.nrows), ("ncols", sample.ncols))
+    return [
+        ValidationError(invariant="I-01", message=f"{label} must be >= 1, got {size}")
+        for label, size in dimensions
+        if size < 1
+    ]
+# ---------- I-02: 0 <= row < nrows, 0 <= col < ncols ----------
+def check_i02_cell_in_bounds(sample: TableSample) -> list[ValidationError]:
+    """Verify I-02: every cell anchor lies inside the grid.
+    For each cell the row is tested against ``[0, nrows)`` and then the
+    column against ``[0, ncols)``; each failing axis yields its own I-02
+    error carrying the cell's index in ``sample.cells``.
+    Args:
+        sample: Table whose cells are inspected.
+    Returns:
+        The violations in cell order (row before col); empty when all pass.
+    """
+    found: list[ValidationError] = []
+    for position, cell in enumerate(sample.cells):
+        axes = (
+            ("row", cell.row, sample.nrows),
+            ("col", cell.col, sample.ncols),
+        )
+        for axis, value, limit in axes:
+            if value < 0 or value >= limit:
+                found.append(
+                    ValidationError(
+                        invariant="I-02",
+                        message=f"{axis} {value} out of [0, {limit}) at cell index {position}",
+                        cell_index=position,
+                    )
+                )
+    return found
+# ---------- I-03: row + rowspan <= nrows, col + colspan <= ncols ----------
+def check_i03_span_in_bounds(sample: TableSample) -> list[ValidationError]:
+    """Verify I-03: spans are positive and the footprint ends inside the grid.
+    Four independent tests run per cell, in this order: ``rowspan >= 1``,
+    ``colspan >= 1`` (SPEC §5.1), ``row + rowspan <= nrows`` and
+    ``col + colspan <= ncols``. Each failing test adds one I-03 error.
+    Args:
+        sample: Table whose cells are inspected.
+    Returns:
+        The violations in cell order; empty when every cell passes.
+    """
+    found: list[ValidationError] = []
+    for position, cell in enumerate(sample.cells):
+        row_end = cell.row + cell.rowspan
+        col_end = cell.col + cell.colspan
+        problems: list[str] = []
+        if cell.rowspan < 1:
+            problems.append(f"rowspan must be >= 1, got {cell.rowspan} at cell index {position}")
+        if cell.colspan < 1:
+            problems.append(f"colspan must be >= 1, got {cell.colspan} at cell index {position}")
+        if row_end > sample.nrows:
+            problems.append(
+                f"row+rowspan = {row_end} exceeds nrows {sample.nrows} at cell index {position}"
+            )
+        if col_end > sample.ncols:
+            problems.append(
+                f"col+colspan = {col_end} exceeds ncols {sample.ncols} at cell index {position}"
+            )
+        found.extend(
+            ValidationError(invariant="I-03", message=text, cell_index=position)
+            for text in problems
+        )
+    return found
+# ---------- I-04: union of cell footprints exactly covers the grid ----------
+def check_i04_grid_exact_cover(sample: TableSample) -> list[ValidationError]:
+    """Check the cell footprints exactly cover the ``nrows × ncols`` grid.
+    Implementation: 2D occupancy bitmap. For every cell, iterate over its
+    footprint and increment the count at each (row, col). Overlap = any
+    count > 1; gap = any count == 0 inside the grid.
+    Footprints are clipped to the grid, and a span below 1 is treated as 1,
+    so out-of-grid coordinates (reported by I-02/I-03) never make this check
+    raise; any under-coverage that results is still reported as a gap.
+    Reporting: one I-04 error each time a cell lands on an already occupied
+    position (tagged with the index of the later cell), followed by at most
+    one grid-level error that summarises all gaps.
+    Returns an empty list without checking when ``nrows`` or ``ncols`` is
+    below 1.
+    Complexity: O(nrows * ncols + N) where N = sum of the clipped footprint
+    areas (the grid term comes from allocating and scanning the bitmap).
+    """
+    errors: list[ValidationError] = []
+    # I-01 must hold for the grid to make sense.
+    if sample.nrows < 1 or sample.ncols < 1:
+        # I-01 already reports this; do not double-report under I-04.
+        return errors
+    # occupancy[r][c] counts how many cell footprints cover grid slot (r, c).
+    occupancy = [[0] * sample.ncols for _ in range(sample.nrows)]
+    for idx, cell in enumerate(sample.cells):
+        # Defensive clipping: stay within bounds even if I-02/I-03 violated.
+        r0 = max(0, cell.row)
+        c0 = max(0, cell.col)
+        r1 = min(sample.nrows, cell.row + max(1, cell.rowspan))
+        c1 = min(sample.ncols, cell.col + max(1, cell.colspan))
+        # An empty range (r1 <= r0 or c1 <= c0) simply contributes nothing.
+        for r in range(r0, r1):
+            row = occupancy[r]
+            for c in range(c0, c1):
+                row[c] += 1
+                if row[c] > 1:
+                    errors.append(
+                        ValidationError(
+                            invariant="I-04",
+                            message=(
+                                f"overlap at (row={r}, col={c}); cell index {idx} "
+                                f"overlaps a previously placed cell"
+                            ),
+                            cell_index=idx,
+                        )
+                    )
+    # Every slot still at zero after all cells were placed is a gap.
+    gaps: list[tuple[int, int]] = [
+        (r, c) for r, row in enumerate(occupancy) for c, count in enumerate(row) if count == 0
+    ]
+    if gaps:
+        # Report a single I-04 error with the first few coordinates to
+        # keep error volume bounded for pathological cases.
+        preview = gaps[:_GAP_PREVIEW_LIMIT]
+        extra = len(gaps) - _GAP_PREVIEW_LIMIT
+        suffix = f" (+{extra} more)" if extra > 0 else ""
+        errors.append(
+            ValidationError(
+                invariant="I-04",
+                message=f"gap(s) in grid coverage at {preview}{suffix}",
+            )
+        )
+    return errors
+# ---------- I-05: bbox well-formed when set ----------
+def _is_content_empty(tokens: tuple[str, ...]) -> bool:
+    """Whether a cell's tokens localize no content.
+    A cell is content-empty when its tokens, concatenated, contain no
+    non-whitespace character (`tokens == ()`, `("",)`, `(" ",)`, ...).
+    Markup-bearing tokens (e.g. `("<sup>",)`) are NOT empty: the core IR
+    does not model HTML, so it treats any non-whitespace token as content
+    (spec §5.2, ADR 0010).
+    Malformed ``tokens`` never raise here; reporting their type is I-07's
+    job. ``None`` counts as empty. A value that cannot be joined as strings
+    (not iterable, or holding non-``str`` items) counts as non-empty, so the
+    caller still checks the cell's bbox geometry.
+    Args:
+        tokens: The cell's token sequence, normally a tuple of strings.
+    Returns:
+        ``True`` when the tokens carry no non-whitespace character.
+    """
+    # Callers can bypass the static type and inject None; treat it as
+    # "no tokens" rather than letting str.join raise.
+    if tokens is None:
+        return True
+    try:
+        joined = "".join(tokens)
+    except TypeError:
+        # Not joinable as text: emptiness cannot be shown, so assume content.
+        return False
+    return joined.strip() == ""
+def check_i05_bbox_well_formed(sample: TableSample) -> list[ValidationError]:
+    """Verify I-05: a bbox that localizes content has positive extent.
+    Cells without a bbox are ignored. So are content-empty cells: I-05
+    guards a box that *localizes content*, and datasets routinely give an
+    empty cell a zero-area placeholder box (spec §5.2, ADR 0010, refining
+    ADR 0007). The bbox itself stays on the IR either way.
+    Args:
+        sample: Table whose cells are inspected.
+    Returns:
+        One I-05 error per degenerate axis (x before y) per cell; empty
+        when every checked bbox satisfies ``x0 < x1`` and ``y0 < y1``.
+    """
+    found: list[ValidationError] = []
+    for position, cell in enumerate(sample.cells):
+        box = cell.bbox
+        if box is None or _is_content_empty(cell.tokens):
+            continue
+        x_lo, y_lo, x_hi, y_hi = box
+        for axis, low, high in (("x", x_lo, x_hi), ("y", y_lo, y_hi)):
+            if low >= high:
+                found.append(
+                    ValidationError(
+                        invariant="I-05",
+                        message=(
+                            f"bbox {axis}0 >= {axis}1 ({low} >= {high}) "
+                            f"at cell index {position}"
+                        ),
+                        cell_index=position,
+                    )
+                )
+    return found
+# ---------- I-06: header cells form a contiguous top region ----------
+def check_i06_header_contiguous_top(sample: TableSample) -> list[ValidationError]:
+    """Verify I-06: header cells sit strictly above the first body row.
+    The boundary ``H`` is the smallest anchor row of any cell whose role is
+    ``"body"``. A cell whose role is ``"header"`` and whose anchor row is
+    ``>= H`` violates the invariant. Tables with no header cells, or with
+    no body cells (the header region then spans the whole grid), pass.
+    Args:
+        sample: Table whose cells are inspected.
+    Returns:
+        One I-06 error per misplaced header cell, in cell order.
+    """
+    cells = sample.cells
+    if not any(cell.role == "header" for cell in cells):
+        return []
+    body_anchor_rows = [cell.row for cell in cells if cell.role == "body"]
+    if not body_anchor_rows:
+        return []
+    boundary = min(body_anchor_rows)
+    return [
+        ValidationError(
+            invariant="I-06",
+            message=(
+                f"header cell at row {cell.row} (cell index {position}) appears "
+                f"at or below the first body row {boundary}"
+            ),
+            cell_index=position,
+        )
+        for position, cell in enumerate(cells)
+        if cell.role == "header" and cell.row >= boundary
+    ]
+# ---------- I-07: tokens is a tuple (never None) ----------
+def check_i07_tokens_is_tuple(sample: TableSample) -> list[ValidationError]:
+    """Verify I-07: every cell's ``tokens`` is a tuple (possibly empty).
+    This is a deliberate runtime check: callers can bypass the static type
+    with ``object.__setattr__`` and inject ``None`` or a list, and SPEC §5.2
+    I-07 requires that to be reported.
+    Args:
+        sample: Table whose cells are inspected.
+    Returns:
+        One I-07 error per cell whose ``tokens`` is not a tuple, naming the
+        actual type; empty when all cells comply.
+    """
+    return [
+        ValidationError(
+            invariant="I-07",
+            message=(
+                f"tokens must be a tuple (possibly empty) at cell index "
+                f"{position}, got {type(cell.tokens).__name__}"
+            ),
+            cell_index=position,
+        )
+        for position, cell in enumerate(sample.cells)
+        if not isinstance(cell.tokens, tuple)  # pyright: ignore[reportUnnecessaryIsInstance]
+    ]

tablecodec/cli.py ADDED Viewed

@@ -0,0 +1,314 @@
+"""Click-based command line interface (SPEC §12).
+Optional: requires the ``[cli]`` extra (``pip install "tablecodec[cli]"``).
+Importing this module without click installed will fail with a clear
+``ImportError`` — by design, since the rest of the package must run
+without click.
+Subcommands implemented for M6:
+- ``validate``    — run a profile against every record in a file.
+- ``convert``     — re-encode a file from one codec to another.
+- ``stats``       — print sample / cell / span counts.
+- ``diff``        — record-by-record diff of two same-codec files.
+- ``analyze-loss`` — static loss report for a codec pair.
+- ``codecs list`` — list registered codec names.
+All commands stream their input. Exit codes:
+- ``0`` success / no findings.
+- ``1`` validation failures, diff mismatches, or recoverable errors.
+- ``2`` argument / usage error (click default).
+"""
+from __future__ import annotations
+import dataclasses
+import json
+import sys
+from collections.abc import Iterable, Iterator
+from pathlib import Path
+from typing import Any
+import click
+from tablecodec import codecs
+from tablecodec import io as tio
+from tablecodec.codecs._base import Codec
+from tablecodec.codecs.builtins import BUILTIN_CODECS
+from tablecodec.ir import TableSample
+from tablecodec.loss import analyze_loss
+from tablecodec.validate import Profile, profiles, validate
+# Profile names accepted by the CLI; each is looked up as an attribute of
+# ``tablecodec.validate.profiles`` after upper-casing the user's input.
+_PROFILE_NAMES = ["LENIENT", "DEFAULT", "PUBTABNET_2_0", "TABLEFORMER", "STRICT"]
+def _ensure_builtins_registered() -> None:
+    """Register every built-in codec not yet in the registry, then load plugins.
+    Membership is decided against a snapshot of the registry taken once on
+    entry, so calling this repeatedly does not re-register a built-in.
+    """
+    already_registered = set(codecs.list_codecs())
+    missing = [codec for codec in BUILTIN_CODECS if codec.name not in already_registered]
+    for codec in missing:
+        codecs.register(codec)
+    # SPEC §6.2: also pick up third-party codecs that self-register via the
+    # `tablecodec.codecs` entry-point group (idempotent / no-op if none).
+    codecs.load_plugins()
+def _resolve_codec_name(name: str) -> Codec:
+    """Look up a registered codec by name for a CLI option.
+    Args:
+        name: Codec name as typed by the user.
+    Returns:
+        The codec returned by ``codecs.get(name)``.
+    Raises:
+        click.UsageError: If the registry raises ``KeyError`` for ``name``;
+            the original error is chained as the cause.
+    """
+    try:
+        codec = codecs.get(name)
+    except KeyError as exc:
+        raise click.UsageError(
+            f"unknown codec {name!r}. Run `tablecodec codecs list` to see registered names."
+        ) from exc
+    return codec
+def _resolve_profile(name: str) -> Profile:
+    """Map a case-insensitive profile name to its ``Profile`` object.
+    Args:
+        name: Profile name as typed by the user (any letter case).
+    Returns:
+        The attribute of ``profiles`` named by the upper-cased input.
+    Raises:
+        click.UsageError: If the upper-cased name is not in ``_PROFILE_NAMES``.
+    """
+    canonical = name.upper()
+    if canonical in _PROFILE_NAMES:
+        resolved: Profile = getattr(profiles, canonical)
+        return resolved
+    raise click.UsageError(
+        f"unknown profile {name!r}. Available: {', '.join(_PROFILE_NAMES)}."
+    )
+@click.group()
+@click.version_option(package_name="tablecodec")
+def main() -> None:
+    """tablecodec command-line interface."""
+    # Populate the codec registry in the group callback so it is ready for
+    # the subcommands (NOTE(review): relies on click running the group
+    # callback ahead of the chosen subcommand; confirm against click docs).
+    _ensure_builtins_registered()
+# ---------- validate ----------
+@main.command("validate")
+@click.argument("source", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.option(
+    "--profile",
+    "profile_name",
+    default="DEFAULT",
+    show_default=True,
+    help="Validation profile (see SPEC §8).",
+)
+@click.option(
+    "--codec",
+    "codec_name",
+    default=None,
+    help="Codec name; if omitted, auto-detect from the file.",
+)
+@click.option("--json", "as_json", is_flag=True, help="Emit machine-readable JSON.")
+def validate_cmd(source: Path, profile_name: str, codec_name: str | None, as_json: bool) -> None:
+    """Validate every record in SOURCE against the chosen profile."""
+    # Resolve both names first so a bad --profile / --codec is rejected
+    # before the source file is opened.
+    profile = _resolve_profile(profile_name)
+    codec = None if not codec_name else _resolve_codec_name(codec_name)
+    findings: list[dict[str, Any]] = []
+    records_seen = 0
+    for record_no, sample in enumerate(tio.open(source, codec=codec)):
+        records_seen = record_no + 1
+        findings.extend(
+            {
+                "record": record_no,
+                "filename": sample.filename,
+                "invariant": err.invariant,
+                "message": err.message,
+                "cell_index": err.cell_index,
+            }
+            for err in validate(sample, profile=profile)
+        )
+    if as_json:
+        report = {
+            "source": str(source),
+            "profile": profile.name,
+            "sample_count": records_seen,
+            "findings": findings,
+        }
+        click.echo(json.dumps(report, ensure_ascii=False))
+    else:
+        for item in findings:
+            click.echo(f"record={item['record']} {item['invariant']}: {item['message']}")
+        click.echo(f"checked {records_seen} record(s), {len(findings)} finding(s)")
+    # Exit code 1 signals "validation failures" to the calling shell.
+    if findings:
+        sys.exit(1)
+# ---------- convert ----------
+@main.command("convert")
+@click.argument("input_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.argument("output_path", type=click.Path(dir_okay=False, path_type=Path))
+@click.option("--from", "from_codec", required=True, help="Source codec name.")
+@click.option("--to", "to_codec", required=True, help="Target codec name.")
+@click.option(
+    "--dry-run",
+    is_flag=True,
+    help="Do not read the input; print the static analyze_loss report and exit.",
+)
+def convert_cmd(
+    input_path: Path,
+    output_path: Path,
+    from_codec: str,
+    to_codec: str,
+    dry_run: bool,
+) -> None:
+    """Re-encode INPUT_PATH from --from to --to, writing OUTPUT_PATH."""
+    # Both codec names are validated even for --dry-run, so an unknown name
+    # is a usage error in either mode.
+    src = _resolve_codec_name(from_codec)
+    tgt = _resolve_codec_name(to_codec)
+    if dry_run:
+        # Static report only: neither file is opened.
+        report = analyze_loss(source=from_codec, target=to_codec)
+        click.echo(json.dumps(dataclasses.asdict(_serialize_report(report))))
+        return
+    # The output is opened in "w" mode (truncating it) before the first
+    # record is read, so converting a file onto itself would wipe the input.
+    # samefile() also catches symlinks / hard links to the same file; it
+    # needs both paths to exist, hence the exists() guard.
+    if output_path.exists() and input_path.samefile(output_path):
+        msg = (
+            f"OUTPUT_PATH {str(output_path)!r} is the same file as INPUT_PATH; "
+            "in-place conversion would truncate the input before it is read."
+        )
+        raise click.UsageError(msg)
+    written = _stream_convert(input_path, output_path, src, tgt)
+    click.echo(f"wrote {written} record(s) to {output_path}")
+def _stream_convert(input_path: Path, output_path: Path, src: Codec, tgt: Codec) -> int:
+    """Pipe records from ``src`` reading INPUT into ``tgt`` writing OUTPUT.
+    The input is wrapped in a generator, so it is opened only when
+    ``tgt.write`` starts pulling records; the output file is opened (and
+    truncated) first.
+    Args:
+        input_path: File decoded with ``src.read``.
+        output_path: File written by ``tgt.write``; opened in ``"w"`` mode.
+        src: Codec used to read samples.
+        tgt: Codec used to write samples.
+    Returns:
+        The number of samples the writer pulled from the reader.
+    """
+    pulled = 0
+    def _counted_samples() -> Iterator[TableSample]:
+        nonlocal pulled
+        with input_path.open(encoding="utf-8") as source_handle:
+            # The counter is updated before each sample is handed over.
+            for pulled, sample in enumerate(src.read(source_handle), start=1):
+                yield sample
+    with output_path.open("w", encoding="utf-8") as sink:
+        tgt.write(_counted_samples(), sink)
+    return pulled
+@dataclasses.dataclass(frozen=True, slots=True)
+class _SerializableReport:
+    """JSON-friendly copy of a loss report, built by ``_serialize_report``.
+    The two field collections are stored as sorted lists so the result can
+    go through ``dataclasses.asdict`` and ``json.dumps`` directly.
+    """
+    source: str
+    target: str
+    source_fields_dropped_on_read: list[str]
+    ir_fields_unrepresentable_in_target: list[str]
+    round_trip_classification: str
+def _serialize_report(report: Any) -> _SerializableReport:
+    """Copy a loss report into ``_SerializableReport``.
+    Args:
+        report: Object exposing ``source``, ``target``,
+            ``source_fields_dropped_on_read``,
+            ``ir_fields_unrepresentable_in_target`` and
+            ``round_trip_classification``.
+    Returns:
+        A ``_SerializableReport`` whose two field collections are sorted lists.
+    """
+    dropped = sorted(report.source_fields_dropped_on_read)
+    unrepresentable = sorted(report.ir_fields_unrepresentable_in_target)
+    return _SerializableReport(
+        source=report.source,
+        target=report.target,
+        source_fields_dropped_on_read=dropped,
+        ir_fields_unrepresentable_in_target=unrepresentable,
+        round_trip_classification=report.round_trip_classification,
+    )
+# ---------- stats ----------
+@main.command("stats")
+@click.argument("source", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.option(
+    "--codec",
+    "codec_name",
+    default=None,
+    help="Codec name; if omitted, auto-detect.",
+)
+@click.option("--json", "as_json", is_flag=True)
+def stats_cmd(source: Path, codec_name: str | None, as_json: bool) -> None:
+    """Print sample / cell / spanned-cell counts for SOURCE."""
+    codec = None if not codec_name else _resolve_codec_name(codec_name)
+    totals = {"samples": 0, "cells": 0, "spanned_cells": 0}
+    for sample in tio.open(source, codec=codec):
+        totals["samples"] += 1
+        totals["cells"] += len(sample.cells)
+        # A cell is "spanned" unless both of its spans are exactly 1.
+        totals["spanned_cells"] += sum(
+            1 for cell in sample.cells if not (cell.rowspan == 1 and cell.colspan == 1)
+        )
+    if as_json:
+        click.echo(json.dumps({"source": str(source), **totals}))
+        return
+    click.echo(f"samples: {totals['samples']}")
+    click.echo(f"cells: {totals['cells']}")
+    click.echo(f"spanned cells: {totals['spanned_cells']}")
+# ---------- diff ----------
+@main.command("diff")
+@click.argument("a_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.argument("b_path", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.option("--codec", "codec_name", default=None)
+def diff_cmd(a_path: Path, b_path: Path, codec_name: str | None) -> None:
+    """Record-by-record diff of A_PATH and B_PATH (same codec on both sides)."""
+    codec = None if not codec_name else _resolve_codec_name(codec_name)
+    left_samples = tio.open(a_path, codec=codec)
+    right_samples = tio.open(b_path, codec=codec)
+    differences = list(_iter_diffs(left_samples, right_samples))
+    for record_no, side, detail in differences:
+        if side == "both":
+            line = f"differ @ record {record_no}: {detail}"
+        else:
+            # One-sided entries carry only the filename of the extra record.
+            label = "A" if side == "left-only" else "B"
+            line = f"only in {label} @ record {record_no}: filename={detail}"
+        click.echo(line)
+    click.echo(f"{len(differences)} difference(s)")
+    # Exit code 1 signals "diff mismatches" to the calling shell.
+    if differences:
+        sys.exit(1)
+def _iter_diffs(
+    a: Iterable[TableSample], b: Iterable[TableSample]
+) -> Iterator[tuple[int, str, str]]:
+    """Walk two sample streams in lockstep and yield their differences.
+    Args:
+        a: Left-hand samples.
+        b: Right-hand samples.
+    Yields:
+        ``(index, side, detail)`` where ``side`` is ``"both"`` (the samples
+        at ``index`` compare unequal; ``detail`` is
+        ``"<a filename> != <b filename>"``), ``"left-only"`` or
+        ``"right-only"`` (one stream ended early; ``detail`` is the
+        filename of the surviving sample). Equal pairs yield nothing.
+    """
+    left = iter(a)
+    right = iter(b)
+    position = 0
+    while True:
+        # None marks an exhausted stream.
+        lhs = next(left, None)
+        rhs = next(right, None)
+        if lhs is None:
+            if rhs is None:
+                return
+            yield position, "right-only", rhs.filename
+        elif rhs is None:
+            yield position, "left-only", lhs.filename
+        elif lhs != rhs:
+            yield position, "both", f"{lhs.filename} != {rhs.filename}"
+        position += 1
+# ---------- analyze-loss ----------
+@main.command("analyze-loss")
+@click.option("--from", "from_codec", required=True)
+@click.option("--to", "to_codec", required=True)
+def analyze_loss_cmd(from_codec: str, to_codec: str) -> None:
+    """Static loss report for the FROM -> TO codec pair."""
+    # The lookups are only for validation: an unknown name becomes a usage
+    # error, and the resolved codecs themselves are not needed.
+    for codec_name in (from_codec, to_codec):
+        _resolve_codec_name(codec_name)
+    serializable = _serialize_report(analyze_loss(source=from_codec, target=to_codec))
+    click.echo(json.dumps(dataclasses.asdict(serializable)))
+# ---------- codecs list ----------
+# `codecs` is a nested command group; its only subcommand is `list`.
+@main.group("codecs")
+def codecs_group() -> None:
+    """Inspect the in-process codec registry."""
+@codecs_group.command("list")
+def codecs_list_cmd() -> None:
+    """List registered codec names."""
+    # One name per line, in the order returned by codecs.list_codecs().
+    for name in codecs.list_codecs():
+        click.echo(name)
+if __name__ == "__main__":  # pragma: no cover
+    main()