PyPI - tablecodec - Versions diffs - 0.0.18__py3-none-any.whl - Mend

tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

tablecodec/__init__.py +29 -0
tablecodec/_invariants.py +311 -0
tablecodec/cli.py +314 -0
tablecodec/codecs/__init__.py +111 -0
tablecodec/codecs/_base.py +79 -0
tablecodec/codecs/_htmltable.py +510 -0
tablecodec/codecs/_otslgrid.py +318 -0
tablecodec/codecs/builtins.py +36 -0
tablecodec/codecs/doctags.py +278 -0
tablecodec/codecs/fintabnet.py +84 -0
tablecodec/codecs/fintabnet_otsl.py +141 -0
tablecodec/codecs/otsl.py +138 -0
tablecodec/codecs/pubtables1m.py +161 -0
tablecodec/codecs/pubtabnet.py +128 -0
tablecodec/codecs/tablebank.py +76 -0
tablecodec/codecs/tableformer.py +80 -0
tablecodec/io.py +91 -0
tablecodec/ir.py +101 -0
tablecodec/loss.py +105 -0
tablecodec/py.typed +0 -0
tablecodec/teds.py +243 -0
tablecodec/validate.py +185 -0
tablecodec-0.0.18.dist-info/METADATA +200 -0
tablecodec-0.0.18.dist-info/RECORD +27 -0
tablecodec-0.0.18.dist-info/WHEEL +4 -0
tablecodec-0.0.18.dist-info/entry_points.txt +2 -0
tablecodec-0.0.18.dist-info/licenses/LICENSE +21 -0

tablecodec/codecs/tablebank.py ADDED Viewed

@@ -0,0 +1,76 @@
+"""TableBank codec.
+TableBank ships table *structure* only — the source has no per-cell
+tokens or bbox. On read, the grid is reconstructed from the structure
+tokens and every cell is empty (``tokens=()``, ``bbox=None``). Writing
+emits structure only, so any tokens/bbox an IR carries are dropped
+(SPEC §7 marks TableBank write as partial / lossy).
+Record shape::
+    {
+        "filename": "...",
+        "split": "train" | "val" | "test",  # optional
+        "imgid": 0,  # optional
+        "html": {"structure": {"tokens": [...]}},  # no "cells"
+    }
+"""
+from __future__ import annotations
+import json
+from collections.abc import Iterable, Iterator
+from dataclasses import dataclass
+from typing import IO, Any
+from tablecodec.codecs._htmltable import (
+    parse_html_structure_only,
+    serialize_html_structure_only,
+    sniff_html_table,
+)
+from tablecodec.ir import TableSample
+__all__ = ["TableBankCodec"]
+@dataclass(frozen=True, slots=True)
+class TableBankCodec:
+    """Codec for the TableBank jsonl format (structure only, no cell content)."""
+    name: str = "tablebank"
+    spec_version: str = "1.0.0"
+    media_type: str = "application/jsonl"
+    writable: bool = True
+    def read(self, source: IO[str]) -> Iterator[TableSample]:
+        for line_no, raw in enumerate(source, start=1):
+            line = raw.strip()
+            if not line:
+                continue
+            try:
+                payload: dict[str, Any] = json.loads(line)
+            except json.JSONDecodeError as exc:
+                msg = f"invalid JSON at line {line_no}: {exc.msg}"
+                raise ValueError(msg) from exc
+            try:
+                yield parse_html_structure_only(payload)
+            except (KeyError, ValueError, TypeError) as exc:
+                msg = f"malformed TableBank record at line {line_no}: {exc}"
+                raise ValueError(msg) from exc
+    def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
+        for sample in samples:
+            sink.write(json.dumps(serialize_html_structure_only(sample), ensure_ascii=False))
+            sink.write("\n")
+    def lossy_read(self) -> frozenset[str]:
+        # The source carries no cell content; reading a richer file via
+        # this codec discards tokens and bbox.
+        return frozenset({"tokens", "bbox"})
+    def lossy_write(self) -> frozenset[str]:
+        return frozenset({"tokens", "bbox", "extras"})
+    def sniff(self, source: IO[str]) -> bool:
+        # TableBank records have html.structure but NO html.cells.
+        return sniff_html_table(source, require_no_cells=True)

tablecodec/codecs/tableformer.py ADDED Viewed

@@ -0,0 +1,80 @@
+"""TableFormer Format codec.
+TableFormer (IBM internal) uses PubTabNet 2.0's HTML-token structure
+with one extra invariant: EVERY cell — including empty ones — carries a
+bbox. This codec enforces that on read (raising if any cell lacks one)
+and its output therefore satisfies ``profiles.TABLEFORMER``.
+Record shape is the PubTabNet 2.0 shape; the difference is purely that
+``cells[i].bbox`` is always present, even when ``tokens`` is empty.
+"""
+from __future__ import annotations
+import json
+from collections.abc import Iterable, Iterator
+from dataclasses import dataclass
+from typing import IO, Any
+from tablecodec.codecs._htmltable import (
+    parse_html_table,
+    serialize_html_table,
+    sniff_html_table,
+)
+from tablecodec.ir import TableSample
+__all__ = ["TableFormerCodec"]
+@dataclass(frozen=True, slots=True)
+class TableFormerCodec:
+    """Codec for the TableFormer Format jsonl (every cell has bbox)."""
+    name: str = "tableformer"
+    spec_version: str = "1.0.0"
+    media_type: str = "application/jsonl"
+    writable: bool = True
+    def read(self, source: IO[str]) -> Iterator[TableSample]:
+        for line_no, raw in enumerate(source, start=1):
+            line = raw.strip()
+            if not line:
+                continue
+            try:
+                payload: dict[str, Any] = json.loads(line)
+            except json.JSONDecodeError as exc:
+                msg = f"invalid JSON at line {line_no}: {exc.msg}"
+                raise ValueError(msg) from exc
+            try:
+                sample = parse_html_table(payload)
+                _require_all_cells_have_bbox(sample, line_no)
+            except (KeyError, ValueError, TypeError) as exc:
+                msg = f"malformed TableFormer record at line {line_no}: {exc}"
+                raise ValueError(msg) from exc
+            yield sample
+    def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
+        for sample in samples:
+            sink.write(json.dumps(serialize_html_table(sample), ensure_ascii=False))
+            sink.write("\n")
+    def lossy_read(self) -> frozenset[str]:
+        return frozenset()
+    def lossy_write(self) -> frozenset[str]:
+        return frozenset({"extras"})
+    def sniff(self, source: IO[str]) -> bool:
+        # Discriminator: every cell carries a bbox (PubTabNet may omit it
+        # on empty cells, so a bbox-less cell rules TableFormer out).
+        return sniff_html_table(source, require_all_bbox=True)
+def _require_all_cells_have_bbox(sample: TableSample, line_no: int) -> None:
+    for idx, cell in enumerate(sample.cells):
+        if cell.bbox is None:
+            msg = (
+                f"TableFormer requires every cell to have a bbox; cell index "
+                f"{idx} has none (line {line_no})"
+            )
+            raise ValueError(msg)

tablecodec/io.py ADDED Viewed

@@ -0,0 +1,91 @@
+"""High-level streaming I/O helpers (SPEC §10).
+``open()`` and ``detect()`` accept either an already-open text stream
+or a path-like. ``open()`` always returns an iterator backed by the
+codec's streaming ``read`` (it never slurps the file into memory);
+``detect()`` returns the name of the matching codec, or ``None``.
+"""
+from __future__ import annotations
+from collections.abc import Generator, Iterator
+from contextlib import contextmanager
+from os import PathLike
+from pathlib import Path
+from typing import IO, Union
+from tablecodec import codecs
+from tablecodec.codecs._base import Codec
+from tablecodec.ir import TableSample
+__all__ = ["detect", "open"]
+# Accepted "source" argument: a filesystem path (``str`` or ``os.PathLike``)
+# or an already-open text stream.
+PathOrStream = Union[str, "PathLike[str]", IO[str]]
+def open(  # noqa: A001  # mirrors builtin name on purpose, like ``codecs.open``.
+    source: PathOrStream,
+    codec: str | Codec | None = None,
+    encoding: str = "utf-8",
+) -> Iterator[TableSample]:
+    """Stream samples from *source* using *codec*.
+    Args:
+        source: Path-like or already-open text stream. Paths are opened
+            with the given *encoding* and closed when the returned
+            iterator is exhausted or garbage-collected.
+        codec: Codec instance, registry name, or ``None`` to auto-detect.
+        encoding: Text encoding when *source* is a path; ignored
+            otherwise.
+    Yields:
+        :class:`TableSample` instances, one per record in *source*.
+    Raises:
+        KeyError: when *codec* is a name that is not registered.
+        ValueError: when *codec* is ``None`` and detection fails.
+    """
+    resolved = _resolve_codec(source, codec)
+    @contextmanager
+    def _owned_stream() -> Generator[IO[str], None, None]:
+        if isinstance(source, (str, PathLike)):
+            handle = Path(source).open(encoding=encoding)
+            try:
+                yield handle
+            finally:
+                handle.close()
+        else:
+            yield source
+    def _iter() -> Iterator[TableSample]:
+        with _owned_stream() as stream:
+            yield from resolved.read(stream)
+    return _iter()
+def detect(source: PathOrStream, encoding: str = "utf-8") -> str | None:
+    """Return the registered codec name that matches *source*, or ``None``.
+    A path-like *source* is opened, peeked, and closed. A stream-like
+    *source* has its position restored after the peek.
+    """
+    if isinstance(source, (str, PathLike)):
+        with Path(source).open(encoding=encoding) as stream:
+            return codecs.detect(stream)
+    return codecs.detect(source)
+# ---------- internals ----------
+def _resolve_codec(source: PathOrStream, codec: str | Codec | None) -> Codec:
+    if isinstance(codec, str):
+        return codecs.get(codec)
+    if codec is not None:
+        return codec
+    name = detect(source)
+    if name is None:
+        msg = "could not detect codec; pass codec= explicitly"
+        raise ValueError(msg)
+    return codecs.get(name)

tablecodec/ir.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Internal Representation (IR) for tablecodec.
+SPEC §5: the 2D grid model that every supported codec maps to/from.
+Types are immutable (``frozen=True``), memory-compact (``slots=True``),
+and hashable. Zero third-party dependencies (SPEC §13).
+"""
+from __future__ import annotations
+from collections.abc import Mapping
+from dataclasses import dataclass, field
+from typing import Literal
+__all__ = ["BBox", "GridCell", "TableSample"]
+# Absolute pixel coordinates: (x0, y0, x1, y1). See SPEC §5.1.
+BBox = tuple[int, int, int, int]
+def _empty_extras() -> dict[str, object]:
+    """Return a new empty dict; the ``default_factory`` for ``TableSample.extras``."""
+    return {}
+@dataclass(frozen=True, slots=True)
+class GridCell:
+    """A single grid cell in a :class:`TableSample`.
+    Only ``row`` and ``col`` are required; every other field has a default.
+    Attributes:
+        row: Zero-indexed row of the cell's top-left anchor.
+        col: Zero-indexed column of the cell's top-left anchor.
+        rowspan: Number of rows the cell occupies (``>= 1``).
+        colspan: Number of columns the cell occupies (``>= 1``).
+        tokens: Ordered tokens that form the cell content. Empty tuple for
+            empty cells. The tuple is never ``None`` (SPEC §5.2 I-07).
+        bbox: Bounding box in absolute pixels as ``(x0, y0, x1, y1)``, or
+            ``None`` when the source format does not provide one (e.g.
+            empty cells, pubtabnet-1.0.0).
+        role: ``"header"`` or ``"body"``. Header cells must form a
+            contiguous top-region (SPEC §5.2 I-06).
+    """
+    # Required anchor position.
+    row: int
+    col: int
+    # Optional fields: the defaults describe an unspanned, empty body cell
+    # with no bbox.
+    rowspan: int = 1
+    colspan: int = 1
+    tokens: tuple[str, ...] = ()
+    bbox: BBox | None = None
+    role: Literal["header", "body"] = "body"
+@dataclass(frozen=True, slots=True)
+class TableSample:
+    """One annotated table image.
+    Attributes:
+        filename: Source image filename.
+        nrows: Logical row count of the grid (``>= 1``).
+        ncols: Logical column count of the grid (``>= 1``).
+        cells: Ordered top-to-bottom, left-to-right (SPEC §5.1).
+        split: Optional dataset split assignment.
+        imgid: Optional dataset-defined integer id.
+        image_width: Source image width in pixels, or ``None`` when the
+            format does not carry it. Sample-level metadata (a peer of
+            ``filename`` / ``imgid``), not table content. Backs the STRICT
+            profile's bbox-in-image cross-check (SPEC §8).
+        image_height: Source image height in pixels, or ``None``. See
+            ``image_width``.
+        extras: Codec-defined opaque metadata. Opaque to validation but
+            must be JSON-serializable for codecs that round-trip via it
+            (SPEC §5.2 closing paragraph). Excluded from :meth:`__hash__`
+            because ``Mapping`` is not generally hashable; equality still
+            considers it via the dataclass-generated ``__eq__``.
+    """
+    filename: str
+    nrows: int
+    ncols: int
+    cells: tuple[GridCell, ...]
+    split: Literal["train", "val", "test"] | None = None
+    imgid: int | None = None
+    image_width: int | None = None
+    image_height: int | None = None
+    extras: Mapping[str, object] = field(default_factory=_empty_extras)
+    def __hash__(self) -> int:
+        # extras is a Mapping (potentially a dict, which is unhashable);
+        # excluding it preserves the hash/eq contract: equal samples that
+        # also have equal extras hash identically, while two samples that
+        # differ only in extras may collide (acceptable for a hash).
+        return hash(
+            (
+                self.filename,
+                self.nrows,
+                self.ncols,
+                self.cells,
+                self.split,
+                self.imgid,
+                self.image_width,
+                self.image_height,
+            )
+        )

tablecodec/loss.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""Static loss analysis between any two registered codecs (SPEC §9).
+``analyze_loss(source, target)`` reads only the codecs' ``lossy_read``
+and ``lossy_write`` declarations — no data is touched. The result is a
+:class:`LossReport` summarising:
+- ``source_fields_dropped_on_read`` — verbatim from ``source.lossy_read()``.
+- ``ir_fields_unrepresentable_in_target`` — verbatim from
+  ``target.lossy_write()``.
+- ``round_trip_classification`` — one of:
+    * ``"lossless"`` — nothing dropped anywhere.
+    * ``"structure-preserving"`` — only auxiliary fields lost
+      (``bbox``, ``role``, ``extras``); grid topology and cell tokens
+      survive.
+    * ``"lossy"`` — at least one structural / content field lost
+      (``tokens`` or anything not in the auxiliary set).
+    * ``"unwritable"`` — the target codec is read-only, so nothing can
+      be written to it (see ADR 0002).
+**Scope of the loss model.** Loss tracking covers the fields a codec
+actually reads into / writes from the IR — i.e. table content
+(``tokens``, ``bbox``, ``role``, ``extras``). Sample-level identity /
+acquisition metadata (``filename``, ``nrows``, ``ncols``, ``split``,
+``imgid``, ``image_width``, ``image_height``) is intentionally outside the
+model: a field that no codec populates cannot be dropped on a round-trip
+(``None`` in, ``None`` out), so it would be dishonest to list it in any
+``lossy_*`` declaration. A field enters the model only once a codec carries
+it (e.g. a future PubTables-1M codec reading image dims from VOC ``<size>``);
+that codec, if it then writes to a format that cannot store the field, would
+declare it in ``lossy_write`` at that point. See ADR 0012.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Literal
+from tablecodec import codecs
+__all__ = ["LossReport", "analyze_loss"]
+# Fields whose loss does NOT destroy grid structure or cell content.
+# A round-trip that loses only these is "structure-preserving".
+_AUXILIARY_FIELDS = frozenset({"bbox", "role", "extras"})
+# Closed set of values for ``LossReport.round_trip_classification``.
+# ``analyze_loss`` assigns "unwritable" directly for a read-only target;
+# the other three values come from ``_classify``.
+Classification = Literal["lossless", "structure-preserving", "lossy", "unwritable"]
+@dataclass(frozen=True, slots=True)
+class LossReport:
+    """Summary of what is lost when *source* samples are re-encoded into *target*.
+    Instances are produced by ``analyze_loss``. All fields are required.
+    Attributes:
+        source: Registered codec name of the source format.
+        target: Registered codec name of the target format.
+        source_fields_dropped_on_read: Fields the source codec discards
+            during ``read`` (the source format had them; the IR will not).
+        ir_fields_unrepresentable_in_target: IR fields that the target
+            codec cannot persist during ``write``. ``analyze_loss`` leaves
+            this empty when the target is read-only.
+        round_trip_classification: ``"lossless"``, ``"structure-preserving"``,
+            ``"lossy"``, or ``"unwritable"`` (target is a read-only codec;
+            see ADR 0002).
+    """
+    source: str
+    target: str
+    source_fields_dropped_on_read: frozenset[str]
+    ir_fields_unrepresentable_in_target: frozenset[str]
+    round_trip_classification: Classification
+def analyze_loss(source: str, target: str) -> LossReport:
+    """Static loss report for the ``source -> IR -> target`` pipeline.
+    Raises:
+        KeyError: when *source* or *target* is not a registered codec.
+    """
+    src_codec = codecs.get(source)
+    tgt_codec = codecs.get(target)
+    dropped = src_codec.lossy_read()
+    if not tgt_codec.writable:
+        # ADR 0002: a read-only target cannot be written; its lossy_write
+        # is not meaningful, so report "unwritable" and stop.
+        return LossReport(
+            source=source,
+            target=target,
+            source_fields_dropped_on_read=dropped,
+            ir_fields_unrepresentable_in_target=frozenset(),
+            round_trip_classification="unwritable",
+        )
+    unrepresentable = tgt_codec.lossy_write()
+    return LossReport(
+        source=source,
+        target=target,
+        source_fields_dropped_on_read=dropped,
+        ir_fields_unrepresentable_in_target=unrepresentable,
+        round_trip_classification=_classify(dropped | unrepresentable),
+    )
+def _classify(all_lost: frozenset[str]) -> Classification:
+    if not all_lost:
+        return "lossless"
+    if all_lost <= _AUXILIARY_FIELDS:
+        return "structure-preserving"
+    return "lossy"

tablecodec/py.typed ADDED Viewed

File without changes