tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,76 @@
1
+ """TableBank codec.
2
+
3
+ TableBank ships table *structure* only — the source has no per-cell
4
+ tokens or bbox. On read, the grid is reconstructed from the structure
5
+ tokens and every cell is empty (``tokens=()``, ``bbox=None``). Writing
6
+ emits structure only, so any tokens/bbox an IR carries are dropped
7
+ (SPEC §7 marks TableBank write as partial / lossy).
8
+
9
+ Record shape::
10
+
11
+ {
12
+ "filename": "...",
13
+ "split": "train" | "val" | "test", # optional
14
+ "imgid": 0, # optional
15
+ "html": {"structure": {"tokens": [...]}}, # no "cells"
16
+ }
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ from collections.abc import Iterable, Iterator
23
+ from dataclasses import dataclass
24
+ from typing import IO, Any
25
+
26
+ from tablecodec.codecs._htmltable import (
27
+ parse_html_structure_only,
28
+ serialize_html_structure_only,
29
+ sniff_html_table,
30
+ )
31
+ from tablecodec.ir import TableSample
32
+
33
+ __all__ = ["TableBankCodec"]
34
+
35
+
36
+ @dataclass(frozen=True, slots=True)
37
+ class TableBankCodec:
38
+ """Codec for the TableBank jsonl format (structure only, no cell content)."""
39
+
40
+ name: str = "tablebank"
41
+ spec_version: str = "1.0.0"
42
+ media_type: str = "application/jsonl"
43
+ writable: bool = True
44
+
45
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
46
+ for line_no, raw in enumerate(source, start=1):
47
+ line = raw.strip()
48
+ if not line:
49
+ continue
50
+ try:
51
+ payload: dict[str, Any] = json.loads(line)
52
+ except json.JSONDecodeError as exc:
53
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
54
+ raise ValueError(msg) from exc
55
+ try:
56
+ yield parse_html_structure_only(payload)
57
+ except (KeyError, ValueError, TypeError) as exc:
58
+ msg = f"malformed TableBank record at line {line_no}: {exc}"
59
+ raise ValueError(msg) from exc
60
+
61
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
62
+ for sample in samples:
63
+ sink.write(json.dumps(serialize_html_structure_only(sample), ensure_ascii=False))
64
+ sink.write("\n")
65
+
66
+ def lossy_read(self) -> frozenset[str]:
67
+ # The source carries no cell content; reading a richer file via
68
+ # this codec discards tokens and bbox.
69
+ return frozenset({"tokens", "bbox"})
70
+
71
+ def lossy_write(self) -> frozenset[str]:
72
+ return frozenset({"tokens", "bbox", "extras"})
73
+
74
+ def sniff(self, source: IO[str]) -> bool:
75
+ # TableBank records have html.structure but NO html.cells.
76
+ return sniff_html_table(source, require_no_cells=True)
@@ -0,0 +1,80 @@
1
+ """TableFormer Format codec.
2
+
3
+ TableFormer (IBM internal) uses PubTabNet 2.0's HTML-token structure
4
+ with one extra invariant: EVERY cell — including empty ones — carries a
5
+ bbox. This codec enforces that on read (raising if any cell lacks one)
6
+ and its output therefore satisfies ``profiles.TABLEFORMER``.
7
+
8
+ Record shape is the PubTabNet 2.0 shape; the difference is purely that
9
+ ``cells[i].bbox`` is always present, even when ``tokens`` is empty.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ from collections.abc import Iterable, Iterator
16
+ from dataclasses import dataclass
17
+ from typing import IO, Any
18
+
19
+ from tablecodec.codecs._htmltable import (
20
+ parse_html_table,
21
+ serialize_html_table,
22
+ sniff_html_table,
23
+ )
24
+ from tablecodec.ir import TableSample
25
+
26
+ __all__ = ["TableFormerCodec"]
27
+
28
+
29
+ @dataclass(frozen=True, slots=True)
30
+ class TableFormerCodec:
31
+ """Codec for the TableFormer Format jsonl (every cell has bbox)."""
32
+
33
+ name: str = "tableformer"
34
+ spec_version: str = "1.0.0"
35
+ media_type: str = "application/jsonl"
36
+ writable: bool = True
37
+
38
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
39
+ for line_no, raw in enumerate(source, start=1):
40
+ line = raw.strip()
41
+ if not line:
42
+ continue
43
+ try:
44
+ payload: dict[str, Any] = json.loads(line)
45
+ except json.JSONDecodeError as exc:
46
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
47
+ raise ValueError(msg) from exc
48
+ try:
49
+ sample = parse_html_table(payload)
50
+ _require_all_cells_have_bbox(sample, line_no)
51
+ except (KeyError, ValueError, TypeError) as exc:
52
+ msg = f"malformed TableFormer record at line {line_no}: {exc}"
53
+ raise ValueError(msg) from exc
54
+ yield sample
55
+
56
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
57
+ for sample in samples:
58
+ sink.write(json.dumps(serialize_html_table(sample), ensure_ascii=False))
59
+ sink.write("\n")
60
+
61
+ def lossy_read(self) -> frozenset[str]:
62
+ return frozenset()
63
+
64
+ def lossy_write(self) -> frozenset[str]:
65
+ return frozenset({"extras"})
66
+
67
+ def sniff(self, source: IO[str]) -> bool:
68
+ # Discriminator: every cell carries a bbox (PubTabNet may omit it
69
+ # on empty cells, so a bbox-less cell rules TableFormer out).
70
+ return sniff_html_table(source, require_all_bbox=True)
71
+
72
+
73
def _require_all_cells_have_bbox(sample: TableSample, line_no: int) -> None:
    """Raise ``ValueError`` naming the first cell of *sample* whose bbox is ``None``.

    Returns ``None`` when every cell has a bbox (including when there are
    no cells at all). *line_no* is only used in the error message.
    """
    first_missing = next(
        (pos for pos, cell in enumerate(sample.cells) if cell.bbox is None),
        None,
    )
    if first_missing is None:
        return
    msg = (
        f"TableFormer requires every cell to have a bbox; cell index "
        f"{first_missing} has none (line {line_no})"
    )
    raise ValueError(msg)
tablecodec/io.py ADDED
@@ -0,0 +1,91 @@
1
+ """High-level streaming I/O helpers (SPEC §10).
2
+
3
+ ``open()`` and ``detect()`` accept either an already-open text stream
4
+ or a path-like; they always return iterators backed by the codec's
5
+ streaming ``read`` (never slurp the file into memory).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Generator, Iterator
11
+ from contextlib import contextmanager
12
+ from os import PathLike
13
+ from pathlib import Path
14
+ from typing import IO, Union
15
+
16
+ from tablecodec import codecs
17
+ from tablecodec.codecs._base import Codec
18
+ from tablecodec.ir import TableSample
19
+
20
+ __all__ = ["detect", "open"]
21
+
22
+ PathOrStream = Union[str, "PathLike[str]", IO[str]]
23
+
24
+
25
def open(  # noqa: A001  # mirrors builtin name on purpose, like ``codecs.open``.
    source: PathOrStream,
    codec: str | Codec | None = None,
    encoding: str = "utf-8",
) -> Iterator[TableSample]:
    """Stream samples from *source* using *codec*.

    The codec is resolved eagerly, so resolution errors surface at call
    time. A path-like *source* is only opened once iteration starts.

    Args:
        source: Path-like or already-open text stream. Paths are opened
            with the given *encoding* and closed when the returned
            iterator is exhausted or garbage-collected.
        codec: Codec instance, registry name, or ``None`` to auto-detect.
        encoding: Text encoding when *source* is a path (used both for
            auto-detection and for reading); ignored otherwise.

    Yields:
        :class:`TableSample` instances, one per record in *source*.

    Raises:
        KeyError: when *codec* is a name that is not registered.
        ValueError: when *codec* is ``None`` and detection fails.
    """
    # Forward the encoding so auto-detection decodes a path exactly the
    # same way the subsequent read does.
    resolved = _resolve_codec(source, codec, encoding)

    @contextmanager
    def _owned_stream() -> Generator[IO[str], None, None]:
        # Only handles opened here are closed here; caller-supplied
        # streams are passed through untouched.
        if isinstance(source, (str, PathLike)):
            with Path(source).open(encoding=encoding) as handle:
                yield handle
        else:
            yield source

    def _iter() -> Iterator[TableSample]:
        with _owned_stream() as stream:
            yield from resolved.read(stream)

    return _iter()


def detect(source: PathOrStream, encoding: str = "utf-8") -> str | None:
    """Return the registered codec name that matches *source*, or ``None``.

    A path-like *source* is opened with *encoding*, peeked, and closed. A
    stream-like *source* is handed to ``codecs.detect`` as-is; restoring
    its position after the peek is that function's responsibility.
    """
    if isinstance(source, (str, PathLike)):
        with Path(source).open(encoding=encoding) as stream:
            return codecs.detect(stream)
    return codecs.detect(source)


# ---------- internals ----------


def _resolve_codec(
    source: PathOrStream,
    codec: str | Codec | None,
    encoding: str = "utf-8",
) -> Codec:
    """Turn *codec* (name, instance, or ``None``) into a codec instance.

    Args:
        source: Only consulted for auto-detection when *codec* is ``None``.
        codec: Registry name, codec instance, or ``None`` to auto-detect.
        encoding: Encoding used to open a path-like *source* during
            auto-detection.

    Raises:
        ValueError: when *codec* is ``None`` and no codec matches *source*.
    """
    if isinstance(codec, str):
        return codecs.get(codec)
    if codec is not None:
        return codec
    name = detect(source, encoding)
    if name is None:
        msg = "could not detect codec; pass codec= explicitly"
        raise ValueError(msg)
    return codecs.get(name)
tablecodec/ir.py ADDED
@@ -0,0 +1,101 @@
1
+ """Internal Representation (IR) for tablecodec.
2
+
3
+ SPEC §5: the 2D grid model that every supported codec maps to/from.
4
+ Types are immutable (``frozen=True``), memory-compact (``slots=True``),
5
+ and hashable. Zero third-party dependencies (SPEC §13).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Mapping
11
+ from dataclasses import dataclass, field
12
+ from typing import Literal
13
+
14
+ __all__ = ["BBox", "GridCell", "TableSample"]
15
+
16
+
17
+ # Absolute pixel coordinates: (x0, y0, x1, y1). See SPEC §5.1.
18
+ BBox = tuple[int, int, int, int]
19
+
20
+
21
def _empty_extras() -> dict[str, object]:
    """Return a new, empty extras mapping on every call."""
    return dict()
23
+
24
+
25
+ @dataclass(frozen=True, slots=True)
26
+ class GridCell:
27
+ """A single grid cell in a :class:`TableSample`.
28
+
29
+ Attributes:
30
+ row: Zero-indexed row of the cell's top-left anchor.
31
+ col: Zero-indexed column of the cell's top-left anchor.
32
+ rowspan: Number of rows the cell occupies (``>= 1``).
33
+ colspan: Number of columns the cell occupies (``>= 1``).
34
+ tokens: Ordered tokens that form the cell content. Empty tuple for
35
+ empty cells. The tuple is never ``None`` (SPEC §5.2 I-07).
36
+ bbox: Bounding box in absolute pixels, or ``None`` when the source
37
+ format does not provide one (e.g. empty cells, pubtabnet-1.0.0).
38
+ role: ``"header"`` or ``"body"``. Header cells must form a
39
+ contiguous top-region (SPEC §5.2 I-06).
40
+ """
41
+
42
+ row: int
43
+ col: int
44
+ rowspan: int = 1
45
+ colspan: int = 1
46
+ tokens: tuple[str, ...] = ()
47
+ bbox: BBox | None = None
48
+ role: Literal["header", "body"] = "body"
49
+
50
+
51
@dataclass(frozen=True, slots=True)
class TableSample:
    """A single annotated table image.

    Attributes:
        filename: Filename of the source image.
        nrows: Logical number of grid rows (``>= 1``).
        ncols: Logical number of grid columns (``>= 1``).
        cells: Cells in top-to-bottom, left-to-right order (SPEC §5.1).
        split: Dataset split the sample belongs to, if any.
        imgid: Dataset-defined integer id, if any.
        image_width: Width of the source image in pixels, or ``None`` when
            the format does not carry it. This is sample-level metadata
            (a peer of ``filename`` / ``imgid``), not table content, and
            backs the STRICT profile's bbox-in-image cross-check (SPEC §8).
        image_height: Height of the source image in pixels, or ``None``.
            See ``image_width``.
        extras: Codec-defined metadata that validation treats as opaque.
            It must be JSON-serializable for codecs that round-trip via it
            (SPEC §5.2 closing paragraph). It takes part in equality (the
            dataclass-generated ``__eq__``) but not in :meth:`__hash__`,
            since a ``Mapping`` is not generally hashable.
    """

    filename: str
    nrows: int
    ncols: int
    cells: tuple[GridCell, ...]
    split: Literal["train", "val", "test"] | None = None
    imgid: int | None = None
    image_width: int | None = None
    image_height: int | None = None
    extras: Mapping[str, object] = field(default_factory=_empty_extras)

    def __hash__(self) -> int:
        # Hash every field except ``extras``, which may be an unhashable
        # dict. The hash/eq contract still holds: equal samples agree on
        # all hashed fields, so they hash alike; samples that differ only
        # in ``extras`` merely collide, which a hash is allowed to do.
        hashed_fields = (
            self.filename,
            self.nrows,
            self.ncols,
            self.cells,
            self.split,
            self.imgid,
            self.image_width,
            self.image_height,
        )
        return hash(hashed_fields)
tablecodec/loss.py ADDED
@@ -0,0 +1,105 @@
1
+ """Static loss analysis between any two registered codecs (SPEC §9).
2
+
3
+ ``analyze_loss(source, target)`` reads only the codecs' ``lossy_read``
4
+ and ``lossy_write`` declarations — no data is touched. The result is a
5
+ :class:`LossReport` summarising:
6
+
7
+ - ``source_fields_dropped_on_read`` — verbatim from ``source.lossy_read()``.
8
+ - ``ir_fields_unrepresentable_in_target`` — verbatim from
9
+ ``target.lossy_write()``.
10
+ - ``round_trip_classification`` — one of:
11
+ * ``"lossless"`` — nothing dropped anywhere.
12
+ * ``"structure-preserving"`` — only auxiliary fields lost
13
+ (``bbox``, ``role``, ``extras``); grid topology and cell tokens
14
+ survive.
15
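+ * ``"lossy"`` — at least one structural / content field lost
16
+ (``tokens`` or anything not in the auxiliary set).
+ * ``"unwritable"`` — the target codec is read-only (``writable`` is
+ false), so its ``lossy_write`` is not consulted (ADR 0002).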
17
+
18
+ **Scope of the loss model.** Loss tracking covers the fields a codec
19
+ actually reads into / writes from the IR — i.e. table content
20
+ (``tokens``, ``bbox``, ``role``, ``extras``). Sample-level identity /
21
+ acquisition metadata (``filename``, ``nrows``, ``ncols``, ``split``,
22
+ ``imgid``, ``image_width``, ``image_height``) is intentionally outside the
23
+ model: a field that no codec populates cannot be dropped on a round-trip
24
+ (``None`` in, ``None`` out), so it would be dishonest to list it in any
25
+ ``lossy_*`` declaration. A field enters the model only once a codec carries
26
+ it (e.g. a future PubTables-1M codec reading image dims from VOC ``<size>``);
27
+ that codec, if it then writes to a format that cannot store the field, would
28
+ declare it in ``lossy_write`` at that point. See ADR 0012.
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ from dataclasses import dataclass
34
+ from typing import Literal
35
+
36
+ from tablecodec import codecs
37
+
38
+ __all__ = ["LossReport", "analyze_loss"]
39
+
40
+ # Fields whose loss does NOT destroy grid structure or cell content.
41
+ # A round-trip that loses only these is "structure-preserving".
42
+ _AUXILIARY_FIELDS = frozenset({"bbox", "role", "extras"})
43
+
44
+ Classification = Literal["lossless", "structure-preserving", "lossy", "unwritable"]
45
+
46
+
47
+ @dataclass(frozen=True, slots=True)
48
+ class LossReport:
49
+ """Summary of what is lost when *source* samples are re-encoded into *target*.
50
+
51
+ Attributes:
52
+ source: Registered codec name of the source format.
53
+ target: Registered codec name of the target format.
54
+ source_fields_dropped_on_read: Fields the source codec discards
55
+ during ``read`` (the source format had them; the IR will not).
56
+ ir_fields_unrepresentable_in_target: IR fields that the target
57
+ codec cannot persist during ``write``.
58
+ round_trip_classification: ``"lossless"``, ``"structure-preserving"``,
59
+ ``"lossy"``, or ``"unwritable"`` (target is a read-only codec;
60
+ see ADR 0002).
61
+ """
62
+
63
+ source: str
64
+ target: str
65
+ source_fields_dropped_on_read: frozenset[str]
66
+ ir_fields_unrepresentable_in_target: frozenset[str]
67
+ round_trip_classification: Classification
68
+
69
+
70
def analyze_loss(source: str, target: str) -> LossReport:
    """Build the static loss report for reading with *source* and writing with *target*.

    Only the codecs' declarations (``lossy_read``, ``writable``,
    ``lossy_write``) are consulted here.

    Raises:
        KeyError: when *source* or *target* is not a registered codec.
    """
    reader = codecs.get(source)
    writer = codecs.get(target)

    read_losses = reader.lossy_read()
    write_losses: frozenset[str]
    verdict: Classification
    if writer.writable:
        write_losses = writer.lossy_write()
        verdict = _classify(read_losses | write_losses)
    else:
        # ADR 0002: a read-only target cannot be written, so its
        # lossy_write is not meaningful; report "unwritable" with an
        # empty write-side set.
        write_losses = frozenset()
        verdict = "unwritable"

    return LossReport(
        source=source,
        target=target,
        source_fields_dropped_on_read=read_losses,
        ir_fields_unrepresentable_in_target=write_losses,
        round_trip_classification=verdict,
    )
98
+
99
+
100
def _classify(all_lost: frozenset[str]) -> Classification:
    """Map the union of lost field names to a round-trip classification.

    Any lost field outside ``_AUXILIARY_FIELDS`` makes the result
    ``"lossy"``; otherwise it is ``"structure-preserving"`` when something
    was lost and ``"lossless"`` when nothing was.
    """
    if all_lost - _AUXILIARY_FIELDS:
        return "lossy"
    return "structure-preserving" if all_lost else "lossless"
tablecodec/py.typed ADDED
File without changes