tablecodec 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablecodec/__init__.py +29 -0
- tablecodec/_invariants.py +311 -0
- tablecodec/cli.py +314 -0
- tablecodec/codecs/__init__.py +111 -0
- tablecodec/codecs/_base.py +79 -0
- tablecodec/codecs/_htmltable.py +510 -0
- tablecodec/codecs/_otslgrid.py +318 -0
- tablecodec/codecs/builtins.py +36 -0
- tablecodec/codecs/doctags.py +278 -0
- tablecodec/codecs/fintabnet.py +84 -0
- tablecodec/codecs/fintabnet_otsl.py +141 -0
- tablecodec/codecs/otsl.py +138 -0
- tablecodec/codecs/pubtables1m.py +161 -0
- tablecodec/codecs/pubtabnet.py +128 -0
- tablecodec/codecs/tablebank.py +76 -0
- tablecodec/codecs/tableformer.py +80 -0
- tablecodec/io.py +91 -0
- tablecodec/ir.py +101 -0
- tablecodec/loss.py +105 -0
- tablecodec/py.typed +0 -0
- tablecodec/teds.py +243 -0
- tablecodec/validate.py +185 -0
- tablecodec-0.0.18.dist-info/METADATA +200 -0
- tablecodec-0.0.18.dist-info/RECORD +27 -0
- tablecodec-0.0.18.dist-info/WHEEL +4 -0
- tablecodec-0.0.18.dist-info/entry_points.txt +2 -0
- tablecodec-0.0.18.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""Codec registry (SPEC §6.2).
|
|
2
|
+
|
|
3
|
+
Third-party codecs ship as separate PyPI packages and self-register via the
|
|
4
|
+
entry-point group ``tablecodec.codecs``; :func:`load_plugins` discovers and
|
|
5
|
+
registers them. The library does not auto-register anything at import time —
|
|
6
|
+
callers register the built-ins they need (the CLI does) and call
|
|
7
|
+
``load_plugins`` to pick up installed third-party codecs.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import importlib.metadata
|
|
13
|
+
from typing import IO
|
|
14
|
+
|
|
15
|
+
from tablecodec.codecs._base import Codec
|
|
16
|
+
|
|
17
|
+
__all__ = ["Codec", "detect", "get", "list_codecs", "load_plugins", "register"]
|
|
18
|
+
|
|
19
|
+
_PLUGIN_GROUP = "tablecodec.codecs"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Module-level mutable registry. Tests use _snapshot/_restore to isolate.
|
|
23
|
+
_registry: dict[str, Codec] = {}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def register(codec: Codec) -> None:
    """Add *codec* to the module registry, keyed by ``codec.name``.

    Raises:
        ValueError: if that name is already present; the existing entry is
            left untouched.
    """
    key = codec.name
    if key not in _registry:
        _registry[key] = codec
        return
    msg = f"codec {key!r} is already registered"
    raise ValueError(msg)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get(name: str) -> Codec:
    """Return the codec registered as *name*.

    Raises:
        KeyError: if nothing has been registered under *name*.
    """
    if name in _registry:
        return _registry[name]
    msg = f"no codec registered under {name!r}"
    raise KeyError(msg)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def list_codecs() -> tuple[str, ...]:
    """Return the names currently in the registry, in its insertion order."""
    return tuple(_registry.keys())
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def detect(source: IO[str]) -> str | None:
    """Return the name of the first registered codec that recognises *source*.

    Each registered codec is consulted in registry order. A codec takes part
    only if it exposes a ``sniff`` attribute; ``sniff(source)`` is called
    with the stream rewound to where it was on entry, and the first truthy
    answer wins. Codecs without ``sniff`` are skipped.

    The stream position recorded on entry is restored before returning
    (also when a ``sniff`` raises), so callers may immediately hand the
    same stream to ``codec.read()``. *source* must therefore support
    ``tell``/``seek``.

    The ``sniff`` delegate is an internal API: for M2 only
    ``pubtabnet-2.0.0`` auto-detects, and the API will firm up in M3 when
    ``pubtabnet-1.0.0`` also self-detects.

    Returns:
        The matching codec name, or ``None`` when no codec claims the stream.
    """
    start = source.tell()
    matched: str | None = None
    try:
        for candidate in _registry.values():
            probe = getattr(candidate, "sniff", None)
            if probe is not None:
                # Every probe starts from the same position.
                source.seek(start)
                if probe(source):
                    matched = candidate.name
                    break
    finally:
        source.seek(start)
    return matched
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def load_plugins() -> tuple[str, ...]:
    """Register codecs advertised through the plugin entry-point group (SPEC §6.2).

    Every entry point in the ``tablecodec.codecs`` group is loaded. If the
    loaded object is a class it is instantiated with no arguments; anything
    else is used as a ready-made codec instance. A codec whose name is
    already in the registry is skipped, which makes repeated calls safe.

    Returns:
        The names registered by this call, in load order.
    """
    newly_registered: list[str] = []
    discovered = importlib.metadata.entry_points(group=_PLUGIN_GROUP)
    for entry in discovered:
        target = entry.load()
        if isinstance(target, type):
            plugin: Codec = target()
        else:
            plugin = target
        if plugin.name not in _registry:
            register(plugin)
            newly_registered.append(plugin.name)
    return tuple(newly_registered)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
# ---------- test helpers (intentionally underscore-prefixed) ----------
|
|
101
|
+
# Marked with pyright: ignore because they're consumed only by tests via
|
|
102
|
+
# attribute access (codecs._snapshot()), which pyright does not track.
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _snapshot() -> dict[str, Codec]:  # pyright: ignore[reportUnusedFunction]
    """Return a shallow copy of the registry for a later :func:`_restore`."""
    return _registry.copy()
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _restore(snapshot: dict[str, Codec]) -> None:  # pyright: ignore[reportUnusedFunction]
    """Replace the registry contents, in place, with those of *snapshot*."""
    # Mutate the existing dict rather than rebinding the module global.
    _registry.clear()
    for key, codec in snapshot.items():
        _registry[key] = codec
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""The Codec Protocol (SPEC §6).
|
|
2
|
+
|
|
3
|
+
A codec is a reader + writer pair for one external table-recognition
|
|
4
|
+
format, accompanied by an honest self-declaration of what is lost on
|
|
5
|
+
read or write.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Iterable, Iterator
|
|
11
|
+
from typing import IO, Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
from tablecodec.ir import TableSample
|
|
14
|
+
|
|
15
|
+
__all__ = ["Codec"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@runtime_checkable
class Codec(Protocol):
    """Structural contract every table codec fulfils (SPEC §6).

    A codec is usually a frozen dataclass or a singleton. Instances MUST
    be shareable between threads, so they keep no per-call mutable state,
    and they MUST NOT mutate the objects handed to them.

    The identity attributes (``name``, ``spec_version``, ``media_type``)
    are expressed as ``@property`` getters. That lets an implementation
    satisfy the protocol with read-only attributes such as the fields of a
    ``dataclass(frozen=True)``.
    """

    @property
    def name(self) -> str:
        """Registry key for this codec; stable, e.g. ``"pubtabnet-2.0.0"``."""
        ...

    @property
    def spec_version(self) -> str:
        """Version of the external source format, not of this library."""
        ...

    @property
    def media_type(self) -> str:
        """Canonical MIME type of the format, e.g. ``"application/jsonl"``."""
        ...

    @property
    def writable(self) -> bool:
        """``True`` when :meth:`write` is supported.

        A read-only codec (SPEC §7, e.g. PubTables-1M) reports ``False``
        and raises ``NotImplementedError`` from :meth:`write`. When such a
        codec is the target of a conversion, ``analyze_loss``
        short-circuits to ``"unwritable"`` (see ADR 0002).
        """
        ...

    def read(self, source: IO[str]) -> Iterator[TableSample]:
        """Lazily yield :class:`TableSample` objects parsed from *source*.

        Implementations MUST stream and never slurp the whole file. A
        record that cannot be parsed makes ``read`` raise, reporting the
        record offset. Structural invariants are NOT evaluated here; that
        is a separate, opt-in step through :func:`tablecodec.validate`
        (SPEC §6.1 / §8, ADR 0008).
        """
        ...

    def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
        """Write *samples* to *sink* using the codec's external format."""
        ...

    def lossy_read(self) -> frozenset[str]:
        """Field paths of the source format that reading drops (e.g. ``"styles"``)."""
        ...

    def lossy_write(self) -> frozenset[str]:
        """IR fields that writing cannot express in this format."""
        ...
|
|
@@ -0,0 +1,510 @@
|
|
|
1
|
+
"""Shared HTML-token table machinery for codec implementations.
|
|
2
|
+
|
|
3
|
+
PubTabNet (1.x / 2.0) and FinTabNet (original) all encode table structure
|
|
4
|
+
as an HTML-like token stream (``<thead>``/``<tbody>``/``<tr>``/``<td>`` with
|
|
5
|
+
optional ``rowspan``/``colspan`` attributes) paired with a positional
|
|
6
|
+
``cells`` array. This module owns the parsing, grid placement, and
|
|
7
|
+
serialization so the concrete codecs stay thin and never duplicate it.
|
|
8
|
+
|
|
9
|
+
The only per-format knobs are:
|
|
10
|
+
|
|
11
|
+
- ``id_field`` — the record-level integer id key (``"imgid"`` for
|
|
12
|
+
PubTabNet, ``"table_id"`` for FinTabNet).
|
|
13
|
+
- ``drop_bbox`` — discard per-cell bbox on read (PubTabNet 1.0).
|
|
14
|
+
- ``include_bbox`` — when ``False``, omit per-cell bbox on write (PubTabNet 1.0).
|
|
15
|
+
|
|
16
|
+
Stdlib-only (SPEC §13).
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import re
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from typing import IO, Any, Literal, cast
|
|
25
|
+
|
|
26
|
+
from tablecodec.ir import BBox, GridCell, TableSample
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"looks_like_html_table",
|
|
30
|
+
"parse_html_structure_only",
|
|
31
|
+
"parse_html_table",
|
|
32
|
+
"serialize_html_structure_only",
|
|
33
|
+
"serialize_html_table",
|
|
34
|
+
"sniff_html_table",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
_ROWSPAN_RE = re.compile(r'rowspan\s*=\s*"(\d+)"')
|
|
38
|
+
_COLSPAN_RE = re.compile(r'colspan\s*=\s*"(\d+)"')
|
|
39
|
+
|
|
40
|
+
_SECTION_TOKENS: dict[str, Literal["header", "body"] | None] = {
|
|
41
|
+
"<thead>": "header",
|
|
42
|
+
"</thead>": "body",
|
|
43
|
+
"<tbody>": "body",
|
|
44
|
+
"</tbody>": None,
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------- structure parser ----------
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass(slots=True)
|
|
52
|
+
class _CellSpec:
|
|
53
|
+
"""One ``<td>`` opening parsed out of the structure token stream."""
|
|
54
|
+
|
|
55
|
+
rowspan: int = 1
|
|
56
|
+
colspan: int = 1
|
|
57
|
+
role: Literal["header", "body"] = "body"
|
|
58
|
+
row: int = -1 # assigned by the placement pass
|
|
59
|
+
col: int = -1
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _empty_cell_spec_list() -> list[_CellSpec]:
|
|
63
|
+
return []
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(slots=True)
class _ParseState:
    """Mutable state carried through one pass over the structure tokens."""

    # Section that newly opened cells are attributed to.
    section: Literal["header", "body"] = "body"
    # Index of the most recently opened row; -1 until the first ``<tr>``.
    cur_row: int = -1
    # Cell specs collected so far, in order of appearance.
    cells: list[_CellSpec] = field(default_factory=_empty_cell_spec_list)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _parse_span_attrs(tokens: list[str], start: int) -> tuple[int, int, int]:
    """Read the attribute tokens that follow a ``<td`` token.

    Scanning begins at index *start* and stops at the first ``">"`` token
    or at the end of *tokens*. Each scanned token is searched for a
    ``rowspan="N"`` and a ``colspan="N"`` attribute; the last occurrence
    of each wins, and a span that never appears stays at 1.

    Returns:
        ``(rowspan, colspan, end_index)``, where ``end_index`` is the index
        of the closing ``">"`` or ``len(tokens)`` when there is none.
    """
    rowspan = 1
    colspan = 1
    total = len(tokens)
    cursor = start
    while cursor < total:
        piece = tokens[cursor]
        if piece == ">":
            break
        row_match = _ROWSPAN_RE.search(piece)
        if row_match is not None:
            rowspan = int(row_match.group(1))
        col_match = _COLSPAN_RE.search(piece)
        if col_match is not None:
            colspan = int(col_match.group(1))
        cursor += 1
    return rowspan, colspan, cursor
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _parse_structure_tokens(tokens: list[str]) -> list[_CellSpec]:
    """Parse HTML structure tokens into ordered cell specs.

    The stream is walked once:

    - section tokens (``<thead>``, ``</thead>``, ``<tbody>``, ``</tbody>``)
      switch the role given to subsequent cells, as mapped by
      ``_SECTION_TOKENS``;
    - ``<tr>`` opens the next row;
    - ``<td>`` opens a 1x1 cell; ``<td`` opens a cell whose following
      attribute tokens, up to ``">"``, may carry ``rowspan``/``colspan``;
    - every other token (``</tr>``, ``</td>``, unknown tokens) is ignored.

    Each returned spec has ``row`` set; ``col`` is left unassigned for the
    placement pass.

    Raises:
        ValueError: when a cell opens before the first ``<tr>``. Such a
            cell would get row ``-1``, which placement would otherwise
            treat as a negative list index into the occupancy grid.
    """
    state = _ParseState()
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if tok in _SECTION_TOKENS:
            new_section = _SECTION_TOKENS[tok]
            # ``</tbody>`` maps to None: keep the current section.
            if new_section is not None:
                state.section = new_section
        elif tok == "<tr>":
            state.cur_row += 1
        elif tok in ("<td>", "<td"):
            if state.cur_row < 0:
                msg = f"structure token {i} ({tok!r}) opens a cell before any <tr>"
                raise ValueError(msg)
            rowspan = colspan = 1
            if tok == "<td":
                # Jump to the closing ">" so attribute tokens are not rescanned.
                rowspan, colspan, i = _parse_span_attrs(tokens, i + 1)
            state.cells.append(
                _CellSpec(
                    rowspan=rowspan,
                    colspan=colspan,
                    role=state.section,
                    row=state.cur_row,
                )
            )
        # </tr>, </td>, and unknown tokens are ignored.
        i += 1
    return state.cells
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------- grid placement ----------
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _empty_rows() -> list[list[bool]]:
|
|
121
|
+
return []
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass(slots=True)
class _OccupiedGrid:
    """Row-major occupancy bitmap whose width can grow on demand."""

    nrows: int
    ncols: int
    # Always rebuilt in __post_init__; a value passed to the constructor is discarded.
    rows: list[list[bool]] = field(default_factory=_empty_rows)

    def __post_init__(self) -> None:
        """Allocate ``nrows`` rows of ``ncols`` free slots."""
        blank: list[list[bool]] = []
        for _ in range(self.nrows):
            blank.append([False] * self.ncols)
        self.rows = blank

    def ensure_cols(self, want: int) -> None:
        """Widen every row so the grid has at least *want* columns."""
        missing = want - self.ncols
        if missing <= 0:
            return
        for line in self.rows:
            line.extend([False] * missing)
        self.ncols = want

    def can_place(self, r: int, c: int, rowspan: int, colspan: int) -> bool:
        """Report whether the ``rowspan`` x ``colspan`` area at ``(r, c)`` is free.

        Rows at or beyond ``nrows`` are not checked. The caller must have
        made the grid wide enough with :meth:`ensure_cols`.
        """
        for rr in range(r, r + rowspan):
            if rr >= self.nrows:
                continue
            for cc in range(c, c + colspan):
                if self.rows[rr][cc]:
                    return False
        return True

    def mark(self, r: int, c: int, rowspan: int, colspan: int) -> None:
        """Flag the ``rowspan`` x ``colspan`` area at ``(r, c)`` as occupied.

        Rows at or beyond ``nrows`` are skipped.
        """
        for rr in range(r, min(r + rowspan, self.nrows)):
            line = self.rows[rr]
            for cc in range(c, c + colspan):
                line[cc] = True
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _place_cells(specs: list[_CellSpec]) -> tuple[int, int]:
    """Assign a column to every spec using HTML table placement.

    Specs are processed in order. Each one is put at the leftmost column
    of its row where its whole ``rowspan`` x ``colspan`` footprint is
    still free, and that footprint is then marked occupied so later cells
    flow around it. The part of a rowspan that reaches past the last row
    is ignored. ``spec.col`` is updated in place.

    Returns:
        ``(nrows, ncols)``: ``nrows`` is the highest row index plus one and
        ``ncols`` the rightmost occupied column plus one; ``(0, 0)`` for an
        empty *specs*.
    """
    if not specs:
        return (0, 0)
    nrows = max(s.row for s in specs) + 1
    # Start only as wide as the widest single cell (at least 8 columns).
    # ensure_cols() grows the grid as the scan moves right, so allocating
    # the sum of all colspans up front would cost nrows * total-cells
    # memory without changing the result.
    grid = _OccupiedGrid(nrows=nrows, ncols=max(8, max(s.colspan for s in specs)))
    for spec in specs:
        c = 0
        while True:
            # Freshly added columns are free, so this scan always terminates.
            grid.ensure_cols(c + spec.colspan)
            if grid.can_place(spec.row, c, spec.rowspan, spec.colspan):
                break
            c += 1
        spec.col = c
        grid.mark(spec.row, c, spec.rowspan, spec.colspan)
    ncols = max((s.col + s.colspan for s in specs), default=0)
    return (nrows, ncols)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# ---------- payload -> sample ----------
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _normalize_split(value: object) -> Literal["train", "val", "test"] | None:
|
|
181
|
+
if value == "train":
|
|
182
|
+
return "train"
|
|
183
|
+
if value == "val":
|
|
184
|
+
return "val"
|
|
185
|
+
if value == "test":
|
|
186
|
+
return "test"
|
|
187
|
+
if value is None:
|
|
188
|
+
return None
|
|
189
|
+
msg = f"unknown split value {value!r}"
|
|
190
|
+
raise ValueError(msg)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def parse_html_table(
    payload: dict[str, Any], *, id_field: str = "imgid", drop_bbox: bool = False
) -> TableSample:
    """Convert one HTML-token table record into a :class:`TableSample`.

    The record supplies ``html.structure.tokens`` and a positional
    ``html.cells`` list: the n-th cell opening in the structure is paired
    with the n-th entry of ``cells``.

    Args:
        payload: the decoded record.
        id_field: record key whose value becomes ``TableSample.imgid``.
        drop_bbox: when true, per-cell ``bbox`` values are ignored.

    Raises:
        ValueError: when the structure and ``cells`` disagree on the
            number of cells.
    """
    html = payload["html"]
    structure_tokens = html["structure"]["tokens"]
    cell_payloads = html["cells"]

    specs = _parse_structure_tokens(structure_tokens)
    declared, supplied = len(specs), len(cell_payloads)
    if declared != supplied:
        msg = f"structure declares {declared} cells but cells[] has {supplied} entries"
        raise ValueError(msg)

    nrows, ncols = _place_cells(specs)

    grid_cells: list[GridCell] = []
    for spec, entry in zip(specs, cell_payloads, strict=True):
        content = tuple(entry.get("tokens", ()))
        raw_box = entry.get("bbox") if not drop_bbox else None
        if raw_box is None:
            box: BBox | None = None
        else:
            # Coerce the first four coordinates to int, in order.
            left, top, right, bottom = (int(raw_box[k]) for k in range(4))
            box = (left, top, right, bottom)
        grid_cells.append(
            GridCell(
                row=spec.row,
                col=spec.col,
                rowspan=spec.rowspan,
                colspan=spec.colspan,
                tokens=content,
                bbox=box,
                role=spec.role,
            )
        )

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=tuple(grid_cells),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get(id_field),
    )
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def parse_html_structure_only(payload: dict[str, Any], *, id_field: str = "imgid") -> TableSample:
    """Convert a structure-only record into a :class:`TableSample`.

    Meant for formats that publish table structure without per-cell
    content (e.g. TableBank). Only ``html.structure.tokens`` is read; no
    ``html.cells`` array is consulted. Every cell opening becomes an empty
    cell with ``tokens=()`` and ``bbox=None``.

    Args:
        payload: the decoded record.
        id_field: record key whose value becomes ``TableSample.imgid``.
    """
    specs = _parse_structure_tokens(payload["html"]["structure"]["tokens"])
    nrows, ncols = _place_cells(specs)

    empty_cells: list[GridCell] = []
    for spec in specs:
        empty_cells.append(
            GridCell(
                row=spec.row,
                col=spec.col,
                rowspan=spec.rowspan,
                colspan=spec.colspan,
                tokens=(),
                bbox=None,
                role=spec.role,
            )
        )

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=tuple(empty_cells),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get(id_field),
    )
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# ---------- sample -> payload ----------
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _group_cells_by_row(cells: tuple[GridCell, ...]) -> dict[int, list[GridCell]]:
|
|
273
|
+
by_row: dict[int, list[GridCell]] = {}
|
|
274
|
+
for cell in cells:
|
|
275
|
+
by_row.setdefault(cell.row, []).append(cell)
|
|
276
|
+
for row_cells in by_row.values():
|
|
277
|
+
row_cells.sort(key=lambda c: c.col)
|
|
278
|
+
return by_row
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _count_header_rows(by_row: dict[int, list[GridCell]], nrows: int) -> int:
|
|
282
|
+
header_rows = 0
|
|
283
|
+
while header_rows < nrows:
|
|
284
|
+
row_cells = by_row.get(header_rows, [])
|
|
285
|
+
if not row_cells or not all(c.role == "header" for c in row_cells):
|
|
286
|
+
break
|
|
287
|
+
header_rows += 1
|
|
288
|
+
return header_rows
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
@dataclass(slots=True)
|
|
292
|
+
class _SectionRange:
|
|
293
|
+
open_tag: str
|
|
294
|
+
close_tag: str
|
|
295
|
+
start: int
|
|
296
|
+
end: int
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _emit_row(structure: list[str], emitted: list[GridCell], row_cells: list[GridCell]) -> None:
|
|
300
|
+
structure.append("<tr>")
|
|
301
|
+
for cell in row_cells:
|
|
302
|
+
if cell.rowspan == 1 and cell.colspan == 1:
|
|
303
|
+
structure.extend(["<td>", "</td>"])
|
|
304
|
+
else:
|
|
305
|
+
structure.append("<td")
|
|
306
|
+
if cell.rowspan != 1:
|
|
307
|
+
structure.append(f' rowspan="{cell.rowspan}"')
|
|
308
|
+
if cell.colspan != 1:
|
|
309
|
+
structure.append(f' colspan="{cell.colspan}"')
|
|
310
|
+
structure.extend([">", "</td>"])
|
|
311
|
+
emitted.append(cell)
|
|
312
|
+
structure.append("</tr>")
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def _emit_section(
    structure: list[str],
    emitted: list[GridCell],
    by_row: dict[int, list[GridCell]],
    span: _SectionRange,
) -> None:
    """Emit one section (open tag, its rows, close tag) into *structure*.

    Nothing is written when the section covers no rows. A row index with
    no entry in *by_row* is emitted as an empty row.
    """
    if span.start < span.end:
        structure.append(span.open_tag)
        for row_index in range(span.start, span.end):
            _emit_row(structure, emitted, by_row.get(row_index, []))
        structure.append(span.close_tag)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _cell_to_payload(cell: GridCell, *, include_bbox: bool) -> dict[str, Any]:
|
|
330
|
+
payload: dict[str, Any] = {"tokens": list(cell.tokens)}
|
|
331
|
+
if include_bbox and cell.bbox is not None:
|
|
332
|
+
payload["bbox"] = list(cell.bbox)
|
|
333
|
+
return payload
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _structure_and_cells(
    sample: TableSample, *, include_bbox: bool
) -> tuple[list[str], list[dict[str, Any]]]:
    """Produce the structure tokens and the matching ``cells[]`` entries.

    The leading all-header rows go into a ``<thead>`` section and the
    remaining rows into a ``<tbody>`` section. The returned cell entries
    follow the order in which cells were written to the structure.
    """
    rows = _group_cells_by_row(sample.cells)
    header_count = _count_header_rows(rows, sample.nrows)
    head = _SectionRange("<thead>", "</thead>", 0, header_count)
    body = _SectionRange("<tbody>", "</tbody>", header_count, sample.nrows)

    tokens: list[str] = []
    ordered: list[GridCell] = []
    for section in (head, body):
        _emit_section(tokens, ordered, rows, section)

    payloads = [_cell_to_payload(cell, include_bbox=include_bbox) for cell in ordered]
    return tokens, payloads
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def serialize_html_table(
    sample: TableSample, *, id_field: str = "imgid", include_bbox: bool = True
) -> dict[str, Any]:
    """Turn a :class:`TableSample` into an HTML-token table record.

    The record holds ``filename`` and ``html`` (structure tokens plus
    cells). ``split`` and the id, stored under *id_field*, are added only
    when the sample has them. ``extras`` is deliberately left out; the
    codec declares that in its ``lossy_write``.

    Args:
        sample: the table to serialize.
        id_field: record key that receives ``sample.imgid``.
        include_bbox: when false, per-cell bboxes are not written.
    """
    tokens, cell_entries = _structure_and_cells(sample, include_bbox=include_bbox)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "html": {"structure": {"tokens": tokens}, "cells": cell_entries},
    }
    optional = (("split", sample.split), (id_field, sample.imgid))
    for key, value in optional:
        if value is not None:
            record[key] = value
    return record
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def serialize_html_structure_only(
    sample: TableSample, *, id_field: str = "imgid"
) -> dict[str, Any]:
    """Turn a :class:`TableSample` into a record with structure tokens and no ``cells``.

    Used by structure-only formats (TableBank). Cell tokens and bboxes are
    not written; the codec declares that in its ``lossy_write``. ``split``
    and the id, stored under *id_field*, are added only when the sample
    has them.
    """
    tokens, _unused_cells = _structure_and_cells(sample, include_bbox=False)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "html": {"structure": {"tokens": tokens}},
    }
    optional = (("split", sample.split), (id_field, sample.imgid))
    for key, value in optional:
        if value is not None:
            record[key] = value
    return record
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
# ---------- detection ----------
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _cells_list(html_dict: dict[str, Any]) -> list[object] | None:
|
|
399
|
+
cells_field: object = html_dict.get("cells", [])
|
|
400
|
+
if not isinstance(cells_field, list):
|
|
401
|
+
return None
|
|
402
|
+
return cast("list[object]", cells_field)
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def _no_cell_has_bbox(html_dict: dict[str, Any]) -> bool:
    """Return ``True`` when ``cells`` is a list and no dict entry in it has a ``bbox`` key.

    A ``cells`` value that is not a list gives ``False``.
    """
    entries = _cells_list(html_dict)
    if entries is None:
        return False
    for entry in entries:
        if isinstance(entry, dict) and "bbox" in entry:
            return False
    return True
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
def _all_cells_have_bbox(html_dict: dict[str, Any]) -> bool:
    """Return ``True`` when ``cells`` is a list whose every entry is a dict with a ``bbox`` key.

    An empty list gives ``True``; a ``cells`` value that is not a list
    gives ``False``.
    """
    entries = _cells_list(html_dict)
    if entries is None:
        return False
    for entry in entries:
        if not (isinstance(entry, dict) and "bbox" in entry):
            return False
    return True
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
def _bbox_constraint_ok(
    html_dict: dict[str, Any], *, require_no_bbox: bool, require_all_bbox: bool
) -> bool:
    """Check the requested bbox constraint against the record's cells.

    *require_no_bbox* takes precedence when both flags are set; with
    neither flag set the check passes.
    """
    if not (require_no_bbox or require_all_bbox):
        return True
    checker = _no_cell_has_bbox if require_no_bbox else _all_cells_have_bbox
    return checker(html_dict)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _cells_constraint_ok(
    html_dict: dict[str, Any],
    *,
    require_no_bbox: bool,
    require_all_bbox: bool,
    require_no_cells: bool,
) -> bool:
    """Check the presence of ``cells`` and then the bbox constraint.

    With *require_no_cells* the check passes exactly when the ``cells``
    key is absent. Otherwise the key must be present and the bbox
    constraint must hold.
    """
    if "cells" not in html_dict:
        # Absence is acceptable only for structure-only formats.
        return bool(require_no_cells)
    if require_no_cells:
        return False
    return _bbox_constraint_ok(
        html_dict, require_no_bbox=require_no_bbox, require_all_bbox=require_all_bbox
    )
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def looks_like_html_table(
    payload: object,
    *,
    require_no_bbox: bool = False,
    require_all_bbox: bool = False,
    require_no_cells: bool = False,
    require_field: str | None = None,
) -> bool:
    """Decide, without any I/O, whether *payload* has the shape of an HTML-token table record.

    The payload must be a dict whose ``html`` value is a dict containing
    ``structure``. ``html.cells`` must be present, unless
    *require_no_cells* is true (structure-only formats like TableBank), in
    which case it must be absent instead.

    Args:
        payload: the decoded record to inspect.
        require_no_bbox: accept only records in which no cell has a bbox.
        require_all_bbox: accept only records in which every cell has a bbox.
        require_no_cells: accept only records without ``html.cells``.
        require_field: when given, a top-level key that must be present.
    """
    if not isinstance(payload, dict):
        return False
    record = cast("dict[str, Any]", payload)
    if require_field is not None and require_field not in record:
        return False
    html: object = record.get("html")
    if not isinstance(html, dict) or "structure" not in html:
        return False
    return _cells_constraint_ok(
        cast("dict[str, Any]", html),
        require_no_bbox=require_no_bbox,
        require_all_bbox=require_all_bbox,
        require_no_cells=require_no_cells,
    )
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def sniff_html_table(
    source: IO[str],
    *,
    require_no_bbox: bool = False,
    require_all_bbox: bool = False,
    require_no_cells: bool = False,
    require_field: str | None = None,
) -> bool:
    """Check whether the first non-blank line of *source* is an HTML-token table record.

    That line is decoded as JSON and passed to
    :func:`looks_like_html_table` with the same keyword options. A stream
    with no non-blank line, or a first line that is not valid JSON, gives
    ``False``. The stream position found on entry is always restored.
    """
    origin = source.tell()
    try:
        stripped_lines = (raw.strip() for raw in source)
        first = next((text for text in stripped_lines if text), None)
        if first is None:
            return False
        try:
            decoded: object = json.loads(first)
        except json.JSONDecodeError:
            return False
        return looks_like_html_table(
            decoded,
            require_no_bbox=require_no_bbox,
            require_all_bbox=require_all_bbox,
            require_no_cells=require_no_cells,
            require_field=require_field,
        )
    finally:
        source.seek(origin)
|