tablecodec 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablecodec/__init__.py +29 -0
- tablecodec/_invariants.py +311 -0
- tablecodec/cli.py +314 -0
- tablecodec/codecs/__init__.py +111 -0
- tablecodec/codecs/_base.py +79 -0
- tablecodec/codecs/_htmltable.py +510 -0
- tablecodec/codecs/_otslgrid.py +318 -0
- tablecodec/codecs/builtins.py +36 -0
- tablecodec/codecs/doctags.py +278 -0
- tablecodec/codecs/fintabnet.py +84 -0
- tablecodec/codecs/fintabnet_otsl.py +141 -0
- tablecodec/codecs/otsl.py +138 -0
- tablecodec/codecs/pubtables1m.py +161 -0
- tablecodec/codecs/pubtabnet.py +128 -0
- tablecodec/codecs/tablebank.py +76 -0
- tablecodec/codecs/tableformer.py +80 -0
- tablecodec/io.py +91 -0
- tablecodec/ir.py +101 -0
- tablecodec/loss.py +105 -0
- tablecodec/py.typed +0 -0
- tablecodec/teds.py +243 -0
- tablecodec/validate.py +185 -0
- tablecodec-0.0.18.dist-info/METADATA +200 -0
- tablecodec-0.0.18.dist-info/RECORD +27 -0
- tablecodec-0.0.18.dist-info/WHEEL +4 -0
- tablecodec-0.0.18.dist-info/entry_points.txt +2 -0
- tablecodec-0.0.18.dist-info/licenses/LICENSE +21 -0
tablecodec/teds.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""TEDS (Tree-Edit-Distance based Similarity) for table samples.
|
|
2
|
+
|
|
3
|
+
TEDS scores the similarity of two tables in ``[0, 1]`` by the normalized
|
|
4
|
+
tree-edit distance between their HTML-DOM trees (Zhong et al., "Image-based
|
|
5
|
+
table recognition: data, model, and evaluation"). ``1.0`` means identical
|
|
6
|
+
structure and content; ``structure_only=True`` ignores cell text (TEDS-Struct).
|
|
7
|
+
|
|
8
|
+
This is the optional ``[teds]`` feature: it imports ``apted`` and ``lxml`` and
|
|
9
|
+
therefore lives OUTSIDE the zero-dependency core. It is never imported by
|
|
10
|
+
``tablecodec/__init__`` — use ``from tablecodec.teds import teds``.
|
|
11
|
+
|
|
12
|
+
Attribution: the tree construction, the rename-cost rule, and the
|
|
13
|
+
``1 - dist / max_nodes`` formula are adapted from IBM's PubTabNet reference
|
|
14
|
+
metric (``src/metric.py``, Apache License 2.0, Copyright 2020 IBM,
|
|
15
|
+
peter.zhong@au1.ibm.com). This is NOT a verbatim copy: the IR-native entry
|
|
16
|
+
point, a pure-Python normalized Levenshtein (replacing the ``distance``
|
|
17
|
+
package), and the removal of batching/parallelism are tablecodec's. See
|
|
18
|
+
``THIRD_PARTY_NOTICES.md`` and ``docs/adr/0011-teds-metric-port.md``.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import Any, cast
|
|
24
|
+
|
|
25
|
+
from apted import APTED, Config # pyright: ignore[reportMissingTypeStubs]
|
|
26
|
+
from lxml import html # pyright: ignore[reportMissingTypeStubs]
|
|
27
|
+
|
|
28
|
+
from tablecodec.ir import GridCell, TableSample
|
|
29
|
+
|
|
30
|
+
__all__ = ["teds", "teds_html"]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------- normalized Levenshtein (pure stdlib; replaces `distance`) ----------
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _levenshtein(a: list[str], b: list[str]) -> int:
|
|
37
|
+
"""Edit distance between two token sequences."""
|
|
38
|
+
if a == b:
|
|
39
|
+
return 0
|
|
40
|
+
if not a:
|
|
41
|
+
return len(b)
|
|
42
|
+
if not b:
|
|
43
|
+
return len(a)
|
|
44
|
+
previous = list(range(len(b) + 1))
|
|
45
|
+
for i, ca in enumerate(a, start=1):
|
|
46
|
+
current = [i]
|
|
47
|
+
for j, cb in enumerate(b, start=1):
|
|
48
|
+
cost = 0 if ca == cb else 1
|
|
49
|
+
current.append(min(previous[j] + 1, current[j - 1] + 1, previous[j - 1] + cost))
|
|
50
|
+
previous = current
|
|
51
|
+
return previous[-1]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _normalized_distance(a: list[str], b: list[str]) -> float:
|
|
55
|
+
"""Levenshtein distance scaled to ``[0, 1]`` by the longer sequence."""
|
|
56
|
+
longest = max(len(a), len(b))
|
|
57
|
+
if longest == 0:
|
|
58
|
+
return 0.0
|
|
59
|
+
return _levenshtein(a, b) / longest
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# ---------- apted tree + config (adapted from IBM PubTabNet metric.py) ----------
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class _TableTree:
|
|
66
|
+
"""An apted tree node for one HTML table element."""
|
|
67
|
+
|
|
68
|
+
def __init__(
|
|
69
|
+
self,
|
|
70
|
+
tag: str,
|
|
71
|
+
colspan: int | None,
|
|
72
|
+
rowspan: int | None,
|
|
73
|
+
content: list[str] | None,
|
|
74
|
+
*children: _TableTree,
|
|
75
|
+
) -> None:
|
|
76
|
+
self.tag = tag
|
|
77
|
+
self.colspan = colspan
|
|
78
|
+
self.rowspan = rowspan
|
|
79
|
+
self.content = content
|
|
80
|
+
self.children: list[_TableTree] = list(children)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class _CustomConfig(Config):
    """apted configuration supplying child access and the TEDS rename cost."""

    def children(self, node: _TableTree) -> list[_TableTree]:
        """Return the child nodes of ``node``."""
        return node.children

    # apted annotates rename as `-> int`, but TEDS uses fractional content
    # costs; apted sums costs as numbers, so a float is correct at runtime.
    def rename(self, node1: _TableTree, node2: _TableTree) -> float:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Return the cost of relabeling ``node1`` to ``node2``.

        Nodes that differ in tag, colspan or rowspan cost ``1.0``. Two ``td``
        nodes with matching spans cost the normalized edit distance of their
        content when at least one has content. Everything else costs ``0.0``.
        """
        same_label = (
            node1.tag == node2.tag
            and node1.colspan == node2.colspan
            and node1.rowspan == node2.rowspan
        )
        if not same_label:
            return 1.0
        if node1.tag != "td":
            return 0.0
        if not (node1.content or node2.content):
            # Both cells are empty (or content-free): nothing to compare.
            return 0.0
        return _normalized_distance(node1.content or [], node2.content or [])
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# ---------- untyped third-party boundary (apted + lxml have no stubs) ----------
|
|
103
|
+
#
|
|
104
|
+
# apted and lxml ship no type information, so pyright (strict) cannot type the
|
|
105
|
+
# few lines that touch them. These thin wrappers confine that boundary: each
|
|
106
|
+
# returns a concretely-typed value, so the rest of the module is fully checked.
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _parse_first_table(doc: str) -> Any:
    """Parse ``doc`` with lxml and return its first ``body/table`` element.

    Comments are dropped by the parser (``remove_comments=True``). The lookup
    is the relative XPath ``body/table`` evaluated on the element returned by
    ``html.fromstring``.

    Returns:
        The first matching element, or ``None`` when there is no match.
    """
    comment_free_parser = cast("Any", html.HTMLParser(remove_comments=True))
    document = cast("Any", html.fromstring(doc, parser=comment_free_parser))  # pyright: ignore[reportUnknownMemberType]
    matches = cast("list[Any]", document.xpath("body/table"))
    if not matches:
        return None
    return matches[0]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _count_descendant_elements(table: Any) -> int:
|
|
118
|
+
"""Number of element nodes below `table` (the TEDS denominator term)."""
|
|
119
|
+
return len(cast("list[Any]", table.xpath(".//*")))
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _tree_edit_distance(tree1: _TableTree, tree2: _TableTree) -> float:
    """Return apted's edit distance between two trees as a ``float``.

    The distance is computed under ``_CustomConfig``, so rename costs may be
    fractional.
    """
    solver = APTED(tree1, tree2, _CustomConfig())
    # apted is untyped; coerce whatever numeric type it returns to float.
    return float(cast("Any", solver.compute_edit_distance()))
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
# ---------- lxml element -> apted tree (adapted from IBM PubTabNet) ----------
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _tokenize(node: Any, tokens: list[str]) -> None:
|
|
131
|
+
"""Flatten an element into tokens: char-level text + tag markers."""
|
|
132
|
+
tokens.append(f"<{node.tag}>")
|
|
133
|
+
if node.text is not None:
|
|
134
|
+
tokens.extend(list(node.text))
|
|
135
|
+
for child in node:
|
|
136
|
+
_tokenize(child, tokens)
|
|
137
|
+
if node.tag != "unk":
|
|
138
|
+
tokens.append(f"</{node.tag}>")
|
|
139
|
+
if node.tag != "td" and node.tail is not None:
|
|
140
|
+
tokens.extend(list(node.tail))
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _load_html_tree(node: Any, *, structure_only: bool) -> _TableTree:
    """Convert an lxml table element into the tree structure apted consumes.

    ``td`` elements become leaf nodes carrying their colspan, rowspan and
    content tokens. Every other element becomes an attribute-free node whose
    children are converted recursively.

    Recursion deliberately stops at ``td``. Inline markup inside a cell
    (``<b>``, ``<sup>``, ...) is already represented in the cell's content
    tokens; turning it into tree nodes as well would charge for it twice and
    would make ``structure_only=True`` depend on cell content. This mirrors
    the IBM PubTabNet reference metric, which only descends below non-``td``
    nodes.

    Args:
        node: Element exposing ``tag``, ``attrib`` and child iteration.
        structure_only: When true, cells get an empty content list so only
            the table structure is compared.

    Returns:
        The root ``_TableTree`` for ``node``.

    Raises:
        ValueError: If a ``colspan``/``rowspan`` attribute is not an integer
            literal (raised by ``int``).
    """
    if node.tag != "td":
        branch = _TableTree(str(node.tag), None, None, None)
        for child in node:
            branch.children.append(_load_html_tree(child, structure_only=structure_only))
        return branch

    content: list[str] = []
    if not structure_only:
        tokens: list[str] = []
        _tokenize(node, tokens)
        # Drop the cell's own opening and closing tag markers.
        content = tokens[1:-1]
    return _TableTree(
        "td",
        int(node.attrib.get("colspan", "1")),
        int(node.attrib.get("rowspan", "1")),
        content,
    )
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# ---------- IR -> HTML ----------
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _is_header_row(cells: list[GridCell]) -> bool:
|
|
169
|
+
return any(cell.role == "header" for cell in cells)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _cell_html(cell: GridCell) -> str:
|
|
173
|
+
attrs = ""
|
|
174
|
+
if cell.colspan != 1:
|
|
175
|
+
attrs += f' colspan="{cell.colspan}"'
|
|
176
|
+
if cell.rowspan != 1:
|
|
177
|
+
attrs += f' rowspan="{cell.rowspan}"'
|
|
178
|
+
return f"<td{attrs}>{''.join(cell.tokens)}</td>"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _row_html(cells: list[GridCell]) -> str:
|
|
182
|
+
inner = "".join(_cell_html(cell) for cell in sorted(cells, key=lambda c: c.col))
|
|
183
|
+
return f"<tr>{inner}</tr>"
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _sample_to_html(sample: TableSample) -> str:
|
|
187
|
+
"""Render a sample as ``<html><body><table>...`` for TEDS.
|
|
188
|
+
|
|
189
|
+
Cells are grouped by anchor row (HTML rowspan/colspan handle the rest);
|
|
190
|
+
header rows go in ``<thead>``, body rows in ``<tbody>``. All cells render
|
|
191
|
+
as ``<td>`` (PubTabNet convention) so the metric scores their content.
|
|
192
|
+
"""
|
|
193
|
+
rows: dict[int, list[GridCell]] = {}
|
|
194
|
+
for cell in sample.cells:
|
|
195
|
+
rows.setdefault(cell.row, []).append(cell)
|
|
196
|
+
|
|
197
|
+
header = [r for r in sorted(rows) if _is_header_row(rows[r])]
|
|
198
|
+
body = [r for r in sorted(rows) if not _is_header_row(rows[r])]
|
|
199
|
+
|
|
200
|
+
parts = ["<html><body><table>"]
|
|
201
|
+
if header:
|
|
202
|
+
parts.append("<thead>")
|
|
203
|
+
parts.extend(_row_html(rows[r]) for r in header)
|
|
204
|
+
parts.append("</thead>")
|
|
205
|
+
if body:
|
|
206
|
+
parts.append("<tbody>")
|
|
207
|
+
parts.extend(_row_html(rows[r]) for r in body)
|
|
208
|
+
parts.append("</tbody>")
|
|
209
|
+
parts.append("</table></body></html>")
|
|
210
|
+
return "".join(parts)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
# ---------- public API ----------
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def teds_html(pred_html: str, true_html: str, *, structure_only: bool = False) -> float:
    """TEDS between two HTML table documents.

    Each input is parsed and its first ``body/table`` is scored.

    Args:
        pred_html: Predicted table document.
        true_html: Ground-truth table document.
        structure_only: When true, cell content is ignored (TEDS-Struct).

    Returns:
        ``0.0`` when either input is empty, whitespace-only, or contains no
        table (the canonical convention). ``1.0`` when neither table has any
        descendant element. Otherwise ``1.0 - distance / n_nodes``, where
        ``n_nodes`` is the larger descendant-element count of the two tables.
    """
    # A whitespace-only document is as empty as "". lxml refuses to parse it
    # ("Document is empty"), so score it 0.0 here instead of letting the
    # parser raise.
    if not (pred_html and pred_html.strip()) or not (true_html and true_html.strip()):
        return 0.0

    pred_table = _parse_first_table(pred_html)
    true_table = _parse_first_table(true_html)
    if pred_table is None or true_table is None:
        return 0.0

    n_nodes = max(_count_descendant_elements(pred_table), _count_descendant_elements(true_table))
    if n_nodes == 0:
        # Two childless tables: nothing to edit, and no valid denominator.
        return 1.0

    tree_pred = _load_html_tree(pred_table, structure_only=structure_only)
    tree_true = _load_html_tree(true_table, structure_only=structure_only)
    distance = _tree_edit_distance(tree_pred, tree_true)
    return 1.0 - distance / n_nodes
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def teds(pred: TableSample, true: TableSample, *, structure_only: bool = False) -> float:
    """TEDS between two :class:`TableSample`s.

    Both samples go through the same HTML renderer before being scored by
    :func:`teds_html`, so the comparison does not depend on which codec
    produced them.

    Args:
        pred: Predicted sample.
        true: Ground-truth sample.
        structure_only: When true, cell content is ignored (TEDS-Struct).

    Returns:
        The score returned by :func:`teds_html` for the two rendered documents.
    """
    rendered_pred = _sample_to_html(pred)
    rendered_true = _sample_to_html(true)
    return teds_html(rendered_pred, rendered_true, structure_only=structure_only)
|
tablecodec/validate.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Validation entry-point and named profiles.
|
|
2
|
+
|
|
3
|
+
SPEC §8: a user explicitly opts into the strictness they need. Five
|
|
4
|
+
profiles ship: ``LENIENT``, ``DEFAULT``, ``PUBTABNET_2_0``, ``TABLEFORMER``,
|
|
5
|
+
``STRICT``. Custom profiles can be constructed by composing the
|
|
6
|
+
``check_iXX`` functions in :mod:`tablecodec._invariants`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from types import SimpleNamespace
|
|
14
|
+
|
|
15
|
+
from tablecodec._invariants import (
|
|
16
|
+
ValidationError,
|
|
17
|
+
check_i01_nrows_ncols_positive,
|
|
18
|
+
check_i02_cell_in_bounds,
|
|
19
|
+
check_i03_span_in_bounds,
|
|
20
|
+
check_i04_grid_exact_cover,
|
|
21
|
+
check_i05_bbox_well_formed,
|
|
22
|
+
check_i06_header_contiguous_top,
|
|
23
|
+
check_i07_tokens_is_tuple,
|
|
24
|
+
)
|
|
25
|
+
from tablecodec.ir import TableSample
|
|
26
|
+
|
|
27
|
+
__all__ = ["Profile", "ValidationError", "profiles", "validate"]
|
|
28
|
+
|
|
29
|
+
Check = Callable[[TableSample], list[ValidationError]]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True, slots=True)
|
|
33
|
+
class Profile:
|
|
34
|
+
"""A named bundle of invariant checks.
|
|
35
|
+
|
|
36
|
+
Attributes:
|
|
37
|
+
name: Human-visible profile identifier.
|
|
38
|
+
checks: Ordered tuple of check functions. Order determines the
|
|
39
|
+
order of errors in the returned list (lower-numbered
|
|
40
|
+
invariants first, by convention).
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
name: str
|
|
44
|
+
checks: tuple[Check, ...] = field(default_factory=tuple)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------- profile-specific extra checks ----------
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _check_pubtabnet_20_bbox(sample: TableSample) -> list[ValidationError]:
|
|
51
|
+
"""SPEC §8 pubtabnet-2.0 profile: non-empty cells must have bbox."""
|
|
52
|
+
errors: list[ValidationError] = []
|
|
53
|
+
for idx, cell in enumerate(sample.cells):
|
|
54
|
+
if cell.tokens and cell.bbox is None:
|
|
55
|
+
errors.append(
|
|
56
|
+
ValidationError(
|
|
57
|
+
invariant="PUBTABNET-2.0-BBOX",
|
|
58
|
+
message=f"non-empty cell index {idx} is missing bbox",
|
|
59
|
+
cell_index=idx,
|
|
60
|
+
)
|
|
61
|
+
)
|
|
62
|
+
return errors
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _check_tableformer_bbox(sample: TableSample) -> list[ValidationError]:
|
|
66
|
+
"""SPEC §8 tableformer profile: every cell (even empty) must have bbox."""
|
|
67
|
+
errors: list[ValidationError] = []
|
|
68
|
+
for idx, cell in enumerate(sample.cells):
|
|
69
|
+
if cell.bbox is None:
|
|
70
|
+
errors.append(
|
|
71
|
+
ValidationError(
|
|
72
|
+
invariant="TABLEFORMER-BBOX",
|
|
73
|
+
message=f"cell index {idx} is missing bbox",
|
|
74
|
+
cell_index=idx,
|
|
75
|
+
)
|
|
76
|
+
)
|
|
77
|
+
return errors
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _check_strict_bbox_in_image(sample: TableSample) -> list[ValidationError]:
|
|
81
|
+
"""SPEC §8 strict profile / ADR 0012: cross-check bbox vs image dimensions.
|
|
82
|
+
|
|
83
|
+
Semantics (option C): a bbox-free sample needs no image metadata. If any
|
|
84
|
+
cell carries a bbox, the sample MUST declare ``image_width`` and
|
|
85
|
+
``image_height`` (else the coordinates cannot be bound-checked), and every
|
|
86
|
+
bbox must lie within the image rectangle ``0 <= x0 < x1 <= width`` and
|
|
87
|
+
``0 <= y0 < y1 <= height`` (upper bound inclusive — a bbox may touch the
|
|
88
|
+
image edge).
|
|
89
|
+
"""
|
|
90
|
+
cells_with_bbox = [(idx, c.bbox) for idx, c in enumerate(sample.cells) if c.bbox is not None]
|
|
91
|
+
if not cells_with_bbox:
|
|
92
|
+
return []
|
|
93
|
+
|
|
94
|
+
width, height = sample.image_width, sample.image_height
|
|
95
|
+
if width is None or height is None:
|
|
96
|
+
return [
|
|
97
|
+
ValidationError(
|
|
98
|
+
invariant="STRICT-IMAGE-METADATA",
|
|
99
|
+
message=(
|
|
100
|
+
"sample carries cell bboxes but no image_width/image_height "
|
|
101
|
+
"to cross-check them against"
|
|
102
|
+
),
|
|
103
|
+
cell_index=None,
|
|
104
|
+
)
|
|
105
|
+
]
|
|
106
|
+
|
|
107
|
+
errors: list[ValidationError] = []
|
|
108
|
+
for idx, bbox in cells_with_bbox:
|
|
109
|
+
x0, y0, x1, y1 = bbox
|
|
110
|
+
if not (0 <= x0 and x1 <= width):
|
|
111
|
+
errors.append(
|
|
112
|
+
ValidationError(
|
|
113
|
+
invariant="STRICT-BBOX-OUT-OF-BOUNDS",
|
|
114
|
+
message=(f"bbox x-range [{x0}, {x1}] outside [0, {width}] at cell index {idx}"),
|
|
115
|
+
cell_index=idx,
|
|
116
|
+
)
|
|
117
|
+
)
|
|
118
|
+
if not (0 <= y0 and y1 <= height):
|
|
119
|
+
errors.append(
|
|
120
|
+
ValidationError(
|
|
121
|
+
invariant="STRICT-BBOX-OUT-OF-BOUNDS",
|
|
122
|
+
message=(
|
|
123
|
+
f"bbox y-range [{y0}, {y1}] outside [0, {height}] at cell index {idx}"
|
|
124
|
+
),
|
|
125
|
+
cell_index=idx,
|
|
126
|
+
)
|
|
127
|
+
)
|
|
128
|
+
return errors
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------- profile registry ----------
|
|
132
|
+
|
|
133
|
+
# Invariants I-01 .. I-07, in ascending order; the DEFAULT profile runs all
# of them and the stricter profiles extend this tuple.
_DEFAULT_CHECKS: tuple[Check, ...] = (
    check_i01_nrows_ncols_positive,
    check_i02_cell_in_bounds,
    check_i03_span_in_bounds,
    check_i04_grid_exact_cover,
    check_i05_bbox_well_formed,
    check_i06_header_contiguous_top,
    check_i07_tokens_is_tuple,
)

# SPEC §8: LENIENT enforces I-01, I-02, I-03, I-05 only.
_LENIENT_CHECKS: tuple[Check, ...] = (
    check_i01_nrows_ncols_positive,
    check_i02_cell_in_bounds,
    check_i03_span_in_bounds,
    check_i05_bbox_well_formed,
)


# SimpleNamespace exposes the five built-in profiles as ``profiles.NAME``
# without pyright flagging uppercase attributes as ``reportConstantRedefinition``.
# SPEC §8 / ADR 0012: STRICT = DEFAULT + a bbox-in-image cross-check that
# requires image metadata whenever a sample carries bboxes.
# PUBTABNET_2_0 and TABLEFORMER are likewise DEFAULT plus one bbox-presence
# check appended at the end.
profiles = SimpleNamespace(
    LENIENT=Profile("LENIENT", _LENIENT_CHECKS),
    DEFAULT=Profile("DEFAULT", _DEFAULT_CHECKS),
    PUBTABNET_2_0=Profile("PUBTABNET_2_0", _DEFAULT_CHECKS + (_check_pubtabnet_20_bbox,)),
    TABLEFORMER=Profile("TABLEFORMER", _DEFAULT_CHECKS + (_check_tableformer_bbox,)),
    STRICT=Profile("STRICT", _DEFAULT_CHECKS + (_check_strict_bbox_in_image,)),
)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def validate(sample: TableSample, profile: Profile) -> list[ValidationError]:
    """Run every check bundled in *profile* against *sample*.

    Args:
        sample: The table sample to check.
        profile: The bundle of checks to run, in ``profile.checks`` order.

    Returns:
        A flat list of :class:`ValidationError`, concatenated in check order.
        An empty list means the sample is valid under the profile.

    Raises:
        TypeError: If *profile* is not a :class:`Profile` instance
            (SPEC §8 "raise only on programmer error"). Data problems are
            reported through the returned list, never raised here.
    """
    if not isinstance(profile, Profile):  # pyright: ignore[reportUnnecessaryIsInstance]
        msg = f"profile must be a Profile instance, got {type(profile).__name__}"
        raise TypeError(msg)

    # Flatten the per-check result lists while preserving check order.
    return [error for check in profile.checks for error in check(sample)]
|
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tablecodec
|
|
3
|
+
Version: 0.0.18
|
|
4
|
+
Summary: Neutral Internal Representation and Codec registry for image-based table-recognition datasets
|
|
5
|
+
Project-URL: Homepage, https://github.com/hironow/tablecodec
|
|
6
|
+
Project-URL: Repository, https://github.com/hironow/tablecodec
|
|
7
|
+
Project-URL: Issues, https://github.com/hironow/tablecodec/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/hironow/tablecodec/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: hironow <hironow365@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: dataset,doctags,fintabnet,ocr,otsl,pubtabnet,table
|
|
13
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Typing :: Typed
|
|
24
|
+
Requires-Python: >=3.11
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: apted>=1.0.3; extra == 'all'
|
|
27
|
+
Requires-Dist: click>=8.1; extra == 'all'
|
|
28
|
+
Requires-Dist: datasets>=2.19; extra == 'all'
|
|
29
|
+
Requires-Dist: defusedxml>=0.7; extra == 'all'
|
|
30
|
+
Requires-Dist: lxml>=5.0; extra == 'all'
|
|
31
|
+
Provides-Extra: cli
|
|
32
|
+
Requires-Dist: click>=8.1; extra == 'cli'
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: coverage>=7.5; extra == 'dev'
|
|
35
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
36
|
+
Requires-Dist: jsonschema>=4.20; extra == 'dev'
|
|
37
|
+
Requires-Dist: pyright>=1.1.380; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
42
|
+
Provides-Extra: hf
|
|
43
|
+
Requires-Dist: datasets>=2.19; extra == 'hf'
|
|
44
|
+
Requires-Dist: defusedxml>=0.7; extra == 'hf'
|
|
45
|
+
Provides-Extra: teds
|
|
46
|
+
Requires-Dist: apted>=1.0.3; extra == 'teds'
|
|
47
|
+
Requires-Dist: lxml>=5.0; extra == 'teds'
|
|
48
|
+
Description-Content-Type: text/markdown
|
|
49
|
+
|
|
50
|
+
# tablecodec
|
|
51
|
+
|
|
52
|
+
> Neutral Internal Representation + Codec registry for image-based table-recognition datasets.
|
|
53
|
+
|
|
54
|
+
`tablecodec` is a Python library that provides a single, lossless Internal
|
|
55
|
+
Representation (IR) for tables and a registry-based codec layer that translates
|
|
56
|
+
between this IR and the fragmented landscape of public table-recognition
|
|
57
|
+
datasets — PubTabNet, FinTabNet, OTSL, TableFormer, DocTags-tables,
|
|
58
|
+
PubTables-1M, TableBank.
|
|
59
|
+
|
|
60
|
+
- Stdlib-only core. Heavier features (TEDS, CLI) are opt-in extras.
|
|
61
|
+
- Streams large JSONL datasets at constant memory.
|
|
62
|
+
- Self-declared loss analysis between any two codecs.
|
|
63
|
+
|
|
64
|
+
## Status
|
|
65
|
+
|
|
66
|
+
**0.0.18 (pre-alpha).** Not yet published to PyPI. The nine codecs, the TEDS
|
|
67
|
+
metric (`[teds]`), and the STRICT validation profile were all added
|
|
68
|
+
incrementally within the 0.0.x series; a separate `tablecodec-docling` bridge
|
|
69
|
+
codec lives in `packages/` (its own version). The 0.x line makes no
|
|
70
|
+
API-stability promises; the public surface freezes at 1.0 (see
|
|
71
|
+
[docs/spec.md](docs/spec.md) §14). The specification is the source of
|
|
72
|
+
truth. Auto-generated codec / loss tables live at
|
|
73
|
+
[docs/format_support.md](docs/format_support.md) and
|
|
74
|
+
[docs/loss_matrix.md](docs/loss_matrix.md).
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install tablecodec # stdlib-only core
|
|
80
|
+
pip install "tablecodec[cli]" # + command-line interface (click)
|
|
81
|
+
pip install "tablecodec[teds]" # + TEDS similarity metric (apted, lxml)
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Requires Python 3.11+.
|
|
85
|
+
|
|
86
|
+
## Basic usage
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
import tablecodec
|
|
90
|
+
from tablecodec import codecs, validate, profiles, analyze_loss
|
|
91
|
+
from tablecodec.codecs.pubtabnet import PubTabNet20Codec
|
|
92
|
+
|
|
93
|
+
# Register a codec (built-ins self-register through the CLI; in library
|
|
94
|
+
# use you register the ones you need).
|
|
95
|
+
codecs.register(PubTabNet20Codec())
|
|
96
|
+
|
|
97
|
+
# Stream-read a dataset into the neutral IR.
|
|
98
|
+
with open("pubtabnet_val.jsonl", encoding="utf-8") as f:
|
|
99
|
+
for sample in codecs.get("pubtabnet-2.0.0").read(f):
|
|
100
|
+
errors = validate(sample, profile=profiles.DEFAULT)
|
|
101
|
+
if errors:
|
|
102
|
+
print(sample.filename, errors)
|
|
103
|
+
|
|
104
|
+
# Static, data-free loss analysis between two formats.
|
|
105
|
+
report = analyze_loss(source="pubtabnet-2.0.0", target="otsl-1.0.0")
|
|
106
|
+
print(report.round_trip_classification) # "structure-preserving"
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The core has **zero third-party runtime dependencies** (SPEC §13);
|
|
110
|
+
`import tablecodec` works on a bare Python 3.11+.
|
|
111
|
+
|
|
112
|
+
## TEDS similarity (optional)
|
|
113
|
+
|
|
114
|
+
The `[teds]` extra adds a Tree-Edit-Distance based Similarity score between
|
|
115
|
+
two samples. It lives outside the core (it imports `apted`/`lxml`), so import
|
|
116
|
+
it from its submodule:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
from tablecodec.teds import teds
|
|
120
|
+
|
|
121
|
+
score = teds(pred_sample, true_sample) # 0.0 .. 1.0
|
|
122
|
+
struct = teds(pred_sample, true_sample, structure_only=True) # ignore cell text
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## CLI
|
|
126
|
+
|
|
127
|
+
Install with the optional ``[cli]`` extra:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
pip install "tablecodec[cli]"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
tablecodec codecs list
|
|
135
|
+
tablecodec analyze-loss --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
136
|
+
tablecodec validate path/to/dataset.jsonl --codec pubtabnet-2.0.0 --profile DEFAULT
|
|
137
|
+
tablecodec stats path/to/dataset.jsonl --codec pubtabnet-2.0.0 --json
|
|
138
|
+
tablecodec convert in.jsonl out.jsonl --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
139
|
+
tablecodec convert in.jsonl /dev/null --from pubtabnet-2.0.0 --to otsl-1.0.0 --dry-run
|
|
140
|
+
tablecodec diff a.jsonl b.jsonl --codec pubtabnet-2.0.0
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
All commands stream their input; exit codes are non-zero on validation
|
|
144
|
+
failures or diffs (suitable for CI / data pipelines).
|
|
145
|
+
|
|
146
|
+
## End-to-end check against real datasets
|
|
147
|
+
|
|
148
|
+
`scripts/e2e_hf_check.py` streams real datasets through the codecs and
|
|
149
|
+
validates the resulting IR. Every shipped codec gets at least one
|
|
150
|
+
official-corpus check. Three data sources are used:
|
|
151
|
+
|
|
152
|
+
- the Docling OTSL family
|
|
153
|
+
(`docling-project/{PubTabNet,FinTabNet,PubTables-1M,SynthTabNet}_OTSL`)
|
|
154
|
+
— a uniform converted schema that feeds all nine codecs;
|
|
155
|
+
- the **native** first-published PubTabNet annotation
|
|
156
|
+
(`apoidea/pubtabnet-html`) fed unmodified to the `pubtabnet` codecs;
|
|
157
|
+
- the **native** PubTables-1M PASCAL VOC structure annotation
|
|
158
|
+
(`bsmock/pubtables-1m`, download-only) read from a local tar under
|
|
159
|
+
`input/` with the logical grid reconstructed for the `pubtables-1m`
|
|
160
|
+
codec (FinTabNet / TableBank natives stay download-only + Docling-covered).
|
|
161
|
+
|
|
162
|
+
It is **occasional / local-only** (network + multi-GB datasets), not part
|
|
163
|
+
of CI.
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
just e2e-selftest # network-free adapter smoke test
|
|
167
|
+
just e2e 200 # 200 randomly-sampled rows per check (needs [hf] extra)
|
|
168
|
+
uv run --extra hf python scripts/e2e_hf_check.py --dataset apoidea --limit 50
|
|
169
|
+
just e2e-fetch-pubtables1m # download native PubTables-1M VOC (~30MB) into input/
|
|
170
|
+
uv run --extra hf python scripts/e2e_hf_check.py --dataset bsmock --limit 200
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Rows are sampled randomly (streaming shuffle reshuffles shard order), so
|
|
174
|
+
repeated runs progressively cover the multi-hundred-thousand-row corpora.
|
|
175
|
+
Each run prints its `--seed` so a finding can be reproduced; pass
|
|
176
|
+
`--seed N` to fix it or `--no-shuffle` for a deterministic head read.
|
|
177
|
+
The harness reports parse errors and validation findings — e.g. it
|
|
178
|
+
surfaces real upstream rows with geometrically invalid bboxes (I-05) —
|
|
179
|
+
and appends each failed row to `output/e2e_findings/` (gitignored) with
|
|
180
|
+
its full provenance and replayable payload for later audit.
|
|
181
|
+
|
|
182
|
+
See [`docs/adr/0003-e2e-against-docling-otsl-family.md`](docs/adr/0003-e2e-against-docling-otsl-family.md)
|
|
183
|
+
and [`docs/adr/0004-e2e-native-first-published-datasets.md`](docs/adr/0004-e2e-native-first-published-datasets.md)
|
|
184
|
+
for the data-source decisions and the canonical-vs-real-shape caveats.
|
|
185
|
+
|
|
186
|
+
## Documents
|
|
187
|
+
|
|
188
|
+
- [`docs/spec.md`](docs/spec.md) — Specification (the single source of truth).
|
|
189
|
+
- [`docs/glossary.md`](docs/glossary.md) — Precise vocabulary: terms tablecodec
|
|
190
|
+
defines vs. borrows, and the words most likely to be misread (e.g. "loss"
|
|
191
|
+
vs a "degenerate" bbox).
|
|
192
|
+
- [`docs/intent.md`](docs/intent.md) — Implementation brief (milestones, order,
|
|
193
|
+
quality bar).
|
|
194
|
+
- [`CHANGELOG.md`](CHANGELOG.md) — Keep a Changelog format.
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT. See [LICENSE](LICENSE). The OTSL grid-reconstruction logic is
|
|
199
|
+
adapted (with attribution) from the MIT-licensed docling-ibm-models — see
|
|
200
|
+
[THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md).
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
tablecodec/__init__.py,sha256=_U9z_gMXJt-WA6_rOxFhiWtn_G8D8e425Kg1zZQijTc,678
|
|
2
|
+
tablecodec/_invariants.py,sha256=jLaypJX0Z-YSLLN0rAs0bWQbQbtTDl2wO66KH3XnPa4,11054
|
|
3
|
+
tablecodec/cli.py,sha256=iyxzB4VPKDTTQ9FDHp-RmZ4Jk3I1iQf26fcq4nKAiBM,10419
|
|
4
|
+
tablecodec/io.py,sha256=Zyc9Xr-ZC7Ga0SXybW9nNl0fWXhe-F7N-hQjv07X_Mc,2873
|
|
5
|
+
tablecodec/ir.py,sha256=7AQXFVmq56WjuSehlLv3rmUbfhcIatiUg-0LWKhu30w,3683
|
|
6
|
+
tablecodec/loss.py,sha256=PBssNn9EOPw6ttl828c5VKcDtl3upd7EReCD2Sz1c70,4195
|
|
7
|
+
tablecodec/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
tablecodec/teds.py,sha256=FXtscI55jOJHt7bQ84-qCawfA2qOeE-LBKT64q-5YK4,8812
|
|
9
|
+
tablecodec/validate.py,sha256=d_WyQ5i7pwataS4Qb3N6RMBVd-cGrwlG_fSQkafcp5Q,6484
|
|
10
|
+
tablecodec/codecs/__init__.py,sha256=hUOBUiQB7LuF5mWxC5nkacTVYqOoTFxZvG0clVylRF4,3618
|
|
11
|
+
tablecodec/codecs/_base.py,sha256=5EGO5-cYNxlR8rFrtLtroCiy1Vp1OKDwjbXuY7LwnHk,2578
|
|
12
|
+
tablecodec/codecs/_htmltable.py,sha256=wlT9POZ_famRPuSqyS0fSG-rNDBLmn3jfXW0NzlWlZw,15644
|
|
13
|
+
tablecodec/codecs/_otslgrid.py,sha256=jM-g8Cf20s6sC6DGrGhSi4Kdzsns3D_6UHPxXrnEaIA,11559
|
|
14
|
+
tablecodec/codecs/builtins.py,sha256=S0CLZtbBLkQOF_K5FQqXcTGUL2iL82xsYZGi7Iz4Fi8,1255
|
|
15
|
+
tablecodec/codecs/doctags.py,sha256=liBBrJFYuwOKpcEsnxT4cn7ovarqBI81ZJKJMA02suA,8546
|
|
16
|
+
tablecodec/codecs/fintabnet.py,sha256=UYeD_xvAtGNAuZ5PTWfA9lL64RvTjBwPyrnjUMoMg44,2746
|
|
17
|
+
tablecodec/codecs/fintabnet_otsl.py,sha256=0dKhBXTwL9c8F6iRMLqkvqkmpixJGV6WmpjwNA5dMqw,4740
|
|
18
|
+
tablecodec/codecs/otsl.py,sha256=ZBLWu8NMKgiX3-9qoGUdNpcGDcw_8UHInSxVl-8c5nQ,4604
|
|
19
|
+
tablecodec/codecs/pubtables1m.py,sha256=ev3bHSGaoRDcnFBNThFXw5W9baKx_NbJNP3NVbYTR3g,5450
|
|
20
|
+
tablecodec/codecs/pubtabnet.py,sha256=0OtrmrKjSddNJ4S7aH-4qD05D_sZq1MWuTGdsb8j0lM,4424
|
|
21
|
+
tablecodec/codecs/tablebank.py,sha256=KVeKCCWXAnQkfaFIjDSloH1rgEyOQ0ZJ3sflg6MQsxs,2608
|
|
22
|
+
tablecodec/codecs/tableformer.py,sha256=V_WHjP2sl77GCj-9f8gN12PPeC40gJLnEa02jXdt-kg,2823
|
|
23
|
+
tablecodec-0.0.18.dist-info/METADATA,sha256=4qpPkHuoBPUuDqg62G_opJh4wdiWwvGR3yqxEDnNZc8,8356
|
|
24
|
+
tablecodec-0.0.18.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
25
|
+
tablecodec-0.0.18.dist-info/entry_points.txt,sha256=CxI3i14zvY80ATqZIUhuNVYt5a4Yt363HirK237RW2U,51
|
|
26
|
+
tablecodec-0.0.18.dist-info/licenses/LICENSE,sha256=A7Sy6xlibOHoadZDwSczt7tulPwnUmzF_XDQUuJmYr4,1092
|
|
27
|
+
tablecodec-0.0.18.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 hironow and tablecodec contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|