tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tablecodec/teds.py ADDED
@@ -0,0 +1,243 @@
1
+ """TEDS (Tree-Edit-Distance based Similarity) for table samples.
2
+
3
+ TEDS scores the similarity of two tables in ``[0, 1]`` by the normalized
4
+ tree-edit distance between their HTML-DOM trees (Zhong et al., "Image-based
5
+ table recognition: data, model, and evaluation"). ``1.0`` means identical
6
+ structure and content; ``structure_only=True`` ignores cell text (TEDS-Struct).
7
+
8
+ This is the optional ``[teds]`` feature: it imports ``apted`` and ``lxml`` and
9
+ therefore lives OUTSIDE the zero-dependency core. It is never imported by
10
+ ``tablecodec/__init__`` — use ``from tablecodec.teds import teds``.
11
+
12
+ Attribution: the tree construction, the rename-cost rule, and the
13
+ ``1 - dist / max_nodes`` formula are adapted from IBM's PubTabNet reference
14
+ metric (``src/metric.py``, Apache License 2.0, Copyright 2020 IBM,
15
+ peter.zhong@au1.ibm.com). This is NOT a verbatim copy: the IR-native entry
16
+ point, a pure-Python normalized Levenshtein (replacing the ``distance``
17
+ package), and the removal of batching/parallelism are tablecodec's. See
18
+ ``THIRD_PARTY_NOTICES.md`` and ``docs/adr/0011-teds-metric-port.md``.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import Any, cast
24
+
25
+ from apted import APTED, Config # pyright: ignore[reportMissingTypeStubs]
26
+ from lxml import html # pyright: ignore[reportMissingTypeStubs]
27
+
28
+ from tablecodec.ir import GridCell, TableSample
29
+
30
+ __all__ = ["teds", "teds_html"]
31
+
32
+
33
+ # ---------- normalized Levenshtein (pure stdlib; replaces `distance`) ----------
34
+
35
+
36
+ def _levenshtein(a: list[str], b: list[str]) -> int:
37
+ """Edit distance between two token sequences."""
38
+ if a == b:
39
+ return 0
40
+ if not a:
41
+ return len(b)
42
+ if not b:
43
+ return len(a)
44
+ previous = list(range(len(b) + 1))
45
+ for i, ca in enumerate(a, start=1):
46
+ current = [i]
47
+ for j, cb in enumerate(b, start=1):
48
+ cost = 0 if ca == cb else 1
49
+ current.append(min(previous[j] + 1, current[j - 1] + 1, previous[j - 1] + cost))
50
+ previous = current
51
+ return previous[-1]
52
+
53
+
54
+ def _normalized_distance(a: list[str], b: list[str]) -> float:
55
+ """Levenshtein distance scaled to ``[0, 1]`` by the longer sequence."""
56
+ longest = max(len(a), len(b))
57
+ if longest == 0:
58
+ return 0.0
59
+ return _levenshtein(a, b) / longest
60
+
61
+
62
+ # ---------- apted tree + config (adapted from IBM PubTabNet metric.py) ----------
63
+
64
+
65
+ class _TableTree:
66
+ """An apted tree node for one HTML table element."""
67
+
68
+ def __init__(
69
+ self,
70
+ tag: str,
71
+ colspan: int | None,
72
+ rowspan: int | None,
73
+ content: list[str] | None,
74
+ *children: _TableTree,
75
+ ) -> None:
76
+ self.tag = tag
77
+ self.colspan = colspan
78
+ self.rowspan = rowspan
79
+ self.content = content
80
+ self.children: list[_TableTree] = list(children)
81
+
82
+
83
class _CustomConfig(Config):
    """apted cost configuration for comparing :class:`_TableTree` nodes."""

    def children(self, node: _TableTree) -> list[_TableTree]:
        """Return the child nodes apted should descend into."""
        return node.children

    # apted annotates rename as `-> int`, but TEDS uses fractional content
    # costs; apted sums costs as numbers, so a float is correct at runtime.
    def rename(self, node1: _TableTree, node2: _TableTree) -> float:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Cost of relabeling ``node1`` to ``node2``.

        A mismatch in tag or either span costs a full ``1.0``. Matching
        ``td`` nodes cost the normalized edit distance of their content
        tokens; every other matching pair is free.
        """
        same_label = (
            node1.tag == node2.tag
            and node1.colspan == node2.colspan
            and node1.rowspan == node2.rowspan
        )
        if not same_label:
            return 1.0
        if node1.tag != "td":
            return 0.0
        # Two cells with no content (None or empty) are identical.
        if not node1.content and not node2.content:
            return 0.0
        return _normalized_distance(node1.content or [], node2.content or [])
100
+
101
+
102
+ # ---------- untyped third-party boundary (apted + lxml have no stubs) ----------
103
+ #
104
+ # apted and lxml ship no type information, so pyright (strict) cannot type the
105
+ # few lines that touch them. These thin wrappers confine that boundary: each
106
+ # returns a concretely-typed value, so the rest of the module is fully checked.
107
+
108
+
109
def _parse_first_table(doc: str) -> Any:
    """Parse ``doc`` with lxml and return its first ``body/table`` match.

    Comments are dropped by the parser. The XPath ``body/table`` is
    evaluated relative to whatever element ``html.fromstring`` returns;
    ``None`` is returned when it matches nothing.
    """
    comment_free_parser = cast("Any", html.HTMLParser(remove_comments=True))
    root = cast("Any", html.fromstring(doc, parser=comment_free_parser))  # pyright: ignore[reportUnknownMemberType]
    matches = cast("list[Any]", root.xpath("body/table"))
    if not matches:
        return None
    return matches[0]
115
+
116
+
117
+ def _count_descendant_elements(table: Any) -> int:
118
+ """Number of element nodes below `table` (the TEDS denominator term)."""
119
+ return len(cast("list[Any]", table.xpath(".//*")))
120
+
121
+
122
def _tree_edit_distance(tree1: _TableTree, tree2: _TableTree) -> float:
    """Run apted on the two trees with :class:`_CustomConfig` costs.

    apted's result is untyped, so it is coerced to ``float`` here to give
    the rest of the module a concrete return type.
    """
    solver = APTED(tree1, tree2, _CustomConfig())
    return float(cast("Any", solver.compute_edit_distance()))
125
+
126
+
127
+ # ---------- lxml element -> apted tree (adapted from IBM PubTabNet) ----------
128
+
129
+
130
+ def _tokenize(node: Any, tokens: list[str]) -> None:
131
+ """Flatten an element into tokens: char-level text + tag markers."""
132
+ tokens.append(f"<{node.tag}>")
133
+ if node.text is not None:
134
+ tokens.extend(list(node.text))
135
+ for child in node:
136
+ _tokenize(child, tokens)
137
+ if node.tag != "unk":
138
+ tokens.append(f"</{node.tag}>")
139
+ if node.tag != "td" and node.tail is not None:
140
+ tokens.extend(list(node.tail))
141
+
142
+
143
def _load_html_tree(node: Any, *, structure_only: bool) -> _TableTree:
    """Convert an lxml table element into the tree shape apted consumes.

    ``td`` elements become leaves carrying their colspan/rowspan (default
    ``1``) and, unless ``structure_only`` is set, their tokenized content
    with the outer ``<td>``/``</td>`` markers stripped. Every other element
    becomes a label-only node whose children are converted recursively.
    """
    if node.tag != "td":
        branch = _TableTree(str(node.tag), None, None, None)
        for child in node:
            branch.children.append(_load_html_tree(child, structure_only=structure_only))
        return branch

    # NOTE(review): a td is treated as a leaf here — its inline markup lives
    # in `content`, not in child nodes. Confirm against the upstream file.
    cell_tokens: list[str] = []
    if not structure_only:
        _tokenize(node, cell_tokens)
        # Drop the leading "<td>" and trailing "</td>" markers.
        cell_tokens = cell_tokens[1:-1]
    col_span = int(node.attrib.get("colspan", "1"))
    row_span = int(node.attrib.get("rowspan", "1"))
    return _TableTree("td", col_span, row_span, cell_tokens)
163
+
164
+
165
+ # ---------- IR -> HTML ----------
166
+
167
+
168
+ def _is_header_row(cells: list[GridCell]) -> bool:
169
+ return any(cell.role == "header" for cell in cells)
170
+
171
+
172
+ def _cell_html(cell: GridCell) -> str:
173
+ attrs = ""
174
+ if cell.colspan != 1:
175
+ attrs += f' colspan="{cell.colspan}"'
176
+ if cell.rowspan != 1:
177
+ attrs += f' rowspan="{cell.rowspan}"'
178
+ return f"<td{attrs}>{''.join(cell.tokens)}</td>"
179
+
180
+
181
+ def _row_html(cells: list[GridCell]) -> str:
182
+ inner = "".join(_cell_html(cell) for cell in sorted(cells, key=lambda c: c.col))
183
+ return f"<tr>{inner}</tr>"
184
+
185
+
186
+ def _sample_to_html(sample: TableSample) -> str:
187
+ """Render a sample as ``<html><body><table>...`` for TEDS.
188
+
189
+ Cells are grouped by anchor row (HTML rowspan/colspan handle the rest);
190
+ header rows go in ``<thead>``, body rows in ``<tbody>``. All cells render
191
+ as ``<td>`` (PubTabNet convention) so the metric scores their content.
192
+ """
193
+ rows: dict[int, list[GridCell]] = {}
194
+ for cell in sample.cells:
195
+ rows.setdefault(cell.row, []).append(cell)
196
+
197
+ header = [r for r in sorted(rows) if _is_header_row(rows[r])]
198
+ body = [r for r in sorted(rows) if not _is_header_row(rows[r])]
199
+
200
+ parts = ["<html><body><table>"]
201
+ if header:
202
+ parts.append("<thead>")
203
+ parts.extend(_row_html(rows[r]) for r in header)
204
+ parts.append("</thead>")
205
+ if body:
206
+ parts.append("<tbody>")
207
+ parts.extend(_row_html(rows[r]) for r in body)
208
+ parts.append("</tbody>")
209
+ parts.append("</table></body></html>")
210
+ return "".join(parts)
211
+
212
+
213
+ # ---------- public API ----------
214
+
215
+
216
def teds_html(pred_html: str, true_html: str, *, structure_only: bool = False) -> float:
    """TEDS between two HTML table documents.

    Each input is parsed and its first ``body/table`` is scored as
    ``1 - distance / n_nodes``, where ``n_nodes`` is the larger of the two
    tables' descendant-element counts.

    Args:
        pred_html: Predicted table, as an HTML document string.
        true_html: Ground-truth table, as an HTML document string.
        structure_only: When true, cell text is ignored (TEDS-Struct).

    Returns:
        ``0.0`` for empty or whitespace-only input and for HTML with no
        ``body/table`` (the canonical convention); ``1.0`` when both tables
        have no descendant elements; otherwise the TEDS score.
    """
    # A whitespace-only document is as empty as "". Letting it through would
    # make lxml raise ParserError("Document is empty") instead of scoring 0.0.
    for document in (pred_html, true_html):
        if not document or not document.strip():
            return 0.0

    pred_table = _parse_first_table(pred_html)
    true_table = _parse_first_table(true_html)
    if pred_table is None or true_table is None:
        return 0.0

    n_nodes = max(_count_descendant_elements(pred_table), _count_descendant_elements(true_table))
    if n_nodes == 0:
        # Two childless <table> elements: nothing to edit, so they are identical.
        return 1.0

    tree_pred = _load_html_tree(pred_table, structure_only=structure_only)
    tree_true = _load_html_tree(true_table, structure_only=structure_only)
    distance = _tree_edit_distance(tree_pred, tree_true)
    return 1.0 - distance / n_nodes
235
+
236
+
237
def teds(pred: TableSample, true: TableSample, *, structure_only: bool = False) -> float:
    """TEDS between two :class:`TableSample`s.

    Both samples go through the same HTML renderer before being scored by
    :func:`teds_html`, so the result does not depend on which codec each
    sample originally came from. ``structure_only`` is forwarded unchanged.
    """
    pred_document = _sample_to_html(pred)
    true_document = _sample_to_html(true)
    return teds_html(pred_document, true_document, structure_only=structure_only)
tablecodec/validate.py ADDED
@@ -0,0 +1,185 @@
1
+ """Validation entry-point and named profiles.
2
+
3
+ SPEC §8: a user explicitly opts into the strictness they need. Five
4
+ profiles ship: ``LENIENT``, ``DEFAULT``, ``PUBTABNET_2_0``, ``TABLEFORMER``,
5
+ ``STRICT``. Custom profiles can be constructed by composing the
6
+ ``check_iXX`` functions in :mod:`tablecodec._invariants`.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Callable
12
+ from dataclasses import dataclass, field
13
+ from types import SimpleNamespace
14
+
15
+ from tablecodec._invariants import (
16
+ ValidationError,
17
+ check_i01_nrows_ncols_positive,
18
+ check_i02_cell_in_bounds,
19
+ check_i03_span_in_bounds,
20
+ check_i04_grid_exact_cover,
21
+ check_i05_bbox_well_formed,
22
+ check_i06_header_contiguous_top,
23
+ check_i07_tokens_is_tuple,
24
+ )
25
+ from tablecodec.ir import TableSample
26
+
27
+ __all__ = ["Profile", "ValidationError", "profiles", "validate"]
28
+
29
+ Check = Callable[[TableSample], list[ValidationError]]
30
+
31
+
32
+ @dataclass(frozen=True, slots=True)
33
+ class Profile:
34
+ """A named bundle of invariant checks.
35
+
36
+ Attributes:
37
+ name: Human-visible profile identifier.
38
+ checks: Ordered tuple of check functions. Order determines the
39
+ order of errors in the returned list (lower-numbered
40
+ invariants first, by convention).
41
+ """
42
+
43
+ name: str
44
+ checks: tuple[Check, ...] = field(default_factory=tuple)
45
+
46
+
47
+ # ---------- profile-specific extra checks ----------
48
+
49
+
50
+ def _check_pubtabnet_20_bbox(sample: TableSample) -> list[ValidationError]:
51
+ """SPEC §8 pubtabnet-2.0 profile: non-empty cells must have bbox."""
52
+ errors: list[ValidationError] = []
53
+ for idx, cell in enumerate(sample.cells):
54
+ if cell.tokens and cell.bbox is None:
55
+ errors.append(
56
+ ValidationError(
57
+ invariant="PUBTABNET-2.0-BBOX",
58
+ message=f"non-empty cell index {idx} is missing bbox",
59
+ cell_index=idx,
60
+ )
61
+ )
62
+ return errors
63
+
64
+
65
+ def _check_tableformer_bbox(sample: TableSample) -> list[ValidationError]:
66
+ """SPEC §8 tableformer profile: every cell (even empty) must have bbox."""
67
+ errors: list[ValidationError] = []
68
+ for idx, cell in enumerate(sample.cells):
69
+ if cell.bbox is None:
70
+ errors.append(
71
+ ValidationError(
72
+ invariant="TABLEFORMER-BBOX",
73
+ message=f"cell index {idx} is missing bbox",
74
+ cell_index=idx,
75
+ )
76
+ )
77
+ return errors
78
+
79
+
80
+ def _check_strict_bbox_in_image(sample: TableSample) -> list[ValidationError]:
81
+ """SPEC §8 strict profile / ADR 0012: cross-check bbox vs image dimensions.
82
+
83
+ Semantics (option C): a bbox-free sample needs no image metadata. If any
84
+ cell carries a bbox, the sample MUST declare ``image_width`` and
85
+ ``image_height`` (else the coordinates cannot be bound-checked), and every
86
+ bbox must lie within the image rectangle ``0 <= x0 < x1 <= width`` and
87
+ ``0 <= y0 < y1 <= height`` (upper bound inclusive — a bbox may touch the
88
+ image edge).
89
+ """
90
+ cells_with_bbox = [(idx, c.bbox) for idx, c in enumerate(sample.cells) if c.bbox is not None]
91
+ if not cells_with_bbox:
92
+ return []
93
+
94
+ width, height = sample.image_width, sample.image_height
95
+ if width is None or height is None:
96
+ return [
97
+ ValidationError(
98
+ invariant="STRICT-IMAGE-METADATA",
99
+ message=(
100
+ "sample carries cell bboxes but no image_width/image_height "
101
+ "to cross-check them against"
102
+ ),
103
+ cell_index=None,
104
+ )
105
+ ]
106
+
107
+ errors: list[ValidationError] = []
108
+ for idx, bbox in cells_with_bbox:
109
+ x0, y0, x1, y1 = bbox
110
+ if not (0 <= x0 and x1 <= width):
111
+ errors.append(
112
+ ValidationError(
113
+ invariant="STRICT-BBOX-OUT-OF-BOUNDS",
114
+ message=(f"bbox x-range [{x0}, {x1}] outside [0, {width}] at cell index {idx}"),
115
+ cell_index=idx,
116
+ )
117
+ )
118
+ if not (0 <= y0 and y1 <= height):
119
+ errors.append(
120
+ ValidationError(
121
+ invariant="STRICT-BBOX-OUT-OF-BOUNDS",
122
+ message=(
123
+ f"bbox y-range [{y0}, {y1}] outside [0, {height}] at cell index {idx}"
124
+ ),
125
+ cell_index=idx,
126
+ )
127
+ )
128
+ return errors
129
+
130
+
131
+ # ---------- profile registry ----------
132
+
133
# The seven core invariants I-01 .. I-07, in numeric order.
_DEFAULT_CHECKS: tuple[Check, ...] = (
    check_i01_nrows_ncols_positive,
    check_i02_cell_in_bounds,
    check_i03_span_in_bounds,
    check_i04_grid_exact_cover,
    check_i05_bbox_well_formed,
    check_i06_header_contiguous_top,
    check_i07_tokens_is_tuple,
)

# SPEC §8: LENIENT enforces I-01, I-02, I-03, I-05 only.
_LENIENT_CHECKS: tuple[Check, ...] = (
    check_i01_nrows_ncols_positive,
    check_i02_cell_in_bounds,
    check_i03_span_in_bounds,
    check_i05_bbox_well_formed,
)


# SimpleNamespace exposes the five built-in profiles as ``profiles.NAME``
# without pyright flagging uppercase attributes as ``reportConstantRedefinition``.
# PUBTABNET_2_0, TABLEFORMER and STRICT each run the DEFAULT checks followed
# by one profile-specific check.
# SPEC §8 / ADR 0012: STRICT = DEFAULT + a bbox-in-image cross-check that
# requires image metadata whenever a sample carries bboxes.
profiles = SimpleNamespace(
    LENIENT=Profile(name="LENIENT", checks=_LENIENT_CHECKS),
    DEFAULT=Profile(name="DEFAULT", checks=_DEFAULT_CHECKS),
    PUBTABNET_2_0=Profile(
        name="PUBTABNET_2_0",
        checks=_DEFAULT_CHECKS + (_check_pubtabnet_20_bbox,),
    ),
    TABLEFORMER=Profile(
        name="TABLEFORMER",
        checks=_DEFAULT_CHECKS + (_check_tableformer_bbox,),
    ),
    STRICT=Profile(
        name="STRICT",
        checks=_DEFAULT_CHECKS + (_check_strict_bbox_in_image,),
    ),
)
169
+
170
+
171
def validate(sample: TableSample, profile: Profile) -> list[ValidationError]:
    """Run every check bundled in *profile* against *sample*.

    Returns:
        A flat list of :class:`ValidationError`, concatenated in the order
        the profile lists its checks. An empty list means the sample is
        valid under that profile.

    Raises:
        TypeError: If *profile* is not a :class:`Profile` instance
            (SPEC §8 "raise only on programmer error").
    """
    if not isinstance(profile, Profile):  # pyright: ignore[reportUnnecessaryIsInstance]
        msg = f"profile must be a Profile instance, got {type(profile).__name__}"
        raise TypeError(msg)

    return [error for check in profile.checks for error in check(sample)]
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.4
2
+ Name: tablecodec
3
+ Version: 0.0.18
4
+ Summary: Neutral Internal Representation and Codec registry for image-based table-recognition datasets
5
+ Project-URL: Homepage, https://github.com/hironow/tablecodec
6
+ Project-URL: Repository, https://github.com/hironow/tablecodec
7
+ Project-URL: Issues, https://github.com/hironow/tablecodec/issues
8
+ Project-URL: Changelog, https://github.com/hironow/tablecodec/blob/main/CHANGELOG.md
9
+ Author-email: hironow <hironow365@gmail.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: dataset,doctags,fintabnet,ocr,otsl,pubtabnet,table
13
+ Classifier: Development Status :: 2 - Pre-Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Scientific/Engineering :: Image Recognition
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.11
25
+ Provides-Extra: all
26
+ Requires-Dist: apted>=1.0.3; extra == 'all'
27
+ Requires-Dist: click>=8.1; extra == 'all'
28
+ Requires-Dist: datasets>=2.19; extra == 'all'
29
+ Requires-Dist: defusedxml>=0.7; extra == 'all'
30
+ Requires-Dist: lxml>=5.0; extra == 'all'
31
+ Provides-Extra: cli
32
+ Requires-Dist: click>=8.1; extra == 'cli'
33
+ Provides-Extra: dev
34
+ Requires-Dist: coverage>=7.5; extra == 'dev'
35
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
36
+ Requires-Dist: jsonschema>=4.20; extra == 'dev'
37
+ Requires-Dist: pyright>=1.1.380; extra == 'dev'
38
+ Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
39
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
40
+ Requires-Dist: pytest>=8.0; extra == 'dev'
41
+ Requires-Dist: ruff>=0.6; extra == 'dev'
42
+ Provides-Extra: hf
43
+ Requires-Dist: datasets>=2.19; extra == 'hf'
44
+ Requires-Dist: defusedxml>=0.7; extra == 'hf'
45
+ Provides-Extra: teds
46
+ Requires-Dist: apted>=1.0.3; extra == 'teds'
47
+ Requires-Dist: lxml>=5.0; extra == 'teds'
48
+ Description-Content-Type: text/markdown
49
+
50
+ # tablecodec
51
+
52
+ > Neutral Internal Representation + Codec registry for image-based table-recognition datasets.
53
+
54
+ `tablecodec` is a Python library that provides a single, lossless Internal
55
+ Representation (IR) for tables and a registry-based codec layer that translates
56
+ between this IR and the fragmented landscape of public table-recognition
57
+ datasets — PubTabNet, FinTabNet, OTSL, TableFormer, DocTags-tables,
58
+ PubTables-1M, TableBank.
59
+
60
+ - Stdlib-only core. Heavier features (TEDS, CLI) are opt-in extras.
61
+ - Streams large JSONL datasets at constant memory.
62
+ - Self-declared loss analysis between any two codecs.
63
+
64
+ ## Status
65
+
66
+ **0.0.18 (pre-alpha).** Not yet published to PyPI. The nine codecs, the TEDS
67
+ metric (`[teds]`), and the STRICT validation profile were all added
68
+ incrementally within the 0.0.x series; a separate `tablecodec-docling` bridge
69
+ codec lives in `packages/` (its own version). The 0.x line makes no
70
+ API-stability promises; the public surface freezes at 1.0 (see
71
+ [docs/spec.md](docs/spec.md) §14). The specification is the source of
72
+ truth. Auto-generated codec / loss tables live at
73
+ [docs/format_support.md](docs/format_support.md) and
74
+ [docs/loss_matrix.md](docs/loss_matrix.md).
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ pip install tablecodec # stdlib-only core
80
+ pip install "tablecodec[cli]" # + command-line interface (click)
81
+ pip install "tablecodec[teds]" # + TEDS similarity metric (apted, lxml)
82
+ ```
83
+
84
+ Requires Python 3.11+.
85
+
86
+ ## Basic usage
87
+
88
+ ```python
89
+ import tablecodec
90
+ from tablecodec import codecs, validate, profiles, analyze_loss
91
+ from tablecodec.codecs.pubtabnet import PubTabNet20Codec
92
+
93
+ # Register a codec (built-ins self-register through the CLI; in library
94
+ # use you register the ones you need).
95
+ codecs.register(PubTabNet20Codec())
96
+
97
+ # Stream-read a dataset into the neutral IR.
98
+ with open("pubtabnet_val.jsonl", encoding="utf-8") as f:
99
+ for sample in codecs.get("pubtabnet-2.0.0").read(f):
100
+ errors = validate(sample, profile=profiles.DEFAULT)
101
+ if errors:
102
+ print(sample.filename, errors)
103
+
104
+ # Static, data-free loss analysis between two formats.
105
+ report = analyze_loss(source="pubtabnet-2.0.0", target="otsl-1.0.0")
106
+ print(report.round_trip_classification) # "structure-preserving"
107
+ ```
108
+
109
+ The core has **zero third-party runtime dependencies** (SPEC §13);
110
+ `import tablecodec` works on a bare Python 3.11+.
111
+
112
+ ## TEDS similarity (optional)
113
+
114
+ The `[teds]` extra adds a Tree-Edit-Distance based Similarity score between
115
+ two samples. It lives outside the core (it imports `apted`/`lxml`), so import
116
+ it from its submodule:
117
+
118
+ ```python
119
+ from tablecodec.teds import teds
120
+
121
+ score = teds(pred_sample, true_sample) # 0.0 .. 1.0
122
+ struct = teds(pred_sample, true_sample, structure_only=True) # ignore cell text
123
+ ```
124
+
125
+ ## CLI
126
+
127
+ Install with the optional ``[cli]`` extra:
128
+
129
+ ```bash
130
+ pip install "tablecodec[cli]"
131
+ ```
132
+
133
+ ```bash
134
+ tablecodec codecs list
135
+ tablecodec analyze-loss --from pubtabnet-2.0.0 --to otsl-1.0.0
136
+ tablecodec validate path/to/dataset.jsonl --codec pubtabnet-2.0.0 --profile DEFAULT
137
+ tablecodec stats path/to/dataset.jsonl --codec pubtabnet-2.0.0 --json
138
+ tablecodec convert in.jsonl out.jsonl --from pubtabnet-2.0.0 --to otsl-1.0.0
139
+ tablecodec convert in.jsonl /dev/null --from pubtabnet-2.0.0 --to otsl-1.0.0 --dry-run
140
+ tablecodec diff a.jsonl b.jsonl --codec pubtabnet-2.0.0
141
+ ```
142
+
143
+ All commands stream their input; exit codes are non-zero on validation
144
+ failures or diffs (suitable for CI / data pipelines).
145
+
146
+ ## End-to-end check against real datasets
147
+
148
+ `scripts/e2e_hf_check.py` streams real datasets through the codecs and
149
+ validates the resulting IR. Every shipped codec gets at least one
150
+ official-corpus check. Three data sources are used:
151
+
152
+ - the Docling OTSL family
153
+ (`docling-project/{PubTabNet,FinTabNet,PubTables-1M,SynthTabNet}_OTSL`)
154
+ — a uniform converted schema that feeds all nine codecs;
155
+ - the **native** first-published PubTabNet annotation
156
+ (`apoidea/pubtabnet-html`) fed unmodified to the `pubtabnet` codecs;
157
+ - the **native** PubTables-1M PASCAL VOC structure annotation
158
+ (`bsmock/pubtables-1m`, download-only) read from a local tar under
159
+ `input/` with the logical grid reconstructed for the `pubtables-1m`
160
+ codec (FinTabNet / TableBank natives stay download-only + Docling-covered).
161
+
162
+ It is **occasional / local-only** (network + multi-GB datasets), not part
163
+ of CI.
164
+
165
+ ```bash
166
+ just e2e-selftest # network-free adapter smoke test
167
+ just e2e 200 # 200 randomly-sampled rows per check (needs [hf] extra)
168
+ uv run --extra hf python scripts/e2e_hf_check.py --dataset apoidea --limit 50
169
+ just e2e-fetch-pubtables1m # download native PubTables-1M VOC (~30MB) into input/
170
+ uv run --extra hf python scripts/e2e_hf_check.py --dataset bsmock --limit 200
171
+ ```
172
+
173
+ Rows are sampled randomly (streaming shuffle reshuffles shard order), so
174
+ repeated runs progressively cover the multi-hundred-thousand-row corpora.
175
+ Each run prints its `--seed` so a finding can be reproduced; pass
176
+ `--seed N` to fix it or `--no-shuffle` for a deterministic head read.
177
+ The harness reports parse errors and validation findings — e.g. it
178
+ surfaces real upstream rows with geometrically invalid bboxes (I-05) —
179
+ and appends each failed row to `output/e2e_findings/` (gitignored) with
180
+ its full provenance and replayable payload for later audit.
181
+
182
+ See [`docs/adr/0003-e2e-against-docling-otsl-family.md`](docs/adr/0003-e2e-against-docling-otsl-family.md)
183
+ and [`docs/adr/0004-e2e-native-first-published-datasets.md`](docs/adr/0004-e2e-native-first-published-datasets.md)
184
+ for the data-source decisions and the canonical-vs-real-shape caveats.
185
+
186
+ ## Documents
187
+
188
+ - [`docs/spec.md`](docs/spec.md) — Specification (the single source of truth).
189
+ - [`docs/glossary.md`](docs/glossary.md) — Precise vocabulary: terms tablecodec
190
+ defines vs. borrows, and the words most likely to be misread (e.g. "loss"
191
+ vs a "degenerate" bbox).
192
+ - [`docs/intent.md`](docs/intent.md) — Implementation brief (milestones, order,
193
+ quality bar).
194
+ - [`CHANGELOG.md`](CHANGELOG.md) — Keep a Changelog format.
195
+
196
+ ## License
197
+
198
+ MIT. See [LICENSE](LICENSE). The OTSL grid-reconstruction logic is
199
+ adapted (with attribution) from the MIT-licensed docling-ibm-models — see
200
+ [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md).
@@ -0,0 +1,27 @@
1
+ tablecodec/__init__.py,sha256=_U9z_gMXJt-WA6_rOxFhiWtn_G8D8e425Kg1zZQijTc,678
2
+ tablecodec/_invariants.py,sha256=jLaypJX0Z-YSLLN0rAs0bWQbQbtTDl2wO66KH3XnPa4,11054
3
+ tablecodec/cli.py,sha256=iyxzB4VPKDTTQ9FDHp-RmZ4Jk3I1iQf26fcq4nKAiBM,10419
4
+ tablecodec/io.py,sha256=Zyc9Xr-ZC7Ga0SXybW9nNl0fWXhe-F7N-hQjv07X_Mc,2873
5
+ tablecodec/ir.py,sha256=7AQXFVmq56WjuSehlLv3rmUbfhcIatiUg-0LWKhu30w,3683
6
+ tablecodec/loss.py,sha256=PBssNn9EOPw6ttl828c5VKcDtl3upd7EReCD2Sz1c70,4195
7
+ tablecodec/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
+ tablecodec/teds.py,sha256=FXtscI55jOJHt7bQ84-qCawfA2qOeE-LBKT64q-5YK4,8812
9
+ tablecodec/validate.py,sha256=d_WyQ5i7pwataS4Qb3N6RMBVd-cGrwlG_fSQkafcp5Q,6484
10
+ tablecodec/codecs/__init__.py,sha256=hUOBUiQB7LuF5mWxC5nkacTVYqOoTFxZvG0clVylRF4,3618
11
+ tablecodec/codecs/_base.py,sha256=5EGO5-cYNxlR8rFrtLtroCiy1Vp1OKDwjbXuY7LwnHk,2578
12
+ tablecodec/codecs/_htmltable.py,sha256=wlT9POZ_famRPuSqyS0fSG-rNDBLmn3jfXW0NzlWlZw,15644
13
+ tablecodec/codecs/_otslgrid.py,sha256=jM-g8Cf20s6sC6DGrGhSi4Kdzsns3D_6UHPxXrnEaIA,11559
14
+ tablecodec/codecs/builtins.py,sha256=S0CLZtbBLkQOF_K5FQqXcTGUL2iL82xsYZGi7Iz4Fi8,1255
15
+ tablecodec/codecs/doctags.py,sha256=liBBrJFYuwOKpcEsnxT4cn7ovarqBI81ZJKJMA02suA,8546
16
+ tablecodec/codecs/fintabnet.py,sha256=UYeD_xvAtGNAuZ5PTWfA9lL64RvTjBwPyrnjUMoMg44,2746
17
+ tablecodec/codecs/fintabnet_otsl.py,sha256=0dKhBXTwL9c8F6iRMLqkvqkmpixJGV6WmpjwNA5dMqw,4740
18
+ tablecodec/codecs/otsl.py,sha256=ZBLWu8NMKgiX3-9qoGUdNpcGDcw_8UHInSxVl-8c5nQ,4604
19
+ tablecodec/codecs/pubtables1m.py,sha256=ev3bHSGaoRDcnFBNThFXw5W9baKx_NbJNP3NVbYTR3g,5450
20
+ tablecodec/codecs/pubtabnet.py,sha256=0OtrmrKjSddNJ4S7aH-4qD05D_sZq1MWuTGdsb8j0lM,4424
21
+ tablecodec/codecs/tablebank.py,sha256=KVeKCCWXAnQkfaFIjDSloH1rgEyOQ0ZJ3sflg6MQsxs,2608
22
+ tablecodec/codecs/tableformer.py,sha256=V_WHjP2sl77GCj-9f8gN12PPeC40gJLnEa02jXdt-kg,2823
23
+ tablecodec-0.0.18.dist-info/METADATA,sha256=4qpPkHuoBPUuDqg62G_opJh4wdiWwvGR3yqxEDnNZc8,8356
24
+ tablecodec-0.0.18.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
25
+ tablecodec-0.0.18.dist-info/entry_points.txt,sha256=CxI3i14zvY80ATqZIUhuNVYt5a4Yt363HirK237RW2U,51
26
+ tablecodec-0.0.18.dist-info/licenses/LICENSE,sha256=A7Sy6xlibOHoadZDwSczt7tulPwnUmzF_XDQUuJmYr4,1092
27
+ tablecodec-0.0.18.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ tablecodec = tablecodec.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 hironow and tablecodec contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.