tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,318 @@
1
+ """Shared OTSL grid machinery for codec implementations.
2
+
3
+ OTSL (Lysak et al., ICDAR 2023, arXiv 2305.03393) and the DocTags table
4
+ subset (IBM Granite-Docling) both encode table structure with the same
5
+ five-token cell vocabulary plus a row separator:
6
+
7
+ - ``fcel`` filled-cell anchor
8
+ - ``ecel`` empty-cell anchor
9
+ - ``lcel`` left-merged continuation (extends the anchor's colspan)
10
+ - ``ucel`` up-merged continuation (extends the anchor's rowspan)
11
+ - ``xcel`` cross-merged continuation (extends both)
12
+ - ``nl`` newline / row separator
13
+
14
+ This module owns the structure↔grid conversion so OTSL and DocTags do
15
+ not duplicate it. DocTags additionally interleaves location and content
16
+ tokens, which it strips before calling :func:`build_anchors`, and
17
+ re-inserts when serialising from :func:`build_token_grid`.
18
+
19
+ The grid-reconstruction logic in :func:`build_anchors` (the anchor-centric
20
+ scan, the ``check_right``/``check_down`` span runs, and the 2D-span
21
+ registry) is adapted from docling-ibm-models' ``otsl_to_html``:
22
+
23
+ https://github.com/docling-project/docling-ibm-models
24
+ docling_ibm_models/tableformer/otsl.py
25
+ Copyright (c) 2024 International Business Machines
26
+ Licensed under the MIT License.
27
+
28
+ It is reimplemented here for the neutral IR (it emits ``GridCell`` spans
29
+ rather than HTML strings) and carries no third-party imports. See
30
+ THIRD_PARTY_NOTICES.md and docs/adr/0005-port-otsl-reconstruction.md.
31
+
32
+ Stdlib-only (SPEC §13).
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from dataclasses import dataclass
38
+ from typing import cast
39
+
40
+ from tablecodec.ir import GridCell, TableSample
41
+
42
+ __all__ = [
43
+ "ANCHOR_TOKENS",
44
+ "CELL_TOKENS",
45
+ "CONTINUATION_TOKENS",
46
+ "VALID_TOKENS",
47
+ "AnchorPlacement",
48
+ "build_anchors",
49
+ "build_token_grid",
50
+ "cells_to_otsl",
51
+ "ensure_square",
52
+ "otsl_to_cells",
53
+ "split_rows",
54
+ ]
55
+
56
# Tokens that start a cell: filled (``fcel``) or empty (``ecel``).
ANCHOR_TOKENS = frozenset(("fcel", "ecel"))
# Tokens that extend an existing cell leftwards, upwards, or both.
CONTINUATION_TOKENS = frozenset(("lcel", "ucel", "xcel"))
# Every token that occupies one grid position.
CELL_TOKENS = ANCHOR_TOKENS.union(CONTINUATION_TOKENS)
# The full structural vocabulary: cell tokens plus the row separator.
VALID_TOKENS = CELL_TOKENS.union(("nl",))
60
+
61
+
62
+ @dataclass(slots=True)
63
+ class AnchorPlacement:
64
+ """One ``fcel`` / ``ecel`` anchor mapped to its grid coordinates."""
65
+
66
+ row: int
67
+ col: int
68
+ rowspan: int = 1
69
+ colspan: int = 1
70
+ is_empty: bool = False
71
+
72
+
73
def split_rows(tokens: list[str]) -> list[list[str]]:
    """Group a flat cell-token stream into rows, using ``nl`` as the delimiter.

    Every token must belong to :data:`VALID_TOKENS`; anything else raises
    ``ValueError``. Tokens after the last ``nl`` form a final row only when
    there is at least one of them, so a trailing ``nl`` adds no empty row.
    Empty rows elsewhere (consecutive ``nl`` tokens) are kept.
    """
    grouped: list[list[str]] = []
    current: list[str] = []
    for token in tokens:
        if token not in VALID_TOKENS:
            msg = f"unknown OTSL token {token!r}"
            raise ValueError(msg)
        if token != "nl":
            current.append(token)
            continue
        # Row separator: close the row in progress and start a fresh one.
        grouped.append(current)
        current = []
    if current:
        grouped.append(current)
    return grouped
91
+
92
+
93
def ensure_square(rows: list[list[str]]) -> int:
    """Return the width shared by every row.

    An empty ``rows`` list has width 0. Raises ``ValueError`` listing the
    distinct widths (ascending) when the rows do not all have the same length.
    """
    if not rows:
        return 0
    distinct = sorted({len(row) for row in rows})
    if len(distinct) > 1:
        msg = f"OTSL square-table assumption violated; row widths = {distinct}"
        raise ValueError(msg)
    return distinct[0]
102
+
103
+
104
def _normalize_edge_continuations(rows: list[list[str]], nrows: int, ncols: int) -> list[list[str]]:
    """Repair structurally-impossible continuations at the grid edges.

    A continuation cannot merge in a direction that has no neighbour:
    row 0 has nothing above, column 0 has nothing to the left. Real
    encoders (and the docling OTSL decoder's "structure error correction")
    emit ``xcel``/``ucel`` in row 0 and ``xcel``/``lcel`` in column 0 that
    must be read as the only possible merge:

    - row 0: ``ucel``/``xcel`` -> ``lcel`` (can only merge left).
    - col 0: ``lcel``/``xcel`` -> ``ucel`` (can only merge up).

    The row-0 pass runs first, so a continuation at (0, 0) ends up as
    ``ucel``.

    Args:
        rows: Rectangular grid of cell tokens, one inner list per row.
        nrows: Number of rows in ``rows``.
        ncols: Common width of the rows.

    Returns:
        A repaired copy of the grid; the caller's rows are not mutated.
        A grid with no rows or no columns is copied unchanged.
    """
    grid = [list(row) for row in rows]
    # A zero-width grid (e.g. a stream made only of ``nl`` tokens) or an
    # empty one has no edge cells; indexing ``grid[0]`` / ``grid[r][0]``
    # there would raise IndexError.
    if nrows == 0 or ncols == 0:
        return grid
    top = grid[0]
    for c in range(ncols):
        if top[c] in {"ucel", "xcel"}:
            top[c] = "lcel"
    for r in range(nrows):
        if grid[r][0] in {"lcel", "xcel"}:
            grid[r][0] = "ucel"
    return grid
126
+
127
+
128
+ @dataclass(slots=True)
129
+ class _OtslReader:
130
+ """Anchor-centric OTSL grid reader (logic adapted from docling, see header).
131
+
132
+ ``registry`` mirrors docling's ``registry_2d_span``: cells already
133
+ claimed by a 2D (``xcel``) span, so a later anchor cannot re-claim them.
134
+ """
135
+
136
+ grid: list[list[str]]
137
+ nrows: int
138
+ ncols: int
139
+ registry: list[list[bool]]
140
+
141
+ def _check_right(self, r: int, c: int) -> int:
142
+ # colspan: extend right over horizontal continuations (lcel/xcel);
143
+ # stop at an anchor, an up-merge, or the edge (docling check_right).
144
+ # Also stop at a cell already claimed by a 2D span above: an `xcel`
145
+ # there belongs to that span, not to this row's run — counting it
146
+ # would overlap (the irregular case real SynthTabNet rows hit).
147
+ dist = 1
148
+ x = c
149
+ while (
150
+ x + 1 < self.ncols
151
+ and self.grid[r][x + 1] in {"lcel", "xcel"}
152
+ and not self.registry[r][x + 1]
153
+ ):
154
+ x += 1
155
+ dist += 1
156
+ return dist
157
+
158
+ def _check_down(self, r: int, c: int) -> int:
159
+ # rowspan: extend down over vertical continuations (ucel/xcel);
160
+ # stop at an anchor, a left-merge, the edge, or a cell already
161
+ # claimed by another 2D span (symmetric to _check_right).
162
+ dist = 1
163
+ y = r
164
+ while (
165
+ y + 1 < self.nrows
166
+ and self.grid[y + 1][c] in {"ucel", "xcel"}
167
+ and not self.registry[y + 1][c]
168
+ ):
169
+ y += 1
170
+ dist += 1
171
+ return dist
172
+
173
+ def _claim_2d(self, r: int, c: int, rowspan: int, colspan: int) -> bool:
174
+ # Mark the rectangle in the registry iff none of it is already
175
+ # claimed (docling's double-count guard); returns whether it claimed.
176
+ for dr in range(rowspan):
177
+ for dc in range(colspan):
178
+ if self.registry[r + dr][c + dc]:
179
+ return False
180
+ for dr in range(rowspan):
181
+ for dc in range(colspan):
182
+ self.registry[r + dr][c + dc] = True
183
+ return True
184
+
185
+ def span_of(self, r: int, c: int) -> tuple[int, int]:
186
+ """Compute (rowspan, colspan) for the anchor at (r, c) from its neighbours."""
187
+ colspan = rowspan = 1
188
+ right = self.grid[r][c + 1] if c + 1 < self.ncols else ""
189
+ below = self.grid[r + 1][c] if r + 1 < self.nrows else ""
190
+ if right == "lcel":
191
+ colspan = self._check_right(r, c)
192
+ if below == "ucel":
193
+ rowspan = self._check_down(r, c)
194
+ if right == "xcel":
195
+ xr = self._check_right(r, c)
196
+ xd = self._check_down(r, c)
197
+ if self._claim_2d(r, c, xd, xr):
198
+ colspan, rowspan = xr, xd
199
+ return rowspan, colspan
200
+
201
+
202
def build_anchors(rows: list[list[str]]) -> tuple[int, int, list[AnchorPlacement]]:
    """Scan the token grid and return ``(nrows, ncols, anchors)``.

    Anchor-centric reconstruction (adapted from docling's ``otsl_to_html``,
    see the module header). Each ``fcel`` / ``ecel`` is an anchor, and its
    span is derived from the continuation tokens next to it: a run of
    ``lcel`` to the right sets the colspan, a run of ``ucel`` below sets the
    rowspan, and an ``xcel`` to the right sets a 2D span that the reader's
    registry protects against double-claiming.

    Edge continuations are normalised on a copy of ``rows`` first.
    Continuation tokens produce no entry. Anchors come back in row-major
    order, which is the order the source ``cells[]`` appear in.

    Raises:
        ValueError: if the rows do not all have the same width.
    """
    height = len(rows)
    width = ensure_square(rows)
    reader = _OtslReader(
        grid=_normalize_edge_continuations(rows, height, width),
        nrows=height,
        ncols=width,
        registry=[[False] * width for _ in rows],
    )
    placements: list[AnchorPlacement] = []
    for r, row_tokens in enumerate(reader.grid):
        for c, tok in enumerate(row_tokens):
            if tok in ANCHOR_TOKENS:
                rowspan, colspan = reader.span_of(r, c)
                placements.append(
                    AnchorPlacement(
                        row=r,
                        col=c,
                        rowspan=rowspan,
                        colspan=colspan,
                        is_empty=(tok == "ecel"),
                    )
                )
    return height, width, placements
234
+
235
+
236
def build_token_grid(sample: TableSample) -> tuple[list[list[str]], list[GridCell]]:
    """Project a sample's cells onto a 2D grid of OTSL cell tokens.

    Returns ``(grid, anchors)``. ``anchors`` is ``sample.cells`` sorted by
    ``(row, col)``, so callers can attach per-cell content / bbox in
    emission order. In ``grid``, each cell's own position holds ``ecel``
    (empty ``tokens``) or ``fcel``; the rest of its span holds ``lcel``
    (same row), ``ucel`` (same column) or ``xcel`` (both offsets non-zero).
    Positions no cell covers keep the initial empty string.
    """
    ordered = sorted(sample.cells, key=lambda cell: (cell.row, cell.col))
    grid: list[list[str]] = [[""] * sample.ncols for _ in range(sample.nrows)]
    for anchor in ordered:
        grid[anchor.row][anchor.col] = "fcel" if anchor.tokens else "ecel"
        for dr in range(anchor.rowspan):
            for dc in range(anchor.colspan):
                if dr == 0 and dc == 0:
                    continue  # the anchor position itself, written above
                if dr and dc:
                    marker = "xcel"
                elif dr:
                    marker = "ucel"
                else:
                    marker = "lcel"
                grid[anchor.row + dr][anchor.col + dc] = marker
    return grid, ordered
260
+
261
+
262
+ # ---------- OTSL payload <-> GridCells ----------
263
+
264
+
265
def otsl_to_cells(
    otsl_tokens: list[str], cell_payloads: list[dict[str, object]]
) -> tuple[int, int, tuple[GridCell, ...]]:
    """Combine an OTSL token stream with its positional ``cells[]`` payloads.

    The stream is split into rows and scanned for anchors; the i-th anchor
    (row-major) is paired with the i-th payload. A payload's ``tokens``
    (default: none) becomes the cell content, and its optional ``bbox``
    supplies four values that are coerced to ``int``. Every cell gets
    ``role="body"`` because the OTSL core has no header marker.

    Returns:
        ``(nrows, ncols, cells)``.

    Raises:
        ValueError: on an unknown token, jagged rows, or when the number of
            anchors differs from ``len(cell_payloads)``.
    """
    nrows, ncols, anchors = build_anchors(split_rows(otsl_tokens))
    if len(anchors) != len(cell_payloads):
        msg = (
            f"OTSL declares {len(anchors)} anchored cells but cells[] has "
            f"{len(cell_payloads)} entries"
        )
        raise ValueError(msg)

    built: list[GridCell] = []
    for placement, entry in zip(anchors, cell_payloads, strict=True):
        content = tuple(cast("tuple[str, ...]", entry.get("tokens", ())))
        raw_bbox = entry.get("bbox")
        if raw_bbox is None:
            box = None
        else:
            coords = cast("list[int]", raw_bbox)
            box = (int(coords[0]), int(coords[1]), int(coords[2]), int(coords[3]))
        built.append(
            GridCell(
                row=placement.row,
                col=placement.col,
                rowspan=placement.rowspan,
                colspan=placement.colspan,
                tokens=content,
                bbox=box,
                role="body",
            )
        )
    return nrows, ncols, tuple(built)
303
+
304
+
305
def cells_to_otsl(sample: TableSample) -> tuple[list[str], list[dict[str, object]]]:
    """Serialise a sample as ``(otsl_tokens, cells)``.

    ``otsl_tokens`` is the token grid flattened row by row, each row
    followed by ``nl``. ``cells`` has one dict per anchor cell in
    ``(row, col)`` order, holding ``tokens`` and, when the cell has one,
    ``bbox`` (both as lists).
    """
    grid, anchors = build_token_grid(sample)
    stream: list[str] = [tok for row in grid for tok in (*row, "nl")]
    payloads: list[dict[str, object]] = []
    for anchor in anchors:
        entry: dict[str, object] = {"tokens": list(anchor.tokens)}
        if anchor.bbox is not None:
            entry["bbox"] = list(anchor.bbox)
        payloads.append(entry)
    return stream, payloads
@@ -0,0 +1,36 @@
1
+ """The built-in codec instances, as a single source of truth.
2
+
3
+ The library itself does not auto-register codecs (callers register the
4
+ ones they need). But the CLI and the documentation generators all need
5
+ "every codec that ships with tablecodec" — keeping that list in one
6
+ place avoids the drift where a new codec is added to some consumers but
7
+ not others.
8
+
9
+ Order is deterministic and shapes the rendered doc tables.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from tablecodec.codecs._base import Codec
15
+ from tablecodec.codecs.doctags import DocTagsTablesCodec
16
+ from tablecodec.codecs.fintabnet import FinTabNetCodec
17
+ from tablecodec.codecs.fintabnet_otsl import FinTabNetOTSLCodec
18
+ from tablecodec.codecs.otsl import OTSL10Codec
19
+ from tablecodec.codecs.pubtables1m import PubTables1MCodec
20
+ from tablecodec.codecs.pubtabnet import PubTabNet10Codec, PubTabNet20Codec
21
+ from tablecodec.codecs.tablebank import TableBankCodec
22
+ from tablecodec.codecs.tableformer import TableFormerCodec
23
+
24
+ __all__ = ["BUILTIN_CODECS"]
25
+
26
# One instance of each codec class imported above, collected in a tuple.
# The tuple order is fixed; the module docstring notes that this order
# shapes the rendered doc tables, so append or reorder entries deliberately.
+ BUILTIN_CODECS: tuple[Codec, ...] = (
27
+ PubTabNet10Codec(),
28
+ PubTabNet20Codec(),
29
+ FinTabNetCodec(),
30
+ TableFormerCodec(),
31
+ TableBankCodec(),
32
+ PubTables1MCodec(),
33
+ OTSL10Codec(),
34
+ FinTabNetOTSLCodec(),
35
+ DocTagsTablesCodec(),
36
+ )
@@ -0,0 +1,278 @@
1
+ """DocTags table subset codec.
2
+
3
+ DocTags (IBM Granite-Docling, 2026) is a document markup where tables are
4
+ encoded as OTSL cell tokens wrapped in ``<otsl>`` ... ``</otsl>``. Each
5
+ anchor cell is annotated with four ``<loc_n>`` tokens — a bounding box on
6
+ a fixed 0–500 grid — followed by the cell's content tokens. Continuation
7
+ tokens (``lcel`` / ``ucel`` / ``xcel``) carry neither loc nor content.
8
+
9
+ This codec handles the **table subset** of DocTags:
10
+
11
+ - read: full — structure (via the shared :mod:`._otslgrid`), bbox (from
12
+ the loc tokens), and cell content.
13
+ - write: the OTSL-equivalent subset only (SPEC §7 marks this △). Header
14
+ / body ``role`` has no representation in the OTSL core, so it is lost
15
+ (``lossy_write`` declares ``role``); ``extras`` is also dropped.
16
+
17
+ Canonical jsonl record shape::
18
+
19
+ {
20
+ "filename": "...",
21
+ "split": "train" | "val" | "test", # optional
22
+ "imgid": 0, # optional
23
+ "doctags": [
24
+ "<otsl>",
25
+ "fcel",
26
+ "<loc_0>",
27
+ "<loc_0>",
28
+ "<loc_250>",
29
+ "<loc_50>",
30
+ "Year",
31
+ ...,
32
+ "nl",
33
+ "</otsl>",
34
+ ],
35
+ }
36
+
37
+ Derived from the published DocTags / OTSL description, not copied from
38
+ upstream reference code.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import json
44
+ import re
45
+ from collections.abc import Iterable, Iterator
46
+ from dataclasses import dataclass, field
47
+ from typing import IO, Any, cast
48
+
49
+ from tablecodec.codecs._otslgrid import (
50
+ ANCHOR_TOKENS,
51
+ CELL_TOKENS,
52
+ build_anchors,
53
+ build_token_grid,
54
+ )
55
+ from tablecodec.ir import BBox, GridCell, TableSample
56
+
57
+ __all__ = ["DocTagsTablesCodec"]
58
+
59
# A bounding box is written as this many consecutive location tokens.
_LOC_PER_BBOX = 4
# A location token is ``<loc_N>`` with N a run of decimal digits.
_LOC_RE = re.compile(r"^<loc_(\d+)>$")
# Wrapper tokens that open and close the table token stream.
_OTSL_OPEN = "<otsl>"
_OTSL_CLOSE = "</otsl>"
63
+
64
+
65
+ @dataclass(frozen=True, slots=True)
66
+ class DocTagsTablesCodec:
67
+ """Codec for the DocTags table subset (read full, write OTSL subset)."""
68
+
69
+ name: str = "doctags-tables"
70
+ spec_version: str = "1.0.0"
71
+ media_type: str = "application/jsonl"
72
+ writable: bool = True
73
+
74
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
75
+ for line_no, raw in enumerate(source, start=1):
76
+ line = raw.strip()
77
+ if not line:
78
+ continue
79
+ try:
80
+ payload: dict[str, Any] = json.loads(line)
81
+ except json.JSONDecodeError as exc:
82
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
83
+ raise ValueError(msg) from exc
84
+ try:
85
+ yield _payload_to_sample(payload)
86
+ except (KeyError, ValueError, TypeError) as exc:
87
+ msg = f"malformed DocTags record at line {line_no}: {exc}"
88
+ raise ValueError(msg) from exc
89
+
90
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
91
+ for sample in samples:
92
+ sink.write(json.dumps(_sample_to_payload(sample), ensure_ascii=False))
93
+ sink.write("\n")
94
+
95
+ def lossy_read(self) -> frozenset[str]:
96
+ # The OTSL core has no header marker; role defaults to body.
97
+ return frozenset({"role"})
98
+
99
+ def lossy_write(self) -> frozenset[str]:
100
+ return frozenset({"extras", "role"})
101
+
102
+ def sniff(self, source: IO[str]) -> bool:
103
+ pos = source.tell()
104
+ try:
105
+ for raw in source:
106
+ line = raw.strip()
107
+ if not line:
108
+ continue
109
+ try:
110
+ payload: object = json.loads(line)
111
+ except json.JSONDecodeError:
112
+ return False
113
+ return isinstance(payload, dict) and "doctags" in cast("dict[str, Any]", payload)
114
+ return False
115
+ finally:
116
+ source.seek(pos)
117
+
118
+
119
+ # ---------- DocTags token stream parsing ----------
120
+
121
+
122
+ @dataclass(slots=True)
123
+ class _ParsedCell:
124
+ """A structural cell token plus, for anchors, its bbox and content."""
125
+
126
+ token: str
127
+ bbox: BBox | None = None
128
+ content: tuple[str, ...] = ()
129
+
130
+
131
def _initial_rows() -> list[list[str]]:
    """Return a fresh row list that already contains one empty row."""
    first_row: list[str] = []
    return [first_row]
133
+
134
+
135
def _empty_parsed_cells() -> list[_ParsedCell]:
    """Return a fresh, empty list for collecting parsed anchor cells."""
    return list()
137
+
138
+
139
@dataclass(slots=True)
class _StreamState:
    """Mutable accumulator filled while walking a DocTags token stream.

    Attributes:
        rows: Structural cell tokens grouped by row; starts with one empty row.
        anchors: Parsed anchor cells in the order they were encountered.
    """

    rows: list[list[str]] = field(default_factory=_initial_rows)
    anchors: list[_ParsedCell] = field(default_factory=_empty_parsed_cells)
143
+
144
+
145
def _parse_loc_run(tokens: list[str], start: int) -> tuple[BBox | None, int]:
    """Try to read a complete bounding box starting at ``tokens[start]``.

    A bounding box is exactly ``_LOC_PER_BBOX`` consecutive ``<loc_n>``
    tokens. When all of them are present the result is
    ``(bbox, index just past the last loc token)``. Otherwise nothing is
    consumed and the result is ``(None, start)``.
    """
    coords: list[int] = []
    for tok in tokens[start : start + _LOC_PER_BBOX]:
        found = _LOC_RE.match(tok)
        if found is None:
            # Run interrupted before a full box was read: consume nothing.
            return None, start
        coords.append(int(found.group(1)))
    if len(coords) < _LOC_PER_BBOX:
        # The stream ended before a full box was read: consume nothing.
        return None, start
    return (coords[0], coords[1], coords[2], coords[3]), start + _LOC_PER_BBOX
161
+
162
+
163
def _parse_content_run(tokens: list[str], start: int) -> tuple[tuple[str, ...], int]:
    """Collect the content tokens beginning at ``tokens[start]``.

    The run stops, without consuming the stopping token, at the first cell
    token, ``nl``, ``<otsl>`` / ``</otsl>`` wrapper, or ``<loc_n>`` token, or
    at the end of the list. Returns ``(content, index of the stopping token)``.
    """
    end = start
    while end < len(tokens):
        tok = tokens[end]
        at_boundary = (
            tok in CELL_TOKENS
            or tok == "nl"
            or tok in (_OTSL_OPEN, _OTSL_CLOSE)
            or _LOC_RE.match(tok) is not None
        )
        if at_boundary:
            break
        end += 1
    return tuple(tokens[start:end]), end
176
+
177
+
178
def _parse_doctags_stream(tokens: list[str]) -> _StreamState:
    """Split a DocTags token stream into structural rows and anchor data.

    ``<otsl>`` / ``</otsl>`` wrappers are skipped wherever they occur, and
    ``nl`` starts a new row. Every cell token is appended to the current row.
    An anchor token additionally takes an optional bounding box and the
    content tokens after it, recorded in ``state.anchors``. A trailing empty
    row is dropped.

    Raises:
        ValueError: for any token that is not consumed by the rules above.
    """
    state = _StreamState()
    pos = 0
    total = len(tokens)
    while pos < total:
        tok = tokens[pos]
        if tok in (_OTSL_OPEN, _OTSL_CLOSE):
            pos += 1
            continue
        if tok == "nl":
            state.rows.append([])
            pos += 1
            continue
        if tok not in CELL_TOKENS:
            msg = f"unexpected DocTags token {tok!r}"
            raise ValueError(msg)
        state.rows[-1].append(tok)
        if tok not in ANCHOR_TOKENS:
            # Continuation tokens are followed by neither loc nor content.
            pos += 1
            continue
        bbox, content_start = _parse_loc_run(tokens, pos + 1)
        content, pos = _parse_content_run(tokens, content_start)
        state.anchors.append(_ParsedCell(token=tok, bbox=bbox, content=content))
    if state.rows and not state.rows[-1]:
        state.rows.pop()
    return state
203
+
204
+
205
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Convert one decoded DocTags record into a ``TableSample``.

    Reads the required ``doctags`` and ``filename`` keys and the optional
    ``split`` and ``imgid`` keys. Each anchor found in the structure is
    paired, in order, with the bbox / content parsed from the stream, and
    every cell gets ``role="body"``.

    Raises:
        KeyError: if ``doctags`` or ``filename`` is missing.
        ValueError: on an invalid stream, a mismatch between anchors and
            parsed cell contents, or an unknown ``split`` value.
    """
    stream = list(payload["doctags"])
    state = _parse_doctags_stream(stream)
    nrows, ncols, placements = build_anchors(state.rows)

    if len(placements) != len(state.anchors):
        msg = (
            f"DocTags declares {len(placements)} anchors but the stream parsed "
            f"{len(state.anchors)} cell contents"
        )
        raise ValueError(msg)

    built: list[GridCell] = []
    for placement, parsed in zip(placements, state.anchors, strict=True):
        built.append(
            GridCell(
                row=placement.row,
                col=placement.col,
                rowspan=placement.rowspan,
                colspan=placement.colspan,
                tokens=parsed.content,
                bbox=parsed.bbox,
                role="body",
            )
        )

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=tuple(built),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get("imgid"),
    )
238
+
239
+
240
def _normalize_split(value: object) -> Any:
    """Return ``value`` unchanged if it is ``None`` or a known split name.

    Known names are ``"train"``, ``"val"`` and ``"test"``; anything else
    raises ``ValueError``.
    """
    if value is None or value in ("train", "val", "test"):
        return value
    msg = f"unknown split value {value!r}"
    raise ValueError(msg)
247
+
248
+
249
+ # ---------- IR → DocTags ----------
250
+
251
+
252
def _loc_tokens(bbox: BBox) -> list[str]:
    """Render each value of ``bbox`` as a ``<loc_n>`` token, in order."""
    return list(map("<loc_{}>".format, bbox))
254
+
255
+
256
def _sample_to_payload(sample: TableSample) -> dict[str, Any]:
    """Build the JSON-ready DocTags record for ``sample``.

    The ``doctags`` stream is ``<otsl>``, then each grid row followed by
    ``nl``, then ``</otsl>``. An anchor's structural token is followed by
    its loc tokens (when it has a bbox) and then its content tokens;
    continuation tokens stand alone. ``split`` and ``imgid`` are included
    only when they are not ``None``.
    """
    grid, anchors = build_token_grid(sample)
    anchor_at = {(cell.row, cell.col): cell for cell in anchors}

    stream: list[str] = [_OTSL_OPEN]
    for r, row_tokens in enumerate(grid):
        for c, structural in enumerate(row_tokens):
            stream.append(structural)
            anchor = anchor_at.get((r, c))
            if anchor is not None:
                # Only anchors carry loc / content; continuations do not.
                if anchor.bbox is not None:
                    stream.extend(_loc_tokens(anchor.bbox))
                stream.extend(anchor.tokens)
        stream.append("nl")
    stream.append(_OTSL_CLOSE)

    record: dict[str, Any] = {"filename": sample.filename, "doctags": stream}
    if sample.split is not None:
        record["split"] = sample.split
    if sample.imgid is not None:
        record["imgid"] = sample.imgid
    return record