tablecodec 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablecodec/__init__.py +29 -0
- tablecodec/_invariants.py +311 -0
- tablecodec/cli.py +314 -0
- tablecodec/codecs/__init__.py +111 -0
- tablecodec/codecs/_base.py +79 -0
- tablecodec/codecs/_htmltable.py +510 -0
- tablecodec/codecs/_otslgrid.py +318 -0
- tablecodec/codecs/builtins.py +36 -0
- tablecodec/codecs/doctags.py +278 -0
- tablecodec/codecs/fintabnet.py +84 -0
- tablecodec/codecs/fintabnet_otsl.py +141 -0
- tablecodec/codecs/otsl.py +138 -0
- tablecodec/codecs/pubtables1m.py +161 -0
- tablecodec/codecs/pubtabnet.py +128 -0
- tablecodec/codecs/tablebank.py +76 -0
- tablecodec/codecs/tableformer.py +80 -0
- tablecodec/io.py +91 -0
- tablecodec/ir.py +101 -0
- tablecodec/loss.py +105 -0
- tablecodec/py.typed +0 -0
- tablecodec/teds.py +243 -0
- tablecodec/validate.py +185 -0
- tablecodec-0.0.18.dist-info/METADATA +200 -0
- tablecodec-0.0.18.dist-info/RECORD +27 -0
- tablecodec-0.0.18.dist-info/WHEEL +4 -0
- tablecodec-0.0.18.dist-info/entry_points.txt +2 -0
- tablecodec-0.0.18.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
"""Shared OTSL grid machinery for codec implementations.
|
|
2
|
+
|
|
3
|
+
OTSL (Lysak et al., ICDAR 2023, arXiv 2305.03393) and the DocTags table
|
|
4
|
+
subset (IBM Granite-Docling) both encode table structure with the same
|
|
5
|
+
five-token cell vocabulary plus a row separator:
|
|
6
|
+
|
|
7
|
+
- ``fcel`` filled-cell anchor
|
|
8
|
+
- ``ecel`` empty-cell anchor
|
|
9
|
+
- ``lcel`` left-merged continuation (extends the anchor's colspan)
|
|
10
|
+
- ``ucel`` up-merged continuation (extends the anchor's rowspan)
|
|
11
|
+
- ``xcel`` cross-merged continuation (extends both)
|
|
12
|
+
- ``nl`` newline / row separator
|
|
13
|
+
|
|
14
|
+
This module owns the structure↔grid conversion so OTSL and DocTags do
|
|
15
|
+
not duplicate it. DocTags additionally interleaves location and content
|
|
16
|
+
tokens, which it strips before calling :func:`build_anchors`, and
|
|
17
|
+
re-inserts when serialising from :func:`build_token_grid`.
|
|
18
|
+
|
|
19
|
+
The grid-reconstruction logic in :func:`build_anchors` (the anchor-centric
|
|
20
|
+
scan, the ``check_right``/``check_down`` span runs, and the 2D-span
|
|
21
|
+
registry) is adapted from docling-ibm-models' ``otsl_to_html``:
|
|
22
|
+
|
|
23
|
+
https://github.com/docling-project/docling-ibm-models
|
|
24
|
+
docling_ibm_models/tableformer/otsl.py
|
|
25
|
+
Copyright (c) 2024 International Business Machines
|
|
26
|
+
Licensed under the MIT License.
|
|
27
|
+
|
|
28
|
+
It is reimplemented here for the neutral IR (it emits ``GridCell`` spans
|
|
29
|
+
rather than HTML strings) and carries no third-party imports. See
|
|
30
|
+
THIRD_PARTY_NOTICES.md and docs/adr/0005-port-otsl-reconstruction.md.
|
|
31
|
+
|
|
32
|
+
Stdlib-only (SPEC §13).
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from dataclasses import dataclass
|
|
38
|
+
from typing import cast
|
|
39
|
+
|
|
40
|
+
from tablecodec.ir import GridCell, TableSample
|
|
41
|
+
|
|
42
|
+
__all__ = [
|
|
43
|
+
"ANCHOR_TOKENS",
|
|
44
|
+
"CELL_TOKENS",
|
|
45
|
+
"CONTINUATION_TOKENS",
|
|
46
|
+
"VALID_TOKENS",
|
|
47
|
+
"AnchorPlacement",
|
|
48
|
+
"build_anchors",
|
|
49
|
+
"build_token_grid",
|
|
50
|
+
"cells_to_otsl",
|
|
51
|
+
"ensure_square",
|
|
52
|
+
"otsl_to_cells",
|
|
53
|
+
"split_rows",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
# Anchors open a cell: ``fcel`` is a filled cell, ``ecel`` an empty one.
ANCHOR_TOKENS = frozenset(("fcel", "ecel"))
# Continuations extend an anchor's span: left, up, or both directions.
CONTINUATION_TOKENS = frozenset(("lcel", "ucel", "xcel"))
# Every token that occupies one grid position.
CELL_TOKENS = ANCHOR_TOKENS.union(CONTINUATION_TOKENS)
# The full vocabulary: the cell tokens plus the ``nl`` row separator.
VALID_TOKENS = CELL_TOKENS.union(("nl",))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass(slots=True)
|
|
63
|
+
class AnchorPlacement:
|
|
64
|
+
"""One ``fcel`` / ``ecel`` anchor mapped to its grid coordinates."""
|
|
65
|
+
|
|
66
|
+
row: int
|
|
67
|
+
col: int
|
|
68
|
+
rowspan: int = 1
|
|
69
|
+
colspan: int = 1
|
|
70
|
+
is_empty: bool = False
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def split_rows(tokens: list[str]) -> list[list[str]]:
    """Break a flat OTSL token stream into one token list per table row.

    Each ``nl`` closes the current row and opens a new one. The first token
    that is not in :data:`VALID_TOKENS` raises ``ValueError``. A final row
    that is still empty when the stream ends (after a trailing ``nl``, or
    for empty input) is dropped.
    """
    current: list[str] = []
    rows: list[list[str]] = [current]
    for tok in tokens:
        if tok not in VALID_TOKENS:
            msg = f"unknown OTSL token {tok!r}"
            raise ValueError(msg)
        if tok != "nl":
            current.append(tok)
            continue
        # Row separator: start collecting into a fresh row.
        current = []
        rows.append(current)
    if not current:
        rows.pop()
    return rows
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def ensure_square(rows: list[list[str]]) -> int:
    """Return the width shared by every row, or 0 when there are no rows.

    Raises ``ValueError`` listing the distinct widths (sorted) when the rows
    do not all have the same length.
    """
    if not rows:
        return 0
    expected = len(rows[0])
    if all(len(row) == expected for row in rows):
        return expected
    widths = sorted({len(row) for row in rows})
    msg = f"OTSL square-table assumption violated; row widths = {widths}"
    raise ValueError(msg)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _normalize_edge_continuations(rows: list[list[str]], nrows: int, ncols: int) -> list[list[str]]:
|
|
105
|
+
"""Repair structurally-impossible continuations at the grid edges.
|
|
106
|
+
|
|
107
|
+
A continuation cannot merge in a direction that has no neighbour:
|
|
108
|
+
row 0 has nothing above, column 0 has nothing to the left. Real
|
|
109
|
+
encoders (and the docling OTSL decoder's "structure error correction")
|
|
110
|
+
emit ``xcel``/``ucel`` in row 0 and ``xcel``/``lcel`` in column 0 that
|
|
111
|
+
must be read as the only possible merge:
|
|
112
|
+
|
|
113
|
+
- row 0: ``ucel``/``xcel`` -> ``lcel`` (can only merge left).
|
|
114
|
+
- col 0: ``lcel``/``xcel`` -> ``ucel`` (can only merge up).
|
|
115
|
+
|
|
116
|
+
A copy is returned; the caller's rows are not mutated.
|
|
117
|
+
"""
|
|
118
|
+
grid = [list(row) for row in rows]
|
|
119
|
+
for c in range(ncols):
|
|
120
|
+
if grid[0][c] in {"ucel", "xcel"}:
|
|
121
|
+
grid[0][c] = "lcel"
|
|
122
|
+
for r in range(nrows):
|
|
123
|
+
if grid[r][0] in {"lcel", "xcel"}:
|
|
124
|
+
grid[r][0] = "ucel"
|
|
125
|
+
return grid
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
@dataclass(slots=True)
|
|
129
|
+
class _OtslReader:
|
|
130
|
+
"""Anchor-centric OTSL grid reader (logic adapted from docling, see header).
|
|
131
|
+
|
|
132
|
+
``registry`` mirrors docling's ``registry_2d_span``: cells already
|
|
133
|
+
claimed by a 2D (``xcel``) span, so a later anchor cannot re-claim them.
|
|
134
|
+
"""
|
|
135
|
+
|
|
136
|
+
grid: list[list[str]]
|
|
137
|
+
nrows: int
|
|
138
|
+
ncols: int
|
|
139
|
+
registry: list[list[bool]]
|
|
140
|
+
|
|
141
|
+
def _check_right(self, r: int, c: int) -> int:
|
|
142
|
+
# colspan: extend right over horizontal continuations (lcel/xcel);
|
|
143
|
+
# stop at an anchor, an up-merge, or the edge (docling check_right).
|
|
144
|
+
# Also stop at a cell already claimed by a 2D span above: an `xcel`
|
|
145
|
+
# there belongs to that span, not to this row's run — counting it
|
|
146
|
+
# would overlap (the irregular case real SynthTabNet rows hit).
|
|
147
|
+
dist = 1
|
|
148
|
+
x = c
|
|
149
|
+
while (
|
|
150
|
+
x + 1 < self.ncols
|
|
151
|
+
and self.grid[r][x + 1] in {"lcel", "xcel"}
|
|
152
|
+
and not self.registry[r][x + 1]
|
|
153
|
+
):
|
|
154
|
+
x += 1
|
|
155
|
+
dist += 1
|
|
156
|
+
return dist
|
|
157
|
+
|
|
158
|
+
def _check_down(self, r: int, c: int) -> int:
|
|
159
|
+
# rowspan: extend down over vertical continuations (ucel/xcel);
|
|
160
|
+
# stop at an anchor, a left-merge, the edge, or a cell already
|
|
161
|
+
# claimed by another 2D span (symmetric to _check_right).
|
|
162
|
+
dist = 1
|
|
163
|
+
y = r
|
|
164
|
+
while (
|
|
165
|
+
y + 1 < self.nrows
|
|
166
|
+
and self.grid[y + 1][c] in {"ucel", "xcel"}
|
|
167
|
+
and not self.registry[y + 1][c]
|
|
168
|
+
):
|
|
169
|
+
y += 1
|
|
170
|
+
dist += 1
|
|
171
|
+
return dist
|
|
172
|
+
|
|
173
|
+
def _claim_2d(self, r: int, c: int, rowspan: int, colspan: int) -> bool:
|
|
174
|
+
# Mark the rectangle in the registry iff none of it is already
|
|
175
|
+
# claimed (docling's double-count guard); returns whether it claimed.
|
|
176
|
+
for dr in range(rowspan):
|
|
177
|
+
for dc in range(colspan):
|
|
178
|
+
if self.registry[r + dr][c + dc]:
|
|
179
|
+
return False
|
|
180
|
+
for dr in range(rowspan):
|
|
181
|
+
for dc in range(colspan):
|
|
182
|
+
self.registry[r + dr][c + dc] = True
|
|
183
|
+
return True
|
|
184
|
+
|
|
185
|
+
def span_of(self, r: int, c: int) -> tuple[int, int]:
|
|
186
|
+
"""Compute (rowspan, colspan) for the anchor at (r, c) from its neighbours."""
|
|
187
|
+
colspan = rowspan = 1
|
|
188
|
+
right = self.grid[r][c + 1] if c + 1 < self.ncols else ""
|
|
189
|
+
below = self.grid[r + 1][c] if r + 1 < self.nrows else ""
|
|
190
|
+
if right == "lcel":
|
|
191
|
+
colspan = self._check_right(r, c)
|
|
192
|
+
if below == "ucel":
|
|
193
|
+
rowspan = self._check_down(r, c)
|
|
194
|
+
if right == "xcel":
|
|
195
|
+
xr = self._check_right(r, c)
|
|
196
|
+
xd = self._check_down(r, c)
|
|
197
|
+
if self._claim_2d(r, c, xd, xr):
|
|
198
|
+
colspan, rowspan = xr, xd
|
|
199
|
+
return rowspan, colspan
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def build_anchors(rows: list[list[str]]) -> tuple[int, int, list[AnchorPlacement]]:
    """Scan the row x col grid and return ``(nrows, ncols, anchors)``.

    Reconstruction is anchor-centric (adapted from docling's
    ``otsl_to_html``, see the module header). Every ``fcel``/``ecel`` token
    is an anchor, and its span is derived from the continuation tokens next
    to it by the reader's ``span_of``. Continuation tokens themselves
    produce no entry. Edge continuations are normalised on a copy of
    ``rows`` before scanning. Anchors are returned in row-major order.

    Raises ``ValueError`` (from :func:`ensure_square`) when the rows do not
    all have the same width.
    """
    nrows = len(rows)
    ncols = ensure_square(rows)
    normalized = _normalize_edge_continuations(rows, nrows, ncols)
    claimed = [[False] * ncols for _ in range(nrows)]
    reader = _OtslReader(grid=normalized, nrows=nrows, ncols=ncols, registry=claimed)

    ordered: list[AnchorPlacement] = []
    for r, row_tokens in enumerate(reader.grid):
        for c, tok in enumerate(row_tokens):
            if tok in ANCHOR_TOKENS:
                rowspan, colspan = reader.span_of(r, c)
                placement = AnchorPlacement(
                    row=r,
                    col=c,
                    rowspan=rowspan,
                    colspan=colspan,
                    is_empty=(tok == "ecel"),
                )
                ordered.append(placement)
    return nrows, ncols, ordered
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def build_token_grid(sample: TableSample) -> tuple[list[list[str]], list[GridCell]]:
    """Project a sample's cells onto a 2D grid of OTSL cell tokens.

    Returns ``(grid, anchors)``. ``anchors`` is ``sample.cells`` sorted by
    ``(row, col)``, so callers can attach per-cell content / bbox in the
    same order the anchor tokens appear in the grid. Each anchor position
    holds ``ecel`` when the cell's ``tokens`` is empty and ``fcel``
    otherwise; the rest of the cell's span is filled with continuation
    tokens. Positions that no cell covers keep the initial empty string.
    """
    grid: list[list[str]] = [[""] * sample.ncols for _ in range(sample.nrows)]
    anchors = sorted(sample.cells, key=lambda cell: (cell.row, cell.col))
    for anchor in anchors:
        top, left = anchor.row, anchor.col
        grid[top][left] = "fcel" if anchor.tokens else "ecel"
        for dr in range(anchor.rowspan):
            for dc in range(anchor.colspan):
                if dr == 0 and dc == 0:
                    continue  # the anchor position itself, set above
                # Anchor's row -> left-merge; anchor's column -> up-merge;
                # anywhere else inside the span -> cross-merge.
                if dr == 0:
                    marker = "lcel"
                else:
                    marker = "ucel" if dc == 0 else "xcel"
                grid[top + dr][left + dc] = marker
    return grid, anchors
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# ---------- OTSL payload <-> GridCells ----------
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def otsl_to_cells(
    otsl_tokens: list[str], cell_payloads: list[dict[str, object]]
) -> tuple[int, int, tuple[GridCell, ...]]:
    """Combine an OTSL token stream with positional ``cells[]`` payloads.

    Returns ``(nrows, ncols, cells)``. The n-th payload is paired with the
    n-th anchor in row-major order. A payload's ``tokens`` entry defaults to
    an empty tuple, and its ``bbox`` (when present and not ``None``) is read
    as four values converted with ``int``. Every cell gets ``role="body"``
    (the OTSL core has no header marker).

    Raises ``ValueError`` when the number of anchors differs from the
    number of payloads.
    """
    nrows, ncols, anchors = build_anchors(split_rows(otsl_tokens))
    anchor_count, payload_count = len(anchors), len(cell_payloads)
    if anchor_count != payload_count:
        msg = (
            f"OTSL declares {anchor_count} anchored cells but cells[] has "
            f"{payload_count} entries"
        )
        raise ValueError(msg)

    def to_cell(anchor: AnchorPlacement, payload: dict[str, object]) -> GridCell:
        # Merge one anchor's geometry with its payload's content and bbox.
        tokens = tuple(cast("tuple[str, ...]", payload.get("tokens", ())))
        raw_bbox = payload.get("bbox")
        if raw_bbox is None:
            bbox = None
        else:
            seq = cast("list[int]", raw_bbox)
            bbox = (int(seq[0]), int(seq[1]), int(seq[2]), int(seq[3]))
        return GridCell(
            row=anchor.row,
            col=anchor.col,
            rowspan=anchor.rowspan,
            colspan=anchor.colspan,
            tokens=tokens,
            bbox=bbox,
            role="body",
        )

    paired = zip(anchors, cell_payloads, strict=True)
    return nrows, ncols, tuple(to_cell(anchor, payload) for anchor, payload in paired)
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def cells_to_otsl(sample: TableSample) -> tuple[list[str], list[dict[str, object]]]:
    """Serialize a sample as ``(otsl_tokens, cells)``.

    The token stream is the grid from :func:`build_token_grid` flattened row
    by row, with an ``nl`` after every row. ``cells`` holds one payload per
    anchor in the order :func:`build_token_grid` returns them; each payload
    has ``tokens`` and, only when the cell has a bbox, ``bbox``.
    """
    grid, anchors = build_token_grid(sample)
    tokens: list[str] = [tok for row in grid for tok in (*row, "nl")]

    def to_payload(cell: GridCell) -> dict[str, object]:
        # ``bbox`` is omitted entirely rather than emitted as null.
        entry: dict[str, object] = {"tokens": list(cell.tokens)}
        if cell.bbox is not None:
            entry["bbox"] = list(cell.bbox)
        return entry

    return tokens, [to_payload(cell) for cell in anchors]
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""The built-in codec instances, as a single source of truth.
|
|
2
|
+
|
|
3
|
+
The library itself does not auto-register codecs (callers register the
|
|
4
|
+
ones they need). But the CLI and the documentation generators all need
|
|
5
|
+
"every codec that ships with tablecodec" — keeping that list in one
|
|
6
|
+
place avoids the drift where a new codec is added to some consumers but
|
|
7
|
+
not others.
|
|
8
|
+
|
|
9
|
+
Order is deterministic and shapes the rendered doc tables.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from tablecodec.codecs._base import Codec
|
|
15
|
+
from tablecodec.codecs.doctags import DocTagsTablesCodec
|
|
16
|
+
from tablecodec.codecs.fintabnet import FinTabNetCodec
|
|
17
|
+
from tablecodec.codecs.fintabnet_otsl import FinTabNetOTSLCodec
|
|
18
|
+
from tablecodec.codecs.otsl import OTSL10Codec
|
|
19
|
+
from tablecodec.codecs.pubtables1m import PubTables1MCodec
|
|
20
|
+
from tablecodec.codecs.pubtabnet import PubTabNet10Codec, PubTabNet20Codec
|
|
21
|
+
from tablecodec.codecs.tablebank import TableBankCodec
|
|
22
|
+
from tablecodec.codecs.tableformer import TableFormerCodec
|
|
23
|
+
|
|
24
|
+
__all__ = ["BUILTIN_CODECS"]
|
|
25
|
+
|
|
26
|
+
# One instance of every codec that ships with the package. The order is
# significant (it shapes the rendered doc tables), so append rather than
# re-sort when adding a codec.
BUILTIN_CODECS: tuple[Codec, ...] = (
    # HTML-structure style datasets.
    PubTabNet10Codec(),
    PubTabNet20Codec(),
    FinTabNetCodec(),
    TableFormerCodec(),
    TableBankCodec(),
    PubTables1MCodec(),
    # OTSL-based formats.
    OTSL10Codec(),
    FinTabNetOTSLCodec(),
    DocTagsTablesCodec(),
)
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""DocTags table subset codec.
|
|
2
|
+
|
|
3
|
+
DocTags (IBM Granite-Docling, 2026) is a document markup where tables are
|
|
4
|
+
encoded as OTSL cell tokens wrapped in ``<otsl>`` ... ``</otsl>``. Each
|
|
5
|
+
anchor cell is annotated with four ``<loc_n>`` tokens — a bounding box on
|
|
6
|
+
a fixed 0–500 grid — followed by the cell's content tokens. Continuation
|
|
7
|
+
tokens (``lcel`` / ``ucel`` / ``xcel``) carry neither loc nor content.
|
|
8
|
+
|
|
9
|
+
This codec handles the **table subset** of DocTags:
|
|
10
|
+
|
|
11
|
+
- read: full — structure (via the shared :mod:`._otslgrid`), bbox (from
|
|
12
|
+
the loc tokens), and cell content.
|
|
13
|
+
- write: the OTSL-equivalent subset only (SPEC §7 marks this △). Header
|
|
14
|
+
/ body ``role`` has no representation in the OTSL core, so it is lost
|
|
15
|
+
(``lossy_write`` declares ``role``); ``extras`` is also dropped.
|
|
16
|
+
|
|
17
|
+
Canonical jsonl record shape::
|
|
18
|
+
|
|
19
|
+
{
|
|
20
|
+
"filename": "...",
|
|
21
|
+
"split": "train" | "val" | "test", # optional
|
|
22
|
+
"imgid": 0, # optional
|
|
23
|
+
"doctags": [
|
|
24
|
+
"<otsl>",
|
|
25
|
+
"fcel",
|
|
26
|
+
"<loc_0>",
|
|
27
|
+
"<loc_0>",
|
|
28
|
+
"<loc_250>",
|
|
29
|
+
"<loc_50>",
|
|
30
|
+
"Year",
|
|
31
|
+
...,
|
|
32
|
+
"nl",
|
|
33
|
+
"</otsl>",
|
|
34
|
+
],
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
Derived from the published DocTags / OTSL description, not copied from
|
|
38
|
+
upstream reference code.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from __future__ import annotations
|
|
42
|
+
|
|
43
|
+
import json
|
|
44
|
+
import re
|
|
45
|
+
from collections.abc import Iterable, Iterator
|
|
46
|
+
from dataclasses import dataclass, field
|
|
47
|
+
from typing import IO, Any, cast
|
|
48
|
+
|
|
49
|
+
from tablecodec.codecs._otslgrid import (
|
|
50
|
+
ANCHOR_TOKENS,
|
|
51
|
+
CELL_TOKENS,
|
|
52
|
+
build_anchors,
|
|
53
|
+
build_token_grid,
|
|
54
|
+
)
|
|
55
|
+
from tablecodec.ir import BBox, GridCell, TableSample
|
|
56
|
+
|
|
57
|
+
__all__ = ["DocTagsTablesCodec"]
|
|
58
|
+
|
|
59
|
+
_OTSL_OPEN = "<otsl>"
|
|
60
|
+
_OTSL_CLOSE = "</otsl>"
|
|
61
|
+
_LOC_RE = re.compile(r"^<loc_(\d+)>$")
|
|
62
|
+
_LOC_PER_BBOX = 4
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(frozen=True, slots=True)
|
|
66
|
+
class DocTagsTablesCodec:
|
|
67
|
+
"""Codec for the DocTags table subset (read full, write OTSL subset)."""
|
|
68
|
+
|
|
69
|
+
name: str = "doctags-tables"
|
|
70
|
+
spec_version: str = "1.0.0"
|
|
71
|
+
media_type: str = "application/jsonl"
|
|
72
|
+
writable: bool = True
|
|
73
|
+
|
|
74
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
75
|
+
for line_no, raw in enumerate(source, start=1):
|
|
76
|
+
line = raw.strip()
|
|
77
|
+
if not line:
|
|
78
|
+
continue
|
|
79
|
+
try:
|
|
80
|
+
payload: dict[str, Any] = json.loads(line)
|
|
81
|
+
except json.JSONDecodeError as exc:
|
|
82
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
83
|
+
raise ValueError(msg) from exc
|
|
84
|
+
try:
|
|
85
|
+
yield _payload_to_sample(payload)
|
|
86
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
87
|
+
msg = f"malformed DocTags record at line {line_no}: {exc}"
|
|
88
|
+
raise ValueError(msg) from exc
|
|
89
|
+
|
|
90
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
91
|
+
for sample in samples:
|
|
92
|
+
sink.write(json.dumps(_sample_to_payload(sample), ensure_ascii=False))
|
|
93
|
+
sink.write("\n")
|
|
94
|
+
|
|
95
|
+
def lossy_read(self) -> frozenset[str]:
|
|
96
|
+
# The OTSL core has no header marker; role defaults to body.
|
|
97
|
+
return frozenset({"role"})
|
|
98
|
+
|
|
99
|
+
def lossy_write(self) -> frozenset[str]:
|
|
100
|
+
return frozenset({"extras", "role"})
|
|
101
|
+
|
|
102
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
103
|
+
pos = source.tell()
|
|
104
|
+
try:
|
|
105
|
+
for raw in source:
|
|
106
|
+
line = raw.strip()
|
|
107
|
+
if not line:
|
|
108
|
+
continue
|
|
109
|
+
try:
|
|
110
|
+
payload: object = json.loads(line)
|
|
111
|
+
except json.JSONDecodeError:
|
|
112
|
+
return False
|
|
113
|
+
return isinstance(payload, dict) and "doctags" in cast("dict[str, Any]", payload)
|
|
114
|
+
return False
|
|
115
|
+
finally:
|
|
116
|
+
source.seek(pos)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------- DocTags token stream parsing ----------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
@dataclass(slots=True)
|
|
123
|
+
class _ParsedCell:
|
|
124
|
+
"""A structural cell token plus, for anchors, its bbox and content."""
|
|
125
|
+
|
|
126
|
+
token: str
|
|
127
|
+
bbox: BBox | None = None
|
|
128
|
+
content: tuple[str, ...] = ()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _initial_rows() -> list[list[str]]:
|
|
132
|
+
return [[]]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _empty_parsed_cells() -> list[_ParsedCell]:
|
|
136
|
+
return []
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@dataclass(slots=True)
class _StreamState:
    """Accumulator filled while walking a DocTags token stream."""

    # Structural tokens grouped per table row; starts with one open row.
    rows: list[list[str]] = field(default_factory=_initial_rows)
    # One parsed cell per anchor token, in stream order.
    anchors: list[_ParsedCell] = field(default_factory=_empty_parsed_cells)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _parse_loc_run(tokens: list[str], start: int) -> tuple[BBox | None, int]:
    """Try to read one bbox, i.e. four consecutive ``<loc_n>`` tokens, at *start*.

    Returns ``(bbox, next_index)``. On success ``next_index`` points just
    past the fourth loc token. When fewer than four loc tokens are found the
    result is ``(None, start)``: nothing is consumed and the caller resumes
    at the same position.
    """
    coords: list[int] = []
    stop = min(start + _LOC_PER_BBOX, len(tokens))
    for j in range(start, stop):
        found = _LOC_RE.match(tokens[j])
        if found is None:
            # The run was cut short by a non-loc token.
            return None, start
        coords.append(int(found.group(1)))
    if len(coords) != _LOC_PER_BBOX:
        # The stream ended before a full bbox was read.
        return None, start
    first, second, third, fourth = coords
    return (first, second, third, fourth), start + _LOC_PER_BBOX
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _parse_content_run(tokens: list[str], start: int) -> tuple[tuple[str, ...], int]:
    """Collect content tokens from *start* up to the next non-content token.

    A run ends at the first cell token, ``nl``, ``<otsl>`` / ``</otsl>``
    wrapper, or ``<loc_n>`` token, or at the end of the stream. Returns
    ``(content, next_index)`` where ``next_index`` is the position of the
    token that ended the run.
    """
    content: list[str] = []
    for j in range(start, len(tokens)):
        tok = tokens[j]
        ends_run = (
            tok in CELL_TOKENS
            or tok == "nl"
            or tok in (_OTSL_OPEN, _OTSL_CLOSE)
            or _LOC_RE.match(tok) is not None
        )
        if ends_run:
            return tuple(content), j
        content.append(tok)
    # Ran off the end of the stream (or *start* was already past it).
    return tuple(content), max(start, len(tokens))
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _parse_doctags_stream(tokens: list[str]) -> _StreamState:
    """Split a DocTags token stream into structural rows and anchor cells.

    ``<otsl>`` / ``</otsl>`` wrappers are skipped, ``nl`` opens a new row,
    and every cell token is appended to the current row. After an anchor
    token, an optional loc run and the following content run are consumed
    and recorded as a parsed cell. Any other token at structural position
    raises ``ValueError``. A final row that is still empty is dropped.
    """
    state = _StreamState()
    total = len(tokens)
    pos = 0
    while pos < total:
        tok = tokens[pos]
        pos += 1
        if tok in (_OTSL_OPEN, _OTSL_CLOSE):
            continue
        if tok == "nl":
            state.rows.append([])
            continue
        if tok not in CELL_TOKENS:
            msg = f"unexpected DocTags token {tok!r}"
            raise ValueError(msg)
        state.rows[-1].append(tok)
        if tok in ANCHOR_TOKENS:
            # Only anchors carry a bbox and content; continuations do not.
            bbox, content_start = _parse_loc_run(tokens, pos)
            content, pos = _parse_content_run(tokens, content_start)
            state.anchors.append(_ParsedCell(token=tok, bbox=bbox, content=content))
    if state.rows and not state.rows[-1]:
        state.rows.pop()
    return state
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Convert one decoded DocTags record into a ``TableSample``.

    ``payload["doctags"]`` and ``payload["filename"]`` are required (a
    missing key raises ``KeyError``); ``split`` and ``imgid`` are optional.
    Cell geometry comes from :func:`build_anchors`, while bbox and content
    come from the parsed stream. Every cell gets ``role="body"``.

    Raises ``ValueError`` when the stream is malformed, when the placement
    and parsed-anchor counts disagree, or when ``split`` is not recognised.
    """
    state = _parse_doctags_stream(list(payload["doctags"]))
    nrows, ncols, placements = build_anchors(state.rows)
    parsed_cells = state.anchors

    if len(placements) != len(parsed_cells):
        msg = (
            f"DocTags declares {len(placements)} anchors but the stream parsed "
            f"{len(parsed_cells)} cell contents"
        )
        raise ValueError(msg)

    cells: list[GridCell] = []
    for placement, parsed in zip(placements, parsed_cells, strict=True):
        cell = GridCell(
            row=placement.row,
            col=placement.col,
            rowspan=placement.rowspan,
            colspan=placement.colspan,
            tokens=parsed.content,
            bbox=parsed.bbox,
            role="body",
        )
        cells.append(cell)

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=tuple(cells),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get("imgid"),
    )
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def _normalize_split(value: object) -> Any:
|
|
241
|
+
if value in ("train", "val", "test"):
|
|
242
|
+
return value
|
|
243
|
+
if value is None:
|
|
244
|
+
return None
|
|
245
|
+
msg = f"unknown split value {value!r}"
|
|
246
|
+
raise ValueError(msg)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# ---------- IR → DocTags ----------
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _loc_tokens(bbox: BBox) -> list[str]:
|
|
253
|
+
return [f"<loc_{v}>" for v in bbox]
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _sample_to_payload(sample: TableSample) -> dict[str, Any]:
    """Build the jsonl record for one sample.

    The ``doctags`` list is the token grid wrapped in ``<otsl>`` /
    ``</otsl>``, with ``nl`` after every row. Each anchor token is followed
    by its loc tokens (only when the cell has a bbox) and then its content
    tokens; continuation tokens are emitted bare. ``split`` and ``imgid``
    are included only when they are not ``None``.
    """
    grid, anchored = build_token_grid(sample)
    anchor_at = {(cell.row, cell.col): cell for cell in anchored}

    def annotations(r: int, c: int) -> list[str]:
        # Loc + content tokens for the anchor at (r, c); nothing for a
        # position that holds a continuation token.
        cell = anchor_at.get((r, c))
        if cell is None:
            return []
        loc = _loc_tokens(cell.bbox) if cell.bbox is not None else []
        return [*loc, *cell.tokens]

    tokens: list[str] = [_OTSL_OPEN]
    for r, row in enumerate(grid):
        for c, structural in enumerate(row):
            tokens.append(structural)
            tokens.extend(annotations(r, c))
        tokens.append("nl")
    tokens.append(_OTSL_CLOSE)

    record: dict[str, Any] = {"filename": sample.filename, "doctags": tokens}
    if sample.split is not None:
        record["split"] = sample.split
    if sample.imgid is not None:
        record["imgid"] = sample.imgid
    return record
|