tablecodec 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablecodec/__init__.py +29 -0
- tablecodec/_invariants.py +311 -0
- tablecodec/cli.py +314 -0
- tablecodec/codecs/__init__.py +111 -0
- tablecodec/codecs/_base.py +79 -0
- tablecodec/codecs/_htmltable.py +510 -0
- tablecodec/codecs/_otslgrid.py +318 -0
- tablecodec/codecs/builtins.py +36 -0
- tablecodec/codecs/doctags.py +278 -0
- tablecodec/codecs/fintabnet.py +84 -0
- tablecodec/codecs/fintabnet_otsl.py +141 -0
- tablecodec/codecs/otsl.py +138 -0
- tablecodec/codecs/pubtables1m.py +161 -0
- tablecodec/codecs/pubtabnet.py +128 -0
- tablecodec/codecs/tablebank.py +76 -0
- tablecodec/codecs/tableformer.py +80 -0
- tablecodec/io.py +91 -0
- tablecodec/ir.py +101 -0
- tablecodec/loss.py +105 -0
- tablecodec/py.typed +0 -0
- tablecodec/teds.py +243 -0
- tablecodec/validate.py +185 -0
- tablecodec-0.0.18.dist-info/METADATA +200 -0
- tablecodec-0.0.18.dist-info/RECORD +27 -0
- tablecodec-0.0.18.dist-info/WHEEL +4 -0
- tablecodec-0.0.18.dist-info/entry_points.txt +2 -0
- tablecodec-0.0.18.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""FinTabNet (original) codec.
|
|
2
|
+
|
|
3
|
+
FinTabNet's original IBM annotations encode table structure with the
|
|
4
|
+
same HTML-token scheme as PubTabNet 2.0, differing only in the
|
|
5
|
+
record-level identifier: ``table_id`` instead of ``imgid``. The shared
|
|
6
|
+
machinery lives in :mod:`._htmltable`; this module just sets
|
|
7
|
+
``id_field="table_id"`` and a sniff discriminator.
|
|
8
|
+
|
|
9
|
+
Record shape::
|
|
10
|
+
|
|
11
|
+
{
|
|
12
|
+
"filename": "...",
|
|
13
|
+
"split": "train" | "val" | "test", # optional
|
|
14
|
+
"table_id": 0,
|
|
15
|
+
"html": {
|
|
16
|
+
"structure": {"tokens": [...]},
|
|
17
|
+
"cells": [{"tokens": [...], "bbox": [x0, y0, x1, y1]}, ...],
|
|
18
|
+
},
|
|
19
|
+
}
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import json
|
|
25
|
+
from collections.abc import Iterable, Iterator
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from typing import IO, Any
|
|
28
|
+
|
|
29
|
+
from tablecodec.codecs._htmltable import (
|
|
30
|
+
parse_html_table,
|
|
31
|
+
serialize_html_table,
|
|
32
|
+
sniff_html_table,
|
|
33
|
+
)
|
|
34
|
+
from tablecodec.ir import TableSample
|
|
35
|
+
|
|
36
|
+
__all__ = ["FinTabNetCodec"]
|
|
37
|
+
|
|
38
|
+
_ID_FIELD = "table_id"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True, slots=True)
|
|
42
|
+
class FinTabNetCodec:
|
|
43
|
+
"""Codec for the FinTabNet (original) jsonl format."""
|
|
44
|
+
|
|
45
|
+
name: str = "fintabnet"
|
|
46
|
+
spec_version: str = "1.0.0"
|
|
47
|
+
media_type: str = "application/jsonl"
|
|
48
|
+
writable: bool = True
|
|
49
|
+
|
|
50
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
51
|
+
for line_no, raw in enumerate(source, start=1):
|
|
52
|
+
line = raw.strip()
|
|
53
|
+
if not line:
|
|
54
|
+
continue
|
|
55
|
+
try:
|
|
56
|
+
payload: dict[str, Any] = json.loads(line)
|
|
57
|
+
except json.JSONDecodeError as exc:
|
|
58
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
59
|
+
raise ValueError(msg) from exc
|
|
60
|
+
try:
|
|
61
|
+
yield parse_html_table(payload, id_field=_ID_FIELD)
|
|
62
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
63
|
+
msg = f"malformed FinTabNet record at line {line_no}: {exc}"
|
|
64
|
+
raise ValueError(msg) from exc
|
|
65
|
+
|
|
66
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
67
|
+
for sample in samples:
|
|
68
|
+
sink.write(
|
|
69
|
+
json.dumps(serialize_html_table(sample, id_field=_ID_FIELD), ensure_ascii=False)
|
|
70
|
+
)
|
|
71
|
+
sink.write("\n")
|
|
72
|
+
|
|
73
|
+
def lossy_read(self) -> frozenset[str]:
|
|
74
|
+
# Same HTML-token model as PubTabNet 2.0: nothing dropped on read.
|
|
75
|
+
return frozenset()
|
|
76
|
+
|
|
77
|
+
def lossy_write(self) -> frozenset[str]:
|
|
78
|
+
# IR ``extras`` has no canonical home in the FinTabNet schema.
|
|
79
|
+
return frozenset({"extras"})
|
|
80
|
+
|
|
81
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
82
|
+
# Require the table_id key so a PubTabNet (imgid) record is not
|
|
83
|
+
# mis-detected as FinTabNet.
|
|
84
|
+
return sniff_html_table(source, require_field=_ID_FIELD)
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""FinTabNet_OTSL codec.
|
|
2
|
+
|
|
3
|
+
FinTabNet_OTSL (Docling project, HF ``ds4sd/FinTabNet_OTSL``) is the
|
|
4
|
+
FinTabNet corpus re-encoded in OTSL. Compared to the plain
|
|
5
|
+
``otsl-1.0.0`` codec it adds FinTabNet provenance:
|
|
6
|
+
|
|
7
|
+
- ``table_id`` as the record identifier (mapped onto the IR ``imgid``),
|
|
8
|
+
like the ``fintabnet`` codec.
|
|
9
|
+
- an ``extras`` dict (carrying e.g. ``otsl_raw``, the original OTSL
|
|
10
|
+
markup string). This codec is the only one that **round-trips** IR
|
|
11
|
+
``extras``, so ``extras`` is deliberately absent from ``lossy_write``.
|
|
12
|
+
|
|
13
|
+
Structure / cell handling is shared with OTSL via :mod:`._otslgrid`.
|
|
14
|
+
|
|
15
|
+
Record shape::
|
|
16
|
+
|
|
17
|
+
{
|
|
18
|
+
"filename": "...",
|
|
19
|
+
"split": "train" | "val" | "test", # optional
|
|
20
|
+
"table_id": 0,
|
|
21
|
+
"otsl": ["fcel", "fcel", "nl", ...],
|
|
22
|
+
"cells": [{"tokens": ["a"], "bbox": [x0, y0, x1, y1]}, ...],
|
|
23
|
+
"extras": {"otsl_raw": "fcel fcel nl ...", ...} # optional
|
|
24
|
+
}
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import json
|
|
30
|
+
from collections.abc import Iterable, Iterator
|
|
31
|
+
from dataclasses import dataclass
|
|
32
|
+
from typing import IO, Any, cast
|
|
33
|
+
|
|
34
|
+
from tablecodec.codecs._otslgrid import cells_to_otsl, otsl_to_cells
|
|
35
|
+
from tablecodec.ir import TableSample
|
|
36
|
+
|
|
37
|
+
__all__ = ["FinTabNetOTSLCodec"]
|
|
38
|
+
|
|
39
|
+
_ID_FIELD = "table_id"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True, slots=True)
|
|
43
|
+
class FinTabNetOTSLCodec:
|
|
44
|
+
"""Codec for the FinTabNet_OTSL jsonl format (OTSL + table_id + extras)."""
|
|
45
|
+
|
|
46
|
+
name: str = "fintabnet-otsl"
|
|
47
|
+
spec_version: str = "1.0.0"
|
|
48
|
+
media_type: str = "application/jsonl"
|
|
49
|
+
writable: bool = True
|
|
50
|
+
|
|
51
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
52
|
+
for line_no, raw in enumerate(source, start=1):
|
|
53
|
+
line = raw.strip()
|
|
54
|
+
if not line:
|
|
55
|
+
continue
|
|
56
|
+
try:
|
|
57
|
+
payload: dict[str, Any] = json.loads(line)
|
|
58
|
+
except json.JSONDecodeError as exc:
|
|
59
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
60
|
+
raise ValueError(msg) from exc
|
|
61
|
+
try:
|
|
62
|
+
yield _payload_to_sample(payload)
|
|
63
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
64
|
+
msg = f"malformed FinTabNet_OTSL record at line {line_no}: {exc}"
|
|
65
|
+
raise ValueError(msg) from exc
|
|
66
|
+
|
|
67
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
68
|
+
for sample in samples:
|
|
69
|
+
sink.write(json.dumps(_sample_to_payload(sample), ensure_ascii=False))
|
|
70
|
+
sink.write("\n")
|
|
71
|
+
|
|
72
|
+
def lossy_read(self) -> frozenset[str]:
|
|
73
|
+
# OTSL core has no header marker; role defaults to body. extras is
|
|
74
|
+
# preserved.
|
|
75
|
+
return frozenset({"role"})
|
|
76
|
+
|
|
77
|
+
def lossy_write(self) -> frozenset[str]:
|
|
78
|
+
# role is lost (OTSL core). extras is round-tripped, so unlike every
|
|
79
|
+
# other codec it is NOT listed here.
|
|
80
|
+
return frozenset({"role"})
|
|
81
|
+
|
|
82
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
83
|
+
pos = source.tell()
|
|
84
|
+
try:
|
|
85
|
+
for raw in source:
|
|
86
|
+
line = raw.strip()
|
|
87
|
+
if not line:
|
|
88
|
+
continue
|
|
89
|
+
try:
|
|
90
|
+
payload: object = json.loads(line)
|
|
91
|
+
except json.JSONDecodeError:
|
|
92
|
+
return False
|
|
93
|
+
if not isinstance(payload, dict):
|
|
94
|
+
return False
|
|
95
|
+
payload_dict = cast("dict[str, Any]", payload)
|
|
96
|
+
return "otsl" in payload_dict and _ID_FIELD in payload_dict
|
|
97
|
+
return False
|
|
98
|
+
finally:
|
|
99
|
+
source.seek(pos)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _normalize_split(value: object) -> Any:
    """Validate a record's ``split`` field.

    Returns *value* unchanged when it is ``"train"``, ``"val"``, ``"test"``
    or ``None``.

    Raises:
        ValueError: for any other value.
    """
    if value is None or value in ("train", "val", "test"):
        return value
    msg = f"unknown split value {value!r}"
    raise ValueError(msg)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Convert one decoded FinTabNet_OTSL record into a ``TableSample``.

    The ``otsl`` tokens and ``cells`` list are handed to ``otsl_to_cells``;
    ``table_id`` is stored as the sample's ``imgid``. ``extras`` is copied
    when it is a dict and replaced by an empty dict otherwise.
    """
    otsl_tokens = list(payload["otsl"])
    cell_records = list(payload["cells"])
    nrows, ncols, cells = otsl_to_cells(otsl_tokens, cell_records)

    raw_extras: object = payload.get("extras", {})
    extras: dict[str, object] = {}
    if isinstance(raw_extras, dict):
        # Shallow copy so the sample does not alias the decoded payload.
        extras.update(cast("dict[str, object]", raw_extras))

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=cells,
        split=_normalize_split(payload.get("split")),
        imgid=payload.get(_ID_FIELD),
        extras=extras,
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _sample_to_payload(sample: TableSample) -> dict[str, Any]:
    """Build the FinTabNet_OTSL JSON record for *sample*.

    ``split`` and the identifier are emitted only when not ``None``;
    ``extras`` is emitted (as a shallow copy) only when non-empty.
    """
    tokens, cell_payloads = cells_to_otsl(sample)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "otsl": tokens,
        "cells": cell_payloads,
    }
    # Optional scalar fields, in the order they appear in the output.
    for key, field_value in (("split", sample.split), (_ID_FIELD, sample.imgid)):
        if field_value is not None:
            record[key] = field_value
    if sample.extras:
        record["extras"] = dict(sample.extras)
    return record
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
"""OTSL 1.0 codec.
|
|
2
|
+
|
|
3
|
+
Implements the Optimized Table Structure Language (Lysak et al.,
|
|
4
|
+
ICDAR 2023, arXiv 2305.03393). OTSL uses a five-token vocabulary plus
|
|
5
|
+
a newline marker:
|
|
6
|
+
|
|
7
|
+
- ``fcel`` filled cell anchor (body content)
|
|
8
|
+
- ``ecel`` empty cell anchor
|
|
9
|
+
- ``lcel`` left-merged continuation — extends the colspan of the anchor
|
|
10
|
+
to its left
|
|
11
|
+
- ``ucel`` up-merged continuation — extends the rowspan of the anchor above
|
|
12
|
+
- ``xcel`` cross-merged continuation — extends both row and column
|
|
13
|
+
(the anchor sits at (r-1, c-1) of this position)
|
|
14
|
+
- ``nl`` newline / row separator
|
|
15
|
+
|
|
16
|
+
Square-table assumption (per the paper): every row produced by ``nl``
|
|
17
|
+
splits MUST have the same number of cell-position tokens. Jagged input
|
|
18
|
+
is rejected with a clear error.
|
|
19
|
+
|
|
20
|
+
This implementation is derived from the paper, not copied from the
|
|
21
|
+
official Docling OTSL reference implementation. Cross-validation
|
|
22
|
+
against the reference is wired separately in a later milestone.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import json
|
|
28
|
+
from collections.abc import Iterable, Iterator
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from typing import IO, Any, cast
|
|
31
|
+
|
|
32
|
+
from tablecodec.codecs._otslgrid import cells_to_otsl, otsl_to_cells
|
|
33
|
+
from tablecodec.ir import TableSample
|
|
34
|
+
|
|
35
|
+
__all__ = ["OTSL10Codec"]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True, slots=True)
|
|
39
|
+
class OTSL10Codec:
|
|
40
|
+
"""Codec for the OTSL 1.0 jsonl format."""
|
|
41
|
+
|
|
42
|
+
name: str = "otsl-1.0.0"
|
|
43
|
+
spec_version: str = "1.0.0"
|
|
44
|
+
media_type: str = "application/jsonl"
|
|
45
|
+
writable: bool = True
|
|
46
|
+
|
|
47
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
48
|
+
for line_no, raw in enumerate(source, start=1):
|
|
49
|
+
line = raw.strip()
|
|
50
|
+
if not line:
|
|
51
|
+
continue
|
|
52
|
+
try:
|
|
53
|
+
payload: dict[str, Any] = json.loads(line)
|
|
54
|
+
except json.JSONDecodeError as exc:
|
|
55
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
56
|
+
raise ValueError(msg) from exc
|
|
57
|
+
try:
|
|
58
|
+
yield _payload_to_sample(payload)
|
|
59
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
60
|
+
msg = f"malformed OTSL 1.0 record at line {line_no}: {exc}"
|
|
61
|
+
raise ValueError(msg) from exc
|
|
62
|
+
|
|
63
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
64
|
+
for sample in samples:
|
|
65
|
+
sink.write(json.dumps(_sample_to_payload(sample), ensure_ascii=False))
|
|
66
|
+
sink.write("\n")
|
|
67
|
+
|
|
68
|
+
def lossy_read(self) -> frozenset[str]:
|
|
69
|
+
# OTSL has no header/body distinction in its grammar; reads default
|
|
70
|
+
# every cell to role="body". This is a real loss when the source
|
|
71
|
+
# carried a header/body tag.
|
|
72
|
+
return frozenset({"role"})
|
|
73
|
+
|
|
74
|
+
def lossy_write(self) -> frozenset[str]:
|
|
75
|
+
# role: collapsed to "body" on write.
|
|
76
|
+
# extras: no canonical home in OTSL.
|
|
77
|
+
return frozenset({"extras", "role"})
|
|
78
|
+
|
|
79
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
80
|
+
pos = source.tell()
|
|
81
|
+
try:
|
|
82
|
+
for raw in source:
|
|
83
|
+
line = raw.strip()
|
|
84
|
+
if not line:
|
|
85
|
+
continue
|
|
86
|
+
try:
|
|
87
|
+
payload: object = json.loads(line)
|
|
88
|
+
except json.JSONDecodeError:
|
|
89
|
+
return False
|
|
90
|
+
return _looks_like_otsl(payload)
|
|
91
|
+
return False
|
|
92
|
+
finally:
|
|
93
|
+
source.seek(pos)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _looks_like_otsl(payload: object) -> bool:
    """Return True when *payload* is a dict holding both ``otsl`` and ``cells``."""
    if isinstance(payload, dict):
        record = cast("dict[str, Any]", payload)
        return all(key in record for key in ("otsl", "cells"))
    return False
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ---------- payload <-> sample (delegates to _otslgrid) ----------
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Convert one decoded OTSL 1.0 record into a ``TableSample``.

    The ``otsl`` tokens and ``cells`` list are handed to ``otsl_to_cells``,
    which supplies the grid dimensions and the cells.
    """
    otsl_tokens = list(payload["otsl"])
    cell_records = list(payload["cells"])
    nrows, ncols, cells = otsl_to_cells(otsl_tokens, cell_records)
    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=cells,
        split=_normalize_split(payload.get("split")),
        imgid=payload.get("imgid"),
    )
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _normalize_split(value: object) -> Any:
    """Return *value* if it is a known split name or ``None``.

    Raises:
        ValueError: if *value* is anything other than ``"train"``,
            ``"val"``, ``"test"`` or ``None``.
    """
    known_splits = ("train", "val", "test")
    if value is not None and value not in known_splits:
        msg = f"unknown split value {value!r}"
        raise ValueError(msg)
    return value
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _sample_to_payload(sample: TableSample) -> dict[str, Any]:
    """Build the OTSL 1.0 JSON record for *sample*.

    ``split`` and ``imgid`` are emitted only when they are not ``None``.
    """
    tokens, cell_payloads = cells_to_otsl(sample)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "otsl": tokens,
        "cells": cell_payloads,
    }
    # Optional fields, in the order they appear in the output.
    for key, field_value in (("split", sample.split), ("imgid", sample.imgid)):
        if field_value is not None:
            record[key] = field_value
    return record
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""PubTables-1M codec (read-only).
|
|
2
|
+
|
|
3
|
+
PubTables-1M (Microsoft, table-transformer) is an object-detection
|
|
4
|
+
format: each cell carries explicit grid coordinates and a bbox, in
|
|
5
|
+
detection order rather than reading order. This codec READS that into
|
|
6
|
+
the IR (normalising to row-major order) and is READ-ONLY — ``write``
|
|
7
|
+
raises ``NotImplementedError`` and ``writable`` is ``False`` (ADR 0002).
|
|
8
|
+
|
|
9
|
+
Canonical jsonl record shape::
|
|
10
|
+
|
|
11
|
+
{
|
|
12
|
+
"filename": "...",
|
|
13
|
+
"split": "train" | "val" | "test", # optional
|
|
14
|
+
"imgid": 0, # optional
|
|
15
|
+
"nrows": 2, # optional; derived from cells if absent
|
|
16
|
+
"ncols": 2, # optional; derived from cells if absent
|
|
17
|
+
"cells": [
|
|
18
|
+
{
|
|
19
|
+
"row": 0,
|
|
20
|
+
"col": 0,
|
|
21
|
+
"rowspan": 1,
|
|
22
|
+
"colspan": 1,
|
|
23
|
+
"bbox": [x0, y0, x1, y1],
|
|
24
|
+
"tokens": ["..."],
|
|
25
|
+
},
|
|
26
|
+
..., # any order
|
|
27
|
+
],
|
|
28
|
+
}
|
|
29
|
+
"""
|
|
30
|
+
|
|
31
|
+
from __future__ import annotations
|
|
32
|
+
|
|
33
|
+
import json
|
|
34
|
+
from collections.abc import Iterable, Iterator
|
|
35
|
+
from dataclasses import dataclass
|
|
36
|
+
from typing import IO, Any, Literal, cast
|
|
37
|
+
|
|
38
|
+
from tablecodec.ir import BBox, GridCell, TableSample
|
|
39
|
+
|
|
40
|
+
__all__ = ["PubTables1MCodec"]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass(frozen=True, slots=True)
|
|
44
|
+
class PubTables1MCodec:
|
|
45
|
+
"""Read-only codec for the PubTables-1M object-detection format."""
|
|
46
|
+
|
|
47
|
+
name: str = "pubtables-1m"
|
|
48
|
+
spec_version: str = "1.0.0"
|
|
49
|
+
media_type: str = "application/jsonl"
|
|
50
|
+
writable: bool = False
|
|
51
|
+
|
|
52
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
53
|
+
for line_no, raw in enumerate(source, start=1):
|
|
54
|
+
line = raw.strip()
|
|
55
|
+
if not line:
|
|
56
|
+
continue
|
|
57
|
+
try:
|
|
58
|
+
payload: dict[str, Any] = json.loads(line)
|
|
59
|
+
except json.JSONDecodeError as exc:
|
|
60
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
61
|
+
raise ValueError(msg) from exc
|
|
62
|
+
try:
|
|
63
|
+
yield _payload_to_sample(payload)
|
|
64
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
65
|
+
msg = f"malformed PubTables-1M record at line {line_no}: {exc}"
|
|
66
|
+
raise ValueError(msg) from exc
|
|
67
|
+
|
|
68
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
69
|
+
msg = "pubtables-1m is a read-only codec (object-detection format); write is unsupported"
|
|
70
|
+
raise NotImplementedError(msg)
|
|
71
|
+
|
|
72
|
+
def lossy_read(self) -> frozenset[str]:
|
|
73
|
+
# Our canonical jsonl keeps every IR field; nothing dropped.
|
|
74
|
+
return frozenset()
|
|
75
|
+
|
|
76
|
+
def lossy_write(self) -> frozenset[str]:
|
|
77
|
+
# Never consulted: analyze_loss short-circuits on writable=False.
|
|
78
|
+
return frozenset()
|
|
79
|
+
|
|
80
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
81
|
+
pos = source.tell()
|
|
82
|
+
try:
|
|
83
|
+
for raw in source:
|
|
84
|
+
line = raw.strip()
|
|
85
|
+
if not line:
|
|
86
|
+
continue
|
|
87
|
+
try:
|
|
88
|
+
payload: object = json.loads(line)
|
|
89
|
+
except json.JSONDecodeError:
|
|
90
|
+
return False
|
|
91
|
+
return _looks_like_pubtables1m(payload)
|
|
92
|
+
return False
|
|
93
|
+
finally:
|
|
94
|
+
source.seek(pos)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _looks_like_pubtables1m(payload: object) -> bool:
    """Return True when *payload* has the PubTables-1M record shape.

    That means: a dict with no ``html`` key, whose ``cells`` is a non-empty
    list and whose first cell is a dict holding both ``row`` and ``col``.
    """
    if not isinstance(payload, dict):
        return False
    record = cast("dict[str, Any]", payload)
    cells: object = record.get("cells")
    # An "html" key rules out PubTabNet/FinTabNet/TableFormer/TableBank.
    plausible = "html" not in record and isinstance(cells, list) and bool(cells)
    if not plausible:
        return False
    head = cast("list[object]", cells)[0]
    return isinstance(head, dict) and "row" in head and "col" in head
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _normalize_split(value: object) -> Literal["train", "val", "test"] | None:
|
|
111
|
+
if value == "train":
|
|
112
|
+
return "train"
|
|
113
|
+
if value == "val":
|
|
114
|
+
return "val"
|
|
115
|
+
if value == "test":
|
|
116
|
+
return "test"
|
|
117
|
+
if value is None:
|
|
118
|
+
return None
|
|
119
|
+
msg = f"unknown split value {value!r}"
|
|
120
|
+
raise ValueError(msg)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _cell_from_payload(cell_payload: dict[str, Any]) -> GridCell:
    """Build a ``GridCell`` from one decoded PubTables-1M cell record.

    ``row`` and ``col`` are required. ``rowspan`` and ``colspan`` default
    to 1, ``tokens`` to an empty tuple and ``bbox`` to ``None``. Any
    ``role`` other than ``"header"`` is treated as ``"body"``. Bbox
    coordinates are converted with ``int()``.

    Raises:
        TypeError: if *cell_payload* is not a dict.
        ValueError: if ``bbox`` has fewer than four coordinates, or a
            numeric field cannot be converted to ``int``.
        KeyError: if ``row`` or ``col`` is missing.
    """
    # The reader only translates KeyError/ValueError/TypeError into its
    # line-numbered error, so a non-dict cell must not surface as the
    # AttributeError that ``.get`` would raise.
    if not isinstance(cell_payload, dict):
        msg = f"cell must be a JSON object, got {type(cell_payload).__name__}"
        raise TypeError(msg)

    bbox_raw = cell_payload.get("bbox")
    bbox: BBox | None = None
    if bbox_raw is not None:
        try:
            bbox = (int(bbox_raw[0]), int(bbox_raw[1]), int(bbox_raw[2]), int(bbox_raw[3]))
        except IndexError as exc:
            # Same reasoning: a short bbox becomes a ValueError, not IndexError.
            msg = f"bbox must have four coordinates, got {bbox_raw!r}"
            raise ValueError(msg) from exc

    role_raw = cell_payload.get("role", "body")
    role: Literal["header", "body"] = "header" if role_raw == "header" else "body"
    return GridCell(
        row=int(cell_payload["row"]),
        col=int(cell_payload["col"]),
        rowspan=int(cell_payload.get("rowspan", 1)),
        colspan=int(cell_payload.get("colspan", 1)),
        tokens=tuple(cell_payload.get("tokens", ())),
        bbox=bbox,
        role=role,
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Convert one decoded PubTables-1M record into a ``TableSample``.

    Cells are sorted by ``(row, col)``. ``nrows`` / ``ncols`` come from the
    record when present; otherwise each is the largest ``row + rowspan`` /
    ``col + colspan`` over the cells (0 when there are none).
    """
    # Object-detection order is arbitrary; the IR is row-major.
    cells = sorted(
        (_cell_from_payload(c) for c in list(payload["cells"])),
        key=lambda cell: (cell.row, cell.col),
    )

    declared_rows = payload.get("nrows")
    declared_cols = payload.get("ncols")
    nrows = (
        declared_rows
        if declared_rows is not None
        else max((cell.row + cell.rowspan for cell in cells), default=0)
    )
    ncols = (
        declared_cols
        if declared_cols is not None
        else max((cell.col + cell.colspan for cell in cells), default=0)
    )

    return TableSample(
        filename=str(payload["filename"]),
        nrows=int(nrows),
        ncols=int(ncols),
        cells=tuple(cells),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get("imgid"),
    )
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""PubTabNet codecs (1.0.0 and 2.0.0).
|
|
2
|
+
|
|
3
|
+
Both share the HTML-token table machinery in :mod:`._htmltable`. The
|
|
4
|
+
only difference is bbox handling:
|
|
5
|
+
|
|
6
|
+
- ``pubtabnet-2.0.0`` reads and writes per-cell ``bbox``.
|
|
7
|
+
- ``pubtabnet-1.0.0`` has no bbox: it drops bbox on read and omits it on
|
|
8
|
+
write (declared honestly in ``lossy_read`` / ``lossy_write``).
|
|
9
|
+
|
|
10
|
+
PubTabNet jsonl record shape::
|
|
11
|
+
|
|
12
|
+
{
|
|
13
|
+
"filename": "PMC...",
|
|
14
|
+
"split": "train" | "val" | "test", # optional
|
|
15
|
+
"imgid": 0, # optional
|
|
16
|
+
"html": {
|
|
17
|
+
"structure": {"tokens": ["<thead>", "<tr>", "<td>", "</td>", ...]},
|
|
18
|
+
"cells": [
|
|
19
|
+
{"tokens": ["a"], "bbox": [x0, y0, x1, y1]},
|
|
20
|
+
{"tokens": []}, # empty cells may omit bbox
|
|
21
|
+
...,
|
|
22
|
+
],
|
|
23
|
+
},
|
|
24
|
+
}
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import json
|
|
30
|
+
from collections.abc import Iterable, Iterator
|
|
31
|
+
from dataclasses import dataclass
|
|
32
|
+
from typing import IO, Any
|
|
33
|
+
|
|
34
|
+
from tablecodec.codecs._htmltable import (
|
|
35
|
+
parse_html_table,
|
|
36
|
+
serialize_html_table,
|
|
37
|
+
sniff_html_table,
|
|
38
|
+
)
|
|
39
|
+
from tablecodec.ir import TableSample
|
|
40
|
+
|
|
41
|
+
__all__ = ["PubTabNet10Codec", "PubTabNet20Codec"]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass(frozen=True, slots=True)
|
|
45
|
+
class PubTabNet20Codec:
|
|
46
|
+
"""Codec for the PubTabNet 2.0 jsonl format."""
|
|
47
|
+
|
|
48
|
+
name: str = "pubtabnet-2.0.0"
|
|
49
|
+
spec_version: str = "2.0.0"
|
|
50
|
+
media_type: str = "application/jsonl"
|
|
51
|
+
writable: bool = True
|
|
52
|
+
|
|
53
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
54
|
+
for line_no, raw in enumerate(source, start=1):
|
|
55
|
+
line = raw.strip()
|
|
56
|
+
if not line:
|
|
57
|
+
continue
|
|
58
|
+
try:
|
|
59
|
+
payload: dict[str, Any] = json.loads(line)
|
|
60
|
+
except json.JSONDecodeError as exc:
|
|
61
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
62
|
+
raise ValueError(msg) from exc
|
|
63
|
+
try:
|
|
64
|
+
yield parse_html_table(payload)
|
|
65
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
66
|
+
msg = f"malformed PubTabNet 2.0 record at line {line_no}: {exc}"
|
|
67
|
+
raise ValueError(msg) from exc
|
|
68
|
+
|
|
69
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
70
|
+
for sample in samples:
|
|
71
|
+
sink.write(json.dumps(serialize_html_table(sample), ensure_ascii=False))
|
|
72
|
+
sink.write("\n")
|
|
73
|
+
|
|
74
|
+
def lossy_read(self) -> frozenset[str]:
|
|
75
|
+
# PubTabNet 2.0 keeps filename, split, imgid, tokens, bbox,
|
|
76
|
+
# rowspan, colspan, header/body. Nothing dropped on read.
|
|
77
|
+
return frozenset()
|
|
78
|
+
|
|
79
|
+
def lossy_write(self) -> frozenset[str]:
|
|
80
|
+
# IR ``extras`` has no canonical home in the PubTabNet schema.
|
|
81
|
+
return frozenset({"extras"})
|
|
82
|
+
|
|
83
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
84
|
+
return sniff_html_table(source, require_no_bbox=False)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass(frozen=True, slots=True)
|
|
88
|
+
class PubTabNet10Codec:
|
|
89
|
+
"""Codec for the PubTabNet 1.0.0 jsonl format (no bbox)."""
|
|
90
|
+
|
|
91
|
+
name: str = "pubtabnet-1.0.0"
|
|
92
|
+
spec_version: str = "1.0.0"
|
|
93
|
+
media_type: str = "application/jsonl"
|
|
94
|
+
writable: bool = True
|
|
95
|
+
|
|
96
|
+
def read(self, source: IO[str]) -> Iterator[TableSample]:
|
|
97
|
+
for line_no, raw in enumerate(source, start=1):
|
|
98
|
+
line = raw.strip()
|
|
99
|
+
if not line:
|
|
100
|
+
continue
|
|
101
|
+
try:
|
|
102
|
+
payload: dict[str, Any] = json.loads(line)
|
|
103
|
+
except json.JSONDecodeError as exc:
|
|
104
|
+
msg = f"invalid JSON at line {line_no}: {exc.msg}"
|
|
105
|
+
raise ValueError(msg) from exc
|
|
106
|
+
try:
|
|
107
|
+
yield parse_html_table(payload, drop_bbox=True)
|
|
108
|
+
except (KeyError, ValueError, TypeError) as exc:
|
|
109
|
+
msg = f"malformed PubTabNet 1.0 record at line {line_no}: {exc}"
|
|
110
|
+
raise ValueError(msg) from exc
|
|
111
|
+
|
|
112
|
+
def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
|
|
113
|
+
for sample in samples:
|
|
114
|
+
sink.write(
|
|
115
|
+
json.dumps(serialize_html_table(sample, include_bbox=False), ensure_ascii=False)
|
|
116
|
+
)
|
|
117
|
+
sink.write("\n")
|
|
118
|
+
|
|
119
|
+
def lossy_read(self) -> frozenset[str]:
|
|
120
|
+
# bbox is not in the 1.0 source format; if the file is 2.0-shaped,
|
|
121
|
+
# bbox is dropped silently.
|
|
122
|
+
return frozenset({"bbox"})
|
|
123
|
+
|
|
124
|
+
def lossy_write(self) -> frozenset[str]:
|
|
125
|
+
return frozenset({"bbox", "extras"})
|
|
126
|
+
|
|
127
|
+
def sniff(self, source: IO[str]) -> bool:
|
|
128
|
+
return sniff_html_table(source, require_no_bbox=True)
|