tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,84 @@
1
+ """FinTabNet (original) codec.
2
+
3
+ FinTabNet's original IBM annotations encode table structure with the
4
+ same HTML-token scheme as PubTabNet 2.0, differing only in the
5
+ record-level identifier: ``table_id`` instead of ``imgid``. The shared
6
+ machinery lives in :mod:`._htmltable`; this module just sets
7
+ ``id_field="table_id"`` and a sniff discriminator.
8
+
9
+ Record shape::
10
+
11
+ {
12
+ "filename": "...",
13
+ "split": "train" | "val" | "test", # optional
14
+ "table_id": 0,
15
+ "html": {
16
+ "structure": {"tokens": [...]},
17
+ "cells": [{"tokens": [...], "bbox": [x0, y0, x1, y1]}, ...],
18
+ },
19
+ }
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import json
25
+ from collections.abc import Iterable, Iterator
26
+ from dataclasses import dataclass
27
+ from typing import IO, Any
28
+
29
+ from tablecodec.codecs._htmltable import (
30
+ parse_html_table,
31
+ serialize_html_table,
32
+ sniff_html_table,
33
+ )
34
+ from tablecodec.ir import TableSample
35
+
36
+ __all__ = ["FinTabNetCodec"]
37
+
38
+ _ID_FIELD = "table_id"
39
+
40
+
41
+ @dataclass(frozen=True, slots=True)
42
+ class FinTabNetCodec:
43
+ """Codec for the FinTabNet (original) jsonl format."""
44
+
45
+ name: str = "fintabnet"
46
+ spec_version: str = "1.0.0"
47
+ media_type: str = "application/jsonl"
48
+ writable: bool = True
49
+
50
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
51
+ for line_no, raw in enumerate(source, start=1):
52
+ line = raw.strip()
53
+ if not line:
54
+ continue
55
+ try:
56
+ payload: dict[str, Any] = json.loads(line)
57
+ except json.JSONDecodeError as exc:
58
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
59
+ raise ValueError(msg) from exc
60
+ try:
61
+ yield parse_html_table(payload, id_field=_ID_FIELD)
62
+ except (KeyError, ValueError, TypeError) as exc:
63
+ msg = f"malformed FinTabNet record at line {line_no}: {exc}"
64
+ raise ValueError(msg) from exc
65
+
66
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
67
+ for sample in samples:
68
+ sink.write(
69
+ json.dumps(serialize_html_table(sample, id_field=_ID_FIELD), ensure_ascii=False)
70
+ )
71
+ sink.write("\n")
72
+
73
+ def lossy_read(self) -> frozenset[str]:
74
+ # Same HTML-token model as PubTabNet 2.0: nothing dropped on read.
75
+ return frozenset()
76
+
77
+ def lossy_write(self) -> frozenset[str]:
78
+ # IR ``extras`` has no canonical home in the FinTabNet schema.
79
+ return frozenset({"extras"})
80
+
81
+ def sniff(self, source: IO[str]) -> bool:
82
+ # Require the table_id key so a PubTabNet (imgid) record is not
83
+ # mis-detected as FinTabNet.
84
+ return sniff_html_table(source, require_field=_ID_FIELD)
@@ -0,0 +1,141 @@
1
+ """FinTabNet_OTSL codec.
2
+
3
+ FinTabNet_OTSL (Docling project, HF ``ds4sd/FinTabNet_OTSL``) is the
4
+ FinTabNet corpus re-encoded in OTSL. Compared to the plain
5
+ ``otsl-1.0.0`` codec it adds FinTabNet provenance:
6
+
7
+ - ``table_id`` as the record identifier (mapped onto the IR ``imgid``),
8
+ like the ``fintabnet`` codec.
9
+ - an ``extras`` dict (carrying e.g. ``otsl_raw``, the original OTSL
10
+ markup string). This codec is the only one that **round-trips** IR
11
+ ``extras``, so ``extras`` is deliberately absent from ``lossy_write``.
12
+
13
+ Structure / cell handling is shared with OTSL via :mod:`._otslgrid`.
14
+
15
+ Record shape::
16
+
17
+ {
18
+ "filename": "...",
19
+ "split": "train" | "val" | "test", # optional
20
+ "table_id": 0,
21
+ "otsl": ["fcel", "fcel", "nl", ...],
22
+ "cells": [{"tokens": ["a"], "bbox": [x0, y0, x1, y1]}, ...],
23
+ "extras": {"otsl_raw": "fcel fcel nl ...", ...} # optional
24
+ }
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ from collections.abc import Iterable, Iterator
31
+ from dataclasses import dataclass
32
+ from typing import IO, Any, cast
33
+
34
+ from tablecodec.codecs._otslgrid import cells_to_otsl, otsl_to_cells
35
+ from tablecodec.ir import TableSample
36
+
37
+ __all__ = ["FinTabNetOTSLCodec"]
38
+
39
+ _ID_FIELD = "table_id"
40
+
41
+
42
+ @dataclass(frozen=True, slots=True)
43
+ class FinTabNetOTSLCodec:
44
+ """Codec for the FinTabNet_OTSL jsonl format (OTSL + table_id + extras)."""
45
+
46
+ name: str = "fintabnet-otsl"
47
+ spec_version: str = "1.0.0"
48
+ media_type: str = "application/jsonl"
49
+ writable: bool = True
50
+
51
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
52
+ for line_no, raw in enumerate(source, start=1):
53
+ line = raw.strip()
54
+ if not line:
55
+ continue
56
+ try:
57
+ payload: dict[str, Any] = json.loads(line)
58
+ except json.JSONDecodeError as exc:
59
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
60
+ raise ValueError(msg) from exc
61
+ try:
62
+ yield _payload_to_sample(payload)
63
+ except (KeyError, ValueError, TypeError) as exc:
64
+ msg = f"malformed FinTabNet_OTSL record at line {line_no}: {exc}"
65
+ raise ValueError(msg) from exc
66
+
67
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
68
+ for sample in samples:
69
+ sink.write(json.dumps(_sample_to_payload(sample), ensure_ascii=False))
70
+ sink.write("\n")
71
+
72
+ def lossy_read(self) -> frozenset[str]:
73
+ # OTSL core has no header marker; role defaults to body. extras is
74
+ # preserved.
75
+ return frozenset({"role"})
76
+
77
+ def lossy_write(self) -> frozenset[str]:
78
+ # role is lost (OTSL core). extras is round-tripped, so unlike every
79
+ # other codec it is NOT listed here.
80
+ return frozenset({"role"})
81
+
82
+ def sniff(self, source: IO[str]) -> bool:
83
+ pos = source.tell()
84
+ try:
85
+ for raw in source:
86
+ line = raw.strip()
87
+ if not line:
88
+ continue
89
+ try:
90
+ payload: object = json.loads(line)
91
+ except json.JSONDecodeError:
92
+ return False
93
+ if not isinstance(payload, dict):
94
+ return False
95
+ payload_dict = cast("dict[str, Any]", payload)
96
+ return "otsl" in payload_dict and _ID_FIELD in payload_dict
97
+ return False
98
+ finally:
99
+ source.seek(pos)
100
+
101
+
102
def _normalize_split(value: object) -> Any:
    """Return *value* if it is a known split name or ``None``.

    Raises:
        ValueError: for any value other than "train", "val", "test" or None.
    """
    if value is None:
        return None
    if value not in ("train", "val", "test"):
        raise ValueError(f"unknown split value {value!r}")
    return value
109
+
110
+
111
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Build a ``TableSample`` from one decoded FinTabNet_OTSL record.

    ``otsl``, ``cells`` and ``filename`` are required keys. ``split``,
    ``table_id`` and ``extras`` are optional; an ``extras`` value that is
    not a dict is replaced by an empty dict.
    """
    structure_tokens = list(payload["otsl"])
    cell_records = list(payload["cells"])
    nrows, ncols, cells = otsl_to_cells(structure_tokens, cell_records)

    raw_extras: object = payload.get("extras", {})
    extras: dict[str, object] = {}
    if isinstance(raw_extras, dict):
        # Shallow copy so the sample does not share the decoded dict.
        extras.update(cast("dict[str, object]", raw_extras))

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=cells,
        split=_normalize_split(payload.get("split")),
        # The record's table_id is stored in the IR's imgid slot.
        imgid=payload.get(_ID_FIELD),
        extras=extras,
    )
126
+
127
+
128
def _sample_to_payload(sample: TableSample) -> dict[str, Any]:
    """Convert *sample* to a JSON-serialisable FinTabNet_OTSL record.

    ``split`` and ``table_id`` are emitted only when not ``None``;
    ``extras`` only when non-empty.
    """
    tokens, cell_payloads = cells_to_otsl(sample)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "otsl": tokens,
        "cells": cell_payloads,
    }
    # Optional scalar fields, in the order they appear in the output.
    for key, value in (("split", sample.split), (_ID_FIELD, sample.imgid)):
        if value is not None:
            record[key] = value
    if sample.extras:
        record["extras"] = dict(sample.extras)
    return record
@@ -0,0 +1,138 @@
1
+ """OTSL 1.0 codec.
2
+
3
+ Implements the Optimized Table Structure Language (Lysak et al.,
4
+ ICDAR 2023, arXiv 2305.03393). OTSL uses a five-token vocabulary plus
5
+ a newline marker:
6
+
7
+ - ``fcel`` filled cell anchor (body content)
8
+ - ``ecel`` empty cell anchor
9
+ - ``lcel`` left-merged continuation — extends the colspan of the anchor
10
+ to its left
11
+ - ``ucel`` up-merged continuation — extends the rowspan of the anchor above
12
+ - ``xcel`` cross-merged continuation — extends both row and column
13
+ (the anchor sits at (r-1, c-1) of this position)
14
+ - ``nl`` newline / row separator
15
+
16
+ Square-table assumption (per the paper): every row produced by ``nl``
17
+ splits MUST have the same number of cell-position tokens. Jagged input
18
+ is rejected with a clear error.
19
+
20
+ This implementation is derived from the paper, not copied from the
21
+ official Docling OTSL reference implementation. Cross-validation
22
+ against the reference is wired separately in a later milestone.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ from collections.abc import Iterable, Iterator
29
+ from dataclasses import dataclass
30
+ from typing import IO, Any, cast
31
+
32
+ from tablecodec.codecs._otslgrid import cells_to_otsl, otsl_to_cells
33
+ from tablecodec.ir import TableSample
34
+
35
+ __all__ = ["OTSL10Codec"]
36
+
37
+
38
+ @dataclass(frozen=True, slots=True)
39
+ class OTSL10Codec:
40
+ """Codec for the OTSL 1.0 jsonl format."""
41
+
42
+ name: str = "otsl-1.0.0"
43
+ spec_version: str = "1.0.0"
44
+ media_type: str = "application/jsonl"
45
+ writable: bool = True
46
+
47
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
48
+ for line_no, raw in enumerate(source, start=1):
49
+ line = raw.strip()
50
+ if not line:
51
+ continue
52
+ try:
53
+ payload: dict[str, Any] = json.loads(line)
54
+ except json.JSONDecodeError as exc:
55
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
56
+ raise ValueError(msg) from exc
57
+ try:
58
+ yield _payload_to_sample(payload)
59
+ except (KeyError, ValueError, TypeError) as exc:
60
+ msg = f"malformed OTSL 1.0 record at line {line_no}: {exc}"
61
+ raise ValueError(msg) from exc
62
+
63
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
64
+ for sample in samples:
65
+ sink.write(json.dumps(_sample_to_payload(sample), ensure_ascii=False))
66
+ sink.write("\n")
67
+
68
+ def lossy_read(self) -> frozenset[str]:
69
+ # OTSL has no header/body distinction in its grammar; reads default
70
+ # every cell to role="body". This is a real loss when the source
71
+ # carried a header/body tag.
72
+ return frozenset({"role"})
73
+
74
+ def lossy_write(self) -> frozenset[str]:
75
+ # role: collapsed to "body" on write.
76
+ # extras: no canonical home in OTSL.
77
+ return frozenset({"extras", "role"})
78
+
79
+ def sniff(self, source: IO[str]) -> bool:
80
+ pos = source.tell()
81
+ try:
82
+ for raw in source:
83
+ line = raw.strip()
84
+ if not line:
85
+ continue
86
+ try:
87
+ payload: object = json.loads(line)
88
+ except json.JSONDecodeError:
89
+ return False
90
+ return _looks_like_otsl(payload)
91
+ return False
92
+ finally:
93
+ source.seek(pos)
94
+
95
+
96
def _looks_like_otsl(payload: object) -> bool:
    """Return True when *payload* is a dict with both ``otsl`` and ``cells`` keys."""
    if isinstance(payload, dict):
        record = cast("dict[str, Any]", payload)
        return all(key in record for key in ("otsl", "cells"))
    return False
101
+
102
+
103
+ # ---------- payload <-> sample (delegates to _otslgrid) ----------
104
+
105
+
106
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Build a ``TableSample`` from one decoded OTSL 1.0 record.

    ``otsl``, ``cells`` and ``filename`` are required keys; ``split`` and
    ``imgid`` are optional.
    """
    structure_tokens = list(payload["otsl"])
    cell_records = list(payload["cells"])
    nrows, ncols, cells = otsl_to_cells(structure_tokens, cell_records)
    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=cells,
        split=_normalize_split(payload.get("split")),
        imgid=payload.get("imgid"),
    )
116
+
117
+
118
def _normalize_split(value: object) -> Any:
    """Pass through a known split name or ``None``; reject anything else.

    Raises:
        ValueError: if *value* is not "train", "val", "test" or None.
    """
    recognised = value in ("train", "val", "test")
    if recognised or value is None:
        return value
    raise ValueError(f"unknown split value {value!r}")
125
+
126
+
127
def _sample_to_payload(sample: TableSample) -> dict[str, Any]:
    """Convert *sample* to a JSON-serialisable OTSL 1.0 record.

    ``split`` and ``imgid`` are emitted only when they are not ``None``.
    """
    tokens, cell_payloads = cells_to_otsl(sample)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "otsl": tokens,
        "cells": cell_payloads,
    }
    # Optional fields, in the order they appear in the output.
    for key, value in (("split", sample.split), ("imgid", sample.imgid)):
        if value is not None:
            record[key] = value
    return record
@@ -0,0 +1,161 @@
1
+ """PubTables-1M codec (read-only).
2
+
3
+ PubTables-1M (Microsoft, table-transformer) is an object-detection
4
+ format: each cell carries explicit grid coordinates and a bbox, in
5
+ detection order rather than reading order. This codec READS that into
6
+ the IR (normalising to row-major order) and is READ-ONLY — ``write``
7
+ raises ``NotImplementedError`` and ``writable`` is ``False`` (ADR 0002).
8
+
9
+ Canonical jsonl record shape::
10
+
11
+ {
12
+ "filename": "...",
13
+ "split": "train" | "val" | "test", # optional
14
+ "imgid": 0, # optional
15
+ "nrows": 2, # optional; derived from cells if absent
16
+ "ncols": 2, # optional; derived from cells if absent
17
+ "cells": [
18
+ {
19
+ "row": 0,
20
+ "col": 0,
21
+ "rowspan": 1,
22
+ "colspan": 1,
23
+ "bbox": [x0, y0, x1, y1],
24
+ "tokens": ["..."],
25
+ },
26
+ ..., # any order
27
+ ],
28
+ }
29
+ """
30
+
31
+ from __future__ import annotations
32
+
33
+ import json
34
+ from collections.abc import Iterable, Iterator
35
+ from dataclasses import dataclass
36
+ from typing import IO, Any, Literal, cast
37
+
38
+ from tablecodec.ir import BBox, GridCell, TableSample
39
+
40
+ __all__ = ["PubTables1MCodec"]
41
+
42
+
43
+ @dataclass(frozen=True, slots=True)
44
+ class PubTables1MCodec:
45
+ """Read-only codec for the PubTables-1M object-detection format."""
46
+
47
+ name: str = "pubtables-1m"
48
+ spec_version: str = "1.0.0"
49
+ media_type: str = "application/jsonl"
50
+ writable: bool = False
51
+
52
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
53
+ for line_no, raw in enumerate(source, start=1):
54
+ line = raw.strip()
55
+ if not line:
56
+ continue
57
+ try:
58
+ payload: dict[str, Any] = json.loads(line)
59
+ except json.JSONDecodeError as exc:
60
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
61
+ raise ValueError(msg) from exc
62
+ try:
63
+ yield _payload_to_sample(payload)
64
+ except (KeyError, ValueError, TypeError) as exc:
65
+ msg = f"malformed PubTables-1M record at line {line_no}: {exc}"
66
+ raise ValueError(msg) from exc
67
+
68
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
69
+ msg = "pubtables-1m is a read-only codec (object-detection format); write is unsupported"
70
+ raise NotImplementedError(msg)
71
+
72
+ def lossy_read(self) -> frozenset[str]:
73
+ # Our canonical jsonl keeps every IR field; nothing dropped.
74
+ return frozenset()
75
+
76
+ def lossy_write(self) -> frozenset[str]:
77
+ # Never consulted: analyze_loss short-circuits on writable=False.
78
+ return frozenset()
79
+
80
+ def sniff(self, source: IO[str]) -> bool:
81
+ pos = source.tell()
82
+ try:
83
+ for raw in source:
84
+ line = raw.strip()
85
+ if not line:
86
+ continue
87
+ try:
88
+ payload: object = json.loads(line)
89
+ except json.JSONDecodeError:
90
+ return False
91
+ return _looks_like_pubtables1m(payload)
92
+ return False
93
+ finally:
94
+ source.seek(pos)
95
+
96
+
97
def _looks_like_pubtables1m(payload: object) -> bool:
    """Return True when *payload* has the PubTables-1M record shape.

    That means a dict with no ``html`` key whose ``cells`` value is a
    non-empty list, and whose first cell is a dict with ``row`` and ``col``.
    """
    if not isinstance(payload, dict):
        return False
    record = cast("dict[str, Any]", payload)
    # An "html" key rules out PubTabNet/FinTabNet/TableFormer/TableBank.
    if "html" in record:
        return False
    cells: object = record.get("cells")
    if isinstance(cells, list) and cells:
        head = cast("list[object]", cells)[0]
        if isinstance(head, dict):
            return "row" in head and "col" in head
    return False
108
+
109
+
110
def _normalize_split(value: object) -> Literal["train", "val", "test"] | None:
    """Map *value* to the matching split literal, or ``None`` for ``None``.

    Raises:
        ValueError: if *value* equals none of "train", "val", "test" and is
            not None.
    """
    known_splits: tuple[Literal["train", "val", "test"], ...] = ("train", "val", "test")
    for known in known_splits:
        if value == known:
            # Return the literal itself rather than the caller's object.
            return known
    if value is None:
        return None
    raise ValueError(f"unknown split value {value!r}")
121
+
122
+
123
def _cell_from_payload(cell_payload: dict[str, Any]) -> GridCell:
    """Build a ``GridCell`` from one decoded cell record.

    ``row`` and ``col`` are required. ``rowspan`` and ``colspan`` default
    to 1, ``tokens`` to an empty tuple, ``bbox`` to ``None``, and any
    ``role`` other than "header" becomes "body".

    Raises:
        TypeError: if *cell_payload* is not a dict.
        ValueError: if ``bbox`` has fewer than four coordinates, or a
            numeric field cannot be converted to ``int``.
        KeyError: if ``row`` or ``col`` is missing.
    """
    # The record comes from untrusted JSON, so check the shape at runtime.
    # Without this a non-dict cell raises AttributeError, which the codec's
    # (KeyError, ValueError, TypeError) handler does not catch.
    raw_cell: object = cell_payload
    if not isinstance(raw_cell, dict):
        msg = f"cell must be a JSON object, got {type(raw_cell).__name__}"
        raise TypeError(msg)

    bbox_raw = cell_payload.get("bbox")
    bbox: BBox | None = None
    if bbox_raw is not None:
        try:
            bbox = (int(bbox_raw[0]), int(bbox_raw[1]), int(bbox_raw[2]), int(bbox_raw[3]))
        except IndexError as exc:
            # A short bbox would otherwise escape as an uncaught IndexError.
            msg = f"bbox needs 4 coordinates, got {bbox_raw!r}"
            raise ValueError(msg) from exc
    role_raw = cell_payload.get("role", "body")
    role: Literal["header", "body"] = "header" if role_raw == "header" else "body"
    return GridCell(
        row=int(cell_payload["row"]),
        col=int(cell_payload["col"]),
        rowspan=int(cell_payload.get("rowspan", 1)),
        colspan=int(cell_payload.get("colspan", 1)),
        tokens=tuple(cell_payload.get("tokens", ())),
        bbox=bbox,
        role=role,
    )
139
+
140
+
141
def _payload_to_sample(payload: dict[str, Any]) -> TableSample:
    """Build a ``TableSample`` from one decoded PubTables-1M record.

    Cells are sorted into row-major order. ``nrows`` / ``ncols`` come from
    the record when present; otherwise they are the largest
    ``row + rowspan`` / ``col + colspan`` over the cells (0 with no cells).
    """
    raw_cells = list(payload["cells"])
    # Object-detection order is arbitrary; the IR is row-major.
    ordered = sorted(
        (_cell_from_payload(raw) for raw in raw_cells),
        key=lambda cell: (cell.row, cell.col),
    )

    nrows = payload.get("nrows")
    ncols = payload.get("ncols")
    if nrows is None:
        nrows = max((cell.row + cell.rowspan for cell in ordered), default=0)
    if ncols is None:
        ncols = max((cell.col + cell.colspan for cell in ordered), default=0)

    return TableSample(
        filename=str(payload["filename"]),
        nrows=int(nrows),
        ncols=int(ncols),
        cells=tuple(ordered),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get("imgid"),
    )
@@ -0,0 +1,128 @@
1
+ """PubTabNet codecs (1.0.0 and 2.0.0).
2
+
3
+ Both share the HTML-token table machinery in :mod:`._htmltable`. The
4
+ only difference is bbox handling:
5
+
6
+ - ``pubtabnet-2.0.0`` reads and writes per-cell ``bbox``.
7
+ - ``pubtabnet-1.0.0`` has no bbox: it drops bbox on read and omits it on
8
+ write (declared honestly in ``lossy_read`` / ``lossy_write``).
9
+
10
+ PubTabNet jsonl record shape::
11
+
12
+ {
13
+ "filename": "PMC...",
14
+ "split": "train" | "val" | "test", # optional
15
+ "imgid": 0, # optional
16
+ "html": {
17
+ "structure": {"tokens": ["<thead>", "<tr>", "<td>", "</td>", ...]},
18
+ "cells": [
19
+ {"tokens": ["a"], "bbox": [x0, y0, x1, y1]},
20
+ {"tokens": []}, # empty cells may omit bbox
21
+ ...,
22
+ ],
23
+ },
24
+ }
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import json
30
+ from collections.abc import Iterable, Iterator
31
+ from dataclasses import dataclass
32
+ from typing import IO, Any
33
+
34
+ from tablecodec.codecs._htmltable import (
35
+ parse_html_table,
36
+ serialize_html_table,
37
+ sniff_html_table,
38
+ )
39
+ from tablecodec.ir import TableSample
40
+
41
+ __all__ = ["PubTabNet10Codec", "PubTabNet20Codec"]
42
+
43
+
44
+ @dataclass(frozen=True, slots=True)
45
+ class PubTabNet20Codec:
46
+ """Codec for the PubTabNet 2.0 jsonl format."""
47
+
48
+ name: str = "pubtabnet-2.0.0"
49
+ spec_version: str = "2.0.0"
50
+ media_type: str = "application/jsonl"
51
+ writable: bool = True
52
+
53
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
54
+ for line_no, raw in enumerate(source, start=1):
55
+ line = raw.strip()
56
+ if not line:
57
+ continue
58
+ try:
59
+ payload: dict[str, Any] = json.loads(line)
60
+ except json.JSONDecodeError as exc:
61
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
62
+ raise ValueError(msg) from exc
63
+ try:
64
+ yield parse_html_table(payload)
65
+ except (KeyError, ValueError, TypeError) as exc:
66
+ msg = f"malformed PubTabNet 2.0 record at line {line_no}: {exc}"
67
+ raise ValueError(msg) from exc
68
+
69
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
70
+ for sample in samples:
71
+ sink.write(json.dumps(serialize_html_table(sample), ensure_ascii=False))
72
+ sink.write("\n")
73
+
74
+ def lossy_read(self) -> frozenset[str]:
75
+ # PubTabNet 2.0 keeps filename, split, imgid, tokens, bbox,
76
+ # rowspan, colspan, header/body. Nothing dropped on read.
77
+ return frozenset()
78
+
79
+ def lossy_write(self) -> frozenset[str]:
80
+ # IR ``extras`` has no canonical home in the PubTabNet schema.
81
+ return frozenset({"extras"})
82
+
83
+ def sniff(self, source: IO[str]) -> bool:
84
+ return sniff_html_table(source, require_no_bbox=False)
85
+
86
+
87
+ @dataclass(frozen=True, slots=True)
88
+ class PubTabNet10Codec:
89
+ """Codec for the PubTabNet 1.0.0 jsonl format (no bbox)."""
90
+
91
+ name: str = "pubtabnet-1.0.0"
92
+ spec_version: str = "1.0.0"
93
+ media_type: str = "application/jsonl"
94
+ writable: bool = True
95
+
96
+ def read(self, source: IO[str]) -> Iterator[TableSample]:
97
+ for line_no, raw in enumerate(source, start=1):
98
+ line = raw.strip()
99
+ if not line:
100
+ continue
101
+ try:
102
+ payload: dict[str, Any] = json.loads(line)
103
+ except json.JSONDecodeError as exc:
104
+ msg = f"invalid JSON at line {line_no}: {exc.msg}"
105
+ raise ValueError(msg) from exc
106
+ try:
107
+ yield parse_html_table(payload, drop_bbox=True)
108
+ except (KeyError, ValueError, TypeError) as exc:
109
+ msg = f"malformed PubTabNet 1.0 record at line {line_no}: {exc}"
110
+ raise ValueError(msg) from exc
111
+
112
+ def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
113
+ for sample in samples:
114
+ sink.write(
115
+ json.dumps(serialize_html_table(sample, include_bbox=False), ensure_ascii=False)
116
+ )
117
+ sink.write("\n")
118
+
119
+ def lossy_read(self) -> frozenset[str]:
120
+ # bbox is not in the 1.0 source format; if the file is 2.0-shaped,
121
+ # bbox is dropped silently.
122
+ return frozenset({"bbox"})
123
+
124
+ def lossy_write(self) -> frozenset[str]:
125
+ return frozenset({"bbox", "extras"})
126
+
127
+ def sniff(self, source: IO[str]) -> bool:
128
+ return sniff_html_table(source, require_no_bbox=True)