tablecodec 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
1
+ """Codec registry (SPEC §6.2).
2
+
3
+ Third-party codecs ship as separate PyPI packages and self-register via the
4
+ entry-point group ``tablecodec.codecs``; :func:`load_plugins` discovers and
5
+ registers them. The library does not auto-register anything at import time —
6
+ callers register the built-ins they need (the CLI does) and call
7
+ ``load_plugins`` to pick up installed third-party codecs.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import importlib.metadata
13
+ from typing import IO
14
+
15
+ from tablecodec.codecs._base import Codec
16
+
17
+ __all__ = ["Codec", "detect", "get", "list_codecs", "load_plugins", "register"]
18
+
19
+ _PLUGIN_GROUP = "tablecodec.codecs"
20
+
21
+
22
+ # Module-level mutable registry. Tests use _snapshot/_restore to isolate.
23
+ _registry: dict[str, Codec] = {}
24
+
25
+
26
def register(codec: Codec) -> None:
    """Add *codec* to the module-level registry, keyed by ``codec.name``.

    Raises:
        ValueError: when a codec with the same name is already registered.
    """
    key = codec.name
    if key not in _registry:
        _registry[key] = codec
        return
    msg = f"codec {key!r} is already registered"
    raise ValueError(msg)
36
+
37
+
38
def get(name: str) -> Codec:
    """Return the codec registered under *name*.

    Raises:
        KeyError: when *name* is not registered.
    """
    try:
        return _registry[name]
    except KeyError:
        msg = f"no codec registered under {name!r}"
        # Re-raise with the descriptive message; the bare lookup error adds nothing.
        raise KeyError(msg) from None
48
+
49
+
50
def list_codecs() -> tuple[str, ...]:
    """Return the names of all registered codecs, oldest registration first."""
    # Dict iteration yields keys in insertion order, i.e. registration order.
    return (*_registry,)
53
+
54
+
55
def detect(source: IO[str]) -> str | None:
    """Return the name of the first registered codec that recognises *source*.

    Every registered codec exposing a ``sniff`` attribute is offered the
    stream, in registration order; codecs without one are skipped. The
    stream is rewound to its starting position before each probe and once
    more before returning (also when a probe raises), so callers may pass
    the same stream straight to ``codec.read()``.

    Returns:
        The matching codec's name, or ``None`` when no codec claims the
        stream.
    """
    start = source.tell()
    try:
        for candidate in _registry.values():
            probe = getattr(candidate, "sniff", None)
            if probe is not None:
                # Each sniffer must see the stream from the same offset.
                source.seek(start)
                if probe(source):
                    return candidate.name
        return None
    finally:
        source.seek(start)
79
+
80
+
81
def load_plugins() -> tuple[str, ...]:
    """Discover and register third-party codecs (SPEC §6.2).

    Walks the ``tablecodec.codecs`` entry-point group. A loaded entry point
    that is a class is instantiated with no arguments; anything else is
    used as a ready codec instance. Codecs whose name is already present in
    the registry are skipped, which makes repeated calls harmless.

    Returns:
        The names registered by this call, in load order.
    """
    newly_registered: list[str] = []
    discovered = importlib.metadata.entry_points(group=_PLUGIN_GROUP)
    for plugin in discovered:
        target = plugin.load()
        codec: Codec = target() if isinstance(target, type) else target
        if codec.name not in _registry:
            register(codec)
            newly_registered.append(codec.name)
    return tuple(newly_registered)
98
+
99
+
100
+ # ---------- test helpers (intentionally underscore-prefixed) ----------
101
+ # Marked with pyright: ignore because they're consumed only by tests via
102
+ # attribute access (codecs._snapshot()), which pyright does not track.
103
+
104
+
105
def _snapshot() -> dict[str, Codec]:  # pyright: ignore[reportUnusedFunction]
    """Return a shallow copy of the registry for later :func:`_restore`."""
    return _registry.copy()
107
+
108
+
109
def _restore(snapshot: dict[str, Codec]) -> None:  # pyright: ignore[reportUnusedFunction]
    """Reset the registry in place so it holds exactly the entries of *snapshot*."""
    # Mutate rather than rebind: other functions hold the same dict object.
    _registry.clear()
    for name, codec in snapshot.items():
        _registry[name] = codec
@@ -0,0 +1,79 @@
1
+ """The Codec Protocol (SPEC §6).
2
+
3
+ A codec is a reader + writer pair for one external table-recognition
4
+ format, accompanied by an honest self-declaration of what is lost on
5
+ read or write.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterable, Iterator
11
+ from typing import IO, Protocol, runtime_checkable
12
+
13
+ from tablecodec.ir import TableSample
14
+
15
+ __all__ = ["Codec"]
16
+
17
+
18
@runtime_checkable
class Codec(Protocol):
    """SPEC §6 codec contract.

    This is a structural protocol: any object exposing the members below
    satisfies it, and ``@runtime_checkable`` additionally allows
    ``isinstance(obj, Codec)`` checks.

    Implementations are typically frozen dataclasses or singletons.
    They MUST be safe to share across threads (no per-call mutable
    state). They MUST NOT mutate their inputs.

    Identity attributes (``name``, ``spec_version``, ``media_type``) are
    declared as ``@property`` getters so that implementations may use
    read-only attributes (e.g. ``dataclass(frozen=True)`` fields) to
    satisfy the protocol.
    """

    @property
    def name(self) -> str:
        """Stable registry key, e.g. ``"pubtabnet-2.0.0"``."""
        ...

    @property
    def spec_version(self) -> str:
        """Version of the source format (not of this library)."""
        ...

    @property
    def media_type(self) -> str:
        """Canonical MIME type, e.g. ``"application/jsonl"``."""
        ...

    @property
    def writable(self) -> bool:
        """Whether this codec supports :meth:`write`.

        Read-only codecs (SPEC §7, e.g. PubTables-1M) return ``False`` and
        raise ``NotImplementedError`` from :meth:`write`. ``analyze_loss``
        short-circuits to ``"unwritable"`` when a read-only codec is used
        as a conversion target (see ADR 0002).
        """
        ...

    def read(self, source: IO[str]) -> Iterator[TableSample]:
        """Yield :class:`TableSample` instances lazily from *source*.

        Implementations MUST stream — no full-file slurp. ``read`` parses
        and raises (with the record offset) on records it cannot parse; it
        does NOT evaluate the structural invariants. Invariant checking is
        a separate, opt-in step via :func:`tablecodec.validate` (SPEC §6.1
        / §8, ADR 0008).
        """
        ...

    def write(self, samples: Iterable[TableSample], sink: IO[str]) -> None:
        """Serialise *samples* to *sink* in the codec's external format.

        Raises:
            NotImplementedError: for read-only codecs, i.e. those whose
                :attr:`writable` is ``False``.
        """
        ...

    def lossy_read(self) -> frozenset[str]:
        """Source-format field paths dropped on read (e.g. ``"styles"``).

        Returns:
            An immutable set of field paths.
        """
        ...

    def lossy_write(self) -> frozenset[str]:
        """IR fields that cannot be expressed in this format on write.

        Returns:
            An immutable set of IR field names.
        """
        ...
@@ -0,0 +1,510 @@
1
+ """Shared HTML-token table machinery for codec implementations.
2
+
3
+ PubTabNet (1.x / 2.0) and FinTabNet (original) all encode table structure
4
+ as an HTML-like token stream (``<thead>``/``<tbody>``/``<tr>``/``<td>`` with
5
+ optional ``rowspan``/``colspan`` attributes) paired with a positional
6
+ ``cells`` array. This module owns the parsing, grid placement, and
7
+ serialization so the concrete codecs stay thin and never duplicate it.
8
+
9
+ The only per-format knobs are:
10
+
11
+ - ``id_field`` — the record-level integer id key (``"imgid"`` for
12
+ PubTabNet, ``"table_id"`` for FinTabNet).
13
+ - ``drop_bbox`` — discard per-cell bbox on read (PubTabNet 1.0).
14
+ - ``include_bbox`` — when ``False``, omit per-cell bbox on write (PubTabNet 1.0).
15
+
16
+ Stdlib-only (SPEC §13).
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import re
23
+ from dataclasses import dataclass, field
24
+ from typing import IO, Any, Literal, cast
25
+
26
+ from tablecodec.ir import BBox, GridCell, TableSample
27
+
28
+ __all__ = [
29
+ "looks_like_html_table",
30
+ "parse_html_structure_only",
31
+ "parse_html_table",
32
+ "serialize_html_structure_only",
33
+ "serialize_html_table",
34
+ "sniff_html_table",
35
+ ]
36
+
37
# Span attribute matchers. Only double-quoted decimal values are recognised
# (e.g. ``rowspan="2"``); whitespace around ``=`` is tolerated.
_ROWSPAN_RE = re.compile(r'rowspan\s*=\s*"(\d+)"')
_COLSPAN_RE = re.compile(r'colspan\s*=\s*"(\d+)"')

# Section token -> role applied to cells parsed after it. ``None`` means the
# token is recognised but leaves the current role unchanged.
_SECTION_TOKENS: dict[str, Literal["header", "body"] | None] = {
    "<thead>": "header",
    "</thead>": "body",
    "<tbody>": "body",
    "</tbody>": None,
}
46
+
47
+
48
+ # ---------- structure parser ----------
49
+
50
+
51
@dataclass(slots=True)
class _CellSpec:
    """One ``<td>`` opening parsed out of the structure token stream."""

    # Span sizes; 1 when the ``<td`` opening carries no matching attribute.
    rowspan: int = 1
    colspan: int = 1
    # Section the cell was found in ("header" inside <thead>, otherwise "body").
    role: Literal["header", "body"] = "body"
    # Index of the enclosing <tr>, supplied by the structure parser; -1 = unset.
    row: int = -1
    # Column index, assigned by the placement pass; -1 = not yet placed.
    col: int = -1
60
+
61
+
62
+ def _empty_cell_spec_list() -> list[_CellSpec]:
63
+ return []
64
+
65
+
66
@dataclass(slots=True)
class _ParseState:
    """Mutable cursor carried through one pass over the structure tokens."""

    # Role given to cells parsed next; updated by <thead>/<tbody> tokens.
    section: Literal["header", "body"] = "body"
    # Index of the most recently opened <tr>; -1 until the first one is seen.
    cur_row: int = -1
    # Cell specs collected so far, in token order.
    cells: list[_CellSpec] = field(default_factory=_empty_cell_spec_list)
71
+
72
+
73
def _parse_span_attrs(tokens: list[str], start: int) -> tuple[int, int, int]:
    """Read the attribute tokens that follow a ``<td`` opening.

    Scanning begins at index *start* and stops at the first ``">"`` token
    or at the end of *tokens*, whichever comes first.

    Returns:
        ``(rowspan, colspan, end_index)`` where each span defaults to 1 when
        no matching attribute is seen and ``end_index`` is where scanning
        stopped (the index of ``">"``, or ``len(tokens)`` if none was found).
    """
    rows = cols = 1
    stop = start
    limit = len(tokens)
    while stop < limit:
        token = tokens[stop]
        if token == ">":
            break
        row_match = _ROWSPAN_RE.search(token)
        if row_match is not None:
            rows = int(row_match.group(1))
        col_match = _COLSPAN_RE.search(token)
        if col_match is not None:
            cols = int(col_match.group(1))
        stop += 1
    return rows, cols, stop
85
+
86
+
87
def _parse_structure_tokens(tokens: list[str]) -> list[_CellSpec]:
    """Parse HTML structure tokens into ordered cell specs.

    Recognised tokens:

    - section tokens (``<thead>``, ``</thead>``, ``<tbody>``, ``</tbody>``)
      switch the role assigned to subsequent cells;
    - ``<tr>`` opens the next row;
    - ``<td>`` adds a 1x1 cell, ``<td`` adds a cell whose span attributes
      follow as separate tokens up to the closing ``">"``.

    ``</tr>``, ``</td>`` and unknown tokens are ignored.

    Returns:
        One spec per cell opening, in token order, with ``row`` set to the
        index of the enclosing ``<tr>`` (``col`` is left for placement).

    Raises:
        ValueError: when a cell opening appears before any ``<tr>``. Such a
            cell would otherwise carry ``row=-1``, which placement treats
            as a negative list index.
    """
    state = _ParseState()
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if tok in _SECTION_TOKENS:
            new_section = _SECTION_TOKENS[tok]
            if new_section is not None:
                state.section = new_section
        elif tok == "<tr>":
            state.cur_row += 1
        elif tok in ("<td>", "<td"):
            if state.cur_row < 0:
                msg = f"structure token {tok!r} at index {i} appears before any <tr>"
                raise ValueError(msg)
            if tok == "<td>":
                rowspan = colspan = 1
            else:
                # Jump to where attribute scanning stopped; the shared
                # increment below then steps past the closing ">".
                rowspan, colspan, i = _parse_span_attrs(tokens, i + 1)
            state.cells.append(
                _CellSpec(
                    rowspan=rowspan,
                    colspan=colspan,
                    role=state.section,
                    row=state.cur_row,
                )
            )
        i += 1
    return state.cells
115
+
116
+
117
+ # ---------- grid placement ----------
118
+
119
+
120
+ def _empty_rows() -> list[list[bool]]:
121
+ return []
122
+
123
+
124
@dataclass(slots=True)
class _OccupiedGrid:
    """Mutable 2D occupancy bitmap whose columns grow on demand.

    ``rows[r][c]`` is ``True`` once slot ``(r, c)`` has been claimed. The
    number of rows is fixed at construction; spans reaching past the last
    row are clipped by both :meth:`can_place` and :meth:`mark`.
    """

    nrows: int
    ncols: int
    rows: list[list[bool]] = field(default_factory=_empty_rows)

    def __post_init__(self) -> None:
        # Always rebuild the bitmap from the declared dimensions.
        self.rows = [[False for _ in range(self.ncols)] for _ in range(self.nrows)]

    def ensure_cols(self, want: int) -> None:
        """Widen every row so the grid has at least *want* columns."""
        missing = want - self.ncols
        if missing <= 0:
            return
        padding = [False] * missing
        for row in self.rows:
            row.extend(padding)
        self.ncols = want

    def can_place(self, r: int, c: int, rowspan: int, colspan: int) -> bool:
        """Return whether the span anchored at ``(r, c)`` hits no taken slot."""
        for dr in range(rowspan):
            rr = r + dr
            for dc in range(colspan):
                # Rows beyond the grid are not inspected at all.
                if rr < self.nrows and self.rows[rr][c + dc]:
                    return False
        return True

    def mark(self, r: int, c: int, rowspan: int, colspan: int) -> None:
        """Flag every in-grid slot of the span anchored at ``(r, c)`` as taken."""
        for rr in range(r, r + rowspan):
            if rr >= self.nrows:
                # Row indices only increase, so all remaining rows are out of range too.
                break
            for cc in range(c, c + colspan):
                self.rows[rr][cc] = True
156
+
157
+
158
def _place_cells(specs: list[_CellSpec]) -> tuple[int, int]:
    """Assign a column to every spec using HTML table placement.

    Specs are processed in order; each one takes the leftmost column of its
    row where its whole span fits without touching an already claimed slot.
    ``spec.col`` is written in place.

    Returns:
        ``(nrows, ncols)``: ``nrows`` is one more than the largest ``row``
        among *specs*, ``ncols`` the largest ``col + colspan``. An empty
        *specs* yields ``(0, 0)``.
    """
    if not specs:
        return (0, 0)
    row_count = max(spec.row for spec in specs) + 1
    initial_width = max(8, sum(spec.colspan for spec in specs))
    grid = _OccupiedGrid(nrows=row_count, ncols=initial_width)
    for spec in specs:
        column = 0
        grid.ensure_cols(column + spec.colspan)
        while not grid.can_place(spec.row, column, spec.rowspan, spec.colspan):
            column += 1
            # Grow before probing so the span never indexes past the row end.
            grid.ensure_cols(column + spec.colspan)
        spec.col = column
        grid.mark(spec.row, column, spec.rowspan, spec.colspan)
    col_count = max((spec.col + spec.colspan for spec in specs), default=0)
    return (row_count, col_count)
175
+
176
+
177
+ # ---------- payload -> sample ----------
178
+
179
+
180
+ def _normalize_split(value: object) -> Literal["train", "val", "test"] | None:
181
+ if value == "train":
182
+ return "train"
183
+ if value == "val":
184
+ return "val"
185
+ if value == "test":
186
+ return "test"
187
+ if value is None:
188
+ return None
189
+ msg = f"unknown split value {value!r}"
190
+ raise ValueError(msg)
191
+
192
+
193
def parse_html_table(
    payload: dict[str, Any], *, id_field: str = "imgid", drop_bbox: bool = False
) -> TableSample:
    """Build a :class:`TableSample` from an HTML-token table record.

    Args:
        payload: Decoded record; ``payload["html"]["structure"]["tokens"]``,
            ``payload["html"]["cells"]`` and ``payload["filename"]`` are read.
        id_field: Record key whose value becomes ``TableSample.imgid``.
        drop_bbox: When true, per-cell ``bbox`` values are ignored.

    Raises:
        ValueError: when the number of cell openings in the structure
            differs from the number of ``cells`` entries, or when ``split``
            holds an unknown value.
    """
    html = payload["html"]
    structure_tokens = html["structure"]["tokens"]
    cell_payloads = html["cells"]

    specs = _parse_structure_tokens(structure_tokens)
    if len(specs) != len(cell_payloads):
        msg = f"structure declares {len(specs)} cells but cells[] has {len(cell_payloads)} entries"
        raise ValueError(msg)

    nrows, ncols = _place_cells(specs)

    def build_cell(spec: _CellSpec, raw: dict[str, Any]) -> GridCell:
        # Pair one placed spec with its positional content entry.
        tokens = tuple(raw.get("tokens", ()))
        bbox_raw = None if drop_bbox else raw.get("bbox")
        bbox: BBox | None = None
        if bbox_raw is not None:
            x0, y0, x1, y1 = (int(bbox_raw[k]) for k in range(4))
            bbox = (x0, y0, x1, y1)
        return GridCell(
            row=spec.row,
            col=spec.col,
            rowspan=spec.rowspan,
            colspan=spec.colspan,
            tokens=tokens,
            bbox=bbox,
            role=spec.role,
        )

    cells = [build_cell(spec, raw) for spec, raw in zip(specs, cell_payloads, strict=True)]

    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=tuple(cells),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get(id_field),
    )
235
+
236
+
237
def parse_html_structure_only(payload: dict[str, Any], *, id_field: str = "imgid") -> TableSample:
    """Build a :class:`TableSample` from structure tokens alone.

    Intended for formats that carry table structure without per-cell
    content (e.g. TableBank). ``html.cells`` is never read; every cell
    opening becomes an empty cell (``tokens=()``, ``bbox=None``).

    Args:
        payload: Decoded record; ``payload["html"]["structure"]["tokens"]``
            and ``payload["filename"]`` are read.
        id_field: Record key whose value becomes ``TableSample.imgid``.

    Raises:
        ValueError: when ``split`` holds an unknown value.
    """
    specs = _parse_structure_tokens(payload["html"]["structure"]["tokens"])
    nrows, ncols = _place_cells(specs)
    empty_cells: list[GridCell] = []
    for spec in specs:
        empty_cells.append(
            GridCell(
                row=spec.row,
                col=spec.col,
                rowspan=spec.rowspan,
                colspan=spec.colspan,
                tokens=(),
                bbox=None,
                role=spec.role,
            )
        )
    return TableSample(
        filename=str(payload["filename"]),
        nrows=nrows,
        ncols=ncols,
        cells=tuple(empty_cells),
        split=_normalize_split(payload.get("split")),
        imgid=payload.get(id_field),
    )
267
+
268
+
269
+ # ---------- sample -> payload ----------
270
+
271
+
272
+ def _group_cells_by_row(cells: tuple[GridCell, ...]) -> dict[int, list[GridCell]]:
273
+ by_row: dict[int, list[GridCell]] = {}
274
+ for cell in cells:
275
+ by_row.setdefault(cell.row, []).append(cell)
276
+ for row_cells in by_row.values():
277
+ row_cells.sort(key=lambda c: c.col)
278
+ return by_row
279
+
280
+
281
+ def _count_header_rows(by_row: dict[int, list[GridCell]], nrows: int) -> int:
282
+ header_rows = 0
283
+ while header_rows < nrows:
284
+ row_cells = by_row.get(header_rows, [])
285
+ if not row_cells or not all(c.role == "header" for c in row_cells):
286
+ break
287
+ header_rows += 1
288
+ return header_rows
289
+
290
+
291
@dataclass(slots=True)
class _SectionRange:
    """A table section to emit: its wrapper tags plus a half-open row range."""

    # Tokens written before and after the section's rows.
    open_tag: str
    close_tag: str
    # Rows ``start`` (inclusive) to ``end`` (exclusive) belong to the section.
    start: int
    end: int
297
+
298
+
299
+ def _emit_row(structure: list[str], emitted: list[GridCell], row_cells: list[GridCell]) -> None:
300
+ structure.append("<tr>")
301
+ for cell in row_cells:
302
+ if cell.rowspan == 1 and cell.colspan == 1:
303
+ structure.extend(["<td>", "</td>"])
304
+ else:
305
+ structure.append("<td")
306
+ if cell.rowspan != 1:
307
+ structure.append(f' rowspan="{cell.rowspan}"')
308
+ if cell.colspan != 1:
309
+ structure.append(f' colspan="{cell.colspan}"')
310
+ structure.extend([">", "</td>"])
311
+ emitted.append(cell)
312
+ structure.append("</tr>")
313
+
314
+
315
def _emit_section(
    structure: list[str],
    emitted: list[GridCell],
    by_row: dict[int, list[GridCell]],
    span: _SectionRange,
) -> None:
    """Write one section (open tag, its rows, close tag) to *structure*.

    Nothing is written when the row range of *span* is empty. A row index
    without an entry in *by_row* is emitted as an empty ``<tr>``.
    """
    if span.end <= span.start:
        return
    structure.append(span.open_tag)
    for row_index in range(span.start, span.end):
        cells_in_row = by_row.get(row_index, [])
        _emit_row(structure, emitted, cells_in_row)
    structure.append(span.close_tag)
327
+
328
+
329
+ def _cell_to_payload(cell: GridCell, *, include_bbox: bool) -> dict[str, Any]:
330
+ payload: dict[str, Any] = {"tokens": list(cell.tokens)}
331
+ if include_bbox and cell.bbox is not None:
332
+ payload["bbox"] = list(cell.bbox)
333
+ return payload
334
+
335
+
336
def _structure_and_cells(
    sample: TableSample, *, include_bbox: bool
) -> tuple[list[str], list[dict[str, Any]]]:
    """Render *sample* as structure tokens plus matching ``cells[]`` entries.

    The leading all-header rows go into ``<thead>``, the remaining rows
    into ``<tbody>``; a section with no rows is left out. The returned cell
    entries follow the order in which cells were written to the structure.
    """
    grouped = _group_cells_by_row(sample.cells)
    header_end = _count_header_rows(grouped, sample.nrows)
    tokens: list[str] = []
    written: list[GridCell] = []
    sections = (
        _SectionRange("<thead>", "</thead>", 0, header_end),
        _SectionRange("<tbody>", "</tbody>", header_end, sample.nrows),
    )
    for section in sections:
        _emit_section(tokens, written, grouped, section)
    payloads = [_cell_to_payload(cell, include_bbox=include_bbox) for cell in written]
    return tokens, payloads
353
+
354
+
355
def serialize_html_table(
    sample: TableSample, *, id_field: str = "imgid", include_bbox: bool = True
) -> dict[str, Any]:
    """Serialize a :class:`TableSample` to an HTML-token table record.

    Args:
        sample: The table to serialize.
        id_field: Record key under which ``sample.imgid`` is stored.
        include_bbox: When false, per-cell ``bbox`` values are left out.

    Returns:
        A dict with ``filename`` and ``html`` (``structure.tokens`` and
        ``cells``), plus ``split`` and *id_field* when the sample carries
        them. ``extras`` is intentionally omitted (declared in the codec's
        ``lossy_write``).
    """
    tokens, cell_entries = _structure_and_cells(sample, include_bbox=include_bbox)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "html": {"structure": {"tokens": tokens}, "cells": cell_entries},
    }
    # Optional fields are written only when the sample actually has them.
    for key, value in (("split", sample.split), (id_field, sample.imgid)):
        if value is not None:
            record[key] = value
    return record
373
+
374
+
375
def serialize_html_structure_only(
    sample: TableSample, *, id_field: str = "imgid"
) -> dict[str, Any]:
    """Serialize structure tokens only (no ``cells``).

    For structure-only formats (TableBank): cell tokens and bboxes are
    dropped (declared in the codec's ``lossy_write``).

    Args:
        sample: The table to serialize.
        id_field: Record key under which ``sample.imgid`` is stored.

    Returns:
        A dict with ``filename`` and ``html.structure.tokens``, plus
        ``split`` and *id_field* when the sample carries them.
    """
    tokens, _unused_cells = _structure_and_cells(sample, include_bbox=False)
    record: dict[str, Any] = {
        "filename": sample.filename,
        "html": {"structure": {"tokens": tokens}},
    }
    # Optional fields are written only when the sample actually has them.
    for key, value in (("split", sample.split), (id_field, sample.imgid)):
        if value is not None:
            record[key] = value
    return record
393
+
394
+
395
+ # ---------- detection ----------
396
+
397
+
398
+ def _cells_list(html_dict: dict[str, Any]) -> list[object] | None:
399
+ cells_field: object = html_dict.get("cells", [])
400
+ if not isinstance(cells_field, list):
401
+ return None
402
+ return cast("list[object]", cells_field)
403
+
404
+
405
def _no_cell_has_bbox(html_dict: dict[str, Any]) -> bool:
    """Return whether ``cells`` is a list in which no dict entry has ``"bbox"``.

    ``False`` when ``cells`` is present but not a list.
    """
    cells = _cells_list(html_dict)
    if cells is None:
        return False
    for entry in cells:
        if isinstance(entry, dict) and "bbox" in entry:
            return False
    return True
410
+
411
+
412
def _all_cells_have_bbox(html_dict: dict[str, Any]) -> bool:
    """Return whether ``cells`` is a list whose every entry is a dict with ``"bbox"``.

    ``False`` when ``cells`` is present but not a list; an empty list
    passes.
    """
    cells = _cells_list(html_dict)
    if cells is None:
        return False
    for entry in cells:
        if not (isinstance(entry, dict) and "bbox" in entry):
            return False
    return True
417
+
418
+
419
def _bbox_constraint_ok(
    html_dict: dict[str, Any], *, require_no_bbox: bool, require_all_bbox: bool
) -> bool:
    """Apply the requested bbox rule to ``html_dict["cells"]``.

    *require_no_bbox* takes precedence over *require_all_bbox*; with
    neither flag set there is nothing to check and the result is ``True``.
    """
    if require_no_bbox:
        check = _no_cell_has_bbox
    elif require_all_bbox:
        check = _all_cells_have_bbox
    else:
        return True
    return check(html_dict)
427
+
428
+
429
def _cells_constraint_ok(
    html_dict: dict[str, Any],
    *,
    require_no_bbox: bool,
    require_all_bbox: bool,
    require_no_cells: bool,
) -> bool:
    """Check the ``cells`` presence rule, then the bbox rule.

    With *require_no_cells* the ``cells`` key must be absent and the bbox
    flags are not consulted. Otherwise the key must be present and the
    bbox rule selected by the remaining flags must hold.
    """
    cells_present = "cells" in html_dict
    if require_no_cells:
        return not cells_present
    return cells_present and _bbox_constraint_ok(
        html_dict, require_no_bbox=require_no_bbox, require_all_bbox=require_all_bbox
    )
444
+
445
+
446
def looks_like_html_table(
    payload: object,
    *,
    require_no_bbox: bool = False,
    require_all_bbox: bool = False,
    require_no_cells: bool = False,
    require_field: str | None = None,
) -> bool:
    """Pure (no I/O) shape check for an HTML-token table record.

    *payload* must be a dict holding a dict under ``"html"`` that contains
    ``"structure"``. ``html.cells`` is required unless *require_no_cells*
    is True (structure-only formats like TableBank), in which case its
    absence is required instead.

    Args:
        payload: The decoded record to inspect.
        require_no_bbox: Reject records where any cell carries ``bbox``.
        require_all_bbox: Reject records where any cell lacks ``bbox``.
        require_no_cells: Require ``html.cells`` to be absent.
        require_field: Top-level key that must be present, if given.
    """
    if not isinstance(payload, dict):
        return False
    record = cast("dict[str, Any]", payload)
    if require_field is not None and require_field not in record:
        return False
    html: object = record.get("html")
    if not (isinstance(html, dict) and "structure" in html):
        return False
    return _cells_constraint_ok(
        cast("dict[str, Any]", html),
        require_no_bbox=require_no_bbox,
        require_all_bbox=require_all_bbox,
        require_no_cells=require_no_cells,
    )
477
+
478
+
479
def sniff_html_table(
    source: IO[str],
    *,
    require_no_bbox: bool = False,
    require_all_bbox: bool = False,
    require_no_cells: bool = False,
    require_field: str | None = None,
) -> bool:
    """Peek the first non-blank line; verify it is an HTML-token table.

    Only the first non-blank line is examined: it must decode as JSON and
    pass :func:`looks_like_html_table` with the given constraints.

    Args:
        source: Seekable text stream positioned at the start of the data.
        require_no_bbox: Forwarded to :func:`looks_like_html_table`.
        require_all_bbox: Forwarded to :func:`looks_like_html_table`.
        require_no_cells: Forwarded to :func:`looks_like_html_table`.
        require_field: Forwarded to :func:`looks_like_html_table`.

    Returns:
        ``True`` when the line matches; ``False`` when it is not valid
        JSON, does not match, or the stream holds only blank lines.

    Stream position is always restored.
    """
    pos = source.tell()
    try:
        # Use readline() rather than ``for raw in source``: iterating a text
        # file disables tell() until the iterator is exhausted, and this
        # loop returns early, so a later tell() on the same stream could fail.
        while raw := source.readline():
            line = raw.strip()
            if not line:
                continue
            try:
                payload: object = json.loads(line)
            except json.JSONDecodeError:
                return False
            return looks_like_html_table(
                payload,
                require_no_bbox=require_no_bbox,
                require_all_bbox=require_all_bbox,
                require_no_cells=require_no_cells,
                require_field=require_field,
            )
        return False
    finally:
        source.seek(pos)