tablecodec 0.0.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablecodec-0.0.18/.gitignore +60 -0
- tablecodec-0.0.18/CHANGELOG.md +458 -0
- tablecodec-0.0.18/LICENSE +21 -0
- tablecodec-0.0.18/PKG-INFO +200 -0
- tablecodec-0.0.18/README.md +151 -0
- tablecodec-0.0.18/docs/spec.md +438 -0
- tablecodec-0.0.18/pyproject.toml +212 -0
- tablecodec-0.0.18/src/tablecodec/__init__.py +29 -0
- tablecodec-0.0.18/src/tablecodec/_invariants.py +311 -0
- tablecodec-0.0.18/src/tablecodec/cli.py +314 -0
- tablecodec-0.0.18/src/tablecodec/codecs/__init__.py +111 -0
- tablecodec-0.0.18/src/tablecodec/codecs/_base.py +79 -0
- tablecodec-0.0.18/src/tablecodec/codecs/_htmltable.py +510 -0
- tablecodec-0.0.18/src/tablecodec/codecs/_otslgrid.py +318 -0
- tablecodec-0.0.18/src/tablecodec/codecs/builtins.py +36 -0
- tablecodec-0.0.18/src/tablecodec/codecs/doctags.py +278 -0
- tablecodec-0.0.18/src/tablecodec/codecs/fintabnet.py +84 -0
- tablecodec-0.0.18/src/tablecodec/codecs/fintabnet_otsl.py +141 -0
- tablecodec-0.0.18/src/tablecodec/codecs/otsl.py +138 -0
- tablecodec-0.0.18/src/tablecodec/codecs/pubtables1m.py +161 -0
- tablecodec-0.0.18/src/tablecodec/codecs/pubtabnet.py +128 -0
- tablecodec-0.0.18/src/tablecodec/codecs/tablebank.py +76 -0
- tablecodec-0.0.18/src/tablecodec/codecs/tableformer.py +80 -0
- tablecodec-0.0.18/src/tablecodec/io.py +91 -0
- tablecodec-0.0.18/src/tablecodec/ir.py +101 -0
- tablecodec-0.0.18/src/tablecodec/loss.py +105 -0
- tablecodec-0.0.18/src/tablecodec/py.typed +0 -0
- tablecodec-0.0.18/src/tablecodec/teds.py +243 -0
- tablecodec-0.0.18/src/tablecodec/validate.py +185 -0
- tablecodec-0.0.18/tests/benchmarks/test_codec_benchmarks.py +70 -0
- tablecodec-0.0.18/tests/codecs/test_doctags.py +104 -0
- tablecodec-0.0.18/tests/codecs/test_fintabnet.py +117 -0
- tablecodec-0.0.18/tests/codecs/test_fintabnet_otsl.py +89 -0
- tablecodec-0.0.18/tests/codecs/test_otsl.py +256 -0
- tablecodec-0.0.18/tests/codecs/test_otsl_to_pubtabnet.py +99 -0
- tablecodec-0.0.18/tests/codecs/test_pubtables1m.py +106 -0
- tablecodec-0.0.18/tests/codecs/test_pubtabnet.py +235 -0
- tablecodec-0.0.18/tests/codecs/test_pubtabnet_v10.py +89 -0
- tablecodec-0.0.18/tests/codecs/test_registry.py +155 -0
- tablecodec-0.0.18/tests/codecs/test_tablebank.py +116 -0
- tablecodec-0.0.18/tests/codecs/test_tableformer.py +127 -0
- tablecodec-0.0.18/tests/conftest.py +4 -0
- tablecodec-0.0.18/tests/fixtures/doctags/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/doctags/with_span_and_empty.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/fintabnet/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/fintabnet/with_colspan.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/fintabnet_otsl/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/otsl/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/otsl/with_2x2_span.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/otsl/with_colspan.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/otsl/with_empty.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/otsl/with_rowspan.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/pubtables1m/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/pubtables1m/with_span.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/pubtabnet/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/pubtabnet/v10_simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/pubtabnet/with_empty.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/pubtabnet/with_rowspan.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/tablebank/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/tablebank/with_rowspan.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/tableformer/empty_with_bbox.jsonl +1 -0
- tablecodec-0.0.18/tests/fixtures/tableformer/simple_2x2.jsonl +1 -0
- tablecodec-0.0.18/tests/strategies.py +130 -0
- tablecodec-0.0.18/tests/test_cli.py +243 -0
- tablecodec-0.0.18/tests/test_conformance.py +112 -0
- tablecodec-0.0.18/tests/test_invariants.py +616 -0
- tablecodec-0.0.18/tests/test_invariants_hypothesis.py +183 -0
- tablecodec-0.0.18/tests/test_io.py +78 -0
- tablecodec-0.0.18/tests/test_io_streaming.py +99 -0
- tablecodec-0.0.18/tests/test_ir.py +230 -0
- tablecodec-0.0.18/tests/test_loss.py +135 -0
- tablecodec-0.0.18/tests/test_smoke.py +45 -0
- tablecodec-0.0.18/tests/test_spec_surface.py +292 -0
- tablecodec-0.0.18/tests/test_teds.py +310 -0
- tablecodec-0.0.18/tests/test_validate.py +321 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
|
|
8
|
+
# Build artifacts
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
*.egg-info/
|
|
12
|
+
*.egg
|
|
13
|
+
wheels/
|
|
14
|
+
|
|
15
|
+
# Test / coverage / lint caches
|
|
16
|
+
.pytest_cache/
|
|
17
|
+
.ruff_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
.pyright/
|
|
20
|
+
.coverage
|
|
21
|
+
.coverage.*
|
|
22
|
+
htmlcov/
|
|
23
|
+
coverage.xml
|
|
24
|
+
.tox/
|
|
25
|
+
|
|
26
|
+
# Virtual environments
|
|
27
|
+
.venv/
|
|
28
|
+
venv/
|
|
29
|
+
env/
|
|
30
|
+
|
|
31
|
+
# uv
|
|
32
|
+
.uv-cache/
|
|
33
|
+
|
|
34
|
+
# Editor / OS
|
|
35
|
+
.idea/
|
|
36
|
+
.vscode/
|
|
37
|
+
!.vscode/extensions.json
|
|
38
|
+
*.swp
|
|
39
|
+
*.swo
|
|
40
|
+
.DS_Store
|
|
41
|
+
|
|
42
|
+
# hypothesis
|
|
43
|
+
.hypothesis/
|
|
44
|
+
|
|
45
|
+
# Local secrets / env
|
|
46
|
+
.env
|
|
47
|
+
.env.local
|
|
48
|
+
|
|
49
|
+
# Generated artifacts (e.g. e2e findings records for local audit)
|
|
50
|
+
output/
|
|
51
|
+
|
|
52
|
+
# Downloaded raw datasets for local-only native-format e2e (large; never
|
|
53
|
+
# committed). e.g. PubTables-1M PASCAL VOC, FinTabNet.c, TableBank.
|
|
54
|
+
input/
|
|
55
|
+
|
|
56
|
+
# Local-only, untracked working area (e.g. draft release workflow kept
|
|
57
|
+
# out of the public repo until release automation is enabled).
|
|
58
|
+
private/
|
|
59
|
+
|
|
60
|
+
.claude/scheduled_tasks.lock
|
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.0.18] - 2026-06-07
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Conformance corpus (`conformance/`, SPEC §11) now covers **all nine codecs**
|
|
15
|
+
(was 2): added an independently-authored sample + expected-IR per codec for
|
|
16
|
+
`pubtabnet-1.0.0`, `fintabnet`, `fintabnet-otsl`, `tableformer`, `tablebank`,
|
|
17
|
+
`pubtables-1m`, `doctags-tables`. `tests/test_conformance.py` registers the
|
|
18
|
+
full builtin set and runs every case, so read-path regressions are caught
|
|
19
|
+
for every codec.
|
|
20
|
+
|
|
21
|
+
- `packages/tablecodec-docling/` — a bridge codec (`docling-tables`, own
|
|
22
|
+
version 0.0.2) mapping between `DoclingDocument.tables` and `TableSample`,
|
|
23
|
+
developed in-repo as a temporary monorepo member (ADR 0013, SPEC §15). It
|
|
24
|
+
depends on docling-core and lives in its own uv project, so the stdlib-only
|
|
25
|
+
core package and its environment are unaffected. Discover it via
|
|
26
|
+
`tablecodec.codecs.load_plugins()`. Run its checks with `just docling-ci`
|
|
27
|
+
(or `just ci-all` for the whole monorepo).
|
|
28
|
+
- **read** (0.0.1): JSONL of `DoclingDocument`s -> one `TableSample` per
|
|
29
|
+
table; populates `image_width`/`image_height` from page size so
|
|
30
|
+
docling-read samples can be validated under the STRICT profile.
|
|
31
|
+
- **write** (0.0.2): each `TableSample` -> one `DoclingDocument` (the inverse
|
|
32
|
+
of read), so `read(write([s]))` round-trips modulo
|
|
33
|
+
`lossy_write = {"tokens", "extras"}` (docling stores one string per cell;
|
|
34
|
+
no home for IR extras). `writable = True`, so docling-tables is now a real
|
|
35
|
+
`analyze_loss` conversion target.
|
|
36
|
+
|
|
37
|
+
### Security
|
|
38
|
+
|
|
39
|
+
- Hardened the release pipeline ahead of the first PyPI publish (ADR 0014):
|
|
40
|
+
- All GitHub Actions are pinned to full commit SHAs (was mutable tags;
|
|
41
|
+
`pypa/gh-action-pypi-publish` now at the v1.14.0 SHA), with Dependabot
|
|
42
|
+
tracking bumps behind a 7-day cooldown (`.github/dependabot.yml`).
|
|
43
|
+
- The release workflow records a **SLSA build provenance** attestation
|
|
44
|
+
(`actions/attest-build-provenance`) and notes that PyPI **PEP 740**
|
|
45
|
+
publish attestations are emitted automatically by Trusted Publishing.
|
|
46
|
+
`skip-existing` makes a partial-failure re-run idempotent.
|
|
47
|
+
- CI (and the release build) route installs through Takumi Guard, a
|
|
48
|
+
screening proxy that blocks known-malicious packages; `[tool.uv]
|
|
49
|
+
exclude-newer` is pinned to an absolute date and `uv sync --locked`
|
|
50
|
+
guards against lockfile drift.
|
|
51
|
+
- PEP 639 SPDX license metadata (`license = "MIT"` + `license-files`;
|
|
52
|
+
core-metadata 2.4 via hatchling >= 1.29).
|
|
53
|
+
|
|
54
|
+
## [0.0.17] - 2026-05-29
|
|
55
|
+
|
|
56
|
+
### Added
|
|
57
|
+
|
|
58
|
+
- `TableSample.image_width` / `image_height` (`int | None`, default `None`):
|
|
59
|
+
optional sample-level source-image dimensions, peers of `filename`/`imgid`.
|
|
60
|
+
They participate in `__hash__`/`__eq__`. No codec carries them yet, so `None`
|
|
61
|
+
round-trips losslessly and no loss declaration changes (loss_matrix
|
|
62
|
+
unaffected). See `docs/adr/0012-strict-profile-image-bounds.md`.
|
|
63
|
+
|
|
64
|
+
### Changed
|
|
65
|
+
|
|
66
|
+
- `profiles.STRICT` now implements SPEC §8's bbox-in-image cross-check instead
|
|
67
|
+
of aliasing `DEFAULT`. STRICT = DEFAULT plus: a bbox-free sample needs no
|
|
68
|
+
image metadata; once any cell carries a bbox the sample MUST declare
|
|
69
|
+
`image_width`/`image_height` (`STRICT-IMAGE-METADATA`) and every bbox must
|
|
70
|
+
lie within the image rectangle `0 <= x0 < x1 <= width`,
|
|
71
|
+
`0 <= y0 < y1 <= height`, upper bound inclusive (`STRICT-BBOX-OUT-OF-BOUNDS`)
|
|
72
|
+
(ADR 0012, option C). The check is a containment test independent of bbox
|
|
73
|
+
precision, so it ships with int image dims and does not depend on OQ-3.
|
|
74
|
+
Scope is IR field + check only: no codec populates dims yet, so a
|
|
75
|
+
bbox-bearing codec-read sample fails STRICT until a codec carries dims
|
|
76
|
+
(accepted for opt-in 0.x; codec population is a future patch).
|
|
77
|
+
|
|
78
|
+
## [0.0.16] - 2026-05-29
|
|
79
|
+
|
|
80
|
+
### Added
|
|
81
|
+
|
|
82
|
+
- TEDS (Tree-Edit-Distance based Similarity) metric, the `[teds]` optional
|
|
83
|
+
feature (`apted`, `lxml`). `from tablecodec.teds import teds, teds_html`:
|
|
84
|
+
`teds(pred, true, *, structure_only=False)` scores two `TableSample`s in
|
|
85
|
+
`[0, 1]` (`structure_only` gives TEDS-Struct); `teds_html` does the same for
|
|
86
|
+
HTML strings. The tree construction, rename-cost rule, and
|
|
87
|
+
`1 - dist / max_nodes` formula are adapted from IBM's PubTabNet reference
|
|
88
|
+
metric (Apache-2.0; see `THIRD_PARTY_NOTICES.md` and
|
|
89
|
+
`docs/adr/0011-teds-metric-port.md`), with a pure-Python normalized
|
|
90
|
+
Levenshtein and no batching. `teds.py` is core-external (not in the semgrep
|
|
91
|
+
core list, never imported by `tablecodec/__init__`), so `import tablecodec`
|
|
92
|
+
stays zero-dependency. `just test`/`type`/`cov` now run with `--extra teds`;
|
|
93
|
+
the tests `importorskip` when it is absent.
|
|
94
|
+
|
|
95
|
+
## [0.0.15] - 2026-05-29
|
|
96
|
+
|
|
97
|
+
### Changed
|
|
98
|
+
|
|
99
|
+
- I-05 (bbox well-formed) now decides "empty cell" by **content**, not by
|
|
100
|
+
token count: a cell whose tokens concatenate to only whitespace
|
|
101
|
+
(`"".join(tokens).strip() == ""`) localizes nothing, so its placeholder
|
|
102
|
+
bbox is out of scope for the geometry check. This widens the 0.0.12
|
|
103
|
+
scoping (which only skipped `tokens == ()`) to also skip a lone
|
|
104
|
+
empty-string token `("",)` and whitespace-only tokens `(" ",)` — the
|
|
105
|
+
dominant residual finding in an e2e verification sweep (70/85 SynthTabNet
|
|
106
|
+
cells were `("",)`). Markup-only cells (e.g. `("<sup>", " ", "</sup>")`)
|
|
107
|
+
stay content-bearing and ARE geometry-checked: the core IR does not model
|
|
108
|
+
HTML, so any non-whitespace token counts as content (the IR-neutral
|
|
109
|
+
line). Validation-layer only (`_invariants.py`); codecs unchanged, no
|
|
110
|
+
`lossy_*` / round-trip impact. See `docs/spec.md` §5.2 and
|
|
111
|
+
`docs/adr/0010-i05-empty-cell-is-whitespace-content.md` (refines ADR
|
|
112
|
+
0007).
|
|
113
|
+
|
|
114
|
+
## [0.0.14] - 2026-05-29
|
|
115
|
+
|
|
116
|
+
### Removed
|
|
117
|
+
|
|
118
|
+
- The `fast` (`orjson`) and `validate` (`pydantic`) optional extras
|
|
119
|
+
(`pyproject.toml` + SPEC §13 dependency table). Both were declared but
|
|
120
|
+
wired nowhere, and could not be: the work they would touch — JSONL
|
|
121
|
+
parsing, IR construction, validation — runs inside the zero-dependency
|
|
122
|
+
core, where `semgrep.yaml` forbids third-party imports. Installing them
|
|
123
|
+
pulled in a package nothing could import. `tablecodec[teds]` (a separate,
|
|
124
|
+
core-external feature) and `tablecodec[cli]`/`[hf]` are unaffected.
|
|
125
|
+
Stricter validation remains available via the layered validation
|
|
126
|
+
profiles (SPEC §8), which are stdlib-only. See
|
|
127
|
+
`docs/adr/0009-drop-fast-and-validate-extras.md`.
|
|
128
|
+
|
|
129
|
+
## [0.0.13] - 2026-05-29
|
|
130
|
+
|
|
131
|
+
### Added
|
|
132
|
+
|
|
133
|
+
- `codecs.load_plugins()` — discovers and registers third-party codecs from
|
|
134
|
+
the `tablecodec.codecs` entry-point group (SPEC §6.2). Each entry point
|
|
135
|
+
references a `Codec` class (instantiated) or instance; already-registered
|
|
136
|
+
names are skipped (idempotent). The CLI now calls it after registering the
|
|
137
|
+
built-ins, so `pip install tablecodec-<x>` codecs appear in `codecs list`
|
|
138
|
+
and are usable by every command. Stdlib-only (`importlib.metadata`).
|
|
139
|
+
|
|
140
|
+
### Fixed
|
|
141
|
+
|
|
142
|
+
- E2E harness (`scripts/e2e_hf_check.py`): the DocTags round-trip adapter
|
|
143
|
+
parsed `sink.getvalue().splitlines()[0]`, which breaks when a cell token
|
|
144
|
+
contains a Unicode line separator (U+2028/U+2029/U+0085) that
|
|
145
|
+
`json.dumps(ensure_ascii=False)` leaves raw — slicing the record
|
|
146
|
+
mid-string (1/16k rows). It now parses the whole single-record buffer.
|
|
147
|
+
The DocTags codec was already correct (it emits valid JSON);
|
|
148
|
+
`--self-test` gains a U+2028 regression guard. Harness-only; no library
|
|
149
|
+
or codec change.
|
|
150
|
+
|
|
151
|
+
## [0.0.12] - 2026-05-29
|
|
152
|
+
|
|
153
|
+
### Changed
|
|
154
|
+
|
|
155
|
+
- I-05 (bbox well-formed) is now scoped to **content-bearing cells**: a
|
|
156
|
+
bbox on an empty cell (`tokens == ()`) is a placeholder region and is no
|
|
157
|
+
longer geometry-checked. A live sweep showed the dominant validation
|
|
158
|
+
finding was empty cells carrying zero-area placeholder boxes (≈45% of
|
|
159
|
+
sampled SynthTabNet tables); these are degenerate in the SOURCE data
|
|
160
|
+
(not introduced by our float→int cast), and an empty cell localizes no
|
|
161
|
+
content. The fix lives entirely in the validation layer
|
|
162
|
+
(`_invariants.py`); codecs are unchanged and still read/keep the bbox
|
|
163
|
+
faithfully (no `lossy_*` / round-trip impact). Degenerate bboxes on
|
|
164
|
+
content-bearing cells are still flagged. Profiles that require bbox
|
|
165
|
+
*presence* (`tableformer`, `pubtabnet-2.0`) are unaffected. See
|
|
166
|
+
`docs/spec.md` §5.2 and `docs/adr/0007-i05-empty-cell-bbox-scope.md`.
|
|
167
|
+
|
|
168
|
+
## [0.0.11] - 2026-05-29
|
|
169
|
+
|
|
170
|
+
### Fixed
|
|
171
|
+
|
|
172
|
+
- OTSL reconstruction follow-up (`_otslgrid.py`): `check_right`/`check_down`
|
|
173
|
+
now stop at cells already claimed by a 2D-span `registry`. Without this a
|
|
174
|
+
long `lcel` run in one row swallowed `xcel` cells belonging to a 2D span
|
|
175
|
+
from above, overlapping it (real SynthTabNet rows, e.g. imgid 6075). The
|
|
176
|
+
remaining SynthTabNet I-04 findings are genuine OTSL span ambiguity (L-shaped
|
|
177
|
+
regions that cannot form an exact-cover grid), matching the HTML path.
|
|
178
|
+
|
|
179
|
+
### Added
|
|
180
|
+
|
|
181
|
+
- E2E native PubTables-1M coverage (`scripts/e2e_hf_check.py`): reads the
|
|
182
|
+
original PASCAL VOC structure annotation (`bsmock/pubtables-1m`,
|
|
183
|
+
download-only) from a local tar under `input/` and reconstructs the
|
|
184
|
+
logical grid (rows × columns intersection, spanning-cell merge,
|
|
185
|
+
column-header role) for the `pubtables-1m` codec. The harness gained a
|
|
186
|
+
local-tar source alongside HF streaming; XML is parsed with `defusedxml`
|
|
187
|
+
(added to the `[hf]` extra). A live run reads 200/200 real VOC tables
|
|
188
|
+
clean. FinTabNet / TableBank natives remain download-only and
|
|
189
|
+
Docling-covered. See `docs/adr/0006-native-datasets-via-local-download.md`.
|
|
190
|
+
|
|
191
|
+
## [0.0.10] - 2026-05-28
|
|
192
|
+
|
|
193
|
+
### Fixed
|
|
194
|
+
|
|
195
|
+
- OTSL grid reconstruction (`codecs/_otslgrid.py::build_anchors`): complex
|
|
196
|
+
2D span topologies were mis-decoded — a diagonal `xcel` resolution plus
|
|
197
|
+
independent `max` colspan/rowspan inflated vertical-only spans into
|
|
198
|
+
overlapping boxes, and a column-0 `xcel` was wrongly rejected. A live
|
|
199
|
+
e2e sweep exposed this: `SynthTabNet_OTSL` through `otsl-1.0.0` scored
|
|
200
|
+
48/300 while every other corpus scored 300/300, and an HTML-vs-OTSL
|
|
201
|
+
cross-check on the same rows proved the token streams were well-formed.
|
|
202
|
+
`build_anchors` now reconstructs the grid with the anchor-centric
|
|
203
|
+
algorithm adapted (with attribution) from docling-ibm-models'
|
|
204
|
+
`otsl_to_html` — `check_right`/`check_down` span runs over `lcel`/`xcel`
|
|
205
|
+
and `ucel`/`xcel`, a 2D-span registry preventing double-claims, and
|
|
206
|
+
continuation tokens skipped rather than erroring. Fixes `otsl-1.0.0`,
|
|
207
|
+
`fintabnet-otsl`, `doctags-tables`, `pubtables-1m` (all call
|
|
208
|
+
`build_anchors`). License is unchanged (MIT → MIT requires only
|
|
209
|
+
attribution; see `THIRD_PARTY_NOTICES.md` and
|
|
210
|
+
`docs/adr/0005-port-otsl-reconstruction.md`).
|
|
211
|
+
|
|
212
|
+
### Added
|
|
213
|
+
|
|
214
|
+
- E2E harness (`scripts/e2e_hf_check.py`, `[hf]` extra): streams the
|
|
215
|
+
Docling OTSL dataset family through the codecs and validates the
|
|
216
|
+
resulting IR. Exercises the real `codec.read()` path (square-table
|
|
217
|
+
assumption, anchor/cell alignment, HTML structure parsing) against
|
|
218
|
+
real tables. Rows are randomly sampled (streaming shuffle reshuffles
|
|
219
|
+
shard order; each run prints its `--seed` for reproducibility), so
|
|
220
|
+
repeated runs progressively cover the corpora. HF logging / progress
|
|
221
|
+
bars are silenced so output is just the summary. Occasional /
|
|
222
|
+
local-only (not CI-gated); a network-free `--self-test` /
|
|
223
|
+
`just e2e-selftest` verifies the adapters through the real codecs.
|
|
224
|
+
See `docs/adr/0003-e2e-against-docling-otsl-family.md`.
|
|
225
|
+
All nine shipped codecs now have at least one official-corpus check:
|
|
226
|
+
the FinTabNet_OTSL checks route through the actual `fintabnet` /
|
|
227
|
+
`fintabnet-otsl` codecs (adapter bridges Docling's `imgid` to
|
|
228
|
+
`table_id`); `pubtabnet-1.0.0` / `tableformer` read the Docling HTML;
|
|
229
|
+
`tablebank` reads the HTML structure with cells omitted; `pubtables-1m`
|
|
230
|
+
reads object-detection records whose grid coords are derived from OTSL
|
|
231
|
+
placement; and `doctags-tables` is a real-content round-trip. Every
|
|
232
|
+
failed row is recorded as a JSONL finding
|
|
233
|
+
under `output/e2e_findings/` (gitignored) — with full provenance
|
|
234
|
+
(dataset/split/codec/seed/row_index), the offending cell, and the
|
|
235
|
+
exact `input_payload` so a finding can be replayed and judged
|
|
236
|
+
(library bug vs. malformed upstream data vs. over-strict invariant);
|
|
237
|
+
`verdict` is always `needs-review`.
|
|
238
|
+
The PubTabNet codecs additionally read their first-published dataset in
|
|
239
|
+
its **native** shape via `apoidea/pubtabnet-html` (the original
|
|
240
|
+
PubTabNet 2.0 `html` annotation, fed unmodified — not the Docling OTSL
|
|
241
|
+
conversion). The other codecs' native originals (FinTabNet, TableBank,
|
|
242
|
+
PubTables-1M PASCAL VOC) ship as tar.gz / image files not exposed
|
|
243
|
+
through the HF Datasets viewer, so they stay Docling-covered.
|
|
244
|
+
See `docs/adr/0004-e2e-native-first-published-datasets.md`.
|
|
245
|
+
|
|
246
|
+
## [0.0.9] - 2026-05-28
|
|
247
|
+
|
|
248
|
+
### Added
|
|
249
|
+
|
|
250
|
+
- FinTabNet_OTSL codec (`fintabnet-otsl`, HF `ds4sd/FinTabNet_OTSL`):
|
|
251
|
+
OTSL structure with FinTabNet provenance — a `table_id` identifier
|
|
252
|
+
(mapped onto `imgid`) and an `extras` dict (e.g. `otsl_raw`). It is the
|
|
253
|
+
**first codec that round-trips IR `extras`**, so `extras` is
|
|
254
|
+
deliberately absent from `lossy_write` (`lossy_read = {"role"}`,
|
|
255
|
+
`lossy_write = {"role"}`). Structure handling is shared with OTSL via
|
|
256
|
+
`_otslgrid`. `sniff()` requires both `otsl` and `table_id` keys. This
|
|
257
|
+
brings the SPEC §7 initial codec set to nine.
|
|
258
|
+
- `_otslgrid` gains `otsl_to_cells` / `cells_to_otsl` so OTSL and
|
|
259
|
+
FinTabNet_OTSL share the OTSL payload↔GridCell mapping.
|
|
260
|
+
|
|
261
|
+
### Changed
|
|
262
|
+
|
|
263
|
+
- `otsl.py` delegates its payload↔sample mapping to the new `_otslgrid`
|
|
264
|
+
helpers (Tidy First, no behaviour change).
|
|
265
|
+
|
|
266
|
+
## [0.0.8] - 2026-05-28
|
|
267
|
+
|
|
268
|
+
### Fixed
|
|
269
|
+
|
|
270
|
+
- `tablecodec codecs list` now lists every built-in codec. The CLI's
|
|
271
|
+
built-in registration had drifted — it still seeded only the three
|
|
272
|
+
codecs that existed when the CLI was written (pubtabnet-1.0.0,
|
|
273
|
+
pubtabnet-2.0.0, otsl-1.0.0), omitting fintabnet, tableformer,
|
|
274
|
+
tablebank, pubtables-1m, and doctags-tables.
|
|
275
|
+
|
|
276
|
+
### Changed
|
|
277
|
+
|
|
278
|
+
- Introduced `tablecodec.codecs.builtins.BUILTIN_CODECS` as the single
|
|
279
|
+
source of truth for the shipped codecs. The CLI and both doc
|
|
280
|
+
generators now consume it instead of each maintaining their own list
|
|
281
|
+
(no doc-output change; removes the triplicated registration).
|
|
282
|
+
|
|
283
|
+
## [0.0.7] - 2026-05-28
|
|
284
|
+
|
|
285
|
+
### Added
|
|
286
|
+
|
|
287
|
+
- DocTags table subset codec (`doctags-tables`): reads the IBM
|
|
288
|
+
Granite-Docling table markup — OTSL cell tokens wrapped in
|
|
289
|
+
`<otsl>`...`</otsl>`, each anchor annotated with four `<loc_n>` tokens
|
|
290
|
+
(a 0–500 grid bbox) plus content tokens. Read is full (structure +
|
|
291
|
+
bbox + content); write emits the OTSL-equivalent subset, so `role` is
|
|
292
|
+
lost (`lossy_read = {"role"}`, `lossy_write = {"role", "extras"}`,
|
|
293
|
+
SPEC §7 △). `sniff()` matches the `doctags` key.
|
|
294
|
+
- `_otslgrid` shared module: the OTSL structure↔grid machinery
|
|
295
|
+
(`split_rows`, `ensure_square`, `build_anchors`, `build_token_grid`)
|
|
296
|
+
extracted from `otsl.py` so OTSL and DocTags share one implementation.
|
|
297
|
+
|
|
298
|
+
### Changed
|
|
299
|
+
|
|
300
|
+
- `otsl.py` now delegates its grid parsing/serialization to `_otslgrid`
|
|
301
|
+
(Tidy First, no behaviour change).
|
|
302
|
+
|
|
303
|
+
## [0.0.6] - 2026-05-28
|
|
304
|
+
|
|
305
|
+
### Added
|
|
306
|
+
|
|
307
|
+
- PubTables-1M codec (`pubtables-1m`): the first **read-only** codec.
|
|
308
|
+
Reads the object-detection format (cells carry explicit
|
|
309
|
+
row/col/rowspan/colspan/bbox in detection order) and normalises to
|
|
310
|
+
row-major IR; derives nrows/ncols when absent. `write` raises
|
|
311
|
+
`NotImplementedError`.
|
|
312
|
+
- `Codec.writable` flag (ADR 0002): boolean capability on the Codec
|
|
313
|
+
Protocol. All writable codecs default to `True`; read-only codecs set it to
|
|
314
|
+
`False`. `analyze_loss` short-circuits to a new
|
|
315
|
+
`round_trip_classification` value **`"unwritable"`** when the target
|
|
316
|
+
is read-only, and the loss matrix renders it as ⚫. `format_support.md`
|
|
317
|
+
gains a "Writable" column.
|
|
318
|
+
|
|
319
|
+
### Changed
|
|
320
|
+
|
|
321
|
+
- Every built-in codec now declares `writable` (mechanical, defaults to
|
|
322
|
+
`True`).
|
|
323
|
+
|
|
324
|
+
## [0.0.5] - 2026-05-28
|
|
325
|
+
|
|
326
|
+
### Added
|
|
327
|
+
|
|
328
|
+
- TableBank codec (`tablebank`): a structure-only format — the source
|
|
329
|
+
ships `html.structure` with no `html.cells`, so on read every cell is
|
|
330
|
+
empty (`tokens=()`, `bbox=None`) and the grid shape is reconstructed
|
|
331
|
+
from the structure tokens. Write emits structure only. `lossy_read =
|
|
332
|
+
{"tokens", "bbox"}`, `lossy_write = {"tokens", "bbox", "extras"}` —
|
|
333
|
+
so TableBank is the first codec to surface `lossy` (🔴) classifications
|
|
334
|
+
in the loss matrix (token loss is not structure-preserving). `sniff()`
|
|
335
|
+
requires `html.structure` present and `html.cells` absent.
|
|
336
|
+
- `_htmltable` gains `parse_html_structure_only` /
|
|
337
|
+
`serialize_html_structure_only` and a `require_no_cells` sniff knob.
|
|
338
|
+
|
|
339
|
+
## [0.0.4] - 2026-05-28
|
|
340
|
+
|
|
341
|
+
### Added
|
|
342
|
+
|
|
343
|
+
- TableFormer Format codec (`tableformer`): PubTabNet 2.0's HTML-token
|
|
344
|
+
structure plus the invariant that EVERY cell — including empty ones —
|
|
345
|
+
carries a bbox. The codec enforces this on read (raising a clear error
|
|
346
|
+
if any cell lacks a bbox), so its output always satisfies
|
|
347
|
+
`profiles.TABLEFORMER`. `sniff()` requires all cells to have a bbox,
|
|
348
|
+
which distinguishes it from PubTabNet (whose empty cells omit bbox).
|
|
349
|
+
`lossy_read = {}`, `lossy_write = {"extras"}`.
|
|
350
|
+
|
|
351
|
+
## [0.0.3] - 2026-05-28
|
|
352
|
+
|
|
353
|
+
### Added
|
|
354
|
+
|
|
355
|
+
- FinTabNet (original) codec (`fintabnet`): same HTML-token structure as
|
|
356
|
+
PubTabNet 2.0, with `table_id` as the record identifier instead of
|
|
357
|
+
`imgid`. Reads/writes via the shared `_htmltable` machinery with
|
|
358
|
+
`id_field="table_id"`; `sniff()` requires the `table_id` key so a
|
|
359
|
+
PubTabNet (imgid) record is not mis-detected as FinTabNet.
|
|
360
|
+
`lossy_read = {}`, `lossy_write = {"extras"}`.
|
|
361
|
+
|
|
362
|
+
### Changed
|
|
363
|
+
|
|
364
|
+
- Extracted the HTML-token parser / grid-placement / serializer out of
|
|
365
|
+
`codecs/pubtabnet.py` into `codecs/_htmltable.py` (Tidy First, no
|
|
366
|
+
behaviour change) so PubTabNet and FinTabNet share one implementation.
|
|
367
|
+
- `docs/format_support.md` now also lists `otsl-1.0.0` (previously the
|
|
368
|
+
generator only seeded the two PubTabNet codecs).
|
|
369
|
+
|
|
370
|
+
## [0.0.2] - 2026-05-28
|
|
371
|
+
|
|
372
|
+
Development preview (0.0.x makes no stability promises). Stdlib-only
|
|
373
|
+
core, three codecs, streaming I/O, static loss analysis, optional CLI,
|
|
374
|
+
and an in-repo conformance suite. Not published to PyPI yet — codecs
|
|
375
|
+
are being added incrementally within the 0.0.x series.
|
|
376
|
+
|
|
377
|
+
### Added
|
|
378
|
+
|
|
379
|
+
- Repository bootstrap (M0): `pyproject.toml` (hatchling, Python 3.11+),
|
|
380
|
+
`justfile`, `ruff.toml`, `pyrightconfig.json`, GitHub Actions CI matrix
|
|
381
|
+
(Python 3.11–3.13 × Ubuntu/macOS), `semgrep.yaml` enforcing
|
|
382
|
+
SPEC §13 zero-dependency policy, MIT license, smoke test scaffold.
|
|
383
|
+
- Internal Representation (M1): SPEC §5.1 `BBox`, `GridCell`,
|
|
384
|
+
`TableSample` as frozen, slotted, hashable dataclasses; SPEC §5.2
|
|
385
|
+
invariants I-01..I-07 each as an independent `check_iXX` function
|
|
386
|
+
returning `list[ValidationError]`. SPEC §8 validation profiles
|
|
387
|
+
(`LENIENT`, `DEFAULT`, `PUBTABNET_2_0`, `TABLEFORMER`, `STRICT`)
|
|
388
|
+
exposed via `tablecodec.profiles` and orchestrated by `validate()`.
|
|
389
|
+
Hypothesis-driven property tests (10,000 cases) verify that valid
|
|
390
|
+
samples pass every profile and that a single broken invariant is
|
|
391
|
+
reported by its own check function without spurious cross-talk.
|
|
392
|
+
Coverage 100% across all M1 modules; pyright strict clean.
|
|
393
|
+
- Codec layer (M2): SPEC §6 `Codec` Protocol (`@property` getters so
|
|
394
|
+
frozen-dataclass implementations satisfy the protocol) in
|
|
395
|
+
`tablecodec.codecs._base`; in-process registry (`register`, `get`,
|
|
396
|
+
`list_codecs`, `detect`) in `tablecodec.codecs`. First codec:
|
|
397
|
+
`PubTabNet20Codec` (`pubtabnet-2.0.0`) with streaming `read` /
|
|
398
|
+
`write`, span-aware HTML table-placement algorithm, honest
|
|
399
|
+
`lossy_read` (empty) and `lossy_write` (`{"extras"}`), and a
|
|
400
|
+
`sniff()` delegate for `codecs.detect()`. Round-trip tests verify
|
|
401
|
+
that `read → write → read` is the identity for non-extras payloads.
|
|
402
|
+
- Streaming I/O + PubTabNet 1.0 (M3): `tablecodec.io.open()` accepts a
|
|
403
|
+
path-like or text stream and returns a streaming iterator; auto-detect
|
|
404
|
+
via `tablecodec.io.detect()` peeks the source without consuming it.
|
|
405
|
+
Second codec: `PubTabNet10Codec` (`pubtabnet-1.0.0`) — same format
|
|
406
|
+
family minus bbox; `lossy_read = {"bbox"}`, `lossy_write =
|
|
407
|
+
{"bbox", "extras"}`. Sniff discriminates the two versions by bbox
|
|
408
|
+
presence in the first record. SPEC §10 streaming guarantee verified
|
|
409
|
+
by tracemalloc-instrumented test: 100,000 pubtabnet-2.0 records read
|
|
410
|
+
with peak < 50 MB. `docs/format_support.md` is auto-generated by
|
|
411
|
+
`scripts/gen_format_support.py` and CI fails if it goes stale
|
|
412
|
+
(`just docs-check`). `tests/benchmarks/` houses pytest-benchmark
|
|
413
|
+
micro-benchmarks (deselected from default run, executed by
|
|
414
|
+
`just bench` and the new `.github/workflows/benchmark.yaml`).
|
|
415
|
+
- OTSL 1.0 codec (M4): `OTSL10Codec` (`otsl-1.0.0`) implements the
|
|
416
|
+
five-token OTSL grammar from arXiv 2305.03393 (`fcel`, `ecel`,
|
|
417
|
+
`lcel`, `ucel`, `xcel`, plus `nl`). Square-table assumption is
|
|
418
|
+
enforced on read (jagged row widths rejected with a clear error).
|
|
419
|
+
Continuation tokens (lcel/ucel/xcel) extend the anchor cell they
|
|
420
|
+
reference; the IR is reconstructed in two passes (parse rows →
|
|
421
|
+
resolve anchors). The implementation is derived from the paper, not
|
|
422
|
+
copied from `docling-ibm-models/tableformer/otsl.py`. `lossy_read =
|
|
423
|
+
{"role"}` and `lossy_write = {"extras", "role"}` are honest about
|
|
424
|
+
the header/body distinction collapsing through OTSL — a property
|
|
425
|
+
verified by a cross-codec test that round-trips a PubTabNet sample
|
|
426
|
+
with header cells through OTSL and observes role=body on return.
|
|
427
|
+
- Loss analysis (M5): `tablecodec.analyze_loss(source, target)` returns
|
|
428
|
+
a `LossReport` derived statically from the codecs' `lossy_read` and
|
|
429
|
+
`lossy_write` declarations — no data is read. The round-trip
|
|
430
|
+
classification distinguishes `lossless` (nothing dropped),
|
|
431
|
+
`structure-preserving` (only auxiliary `bbox`/`role`/`extras` lost),
|
|
432
|
+
and `lossy` (any other field lost). `docs/loss_matrix.md` is
|
|
433
|
+
auto-generated by `scripts/gen_loss_matrix.py` and the same
|
|
434
|
+
`just docs-check` gate that protects `format_support.md` also
|
|
435
|
+
protects it.
|
|
436
|
+
- CLI (M6): `tablecodec` console script (`[project.scripts]`) backed by
|
|
437
|
+
`src/tablecodec/cli.py` and the `[cli]` extra (click 8.x). Six
|
|
438
|
+
subcommands: `validate`, `convert`, `stats`, `diff`, `analyze-loss`,
|
|
439
|
+
`codecs list`. Every command streams input; non-zero exit on
|
|
440
|
+
validation failures and diff mismatches. `convert --dry-run` prints
|
|
441
|
+
the static `analyze_loss` report without touching the input file.
|
|
442
|
+
CLI is wholly optional — the core continues to install and run
|
|
443
|
+
without click (verified by the existing pip-install-check job).
|
|
444
|
+
- Conformance suite skeleton (M7): the SPEC §11 corpus is bootstrapped
|
|
445
|
+
in-repo under `conformance/` (manifest `INDEX.json` + draft-2020-12
|
|
446
|
+
JSON Schema + samples + hand-authored expected-IR JSON), pending
|
|
447
|
+
extraction to a separate vendor-neutral repository before v1.0 (see
|
|
448
|
+
`docs/adr/0001-conformance-suite-in-repo-temporarily.md`).
|
|
449
|
+
`tests/test_conformance.py` validates `INDEX.json` against its schema
|
|
450
|
+
and runs every case (3 × pubtabnet-2.0.0, 3 × otsl-1.0.0) by reading
|
|
451
|
+
the sample and comparing the IR to the independent expectation.
|
|
452
|
+
`jsonschema` added to the `[dev]` extra (test-only).
|
|
453
|
+
|
|
454
|
+
<!-- v0.0.18 is the first release to be cut (tag + GitHub Release created by
|
|
455
|
+
.github/workflows/release.yaml). Earlier 0.0.x headings stay plain text
|
|
456
|
+
(no tags were pushed for them). -->
|
|
457
|
+
[Unreleased]: https://github.com/hironow/tablecodec/compare/v0.0.18...main
|
|
458
|
+
[0.0.18]: https://github.com/hironow/tablecodec/releases/tag/v0.0.18
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 hironow and tablecodec contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|