tablecodec 0.0.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. tablecodec-0.0.18/.gitignore +60 -0
  2. tablecodec-0.0.18/CHANGELOG.md +458 -0
  3. tablecodec-0.0.18/LICENSE +21 -0
  4. tablecodec-0.0.18/PKG-INFO +200 -0
  5. tablecodec-0.0.18/README.md +151 -0
  6. tablecodec-0.0.18/docs/spec.md +438 -0
  7. tablecodec-0.0.18/pyproject.toml +212 -0
  8. tablecodec-0.0.18/src/tablecodec/__init__.py +29 -0
  9. tablecodec-0.0.18/src/tablecodec/_invariants.py +311 -0
  10. tablecodec-0.0.18/src/tablecodec/cli.py +314 -0
  11. tablecodec-0.0.18/src/tablecodec/codecs/__init__.py +111 -0
  12. tablecodec-0.0.18/src/tablecodec/codecs/_base.py +79 -0
  13. tablecodec-0.0.18/src/tablecodec/codecs/_htmltable.py +510 -0
  14. tablecodec-0.0.18/src/tablecodec/codecs/_otslgrid.py +318 -0
  15. tablecodec-0.0.18/src/tablecodec/codecs/builtins.py +36 -0
  16. tablecodec-0.0.18/src/tablecodec/codecs/doctags.py +278 -0
  17. tablecodec-0.0.18/src/tablecodec/codecs/fintabnet.py +84 -0
  18. tablecodec-0.0.18/src/tablecodec/codecs/fintabnet_otsl.py +141 -0
  19. tablecodec-0.0.18/src/tablecodec/codecs/otsl.py +138 -0
  20. tablecodec-0.0.18/src/tablecodec/codecs/pubtables1m.py +161 -0
  21. tablecodec-0.0.18/src/tablecodec/codecs/pubtabnet.py +128 -0
  22. tablecodec-0.0.18/src/tablecodec/codecs/tablebank.py +76 -0
  23. tablecodec-0.0.18/src/tablecodec/codecs/tableformer.py +80 -0
  24. tablecodec-0.0.18/src/tablecodec/io.py +91 -0
  25. tablecodec-0.0.18/src/tablecodec/ir.py +101 -0
  26. tablecodec-0.0.18/src/tablecodec/loss.py +105 -0
  27. tablecodec-0.0.18/src/tablecodec/py.typed +0 -0
  28. tablecodec-0.0.18/src/tablecodec/teds.py +243 -0
  29. tablecodec-0.0.18/src/tablecodec/validate.py +185 -0
  30. tablecodec-0.0.18/tests/benchmarks/test_codec_benchmarks.py +70 -0
  31. tablecodec-0.0.18/tests/codecs/test_doctags.py +104 -0
  32. tablecodec-0.0.18/tests/codecs/test_fintabnet.py +117 -0
  33. tablecodec-0.0.18/tests/codecs/test_fintabnet_otsl.py +89 -0
  34. tablecodec-0.0.18/tests/codecs/test_otsl.py +256 -0
  35. tablecodec-0.0.18/tests/codecs/test_otsl_to_pubtabnet.py +99 -0
  36. tablecodec-0.0.18/tests/codecs/test_pubtables1m.py +106 -0
  37. tablecodec-0.0.18/tests/codecs/test_pubtabnet.py +235 -0
  38. tablecodec-0.0.18/tests/codecs/test_pubtabnet_v10.py +89 -0
  39. tablecodec-0.0.18/tests/codecs/test_registry.py +155 -0
  40. tablecodec-0.0.18/tests/codecs/test_tablebank.py +116 -0
  41. tablecodec-0.0.18/tests/codecs/test_tableformer.py +127 -0
  42. tablecodec-0.0.18/tests/conftest.py +4 -0
  43. tablecodec-0.0.18/tests/fixtures/doctags/simple_2x2.jsonl +1 -0
  44. tablecodec-0.0.18/tests/fixtures/doctags/with_span_and_empty.jsonl +1 -0
  45. tablecodec-0.0.18/tests/fixtures/fintabnet/simple_2x2.jsonl +1 -0
  46. tablecodec-0.0.18/tests/fixtures/fintabnet/with_colspan.jsonl +1 -0
  47. tablecodec-0.0.18/tests/fixtures/fintabnet_otsl/simple_2x2.jsonl +1 -0
  48. tablecodec-0.0.18/tests/fixtures/otsl/simple_2x2.jsonl +1 -0
  49. tablecodec-0.0.18/tests/fixtures/otsl/with_2x2_span.jsonl +1 -0
  50. tablecodec-0.0.18/tests/fixtures/otsl/with_colspan.jsonl +1 -0
  51. tablecodec-0.0.18/tests/fixtures/otsl/with_empty.jsonl +1 -0
  52. tablecodec-0.0.18/tests/fixtures/otsl/with_rowspan.jsonl +1 -0
  53. tablecodec-0.0.18/tests/fixtures/pubtables1m/simple_2x2.jsonl +1 -0
  54. tablecodec-0.0.18/tests/fixtures/pubtables1m/with_span.jsonl +1 -0
  55. tablecodec-0.0.18/tests/fixtures/pubtabnet/simple_2x2.jsonl +1 -0
  56. tablecodec-0.0.18/tests/fixtures/pubtabnet/v10_simple_2x2.jsonl +1 -0
  57. tablecodec-0.0.18/tests/fixtures/pubtabnet/with_empty.jsonl +1 -0
  58. tablecodec-0.0.18/tests/fixtures/pubtabnet/with_rowspan.jsonl +1 -0
  59. tablecodec-0.0.18/tests/fixtures/tablebank/simple_2x2.jsonl +1 -0
  60. tablecodec-0.0.18/tests/fixtures/tablebank/with_rowspan.jsonl +1 -0
  61. tablecodec-0.0.18/tests/fixtures/tableformer/empty_with_bbox.jsonl +1 -0
  62. tablecodec-0.0.18/tests/fixtures/tableformer/simple_2x2.jsonl +1 -0
  63. tablecodec-0.0.18/tests/strategies.py +130 -0
  64. tablecodec-0.0.18/tests/test_cli.py +243 -0
  65. tablecodec-0.0.18/tests/test_conformance.py +112 -0
  66. tablecodec-0.0.18/tests/test_invariants.py +616 -0
  67. tablecodec-0.0.18/tests/test_invariants_hypothesis.py +183 -0
  68. tablecodec-0.0.18/tests/test_io.py +78 -0
  69. tablecodec-0.0.18/tests/test_io_streaming.py +99 -0
  70. tablecodec-0.0.18/tests/test_ir.py +230 -0
  71. tablecodec-0.0.18/tests/test_loss.py +135 -0
  72. tablecodec-0.0.18/tests/test_smoke.py +45 -0
  73. tablecodec-0.0.18/tests/test_spec_surface.py +292 -0
  74. tablecodec-0.0.18/tests/test_teds.py +310 -0
  75. tablecodec-0.0.18/tests/test_validate.py +321 -0
@@ -0,0 +1,60 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Build artifacts
9
+ build/
10
+ dist/
11
+ *.egg-info/
12
+ *.egg
13
+ wheels/
14
+
15
+ # Test / coverage / lint caches
16
+ .pytest_cache/
17
+ .ruff_cache/
18
+ .mypy_cache/
19
+ .pyright/
20
+ .coverage
21
+ .coverage.*
22
+ htmlcov/
23
+ coverage.xml
24
+ .tox/
25
+
26
+ # Virtual environments
27
+ .venv/
28
+ venv/
29
+ env/
30
+
31
+ # uv
32
+ .uv-cache/
33
+
34
+ # Editor / OS
35
+ .idea/
36
+ .vscode/
37
+ !.vscode/extensions.json
38
+ *.swp
39
+ *.swo
40
+ .DS_Store
41
+
42
+ # hypothesis
43
+ .hypothesis/
44
+
45
+ # Local secrets / env
46
+ .env
47
+ .env.local
48
+
49
+ # Generated artifacts (e.g. e2e findings records for local audit)
50
+ output/
51
+
52
+ # Downloaded raw datasets for local-only native-format e2e (large; never
53
+ # committed), e.g. PubTables-1M PASCAL VOC, FinTabNet.c, TableBank.
54
+ input/
55
+
56
+ # Local-only, untracked working area (e.g. draft release workflow kept
57
+ # out of the public repo until release automation is enabled).
58
+ private/
59
+
60
+ .claude/scheduled_tasks.lock
@@ -0,0 +1,458 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.0.18] - 2026-06-07
11
+
12
+ ### Added
13
+
14
+ - Conformance corpus (`conformance/`, SPEC §11) now covers **all nine codecs**
15
+ (was 2): added an independently-authored sample + expected-IR per codec for
16
+ `pubtabnet-1.0.0`, `fintabnet`, `fintabnet-otsl`, `tableformer`, `tablebank`,
17
+ `pubtables-1m`, `doctags-tables`. `tests/test_conformance.py` registers the
18
+ full builtin set and runs every case, so read-path regressions are caught
19
+ for every codec.
20
+
21
+ - `packages/tablecodec-docling/` — a bridge codec (`docling-tables`, own
22
+ version 0.0.2) mapping between `DoclingDocument.tables` and `TableSample`,
23
+ developed in-repo as a temporary monorepo member (ADR 0013, SPEC §15). It
24
+ depends on docling-core and lives in its own uv project, so the stdlib-only
25
+ core package and its environment are unaffected. Discover it via
26
+ `tablecodec.codecs.load_plugins()`. Run its checks with `just docling-ci`
27
+ (or `just ci-all` for the whole monorepo).
28
+ - **read** (0.0.1): JSONL of `DoclingDocument`s -> one `TableSample` per
29
+ table; populates `image_width`/`image_height` from page size so
30
+ docling-read samples can be validated under the STRICT profile.
31
+ - **write** (0.0.2): each `TableSample` -> one `DoclingDocument` (the inverse
32
+ of read), so `read(write([s]))` round-trips modulo
33
+ `lossy_write = {"tokens", "extras"}` (docling stores one string per cell;
34
+ no home for IR extras). `writable = True`, so docling-tables is now a real
35
+ `analyze_loss` conversion target.
36
+
37
+ ### Security
38
+
39
+ - Hardened the release pipeline ahead of the first PyPI publish (ADR 0014):
40
+ - All GitHub Actions are pinned to full commit SHAs (was mutable tags;
41
+ `pypa/gh-action-pypi-publish` now at the v1.14.0 SHA), with Dependabot
42
+ tracking bumps behind a 7-day cooldown (`.github/dependabot.yml`).
43
+ - The release workflow records a **SLSA build provenance** attestation
44
+ (`actions/attest-build-provenance`) and notes that PyPI **PEP 740**
45
+ publish attestations are emitted automatically by Trusted Publishing.
46
+ `skip-existing` makes a partial-failure re-run idempotent.
47
+ - CI (and the release build) route installs through Takumi Guard, a
48
+ screening proxy that blocks known-malicious packages; `[tool.uv]
49
+ exclude-newer` is pinned to an absolute date and `uv sync --locked`
50
+ guards against lockfile drift.
51
+ - PEP 639 SPDX license metadata (`license = "MIT"` + `license-files`;
52
+ core-metadata 2.4 via hatchling >= 1.29).
53
+
54
+ ## [0.0.17] - 2026-05-29
55
+
56
+ ### Added
57
+
58
+ - `TableSample.image_width` / `image_height` (`int | None`, default `None`):
59
+ optional sample-level source-image dimensions, peers of `filename`/`imgid`.
60
+ They participate in `__hash__`/`__eq__`. No codec carries them yet, so `None`
61
+ round-trips losslessly and no loss declaration changes (loss_matrix
62
+ unaffected). See `docs/adr/0012-strict-profile-image-bounds.md`.
63
+
64
+ ### Changed
65
+
66
+ - `profiles.STRICT` now implements SPEC §8's bbox-in-image cross-check instead
67
+ of aliasing `DEFAULT`. STRICT = DEFAULT plus: a bbox-free sample needs no
68
+ image metadata; once any cell carries a bbox the sample MUST declare
69
+ `image_width`/`image_height` (`STRICT-IMAGE-METADATA`) and every bbox must
70
+ lie within the image rectangle `0 <= x0 < x1 <= width`,
71
+ `0 <= y0 < y1 <= height`, upper bound inclusive (`STRICT-BBOX-OUT-OF-BOUNDS`)
72
+ (ADR 0012, option C). The check is a containment test independent of bbox
73
+ precision, so it ships with int image dims and does not depend on OQ-3.
74
+ Scope is IR field + check only: no codec populates dims yet, so a
75
+ bbox-bearing codec-read sample fails STRICT until a codec carries dims
76
+ (accepted for opt-in 0.x; codec population is a future patch).
77
+
78
+ ## [0.0.16] - 2026-05-29
79
+
80
+ ### Added
81
+
82
+ - TEDS (Tree-Edit-Distance based Similarity) metric, the `[teds]` optional
83
+ feature (`apted`, `lxml`). `from tablecodec.teds import teds, teds_html`:
84
+ `teds(pred, true, *, structure_only=False)` scores two `TableSample`s in
85
+ `[0, 1]` (`structure_only` gives TEDS-Struct); `teds_html` does the same for
86
+ HTML strings. The tree construction, rename-cost rule, and
87
+ `1 - dist / max_nodes` formula are adapted from IBM's PubTabNet reference
88
+ metric (Apache-2.0; see `THIRD_PARTY_NOTICES.md` and
89
+ `docs/adr/0011-teds-metric-port.md`), with a pure-Python normalized
90
+ Levenshtein and no batching. `teds.py` is core-external (not in the semgrep
91
+ core list, never imported by `tablecodec/__init__`), so `import tablecodec`
92
+ stays zero-dependency. `just test`/`type`/`cov` now run with `--extra teds`;
93
+ the tests `importorskip` when it is absent.
94
+
95
+ ## [0.0.15] - 2026-05-29
96
+
97
+ ### Changed
98
+
99
+ - I-05 (bbox well-formed) now decides "empty cell" by **content**, not by
100
+ token count: a cell whose tokens concatenate to only whitespace
101
+ (`"".join(tokens).strip() == ""`) localizes nothing, so its placeholder
102
+ bbox is out of scope for the geometry check. This widens the 0.0.12
103
+ scoping (which only skipped `tokens == ()`) to also skip a lone
104
+ empty-string token `("",)` and whitespace-only tokens `(" ",)` — the
105
+ dominant residual finding in an e2e verification sweep (70/85 SynthTabNet
106
+ cells were `("",)`). Markup-only cells (e.g. `("<sup>", " ", "</sup>")`)
107
+ stay content-bearing and ARE geometry-checked: the core IR does not model
108
+ HTML, so any non-whitespace token counts as content (the IR-neutral
109
+ line). Validation-layer only (`_invariants.py`); codecs unchanged, no
110
+ `lossy_*` / round-trip impact. See `docs/spec.md` §5.2 and
111
+ `docs/adr/0010-i05-empty-cell-is-whitespace-content.md` (refines ADR
112
+ 0007).
113
+
114
+ ## [0.0.14] - 2026-05-29
115
+
116
+ ### Removed
117
+
118
+ - The `fast` (`orjson`) and `validate` (`pydantic`) optional extras
119
+ (`pyproject.toml` + SPEC §13 dependency table). Both were declared but
120
+ wired nowhere, and could not be: the work they would touch — JSONL
121
+ parsing, IR construction, validation — runs inside the zero-dependency
122
+ core, where `semgrep.yaml` forbids third-party imports. Installing them
123
+ pulled in a package nothing could import. `tablecodec[teds]` (a separate,
124
+ core-external feature) and `tablecodec[cli]`/`[hf]` are unaffected.
125
+ Stricter validation remains available via the layered validation
126
+ profiles (SPEC §8), which are stdlib-only. See
127
+ `docs/adr/0009-drop-fast-and-validate-extras.md`.
128
+
129
+ ## [0.0.13] - 2026-05-29
130
+
131
+ ### Added
132
+
133
+ - `codecs.load_plugins()` — discovers and registers third-party codecs from
134
+ the `tablecodec.codecs` entry-point group (SPEC §6.2). Each entry point
135
+ references a `Codec` class (instantiated) or instance; already-registered
136
+ names are skipped (idempotent). The CLI now calls it after registering the
137
+ built-ins, so `pip install tablecodec-<x>` codecs appear in `codecs list`
138
+ and are usable by every command. Stdlib-only (`importlib.metadata`).
139
+
140
+ ### Fixed
141
+
142
+ - E2E harness (`scripts/e2e_hf_check.py`): the DocTags round-trip adapter
143
+ parsed `sink.getvalue().splitlines()[0]`, which breaks when a cell token
144
+ contains a Unicode line separator (U+2028/U+2029/U+0085) that
145
+ `json.dumps(ensure_ascii=False)` leaves raw — slicing the record
146
+ mid-string (1/16k rows). It now parses the whole single-record buffer.
147
+ The DocTags codec was already correct (it emits valid JSON);
148
+ `--self-test` gains a U+2028 regression guard. Harness-only; no library
149
+ or codec change.
150
+
151
+ ## [0.0.12] - 2026-05-29
152
+
153
+ ### Changed
154
+
155
+ - I-05 (bbox well-formed) is now scoped to **content-bearing cells**: a
156
+ bbox on an empty cell (`tokens == ()`) is a placeholder region and is no
157
+ longer geometry-checked. A live sweep showed the dominant validation
158
+ finding was empty cells carrying zero-area placeholder boxes (≈45% of
159
+ sampled SynthTabNet tables); these are degenerate in the SOURCE data
160
+ (not introduced by our float→int cast), and an empty cell localizes no
161
+ content. The fix lives entirely in the validation layer
162
+ (`_invariants.py`); codecs are unchanged and still read/keep the bbox
163
+ faithfully (no `lossy_*` / round-trip impact). Degenerate bboxes on
164
+ content-bearing cells are still flagged. Profiles that require bbox
165
+ *presence* (`tableformer`, `pubtabnet-2.0`) are unaffected. See
166
+ `docs/spec.md` §5.2 and `docs/adr/0007-i05-empty-cell-bbox-scope.md`.
167
+
168
+ ## [0.0.11] - 2026-05-29
169
+
170
+ ### Fixed
171
+
172
+ - OTSL reconstruction follow-up (`_otslgrid.py`): `check_right`/`check_down`
173
+ now stop at cells already claimed by a 2D-span `registry`. Without this a
174
+ long `lcel` run in one row swallowed `xcel` cells belonging to a 2D span
175
+ from above, overlapping it (real SynthTabNet rows, e.g. imgid 6075). The
176
+ remaining SynthTabNet I-04 findings are genuine OTSL span ambiguity (L-shaped
177
+ regions that cannot form an exact-cover grid), matching the HTML path.
178
+
179
+ ### Added
180
+
181
+ - E2E native PubTables-1M coverage (`scripts/e2e_hf_check.py`): reads the
182
+ original PASCAL VOC structure annotation (`bsmock/pubtables-1m`,
183
+ download-only) from a local tar under `input/` and reconstructs the
184
+ logical grid (rows × columns intersection, spanning-cell merge,
185
+ column-header role) for the `pubtables-1m` codec. The harness gained a
186
+ local-tar source alongside HF streaming; XML is parsed with `defusedxml`
187
+ (added to the `[hf]` extra). A live run reads 200/200 real VOC tables
188
+ clean. FinTabNet / TableBank natives remain download-only and
189
+ Docling-covered. See `docs/adr/0006-native-datasets-via-local-download.md`.
190
+
191
+ ## [0.0.10] - 2026-05-28
192
+
193
+ ### Fixed
194
+
195
+ - OTSL grid reconstruction (`codecs/_otslgrid.py::build_anchors`): complex
196
+ 2D span topologies were mis-decoded — a diagonal `xcel` resolution plus
197
+ independent `max` colspan/rowspan inflated vertical-only spans into
198
+ overlapping boxes, and a column-0 `xcel` was wrongly rejected. A live
199
+ e2e sweep exposed this: `SynthTabNet_OTSL` through `otsl-1.0.0` scored
200
+ 48/300 while every other corpus scored 300/300, and an HTML-vs-OTSL
201
+ cross-check on the same rows proved the token streams were well-formed.
202
+ `build_anchors` now reconstructs the grid with the anchor-centric
203
+ algorithm adapted (with attribution) from docling-ibm-models'
204
+ `otsl_to_html` — `check_right`/`check_down` span runs over `lcel`/`xcel`
205
+ and `ucel`/`xcel`, a 2D-span registry preventing double-claims, and
206
+ continuation tokens skipped rather than erroring. Fixes `otsl-1.0.0`,
207
+ `fintabnet-otsl`, `doctags-tables`, `pubtables-1m` (all call
208
+ `build_anchors`). License is unchanged (MIT → MIT requires only
209
+ attribution; see `THIRD_PARTY_NOTICES.md` and
210
+ `docs/adr/0005-port-otsl-reconstruction.md`).
211
+
212
+ ### Added
213
+
214
+ - E2E harness (`scripts/e2e_hf_check.py`, `[hf]` extra): streams the
215
+ Docling OTSL dataset family through the codecs and validates the
216
+ resulting IR. Exercises the real `codec.read()` path (square-table
217
+ assumption, anchor/cell alignment, HTML structure parsing) against
218
+ real tables. Rows are randomly sampled (streaming shuffle reshuffles
219
+ shard order; each run prints its `--seed` for reproducibility), so
220
+ repeated runs progressively cover the corpora. HF logging / progress
221
+ bars are silenced so output is just the summary. Occasional /
222
+ local-only (not CI-gated); a network-free `--self-test` /
223
+ `just e2e-selftest` verifies the adapters through the real codecs.
224
+ See `docs/adr/0003-e2e-against-docling-otsl-family.md`.
225
+ All nine shipped codecs now have at least one official-corpus check:
226
+ the FinTabNet_OTSL checks route through the actual `fintabnet` /
227
+ `fintabnet-otsl` codecs (adapter bridges Docling's `imgid` to
228
+ `table_id`); `pubtabnet-1.0.0` / `tableformer` read the Docling HTML;
229
+ `tablebank` reads the HTML structure with cells omitted; `pubtables-1m`
230
+ reads object-detection records whose grid coords are derived from OTSL
231
+ placement; and `doctags-tables` is a real-content round-trip. Every
232
+ failed row is recorded as a JSONL finding
233
+ under `output/e2e_findings/` (gitignored) — with full provenance
234
+ (dataset/split/codec/seed/row_index), the offending cell, and the
235
+ exact `input_payload` so a finding can be replayed and judged
236
+ (library bug vs. malformed upstream data vs. over-strict invariant);
237
+ `verdict` is always `needs-review`.
238
+ The PubTabNet codecs additionally read their first-published dataset in
239
+ its **native** shape via `apoidea/pubtabnet-html` (the original
240
+ PubTabNet 2.0 `html` annotation, fed unmodified — not the Docling OTSL
241
+ conversion). The other codecs' native originals (FinTabNet, TableBank,
242
+ PubTables-1M PASCAL VOC) ship as tar.gz / image files not exposed
243
+ through the HF Datasets viewer, so they stay Docling-covered.
244
+ See `docs/adr/0004-e2e-native-first-published-datasets.md`.
245
+
246
+ ## [0.0.9] - 2026-05-28
247
+
248
+ ### Added
249
+
250
+ - FinTabNet_OTSL codec (`fintabnet-otsl`, HF `ds4sd/FinTabNet_OTSL`):
251
+ OTSL structure with FinTabNet provenance — a `table_id` identifier
252
+ (mapped onto `imgid`) and an `extras` dict (e.g. `otsl_raw`). It is the
253
+ **first codec that round-trips IR `extras`**, so `extras` is
254
+ deliberately absent from `lossy_write` (`lossy_read = {"role"}`,
255
+ `lossy_write = {"role"}`). Structure handling is shared with OTSL via
256
+ `_otslgrid`. `sniff()` requires both `otsl` and `table_id` keys. This
257
+ brings the SPEC §7 initial codec set to nine.
258
+ - `_otslgrid` gains `otsl_to_cells` / `cells_to_otsl` so OTSL and
259
+ FinTabNet_OTSL share the OTSL payload↔GridCell mapping.
260
+
261
+ ### Changed
262
+
263
+ - `otsl.py` delegates its payload↔sample mapping to the new `_otslgrid`
264
+ helpers (Tidy First, no behaviour change).
265
+
266
+ ## [0.0.8] - 2026-05-28
267
+
268
+ ### Fixed
269
+
270
+ - `tablecodec codecs list` now lists every built-in codec. The CLI's
271
+ built-in registration had drifted — it still seeded only the three
272
+ codecs that existed when the CLI was written (pubtabnet-1.0.0,
273
+ pubtabnet-2.0.0, otsl-1.0.0), omitting fintabnet, tableformer,
274
+ tablebank, pubtables-1m, and doctags-tables.
275
+
276
+ ### Changed
277
+
278
+ - Introduced `tablecodec.codecs.builtins.BUILTIN_CODECS` as the single
279
+ source of truth for the shipped codecs. The CLI and both doc
280
+ generators now consume it instead of each maintaining their own list
281
+ (no doc-output change; removes the triplicated registration).
282
+
283
+ ## [0.0.7] - 2026-05-28
284
+
285
+ ### Added
286
+
287
+ - DocTags table subset codec (`doctags-tables`): reads the IBM
288
+ Granite-Docling table markup — OTSL cell tokens wrapped in
289
+ `<otsl>`...`</otsl>`, each anchor annotated with four `<loc_n>` tokens
290
+ (a 0–500 grid bbox) plus content tokens. Read is full (structure +
291
+ bbox + content); write emits the OTSL-equivalent subset, so `role` is
292
+ lost (`lossy_read = {"role"}`, `lossy_write = {"role", "extras"}`,
293
+ SPEC §7 △). `sniff()` matches the `doctags` key.
294
+ - `_otslgrid` shared module: the OTSL structure↔grid machinery
295
+ (`split_rows`, `ensure_square`, `build_anchors`, `build_token_grid`)
296
+ extracted from `otsl.py` so OTSL and DocTags share one implementation.
297
+
298
+ ### Changed
299
+
300
+ - `otsl.py` now delegates its grid parsing/serialization to `_otslgrid`
301
+ (Tidy First, no behaviour change).
302
+
303
+ ## [0.0.6] - 2026-05-28
304
+
305
+ ### Added
306
+
307
+ - PubTables-1M codec (`pubtables-1m`): the first **read-only** codec.
308
+ Reads the object-detection format (cells carry explicit
309
+ row/col/rowspan/colspan/bbox in detection order) and normalises to
310
+ row-major IR; derives nrows/ncols when absent. `write` raises
311
+ `NotImplementedError`.
312
+ - `Codec.writable` flag (ADR 0002): boolean capability on the Codec
313
+ Protocol. All writable codecs default to `True`; read-only codecs set
314
+ `False`. `analyze_loss` short-circuits to a new
315
+ `round_trip_classification` value **`"unwritable"`** when the target
316
+ is read-only, and the loss matrix renders it as ⚫. `format_support.md`
317
+ gains a "Writable" column.
318
+
319
+ ### Changed
320
+
321
+ - Every built-in codec now declares `writable` (mechanical, defaults to
322
+ `True`).
323
+
324
+ ## [0.0.5] - 2026-05-28
325
+
326
+ ### Added
327
+
328
+ - TableBank codec (`tablebank`): a structure-only format — the source
329
+ ships `html.structure` with no `html.cells`, so on read every cell is
330
+ empty (`tokens=()`, `bbox=None`) and the grid shape is reconstructed
331
+ from the structure tokens. Write emits structure only. `lossy_read =
332
+ {"tokens", "bbox"}`, `lossy_write = {"tokens", "bbox", "extras"}` —
333
+ so TableBank is the first codec to surface `lossy` (🔴) classifications
334
+ in the loss matrix (token loss is not structure-preserving). `sniff()`
335
+ requires `html.structure` present and `html.cells` absent.
336
+ - `_htmltable` gains `parse_html_structure_only` /
337
+ `serialize_html_structure_only` and a `require_no_cells` sniff knob.
338
+
339
+ ## [0.0.4] - 2026-05-28
340
+
341
+ ### Added
342
+
343
+ - TableFormer Format codec (`tableformer`): PubTabNet 2.0's HTML-token
344
+ structure plus the invariant that EVERY cell — including empty ones —
345
+ carries a bbox. The codec enforces this on read (raising a clear error
346
+ if any cell lacks a bbox), so its output always satisfies
347
+ `profiles.TABLEFORMER`. `sniff()` requires all cells to have a bbox,
348
+ which distinguishes it from PubTabNet (whose empty cells omit bbox).
349
+ `lossy_read = {}`, `lossy_write = {"extras"}`.
350
+
351
+ ## [0.0.3] - 2026-05-28
352
+
353
+ ### Added
354
+
355
+ - FinTabNet (original) codec (`fintabnet`): same HTML-token structure as
356
+ PubTabNet 2.0, with `table_id` as the record identifier instead of
357
+ `imgid`. Reads/writes via the shared `_htmltable` machinery with
358
+ `id_field="table_id"`; `sniff()` requires the `table_id` key so a
359
+ PubTabNet (imgid) record is not mis-detected as FinTabNet.
360
+ `lossy_read = {}`, `lossy_write = {"extras"}`.
361
+
362
+ ### Changed
363
+
364
+ - Extracted the HTML-token parser / grid-placement / serializer out of
365
+ `codecs/pubtabnet.py` into `codecs/_htmltable.py` (Tidy First, no
366
+ behaviour change) so PubTabNet and FinTabNet share one implementation.
367
+ - `docs/format_support.md` now also lists `otsl-1.0.0` (previously the
368
+ generator only seeded the two PubTabNet codecs).
369
+
370
+ ## [0.0.2] - 2026-05-28
371
+
372
+ Development preview (0.0.x makes no stability promises). Stdlib-only
373
+ core, three codecs, streaming I/O, static loss analysis, optional CLI,
374
+ and an in-repo conformance suite. Not published to PyPI yet — codecs
375
+ are being added incrementally within the 0.0.x series.
376
+
377
+ ### Added
378
+
379
+ - Repository bootstrap (M0): `pyproject.toml` (hatchling, Python 3.11+),
380
+ `justfile`, `ruff.toml`, `pyrightconfig.json`, GitHub Actions CI matrix
381
+ (Python 3.11–3.13 × Ubuntu/macOS), `semgrep.yaml` enforcing
382
+ SPEC §13 zero-dependency policy, MIT license, smoke test scaffold.
383
+ - Internal Representation (M1): SPEC §5.1 `BBox`, `GridCell`,
384
+ `TableSample` as frozen, slotted, hashable dataclasses; SPEC §5.2
385
+ invariants I-01..I-07 each as an independent `check_iXX` function
386
+ returning `list[ValidationError]`. SPEC §8 validation profiles
387
+ (`LENIENT`, `DEFAULT`, `PUBTABNET_2_0`, `TABLEFORMER`, `STRICT`)
388
+ exposed via `tablecodec.profiles` and orchestrated by `validate()`.
389
+ Hypothesis-driven property tests (10,000 cases) verify that valid
390
+ samples pass every profile and that a single broken invariant is
391
+ reported by its own check function without spurious cross-talk.
392
+ Coverage 100% across all M1 modules; pyright strict clean.
393
+ - Codec layer (M2): SPEC §6 `Codec` Protocol (`@property` getters so
394
+ frozen-dataclass implementations satisfy the protocol) in
395
+ `tablecodec.codecs._base`; in-process registry (`register`, `get`,
396
+ `list_codecs`, `detect`) in `tablecodec.codecs`. First codec:
397
+ `PubTabNet20Codec` (`pubtabnet-2.0.0`) with streaming `read` /
398
+ `write`, span-aware HTML table-placement algorithm, honest
399
+ `lossy_read` (empty) and `lossy_write` (`{"extras"}`), and a
400
+ `sniff()` delegate for `codecs.detect()`. Round-trip tests verify
401
+ that `read → write → read` is the identity for non-extras payloads.
402
+ - Streaming I/O + PubTabNet 1.0 (M3): `tablecodec.io.open()` accepts a
403
+ path-like or text stream and returns a streaming iterator; auto-detect
404
+ via `tablecodec.io.detect()` peeks the source without consuming it.
405
+ Second codec: `PubTabNet10Codec` (`pubtabnet-1.0.0`) — same format
406
+ family minus bbox; `lossy_read = {"bbox"}`, `lossy_write =
407
+ {"bbox", "extras"}`. Sniff discriminates the two versions by bbox
408
+ presence in the first record. SPEC §10 streaming guarantee verified
409
+ by tracemalloc-instrumented test: 100,000 pubtabnet-2.0 records read
410
+ with peak < 50 MB. `docs/format_support.md` is auto-generated by
411
+ `scripts/gen_format_support.py` and CI fails if it goes stale
412
+ (`just docs-check`). `tests/benchmarks/` houses pytest-benchmark
413
+ micro-benchmarks (deselected from default run, executed by
414
+ `just bench` and the new `.github/workflows/benchmark.yaml`).
415
+ - OTSL 1.0 codec (M4): `OTSL10Codec` (`otsl-1.0.0`) implements the
416
+ five-token OTSL grammar from arXiv 2305.03393 (`fcel`, `ecel`,
417
+ `lcel`, `ucel`, `xcel`, plus `nl`). Square-table assumption is
418
+ enforced on read (jagged row widths rejected with a clear error).
419
+ Continuation tokens (lcel/ucel/xcel) extend the anchor cell they
420
+ reference; the IR is reconstructed in two passes (parse rows →
421
+ resolve anchors). The implementation is derived from the paper, not
422
+ copied from `docling-ibm-models/tableformer/otsl.py`. `lossy_read =
423
+ {"role"}` and `lossy_write = {"extras", "role"}` are honest about
424
+ the header/body distinction collapsing through OTSL — a property
425
+ verified by a cross-codec test that round-trips a PubTabNet sample
426
+ with header cells through OTSL and observes role=body on return.
427
+ - Loss analysis (M5): `tablecodec.analyze_loss(source, target)` returns
428
+ a `LossReport` derived statically from the codecs' `lossy_read` and
429
+ `lossy_write` declarations — no data is read. The round-trip
430
+ classification distinguishes `lossless` (nothing dropped),
431
+ `structure-preserving` (only auxiliary `bbox`/`role`/`extras` lost),
432
+ and `lossy` (any other field lost). `docs/loss_matrix.md` is
433
+ auto-generated by `scripts/gen_loss_matrix.py` and the same
434
+ `just docs-check` gate that protects `format_support.md` also
435
+ protects it.
436
+ - CLI (M6): `tablecodec` console script (`[project.scripts]`) backed by
437
+ `src/tablecodec/cli.py` and the `[cli]` extra (click 8.x). Six
438
+ subcommands: `validate`, `convert`, `stats`, `diff`, `analyze-loss`,
439
+ `codecs list`. Every command streams input; non-zero exit on
440
+ validation failures and diff mismatches. `convert --dry-run` prints
441
+ the static `analyze_loss` report without touching the input file.
442
+ CLI is wholly optional — the core continues to install and run
443
+ without click (verified by the existing pip-install-check job).
444
+ - Conformance suite skeleton (M7): the SPEC §11 corpus is bootstrapped
445
+ in-repo under `conformance/` (manifest `INDEX.json` + draft-2020-12
446
+ JSON Schema + samples + hand-authored expected-IR JSON), pending
447
+ extraction to a separate vendor-neutral repository before v1.0 (see
448
+ `docs/adr/0001-conformance-suite-in-repo-temporarily.md`).
449
+ `tests/test_conformance.py` validates `INDEX.json` against its schema
450
+ and runs every case (3 × pubtabnet-2.0.0, 3 × otsl-1.0.0) by reading
451
+ the sample and comparing the IR to the independent expectation.
452
+ `jsonschema` added to the `[dev]` extra (test-only).
453
+
454
+ <!-- v0.0.18 is the first cut release (tag + GitHub Release created by
455
+ .github/workflows/release.yaml). Earlier 0.0.x headings stay plain text
456
+ (no tags were pushed for them). -->
457
+ [Unreleased]: https://github.com/hironow/tablecodec/compare/v0.0.18...main
458
+ [0.0.18]: https://github.com/hironow/tablecodec/releases/tag/v0.0.18
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 hironow and tablecodec contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.