tablecodec 0.0.18__tar.gz → 0.0.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tablecodec-0.0.18 → tablecodec-0.0.19}/CHANGELOG.md +30 -1
- tablecodec-0.0.19/PKG-INFO +201 -0
- tablecodec-0.0.19/README.md +151 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/pyproject.toml +2 -1
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/__init__.py +6 -1
- tablecodec-0.0.18/PKG-INFO +0 -200
- tablecodec-0.0.18/README.md +0 -151
- {tablecodec-0.0.18 → tablecodec-0.0.19}/.gitignore +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/LICENSE +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/docs/spec.md +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/_invariants.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/cli.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/__init__.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/_base.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/_htmltable.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/_otslgrid.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/builtins.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/doctags.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/fintabnet.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/fintabnet_otsl.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/otsl.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/pubtables1m.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/pubtabnet.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/tablebank.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/codecs/tableformer.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/io.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/ir.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/loss.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/py.typed +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/teds.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/src/tablecodec/validate.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/benchmarks/test_codec_benchmarks.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_doctags.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_fintabnet.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_fintabnet_otsl.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_otsl.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_otsl_to_pubtabnet.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_pubtables1m.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_pubtabnet.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_pubtabnet_v10.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_registry.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_tablebank.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/codecs/test_tableformer.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/conftest.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/doctags/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/doctags/with_span_and_empty.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/fintabnet/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/fintabnet/with_colspan.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/fintabnet_otsl/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/otsl/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/otsl/with_2x2_span.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/otsl/with_colspan.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/otsl/with_empty.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/otsl/with_rowspan.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/pubtables1m/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/pubtables1m/with_span.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/pubtabnet/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/pubtabnet/v10_simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/pubtabnet/with_empty.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/pubtabnet/with_rowspan.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/tablebank/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/tablebank/with_rowspan.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/tableformer/empty_with_bbox.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/fixtures/tableformer/simple_2x2.jsonl +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/strategies.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_cli.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_conformance.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_invariants.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_invariants_hypothesis.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_io.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_io_streaming.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_ir.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_loss.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_smoke.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_spec_surface.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_teds.py +0 -0
- {tablecodec-0.0.18 → tablecodec-0.0.19}/tests/test_validate.py +0 -0
|
@@ -7,6 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.0.19] - 2026-06-07
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **Python 3.14 support.** 3.14 is now in the CI matrix (ubuntu + macOS) and the
|
|
15
|
+
trove classifiers; the full suite (core + `[cli]`/`[teds]`/dev, incl.
|
|
16
|
+
lxml/apted) passes on 3.14. `requires-python` stays `>=3.11` with no upper
|
|
17
|
+
bound.
|
|
18
|
+
|
|
19
|
+
### Changed
|
|
20
|
+
|
|
21
|
+
- `__version__` (both the core package and the `tablecodec-docling` bridge) is
|
|
22
|
+
now derived from the installed package metadata (`importlib.metadata`,
|
|
23
|
+
stdlib), so `pyproject.toml` is the single human-edited version source — no
|
|
24
|
+
more hand-syncing a literal in `__init__.py`.
|
|
25
|
+
- README restructured (badges, a Supported table, a Development section) and the
|
|
26
|
+
first release marked shipped across `docs/intent.md` / `docs/handover.md`.
|
|
27
|
+
- The source distribution now ships only the package, tests, and `docs/spec.md`
|
|
28
|
+
(hatchling `only-include`), dropping the dev trees (`.semgrep/`,
|
|
29
|
+
`conformance/`, `packages/`) it previously bundled. The wheel is unchanged.
|
|
30
|
+
- `.github/dependabot.yml` renamed to `.github/dependabot.yaml` (project
|
|
31
|
+
convention; GitHub supports both).
|
|
32
|
+
|
|
33
|
+
### Removed
|
|
34
|
+
|
|
35
|
+
- `CONTRIBUTING.md` and `SECURITY.md` (GitHub private vulnerability reporting
|
|
36
|
+
stays enabled for security reports).
|
|
37
|
+
|
|
10
38
|
## [0.0.18] - 2026-06-07
|
|
11
39
|
|
|
12
40
|
### Added
|
|
@@ -454,5 +482,6 @@ are being added incrementally within the 0.0.x series.
|
|
|
454
482
|
<!-- v0.0.18 is the first cut release (tag + GitHub Release created by
|
|
455
483
|
.github/workflows/release.yaml). Earlier 0.0.x headings stay plain text
|
|
456
484
|
(no tags were pushed for them). -->
|
|
457
|
-
[Unreleased]: https://github.com/hironow/tablecodec/compare/v0.0.18...main
|
|
485
|
+
[Unreleased]: https://github.com/hironow/tablecodec/compare/v0.0.19...main
|
|
486
|
+
[0.0.19]: https://github.com/hironow/tablecodec/compare/v0.0.18...v0.0.19
|
|
458
487
|
[0.0.18]: https://github.com/hironow/tablecodec/releases/tag/v0.0.18
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tablecodec
|
|
3
|
+
Version: 0.0.19
|
|
4
|
+
Summary: Neutral Internal Representation and Codec registry for image-based table-recognition datasets
|
|
5
|
+
Project-URL: Homepage, https://github.com/hironow/tablecodec
|
|
6
|
+
Project-URL: Repository, https://github.com/hironow/tablecodec
|
|
7
|
+
Project-URL: Issues, https://github.com/hironow/tablecodec/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/hironow/tablecodec/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: hironow <hironow365@gmail.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: dataset,doctags,fintabnet,ocr,otsl,pubtabnet,table
|
|
13
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.11
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: apted>=1.0.3; extra == 'all'
|
|
28
|
+
Requires-Dist: click>=8.1; extra == 'all'
|
|
29
|
+
Requires-Dist: datasets>=2.19; extra == 'all'
|
|
30
|
+
Requires-Dist: defusedxml>=0.7; extra == 'all'
|
|
31
|
+
Requires-Dist: lxml>=5.0; extra == 'all'
|
|
32
|
+
Provides-Extra: cli
|
|
33
|
+
Requires-Dist: click>=8.1; extra == 'cli'
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: coverage>=7.5; extra == 'dev'
|
|
36
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
37
|
+
Requires-Dist: jsonschema>=4.20; extra == 'dev'
|
|
38
|
+
Requires-Dist: pyright>=1.1.380; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
42
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
43
|
+
Provides-Extra: hf
|
|
44
|
+
Requires-Dist: datasets>=2.19; extra == 'hf'
|
|
45
|
+
Requires-Dist: defusedxml>=0.7; extra == 'hf'
|
|
46
|
+
Provides-Extra: teds
|
|
47
|
+
Requires-Dist: apted>=1.0.3; extra == 'teds'
|
|
48
|
+
Requires-Dist: lxml>=5.0; extra == 'teds'
|
|
49
|
+
Description-Content-Type: text/markdown
|
|
50
|
+
|
|
51
|
+
# tablecodec
|
|
52
|
+
|
|
53
|
+
[](https://pypi.org/project/tablecodec/)
|
|
54
|
+
[](https://github.com/hironow/tablecodec/actions/workflows/ci.yaml)
|
|
55
|
+
[](https://pypi.org/project/tablecodec/)
|
|
56
|
+
[](LICENSE)
|
|
57
|
+
|
|
58
|
+
One **lossless Internal Representation (IR)** for image-based table-recognition
|
|
59
|
+
datasets, plus a **registry of codecs** that translate between the IR and the
|
|
60
|
+
fragmented public formats — PubTabNet, FinTabNet, OTSL, TableFormer,
|
|
61
|
+
DocTags-tables, PubTables-1M, TableBank.
|
|
62
|
+
|
|
63
|
+
Read any of them into one neutral shape, validate it, convert between formats,
|
|
64
|
+
and get a **static, data-free loss report** for any conversion before you run it.
|
|
65
|
+
The core has **zero third-party runtime dependencies** — `import tablecodec`
|
|
66
|
+
works on a bare Python 3.11+; heavier features (TEDS, CLI, HF streaming) are
|
|
67
|
+
opt-in extras.
|
|
68
|
+
|
|
69
|
+
[`docs/spec.md`](docs/spec.md) is the source of truth. The `0.x` line makes no
|
|
70
|
+
API-stability promises; the public surface freezes at `1.0` (SPEC §14).
|
|
71
|
+
|
|
72
|
+
## Install
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install tablecodec # stdlib-only core
|
|
76
|
+
pip install "tablecodec[cli]" # + command-line interface (click)
|
|
77
|
+
pip install "tablecodec[teds]" # + TEDS similarity metric (apted, lxml)
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Quick start
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
import tablecodec
|
|
84
|
+
from tablecodec import codecs, validate, profiles, analyze_loss
|
|
85
|
+
from tablecodec.codecs.pubtabnet import PubTabNet20Codec
|
|
86
|
+
|
|
87
|
+
# Register a codec (the CLI self-registers the built-ins; in library use you
|
|
88
|
+
# register the ones you need).
|
|
89
|
+
codecs.register(PubTabNet20Codec())
|
|
90
|
+
|
|
91
|
+
# Stream-read a dataset into the neutral IR (constant memory).
|
|
92
|
+
with open("pubtabnet_val.jsonl", encoding="utf-8") as f:
|
|
93
|
+
for sample in codecs.get("pubtabnet-2.0.0").read(f):
|
|
94
|
+
errors = validate(sample, profile=profiles.DEFAULT)
|
|
95
|
+
if errors:
|
|
96
|
+
print(sample.filename, errors)
|
|
97
|
+
|
|
98
|
+
# Static, data-free loss analysis between two formats.
|
|
99
|
+
report = analyze_loss(source="pubtabnet-2.0.0", target="otsl-1.0.0")
|
|
100
|
+
print(report.round_trip_classification) # "structure-preserving"
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Supported
|
|
104
|
+
|
|
105
|
+
Verified in CI (see [`.github/workflows/ci.yaml`](.github/workflows/ci.yaml)).
|
|
106
|
+
|
|
107
|
+
| Component | Supported | Notes |
|
|
108
|
+
|---|---|---|
|
|
109
|
+
| Python | 3.11 – 3.14 | core is stdlib-only (zero runtime deps, SPEC §13) |
|
|
110
|
+
| Codecs | 9 built-in | `pubtabnet-1.0.0/2.0.0`, `otsl-1.0.0`, `fintabnet`, `fintabnet-otsl`, `tableformer`, `tablebank`, `pubtables-1m`, `doctags-tables` |
|
|
111
|
+
| Extras | `[cli]` `[teds]` `[hf]` | click · apted+lxml · datasets (occasional/local e2e) |
|
|
112
|
+
| Bridge | `docling-tables` | a separate `tablecodec-docling` package (`packages/`, own version) |
|
|
113
|
+
|
|
114
|
+
Auto-generated capability tables: [format support](docs/format_support.md) ·
|
|
115
|
+
[loss matrix](docs/loss_matrix.md). Dependency bumps within these ranges are
|
|
116
|
+
tracked by Dependabot.
|
|
117
|
+
|
|
118
|
+
## TEDS similarity (`[teds]` extra)
|
|
119
|
+
|
|
120
|
+
A Tree-Edit-Distance-based Similarity score between two samples. It lives
|
|
121
|
+
outside the core (it imports `apted`/`lxml`), so import it from its submodule:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from tablecodec.teds import teds
|
|
125
|
+
|
|
126
|
+
score = teds(pred_sample, true_sample) # 0.0 .. 1.0
|
|
127
|
+
struct = teds(pred_sample, true_sample, structure_only=True) # ignore cell text
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## CLI (`[cli]` extra)
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
tablecodec codecs list
|
|
134
|
+
tablecodec analyze-loss --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
135
|
+
tablecodec validate path/to/dataset.jsonl --codec pubtabnet-2.0.0 --profile DEFAULT
|
|
136
|
+
tablecodec stats path/to/dataset.jsonl --codec pubtabnet-2.0.0 --json
|
|
137
|
+
tablecodec convert in.jsonl out.jsonl --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
138
|
+
tablecodec convert in.jsonl /dev/null --from pubtabnet-2.0.0 --to otsl-1.0.0 --dry-run
|
|
139
|
+
tablecodec diff a.jsonl b.jsonl --codec pubtabnet-2.0.0
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
All commands stream their input; exit codes are non-zero on validation failures
|
|
143
|
+
or diffs (suitable for CI / data pipelines).
|
|
144
|
+
|
|
145
|
+
## End-to-end check against real datasets
|
|
146
|
+
|
|
147
|
+
`scripts/e2e_hf_check.py` streams real datasets through the codecs and validates
|
|
148
|
+
the resulting IR. It is **occasional / local-only** (network + multi-GB
|
|
149
|
+
datasets), not part of CI. Every shipped codec gets at least one official-corpus
|
|
150
|
+
check, from three sources:
|
|
151
|
+
|
|
152
|
+
- the **Docling OTSL family**
|
|
153
|
+
(`docling-project/{PubTabNet,FinTabNet,PubTables-1M,SynthTabNet}_OTSL`) — a
|
|
154
|
+
uniform converted schema that feeds all nine codecs;
|
|
155
|
+
- the **native** first-published PubTabNet annotation (`apoidea/pubtabnet-html`)
|
|
156
|
+
fed unmodified to the `pubtabnet` codecs;
|
|
157
|
+
- the **native** PubTables-1M PASCAL VOC structure annotation
|
|
158
|
+
(`bsmock/pubtables-1m`, download-only) with the logical grid reconstructed for
|
|
159
|
+
the `pubtables-1m` codec.
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
just e2e-selftest # network-free adapter smoke test
|
|
163
|
+
just e2e 200 # 200 randomly-sampled rows per check (needs [hf] extra)
|
|
164
|
+
just e2e-fetch-pubtables1m # download native PubTables-1M VOC (~30MB) into input/
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Rows are sampled randomly and each run prints its `--seed`, so repeated runs
|
|
168
|
+
progressively cover the corpora and any finding is reproducible. Failures are
|
|
169
|
+
appended to `output/e2e_findings/` (gitignored) with a replayable payload. See
|
|
170
|
+
[ADR 0003](docs/adr/0003-e2e-against-docling-otsl-family.md) and
|
|
171
|
+
[ADR 0004](docs/adr/0004-e2e-native-first-published-datasets.md) for the
|
|
172
|
+
data-source decisions and the canonical-vs-real-shape caveats.
|
|
173
|
+
|
|
174
|
+
## Documentation
|
|
175
|
+
|
|
176
|
+
- [`docs/spec.md`](docs/spec.md) — Specification (the single source of truth).
|
|
177
|
+
- [`docs/glossary.md`](docs/glossary.md) — Precise vocabulary: terms tablecodec
|
|
178
|
+
defines vs. borrows (e.g. "loss" vs a "degenerate" bbox).
|
|
179
|
+
- [`docs/intent.md`](docs/intent.md) — Implementation brief and roadmap
|
|
180
|
+
(milestones, quality bar, §8 future work).
|
|
181
|
+
- [`docs/adr/`](docs/adr/) — the decisions and their reasoning (the "Why").
|
|
182
|
+
- [`CHANGELOG.md`](CHANGELOG.md) — Keep a Changelog format.
|
|
183
|
+
|
|
184
|
+
## Development
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
just install # editable install with dev + cli + teds extras
|
|
188
|
+
just ci # lint + pyright (strict) + pytest + semgrep + docs-check
|
|
189
|
+
just docs # regenerate the codec/loss tables (docs-check enforces freshness)
|
|
190
|
+
just ci-all # core + the in-repo tablecodec-docling bridge
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Releases are published from GitHub Actions via PyPI **OIDC Trusted Publishing**
|
|
194
|
+
(no long-lived token), carrying PEP 740 attestations and SLSA build provenance
|
|
195
|
+
([ADR 0014](docs/adr/0014-release-via-oidc-trusted-publishing.md)).
|
|
196
|
+
|
|
197
|
+
## License
|
|
198
|
+
|
|
199
|
+
MIT. See [LICENSE](LICENSE). The OTSL grid-reconstruction logic and the TEDS
|
|
200
|
+
metric are adapted (with attribution) from upstream MIT / Apache-2.0 sources —
|
|
201
|
+
see [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md).
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
# tablecodec
|
|
2
|
+
|
|
3
|
+
[](https://pypi.org/project/tablecodec/)
|
|
4
|
+
[](https://github.com/hironow/tablecodec/actions/workflows/ci.yaml)
|
|
5
|
+
[](https://pypi.org/project/tablecodec/)
|
|
6
|
+
[](LICENSE)
|
|
7
|
+
|
|
8
|
+
One **lossless Internal Representation (IR)** for image-based table-recognition
|
|
9
|
+
datasets, plus a **registry of codecs** that translate between the IR and the
|
|
10
|
+
fragmented public formats — PubTabNet, FinTabNet, OTSL, TableFormer,
|
|
11
|
+
DocTags-tables, PubTables-1M, TableBank.
|
|
12
|
+
|
|
13
|
+
Read any of them into one neutral shape, validate it, convert between formats,
|
|
14
|
+
and get a **static, data-free loss report** for any conversion before you run it.
|
|
15
|
+
The core has **zero third-party runtime dependencies** — `import tablecodec`
|
|
16
|
+
works on a bare Python 3.11+; heavier features (TEDS, CLI, HF streaming) are
|
|
17
|
+
opt-in extras.
|
|
18
|
+
|
|
19
|
+
[`docs/spec.md`](docs/spec.md) is the source of truth. The `0.x` line makes no
|
|
20
|
+
API-stability promises; the public surface freezes at `1.0` (SPEC §14).
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install tablecodec # stdlib-only core
|
|
26
|
+
pip install "tablecodec[cli]" # + command-line interface (click)
|
|
27
|
+
pip install "tablecodec[teds]" # + TEDS similarity metric (apted, lxml)
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Quick start
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
import tablecodec
|
|
34
|
+
from tablecodec import codecs, validate, profiles, analyze_loss
|
|
35
|
+
from tablecodec.codecs.pubtabnet import PubTabNet20Codec
|
|
36
|
+
|
|
37
|
+
# Register a codec (the CLI self-registers the built-ins; in library use you
|
|
38
|
+
# register the ones you need).
|
|
39
|
+
codecs.register(PubTabNet20Codec())
|
|
40
|
+
|
|
41
|
+
# Stream-read a dataset into the neutral IR (constant memory).
|
|
42
|
+
with open("pubtabnet_val.jsonl", encoding="utf-8") as f:
|
|
43
|
+
for sample in codecs.get("pubtabnet-2.0.0").read(f):
|
|
44
|
+
errors = validate(sample, profile=profiles.DEFAULT)
|
|
45
|
+
if errors:
|
|
46
|
+
print(sample.filename, errors)
|
|
47
|
+
|
|
48
|
+
# Static, data-free loss analysis between two formats.
|
|
49
|
+
report = analyze_loss(source="pubtabnet-2.0.0", target="otsl-1.0.0")
|
|
50
|
+
print(report.round_trip_classification) # "structure-preserving"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Supported
|
|
54
|
+
|
|
55
|
+
Verified in CI (see [`.github/workflows/ci.yaml`](.github/workflows/ci.yaml)).
|
|
56
|
+
|
|
57
|
+
| Component | Supported | Notes |
|
|
58
|
+
|---|---|---|
|
|
59
|
+
| Python | 3.11 – 3.14 | core is stdlib-only (zero runtime deps, SPEC §13) |
|
|
60
|
+
| Codecs | 9 built-in | `pubtabnet-1.0.0/2.0.0`, `otsl-1.0.0`, `fintabnet`, `fintabnet-otsl`, `tableformer`, `tablebank`, `pubtables-1m`, `doctags-tables` |
|
|
61
|
+
| Extras | `[cli]` `[teds]` `[hf]` | click · apted+lxml · datasets (occasional/local e2e) |
|
|
62
|
+
| Bridge | `docling-tables` | a separate `tablecodec-docling` package (`packages/`, own version) |
|
|
63
|
+
|
|
64
|
+
Auto-generated capability tables: [format support](docs/format_support.md) ·
|
|
65
|
+
[loss matrix](docs/loss_matrix.md). Dependency bumps within these ranges are
|
|
66
|
+
tracked by Dependabot.
|
|
67
|
+
|
|
68
|
+
## TEDS similarity (`[teds]` extra)
|
|
69
|
+
|
|
70
|
+
A Tree-Edit-Distance-based Similarity score between two samples. It lives
|
|
71
|
+
outside the core (it imports `apted`/`lxml`), so import it from its submodule:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from tablecodec.teds import teds
|
|
75
|
+
|
|
76
|
+
score = teds(pred_sample, true_sample) # 0.0 .. 1.0
|
|
77
|
+
struct = teds(pred_sample, true_sample, structure_only=True) # ignore cell text
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## CLI (`[cli]` extra)
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
tablecodec codecs list
|
|
84
|
+
tablecodec analyze-loss --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
85
|
+
tablecodec validate path/to/dataset.jsonl --codec pubtabnet-2.0.0 --profile DEFAULT
|
|
86
|
+
tablecodec stats path/to/dataset.jsonl --codec pubtabnet-2.0.0 --json
|
|
87
|
+
tablecodec convert in.jsonl out.jsonl --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
88
|
+
tablecodec convert in.jsonl /dev/null --from pubtabnet-2.0.0 --to otsl-1.0.0 --dry-run
|
|
89
|
+
tablecodec diff a.jsonl b.jsonl --codec pubtabnet-2.0.0
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
All commands stream their input; exit codes are non-zero on validation failures
|
|
93
|
+
or diffs (suitable for CI / data pipelines).
|
|
94
|
+
|
|
95
|
+
## End-to-end check against real datasets
|
|
96
|
+
|
|
97
|
+
`scripts/e2e_hf_check.py` streams real datasets through the codecs and validates
|
|
98
|
+
the resulting IR. It is **occasional / local-only** (network + multi-GB
|
|
99
|
+
datasets), not part of CI. Every shipped codec gets at least one official-corpus
|
|
100
|
+
check, from three sources:
|
|
101
|
+
|
|
102
|
+
- the **Docling OTSL family**
|
|
103
|
+
(`docling-project/{PubTabNet,FinTabNet,PubTables-1M,SynthTabNet}_OTSL`) — a
|
|
104
|
+
uniform converted schema that feeds all nine codecs;
|
|
105
|
+
- the **native** first-published PubTabNet annotation (`apoidea/pubtabnet-html`)
|
|
106
|
+
fed unmodified to the `pubtabnet` codecs;
|
|
107
|
+
- the **native** PubTables-1M PASCAL VOC structure annotation
|
|
108
|
+
(`bsmock/pubtables-1m`, download-only) with the logical grid reconstructed for
|
|
109
|
+
the `pubtables-1m` codec.
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
just e2e-selftest # network-free adapter smoke test
|
|
113
|
+
just e2e 200 # 200 randomly-sampled rows per check (needs [hf] extra)
|
|
114
|
+
just e2e-fetch-pubtables1m # download native PubTables-1M VOC (~30MB) into input/
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
Rows are sampled randomly and each run prints its `--seed`, so repeated runs
|
|
118
|
+
progressively cover the corpora and any finding is reproducible. Failures are
|
|
119
|
+
appended to `output/e2e_findings/` (gitignored) with a replayable payload. See
|
|
120
|
+
[ADR 0003](docs/adr/0003-e2e-against-docling-otsl-family.md) and
|
|
121
|
+
[ADR 0004](docs/adr/0004-e2e-native-first-published-datasets.md) for the
|
|
122
|
+
data-source decisions and the canonical-vs-real-shape caveats.
|
|
123
|
+
|
|
124
|
+
## Documentation
|
|
125
|
+
|
|
126
|
+
- [`docs/spec.md`](docs/spec.md) — Specification (the single source of truth).
|
|
127
|
+
- [`docs/glossary.md`](docs/glossary.md) — Precise vocabulary: terms tablecodec
|
|
128
|
+
defines vs. borrows (e.g. "loss" vs a "degenerate" bbox).
|
|
129
|
+
- [`docs/intent.md`](docs/intent.md) — Implementation brief and roadmap
|
|
130
|
+
(milestones, quality bar, §8 future work).
|
|
131
|
+
- [`docs/adr/`](docs/adr/) — the decisions and their reasoning (the "Why").
|
|
132
|
+
- [`CHANGELOG.md`](CHANGELOG.md) — Keep a Changelog format.
|
|
133
|
+
|
|
134
|
+
## Development
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
just install # editable install with dev + cli + teds extras
|
|
138
|
+
just ci # lint + pyright (strict) + pytest + semgrep + docs-check
|
|
139
|
+
just docs # regenerate the codec/loss tables (docs-check enforces freshness)
|
|
140
|
+
just ci-all # core + the in-repo tablecodec-docling bridge
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Releases are published from GitHub Actions via PyPI **OIDC Trusted Publishing**
|
|
144
|
+
(no long-lived token), carrying PEP 740 attestations and SLSA build provenance
|
|
145
|
+
([ADR 0014](docs/adr/0014-release-via-oidc-trusted-publishing.md)).
|
|
146
|
+
|
|
147
|
+
## License
|
|
148
|
+
|
|
149
|
+
MIT. See [LICENSE](LICENSE). The OTSL grid-reconstruction logic and the TEDS
|
|
150
|
+
metric are adapted (with attribution) from upstream MIT / Apache-2.0 sources —
|
|
151
|
+
see [THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md).
|
|
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
|
|
|
5
5
|
|
|
6
6
|
[project]
|
|
7
7
|
name = "tablecodec"
|
|
8
|
-
version = "0.0.18"
|
|
8
|
+
version = "0.0.19"
|
|
9
9
|
description = "Neutral Internal Representation and Codec registry for image-based table-recognition datasets"
|
|
10
10
|
readme = "README.md"
|
|
11
11
|
requires-python = ">=3.11"
|
|
@@ -26,6 +26,7 @@ classifiers = [
|
|
|
26
26
|
"Programming Language :: Python :: 3.11",
|
|
27
27
|
"Programming Language :: Python :: 3.12",
|
|
28
28
|
"Programming Language :: Python :: 3.13",
|
|
29
|
+
"Programming Language :: Python :: 3.14",
|
|
29
30
|
"Topic :: Scientific/Engineering :: Image Recognition",
|
|
30
31
|
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
31
32
|
"Typing :: Typed",
|
|
@@ -9,6 +9,8 @@ Public API (M1):
|
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
13
|
+
|
|
12
14
|
from tablecodec.ir import BBox, GridCell, TableSample
|
|
13
15
|
from tablecodec.loss import LossReport, analyze_loss
|
|
14
16
|
from tablecodec.validate import Profile, ValidationError, profiles, validate
|
|
@@ -26,4 +28,7 @@ __all__ = [
|
|
|
26
28
|
"validate",
|
|
27
29
|
]
|
|
28
30
|
|
|
29
|
-
|
|
31
|
+
try:
|
|
32
|
+
__version__ = version("tablecodec")
|
|
33
|
+
except PackageNotFoundError: # source checkout without an installed build
|
|
34
|
+
__version__ = "0.0.0+unknown"
|
tablecodec-0.0.18/PKG-INFO
DELETED
|
@@ -1,200 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: tablecodec
|
|
3
|
-
Version: 0.0.18
|
|
4
|
-
Summary: Neutral Internal Representation and Codec registry for image-based table-recognition datasets
|
|
5
|
-
Project-URL: Homepage, https://github.com/hironow/tablecodec
|
|
6
|
-
Project-URL: Repository, https://github.com/hironow/tablecodec
|
|
7
|
-
Project-URL: Issues, https://github.com/hironow/tablecodec/issues
|
|
8
|
-
Project-URL: Changelog, https://github.com/hironow/tablecodec/blob/main/CHANGELOG.md
|
|
9
|
-
Author-email: hironow <hironow365@gmail.com>
|
|
10
|
-
License-Expression: MIT
|
|
11
|
-
License-File: LICENSE
|
|
12
|
-
Keywords: dataset,doctags,fintabnet,ocr,otsl,pubtabnet,table
|
|
13
|
-
Classifier: Development Status :: 2 - Pre-Alpha
|
|
14
|
-
Classifier: Intended Audience :: Developers
|
|
15
|
-
Classifier: Intended Audience :: Science/Research
|
|
16
|
-
Classifier: Operating System :: OS Independent
|
|
17
|
-
Classifier: Programming Language :: Python :: 3
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
21
|
-
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
22
|
-
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
-
Classifier: Typing :: Typed
|
|
24
|
-
Requires-Python: >=3.11
|
|
25
|
-
Provides-Extra: all
|
|
26
|
-
Requires-Dist: apted>=1.0.3; extra == 'all'
|
|
27
|
-
Requires-Dist: click>=8.1; extra == 'all'
|
|
28
|
-
Requires-Dist: datasets>=2.19; extra == 'all'
|
|
29
|
-
Requires-Dist: defusedxml>=0.7; extra == 'all'
|
|
30
|
-
Requires-Dist: lxml>=5.0; extra == 'all'
|
|
31
|
-
Provides-Extra: cli
|
|
32
|
-
Requires-Dist: click>=8.1; extra == 'cli'
|
|
33
|
-
Provides-Extra: dev
|
|
34
|
-
Requires-Dist: coverage>=7.5; extra == 'dev'
|
|
35
|
-
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
36
|
-
Requires-Dist: jsonschema>=4.20; extra == 'dev'
|
|
37
|
-
Requires-Dist: pyright>=1.1.380; extra == 'dev'
|
|
38
|
-
Requires-Dist: pytest-benchmark>=4.0; extra == 'dev'
|
|
39
|
-
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
40
|
-
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
41
|
-
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
42
|
-
Provides-Extra: hf
|
|
43
|
-
Requires-Dist: datasets>=2.19; extra == 'hf'
|
|
44
|
-
Requires-Dist: defusedxml>=0.7; extra == 'hf'
|
|
45
|
-
Provides-Extra: teds
|
|
46
|
-
Requires-Dist: apted>=1.0.3; extra == 'teds'
|
|
47
|
-
Requires-Dist: lxml>=5.0; extra == 'teds'
|
|
48
|
-
Description-Content-Type: text/markdown
|
|
49
|
-
|
|
50
|
-
# tablecodec
|
|
51
|
-
|
|
52
|
-
> Neutral Internal Representation + Codec registry for image-based table-recognition datasets.
|
|
53
|
-
|
|
54
|
-
`tablecodec` is a Python library that provides a single, lossless Internal
|
|
55
|
-
Representation (IR) for tables and a registry-based codec layer that translates
|
|
56
|
-
between this IR and the fragmented landscape of public table-recognition
|
|
57
|
-
datasets — PubTabNet, FinTabNet, OTSL, TableFormer, DocTags-tables,
|
|
58
|
-
PubTables-1M, TableBank.
|
|
59
|
-
|
|
60
|
-
- Stdlib-only core. Heavier features (TEDS, CLI) are opt-in extras.
|
|
61
|
-
- Streams large JSONL datasets at constant memory.
|
|
62
|
-
- Self-declared loss analysis between any two codecs.
|
|
63
|
-
|
|
64
|
-
## Status
|
|
65
|
-
|
|
66
|
-
**0.0.18 (pre-alpha).** Not yet published to PyPI. The nine codecs, the TEDS
|
|
67
|
-
metric (`[teds]`), and the STRICT validation profile were all added
|
|
68
|
-
incrementally within the 0.0.x series; a separate `tablecodec-docling` bridge
|
|
69
|
-
codec lives in `packages/` (its own version). The 0.x line makes no
|
|
70
|
-
API-stability promises; the public surface freezes at 1.0 (see
|
|
71
|
-
[docs/spec.md](docs/spec.md) §14). The specification is the source of
|
|
72
|
-
truth. Auto-generated codec / loss tables live at
|
|
73
|
-
[docs/format_support.md](docs/format_support.md) and
|
|
74
|
-
[docs/loss_matrix.md](docs/loss_matrix.md).
|
|
75
|
-
|
|
76
|
-
## Installation
|
|
77
|
-
|
|
78
|
-
```bash
|
|
79
|
-
pip install tablecodec # stdlib-only core
|
|
80
|
-
pip install "tablecodec[cli]" # + command-line interface (click)
|
|
81
|
-
pip install "tablecodec[teds]" # + TEDS similarity metric (apted, lxml)
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
Requires Python 3.11+.
|
|
85
|
-
|
|
86
|
-
## Basic usage
|
|
87
|
-
|
|
88
|
-
```python
|
|
89
|
-
import tablecodec
|
|
90
|
-
from tablecodec import codecs, validate, profiles, analyze_loss
|
|
91
|
-
from tablecodec.codecs.pubtabnet import PubTabNet20Codec
|
|
92
|
-
|
|
93
|
-
# Register a codec (built-ins self-register through the CLI; in library
|
|
94
|
-
# use you register the ones you need).
|
|
95
|
-
codecs.register(PubTabNet20Codec())
|
|
96
|
-
|
|
97
|
-
# Stream-read a dataset into the neutral IR.
|
|
98
|
-
with open("pubtabnet_val.jsonl", encoding="utf-8") as f:
|
|
99
|
-
for sample in codecs.get("pubtabnet-2.0.0").read(f):
|
|
100
|
-
errors = validate(sample, profile=profiles.DEFAULT)
|
|
101
|
-
if errors:
|
|
102
|
-
print(sample.filename, errors)
|
|
103
|
-
|
|
104
|
-
# Static, data-free loss analysis between two formats.
|
|
105
|
-
report = analyze_loss(source="pubtabnet-2.0.0", target="otsl-1.0.0")
|
|
106
|
-
print(report.round_trip_classification) # "structure-preserving"
|
|
107
|
-
```
|
|
108
|
-
|
|
109
|
-
The core has **zero third-party runtime dependencies** (SPEC §13);
|
|
110
|
-
`import tablecodec` works on a bare Python 3.11+.
|
|
111
|
-
|
|
112
|
-
## TEDS similarity (optional)
|
|
113
|
-
|
|
114
|
-
The `[teds]` extra adds a Tree-Edit-Distance based Similarity score between
|
|
115
|
-
two samples. It lives outside the core (it imports `apted`/`lxml`), so import
|
|
116
|
-
it from its submodule:
|
|
117
|
-
|
|
118
|
-
```python
|
|
119
|
-
from tablecodec.teds import teds
|
|
120
|
-
|
|
121
|
-
score = teds(pred_sample, true_sample) # 0.0 .. 1.0
|
|
122
|
-
struct = teds(pred_sample, true_sample, structure_only=True) # ignore cell text
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
## CLI
|
|
126
|
-
|
|
127
|
-
Install with the optional `[cli]` extra:
|
|
128
|
-
|
|
129
|
-
```bash
|
|
130
|
-
pip install "tablecodec[cli]"
|
|
131
|
-
```
|
|
132
|
-
|
|
133
|
-
```bash
|
|
134
|
-
tablecodec codecs list
|
|
135
|
-
tablecodec analyze-loss --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
136
|
-
tablecodec validate path/to/dataset.jsonl --codec pubtabnet-2.0.0 --profile DEFAULT
|
|
137
|
-
tablecodec stats path/to/dataset.jsonl --codec pubtabnet-2.0.0 --json
|
|
138
|
-
tablecodec convert in.jsonl out.jsonl --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
139
|
-
tablecodec convert in.jsonl /dev/null --from pubtabnet-2.0.0 --to otsl-1.0.0 --dry-run
|
|
140
|
-
tablecodec diff a.jsonl b.jsonl --codec pubtabnet-2.0.0
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
All commands stream their input; exit codes are non-zero on validation
|
|
144
|
-
failures or diffs (suitable for CI / data pipelines).
|
|
145
|
-
|
|
146
|
-
## End-to-end check against real datasets
|
|
147
|
-
|
|
148
|
-
`scripts/e2e_hf_check.py` streams real datasets through the codecs and
|
|
149
|
-
validates the resulting IR. Every shipped codec gets at least one
|
|
150
|
-
official-corpus check. Three data sources are used:
|
|
151
|
-
|
|
152
|
-
- the Docling OTSL family
|
|
153
|
-
(`docling-project/{PubTabNet,FinTabNet,PubTables-1M,SynthTabNet}_OTSL`)
|
|
154
|
-
— a uniform converted schema that feeds all nine codecs;
|
|
155
|
-
- the **native** first-published PubTabNet annotation
|
|
156
|
-
(`apoidea/pubtabnet-html`) fed unmodified to the `pubtabnet` codecs;
|
|
157
|
-
- the **native** PubTables-1M PASCAL VOC structure annotation
|
|
158
|
-
(`bsmock/pubtables-1m`, download-only) read from a local tar under
|
|
159
|
-
`input/` with the logical grid reconstructed for the `pubtables-1m`
|
|
160
|
-
codec (FinTabNet / TableBank natives stay download-only + Docling-covered).
|
|
161
|
-
|
|
162
|
-
It is **occasional / local-only** (network + multi-GB datasets), not part
|
|
163
|
-
of CI.
|
|
164
|
-
|
|
165
|
-
```bash
|
|
166
|
-
just e2e-selftest # network-free adapter smoke test
|
|
167
|
-
just e2e 200 # 200 randomly-sampled rows per check (needs [hf] extra)
|
|
168
|
-
uv run --extra hf python scripts/e2e_hf_check.py --dataset apoidea --limit 50
|
|
169
|
-
just e2e-fetch-pubtables1m # download native PubTables-1M VOC (~30MB) into input/
|
|
170
|
-
uv run --extra hf python scripts/e2e_hf_check.py --dataset bsmock --limit 200
|
|
171
|
-
```
|
|
172
|
-
|
|
173
|
-
Rows are sampled randomly (streaming shuffle reshuffles shard order), so
|
|
174
|
-
repeated runs progressively cover the multi-hundred-thousand-row corpora.
|
|
175
|
-
Each run prints its `--seed` so a finding can be reproduced; pass
|
|
176
|
-
`--seed N` to fix it or `--no-shuffle` for a deterministic head read.
|
|
177
|
-
The harness reports parse errors and validation findings — e.g. it
|
|
178
|
-
surfaces real upstream rows with geometrically invalid bboxes (I-05) —
|
|
179
|
-
and appends each failed row to `output/e2e_findings/` (gitignored) with
|
|
180
|
-
its full provenance and replayable payload for later audit.
|
|
181
|
-
|
|
182
|
-
See [`docs/adr/0003-e2e-against-docling-otsl-family.md`](docs/adr/0003-e2e-against-docling-otsl-family.md)
|
|
183
|
-
and [`docs/adr/0004-e2e-native-first-published-datasets.md`](docs/adr/0004-e2e-native-first-published-datasets.md)
|
|
184
|
-
for the data-source decisions and the canonical-vs-real-shape caveats.
|
|
185
|
-
|
|
186
|
-
## Documents
|
|
187
|
-
|
|
188
|
-
- [`docs/spec.md`](docs/spec.md) — Specification (the single source of truth).
|
|
189
|
-
- [`docs/glossary.md`](docs/glossary.md) — Precise vocabulary: terms tablecodec
|
|
190
|
-
defines vs. borrows, and the words most likely to be misread (e.g. "loss"
|
|
191
|
-
vs a "degenerate" bbox).
|
|
192
|
-
- [`docs/intent.md`](docs/intent.md) — Implementation brief (milestones, order,
|
|
193
|
-
quality bar).
|
|
194
|
-
- [`CHANGELOG.md`](CHANGELOG.md) — Keep a Changelog format.
|
|
195
|
-
|
|
196
|
-
## License
|
|
197
|
-
|
|
198
|
-
MIT. See [LICENSE](LICENSE). The OTSL grid-reconstruction logic is
|
|
199
|
-
adapted (with attribution) from the MIT-licensed docling-ibm-models — see
|
|
200
|
-
[THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md).
|
tablecodec-0.0.18/README.md
DELETED
|
@@ -1,151 +0,0 @@
|
|
|
1
|
-
# tablecodec
|
|
2
|
-
|
|
3
|
-
> Neutral Internal Representation + Codec registry for image-based table-recognition datasets.
|
|
4
|
-
|
|
5
|
-
`tablecodec` is a Python library that provides a single, lossless Internal
|
|
6
|
-
Representation (IR) for tables and a registry-based codec layer that translates
|
|
7
|
-
between this IR and the fragmented landscape of public table-recognition
|
|
8
|
-
datasets — PubTabNet, FinTabNet, OTSL, TableFormer, DocTags-tables,
|
|
9
|
-
PubTables-1M, TableBank.
|
|
10
|
-
|
|
11
|
-
- Stdlib-only core. Heavier features (TEDS, CLI) are opt-in extras.
|
|
12
|
-
- Streams large JSONL datasets at constant memory.
|
|
13
|
-
- Self-declared loss analysis between any two codecs.
|
|
14
|
-
|
|
15
|
-
## Status
|
|
16
|
-
|
|
17
|
-
**0.0.18 (pre-alpha).** Not yet published to PyPI. The nine codecs, the TEDS
|
|
18
|
-
metric (`[teds]`), and the STRICT validation profile were all added
|
|
19
|
-
incrementally within the 0.0.x series; a separate `tablecodec-docling` bridge
|
|
20
|
-
codec lives in `packages/` (its own version). The 0.x line makes no
|
|
21
|
-
API-stability promises; the public surface freezes at 1.0 (see
|
|
22
|
-
[docs/spec.md](docs/spec.md) §14). The specification is the source of
|
|
23
|
-
truth. Auto-generated codec / loss tables live at
|
|
24
|
-
[docs/format_support.md](docs/format_support.md) and
|
|
25
|
-
[docs/loss_matrix.md](docs/loss_matrix.md).
|
|
26
|
-
|
|
27
|
-
## Installation
|
|
28
|
-
|
|
29
|
-
```bash
|
|
30
|
-
pip install tablecodec # stdlib-only core
|
|
31
|
-
pip install "tablecodec[cli]" # + command-line interface (click)
|
|
32
|
-
pip install "tablecodec[teds]" # + TEDS similarity metric (apted, lxml)
|
|
33
|
-
```
|
|
34
|
-
|
|
35
|
-
Requires Python 3.11+.
|
|
36
|
-
|
|
37
|
-
## Basic usage
|
|
38
|
-
|
|
39
|
-
```python
|
|
40
|
-
import tablecodec
|
|
41
|
-
from tablecodec import codecs, validate, profiles, analyze_loss
|
|
42
|
-
from tablecodec.codecs.pubtabnet import PubTabNet20Codec
|
|
43
|
-
|
|
44
|
-
# Register a codec (built-ins self-register through the CLI; in library
|
|
45
|
-
# use you register the ones you need).
|
|
46
|
-
codecs.register(PubTabNet20Codec())
|
|
47
|
-
|
|
48
|
-
# Stream-read a dataset into the neutral IR.
|
|
49
|
-
with open("pubtabnet_val.jsonl", encoding="utf-8") as f:
|
|
50
|
-
for sample in codecs.get("pubtabnet-2.0.0").read(f):
|
|
51
|
-
errors = validate(sample, profile=profiles.DEFAULT)
|
|
52
|
-
if errors:
|
|
53
|
-
print(sample.filename, errors)
|
|
54
|
-
|
|
55
|
-
# Static, data-free loss analysis between two formats.
|
|
56
|
-
report = analyze_loss(source="pubtabnet-2.0.0", target="otsl-1.0.0")
|
|
57
|
-
print(report.round_trip_classification) # "structure-preserving"
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
The core has **zero third-party runtime dependencies** (SPEC §13);
|
|
61
|
-
`import tablecodec` works on a bare Python 3.11+.
|
|
62
|
-
|
|
63
|
-
## TEDS similarity (optional)
|
|
64
|
-
|
|
65
|
-
The `[teds]` extra adds a Tree-Edit-Distance based Similarity score between
|
|
66
|
-
two samples. It lives outside the core (it imports `apted`/`lxml`), so import
|
|
67
|
-
it from its submodule:
|
|
68
|
-
|
|
69
|
-
```python
|
|
70
|
-
from tablecodec.teds import teds
|
|
71
|
-
|
|
72
|
-
score = teds(pred_sample, true_sample) # 0.0 .. 1.0
|
|
73
|
-
struct = teds(pred_sample, true_sample, structure_only=True) # ignore cell text
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
## CLI
|
|
77
|
-
|
|
78
|
-
Install with the optional `[cli]` extra:
|
|
79
|
-
|
|
80
|
-
```bash
|
|
81
|
-
pip install "tablecodec[cli]"
|
|
82
|
-
```
|
|
83
|
-
|
|
84
|
-
```bash
|
|
85
|
-
tablecodec codecs list
|
|
86
|
-
tablecodec analyze-loss --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
87
|
-
tablecodec validate path/to/dataset.jsonl --codec pubtabnet-2.0.0 --profile DEFAULT
|
|
88
|
-
tablecodec stats path/to/dataset.jsonl --codec pubtabnet-2.0.0 --json
|
|
89
|
-
tablecodec convert in.jsonl out.jsonl --from pubtabnet-2.0.0 --to otsl-1.0.0
|
|
90
|
-
tablecodec convert in.jsonl /dev/null --from pubtabnet-2.0.0 --to otsl-1.0.0 --dry-run
|
|
91
|
-
tablecodec diff a.jsonl b.jsonl --codec pubtabnet-2.0.0
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
All commands stream their input; exit codes are non-zero on validation
|
|
95
|
-
failures or diffs (suitable for CI / data pipelines).
|
|
96
|
-
|
|
97
|
-
## End-to-end check against real datasets
|
|
98
|
-
|
|
99
|
-
`scripts/e2e_hf_check.py` streams real datasets through the codecs and
|
|
100
|
-
validates the resulting IR. Every shipped codec gets at least one
|
|
101
|
-
official-corpus check. Three data sources are used:
|
|
102
|
-
|
|
103
|
-
- the Docling OTSL family
|
|
104
|
-
(`docling-project/{PubTabNet,FinTabNet,PubTables-1M,SynthTabNet}_OTSL`)
|
|
105
|
-
— a uniform converted schema that feeds all nine codecs;
|
|
106
|
-
- the **native** first-published PubTabNet annotation
|
|
107
|
-
(`apoidea/pubtabnet-html`) fed unmodified to the `pubtabnet` codecs;
|
|
108
|
-
- the **native** PubTables-1M PASCAL VOC structure annotation
|
|
109
|
-
(`bsmock/pubtables-1m`, download-only) read from a local tar under
|
|
110
|
-
`input/` with the logical grid reconstructed for the `pubtables-1m`
|
|
111
|
-
codec (FinTabNet / TableBank natives stay download-only + Docling-covered).
|
|
112
|
-
|
|
113
|
-
It is **occasional / local-only** (network + multi-GB datasets), not part
|
|
114
|
-
of CI.
|
|
115
|
-
|
|
116
|
-
```bash
|
|
117
|
-
just e2e-selftest # network-free adapter smoke test
|
|
118
|
-
just e2e 200 # 200 randomly-sampled rows per check (needs [hf] extra)
|
|
119
|
-
uv run --extra hf python scripts/e2e_hf_check.py --dataset apoidea --limit 50
|
|
120
|
-
just e2e-fetch-pubtables1m # download native PubTables-1M VOC (~30MB) into input/
|
|
121
|
-
uv run --extra hf python scripts/e2e_hf_check.py --dataset bsmock --limit 200
|
|
122
|
-
```
|
|
123
|
-
|
|
124
|
-
Rows are sampled randomly (streaming shuffle reshuffles shard order), so
|
|
125
|
-
repeated runs progressively cover the multi-hundred-thousand-row corpora.
|
|
126
|
-
Each run prints its `--seed` so a finding can be reproduced; pass
|
|
127
|
-
`--seed N` to fix it or `--no-shuffle` for a deterministic head read.
|
|
128
|
-
The harness reports parse errors and validation findings — e.g. it
|
|
129
|
-
surfaces real upstream rows with geometrically invalid bboxes (I-05) —
|
|
130
|
-
and appends each failed row to `output/e2e_findings/` (gitignored) with
|
|
131
|
-
its full provenance and replayable payload for later audit.
|
|
132
|
-
|
|
133
|
-
See [`docs/adr/0003-e2e-against-docling-otsl-family.md`](docs/adr/0003-e2e-against-docling-otsl-family.md)
|
|
134
|
-
and [`docs/adr/0004-e2e-native-first-published-datasets.md`](docs/adr/0004-e2e-native-first-published-datasets.md)
|
|
135
|
-
for the data-source decisions and the canonical-vs-real-shape caveats.
|
|
136
|
-
|
|
137
|
-
## Documents
|
|
138
|
-
|
|
139
|
-
- [`docs/spec.md`](docs/spec.md) — Specification (the single source of truth).
|
|
140
|
-
- [`docs/glossary.md`](docs/glossary.md) — Precise vocabulary: terms tablecodec
|
|
141
|
-
defines vs. borrows, and the words most likely to be misread (e.g. "loss"
|
|
142
|
-
vs a "degenerate" bbox).
|
|
143
|
-
- [`docs/intent.md`](docs/intent.md) — Implementation brief (milestones, order,
|
|
144
|
-
quality bar).
|
|
145
|
-
- [`CHANGELOG.md`](CHANGELOG.md) — Keep a Changelog format.
|
|
146
|
-
|
|
147
|
-
## License
|
|
148
|
-
|
|
149
|
-
MIT. See [LICENSE](LICENSE). The OTSL grid-reconstruction logic is
|
|
150
|
-
adapted (with attribution) from the MIT-licensed docling-ibm-models — see
|
|
151
|
-
[THIRD_PARTY_NOTICES.md](THIRD_PARTY_NOTICES.md).
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|