table2rules 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table2rules-0.4.0/LICENSE +21 -0
- table2rules-0.4.0/PKG-INFO +332 -0
- table2rules-0.4.0/README.md +293 -0
- table2rules-0.4.0/pyproject.toml +93 -0
- table2rules-0.4.0/setup.cfg +4 -0
- table2rules-0.4.0/src/table2rules/__init__.py +51 -0
- table2rules-0.4.0/src/table2rules/__main__.py +85 -0
- table2rules-0.4.0/src/table2rules/_core.py +351 -0
- table2rules-0.4.0/src/table2rules/cleanup.py +61 -0
- table2rules-0.4.0/src/table2rules/errors.py +17 -0
- table2rules-0.4.0/src/table2rules/exporters/__init__.py +33 -0
- table2rules-0.4.0/src/table2rules/exporters/base.py +41 -0
- table2rules-0.4.0/src/table2rules/exporters/rules.py +66 -0
- table2rules-0.4.0/src/table2rules/grid_parser.py +487 -0
- table2rules-0.4.0/src/table2rules/maze_pathfinder.py +166 -0
- table2rules-0.4.0/src/table2rules/models.py +26 -0
- table2rules-0.4.0/src/table2rules/py.typed +0 -0
- table2rules-0.4.0/src/table2rules/quality_gate.py +186 -0
- table2rules-0.4.0/src/table2rules/report.py +155 -0
- table2rules-0.4.0/src/table2rules/simple_repair.py +645 -0
- table2rules-0.4.0/src/table2rules/spans.py +36 -0
- table2rules-0.4.0/src/table2rules.egg-info/PKG-INFO +332 -0
- table2rules-0.4.0/src/table2rules.egg-info/SOURCES.txt +30 -0
- table2rules-0.4.0/src/table2rules.egg-info/dependency_links.txt +1 -0
- table2rules-0.4.0/src/table2rules.egg-info/entry_points.txt +2 -0
- table2rules-0.4.0/src/table2rules.egg-info/requires.txt +10 -0
- table2rules-0.4.0/src/table2rules.egg-info/top_level.txt +1 -0
- table2rules-0.4.0/tests/test_correctness_oracle.py +254 -0
- table2rules-0.4.0/tests/test_determinism.py +49 -0
- table2rules-0.4.0/tests/test_public_api.py +454 -0
- table2rules-0.4.0/tests/test_regression_golds.py +65 -0
- table2rules-0.4.0/tests/test_robustness_mutations.py +394 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 PebbleRoad Pte Ltd
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: table2rules
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
|
|
5
|
+
Author: PebbleRoad Pte Ltd
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/PebbleRoad/table2rules
|
|
8
|
+
Project-URL: Repository, https://github.com/PebbleRoad/table2rules
|
|
9
|
+
Project-URL: Issues, https://github.com/PebbleRoad/table2rules/issues
|
|
10
|
+
Project-URL: Changelog, https://github.com/PebbleRoad/table2rules/blob/main/CHANGELOG.md
|
|
11
|
+
Project-URL: Documentation, https://github.com/PebbleRoad/table2rules/tree/main/docs
|
|
12
|
+
Keywords: html,tables,llm,parsing,rules,rag
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Typing :: Typed
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: beautifulsoup4<5,>=4.13.0
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
34
|
+
Requires-Dist: types-html5lib; extra == "dev"
|
|
35
|
+
Requires-Dist: lxml-stubs; extra == "dev"
|
|
36
|
+
Requires-Dist: build>=1.2; extra == "dev"
|
|
37
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# table2rules
|
|
41
|
+
|
|
42
|
+
[PyPI version](https://pypi.org/project/table2rules/)
|
|
43
|
+
[Supported Python versions](https://pypi.org/project/table2rules/)
|
|
44
|
+
[License: MIT](LICENSE)
|
|
45
|
+
[Tests](https://github.com/PebbleRoad/table2rules/actions/workflows/test.yml)
|
|
46
|
+
|
|
47
|
+
**Convert HTML tables into flat, self-contained facts — one per line — so LLMs and RAG pipelines can chunk, embed, and retrieve them without losing header context.**
|
|
48
|
+
|
|
49
|
+
Concretely: HTML `<table>` in, lines of `row-path | col-path: value` out, with the full header ancestry repeated on every line so any chunker can split anywhere without orphaning a row from its headers.
|
|
50
|
+
|
|
51
|
+
- Pure Python, no network calls
|
|
52
|
+
- No ML models, fully deterministic
|
|
53
|
+
- MIT-licensed, safe for commercial / on-prem use
|
|
54
|
+
|
|
55
|
+
## The DNA of Table Parsing — a maze pathfinder approach
|
|
56
|
+
|
|
57
|
+
**Tables are mazes. Each cell finds its headers by pathfinding.**
|
|
58
|
+
|
|
59
|
+
This isn't pattern-matching or table-type detection. It's a universal algorithm based on how HTML tables actually work:
|
|
60
|
+
|
|
61
|
+
1. **Cells occupy grid positions** (with rowspan/colspan expanding them)
|
|
62
|
+
2. **Headers relate to data cells** via spatial relationships (left = row context, above = column context)
|
|
63
|
+
3. **Semantic markers** (`<th>`, `<thead>`, `scope`) signal intent
|
|
64
|
+
|
|
65
|
+
The algorithm **discovers structure** — it doesn't memorize patterns. When markup is ambiguous or hostile, it fails open and preserves raw HTML instead of inventing structure. This makes outputs more trustworthy for enterprise pipelines and LLM workflows where correctness and traceability matter more than aggressive guessing.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Why RAG Pipelines Need This
|
|
70
|
+
|
|
71
|
+
The single largest failure mode for tables in RAG isn't extraction — it's **chunking**. A standard pipeline converts tables to markdown or HTML, then a size-based chunker splits by token count. For any table taller than a chunk, the header row ends up in one chunk and data rows land in others. Retrieval on *"what was Q2 2024 revenue?"* returns `Revenue | 155` without the system knowing that `155` belongs to Q2 of 2024, or even which metric it measures.
|
|
72
|
+
|
|
73
|
+
Consider a two-level-header financial table:
|
|
74
|
+
|
|
75
|
+
```html
|
|
76
|
+
<table>
|
|
77
|
+
<thead>
|
|
78
|
+
<tr><th></th><th colspan="2">2024</th><th colspan="2">2023</th></tr>
|
|
79
|
+
<tr><th></th><th>Q1</th><th>Q2</th><th>Q1</th><th>Q2</th></tr>
|
|
80
|
+
</thead>
|
|
81
|
+
<tbody>
|
|
82
|
+
<tr><th>Revenue</th><td>130</td><td>155</td><td>118</td><td>125</td></tr>
|
|
83
|
+
<tr><th>Operating Costs</th><td>55</td><td>60</td><td>48</td><td>52</td></tr>
|
|
84
|
+
</tbody>
|
|
85
|
+
</table>
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Typical markdown extraction** loses the year/quarter hierarchy (the two header rows collapse, and any table-unaware chunker can split the header off from the data):
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
| | Q1 | Q2 | Q1 | Q2 |
|
|
92
|
+
| Revenue | 130 | 155 | 118 | 125 |
|
|
93
|
+
| Operating Costs | 55 | 60 | 48 | 52 |
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**table2rules output** is one self-contained fact per line, with the full header ancestry on every line:
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
Revenue | 2024 > Q1: 130
|
|
100
|
+
Revenue | 2024 > Q2: 155
|
|
101
|
+
Revenue | 2023 > Q1: 118
|
|
102
|
+
Revenue | 2023 > Q2: 125
|
|
103
|
+
Operating Costs | 2024 > Q1: 55
|
|
104
|
+
Operating Costs | 2024 > Q2: 60
|
|
105
|
+
Operating Costs | 2023 > Q1: 48
|
|
106
|
+
Operating Costs | 2023 > Q2: 52
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Three properties this gives your RAG pipeline:
|
|
110
|
+
|
|
111
|
+
1. **Chunk-safety.** Any chunker (character count, token count, semantic, recursive) can split the output at any line boundary and every chunk stays independently meaningful. No row is ever orphaned from its headers.
|
|
112
|
+
2. **Retrieval semantics.** A vector embedding of `Revenue | 2024 > Q2: 155` is far closer to the query *"Q2 2024 revenue"* than an embedding of `Revenue | 155` ever will be. The dimension labels are inside the string that gets embedded.
|
|
113
|
+
3. **Traceability at answer time.** The LLM sees the full header path on every fact it reads, so when it answers *"why is this 155?"* it can cite the correct column group unambiguously.
|
|
114
|
+
|
|
115
|
+
This is why we produce rules, not just markdown: rules are the representation tables need to survive a RAG pipeline intact.
|
|
116
|
+
|
|
117
|
+
### Where this library fits vs. other tools
|
|
118
|
+
|
|
119
|
+
- **Unstructured.io, markitdown, docling**: extract tables as markdown/HTML. Excellent at extraction, but chunking those outputs without losing headers still needs help — that's where table2rules fits.
|
|
120
|
+
- **LlamaParse**: paid, similar intent at a higher level (whole-document parsing).
|
|
121
|
+
- **pandas / lxml**: give you structured data, not RAG-ingestible facts.
|
|
122
|
+
- **table2rules**: narrow scope — HTML table in, self-contained facts out, fail-open on hostile input. Pair it with any of the above in a pipeline: extract with your tool, pass the table HTML through table2rules before chunking.
|
|
123
|
+
|
|
124
|
+
### Where it fits in your RAG pipeline
|
|
125
|
+
|
|
126
|
+
`table2rules` is a single transformation between **table extraction** and **chunking**. It doesn't replace your extractor, vector store, embedder, or LLM — it makes table content survive the chunker:
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
from table2rules import process_tables_to_text
|
|
130
|
+
|
|
131
|
+
# 1. Extract HTML from your source (PDF, scrape, doc parser, etc.)
|
|
132
|
+
html = extract_html_with_unstructured(pdf_bytes) # or docling, markitdown, ...
|
|
133
|
+
|
|
134
|
+
# 2. Convert any tables in it to flat, header-inlined facts
|
|
135
|
+
facts = process_tables_to_text(html)
|
|
136
|
+
|
|
137
|
+
# 3. Hand the facts to *any* chunker — no table-aware splitting needed
|
|
138
|
+
chunks = your_chunker.split(facts) # recursive, token, semantic, etc.
|
|
139
|
+
|
|
140
|
+
# 4. Embed and store as usual
|
|
141
|
+
vectors = embedder.embed(chunks)
|
|
142
|
+
vector_store.add(vectors, chunks)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
The output of step 2 is plain text where every line is self-contained, so step 3 can split at any line boundary without losing header context. That's the property that makes the rest of the pipeline boring — exactly what you want it to be.
|
|
146
|
+
|
|
147
|
+
Works the same way with **LangChain** (`RecursiveCharacterTextSplitter`, `TokenTextSplitter`), **LlamaIndex** (`SentenceSplitter`, `TokenTextSplitter`), **Haystack**, or your own custom splitter — there's no framework integration, just text in and text out.
|
|
148
|
+
|
|
149
|
+
### What this buys you on today's stack
|
|
150
|
+
|
|
151
|
+
Three pressures RAG teams are under right now, and what table2rules does about each:
|
|
152
|
+
|
|
153
|
+
**1) Token bloat on frontier models.** On 200 real PubTabNet tables, the rules output is a median **27% smaller** than the source HTML (p25–p75: 12%–39% savings, measured with OpenAI's `cl100k_base` tokenizer — see [scripts/measure_token_savings.py](scripts/measure_token_savings.py) to reproduce). It's not free, though: on **16% of tables** — dense ones with long header paths — the rules output actually *grows* by up to 59%, because each data cell carries its full row- and col-header path. That's the deliberate tradeoff: where the representation costs extra tokens, it's preserving the context the HTML would otherwise lose at a chunk boundary.
|
|
154
|
+
|
|
155
|
+
**2) SLMs getting confused by HTML baggage.** Teams increasingly deploy small models (Phi-3, Qwen 2.5 3B, Llama 3.2) where latency and cost matter more than capability headroom. Smaller models have less attention to spend filtering out structural noise — nested tag hierarchy, attribute clutter, whitespace — before they can reason about content. The rules format strips that to a flat sequence of `row-path | col-path: value` statements with no markup.
|
|
156
|
+
|
|
157
|
+
**3) No chunk configuration.** Teams typically spend meaningful time tuning the way long tables are chunked: recursive-character splitter, token splitter, markdown-header-aware splitter, `"don't split in the middle of a table"` heuristics. With table2rules output, every line is a self-contained fact — **any chunker can split anywhere** without orphaning a row from its headers. The chunking question stops being about tables.
|
|
158
|
+
|
|
159
|
+
### Language coverage
|
|
160
|
+
|
|
161
|
+
**table2rules operates on table geometry, not cell text.** Header detection, span resolution, row-group propagation, and every other parsing decision is a deterministic property of the markup — cell type, span values, empty-vs-non-empty, row/column fill patterns. The one content-level question the pipeline asks is *"does this cell contain any letter?"* via Unicode `str.isalpha()`, used to distinguish descriptor columns from numeric ones. Every writing system answers identically: Latin, Cyrillic, CJK, Arabic, Devanagari, Thai, Hebrew. No language-specific lexicons, no keyword lists, no English bias — a financial table in 合計 / итого / المجموع parses by the same rules as one in English.
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Output Format
|
|
166
|
+
|
|
167
|
+
In `table2rules`, "rules" means flat, header-inlined facts — one per line — not Datalog/Prolog clauses, not a business-rule engine. The output is plain text that any chunker, embedder, or LLM can consume directly.
|
|
168
|
+
|
|
169
|
+
The default `rules` exporter emits **one self-contained rule per line** — every line carries the full row-header path and full column-header path:
|
|
170
|
+
|
|
171
|
+
```
|
|
172
|
+
<row-path> | <col-path>: <value>
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
- `>` joins nested header levels (e.g. `Q1 > Sales > Rev`)
|
|
176
|
+
- `|` separates the row-header path from the column-header path
|
|
177
|
+
- `:` precedes the value
|
|
178
|
+
|
|
179
|
+
**Examples:**
|
|
180
|
+
```
|
|
181
|
+
Name: John Smith
|
|
182
|
+
January | Revenue: $50,000
|
|
183
|
+
North | Q1 Sales > Revenue: $50,000
|
|
184
|
+
NA > East | Q1 > Sales > Rev: 100
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
See [docs/examples.md](docs/examples.md) for a gallery of inputs and
|
|
188
|
+
outputs, from key-value tables to four-level-header clinical trial data.
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Installation
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
pip install table2rules
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Or from source:
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
pip install -e .
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
See [CHANGELOG.md](CHANGELOG.md) for release notes and migration guidance.
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Usage
|
|
209
|
+
|
|
210
|
+
### Python API — the minimal call
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from table2rules import process_tables_to_text
|
|
214
|
+
|
|
215
|
+
html = open("page.html").read()
|
|
216
|
+
rules = process_tables_to_text(html) # default: format="rules"
|
|
217
|
+
print(rules)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### Python API — with observability
|
|
221
|
+
|
|
222
|
+
When you need to know *which* tables rendered cleanly and which fell back,
|
|
223
|
+
use the stats form. It returns the same text plus a structured
|
|
224
|
+
`RenderReport` with one `TableReport` per top-level table:
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
from table2rules import process_tables_with_stats
|
|
228
|
+
|
|
229
|
+
text, report = process_tables_with_stats(html)
|
|
230
|
+
|
|
231
|
+
for t in report.tables:
|
|
232
|
+
if t.render_mode != "rules":
|
|
233
|
+
print(f"table {t.table_index}: {t.render_mode} — {t.reasons}")
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
Each `TableReport` also carries the rendered output for *that* table only —
|
|
237
|
+
useful when you pass whole-document HTML in and want to keep per-table
|
|
238
|
+
provenance instead of splitting the flat string yourself:
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
text, report = process_tables_with_stats(html)
|
|
242
|
+
|
|
243
|
+
for t in report.tables:
|
|
244
|
+
name = t.caption or f"table_{t.table_index}"
|
|
245
|
+
store(name, t.text) # t.text is just this table's lines
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
`t.caption` is the text of the table's `<caption>` element when present,
|
|
249
|
+
otherwise `None`. The HTML `id` attribute and surrounding headings are
|
|
250
|
+
intentionally ignored — `t.table_index` is the only stable positional
|
|
251
|
+
identifier.
|
|
252
|
+
|
|
253
|
+
`render_mode` is one of `"rules"`, `"flat"`, `"passthrough"`, or
|
|
254
|
+
`"skipped"`. The full playbook — what each mode means operationally, how to
|
|
255
|
+
group the 16 reason codes by severity, `gate_score` thresholds, batch
|
|
256
|
+
aggregation, `strict` mode, thread safety, and a conservative policy
|
|
257
|
+
template — is in **[docs/integrating.md](docs/integrating.md)**. Read that
|
|
258
|
+
before wiring this into anything production.
|
|
259
|
+
|
|
260
|
+
### CLI
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
# File in, stdout out
|
|
264
|
+
table2rules report.html
|
|
265
|
+
|
|
266
|
+
# File in, file out
|
|
267
|
+
table2rules report.html -o rules.txt
|
|
268
|
+
|
|
269
|
+
# Pipe
|
|
270
|
+
cat report.html | table2rules
|
|
271
|
+
|
|
272
|
+
# Pick an exporter
|
|
273
|
+
table2rules report.html --format rules
|
|
274
|
+
|
|
275
|
+
# Module form
|
|
276
|
+
python3 -m table2rules report.html
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
### Custom exporters
|
|
280
|
+
|
|
281
|
+
Output formatting is pluggable. Built-in: `rules` (default, one fact per
|
|
282
|
+
line). Third parties can add custom exporters by registering an object with
|
|
283
|
+
`export_rules` / `export_flat` methods — see
|
|
284
|
+
[docs/integrating.md](docs/integrating.md) for the full exporter protocol
|
|
285
|
+
and a JSONL example.
|
|
286
|
+
|
|
287
|
+
### Public API and stability
|
|
288
|
+
|
|
289
|
+
The public API is exactly the names listed in `table2rules.__all__` (and re-exported at the package root). Anything else — submodules like `table2rules.grid_parser`, internal helpers, undocumented attributes — is implementation detail and may change in any release without notice. SemVer compatibility guarantees apply only to the documented public surface.
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## Safety contract
|
|
294
|
+
|
|
295
|
+
- Parse and transform well-formed tables deterministically.
|
|
296
|
+
- Apply bounded generic repair for common breakage (mismatched tags, missing `<thead>`, malformed spans).
|
|
297
|
+
- If invariant or confidence checks fail, degrade first to header-free flat rows, then to passthrough of the original HTML — never fabricate content.
|
|
298
|
+
- Clamp per-cell `rowspan` / `colspan` to 1000 and refuse tables whose expanded grid would exceed 1,000,000 cells. Adversarial span values surface as a `TableReport` with `render_mode="skipped"` rather than an OOM.
|
|
299
|
+
- Surface the per-table verdict via `process_tables_with_stats` so callers can route flagged tables through their own policy instead of discovering lossy output downstream.
|
|
300
|
+
|
|
301
|
+
## Limitations
|
|
302
|
+
|
|
303
|
+
- Output format is deterministic but not guaranteed to match every downstream schema; separators and grouping are optimized for parseability.
|
|
304
|
+
- The repair stage is bounded and generic; it does not attempt arbitrary HTML surgery.
|
|
305
|
+
- Extremely malformed or ambiguous tables may be passed through as raw HTML by design (fail-open safety).
|
|
306
|
+
- Semantic interpretation is intentionally conservative: the system transforms structure, it does not infer business meaning beyond table topology and header scopes.
|
|
307
|
+
- Benchmark coverage improves confidence but cannot prove correctness for all possible HTML table encodings.
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## Validation at a glance
|
|
312
|
+
|
|
313
|
+
Tested against 200 real PubTabNet tables with per-cell oracle matching,
|
|
314
|
+
plus ~2,000 mutation tests applying 10 HTML-noise patterns on top. The
|
|
315
|
+
parser either matches the oracle exactly or degrades to flat / passthrough
|
|
316
|
+
— it never fabricates content. Full test model, corpus details, and
|
|
317
|
+
reproduction instructions are in [docs/validation.md](docs/validation.md).
|
|
318
|
+
|
|
319
|
+
---
|
|
320
|
+
|
|
321
|
+
## Documentation map
|
|
322
|
+
|
|
323
|
+
- **[docs/integrating.md](docs/integrating.md)** — wiring `table2rules`
|
|
324
|
+
into a production pipeline: render modes, reason severity, gate scoring,
|
|
325
|
+
logging, strict mode, policy templates.
|
|
326
|
+
- **[docs/architecture.md](docs/architecture.md)** — internals of the
|
|
327
|
+
repair → grid → pathfinder → output pipeline.
|
|
328
|
+
- **[docs/examples.md](docs/examples.md)** — gallery of HTML inputs and
|
|
329
|
+
their rules-format outputs.
|
|
330
|
+
- **[docs/validation.md](docs/validation.md)** — test corpora, coverage
|
|
331
|
+
gaps, and how to run the suite locally.
|
|
332
|
+
- **[CHANGELOG.md](CHANGELOG.md)** — release notes and migration guidance.
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
# table2rules
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/table2rules/)
|
|
4
|
+
[Supported Python versions](https://pypi.org/project/table2rules/)
|
|
5
|
+
[License: MIT](LICENSE)
|
|
6
|
+
[Tests](https://github.com/PebbleRoad/table2rules/actions/workflows/test.yml)
|
|
7
|
+
|
|
8
|
+
**Convert HTML tables into flat, self-contained facts — one per line — so LLMs and RAG pipelines can chunk, embed, and retrieve them without losing header context.**
|
|
9
|
+
|
|
10
|
+
Concretely: HTML `<table>` in, lines of `row-path | col-path: value` out, with the full header ancestry repeated on every line so any chunker can split anywhere without orphaning a row from its headers.
|
|
11
|
+
|
|
12
|
+
- Pure Python, no network calls
|
|
13
|
+
- No ML models, fully deterministic
|
|
14
|
+
- MIT-licensed, safe for commercial / on-prem use
|
|
15
|
+
|
|
16
|
+
## The DNA of Table Parsing — a maze pathfinder approach
|
|
17
|
+
|
|
18
|
+
**Tables are mazes. Each cell finds its headers by pathfinding.**
|
|
19
|
+
|
|
20
|
+
This isn't pattern-matching or table-type detection. It's a universal algorithm based on how HTML tables actually work:
|
|
21
|
+
|
|
22
|
+
1. **Cells occupy grid positions** (with rowspan/colspan expanding them)
|
|
23
|
+
2. **Headers relate to data cells** via spatial relationships (left = row context, above = column context)
|
|
24
|
+
3. **Semantic markers** (`<th>`, `<thead>`, `scope`) signal intent
|
|
25
|
+
|
|
26
|
+
The algorithm **discovers structure** — it doesn't memorize patterns. When markup is ambiguous or hostile, it fails open and preserves raw HTML instead of inventing structure. This makes outputs more trustworthy for enterprise pipelines and LLM workflows where correctness and traceability matter more than aggressive guessing.
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Why RAG Pipelines Need This
|
|
31
|
+
|
|
32
|
+
The single largest failure mode for tables in RAG isn't extraction — it's **chunking**. A standard pipeline converts tables to markdown or HTML, then a size-based chunker splits by token count. For any table taller than a chunk, the header row ends up in one chunk and data rows land in others. Retrieval on *"what was Q2 2024 revenue?"* returns `Revenue | 155` without the system knowing that `155` belongs to Q2 of 2024, or even which metric it measures.
|
|
33
|
+
|
|
34
|
+
Consider a two-level-header financial table:
|
|
35
|
+
|
|
36
|
+
```html
|
|
37
|
+
<table>
|
|
38
|
+
<thead>
|
|
39
|
+
<tr><th></th><th colspan="2">2024</th><th colspan="2">2023</th></tr>
|
|
40
|
+
<tr><th></th><th>Q1</th><th>Q2</th><th>Q1</th><th>Q2</th></tr>
|
|
41
|
+
</thead>
|
|
42
|
+
<tbody>
|
|
43
|
+
<tr><th>Revenue</th><td>130</td><td>155</td><td>118</td><td>125</td></tr>
|
|
44
|
+
<tr><th>Operating Costs</th><td>55</td><td>60</td><td>48</td><td>52</td></tr>
|
|
45
|
+
</tbody>
|
|
46
|
+
</table>
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Typical markdown extraction** loses the year/quarter hierarchy (the two header rows collapse, and any table-unaware chunker can split the header off from the data):
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
| | Q1 | Q2 | Q1 | Q2 |
|
|
53
|
+
| Revenue | 130 | 155 | 118 | 125 |
|
|
54
|
+
| Operating Costs | 55 | 60 | 48 | 52 |
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**table2rules output** is one self-contained fact per line, with the full header ancestry on every line:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
Revenue | 2024 > Q1: 130
|
|
61
|
+
Revenue | 2024 > Q2: 155
|
|
62
|
+
Revenue | 2023 > Q1: 118
|
|
63
|
+
Revenue | 2023 > Q2: 125
|
|
64
|
+
Operating Costs | 2024 > Q1: 55
|
|
65
|
+
Operating Costs | 2024 > Q2: 60
|
|
66
|
+
Operating Costs | 2023 > Q1: 48
|
|
67
|
+
Operating Costs | 2023 > Q2: 52
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Three properties this gives your RAG pipeline:
|
|
71
|
+
|
|
72
|
+
1. **Chunk-safety.** Any chunker (character count, token count, semantic, recursive) can split the output at any line boundary and every chunk stays independently meaningful. No row is ever orphaned from its headers.
|
|
73
|
+
2. **Retrieval semantics.** A vector embedding of `Revenue | 2024 > Q2: 155` is far closer to the query *"Q2 2024 revenue"* than an embedding of `Revenue | 155` ever will be. The dimension labels are inside the string that gets embedded.
|
|
74
|
+
3. **Traceability at answer time.** The LLM sees the full header path on every fact it reads, so when it answers *"why is this 155?"* it can cite the correct column group unambiguously.
|
|
75
|
+
|
|
76
|
+
This is why we produce rules, not just markdown: rules are the representation tables need to survive a RAG pipeline intact.
|
|
77
|
+
|
|
78
|
+
### Where this library fits vs. other tools
|
|
79
|
+
|
|
80
|
+
- **Unstructured.io, markitdown, docling**: extract tables as markdown/HTML. Excellent at extraction, but chunking those outputs without losing headers still needs help — that's where table2rules fits.
|
|
81
|
+
- **LlamaParse**: paid, similar intent at a higher level (whole-document parsing).
|
|
82
|
+
- **pandas / lxml**: give you structured data, not RAG-ingestible facts.
|
|
83
|
+
- **table2rules**: narrow scope — HTML table in, self-contained facts out, fail-open on hostile input. Pair it with any of the above in a pipeline: extract with your tool, pass the table HTML through table2rules before chunking.
|
|
84
|
+
|
|
85
|
+
### Where it fits in your RAG pipeline
|
|
86
|
+
|
|
87
|
+
`table2rules` is a single transformation between **table extraction** and **chunking**. It doesn't replace your extractor, vector store, embedder, or LLM — it makes table content survive the chunker:
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from table2rules import process_tables_to_text
|
|
91
|
+
|
|
92
|
+
# 1. Extract HTML from your source (PDF, scrape, doc parser, etc.)
|
|
93
|
+
html = extract_html_with_unstructured(pdf_bytes) # or docling, markitdown, ...
|
|
94
|
+
|
|
95
|
+
# 2. Convert any tables in it to flat, header-inlined facts
|
|
96
|
+
facts = process_tables_to_text(html)
|
|
97
|
+
|
|
98
|
+
# 3. Hand the facts to *any* chunker — no table-aware splitting needed
|
|
99
|
+
chunks = your_chunker.split(facts) # recursive, token, semantic, etc.
|
|
100
|
+
|
|
101
|
+
# 4. Embed and store as usual
|
|
102
|
+
vectors = embedder.embed(chunks)
|
|
103
|
+
vector_store.add(vectors, chunks)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
The output of step 2 is plain text where every line is self-contained, so step 3 can split at any line boundary without losing header context. That's the property that makes the rest of the pipeline boring — exactly what you want it to be.
|
|
107
|
+
|
|
108
|
+
Works the same way with **LangChain** (`RecursiveCharacterTextSplitter`, `TokenTextSplitter`), **LlamaIndex** (`SentenceSplitter`, `TokenTextSplitter`), **Haystack**, or your own custom splitter — there's no framework integration, just text in and text out.
|
|
109
|
+
|
|
110
|
+
### What this buys you on today's stack
|
|
111
|
+
|
|
112
|
+
Three pressures RAG teams are under right now, and what table2rules does about each:
|
|
113
|
+
|
|
114
|
+
**1) Token bloat on frontier models.** On 200 real PubTabNet tables, the rules output is a median **27% smaller** than the source HTML (p25–p75: 12%–39% savings, measured with OpenAI's `cl100k_base` tokenizer — see [scripts/measure_token_savings.py](scripts/measure_token_savings.py) to reproduce). It's not free, though: on **16% of tables** — dense ones with long header paths — the rules output actually *grows* by up to 59%, because each data cell carries its full row- and col-header path. That's the deliberate tradeoff: where the representation costs extra tokens, it's preserving the context the HTML would otherwise lose at a chunk boundary.
|
|
115
|
+
|
|
116
|
+
**2) SLMs getting confused by HTML baggage.** Teams increasingly deploy small models (Phi-3, Qwen 2.5 3B, Llama 3.2) where latency and cost matter more than capability headroom. Smaller models have less attention to spend filtering out structural noise — nested tag hierarchy, attribute clutter, whitespace — before they can reason about content. The rules format strips that to a flat sequence of `row-path | col-path: value` statements with no markup.
|
|
117
|
+
|
|
118
|
+
**3) No chunk configuration.** Teams typically spend meaningful time tuning the way long tables are chunked: recursive-character splitter, token splitter, markdown-header-aware splitter, `"don't split in the middle of a table"` heuristics. With table2rules output, every line is a self-contained fact — **any chunker can split anywhere** without orphaning a row from its headers. The chunking question stops being about tables.
|
|
119
|
+
|
|
120
|
+
### Language coverage
|
|
121
|
+
|
|
122
|
+
**table2rules operates on table geometry, not cell text.** Header detection, span resolution, row-group propagation, and every other parsing decision are deterministic properties of the markup — cell type, span values, empty-vs-non-empty, row/column fill patterns. The one content-level question the pipeline asks is *"does this cell contain any letter?"* via Unicode `str.isalpha()`, used to distinguish descriptor columns from numeric ones. Every writing system answers identically: Latin, Cyrillic, CJK, Arabic, Devanagari, Thai, Hebrew. No language-specific lexicons, no keyword lists, no English bias — a financial table in 合計 / итого / المجموع parses by the same rules as one in English.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Output Format
|
|
127
|
+
|
|
128
|
+
In `table2rules`, "rules" means flat, header-inlined facts — one per line — not Datalog/Prolog clauses, not a business-rule engine. The output is plain text that any chunker, embedder, or LLM can consume directly.
|
|
129
|
+
|
|
130
|
+
The default `rules` exporter emits **one self-contained rule per line** — every line carries the full row-header path and full column-header path:
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
<row-path> | <col-path>: <value>
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
- `>` joins nested header levels (e.g. `Q1 > Sales > Rev`)
|
|
137
|
+
- `|` separates the row-header path from the column-header path
|
|
138
|
+
- `:` precedes the value
|
|
139
|
+
|
|
140
|
+
**Examples:**
|
|
141
|
+
```
|
|
142
|
+
Name: John Smith
|
|
143
|
+
January | Revenue: $50,000
|
|
144
|
+
North | Q1 Sales > Revenue: $50,000
|
|
145
|
+
NA > East | Q1 > Sales > Rev: 100
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
See [docs/examples.md](docs/examples.md) for a gallery of inputs and
|
|
149
|
+
outputs, from key-value tables to four-level-header clinical trial data.
|
|
150
|
+
|
|
151
|
+
---
|
|
152
|
+
|
|
153
|
+
## Installation
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
pip install table2rules
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Or from source:
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
pip install -e .
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
See [CHANGELOG.md](CHANGELOG.md) for release notes and migration guidance.
|
|
166
|
+
|
|
167
|
+
---
|
|
168
|
+
|
|
169
|
+
## Usage
|
|
170
|
+
|
|
171
|
+
### Python API — the minimal call
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
from table2rules import process_tables_to_text
|
|
175
|
+
|
|
176
|
+
html = open("page.html").read()
|
|
177
|
+
rules = process_tables_to_text(html) # default: format="rules"
|
|
178
|
+
print(rules)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Python API — with observability
|
|
182
|
+
|
|
183
|
+
When you need to know *which* tables rendered cleanly and which fell back,
|
|
184
|
+
use the stats form. It returns the same text plus a structured
|
|
185
|
+
`RenderReport` with one `TableReport` per top-level table:
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
from table2rules import process_tables_with_stats
|
|
189
|
+
|
|
190
|
+
text, report = process_tables_with_stats(html)
|
|
191
|
+
|
|
192
|
+
for t in report.tables:
|
|
193
|
+
if t.render_mode != "rules":
|
|
194
|
+
print(f"table {t.table_index}: {t.render_mode} — {t.reasons}")
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Each `TableReport` also carries the rendered output for *that* table only —
|
|
198
|
+
useful when you pass whole-document HTML in and want to keep per-table
|
|
199
|
+
provenance instead of splitting the flat string yourself:
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
text, report = process_tables_with_stats(html)
|
|
203
|
+
|
|
204
|
+
for t in report.tables:
|
|
205
|
+
name = t.caption or f"table_{t.table_index}"
|
|
206
|
+
store(name, t.text) # t.text is just this table's lines
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
`t.caption` is the text of the table's `<caption>` element when present,
|
|
210
|
+
otherwise `None`. The HTML `id` attribute and surrounding headings are
|
|
211
|
+
intentionally ignored — `t.table_index` is the only stable positional
|
|
212
|
+
identifier.
|
|
213
|
+
|
|
214
|
+
`render_mode` is one of `"rules"`, `"flat"`, `"passthrough"`, or
|
|
215
|
+
`"skipped"`. The full playbook — what each mode means operationally, how to
|
|
216
|
+
group the 16 reason codes by severity, `gate_score` thresholds, batch
|
|
217
|
+
aggregation, `strict` mode, thread safety, and a conservative policy
|
|
218
|
+
template — is in **[docs/integrating.md](docs/integrating.md)**. Read that
|
|
219
|
+
before wiring this into anything production.
|
|
220
|
+
|
|
221
|
+
### CLI
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# File in, stdout out
|
|
225
|
+
table2rules report.html
|
|
226
|
+
|
|
227
|
+
# File in, file out
|
|
228
|
+
table2rules report.html -o rules.txt
|
|
229
|
+
|
|
230
|
+
# Pipe
|
|
231
|
+
cat report.html | table2rules
|
|
232
|
+
|
|
233
|
+
# Pick an exporter
|
|
234
|
+
table2rules report.html --format rules
|
|
235
|
+
|
|
236
|
+
# Module form
|
|
237
|
+
python3 -m table2rules report.html
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
### Custom exporters
|
|
241
|
+
|
|
242
|
+
Output formatting is pluggable. Built-in: `rules` (default, one fact per
|
|
243
|
+
line). Third parties can add custom exporters by registering an object with
|
|
244
|
+
`export_rules` / `export_flat` methods — see
|
|
245
|
+
[docs/integrating.md](docs/integrating.md) for the full exporter protocol
|
|
246
|
+
and a JSONL example.
|
|
247
|
+
|
|
248
|
+
### Public API and stability
|
|
249
|
+
|
|
250
|
+
The public API is exactly the names listed in `table2rules.__all__` (and re-exported at the package root). Anything else — submodules like `table2rules.grid_parser`, internal helpers, undocumented attributes — is implementation detail and may change in any release without notice. SemVer compatibility guarantees apply only to the documented public surface.
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Safety contract
|
|
255
|
+
|
|
256
|
+
- Parse and transform well-formed tables deterministically.
|
|
257
|
+
- Apply bounded generic repair for common breakage (mismatched tags, missing `<thead>`, malformed spans).
|
|
258
|
+
- If invariant or confidence checks fail, degrade to header-free flat rows, then to passthrough of the original HTML — never fabricate content.
|
|
259
|
+
- Clamp per-cell `rowspan` / `colspan` to 1,000 and refuse tables whose expanded grid would exceed 1,000,000 cells. Adversarial span values surface as a `TableReport` with `render_mode="skipped"` rather than an OOM.
|
|
260
|
+
- Surface the per-table verdict via `process_tables_with_stats` so callers can route flagged tables through their own policy instead of discovering lossy output downstream.
|
|
261
|
+
|
|
262
|
+
## Limitations
|
|
263
|
+
|
|
264
|
+
- Output format is deterministic but not guaranteed to match every downstream schema; separators and grouping are optimized for parseability.
|
|
265
|
+
- The repair stage is bounded and generic; it does not attempt arbitrary HTML surgery.
|
|
266
|
+
- Extremely malformed or ambiguous tables may be passed through as raw HTML by design (fail-open safety).
|
|
267
|
+
- Semantic interpretation is intentionally conservative: the system transforms structure, it does not infer business meaning beyond table topology and header scopes.
|
|
268
|
+
- Benchmark coverage improves confidence but cannot prove correctness for all possible HTML table encodings.
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Validation at a glance
|
|
273
|
+
|
|
274
|
+
Tested against 200 real PubTabNet tables with per-cell oracle matching,
|
|
275
|
+
plus ~2,000 mutation tests applying 10 HTML-noise patterns on top. The
|
|
276
|
+
parser either matches the oracle exactly or degrades to flat / passthrough
|
|
277
|
+
— it never fabricates content. Full test model, corpus details, and
|
|
278
|
+
reproduction instructions are in [docs/validation.md](docs/validation.md).
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Documentation map
|
|
283
|
+
|
|
284
|
+
- **[docs/integrating.md](docs/integrating.md)** — wiring `table2rules`
|
|
285
|
+
into a production pipeline: render modes, reason severity, gate scoring,
|
|
286
|
+
logging, strict mode, policy templates.
|
|
287
|
+
- **[docs/architecture.md](docs/architecture.md)** — internals of the
|
|
288
|
+
repair → grid → pathfinder → output pipeline.
|
|
289
|
+
- **[docs/examples.md](docs/examples.md)** — gallery of HTML inputs and
|
|
290
|
+
their rules-format outputs.
|
|
291
|
+
- **[docs/validation.md](docs/validation.md)** — test corpora, coverage
|
|
292
|
+
gaps, and how to run the suite locally.
|
|
293
|
+
- **[CHANGELOG.md](CHANGELOG.md)** — release notes and migration guidance.
|