table-stitcher 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table_stitcher/__init__.py +340 -0
- table_stitcher/adapters/README.md +173 -0
- table_stitcher/adapters/__init__.py +11 -0
- table_stitcher/adapters/base.py +42 -0
- table_stitcher/adapters/docling.py +797 -0
- table_stitcher/merger.py +979 -0
- table_stitcher/models.py +145 -0
- table_stitcher/py.typed +0 -0
- table_stitcher-0.3.0.dist-info/METADATA +392 -0
- table_stitcher-0.3.0.dist-info/RECORD +12 -0
- table_stitcher-0.3.0.dist-info/WHEEL +4 -0
- table_stitcher-0.3.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Table Stitcher — Reassemble tables split across page boundaries.
|
|
3
|
+
|
|
4
|
+
A parser-agnostic library that detects and merges table fragments produced
|
|
5
|
+
by PDF extraction tools. Ships with a Docling adapter out of the box.
|
|
6
|
+
|
|
7
|
+
Usage (Docling):
|
|
8
|
+
from table_stitcher import stitch_tables, MultiPageConfig
|
|
9
|
+
|
|
10
|
+
doc = stitch_tables(doc) # Use defaults
|
|
11
|
+
doc = stitch_tables(doc, config=MultiPageConfig(max_page_gap=2))
|
|
12
|
+
|
|
13
|
+
Usage (custom parser):
|
|
14
|
+
from table_stitcher import TableStitcher
|
|
15
|
+
from table_stitcher.adapters.base import TableStitcherAdapter
|
|
16
|
+
|
|
17
|
+
class MyAdapter:
|
|
18
|
+
def extract(self, doc, cfg): ...
|
|
19
|
+
def inject(self, doc, logical_tables): ...
|
|
20
|
+
|
|
21
|
+
stitcher = TableStitcher(adapter=MyAdapter())
|
|
22
|
+
doc = stitcher.stitch(doc)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import logging
|
|
26
|
+
import time
|
|
27
|
+
from typing import Any, Optional
|
|
28
|
+
|
|
29
|
+
from .adapters.base import TableStitcherAdapter
|
|
30
|
+
from .merger import merge_multipage_tables
|
|
31
|
+
from .models import LogicalTable, MergeTrace, MultiPageConfig, TableMeta
|
|
32
|
+
|
|
33
|
+
# Package version — must match the distribution version (pyproject.toml);
# the 0.3.0 wheel shipped with a stale "0.2.0" here.
__version__ = "0.3.0"

# Explicit public API surface of the package.
__all__ = [
    "stitch_tables",
    "extract_table_meta",
    "merge_multipage_tables",
    "TableStitcher",
    "MultiPageConfig",
    "LogicalTable",
    "TableMeta",
    "MergeTrace",
    "StitchingError",
    "TableStitcherAdapter",
    "__version__",
]
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class StitchingError(Exception):
    """Raised when table stitching fails."""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class TableStitcher:
    """
    Detects and merges tables split across multiple pages.

    This class is parser-agnostic. Pass any adapter that implements the
    ``TableStitcherAdapter`` protocol (two methods: ``extract`` and ``inject``).

    For simple Docling usage, use the ``stitch_tables()`` function instead.
    """

    def __init__(
        self,
        adapter: TableStitcherAdapter,
        config: Optional[MultiPageConfig] = None,
    ):
        """
        Args:
            adapter: Parser bridge implementing ``extract`` and ``inject``.
            config: Optional tuning overrides; defaults to ``MultiPageConfig()``.

        Raises:
            ValueError: If any configuration value is out of range.
        """
        self.logger = logging.getLogger("table_stitcher")
        self.adapter = adapter
        self.config = config or MultiPageConfig()
        self._validate_config()

    # -------------------------------------------------------------------------
    # Logging
    # -------------------------------------------------------------------------
    # All direct logger calls use lazy %-style arguments so interpolation only
    # happens when the record is actually emitted (no cost at disabled levels).

    def _log_phase_start(self, phase_num: int, phase_name: str) -> None:
        """Emit a banner marking the start of a processing phase."""
        self.logger.info("=" * 70)
        self.logger.info("Phase %s: %s", phase_num, phase_name)
        self.logger.info("=" * 70)

    def _log_phase_complete(
        self, phase_num: int, count: int, duration: float, item_type: str = "items"
    ) -> None:
        """Emit a one-line summary for a finished phase."""
        self.logger.info(
            "Phase %s complete: %s %s [%.1fs]", phase_num, count, item_type, duration
        )

    def _log_section(self, message: str, indent: int = 2) -> None:
        """Emit an indented informational line."""
        prefix = " " * indent
        self.logger.info("%s%s", prefix, message)

    def _log_item_progress(self, item_name: str, status: str = "success") -> None:
        """Emit a per-item progress line with a [+]/[x] status marker."""
        symbol = "+" if status == "success" else "x"
        self.logger.info("  [%s] %s", symbol, item_name)

    def _log_error(self, context: str, error: Exception) -> None:
        """Emit an error line with a short context label."""
        self.logger.error("  [x] %s: %s", context, error)

    # -------------------------------------------------------------------------
    # Validation
    # -------------------------------------------------------------------------

    def _validate_config(self) -> None:
        """Validate configuration values.

        Collects every violation and raises a single ``ValueError`` listing
        them all, rather than failing on the first problem.
        """
        errors = []
        cfg = self.config

        if not (1 <= cfg.max_page_gap <= 10):
            errors.append(f"max_page_gap must be 1-10, got {cfg.max_page_gap}")

        if cfg.max_width_difference < 0:
            errors.append(f"max_width_difference must be >= 0, got {cfg.max_width_difference}")

        valid_width_policies = {"preserve_extra", "warn_drop", "fail", "merge_tail"}
        if cfg.width_overflow_policy not in valid_width_policies:
            errors.append(
                "width_overflow_policy must be one of "
                f"{sorted(valid_width_policies)}, got {cfg.width_overflow_policy!r}"
            )

        # Similarity thresholds and page-band fractions are all ratios in [0, 1].
        for name, value in [
            ("header_sim_strict", cfg.header_sim_strict),
            ("header_sim_loose", cfg.header_sim_loose),
            ("row_sim_threshold", cfg.row_sim_threshold),
            ("bottom_band_min", cfg.bottom_band_min),
            ("top_band_max", cfg.top_band_max),
        ]:
            if not (0.0 <= value <= 1.0):
                errors.append(f"{name} must be 0.0-1.0, got {value}")

        if cfg.max_orphan_rows < 0:
            errors.append(f"max_orphan_rows must be >= 0, got {cfg.max_orphan_rows}")
        if cfg.max_data_orphan_rows < 0:
            errors.append(f"max_data_orphan_rows must be >= 0, got {cfg.max_data_orphan_rows}")

        if errors:
            raise ValueError("Invalid MultiPageConfig:\n  " + "\n  ".join(errors))

    # -------------------------------------------------------------------------
    # Core Processing
    # -------------------------------------------------------------------------

    def stitch(
        self,
        doc: Any,
        raise_on_error: bool = False,
    ) -> Any:
        """
        Detect and merge tables split across multiple pages.

        Runs three phases — extract metadata via the adapter, analyze/merge
        fragments, and inject the merged tables back — bailing out early (and
        returning the original ``doc``) whenever a phase fails or finds
        nothing to do.

        Args:
            doc: The parser-native document object.
            raise_on_error: If True, raise exceptions on processing errors.
                If False (default), log errors and return original doc.

        Returns:
            The document with merged tables.

        Raises:
            StitchingError: If ``raise_on_error`` is True and a phase fails.
        """
        if doc is None:
            if raise_on_error:
                raise StitchingError("Input document is None")
            self.logger.error("Input document is None")
            return doc

        # --- Phase 1: Extract Metadata ---
        self._log_phase_start(1, "Extract Table Metadata")
        phase_start = time.time()

        try:
            tables_meta = self.adapter.extract(doc, self.config)
        except Exception as e:
            self._log_error("Metadata extraction failed", e)
            if raise_on_error:
                raise StitchingError(f"Failed to extract table metadata: {e}") from e
            return doc

        if not tables_meta:
            self._log_section("No tables found in document")
            self._log_phase_complete(1, 0, time.time() - phase_start, "tables")
            return doc

        # Report extraction coverage — tables that failed extraction
        # are silently preserved in the original doc (pass-through).
        total_tables = len(getattr(doc, "tables", []) or [])
        if total_tables and len(tables_meta) < total_tables:
            skipped = total_tables - len(tables_meta)
            self._log_section(
                f"Extracted {len(tables_meta)}/{total_tables} tables "
                f"({skipped} skipped — originals preserved)"
            )

        self._log_phase_complete(
            1, len(tables_meta), time.time() - phase_start, "table fragments extracted"
        )

        # --- Phase 2: Analyze & Merge ---
        self._log_phase_start(2, "Analyze Multi-Page Merges")
        phase_start = time.time()

        try:
            logical_tables = merge_multipage_tables(tables_meta, self.config)
        except Exception as e:
            self._log_error("Merge analysis failed", e)
            if raise_on_error:
                raise StitchingError(f"Failed to merge tables: {e}") from e
            return doc

        # Only logical tables spanning more than one page require injection.
        multi_page_tables = [lt for lt in logical_tables if len(lt.pages) > 1]

        if not multi_page_tables:
            self._log_section("No multi-page tables detected")
            self._log_phase_complete(2, 0, time.time() - phase_start, "merges")
            return doc

        for lt in multi_page_tables:
            reason = f", reason={lt.merge_reason}" if lt.merge_reason else ""
            self._log_section(
                f"Found: Table spanning pages {lt.pages} ({len(lt.members)} fragments{reason})"
            )

        self._log_phase_complete(
            2, len(multi_page_tables), time.time() - phase_start, "multi-page tables identified"
        )

        # --- Phase 3: Inject Merged Tables ---
        self._log_phase_start(3, "Inject Merged Tables")
        phase_start = time.time()

        try:
            doc = self.adapter.inject(doc, logical_tables)
        except Exception as e:
            self._log_error("Injection failed", e)
            if raise_on_error:
                raise StitchingError(f"Failed to inject merged tables: {e}") from e
            return doc

        for lt in multi_page_tables:
            self._log_item_progress(f"Merged pages {lt.pages} -> 1 table", "success")

        self._log_phase_complete(
            3, len(multi_page_tables), time.time() - phase_start, "tables injected"
        )

        return doc
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# -----------------------------------------------------------------------------
|
|
249
|
+
# Convenience Function
|
|
250
|
+
# -----------------------------------------------------------------------------
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def stitch_tables(
    doc: Any,
    config: Optional[MultiPageConfig] = None,
    raise_on_error: bool = False,
) -> Any:
    """
    Detect and merge tables split across multiple pages.

    Convenience function that uses the Docling adapter by default.
    For other parsers, use ``TableStitcher`` with a custom adapter.

    Args:
        doc: The input DoclingDocument (already converted from PDF).
        config: Optional configuration overrides. Uses sensible defaults if None.
        raise_on_error: If True, raise exceptions on processing errors.
            If False (default), log errors and return original doc.

    Returns:
        The document with merged tables.

    Raises:
        StitchingError: If raise_on_error=True and processing fails.
        ValueError: If config values are invalid.

    Example:
        >>> from docling.document_converter import DocumentConverter
        >>> from table_stitcher import stitch_tables
        >>>
        >>> converter = DocumentConverter()
        >>> doc = converter.convert("report.pdf").document
        >>> doc = stitch_tables(doc)
    """
    # Import lazily so the core package works without docling-core installed.
    try:
        from .adapters.docling import DoclingAdapter
    except ModuleNotFoundError as e:
        if "docling_core" in str(e):
            raise ImportError(
                "The Docling adapter requires docling-core. "
                "Install it with: pip install table-stitcher[docling]"
            ) from e
        raise  # genuine bug inside docling.py — don't mask it

    # Keep the try body minimal: only construction validates the config, so
    # only a ValueError raised here means "invalid configuration". A ValueError
    # escaping stitch() itself would otherwise be mislabeled and swallowed.
    try:
        stitcher = TableStitcher(adapter=DoclingAdapter(), config=config)
    except ValueError:
        if raise_on_error:
            raise
        logging.getLogger("table_stitcher").error(
            "Invalid configuration, returning original document"
        )
        return doc

    return stitcher.stitch(doc, raise_on_error=raise_on_error)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def extract_table_meta(
    doc: Any,
    config: Optional[MultiPageConfig] = None,
) -> list:
    """
    Extract table metadata without merging — useful for analysis.

    Convenience function that uses the Docling adapter by default.
    Returns a list of ``TableMeta`` objects describing each table fragment.

    Args:
        doc: The input DoclingDocument (already converted from PDF).
        config: Optional configuration overrides. Uses sensible defaults if None.

    Returns:
        List of TableMeta objects for each table in the document.

    Example:
        >>> from table_stitcher import extract_table_meta
        >>> metas = extract_table_meta(doc)
        >>> for m in metas:
        ...     print(f"Table {m.idx}: {m.width} cols, page {m.start_page}")
    """
    # Imported lazily: the core package must load even when docling-core
    # is not installed.
    try:
        from .adapters.docling import DoclingAdapter
    except ModuleNotFoundError as e:
        # Guard clause: anything other than the optional docling-core
        # dependency being absent is a genuine bug — re-raise untouched.
        if "docling_core" not in str(e):
            raise
        raise ImportError(
            "The Docling adapter requires docling-core. "
            "Install it with: pip install table-stitcher[docling]"
        ) from e

    effective_config = config or MultiPageConfig()
    return DoclingAdapter().extract(doc, effective_config)
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# Adapters
|
|
2
|
+
|
|
3
|
+
`table-stitcher`'s core is parser-agnostic. Adapters bridge a specific
|
|
4
|
+
document parser (docling, pdfplumber, unstructured, …) to the
|
|
5
|
+
`TableMeta` / `LogicalTable` contract the merger operates on.
|
|
6
|
+
|
|
7
|
+
Adapter-specific notes — version compatibility, OCR backend behavior,
|
|
8
|
+
known upstream workarounds — live here rather than in the main README,
|
|
9
|
+
so the top-level docs stay agnostic and each adapter can document its
|
|
10
|
+
own quirks.
|
|
11
|
+
|
|
12
|
+
## Adapter protocol
|
|
13
|
+
|
|
14
|
+
Any adapter implements two methods:
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
class MyAdapter:
|
|
18
|
+
def extract(self, doc, cfg: MultiPageConfig) -> list[TableMeta]:
|
|
19
|
+
"""Read tables from the parser's native document type and produce
|
|
20
|
+
TableMeta records for each fragment."""
|
|
21
|
+
|
|
22
|
+
def inject(self, doc, logical_tables: list[LogicalTable]):
|
|
23
|
+
"""Write merged tables back into the native document, pruning the
|
|
24
|
+
now-redundant satellite fragments from the body tree and clearing
|
|
25
|
+
their cell content. Return the modified document."""
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
A skeleton custom adapter appears in the top-level README. The merger
|
|
29
|
+
never imports parser-specific types — it only reads from the `TableMeta`
|
|
30
|
+
fields and writes to a pandas DataFrame, which the adapter then converts
|
|
31
|
+
back into the parser's native table representation during `inject`.
|
|
32
|
+
Each `LogicalTable` also carries `merge_reason`, `merge_traces`, and
|
|
33
|
+
`warnings`; adapters can ignore them, log them, or surface them in their
|
|
34
|
+
native document metadata.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Docling adapter (`docling.py`)
|
|
39
|
+
|
|
40
|
+
### How the docling adapter prunes satellites
|
|
41
|
+
|
|
42
|
+
When `inject()` folds multiple fragments into one logical table, the
|
|
43
|
+
satellite fragments (members after the anchor) are handled in two places:
|
|
44
|
+
|
|
45
|
+
1. **Body tree** — references to satellite tables in `doc.body` (and any
|
|
46
|
+
groups) are removed, so rendered output contains only the merged
|
|
47
|
+
anchor.
|
|
48
|
+
2. **`doc.tables` list** — the `Table` objects at the satellite indices
|
|
49
|
+
are *cleared in place*: `data` becomes an empty `TableData`
|
|
50
|
+
(`num_rows=0, num_cols=0`), `prov` becomes `[]`. The `Table`
|
|
51
|
+
wrapper stays at its list position because docling uses
|
|
52
|
+
position-based `self_ref` strings (`#/tables/N`) — removing entries
|
|
53
|
+
would invalidate every reference that points to a later index.
|
|
54
|
+
|
|
55
|
+
**If your downstream code iterates `doc.tables` directly** (instead of
|
|
56
|
+
traversing the body tree), skip empty-shell tables explicitly:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
for t in doc.tables:
|
|
60
|
+
if t.data and t.data.num_rows > 0:
|
|
61
|
+
... # real content
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
For most users the body tree is the right thing to iterate — it already
|
|
65
|
+
reflects the merged view.
|
|
66
|
+
|
|
67
|
+
### Version compatibility
|
|
68
|
+
|
|
69
|
+
Tested against **docling 2.64, docling-core 2.54**. The project pins a
|
|
70
|
+
compatible range in `pyproject.toml`:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
docling>=2.60,<3
|
|
74
|
+
docling-core>=2.50,<3
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
If you need to test against a dev docling build:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install -e <path/to/docling-checkout>
|
|
81
|
+
pytest tests/integration/
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Breaking changes in a 3.x release will need adapter updates — the adapter
|
|
85
|
+
touches `DoclingDocument`, `TableData`, `TableCell`, and table `prov`
|
|
86
|
+
entries (for page-number and bounding-box info).
|
|
87
|
+
|
|
88
|
+
### OCR backend
|
|
89
|
+
|
|
90
|
+
Docling auto-selects an OCR engine at runtime based on the host
|
|
91
|
+
(`ocrmac` on Apple Silicon, `easyocr` / `tesserocr` / `rapidocr`
|
|
92
|
+
otherwise). Cell text on image-backed PDFs (e.g. our bundled
|
|
93
|
+
PubTables-v2 fixtures) differs very slightly across backends.
|
|
94
|
+
|
|
95
|
+
The `first_row` / `last_row` assertions in integration fixtures have
|
|
96
|
+
been stable in practice but may flap if the CI host's OCR backend
|
|
97
|
+
differs from the one used to author the YAML. Re-running
|
|
98
|
+
`tests/integration/_tools/regenerate_expected.py` produces a clean
|
|
99
|
+
baseline for the new backend.
|
|
100
|
+
|
|
101
|
+
### Adapter detection thresholds
|
|
102
|
+
|
|
103
|
+
A few structural constants live at the top of `docling.py` rather than
|
|
104
|
+
in `MultiPageConfig`:
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
_MAX_HEADER_CELL_LEN = 30 # header cells typically short; data cells longer
|
|
108
|
+
_DATA_PATTERNS # regex list for "this cell is data, not header"
|
|
109
|
+
_AUTO_COLNAME_RE # "Column_N" / "Unnamed: N" parser placeholders
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
These are **adapter-intrinsic** — tuning them changes how the adapter
|
|
113
|
+
classifies first rows as header-or-data. User-tunable thresholds
|
|
114
|
+
(page gap, width tolerance, Jaccard cutoffs) live in `MultiPageConfig`
|
|
115
|
+
instead.
|
|
116
|
+
|
|
117
|
+
### Known upstream workarounds
|
|
118
|
+
|
|
119
|
+
The adapter compensates for a handful of docling extraction patterns
|
|
120
|
+
that produce fragments the merger would otherwise mis-group:
|
|
121
|
+
|
|
122
|
+
- **Data-as-headers** — when docling extracts a page where the real
|
|
123
|
+
header row got collapsed into the first data row, the fragment's
|
|
124
|
+
"column names" look like `['Column_0', 'Am Fds Trgt Dte Rtm 2055',
|
|
125
|
+
'13,085.03']`. The adapter's `_looks_like_data` regex list catches
|
|
126
|
+
comma-grouped decimals, stat ranges (`280 (176, 404)`), and scientific
|
|
127
|
+
notation (`7.0 x 10-7`) as data patterns, and flags the fragment
|
|
128
|
+
`is_headerless=True` so the merger's width-match path handles it.
|
|
129
|
+
An upstream fix that correctly identifies the collapsed header row
|
|
130
|
+
would make this heuristic unnecessary.
|
|
131
|
+
|
|
132
|
+
- **Long-cell first rows** — fragments where the majority of first-row
|
|
133
|
+
cells are >30 chars are flagged headerless too. Real headers are
|
|
134
|
+
typically short; a row of sentence-long strings is almost certainly
|
|
135
|
+
data.
|
|
136
|
+
|
|
137
|
+
- **Orphan-header fragments with truncated width** — when a new table's
|
|
138
|
+
header row has empty trailing cells that the parser drops, the
|
|
139
|
+
fragment's width is less than its data continuation's. The adapter
|
|
140
|
+
flags these as `is_header_orphan` structurally (small + header-shaped
|
|
141
|
+
cells, no data patterns), and the merger's "header-orphan → headerless
|
|
142
|
+
data" path trusts the data fragment's width on the join.
|
|
143
|
+
|
|
144
|
+
These workarounds are structural, not vocabulary-based — they reason
|
|
145
|
+
about cell shapes (length, regex patterns, auto-label form) rather than
|
|
146
|
+
specific words, so they generalize across domains and languages.
|
|
147
|
+
|
|
148
|
+
### Layout data availability
|
|
149
|
+
|
|
150
|
+
The merger's layout-confirmation rules (`bottom_band_min`, `top_band_max`,
|
|
151
|
+
width-drift tolerance) rely on `vert_top` / `vert_bottom` from the
|
|
152
|
+
adapter. Docling provides these via `prov[*].bbox` on `DoclingDocument`
|
|
153
|
+
tables. If a document lacks layout info (e.g. a text-only extraction),
|
|
154
|
+
the merger falls back to structural signals only and the layout-gated
|
|
155
|
+
rules don't fire.
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Adding a new adapter
|
|
160
|
+
|
|
161
|
+
1. Create `src/table_stitcher/adapters/<parser>.py`.
|
|
162
|
+
2. Implement the two methods above. Read a fragment's pandas DataFrame
|
|
163
|
+
plus its `prov` / layout metadata into `TableMeta`; write a merged
|
|
164
|
+
DataFrame back into the parser's native table type during `inject`.
|
|
165
|
+
3. Add a section to this README documenting version compat, any OCR or
|
|
166
|
+
extraction quirks, and workarounds.
|
|
167
|
+
4. Add a unit test file `tests/test_<parser>_adapter.py` exercising
|
|
168
|
+
`_grid_to_dataframe`-equivalent and `_dataframe_to_*_data`-equivalent
|
|
169
|
+
conversion on stub inputs.
|
|
170
|
+
5. (Optional but valuable) Re-run the integration fixtures through the
|
|
171
|
+
new adapter. Fixtures are parser-agnostic; anything your adapter can
|
|
172
|
+
convert to `TableMeta` with reasonable fidelity will exercise the
|
|
173
|
+
same merger rules as docling.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Adapter package exports.
#
# The Docling adapter is optional: it needs docling-core, so its absence
# degrades to ``DoclingAdapter = None`` (core-only mode) instead of breaking
# the whole package at import time.
from .base import TableStitcherAdapter

try:
    from .docling import DoclingAdapter
except ModuleNotFoundError as e:
    # Only the optional docling-core dependency may be absent; any other
    # missing module raised from inside docling.py is a real problem.
    if "docling_core" in str(e):
        DoclingAdapter = None  # docling-core not installed; use core-only mode
    else:
        raise  # genuine missing dependency inside docling.py — don't swallow

# DoclingAdapter stays in __all__ even when it is None so that
# ``from table_stitcher.adapters import DoclingAdapter`` never fails with
# an ImportError — callers check for None instead.
__all__ = ["TableStitcherAdapter", "DoclingAdapter"]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Adapter protocol for table-stitcher.
|
|
3
|
+
|
|
4
|
+
Any PDF parser can integrate with table-stitcher by implementing these
|
|
5
|
+
two methods. The merge engine only ever sees TableMeta objects — it never
|
|
6
|
+
touches parser-native document structures.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Any, Protocol, runtime_checkable
|
|
10
|
+
|
|
11
|
+
from ..models import LogicalTable, MultiPageConfig, TableMeta
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@runtime_checkable
class TableStitcherAdapter(Protocol):
    """
    Minimal interface a parser must implement to plug into table-stitcher.

    Implement ``extract`` to read table fragments from your document format,
    and ``inject`` to write merged results back.
    """

    # NOTE: @runtime_checkable allows isinstance(obj, TableStitcherAdapter);
    # that check verifies method presence only, not signatures.
    def extract(self, doc: Any, cfg: MultiPageConfig) -> list[TableMeta]:
        """
        Read all table fragments from the parser-native document object.

        Args:
            doc: The parser-native document (opaque to the merge engine).
            cfg: Active configuration the adapter may consult.

        Returns a list of TableMeta, one per table fragment found in the doc.
        The merger engine only ever sees these — it never touches ``doc``.
        """
        ...

    def inject(self, doc: Any, logical_tables: list[LogicalTable]) -> Any:
        """
        Write merged results back into the parser-native document object.

        Receives the original doc and the full list of LogicalTable objects
        (including single-fragment tables that were not merged — the adapter
        decides whether to skip or handle them).

        Returns the (potentially modified) doc.
        """
        ...