table-stitcher 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,340 @@
1
+ """
2
+ Table Stitcher — Reassemble tables split across page boundaries.
3
+
4
+ A parser-agnostic library that detects and merges table fragments produced
5
+ by PDF extraction tools. Ships with a Docling adapter out of the box.
6
+
7
+ Usage (Docling):
8
+ from table_stitcher import stitch_tables, MultiPageConfig
9
+
10
+ doc = stitch_tables(doc) # Use defaults
11
+ doc = stitch_tables(doc, config=MultiPageConfig(max_page_gap=2))
12
+
13
+ Usage (custom parser):
14
+ from table_stitcher import TableStitcher
15
+ from table_stitcher.adapters.base import TableStitcherAdapter
16
+
17
+ class MyAdapter:
18
+ def extract(self, doc, cfg): ...
19
+ def inject(self, doc, logical_tables): ...
20
+
21
+ stitcher = TableStitcher(adapter=MyAdapter())
22
+ doc = stitcher.stitch(doc)
23
+ """
24
+
25
+ import logging
26
+ import time
27
+ from typing import Any, Optional
28
+
29
+ from .adapters.base import TableStitcherAdapter
30
+ from .merger import merge_multipage_tables
31
+ from .models import LogicalTable, MergeTrace, MultiPageConfig, TableMeta
32
+
33
# Keep in sync with the version of the published distribution (this is the
# 0.3.0 wheel); the previous value still reported the prior release.
__version__ = "0.3.0"
34
# Names exported by ``from table_stitcher import *``.
__all__ = [
    # Functions: the two convenience wrappers defined in this module and the
    # merge routine imported from ``.merger``.
    "stitch_tables",
    "extract_table_meta",
    "merge_multipage_tables",
    # Stitcher class that works with any adapter.
    "TableStitcher",
    # Data models imported from ``.models``.
    "MultiPageConfig",
    "LogicalTable",
    "TableMeta",
    "MergeTrace",
    # Error raised by the stitcher when ``raise_on_error=True``.
    "StitchingError",
    # Adapter protocol imported from ``.adapters.base``.
    "TableStitcherAdapter",
    # Package version string.
    "__version__",
]
47
+
48
+
49
class StitchingError(Exception):
    """Signals that a table-stitching phase could not be completed."""
53
+
54
+
55
class TableStitcher:
    """
    Detects and merges tables split across multiple pages.

    This class is parser-agnostic. Pass any adapter that implements the
    ``TableStitcherAdapter`` protocol (two methods: ``extract`` and ``inject``).

    For simple Docling usage, use the ``stitch_tables()`` function instead.
    """

    def __init__(
        self,
        adapter: "TableStitcherAdapter",
        config: "Optional[MultiPageConfig]" = None,
    ):
        """
        Store the adapter and configuration, then validate the configuration.

        Args:
            adapter: Object providing ``extract(doc, cfg)`` and
                ``inject(doc, logical_tables)``.
            config: Configuration overrides. A default ``MultiPageConfig()``
                is created when omitted.

        Raises:
            ValueError: If any configuration value is out of range.
        """
        self.logger = logging.getLogger("table_stitcher")
        self.adapter = adapter
        self.config = config or MultiPageConfig()
        self._validate_config()

    # -------------------------------------------------------------------------
    # Logging
    # -------------------------------------------------------------------------
    # Messages are passed to the logger with deferred %-style arguments so the
    # string is only built when the INFO/ERROR level is actually enabled.

    def _log_phase_start(self, phase_num: int, phase_name: str):
        """Log a banner announcing the start of a phase."""
        self.logger.info("=" * 70)
        self.logger.info("Phase %s: %s", phase_num, phase_name)
        self.logger.info("=" * 70)

    def _log_phase_complete(
        self, phase_num: int, count: int, duration: float, item_type: str = "items"
    ):
        """Log how many items a phase produced and how long it took (seconds)."""
        self.logger.info(
            "Phase %s complete: %s %s [%.1fs]", phase_num, count, item_type, duration
        )

    def _log_section(self, message: str, indent: int = 2):
        """Log ``message`` prefixed by ``indent`` repetitions of the pad string."""
        prefix = " " * indent
        self.logger.info("%s%s", prefix, message)

    def _log_item_progress(self, item_name: str, status: str = "success"):
        """Log one item with a ``+`` marker on success and ``x`` otherwise."""
        symbol = "+" if status == "success" else "x"
        self.logger.info(" [%s] %s", symbol, item_name)

    def _log_error(self, context: str, error: Exception):
        """Log ``error`` at ERROR level, prefixed by a short ``context`` label."""
        self.logger.error(" [x] %s: %s", context, error)

    # -------------------------------------------------------------------------
    # Validation
    # -------------------------------------------------------------------------

    def _validate_config(self) -> None:
        """
        Validate configuration values.

        All violations are collected first so a single exception reports every
        offending field at once.

        Raises:
            ValueError: If one or more values are outside their allowed range.
        """
        errors = []
        cfg = self.config

        if not (1 <= cfg.max_page_gap <= 10):
            errors.append(f"max_page_gap must be 1-10, got {cfg.max_page_gap}")

        if cfg.max_width_difference < 0:
            errors.append(f"max_width_difference must be >= 0, got {cfg.max_width_difference}")

        valid_width_policies = {"preserve_extra", "warn_drop", "fail", "merge_tail"}
        if cfg.width_overflow_policy not in valid_width_policies:
            errors.append(
                "width_overflow_policy must be one of "
                f"{sorted(valid_width_policies)}, got {cfg.width_overflow_policy!r}"
            )

        # Every similarity / band threshold is checked against the closed
        # interval [0.0, 1.0].
        for name, value in [
            ("header_sim_strict", cfg.header_sim_strict),
            ("header_sim_loose", cfg.header_sim_loose),
            ("row_sim_threshold", cfg.row_sim_threshold),
            ("bottom_band_min", cfg.bottom_band_min),
            ("top_band_max", cfg.top_band_max),
        ]:
            if not (0.0 <= value <= 1.0):
                errors.append(f"{name} must be 0.0-1.0, got {value}")

        if cfg.max_orphan_rows < 0:
            errors.append(f"max_orphan_rows must be >= 0, got {cfg.max_orphan_rows}")
        if cfg.max_data_orphan_rows < 0:
            errors.append(f"max_data_orphan_rows must be >= 0, got {cfg.max_data_orphan_rows}")

        if errors:
            raise ValueError("Invalid MultiPageConfig:\n " + "\n ".join(errors))

    # -------------------------------------------------------------------------
    # Core Processing
    # -------------------------------------------------------------------------

    @staticmethod
    def _count_native_tables(doc: Any) -> int:
        """
        Best-effort size of ``doc.tables`` for the extraction coverage report.

        The document type is adapter-defined, so ``tables`` may be missing,
        ``None``, or something without a length (an iterator, a method, ...).
        The count only feeds a log line, so any failure yields 0 instead of
        aborting the stitch.
        """
        try:
            return len(getattr(doc, "tables", None) or [])
        except Exception:
            # Purely diagnostic value: never let it break best-effort mode.
            return 0

    def stitch(
        self,
        doc: Any,
        raise_on_error: bool = False,
    ) -> Any:
        """
        Detect and merge tables split across multiple pages.

        Runs three phases: extract table metadata through the adapter, compute
        merges with ``merge_multipage_tables``, and inject the result back
        through the adapter. The input document is returned early when it is
        ``None``, when no tables are extracted, or when no logical table spans
        more than one page.

        Args:
            doc: The parser-native document object.
            raise_on_error: If True, raise exceptions on processing errors.
                If False (default), log errors and return original doc.

        Returns:
            The document with merged tables.

        Raises:
            StitchingError: If ``raise_on_error`` is True and the document is
                ``None`` or any phase fails.
        """
        if doc is None:
            if raise_on_error:
                raise StitchingError("Input document is None")
            self.logger.error("Input document is None")
            return doc

        # --- Phase 1: Extract Metadata ---
        self._log_phase_start(1, "Extract Table Metadata")
        # perf_counter is monotonic, so durations stay correct even if the
        # system clock is adjusted while a phase runs.
        phase_start = time.perf_counter()

        try:
            tables_meta = self.adapter.extract(doc, self.config)
        except Exception as e:
            self._log_error("Metadata extraction failed", e)
            if raise_on_error:
                raise StitchingError(f"Failed to extract table metadata: {e}") from e
            return doc

        if not tables_meta:
            self._log_section("No tables found in document")
            self._log_phase_complete(1, 0, time.perf_counter() - phase_start, "tables")
            return doc

        # Report extraction coverage — tables that failed extraction
        # are silently preserved in the original doc (pass-through).
        total_tables = self._count_native_tables(doc)
        if total_tables and len(tables_meta) < total_tables:
            skipped = total_tables - len(tables_meta)
            self._log_section(
                f"Extracted {len(tables_meta)}/{total_tables} tables "
                f"({skipped} skipped — originals preserved)"
            )

        self._log_phase_complete(
            1, len(tables_meta), time.perf_counter() - phase_start, "table fragments extracted"
        )

        # --- Phase 2: Analyze & Merge ---
        self._log_phase_start(2, "Analyze Multi-Page Merges")
        phase_start = time.perf_counter()

        try:
            logical_tables = merge_multipage_tables(tables_meta, self.config)
        except Exception as e:
            self._log_error("Merge analysis failed", e)
            if raise_on_error:
                raise StitchingError(f"Failed to merge tables: {e}") from e
            return doc

        multi_page_tables = [lt for lt in logical_tables if len(lt.pages) > 1]

        if not multi_page_tables:
            self._log_section("No multi-page tables detected")
            self._log_phase_complete(2, 0, time.perf_counter() - phase_start, "merges")
            return doc

        for lt in multi_page_tables:
            reason = f", reason={lt.merge_reason}" if lt.merge_reason else ""
            self._log_section(
                f"Found: Table spanning pages {lt.pages} ({len(lt.members)} fragments{reason})"
            )

        self._log_phase_complete(
            2,
            len(multi_page_tables),
            time.perf_counter() - phase_start,
            "multi-page tables identified",
        )

        # --- Phase 3: Inject Merged Tables ---
        self._log_phase_start(3, "Inject Merged Tables")
        phase_start = time.perf_counter()

        try:
            # The adapter receives every logical table, not only the
            # multi-page ones.
            doc = self.adapter.inject(doc, logical_tables)
        except Exception as e:
            self._log_error("Injection failed", e)
            if raise_on_error:
                raise StitchingError(f"Failed to inject merged tables: {e}") from e
            return doc

        for lt in multi_page_tables:
            self._log_item_progress(f"Merged pages {lt.pages} -> 1 table", "success")

        self._log_phase_complete(
            3, len(multi_page_tables), time.perf_counter() - phase_start, "tables injected"
        )

        return doc
246
+
247
+
248
+ # -----------------------------------------------------------------------------
249
+ # Convenience Function
250
+ # -----------------------------------------------------------------------------
251
+
252
+
253
def stitch_tables(
    doc: Any,
    config: Optional[MultiPageConfig] = None,
    raise_on_error: bool = False,
) -> Any:
    """
    Detect and merge tables split across multiple pages.

    Convenience function that uses the Docling adapter by default.
    For other parsers, use ``TableStitcher`` with a custom adapter.

    Args:
        doc: The input DoclingDocument (already converted from PDF).
        config: Optional configuration overrides. Uses sensible defaults if None.
        raise_on_error: If True, raise exceptions on processing errors.
            If False (default), log errors and return original doc.

    Returns:
        The document with merged tables.

    Raises:
        StitchingError: If raise_on_error=True and processing fails.
        ValueError: If raise_on_error=True and config values are invalid.
            With raise_on_error=False the problem is logged and the original
            document is returned instead.
        ImportError: If the Docling adapter cannot be imported because
            docling-core is missing (raised regardless of raise_on_error).

    Example:
        >>> from docling.document_converter import DocumentConverter
        >>> from table_stitcher import stitch_tables
        >>>
        >>> converter = DocumentConverter()
        >>> doc = converter.convert("report.pdf").document
        >>> doc = stitch_tables(doc)
    """
    # Imported lazily so the package stays importable without the optional
    # docling extra installed.
    try:
        from .adapters.docling import DoclingAdapter
    except ModuleNotFoundError as e:
        if "docling_core" in str(e):
            raise ImportError(
                "The Docling adapter requires docling-core. "
                "Install it with: pip install table-stitcher[docling]"
            ) from e
        raise  # genuine bug inside docling.py — don't mask it

    try:
        stitcher = TableStitcher(adapter=DoclingAdapter(), config=config)
        return stitcher.stitch(doc, raise_on_error=raise_on_error)
    except ValueError as e:
        if raise_on_error:
            raise
        # Include the validation message: without it the caller cannot tell
        # which configuration field was rejected.
        logging.getLogger("table_stitcher").error(
            "Invalid configuration, returning original document: %s", e
        )
        return doc
305
+
306
+
307
def extract_table_meta(
    doc: Any,
    config: Optional[MultiPageConfig] = None,
) -> list:
    """
    Run only the extraction step and return the per-fragment metadata.

    No merging is performed, which makes this handy for inspecting what the
    Docling adapter sees in a document.

    Args:
        doc: The input DoclingDocument (already converted from PDF).
        config: Optional configuration overrides. A default
            ``MultiPageConfig()`` is used when omitted.

    Returns:
        Whatever ``DoclingAdapter().extract`` returns: a list of ``TableMeta``
        objects, one per table fragment.

    Raises:
        ImportError: If the Docling adapter cannot be imported because
            docling-core is missing.

    Example:
        >>> from table_stitcher import extract_table_meta
        >>> metas = extract_table_meta(doc)
        >>> for m in metas:
        ...     print(f"Table {m.idx}: {m.width} cols, page {m.start_page}")
    """
    try:
        from .adapters.docling import DoclingAdapter
    except ModuleNotFoundError as exc:
        if "docling_core" not in str(exc):
            # Some other module is missing: surface the original error.
            raise
        raise ImportError(
            "The Docling adapter requires docling-core. "
            "Install it with: pip install table-stitcher[docling]"
        ) from exc

    effective_cfg = config if config else MultiPageConfig()
    docling_adapter = DoclingAdapter()
    return docling_adapter.extract(doc, effective_cfg)
@@ -0,0 +1,173 @@
1
+ # Adapters
2
+
3
+ `table-stitcher`'s core is parser-agnostic. Adapters bridge a specific
4
+ document parser (docling, pdfplumber, unstructured, …) to the
5
+ `TableMeta` / `LogicalTable` contract the merger operates on.
6
+
7
+ Adapter-specific notes — version compatibility, OCR backend behavior,
8
+ known upstream workarounds — live here rather than in the main README,
9
+ so the top-level docs stay agnostic and each adapter can document its
10
+ own quirks.
11
+
12
+ ## Adapter protocol
13
+
14
+ Any adapter implements two methods:
15
+
16
+ ```python
17
+ class MyAdapter:
18
+ def extract(self, doc, cfg: MultiPageConfig) -> list[TableMeta]:
19
+ """Read tables from the parser's native document type and produce
20
+ TableMeta records for each fragment."""
21
+
22
+ def inject(self, doc, logical_tables: list[LogicalTable]):
23
+ """Write merged tables back into the native document, pruning the
24
+ now-redundant satellite fragments from the body tree and clearing
25
+ their cell content. Return the modified document."""
26
+ ```
27
+
28
+ A skeleton custom adapter appears in the top-level README. The merger
29
+ never imports parser-specific types — it only reads from the `TableMeta`
30
+ fields and writes to a pandas DataFrame, which the adapter then converts
31
+ back into the parser's native table representation during `inject`.
32
+ Each `LogicalTable` also carries `merge_reason`, `merge_traces`, and
33
+ `warnings`; adapters can ignore them, log them, or surface them in their
34
+ native document metadata.
35
+
36
+ ---
37
+
38
+ ## Docling adapter (`docling.py`)
39
+
40
+ ### How the docling adapter prunes satellites
41
+
42
+ When `inject()` folds multiple fragments into one logical table, the
43
+ satellite fragments (members after the anchor) are handled in two places:
44
+
45
+ 1. **Body tree** — references to satellite tables in `doc.body` (and any
46
+ groups) are removed, so rendered output contains only the merged
47
+ anchor.
48
+ 2. **`doc.tables` list** — the `Table` objects at the satellite indices
49
+ are *cleared in place*: `data` becomes an empty `TableData`
50
+ (`num_rows=0, num_cols=0`), `prov` becomes `[]`. The `Table`
51
+ wrapper stays at its list position because docling uses
52
+ position-based `self_ref` strings (`#/tables/N`) — removing entries
53
+ would invalidate every reference that points to a later index.
54
+
55
+ **If your downstream code iterates `doc.tables` directly** (instead of
56
+ traversing the body tree), skip empty-shell tables explicitly:
57
+
58
+ ```python
59
+ for t in doc.tables:
60
+ if t.data and t.data.num_rows > 0:
61
+ ... # real content
62
+ ```
63
+
64
+ For most users the body tree is the right thing to iterate — it already
65
+ reflects the merged view.
66
+
67
+ ### Version compatibility
68
+
69
+ Tested against **docling 2.64, docling-core 2.54**. The project pins a
70
+ compatible range in `pyproject.toml`:
71
+
72
+ ```
73
+ docling>=2.60,<3
74
+ docling-core>=2.50,<3
75
+ ```
76
+
77
+ If you need to test against a dev docling build:
78
+
79
+ ```bash
80
+ pip install -e <path/to/docling-checkout>
81
+ pytest tests/integration/
82
+ ```
83
+
84
+ Breaking changes in a 3.x release will need adapter updates — the adapter
85
+ touches `DoclingDocument`, `TableData`, `TableCell`, and table `prov`
86
+ entries (for page-number and bounding-box info).
87
+
88
+ ### OCR backend
89
+
90
+ Docling auto-selects an OCR engine at runtime based on the host
91
+ (`ocrmac` on Apple Silicon, `easyocr` / `tesserocr` / `rapidocr`
92
+ otherwise). Cell text on image-backed PDFs (e.g. our bundled
93
+ PubTables-v2 fixtures) differs very slightly across backends.
94
+
95
+ The `first_row` / `last_row` assertions in integration fixtures have
96
+ been stable in practice but may flap if the CI host's OCR backend
97
+ differs from the one used to author the YAML. Re-running
98
+ `tests/integration/_tools/regenerate_expected.py` produces a clean
99
+ baseline for the new backend.
100
+
101
+ ### Adapter detection thresholds
102
+
103
+ A few structural constants live at the top of `docling.py` rather than
104
+ in `MultiPageConfig`:
105
+
106
+ ```python
107
+ _MAX_HEADER_CELL_LEN = 30 # header cells typically short; data cells longer
108
+ _DATA_PATTERNS # regex list for "this cell is data, not header"
109
+ _AUTO_COLNAME_RE # "Column_N" / "Unnamed: N" parser placeholders
110
+ ```
111
+
112
+ These are **adapter-intrinsic** — tuning them changes how the adapter
113
+ classifies first rows as header-or-data. User-tunable thresholds
114
+ (page gap, width tolerance, Jaccard cutoffs) live in `MultiPageConfig`
115
+ instead.
116
+
117
+ ### Known upstream workarounds
118
+
119
+ The adapter compensates for a handful of docling extraction patterns
120
+ that produce fragments the merger would otherwise mis-group:
121
+
122
+ - **Data-as-headers** — when docling extracts a page where the real
123
+ header row got collapsed into the first data row, the fragment's
124
+ "column names" look like `['Column_0', 'Am Fds Trgt Dte Rtm 2055',
125
+ '13,085.03']`. The adapter's `_looks_like_data` regex list catches
126
+ comma-grouped decimals, stat ranges (`280 (176, 404)`), and scientific
127
+ notation (`7.0 x 10-7`) as data patterns, and flags the fragment
128
+ `is_headerless=True` so the merger's width-match path handles it.
129
+ An upstream fix that correctly identifies the collapsed header row
130
+ would make this heuristic unnecessary.
131
+
132
+ - **Long-cell first rows** — fragments where the majority of first-row
133
+ cells are >30 chars are flagged headerless too. Real headers are
134
+ typically short; a row of sentence-long strings is almost certainly
135
+ data.
136
+
137
+ - **Orphan-header fragments with truncated width** — when a new table's
138
+ header row has empty trailing cells that the parser drops, the
139
+ fragment's width is less than its data continuation's. The adapter
140
+ flags these as `is_header_orphan` structurally (small + header-shaped
141
+ cells, no data patterns), and the merger's "header-orphan → headerless
142
+ data" path trusts the data fragment's width on the join.
143
+
144
+ These workarounds are structural, not vocabulary-based — they reason
145
+ about cell shapes (length, regex patterns, auto-label form) rather than
146
+ specific words, so they generalize across domains and languages.
147
+
148
+ ### Layout data availability
149
+
150
+ The merger's layout-confirmation rules (`bottom_band_min`, `top_band_max`,
151
+ width-drift tolerance) rely on `vert_top` / `vert_bottom` from the
152
+ adapter. Docling provides these via `prov[*].bbox` on `DoclingDocument`
153
+ tables. If a document lacks layout info (e.g. a text-only extraction),
154
+ the merger falls back to structural signals only and the layout-gated
155
+ rules don't fire.
156
+
157
+ ---
158
+
159
+ ## Adding a new adapter
160
+
161
+ 1. Create `src/table_stitcher/adapters/<parser>.py`.
162
+ 2. Implement the two methods above. Read a fragment's pandas DataFrame
163
+ plus its `prov` / layout metadata into `TableMeta`; write a merged
164
+ DataFrame back into the parser's native table type during `inject`.
165
+ 3. Add a section to this README documenting version compat, any OCR or
166
+ extraction quirks, and workarounds.
167
+ 4. Add a unit test file `tests/test_<parser>_adapter.py` exercising
168
+ `_grid_to_dataframe`-equivalent and `_dataframe_to_*_data`-equivalent
169
+ conversion on stub inputs.
170
+ 5. (Optional but valuable) Re-run the integration fixtures through the
171
+ new adapter. Fixtures are parser-agnostic; anything your adapter can
172
+ convert to `TableMeta` with reasonable fidelity will exercise the
173
+ same merger rules as docling.
@@ -0,0 +1,11 @@
1
+ from .base import TableStitcherAdapter
2
+
3
try:
    from .docling import DoclingAdapter
except ModuleNotFoundError as exc:
    if "docling_core" not in str(exc):
        # A different module is missing inside docling.py: that is a genuine
        # problem, so let it propagate instead of hiding it.
        raise
    # docling-core is not installed: expose a None placeholder so the package
    # still imports in core-only mode.
    DoclingAdapter = None

__all__ = ["TableStitcherAdapter", "DoclingAdapter"]
@@ -0,0 +1,42 @@
1
+ """
2
+ Adapter protocol for table-stitcher.
3
+
4
+ Any PDF parser can integrate with table-stitcher by implementing these
5
+ two methods. The merge engine only ever sees TableMeta objects — it never
6
+ touches parser-native document structures.
7
+ """
8
+
9
+ from typing import Any, Protocol, runtime_checkable
10
+
11
+ from ..models import LogicalTable, MultiPageConfig, TableMeta
12
+
13
+
14
@runtime_checkable
class TableStitcherAdapter(Protocol):
    """
    Structural interface between a document parser and table-stitcher.

    An adapter supplies two methods: ``extract`` turns the parser's document
    into ``TableMeta`` records, and ``inject`` writes the merged
    ``LogicalTable`` results back into that document.

    Because the protocol is ``runtime_checkable``, ``isinstance`` checks only
    confirm that both methods exist; their signatures are not verified.
    """

    def extract(self, doc: Any, cfg: MultiPageConfig) -> list[TableMeta]:
        """
        Collect every table fragment from the parser-native document.

        Args:
            doc: The parser-native document object.
            cfg: The active stitching configuration.

        Returns:
            One ``TableMeta`` per fragment found in ``doc``. These records are
            all the merge engine works with; it never inspects ``doc`` itself.
        """
        ...

    def inject(self, doc: Any, logical_tables: list[LogicalTable]) -> Any:
        """
        Apply the merge results to the parser-native document.

        Args:
            doc: The original document object.
            logical_tables: The complete list of ``LogicalTable`` objects.
                Tables made of a single, unmerged fragment are included too,
                and the adapter chooses whether to skip or process them.

        Returns:
            The document, which may have been modified.
        """
        ...