table-stitcher 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, Literal, Optional
3
+
4
+ import pandas as pd
5
+
6
+
7
@dataclass
class MultiPageConfig:
    """
    Configuration for multi-page table merging.

    The merger uses three main signals to decide if tables should merge:
    1. Sequential adjacency: Tables must be consecutive in document order
    2. Width matching: Same column count suggests same table structure
    3. Header analysis: Headerless fragments continue the previous table

    Geometry-based signals (vert_top, vert_bottom) are available when the
    parser adapter provides bounding box data.

    Raises:
        ValueError: at construction time, if ``width_overflow_policy`` is not
            one of the four supported policy names.
    """

    # --- Page Adjacency ---
    max_page_gap: int = 1
    """Maximum number of pages between fragments to consider merging."""

    # --- Width Matching ---
    require_same_width: bool = False
    """If True, only merge tables with identical column counts."""

    max_width_difference: int = 4
    """Maximum allowed difference in column count for merging."""

    width_overflow_policy: Literal["preserve_extra", "warn_drop", "fail", "merge_tail"] = (
        "preserve_extra"
    )
    """
    How to handle a continuation fragment with more columns than the anchor.

    - ``preserve_extra``: keep trailing columns as ``_extra_N`` columns.
    - ``warn_drop``: drop trailing columns after logging a warning.
    - ``fail``: raise ``ValueError`` instead of losing data.
    - ``merge_tail``: append trailing cell values into the final canonical cell.
    """

    headerless_width_tolerance: int = 2
    """
    Width-drift tolerance (±N columns) for the headerless-continuation path
    when vertical layout confirms the pages are adjacent-and-stacked.
    Tighter than `max_width_difference` because the layout guard alone
    isn't enough to prevent false merges at large width deltas.
    """

    # --- Similarity Thresholds ---
    header_sim_strict: float = 0.6
    """Jaccard similarity threshold for 'repeated header' detection."""

    header_sim_loose: float = 0.3
    """Lower threshold used when layout hints confirm continuation."""

    row_sim_threshold: float = 0.3
    """Similarity threshold for first-row content matching (fallback)."""

    # --- Geometry/Layout Hints ---
    use_layout_hint: bool = True
    """Whether to use vertical position for merge decisions."""

    bottom_band_min: float = 0.60
    """
    Table A must end at or below this position to be a continuation candidate.
    Uses normalized coordinates: 0 = top of page, 1 = bottom of page.
    Default 0.6 means the table must end within the bottom 40% of the page.
    """

    top_band_max: float = 0.40
    """
    Table B must start at or above this position to be a continuation candidate.
    Uses normalized coordinates: 0 = top of page, 1 = bottom of page.
    Default 0.4 means the table must start within the top 40% of the page.
    """

    # --- Header/Orphan Detection ---
    max_orphan_rows: int = 2
    """Maximum rows for a table to be considered a 'header orphan'."""

    max_data_orphan_rows: int = 5
    """Maximum rows for a table to be considered a 'data orphan'."""

    # --- Spillover Detection ---
    spillover_require_content_check: bool = False
    """
    If True, 1-column fragments must contain URL/ticket patterns to be spillover.
    If False (default), any 1-column headerless fragment is treated as spillover.
    The structural signal (1 col following N cols) is strong enough for most cases.
    """

    # --- Cell Stitching ---
    stitch_separator: str = "\n"
    """Character(s) used to join split cell content."""

    def __post_init__(self) -> None:
        """Reject policy names outside the documented closed set."""
        # The Literal annotation is only checked by static type checkers, so a
        # misspelled policy would otherwise be accepted here without complaint.
        # Kept in sync by hand with the annotation on ``width_overflow_policy``.
        allowed_policies = ("preserve_extra", "warn_drop", "fail", "merge_tail")
        if self.width_overflow_policy not in allowed_policies:
            raise ValueError(
                "width_overflow_policy must be one of "
                f"{', '.join(allowed_policies)}; got {self.width_overflow_policy!r}"
            )
98
+
99
+
100
+ @dataclass
101
+ class TableMeta:
102
+ """Metadata for a single extracted table fragment."""
103
+
104
+ idx: int  # original table index in the source document; used to map results back
105
+ df: pd.DataFrame  # fragment content; the merger stitches rows from this
106
+ start_page: Optional[int]  # first page of the fragment; feeds page-adjacency checks
107
+ pages: list[int]  # page numbers the fragment appears on; feeds page-adjacency checks
108
+ width: int  # column count (README skeleton uses df.shape[1]); primary merge signal
109
+ header_tokens: set[str]  # tokens from the column names; compared by Jaccard similarity for repeated-header detection
110
+ first_row_tokens: set[str]  # tokens from the first row; fallback similarity signal
111
+ raw_columns: list[str]  # column labels as strings
112
+ vert_center: Optional[float]  # vertical centre; set when a bbox is available, else None — presumably normalized 0-1 like vert_top/vert_bottom, confirm
113
+ vert_top: Optional[float]  # normalized 0-1, 0 = top of page; None if no geometry
114
+ vert_bottom: Optional[float]  # normalized 0-1, 1 = bottom of page; None if no geometry
115
+ is_header_orphan: bool  # True if headers-only with no/few data rows
116
+ is_data_orphan: bool  # True if data-only with no real headers
117
+ numeric_like_cols: bool  # README skeleton fills this from is_numeric_like_colnames(raw_columns); presumably flags auto-numbered labels — confirm in merger
118
+ row_count: int  # number of rows (README skeleton uses df.shape[0])
119
+ continuation_content: list[dict] = field(default_factory=list)  # NOTE(review): dict schema not shown here — confirm against merger
120
+ is_headerless: bool = False  # if True, the table is a continuation candidate
121
+
122
+
123
+ @dataclass
124
+ class MergeTrace:
125
+ """Explain one adjacent-table merge decision."""
126
+
127
+ left_idx: int  # presumably TableMeta.idx of the earlier fragment in the pair — confirm
128
+ right_idx: int  # presumably TableMeta.idx of the later fragment in the pair — confirm
129
+ merged: bool  # outcome of the decision for this pair
130
+ reason: str  # explanation of the decision
131
+ signals: dict[str, Any] = field(default_factory=dict)  # NOTE(review): keys not shown here; presumably the signal values behind the decision — confirm
132
+ warnings: list[str] = field(default_factory=list)  # warnings attached to this decision; fresh list per instance
133
+
134
+
135
+ @dataclass
136
+ class LogicalTable:
137
+ """A merged logical table spanning potentially multiple pages."""
138
+
139
+ logical_index: int  # presumably the position of this table in the merger's output — confirm
140
+ members: list[int]  # original table indices; members[0] is the anchor, a single member means nothing was merged
141
+ pages: list[int]  # pages covered by the logical table
142
+ df: pd.DataFrame  # merged table content
143
+ merge_reason: str = ""  # why the fragments merged; empty by default
144
+ merge_traces: list[MergeTrace] = field(default_factory=list)  # per-pair decision records for auditing
145
+ warnings: list[str] = field(default_factory=list)  # lets integrations audit whether any risky alignment happened
File without changes
@@ -0,0 +1,392 @@
1
+ Metadata-Version: 2.4
2
+ Name: table-stitcher
3
+ Version: 0.3.0
4
+ Summary: Reassemble tables split across page boundaries in PDF extraction
5
+ Project-URL: Homepage, https://github.com/pebbleroad/table-stitcher
6
+ Project-URL: Repository, https://github.com/pebbleroad/table-stitcher
7
+ Project-URL: Issues, https://github.com/pebbleroad/table-stitcher/issues
8
+ Author: PebbleRoad
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: docling,extraction,multipage,pdf,tables
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.9
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering
21
+ Classifier: Topic :: Text Processing
22
+ Requires-Python: >=3.9
23
+ Requires-Dist: pandas>=1.5
24
+ Provides-Extra: dev
25
+ Requires-Dist: build>=1.0; extra == 'dev'
26
+ Requires-Dist: docling-core<3,>=2.50; extra == 'dev'
27
+ Requires-Dist: docling<3,>=2.60; extra == 'dev'
28
+ Requires-Dist: pre-commit>=3.0; extra == 'dev'
29
+ Requires-Dist: pytest>=7.0; extra == 'dev'
30
+ Requires-Dist: pyyaml>=6.0; extra == 'dev'
31
+ Requires-Dist: reportlab>=4.0; extra == 'dev'
32
+ Requires-Dist: ruff>=0.6; extra == 'dev'
33
+ Requires-Dist: twine>=5.0; extra == 'dev'
34
+ Provides-Extra: docling
35
+ Requires-Dist: docling-core<3,>=2.50; extra == 'docling'
36
+ Requires-Dist: docling<3,>=2.60; extra == 'docling'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Table Stitcher
40
+
41
+ [![CI](https://github.com/pebbleroad/table-stitcher/actions/workflows/ci.yml/badge.svg)](https://github.com/pebbleroad/table-stitcher/actions/workflows/ci.yml)
42
+ [![PyPI](https://img.shields.io/pypi/v/table-stitcher.svg)](https://pypi.org/project/table-stitcher/)
43
+ [![Python](https://img.shields.io/pypi/pyversions/table-stitcher.svg)](https://pypi.org/project/table-stitcher/)
44
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
45
+ [![Code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
46
+
47
+ Reassemble tables split across page boundaries in PDF extraction.
48
+
49
+ PDF extraction tools often fragment a single logical table into multiple pieces when it spans pages. **Table Stitcher** detects these fragments and merges them back into coherent tables.
50
+
51
+ **Parser-agnostic core** with a clean adapter interface. Ships with a [Docling](https://github.com/DS4SD/docling) adapter out of the box.
52
+
53
+ ---
54
+
55
+ ## What It Fixes
56
+
57
+ - **Data orphans** -- table body continues on the next page without headers
58
+ - **Header orphans** -- headers at the bottom of one page, data on the next
59
+ - **Spillover content** -- URLs or long text cut at page margins, appearing as separate 1-column "tables"
60
+ - **Split cells** -- cell content fragmented across page breaks
61
+ - **Width drift** -- same table extracted with slightly different column counts across pages
62
+ - **Multilingual headers** -- merge rules work on Latin, CJK, Thai, Arabic, Cyrillic, and more — no language model or dictionary required, purely structural signals
63
+
64
+ ## How It Fits in Your Pipeline
65
+
66
+ Table-stitcher is **parser-agnostic at the table-fragment level** — it doesn't parse PDFs, HTML, or anything else. It assumes your upstream pipeline already extracted tables and knows which page each came from.
67
+
68
+ ```
69
+ your parser          adapter.extract()     merger           adapter.inject()     your format
70
+ (Docling, VLM,   ──> List[TableMeta]   ──> List[Logical ──> write merged     ──> (DoclingDocument,
71
+ Camelot, HTML…)                            Table]           results back         HTML, JSON…)
72
+ ```
73
+
74
+ The core engine only ever speaks `TableMeta` — a small dataclass carrying a DataFrame, page numbers, column count, header tokens, and optional vertical-position hints derived from the bbox. It returns `LogicalTable` objects with merged data plus `MergeTrace` explanations for the decisions it made. Your job is a thin adapter with two methods:
75
+
76
+ - `extract(doc, cfg) -> List[TableMeta]` — translate your parser's native table objects into `TableMeta`
77
+ - `inject(doc, logical_tables) -> doc` — write merged results back into your native format
78
+
79
+ Ships with a `DoclingAdapter` out of the box. Writing an `HTMLAdapter`, `CamelotAdapter`, or one for your own pipeline is ~50 lines — see [Writing a Custom Adapter](#writing-a-custom-adapter).
80
+
81
+ ## Installation
82
+
83
+ **From PyPI:**
84
+ ```bash
85
+ pip install table-stitcher[docling] # With Docling support
86
+ pip install table-stitcher # Core only (for custom adapters)
87
+ ```
88
+
89
+ **From source:**
90
+ ```bash
91
+ git clone https://github.com/pebbleroad/table-stitcher.git
92
+ cd table-stitcher
93
+ pip install -e ".[docling]" # Editable install with Docling
94
+ ```
95
+
96
+ ## Quick Start
97
+
98
+ ### Docling (one-liner)
99
+
100
+ ```python
101
+ from docling.document_converter import DocumentConverter
102
+ from table_stitcher import stitch_tables
103
+
104
+ converter = DocumentConverter()
105
+ doc = converter.convert("report.pdf").document
106
+ doc = stitch_tables(doc) # merged tables; ready for
107
+ # export_to_markdown() / HTML / LLM
108
+ ```
109
+
110
+ `stitch_tables()` mutates `doc` in place and returns the same object. If you
111
+ need the pre-stitch original (e.g. for diffing), snapshot first:
112
+
113
+ ```python
114
+ original = doc.model_copy(deep=True)
115
+ doc = stitch_tables(doc)
116
+ ```
117
+
118
+ Tables that aren't merged pass through byte-for-byte — multi-row headers,
119
+ rowspan/colspan, cell bboxes, and prov entries are preserved exactly as
120
+ Docling produced them. Only merged tables get their data rows rebuilt from
121
+ the merged DataFrame; anchor headers are reused verbatim. See
122
+ [Adapter Design Principle: Respect the Incoming Structure](#adapter-design-principle-respect-the-incoming-structure).
123
+
124
+ Runnable end-to-end scripts live in [`examples/`](examples/):
125
+
126
+ - [`basic_pipeline.py`](examples/basic_pipeline.py) — minimal Docling → stitch → markdown export
127
+ - [`system_controller.py`](examples/system_controller.py) — drop-in integration for a larger pipeline
128
+
129
+ ### With Configuration
130
+
131
+ ```python
132
+ from table_stitcher import stitch_tables, MultiPageConfig
133
+
134
+ config = MultiPageConfig(
135
+ max_page_gap=1, # Only merge tables on consecutive pages
136
+ max_width_difference=2, # Column count tolerance
137
+ header_sim_strict=0.6, # Threshold for repeated header detection
138
+ stitch_separator="\n", # Join character for split content
139
+ )
140
+
141
+ doc = stitch_tables(doc, config=config)
142
+ ```
143
+
144
+ ### Custom Parser (adapter pattern)
145
+
146
+ ```python
147
+ from typing import Any, List
148
+ from table_stitcher import TableStitcher, MultiPageConfig, TableMeta, LogicalTable
149
+ from table_stitcher.adapters.base import TableStitcherAdapter
150
+
151
+ class MyParserAdapter:
152
+ def extract(self, doc, cfg: MultiPageConfig) -> List[TableMeta]:
153
+ """Read tables from your document format into TableMeta objects."""
154
+ ...
155
+
156
+ def inject(self, doc, logical_tables: List[LogicalTable]):
157
+ """Write merged results back into your document format."""
158
+ ...
159
+
160
+ stitcher = TableStitcher(adapter=MyParserAdapter())
161
+ doc = stitcher.stitch(doc)
162
+ ```
163
+
164
+ ## How It Works
165
+
166
+ The merge engine uses three principles:
167
+
168
+ ### 1. Sequential Merging
169
+
170
+ A headerless fragment only merges with its immediate predecessor in document order. This prevents false merges between unrelated tables that happen to share column counts.
171
+
172
+ ### 2. Width Matching
173
+
174
+ Same column count = same table structure. This is the primary merge signal.
175
+
176
+ | Fragment A | Fragment B | Decision |
177
+ |---|---|---|
178
+ | 5 columns | 5 columns | Likely same table |
179
+ | 5 columns | 4 columns | Check other signals |
180
+ | 5 columns | 1 column | Spillover detection |
181
+
182
+ When a continuation fragment is wider than the anchor, the default policy is
183
+ data-preserving: extra trailing cells are kept in explicit `_extra_N` columns.
184
+ Use `width_overflow_policy="warn_drop"` for the older lossy behavior,
185
+ `"fail"` when you want strict no-overflow enforcement, or `"merge_tail"` when
186
+ overflow cells should be appended into the final canonical column.
187
+
188
+ ### 3. Spillover Detection
189
+
190
+ A 1-column headerless fragment following a multi-column table is almost certainly content that overflowed from the last cell. It gets stitched back automatically.
191
+
192
+ ## Architecture
193
+
194
+ ```
195
+ table_stitcher/
196
+ __init__.py # Public API: stitch_tables(), extract_table_meta(), TableStitcher
197
+ models.py # MultiPageConfig, TableMeta, MergeTrace, LogicalTable
198
+ merger.py # Core engine (parser-agnostic)
199
+ adapters/
200
+ base.py # TableStitcherAdapter protocol
201
+ docling.py # Docling implementation
202
+ ```
203
+
204
+ The adapter protocol has exactly **two methods**:
205
+
206
+ | Method | Purpose |
207
+ |---|---|
208
+ | `extract(doc, cfg)` | Read table fragments from your document -> `List[TableMeta]` |
209
+ | `inject(doc, logical_tables)` | Write merged results back into your document |
210
+
211
+ The merge engine (`merger.py`) never sees parser-native objects. It works entirely with `TableMeta` (pandas DataFrames + page metadata), and each `LogicalTable` includes `merge_reason`, `merge_traces`, and `warnings` so downstream integrations can audit why fragments merged and whether any risky alignment happened.
212
+
213
+ ### Adapter Design Principle: Respect the Incoming Structure
214
+
215
+ > **Adapters must preserve the native structure of tables they don't modify, and preserve as much native structure as possible for tables they do modify.**
216
+
217
+ `TableMeta` is intentionally lossy — it reduces a rich table (with rowspan, colspan, multi-row headers, cell styles, bboxes) into a pandas DataFrame plus metadata, because the merger only needs that much to make merge decisions.
218
+
219
+ When `inject()` writes results back, the temptation is to rebuild the native structure from the DataFrame alone. **Don't.** That throws away everything `TableMeta` didn't capture.
220
+
221
+ Two rules for `inject()`:
222
+
223
+ 1. **Pass-through unchanged.** If a logical table has only one member (nothing merged), leave the original native table object untouched. Do not round-trip it through the DataFrame.
224
+ 2. **Partial reuse on merge.** For merged tables, reuse the anchor's native structure where possible (e.g. header rows with their spans) and only rebuild the parts the merger actually changed (the data rows, formed by concatenation).
225
+
226
+ The Docling adapter illustrates this: `_dataframe_to_docling_data()` reuses the anchor's original header rows verbatim (preserving rowspan/colspan) and only builds fresh 1x1 cells for the merged data rows. An earlier version rebuilt the entire grid from the DataFrame and destroyed multi-row headers — that was a bug, not a limitation of the architecture.
227
+
228
+ ## Configuration Reference
229
+
230
+ | Parameter | Type | Default | Description |
231
+ |---|---|---|---|
232
+ | `max_page_gap` | int | 1 | Maximum pages between fragments |
233
+ | `require_same_width` | bool | False | Require identical column counts |
234
+ | `max_width_difference` | int | 4 | Column count tolerance |
235
+ | `width_overflow_policy` | str | "preserve_extra" | How to handle continuation fragments wider than the anchor: "preserve_extra", "warn_drop", "fail", or "merge_tail" |
236
+ | `headerless_width_tolerance` | int | 2 | Width-drift tolerance for headerless pairs when layout confirms continuation |
237
+ | `header_sim_strict` | float | 0.6 | Header similarity threshold |
238
+ | `header_sim_loose` | float | 0.3 | Lower threshold (with layout confirmation) |
239
+ | `row_sim_threshold` | float | 0.3 | First-row similarity fallback |
240
+ | `use_layout_hint` | bool | True | Use vertical position signals |
241
+ | `bottom_band_min` | float | 0.6 | Table A must end below this (0=top, 1=bottom) |
242
+ | `top_band_max` | float | 0.4 | Table B must start above this |
243
+ | `spillover_require_content_check` | bool | False | Require URL/ticket patterns for spillover |
244
+ | `stitch_separator` | str | "\n" | Join character for split content |
245
+ | `max_orphan_rows` | int | 2 | Max rows for header orphan classification |
246
+ | `max_data_orphan_rows` | int | 5 | Max rows for data orphan classification |
247
+
248
+ ## Writing a Custom Adapter
249
+
250
+ For the adapter protocol in detail and notes on the Docling adapter's
251
+ version compatibility and known workarounds, see
252
+ [`src/table_stitcher/adapters/README.md`](src/table_stitcher/adapters/README.md).
253
+
254
+ To integrate a new parser, implement two methods. Here's a working skeleton:
255
+
256
+ ```python
257
+ from typing import Any, List
258
+ import pandas as pd
259
+ from table_stitcher import TableStitcher, MultiPageConfig, TableMeta, LogicalTable
260
+ from table_stitcher.adapters.base import TableStitcherAdapter
261
+ from table_stitcher.merger import tokenize, normalize_col_name, is_numeric_like_colnames, first_row_has_number
262
+
263
+ class MyParserAdapter:
264
+ def extract(self, doc: Any, cfg: MultiPageConfig) -> List[TableMeta]:
265
+ tables_meta = []
266
+ for idx, table in enumerate(doc.tables):
267
+ # 1. Convert your table to a DataFrame
268
+ # - First row as header if it looks like headers
269
+ # - Set df.attrs['is_headerless'] = True if no real headers
270
+ df = pd.DataFrame(table.rows, columns=table.headers)
271
+
272
+ # 2. Get page info
273
+ pages = [table.page_number]
274
+ start_page = pages[0]
275
+
276
+ # 3. Tokenize headers for similarity matching
277
+ header_tokens = set()
278
+ for col in df.columns:
279
+ header_tokens |= tokenize(normalize_col_name(col))
280
+
281
+ # 4. Tokenize first row (fallback similarity signal)
282
+ first_row_tokens = set()
283
+ if df.shape[0] > 0:
284
+ first_row_tokens = tokenize(
285
+ " ".join(str(x) for x in df.iloc[0].tolist())
286
+ )
287
+
288
+ # 5. Classify: is_headerless, is_header_orphan, is_data_orphan
289
+ raw_columns = [str(c) for c in df.columns]
290
+ is_headerless = df.attrs.get('is_headerless', False)
291
+
292
+ tables_meta.append(TableMeta(
293
+ idx=idx,
294
+ df=df,
295
+ start_page=start_page,
296
+ pages=pages,
297
+ width=df.shape[1],
298
+ header_tokens=header_tokens,
299
+ first_row_tokens=first_row_tokens,
300
+ raw_columns=raw_columns,
301
+ vert_center=None, # Set if bbox available
302
+ vert_top=None, # Normalized 0-1, 0=top of page
303
+ vert_bottom=None, # Normalized 0-1, 1=bottom of page
304
+ is_header_orphan=False, # True if headers-only, no/few data rows
305
+ is_data_orphan=False, # True if data-only, no real headers
306
+ numeric_like_cols=is_numeric_like_colnames(raw_columns),
307
+ row_count=df.shape[0],
308
+ is_headerless=is_headerless,
309
+ ))
310
+ return tables_meta
311
+
312
+ def inject(self, doc: Any, logical_tables: List[LogicalTable]) -> Any:
313
+ for lt in logical_tables:
314
+ if len(lt.members) <= 1:
315
+ continue # Nothing merged, skip
316
+
317
+ anchor_idx = lt.members[0]
318
+ # Replace the anchor table's data with lt.df
319
+ doc.tables[anchor_idx].data = lt.df
320
+ doc.tables[anchor_idx].pages = lt.pages
321
+
322
+ # Mark or remove satellite tables
323
+ for sat_idx in lt.members[1:]:
324
+ doc.tables[sat_idx].merged_into = anchor_idx
325
+
326
+ return doc
327
+
328
+ # Use it:
329
+ stitcher = TableStitcher(adapter=MyParserAdapter())
330
+ doc = stitcher.stitch(doc)
331
+ ```
332
+
333
+ ### Key `TableMeta` fields the merger relies on
334
+
335
+ | Field | What the merger uses it for |
336
+ |---|---|
337
+ | `idx` | Original table index in `doc.tables` — used for result mapping |
338
+ | `df` | The table content as a DataFrame — used for row stitching |
339
+ | `start_page`, `pages` | Page adjacency checks — must be populated |
340
+ | `width` | Column count matching — primary merge signal |
341
+ | `header_tokens` | Jaccard similarity for repeated-header detection |
342
+ | `is_headerless` | If `True`, table is a continuation candidate |
343
+ | `is_header_orphan` | If `True`, eligible for orphan+data merge |
344
+ | `is_data_orphan` | If `True`, eligible for header+orphan merge |
345
+ | `vert_top`, `vert_bottom` | Layout hints (0-1 normalized) — optional, set to `None` if unavailable |
346
+
347
+ ## Pass-Through Guarantee
348
+
349
+ Table-stitcher follows a **no-data-loss** principle:
350
+
351
+ - If extraction fails for a table, the **original table is preserved unchanged** in the document. It is not removed or modified.
352
+ - If the entire stitching pipeline fails, the **original document is returned as-is**.
353
+ - Tables that don't match any merge criteria pass through untouched.
354
+ - Skipped tables are logged with a count (e.g., `"Extracted 5/7 tables (2 skipped — originals preserved)"`).
355
+
356
+ This means you can safely call `stitch_tables()` on any document — the worst case is that nothing changes, never that data is lost.
357
+
358
+ ## Error Handling
359
+
360
+ ```python
361
+ from table_stitcher import stitch_tables, StitchingError
362
+
363
+ # Default: fails gracefully, returns original doc
364
+ doc = stitch_tables(doc)
365
+
366
+ # Strict: raises on failure
367
+ try:
368
+ doc = stitch_tables(doc, raise_on_error=True)
369
+ except StitchingError as e:
370
+ handle_error(e)
371
+ ```
372
+
373
+ ## Logging
374
+
375
+ ```python
376
+ import logging
377
+ logging.getLogger("table_stitcher").setLevel(logging.INFO)
378
+ ```
379
+
380
+ ## Testing and Contributing
381
+
382
+ - [`tests/README.md`](tests/README.md) — test layout, running instructions, timings, and what the integration harness actually asserts
383
+ - [`CONTRIBUTING.md`](CONTRIBUTING.md) — dev setup, fixture workflow, naming convention, how to regenerate an `expected.yaml` after a merger change
384
+ - [`src/table_stitcher/adapters/README.md`](src/table_stitcher/adapters/README.md) — adapter protocol, the Docling adapter's version compatibility and known workarounds, how to write a new adapter
385
+
386
+ The library ships with a taxonomy-based integration suite: every merge rule
387
+ has at least one fixture exercising it, and every category that surfaced
388
+ a real bug has a fixture pinning the fix.
389
+
390
+ ## License
391
+
392
+ MIT
@@ -0,0 +1,12 @@
1
+ table_stitcher/__init__.py,sha256=5B9oJB3dGKH8DPaySwT-nMc-TltFkbI5wzKAftnnGK4,11909
2
+ table_stitcher/merger.py,sha256=k8kca1Pj3A4u5Ujc213Tw_wr38Mu2K1nPEs0E4zhwN4,36611
3
+ table_stitcher/models.py,sha256=ZUG0iQFxdcsuqCpUFjpA-Gt4qSPxb3f22BX0dKIfw2U,4731
4
+ table_stitcher/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ table_stitcher/adapters/README.md,sha256=l7oQfAxhcJKMKlBGYJD-_hVJ5tndpa0S_ey9LX4U29k,7057
6
+ table_stitcher/adapters/__init__.py,sha256=LCZoEjhJrVOeF1vZHAopM35odv5vWJ-3nbovl2Kyucs,375
7
+ table_stitcher/adapters/base.py,sha256=djEL4FZW0wyT5Z4NUpxXvJtK5_FfZ4rpQKR4gNLfNq0,1409
8
+ table_stitcher/adapters/docling.py,sha256=HcpQy1Nj_FBefWOkieOsbosK93PpK84De1vG04J3EPg,28459
9
+ table_stitcher-0.3.0.dist-info/METADATA,sha256=ISjNDUCTJmeI-gFY5npu1hosYa86WQkzgh_zpih_h48,17900
10
+ table_stitcher-0.3.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
11
+ table_stitcher-0.3.0.dist-info/licenses/LICENSE,sha256=Ppd0sYarG61TjjB832IuEl7k9n92lW5nTvaR9X3ZoiI,1075
12
+ table_stitcher-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 PebbleRoad Pte Ltd
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.