table-stitcher 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table_stitcher/__init__.py +340 -0
- table_stitcher/adapters/README.md +173 -0
- table_stitcher/adapters/__init__.py +11 -0
- table_stitcher/adapters/base.py +42 -0
- table_stitcher/adapters/docling.py +797 -0
- table_stitcher/merger.py +979 -0
- table_stitcher/models.py +145 -0
- table_stitcher/py.typed +0 -0
- table_stitcher-0.3.0.dist-info/METADATA +392 -0
- table_stitcher-0.3.0.dist-info/RECORD +12 -0
- table_stitcher-0.3.0.dist-info/WHEEL +4 -0
- table_stitcher-0.3.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,797 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Docling adapter for table-stitcher.
|
|
3
|
+
|
|
4
|
+
Reads tables from a DoclingDocument and writes merged results back.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import copy
|
|
8
|
+
import logging
|
|
9
|
+
import re
|
|
10
|
+
from typing import Any, Optional
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
from docling_core.types.doc import (
|
|
14
|
+
DoclingDocument,
|
|
15
|
+
TableCell,
|
|
16
|
+
TableData,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
from ..merger import (
|
|
20
|
+
first_row_has_number,
|
|
21
|
+
is_numeric_like_colnames,
|
|
22
|
+
normalize_col_name,
|
|
23
|
+
tokenize,
|
|
24
|
+
)
|
|
25
|
+
from ..models import (
|
|
26
|
+
LogicalTable,
|
|
27
|
+
MultiPageConfig,
|
|
28
|
+
TableMeta,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
log = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# -------------------------------------------------------------------
# Cell-shape heuristics (used for both headerless detection and
# structural header-orphan detection — shared so the two checks stay
# consistent).
# -------------------------------------------------------------------

# Patterns a cell matches when it looks like data rather than a header.
# All are anchored at the start of the cell and compiled case-insensitively.
_DATA_PATTERNS = [
    re.compile(p, re.IGNORECASE)
    for p in [
        r"^\d+$",  # bare integer
        r"^\d+\.\d+$",  # simple float
        r"^\d{1,2}/\d{1,2}",  # slash date prefix: "12/31"
        r"^\d{1,2}-\d{1,2}",  # dash date prefix: "12-31"
        r"^https?://",  # URL
        r"^[A-Z]+-\d+$",  # ticket-style ID: "ABC-123"
        r"^\$[\d,]+",  # currency
        r"^[\d,]+\s*%$",  # percentage
        r"^Row\s*\d+",  # literal row label
        r"^\d+\.\d+\.\d+",  # version-like: "1.2.3"
        r"^[\d,]+\.\d+$",  # financial: "13,085.03"
        r"^[\d,]+$",  # grouped integer: "1,234,567"
        r"^\d+\.?\d*\s*\([\d,\s.]+\)",  # stat with range: "280 (176, 404)"
        r"^\d+\.?\d*\s*[xX×]\s*10",  # scientific: "7.0 x 10-7"
    ]
]

# Auto-generated column labels ("Column_0", "Unnamed: 3") produced by this
# adapter or by pandas — never evidence of a real header.
_AUTO_COLNAME_RE = re.compile(r"^(column|unnamed)[_:]?\s*\d+$", re.IGNORECASE)

# A cell is "header-shaped" when it's short, alphabetic-ish, and contains
# no data patterns. Used as the structural signal for orphan detection —
# no domain vocabulary involved.
_MAX_HEADER_CELL_LEN = 30
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _looks_like_data(cell: str) -> bool:
    """Return True when the cell's text matches any data-shaped pattern."""
    text = str(cell).strip()
    if not text:
        # Empty cells carry no shape information.
        return False
    for pattern in _DATA_PATTERNS:
        if pattern.search(text):
            return True
    return False
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _data_subshape(cell: str) -> Optional[str]:
    """
    Subcategory within "data-shaped". None if the cell isn't data-shaped.
    Lets us tell whether row 1 and the body share the same flavour of data —
    used to recover headers like ``[2020, 2021, 2022, 2023]`` whose cells
    individually match a data pattern but whose row uniformly contrasts
    with the body's shape.
    """
    text = str(cell).strip()
    if not text or not _looks_like_data(text):
        return None

    # Ordered checks: first match wins (date must precede bare_int, etc.).
    ordered_checks: list[tuple[bool, str]] = [
        (bool(re.match(r"^https?://", text, re.IGNORECASE)), "url"),
        (text.startswith("$"), "currency"),
        (text.endswith("%"), "percent"),
        (bool(re.match(r"^\d{1,2}[/-]\d{1,2}", text)), "date_like"),
        (bool(re.match(r"^\d+$", text)), "bare_int"),
        (bool(re.match(r"^[\d,]+$", text)), "grouped_int"),
        (
            bool(re.match(r"^\d+\.\d+$", text) or re.match(r"^[\d,]+\.\d+$", text)),
            "float",
        ),
    ]
    for matched, label in ordered_checks:
        if matched:
            return label
    return "other_data"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _first_row_is_header_by_contrast(rows: list[list[str]]) -> bool:
    """
    True when row 1 is uniformly one data subshape but the body rows are
    consistently a different subshape — meaning row 1 is a column-axis label
    (years, ordinals) rather than data, even though its cells individually
    match data patterns.

    Universal structural rule: a header row is structurally distinct from
    the body. We only assert it when the contrast is unambiguous — row 1
    has one shape, body has a different one — to avoid promoting genuine
    data rows (e.g. lottery numbers, IDs) where row 1 and body share shape.
    """
    if len(rows) < 2:
        return False

    first_row_cells = [str(c).strip() for c in rows[0]]
    first_row_cells = [c for c in first_row_cells if c]
    if len(first_row_cells) < 2:
        return False

    first_row_shapes = {_data_subshape(c) for c in first_row_cells}
    if len(first_row_shapes) != 1 or None in first_row_shapes:
        # Mixed or non-data shapes in row 1: no unambiguous contrast.
        return False
    header_shape = first_row_shapes.pop()

    # Sample at most the next two rows as the "body".
    body_shapes: list[Optional[str]] = [
        _data_subshape(str(c).strip())
        for body_row in rows[1:3]
        for c in body_row
        if str(c).strip()
    ]
    if not body_shapes:
        return False

    mismatching = sum(shape != header_shape for shape in body_shapes)
    return mismatching / len(body_shapes) >= 0.6
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _is_header_shaped_cell(cell: str) -> bool:
    """True if cell is plausibly a header cell — short, not data, not auto-label."""
    text = str(cell).strip()
    if not text:
        return True  # empty cells coexist with header cells
    too_long = len(text) > _MAX_HEADER_CELL_LEN
    auto_label = bool(_AUTO_COLNAME_RE.match(text))
    return not (too_long or auto_label or _looks_like_data(text))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _detect_header_orphan(df: pd.DataFrame, is_headerless: bool, max_orphan_rows: int) -> bool:
    """
    Structural rule: a fragment is a header orphan when it's small,
    its first row was treated as a header (not promoted from data), and
    any data rows present look header-shaped too (no data patterns, short).

    Column names themselves are only screened for data patterns and
    auto-label form — NOT for length, because legitimate headers can be
    phrase-long (e.g. "Average annual revenue per customer"). Data rows,
    however, must be short AND non-data to qualify as header-shaped.

    No vocabulary is consulted — universal across domains and languages.
    """
    if is_headerless or df.shape[0] > max_orphan_rows:
        return False

    column_names = [str(c) for c in df.columns]

    # At least one meaningful column — not all empty / all auto-labels.
    has_meaningful = any(
        name.strip() and not _AUTO_COLNAME_RE.match(name) for name in column_names
    )
    if not has_meaningful:
        return False

    # Columns must not contain data patterns (numbers, currency, ranges)
    # — UNLESS the columns form a uniform data subshape (e.g. all years,
    # all ordinals), which indicates a column-axis header rather than data.
    if any(_looks_like_data(name) for name in column_names):
        populated = [name for name in column_names if name.strip()]
        shapes = {_data_subshape(name) for name in populated}
        uniform = len(populated) >= 2 and len(shapes) == 1 and None not in shapes
        if not uniform:
            return False

    # Data rows (if any) must be header-shaped: short, non-data, non-auto.
    # A long or data-shaped value in a data row means this fragment carries
    # real data, not just orphaned header content.
    for _, data_row in df.iterrows():
        if any(not _is_header_shaped_cell(value) for value in data_row.tolist()):
            return False

    return True
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
# -------------------------------------------------------------------
|
|
199
|
+
# Docling-specific helpers (bbox / provenance)
|
|
200
|
+
# -------------------------------------------------------------------
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _extract_y_bounds_from_prov(prov_list: list[Any]) -> Optional[tuple[float, float, str]]:
|
|
204
|
+
"""
|
|
205
|
+
Extract vertical bounds from Docling provenance data.
|
|
206
|
+
|
|
207
|
+
Returns: (y_min, y_max, coord_origin) or None if not available.
|
|
208
|
+
"""
|
|
209
|
+
for p in prov_list:
|
|
210
|
+
bbox = getattr(p, "bbox", None)
|
|
211
|
+
if bbox is None:
|
|
212
|
+
continue
|
|
213
|
+
|
|
214
|
+
t = getattr(bbox, "t", None)
|
|
215
|
+
b = getattr(bbox, "b", None)
|
|
216
|
+
|
|
217
|
+
if t is not None and b is not None:
|
|
218
|
+
coord_origin = getattr(bbox, "coord_origin", None)
|
|
219
|
+
origin_str = str(coord_origin) if coord_origin else "BOTTOMLEFT"
|
|
220
|
+
return (float(b), float(t), origin_str)
|
|
221
|
+
|
|
222
|
+
return None
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _resolve_page_height(prov_list: list[Any], doc: Any, fallback: float = 842.0) -> float:
|
|
226
|
+
"""
|
|
227
|
+
Look up the actual page height for the first prov entry from the document.
|
|
228
|
+
Falls back to A4 (842pt) only when the document does not expose a size.
|
|
229
|
+
"""
|
|
230
|
+
pages = getattr(doc, "pages", None)
|
|
231
|
+
if not pages:
|
|
232
|
+
return fallback
|
|
233
|
+
for p in prov_list:
|
|
234
|
+
page_no = getattr(p, "page_no", None)
|
|
235
|
+
if page_no is None:
|
|
236
|
+
continue
|
|
237
|
+
page_item = pages.get(page_no) if hasattr(pages, "get") else None
|
|
238
|
+
size = getattr(page_item, "size", None) if page_item else None
|
|
239
|
+
height = getattr(size, "height", None) if size else None
|
|
240
|
+
if height:
|
|
241
|
+
return float(height)
|
|
242
|
+
return fallback
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _compute_vertical_positions(
    prov_list: list[Any],
    page_height: float = 842.0,
) -> tuple[Optional[float], Optional[float], Optional[float]]:
    """
    Compute normalized vertical positions (0-1 scale, top=0, bottom=1).

    Caller should pass the actual page height for the page in question;
    the default of 842.0 (A4) is only a safety net for missing metadata.
    Returns (center, top, bottom), or (None, None, None) when no bounds
    can be extracted from the provenance list.
    """
    bounds = _extract_y_bounds_from_prov(prov_list)
    if bounds is None:
        return None, None, None

    y_min, y_max, origin = bounds

    if "BOTTOMLEFT" in origin.upper():
        # Bottom-left origin: flip so 0 is the top of the page. Grow the
        # page height when a bbox overflows the nominal value.
        if y_max > page_height:
            page_height = max(y_max * 1.1, page_height)
        top_frac = 1.0 - y_max / page_height
        bottom_frac = 1.0 - y_min / page_height
    else:
        top_frac = y_max / page_height
        bottom_frac = y_min / page_height

    # Clamp both fractions into [0, 1].
    top_frac = min(1.0, max(0.0, top_frac))
    bottom_frac = min(1.0, max(0.0, bottom_frac))

    center_frac = (top_frac + bottom_frac) / 2.0
    return center_frac, top_frac, bottom_frac
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# -------------------------------------------------------------------
|
|
278
|
+
# Grid-to-DataFrame conversion (Docling-specific)
|
|
279
|
+
# -------------------------------------------------------------------
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame:
    """
    Convert Docling table grid to DataFrame with intelligent header detection.

    Several independent structural signals vote on whether the grid's first
    row is a real header (data-shaped cells, URLs, sparseness, repeated or
    placeholder values, majority-long cells). If any signal fires, the table
    is treated as headerless and synthetic ``Column_N`` names are used.
    The verdict and any captured pre-header rows are stashed on
    ``df.attrs["is_headerless"]`` / ``df.attrs["pre_header_rows"]`` for
    downstream merge logic.
    """
    # Fall back to Docling's own exporter when the grid isn't accessible.
    if not hasattr(table, "data") or not table.data or not hasattr(table.data, "grid"):
        return table.export_to_dataframe(doc=doc)

    grid = table.data.grid
    if not grid:
        return pd.DataFrame()

    # Flatten grid cells to their text; empty string for missing cells.
    # NOTE(review): assumes a present cell's .text is a string — confirm
    # against docling-core's TableCell model.
    all_rows = []
    for row in grid:
        row_data = [getattr(cell, "text", str(cell)) if cell else "" for cell in row]
        all_rows.append(row_data)

    if not all_rows:
        return pd.DataFrame()

    # Drop rows that are entirely blank.
    real_content_rows = [r for r in all_rows if any(c.strip() for c in r)]

    if not real_content_rows:
        return pd.DataFrame(columns=[f"Column_{i}" for i in range(len(all_rows[0]))])

    first_row = real_content_rows[0]
    num_cols = len(first_row)

    # Determine if first row is header or data — uses module-level
    # _looks_like_data and _is_header_shaped_cell to stay consistent with
    # structural orphan detection below.
    has_data_values = any(_looks_like_data(c) for c in first_row)
    # A uniformly data-shaped row 1 contradicted by a different-shaped body
    # is a column-axis header (years, ordinals), not data.
    if has_data_values and _first_row_is_header_by_contrast(real_content_rows):
        has_data_values = False
    has_url = any("http" in str(c).lower() for c in first_row)

    # Repeated values or known placeholder tokens in row 1 suggest data,
    # not a header. Only evaluated when there are enough cells to be
    # statistically meaningful (>= 3 non-empty).
    non_empty_vals = [str(c).strip().upper() for c in first_row if str(c).strip()]
    if len(non_empty_vals) >= 3:
        unique_vals = set(non_empty_vals)
        repetition_ratio = len(unique_vals) / len(non_empty_vals)
        has_repeated_values = repetition_ratio < 0.5
        placeholder_vals = {"DATA", "N/A", "NA", "NULL", "-", "0", "TBD", "NONE", "YES", "NO"}
        has_placeholders = len(unique_vals & placeholder_vals) > 0
    else:
        has_repeated_values = False
        has_placeholders = False

    # Real headers are typically short (≤30 chars); a majority of long cells
    # in the "header" row usually means we're looking at a data row whose
    # true header was eaten by the parser on this page.
    non_empty_cells = [str(c).strip() for c in first_row if str(c).strip()]
    long_cells = sum(1 for c in non_empty_cells if len(c) > 30)
    has_long_cells = bool(non_empty_cells) and long_cells / len(non_empty_cells) >= 0.5

    # Sparse row 1 with an empty first cell: likely a continuation fragment
    # whose leading cells spilled over from the previous page.
    non_empty_count = sum(1 for v in first_row if v and v.strip())
    is_sparse = (non_empty_count < num_cols / 2) and (not first_row[0].strip())

    is_headerless = False

    if (
        has_data_values
        or has_url
        or is_sparse
        or has_repeated_values
        or has_placeholders
        or has_long_cells
    ):
        is_headerless = True
        header = [f"Column_{i}" for i in range(num_cols)]

        # Sparse first rows are preserved separately as "pre-header" content
        # so the merger can reattach the spilled values; otherwise every row
        # is data.
        if is_sparse and len(real_content_rows) > 1:
            pre_header_rows = [first_row]
            data_rows = real_content_rows[1:]
        else:
            pre_header_rows = []
            data_rows = real_content_rows
    else:
        is_headerless = False
        pre_header_rows = []
        header = first_row
        data_rows = real_content_rows[1:]

    # Clean header names: collapse pandas-style duplicated names
    # ("Name.Name" -> "Name") and fill blanks with synthetic labels.
    clean_header = []
    for h in header:
        h_str = str(h).strip()
        if "." in h_str:
            parts = h_str.split(".")
            if len(parts) == 2 and parts[0] == parts[1]:
                h_str = parts[0]
        clean_header.append(h_str if h_str else f"Column_{len(clean_header)}")

    if data_rows:
        # Pad/truncate every data row to exactly the header width.
        normalized_rows = []
        for row in data_rows:
            row_copy = list(row)
            while len(row_copy) < len(clean_header):
                row_copy.append("")
            normalized_rows.append(row_copy[: len(clean_header)])
        df = pd.DataFrame(normalized_rows, columns=clean_header)
    else:
        df = pd.DataFrame(columns=clean_header)

    # Side-channel metadata consumed by DoclingAdapter.extract().
    df.attrs["pre_header_rows"] = pre_header_rows
    df.attrs["is_headerless"] = is_headerless
    return df
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
# -------------------------------------------------------------------
|
|
391
|
+
# DataFrame → Docling TableData conversion
|
|
392
|
+
# -------------------------------------------------------------------
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _extract_original_header_rows(
    original_data: Optional[TableData],
) -> tuple[list[list[TableCell]], list[TableCell]]:
    """
    Extract header rows from the anchor table's original grid.

    Returns (header_grid_rows, flat_header_cells).
    If the original data has multi-row headers with rowspan/colspan,
    they are preserved exactly as-is.
    """
    if not original_data or not original_data.grid:
        return [], []

    grid_rows: list[list[TableCell]] = []
    flat_cells: list[TableCell] = []

    for grid_row in original_data.grid:
        row_is_header = bool(grid_row) and any(
            getattr(cell, "column_header", False) for cell in grid_row if cell
        )
        if not row_is_header:
            break  # first non-header row = end of header
        grid_rows.append(grid_row)
        flat_cells.extend(cell for cell in grid_row if cell)

    return grid_rows, flat_cells
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def _dataframe_to_docling_data(
    df: pd.DataFrame,
    original_data: Optional[TableData] = None,
) -> TableData:
    """
    Converts a pandas DataFrame back into Docling's TableData structure.

    When ``original_data`` is provided and contains multi-row header rows
    (cells with ``column_header=True``, rowspan, colspan), those header rows
    are preserved exactly. Only the data rows are rebuilt from the DataFrame.
    This prevents the lossy roundtrip that would flatten complex headers into
    simple 1x1 cells.
    """
    # Empty DataFrame: emit a header-only TableData so Docling still has a
    # structurally valid (1-row) table.
    if df.empty:
        cols = list(df.columns) if len(df.columns) > 0 else ["Column_0"]
        header_cells = []
        for j, col_name in enumerate(cols):
            cell = TableCell(
                text=str(col_name) if col_name is not None else "",
                row_span=1,
                col_span=1,
                column_header=True,
                row_header=False,
                start_row_offset_idx=0,
                end_row_offset_idx=1,
                start_col_offset_idx=j,
                end_col_offset_idx=j + 1,
            )
            header_cells.append(cell)
        return TableData(
            num_rows=1,
            num_cols=len(cols),
            table_cells=header_cells,
            grid=[header_cells],
        )

    # --- Try to reuse original header rows (preserves rowspan/colspan) ---
    orig_header_rows, orig_header_cells = _extract_original_header_rows(original_data)

    num_cols = len(df.columns)

    if orig_header_rows:
        # Use original header rows as-is
        num_header_rows = len(orig_header_rows)
        grid: list[list[TableCell]] = list(orig_header_rows)
        table_cells: list[TableCell] = list(orig_header_cells)
    else:
        # Fall back to building flat 1x1 header from DataFrame columns
        num_header_rows = 1
        grid = []
        table_cells = []

        header_row_cells = []
        for j, col_name in enumerate(df.columns):
            cell = TableCell(
                text=str(col_name) if col_name is not None else "",
                row_span=1,
                col_span=1,
                column_header=True,
                row_header=False,
                start_row_offset_idx=0,
                end_row_offset_idx=1,
                start_col_offset_idx=j,
                end_col_offset_idx=j + 1,
            )
            header_row_cells.append(cell)
            table_cells.append(cell)

        grid.append(header_row_cells)

    # --- Detect row_header styling from original data ---
    # If the original table marked its first column as row headers, carry
    # that styling over to the rebuilt data rows.
    has_row_headers = False
    if original_data and original_data.grid:
        for row in original_data.grid[num_header_rows:]:
            if row and len(row) > 0 and row[0]:
                if getattr(row[0], "row_header", False):
                    has_row_headers = True
                    break

    # --- Build data rows from merged DataFrame ---
    # Every rebuilt cell is a simple 1x1 cell; offsets are absolute row
    # indices into the final grid (header rows included).
    for i, (_, row) in enumerate(df.iterrows()):
        grid_row: list[TableCell] = []
        table_row_idx = num_header_rows + i

        for j, val in enumerate(row):
            if pd.isna(val) or val is None:
                text_val = ""
            else:
                text_val = str(val)

            row_header = j == 0 and has_row_headers

            cell = TableCell(
                text=text_val,
                row_span=1,
                col_span=1,
                column_header=False,
                row_header=row_header,
                start_row_offset_idx=table_row_idx,
                end_row_offset_idx=table_row_idx + 1,
                start_col_offset_idx=j,
                end_col_offset_idx=j + 1,
            )
            grid_row.append(cell)
            table_cells.append(cell)

        grid.append(grid_row)

    num_total_rows = num_header_rows + len(df)

    return TableData(num_rows=num_total_rows, num_cols=num_cols, table_cells=table_cells, grid=grid)
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
# -------------------------------------------------------------------
|
|
535
|
+
# Reference pointer helper
|
|
536
|
+
# -------------------------------------------------------------------
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def _get_ref_pointer(ref_obj: Any) -> str:
|
|
540
|
+
"""Safely extract the string pointer (e.g., '#/tables/1') from a Ref object."""
|
|
541
|
+
if hasattr(ref_obj, "ref"):
|
|
542
|
+
return ref_obj.ref
|
|
543
|
+
|
|
544
|
+
if hasattr(ref_obj, "model_dump"):
|
|
545
|
+
data = ref_obj.model_dump(by_alias=True)
|
|
546
|
+
return data.get("$ref", "")
|
|
547
|
+
|
|
548
|
+
if isinstance(ref_obj, dict):
|
|
549
|
+
return ref_obj.get("$ref", "")
|
|
550
|
+
|
|
551
|
+
return ""
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
# -------------------------------------------------------------------
|
|
555
|
+
# DoclingAdapter
|
|
556
|
+
# -------------------------------------------------------------------
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
class DoclingAdapter:
    """
    Table-stitcher adapter for Docling (docling-core).

    Reads tables from a ``DoclingDocument`` and writes merged results back.
    """

    def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta]:
        """Extract metadata from all tables in a DoclingDocument.

        Tables whose grid-to-DataFrame conversion raises are skipped (with a
        warning) and therefore left untouched by any later injection.
        """
        tables_meta: list[TableMeta] = []
        total = len(doc.tables)
        skipped = 0

        for idx, table in enumerate(doc.tables):
            try:
                df = _grid_to_dataframe(table, doc)
            except Exception as e:
                log.warning(
                    f"Skipping table {idx}/{total}: extraction failed ({e}). "
                    "Original table will be preserved unchanged."
                )
                skipped += 1
                continue

            # Side-channel metadata produced by _grid_to_dataframe.
            continuation_content = []
            pre_header_rows = df.attrs.get("pre_header_rows", [])
            is_headerless = df.attrs.get("is_headerless", False)

            # Flatten pre-header (spilled continuation) cells into
            # (column index, value) records for the merger.
            if pre_header_rows:
                for row in pre_header_rows:
                    non_empty = [(i, v) for i, v in enumerate(row) if v and v.strip()]
                    for col_idx, val in non_empty:
                        continuation_content.append({"col_idx": col_idx, "value": val})

            prov = getattr(table, "prov", None) or []
            pages = sorted({p.page_no for p in prov}) if prov else []
            start_page = pages[0] if pages else None

            # Token sets used by the merger's header-similarity scoring.
            header_tokens: set[str] = set()
            for col in df.columns:
                header_tokens |= tokenize(normalize_col_name(col))

            first_row_tokens: set[str] = set()
            if df.shape[0] > 0:
                row_text = " ".join(str(x) for x in df.iloc[0].tolist())
                first_row_tokens = tokenize(row_text)

            # Optional layout hint: normalized vertical position on the page.
            vert_center, vert_top, vert_bottom = None, None, None
            if cfg.use_layout_hint and prov:
                page_height = _resolve_page_height(prov, doc)
                vert_center, vert_top, vert_bottom = _compute_vertical_positions(
                    prov, page_height=page_height
                )

            raw_columns = [str(c) for c in df.columns]
            numeric_like_cols = is_numeric_like_colnames(raw_columns)

            is_header_orphan = _detect_header_orphan(df, is_headerless, cfg.max_orphan_rows)

            # A small fragment whose first row already contains numbers is a
            # candidate "data orphan" (rows split off from a previous page).
            is_data_orphan = (
                df.shape[0] > 0
                and df.shape[0] <= cfg.max_data_orphan_rows
                and first_row_has_number(df)
            )

            tables_meta.append(
                TableMeta(
                    idx=idx,
                    df=df,
                    start_page=start_page,
                    pages=pages,
                    width=df.shape[1],
                    header_tokens=header_tokens,
                    first_row_tokens=first_row_tokens,
                    raw_columns=raw_columns,
                    vert_center=vert_center,
                    vert_top=vert_top,
                    vert_bottom=vert_bottom,
                    is_header_orphan=is_header_orphan,
                    is_data_orphan=is_data_orphan,
                    numeric_like_cols=numeric_like_cols,
                    row_count=df.shape[0],
                    continuation_content=continuation_content,
                    is_headerless=is_headerless,
                )
            )

        if skipped:
            log.warning(
                f"Extracted {len(tables_meta)}/{total} tables "
                f"({skipped} skipped — originals preserved)"
            )

        return tables_meta

    def inject(self, doc: DoclingDocument, logical_tables: list[LogicalTable]) -> DoclingDocument:
        """
        Modify the DoclingDocument in-place with merged table data.

        Only modifies tables that were actually merged (multiple fragments).
        Single-page tables retain their original Docling structure.
        If injection fails, fields modified by this adapter are restored before
        the exception is re-raised so callers using ``raise_on_error=False`` do
        not receive a half-stitched document.
        """
        log.info("Starting DoclingDocument injection...")

        refs_to_remove: set[str] = set()
        # Snapshot every field this method mutates, so a mid-injection
        # failure can be rolled back to a consistent document.
        table_snapshots = {
            idx: {
                "data": getattr(table, "data", None),
                "prov": copy.copy(getattr(table, "prov", None)),
            }
            for idx, table in enumerate(doc.tables)
        }
        body_children_snapshot = (
            list(doc.body.children)
            if getattr(doc, "body", None) is not None and hasattr(doc.body, "children")
            else None
        )
        group_children_snapshots = {
            idx: list(group.children)
            for idx, group in enumerate(getattr(doc, "groups", []) or [])
            if hasattr(group, "children")
        }

        def restore_snapshots():
            # Roll back table data/prov, body children, and group children
            # to their pre-injection state.
            for idx, snap in table_snapshots.items():
                if idx >= len(doc.tables):
                    continue
                doc.tables[idx].data = snap["data"]
                doc.tables[idx].prov = copy.copy(snap["prov"])

            if body_children_snapshot is not None:
                doc.body.children = list(body_children_snapshot)

            for idx, children in group_children_snapshots.items():
                groups = getattr(doc, "groups", []) or []
                if idx < len(groups) and hasattr(groups[idx], "children"):
                    groups[idx].children = list(children)

        try:
            for lt in logical_tables:
                if not lt.members:
                    continue

                # Single-fragment logical tables are left exactly as Docling
                # produced them (possibly richer structure than a rebuild).
                if len(lt.members) == 1:
                    log.debug(
                        f"Skipping single-table {lt.members[0]} - preserving original structure"
                    )
                    continue

                # First member is the anchor; its entry receives the merged data.
                anchor_idx = lt.members[0]
                anchor_table = doc.tables[anchor_idx]

                log.info(
                    f"Injecting Logical Table {lt.logical_index} into Anchor Table {anchor_idx} "
                    f"(merged from {len(lt.members)} fragments)"
                )

                original_data = getattr(anchor_table, "data", None)

                anchor_table.data = _dataframe_to_docling_data(
                    lt.df,
                    original_data=original_data,
                )

                for satellite_idx in lt.members[1:]:
                    satellite_table = doc.tables[satellite_idx]

                    # Fold the satellite's provenance into the anchor so the
                    # merged table still reports every source page. prov may
                    # be a list or a single item on either side — handle all
                    # four combinations.
                    if satellite_table.prov:
                        if anchor_table.prov is None:
                            anchor_table.prov = []

                        if isinstance(satellite_table.prov, list):
                            if isinstance(anchor_table.prov, list):
                                anchor_table.prov.extend(satellite_table.prov)
                            else:
                                anchor_table.prov = [anchor_table.prov] + satellite_table.prov
                        else:
                            if isinstance(anchor_table.prov, list):
                                anchor_table.prov.append(satellite_table.prov)
                            else:
                                anchor_table.prov = [anchor_table.prov, satellite_table.prov]

                    refs_to_remove.add(satellite_table.self_ref)

                    # Clear the satellite in place so downstream code iterating
                    # doc.tables directly doesn't see stale fragment content.
                    # We don't pop the entry because self_refs are position-based
                    # (`#/tables/N` = list index N) — removing an element would
                    # shift every subsequent self_ref and body reference. The
                    # satellite becomes an empty shell, still present but
                    # without data or prov.
                    satellite_table.data = TableData(
                        num_rows=0, num_cols=0, table_cells=[], grid=[]
                    )
                    satellite_table.prov = []

            # Prune satellite references from body hierarchy
            removed_count = 0

            def traverse_and_prune(group_node: Any):
                # Rebuild children without pruned refs, recursing into
                # referenced groups.
                nonlocal removed_count
                if not hasattr(group_node, "children"):
                    return

                new_children = []
                for child_ref in group_node.children:
                    ptr = _get_ref_pointer(child_ref)

                    if not ptr:
                        # Unrecognized ref shape: keep it untouched.
                        new_children.append(child_ref)
                        continue

                    if ptr in refs_to_remove:
                        removed_count += 1
                        continue

                    new_children.append(child_ref)

                    if ptr.startswith("#/groups/"):
                        try:
                            group_idx = int(ptr.split("/")[-1])
                            if group_idx < len(doc.groups):
                                traverse_and_prune(doc.groups[group_idx])
                        except (ValueError, IndexError):
                            pass

                group_node.children = new_children

            if doc.body:
                traverse_and_prune(doc.body)
        except Exception:
            restore_snapshots()
            raise

        log.info(f"Injection complete. Pruned {removed_count} satellite table references.")
        return doc
|