table-stitcher 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,797 @@
1
+ """
2
+ Docling adapter for table-stitcher.
3
+
4
+ Reads tables from a DoclingDocument and writes merged results back.
5
+ """
6
+
7
+ import copy
8
+ import logging
9
+ import re
10
+ from typing import Any, Optional
11
+
12
+ import pandas as pd
13
+ from docling_core.types.doc import (
14
+ DoclingDocument,
15
+ TableCell,
16
+ TableData,
17
+ )
18
+
19
+ from ..merger import (
20
+ first_row_has_number,
21
+ is_numeric_like_colnames,
22
+ normalize_col_name,
23
+ tokenize,
24
+ )
25
+ from ..models import (
26
+ LogicalTable,
27
+ MultiPageConfig,
28
+ TableMeta,
29
+ )
30
+
31
+ log = logging.getLogger(__name__)
32
+
33
+
34
+ # -------------------------------------------------------------------
35
+ # Cell-shape heuristics (used for both headerless detection and
36
+ # structural header-orphan detection — shared so the two checks stay
37
+ # consistent).
38
+ # -------------------------------------------------------------------
39
+
40
+ # Patterns a cell matches when it looks like data rather than a header.
41
+ _DATA_PATTERNS = [
42
+ re.compile(p, re.IGNORECASE)
43
+ for p in [
44
+ r"^\d+$",
45
+ r"^\d+\.\d+$",
46
+ r"^\d{1,2}/\d{1,2}",
47
+ r"^\d{1,2}-\d{1,2}",
48
+ r"^https?://",
49
+ r"^[A-Z]+-\d+$",
50
+ r"^\$[\d,]+",
51
+ r"^[\d,]+\s*%$",
52
+ r"^Row\s*\d+",
53
+ r"^\d+\.\d+\.\d+",
54
+ r"^[\d,]+\.\d+$", # financial: "13,085.03"
55
+ r"^[\d,]+$", # grouped integer: "1,234,567"
56
+ r"^\d+\.?\d*\s*\([\d,\s.]+\)", # stat with range: "280 (176, 404)"
57
+ r"^\d+\.?\d*\s*[xX×]\s*10", # scientific: "7.0 x 10-7"
58
+ ]
59
+ ]
60
+
61
+ _AUTO_COLNAME_RE = re.compile(r"^(column|unnamed)[_:]?\s*\d+$", re.IGNORECASE)
62
+
63
+ # A cell is "header-shaped" when it's short, alphabetic-ish, and contains
64
+ # no data patterns. Used as the structural signal for orphan detection —
65
+ # no domain vocabulary involved.
66
+ _MAX_HEADER_CELL_LEN = 30
67
+
68
+
69
def _looks_like_data(cell: str) -> bool:
    """True when the stripped cell text matches any data-shaped pattern."""
    text = str(cell).strip()
    if not text:
        return False
    for pattern in _DATA_PATTERNS:
        if pattern.search(text):
            return True
    return False
74
+
75
+
76
def _data_subshape(cell: str) -> Optional[str]:
    """
    Subcategory within "data-shaped". None if the cell isn't data-shaped.

    Lets us tell whether row 1 and the body share the same flavour of data —
    used to recover headers like ``[2020, 2021, 2022, 2023]`` whose cells
    individually match a data pattern but whose row uniformly contrasts
    with the body's shape.
    """
    text = str(cell).strip()
    if not text or not _looks_like_data(text):
        return None

    # Ordered: first matching label wins (e.g. "12-31" is date_like, not
    # bare_int; "123" is bare_int before grouped_int).
    shape_checks = (
        ("url", lambda t: re.match(r"^https?://", t, re.IGNORECASE)),
        ("currency", lambda t: t.startswith("$")),
        ("percent", lambda t: t.endswith("%")),
        ("date_like", lambda t: re.match(r"^\d{1,2}[/-]\d{1,2}", t)),
        ("bare_int", lambda t: re.match(r"^\d+$", t)),
        ("grouped_int", lambda t: re.match(r"^[\d,]+$", t)),
        (
            "float",
            lambda t: re.match(r"^\d+\.\d+$", t) or re.match(r"^[\d,]+\.\d+$", t),
        ),
    )
    for label, matches in shape_checks:
        if matches(text):
            return label
    return "other_data"
102
+
103
+
104
def _first_row_is_header_by_contrast(rows: list[list[str]]) -> bool:
    """
    Decide whether row 1 is a column-axis label row despite being data-shaped.

    True when row 1 is uniformly one data subshape (years, ordinals) while
    the sampled body rows consistently carry a different subshape — meaning
    row 1 labels the columns rather than holding data. Ambiguous cases —
    mixed shapes in row 1, or a body mostly matching row 1's shape — return
    False so genuine data rows (lottery numbers, IDs) are never promoted.
    """
    if len(rows) < 2:
        return False

    first = [str(cell).strip() for cell in rows[0] if str(cell).strip()]
    if len(first) < 2:
        return False

    first_shapes = {_data_subshape(cell) for cell in first}
    if None in first_shapes or len(first_shapes) != 1:
        return False
    reference_shape = next(iter(first_shapes))

    # Sample at most the next two rows as the body.
    sampled: list[Optional[str]] = [
        _data_subshape(str(cell).strip())
        for row in rows[1:3]
        for cell in row
        if str(cell).strip()
    ]
    if not sampled:
        return False

    mismatches = sum(1 for shape in sampled if shape != reference_shape)
    return mismatches / len(sampled) >= 0.6
139
+
140
+
141
def _is_header_shaped_cell(cell: str) -> bool:
    """True if cell is plausibly a header cell — short, not data, not auto-label."""
    text = str(cell).strip()
    # Empty cells routinely coexist with header cells, so they pass.
    if not text:
        return True
    too_long = len(text) > _MAX_HEADER_CELL_LEN
    auto_label = bool(_AUTO_COLNAME_RE.match(text))
    data_shaped = _looks_like_data(text)
    return not (too_long or auto_label or data_shaped)
153
+
154
+
155
def _detect_header_orphan(df: pd.DataFrame, is_headerless: bool, max_orphan_rows: int) -> bool:
    """
    Structural rule: a fragment is a header orphan when it's small, its
    first row was treated as a header (not promoted from data), and any
    data rows present look header-shaped too (no data patterns, short).

    Column names themselves are only screened for data patterns and
    auto-label form — NOT for length, because legitimate headers can be
    phrase-long (e.g. "Average annual revenue per customer"). Data rows,
    however, must be short AND non-data to qualify as header-shaped.

    No vocabulary is consulted — universal across domains and languages.
    """
    # Promoted-from-data tables can't be header orphans, and big fragments
    # carry real content.
    if is_headerless or df.shape[0] > max_orphan_rows:
        return False

    cols = [str(c) for c in df.columns]

    # Need at least one meaningful column — not all empty / all auto-labels.
    if not any(c.strip() and not _AUTO_COLNAME_RE.match(c) for c in cols):
        return False

    # Columns must not contain data patterns (numbers, currency, ranges)
    # — UNLESS the columns form a uniform data subshape (e.g. all years,
    # all ordinals), which indicates a column-axis header rather than data.
    if any(_looks_like_data(c) for c in cols):
        populated = [c for c in cols if c.strip()]
        shapes = {_data_subshape(c) for c in populated}
        is_uniform_axis = (
            len(populated) >= 2 and None not in shapes and len(shapes) == 1
        )
        if not is_uniform_axis:
            return False

    # Every data cell (if any) must be header-shaped: a long or data-shaped
    # value means this fragment carries real data, not orphaned headers.
    return all(
        _is_header_shaped_cell(value)
        for _, row in df.iterrows()
        for value in row.tolist()
    )
196
+
197
+
198
+ # -------------------------------------------------------------------
199
+ # Docling-specific helpers (bbox / provenance)
200
+ # -------------------------------------------------------------------
201
+
202
+
203
+ def _extract_y_bounds_from_prov(prov_list: list[Any]) -> Optional[tuple[float, float, str]]:
204
+ """
205
+ Extract vertical bounds from Docling provenance data.
206
+
207
+ Returns: (y_min, y_max, coord_origin) or None if not available.
208
+ """
209
+ for p in prov_list:
210
+ bbox = getattr(p, "bbox", None)
211
+ if bbox is None:
212
+ continue
213
+
214
+ t = getattr(bbox, "t", None)
215
+ b = getattr(bbox, "b", None)
216
+
217
+ if t is not None and b is not None:
218
+ coord_origin = getattr(bbox, "coord_origin", None)
219
+ origin_str = str(coord_origin) if coord_origin else "BOTTOMLEFT"
220
+ return (float(b), float(t), origin_str)
221
+
222
+ return None
223
+
224
+
225
+ def _resolve_page_height(prov_list: list[Any], doc: Any, fallback: float = 842.0) -> float:
226
+ """
227
+ Look up the actual page height for the first prov entry from the document.
228
+ Falls back to A4 (842pt) only when the document does not expose a size.
229
+ """
230
+ pages = getattr(doc, "pages", None)
231
+ if not pages:
232
+ return fallback
233
+ for p in prov_list:
234
+ page_no = getattr(p, "page_no", None)
235
+ if page_no is None:
236
+ continue
237
+ page_item = pages.get(page_no) if hasattr(pages, "get") else None
238
+ size = getattr(page_item, "size", None) if page_item else None
239
+ height = getattr(size, "height", None) if size else None
240
+ if height:
241
+ return float(height)
242
+ return fallback
243
+
244
+
245
def _compute_vertical_positions(
    prov_list: list[Any],
    page_height: float = 842.0,
) -> tuple[Optional[float], Optional[float], Optional[float]]:
    """
    Compute normalized vertical positions (0-1 scale, top=0, bottom=1).

    Returns (center, top, bottom), all None when no bbox is available.
    Caller should pass the actual page height for the page in question;
    the default of 842.0 (A4) is only a safety net for missing metadata.
    """
    bounds = _extract_y_bounds_from_prov(prov_list)
    if bounds is None:
        return None, None, None

    y_bottom, y_top, origin_str = bounds

    if "BOTTOMLEFT" in origin_str.upper():
        # Bottom-left origin: flip so 0 means top of page. If the bbox
        # overshoots the claimed page height (bad metadata), stretch the
        # height so the normalized values stay meaningful.
        if y_top > page_height:
            page_height = max(y_top * 1.1, page_height)
        top_frac = 1.0 - (y_top / page_height)
        bottom_frac = 1.0 - (y_bottom / page_height)
    else:
        top_frac = y_top / page_height
        bottom_frac = y_bottom / page_height

    def clamp01(value: float) -> float:
        return max(0.0, min(1.0, value))

    top_frac = clamp01(top_frac)
    bottom_frac = clamp01(bottom_frac)
    return (top_frac + bottom_frac) / 2.0, top_frac, bottom_frac
275
+
276
+
277
+ # -------------------------------------------------------------------
278
+ # Grid-to-DataFrame conversion (Docling-specific)
279
+ # -------------------------------------------------------------------
280
+
281
+
282
def _grid_to_dataframe(table: Any, doc: Any) -> pd.DataFrame:
    """
    Convert Docling table grid to DataFrame with intelligent header detection.

    Falls back to Docling's own ``export_to_dataframe`` when the table does
    not expose a raw grid. Otherwise the frame is built manually so the
    first row can be classified as header vs. data via the module-level
    structural heuristics.

    Side channel: the returned DataFrame carries two ``df.attrs`` keys —
    ``pre_header_rows`` (sparse continuation rows lifted above the header)
    and ``is_headerless`` (True when row 1 was kept as data and synthetic
    ``Column_N`` names were generated).
    """
    # No raw grid available — delegate to Docling's exporter.
    if not hasattr(table, "data") or not table.data or not hasattr(table.data, "grid"):
        return table.export_to_dataframe(doc=doc)

    grid = table.data.grid
    if not grid:
        return pd.DataFrame()

    # Flatten grid cells to their text; missing cells become "".
    all_rows = []
    for row in grid:
        row_data = [getattr(cell, "text", str(cell)) if cell else "" for cell in row]
        all_rows.append(row_data)

    if not all_rows:
        return pd.DataFrame()

    # Drop fully-empty rows before any header reasoning.
    real_content_rows = [r for r in all_rows if any(c.strip() for c in r)]

    if not real_content_rows:
        return pd.DataFrame(columns=[f"Column_{i}" for i in range(len(all_rows[0]))])

    first_row = real_content_rows[0]
    num_cols = len(first_row)

    # Determine if first row is header or data — uses module-level
    # _looks_like_data and _is_header_shaped_cell to stay consistent with
    # structural orphan detection below.
    has_data_values = any(_looks_like_data(c) for c in first_row)
    # A uniformly data-shaped row 1 contradicted by a different-shaped body
    # is a column-axis header (years, ordinals), not data.
    if has_data_values and _first_row_is_header_by_contrast(real_content_rows):
        has_data_values = False
    has_url = any("http" in str(c).lower() for c in first_row)

    # Repetition / placeholder screen: a "header" whose values largely
    # repeat, or that contains filler tokens (N/A, TBD, ...), is almost
    # certainly a data row. Only applied when there are >= 3 cells to judge.
    non_empty_vals = [str(c).strip().upper() for c in first_row if str(c).strip()]
    if len(non_empty_vals) >= 3:
        unique_vals = set(non_empty_vals)
        repetition_ratio = len(unique_vals) / len(non_empty_vals)
        has_repeated_values = repetition_ratio < 0.5
        placeholder_vals = {"DATA", "N/A", "NA", "NULL", "-", "0", "TBD", "NONE", "YES", "NO"}
        has_placeholders = len(unique_vals & placeholder_vals) > 0
    else:
        has_repeated_values = False
        has_placeholders = False

    # Real headers are typically short (≤30 chars); a majority of long cells
    # in the "header" row usually means we're looking at a data row whose
    # true header was eaten by the parser on this page.
    non_empty_cells = [str(c).strip() for c in first_row if str(c).strip()]
    long_cells = sum(1 for c in non_empty_cells if len(c) > 30)
    has_long_cells = bool(non_empty_cells) and long_cells / len(non_empty_cells) >= 0.5

    # Sparse first row with an empty leading cell — likely a continuation
    # fragment whose real header lives on a previous page.
    non_empty_count = sum(1 for v in first_row if v and v.strip())
    is_sparse = (non_empty_count < num_cols / 2) and (not first_row[0].strip())

    is_headerless = False

    if (
        has_data_values
        or has_url
        or is_sparse
        or has_repeated_values
        or has_placeholders
        or has_long_cells
    ):
        # Row 1 is data: keep it in the body and synthesize Column_N names.
        is_headerless = True
        header = [f"Column_{i}" for i in range(num_cols)]

        # Sparse rows are lifted out as "pre-header" continuation content
        # rather than kept as data.
        if is_sparse and len(real_content_rows) > 1:
            pre_header_rows = [first_row]
            data_rows = real_content_rows[1:]
        else:
            pre_header_rows = []
            data_rows = real_content_rows
    else:
        is_headerless = False
        pre_header_rows = []
        header = first_row
        data_rows = real_content_rows[1:]

    # Clean the header: collapse "Name.Name" duplication artifacts and fill
    # empty cells with positional Column_N names.
    clean_header = []
    for h in header:
        h_str = str(h).strip()
        if "." in h_str:
            parts = h_str.split(".")
            if len(parts) == 2 and parts[0] == parts[1]:
                h_str = parts[0]
        clean_header.append(h_str if h_str else f"Column_{len(clean_header)}")

    if data_rows:
        # Pad or truncate every data row to exactly the header width.
        normalized_rows = []
        for row in data_rows:
            row_copy = list(row)
            while len(row_copy) < len(clean_header):
                row_copy.append("")
            normalized_rows.append(row_copy[: len(clean_header)])
        df = pd.DataFrame(normalized_rows, columns=clean_header)
    else:
        df = pd.DataFrame(columns=clean_header)

    df.attrs["pre_header_rows"] = pre_header_rows
    df.attrs["is_headerless"] = is_headerless
    return df
388
+
389
+
390
+ # -------------------------------------------------------------------
391
+ # DataFrame → Docling TableData conversion
392
+ # -------------------------------------------------------------------
393
+
394
+
395
+ def _extract_original_header_rows(
396
+ original_data: Optional[TableData],
397
+ ) -> tuple[list[list[TableCell]], list[TableCell]]:
398
+ """
399
+ Extract header rows from the anchor table's original grid.
400
+
401
+ Returns (header_grid_rows, flat_header_cells).
402
+ If the original data has multi-row headers with rowspan/colspan,
403
+ they are preserved exactly as-is.
404
+ """
405
+ if not original_data or not original_data.grid:
406
+ return [], []
407
+
408
+ header_rows: list[list[TableCell]] = []
409
+ header_cells: list[TableCell] = []
410
+
411
+ for row in original_data.grid:
412
+ if row and any(getattr(c, "column_header", False) for c in row if c):
413
+ header_rows.append(row)
414
+ header_cells.extend(c for c in row if c)
415
+ else:
416
+ break # first non-header row = end of header
417
+
418
+ return header_rows, header_cells
419
+
420
+
421
def _dataframe_to_docling_data(
    df: pd.DataFrame,
    original_data: Optional[TableData] = None,
) -> TableData:
    """
    Converts a pandas DataFrame back into Docling's TableData structure.

    When ``original_data`` is provided and contains multi-row header rows
    (cells with ``column_header=True``, rowspan, colspan), those header rows
    are preserved exactly. Only the data rows are rebuilt from the DataFrame.
    This prevents the lossy roundtrip that would flatten complex headers into
    simple 1x1 cells.
    """
    # Empty frame → emit a header-only table so the document stays valid.
    if df.empty:
        cols = list(df.columns) if len(df.columns) > 0 else ["Column_0"]
        header_cells = []
        for j, col_name in enumerate(cols):
            cell = TableCell(
                text=str(col_name) if col_name is not None else "",
                row_span=1,
                col_span=1,
                column_header=True,
                row_header=False,
                start_row_offset_idx=0,
                end_row_offset_idx=1,
                start_col_offset_idx=j,
                end_col_offset_idx=j + 1,
            )
            header_cells.append(cell)
        return TableData(
            num_rows=1,
            num_cols=len(cols),
            table_cells=header_cells,
            grid=[header_cells],
        )

    # --- Try to reuse original header rows (preserves rowspan/colspan) ---
    orig_header_rows, orig_header_cells = _extract_original_header_rows(original_data)

    num_cols = len(df.columns)

    if orig_header_rows:
        # Use original header rows as-is
        num_header_rows = len(orig_header_rows)
        grid: list[list[TableCell]] = list(orig_header_rows)
        table_cells: list[TableCell] = list(orig_header_cells)
    else:
        # Fall back to building flat 1x1 header from DataFrame columns
        num_header_rows = 1
        grid = []
        table_cells = []

        header_row_cells = []
        for j, col_name in enumerate(df.columns):
            cell = TableCell(
                text=str(col_name) if col_name is not None else "",
                row_span=1,
                col_span=1,
                column_header=True,
                row_header=False,
                start_row_offset_idx=0,
                end_row_offset_idx=1,
                start_col_offset_idx=j,
                end_col_offset_idx=j + 1,
            )
            header_row_cells.append(cell)
            table_cells.append(cell)

        grid.append(header_row_cells)

    # --- Detect row_header styling from original data ---
    # If the original body marked its first column as row headers, carry
    # that styling over to the rebuilt data rows.
    has_row_headers = False
    if original_data and original_data.grid:
        for row in original_data.grid[num_header_rows:]:
            if row and len(row) > 0 and row[0]:
                if getattr(row[0], "row_header", False):
                    has_row_headers = True
                    break

    # --- Build data rows from merged DataFrame ---
    for i, (_, row) in enumerate(df.iterrows()):
        grid_row: list[TableCell] = []
        # Data rows start immediately after the (possibly multi-row) header.
        table_row_idx = num_header_rows + i

        for j, val in enumerate(row):
            # NaN/None render as empty strings, matching Docling cell text.
            if pd.isna(val) or val is None:
                text_val = ""
            else:
                text_val = str(val)

            # Only the first column inherits row_header styling, and only
            # when the original table used it.
            row_header = j == 0 and has_row_headers

            cell = TableCell(
                text=text_val,
                row_span=1,
                col_span=1,
                column_header=False,
                row_header=row_header,
                start_row_offset_idx=table_row_idx,
                end_row_offset_idx=table_row_idx + 1,
                start_col_offset_idx=j,
                end_col_offset_idx=j + 1,
            )
            grid_row.append(cell)
            table_cells.append(cell)

        grid.append(grid_row)

    num_total_rows = num_header_rows + len(df)

    return TableData(num_rows=num_total_rows, num_cols=num_cols, table_cells=table_cells, grid=grid)
532
+
533
+
534
+ # -------------------------------------------------------------------
535
+ # Reference pointer helper
536
+ # -------------------------------------------------------------------
537
+
538
+
539
+ def _get_ref_pointer(ref_obj: Any) -> str:
540
+ """Safely extract the string pointer (e.g., '#/tables/1') from a Ref object."""
541
+ if hasattr(ref_obj, "ref"):
542
+ return ref_obj.ref
543
+
544
+ if hasattr(ref_obj, "model_dump"):
545
+ data = ref_obj.model_dump(by_alias=True)
546
+ return data.get("$ref", "")
547
+
548
+ if isinstance(ref_obj, dict):
549
+ return ref_obj.get("$ref", "")
550
+
551
+ return ""
552
+
553
+
554
+ # -------------------------------------------------------------------
555
+ # DoclingAdapter
556
+ # -------------------------------------------------------------------
557
+
558
+
559
class DoclingAdapter:
    """
    Table-stitcher adapter for Docling (docling-core).

    Reads tables from a ``DoclingDocument`` and writes merged results back.
    """

    def extract(self, doc: DoclingDocument, cfg: MultiPageConfig) -> list[TableMeta]:
        """
        Extract metadata from all tables in a DoclingDocument.

        Each table is converted to a DataFrame and wrapped in a TableMeta
        carrying the structural signals the merger needs (header tokens,
        vertical layout, orphan flags). Tables whose conversion raises are
        skipped with a warning and left untouched in the document.
        """
        tables_meta: list[TableMeta] = []
        total = len(doc.tables)
        skipped = 0

        for idx, table in enumerate(doc.tables):
            try:
                df = _grid_to_dataframe(table, doc)
            except Exception as e:
                log.warning(
                    f"Skipping table {idx}/{total}: extraction failed ({e}). "
                    "Original table will be preserved unchanged."
                )
                skipped += 1
                continue

            continuation_content = []
            pre_header_rows = df.attrs.get("pre_header_rows", [])
            is_headerless = df.attrs.get("is_headerless", False)

            # Record sparse pre-header cells so the merger can re-attach
            # continuation fragments to the right columns.
            if pre_header_rows:
                for row in pre_header_rows:
                    non_empty = [(i, v) for i, v in enumerate(row) if v and v.strip()]
                    for col_idx, val in non_empty:
                        continuation_content.append({"col_idx": col_idx, "value": val})

            prov = getattr(table, "prov", None) or []
            pages = sorted({p.page_no for p in prov}) if prov else []
            start_page = pages[0] if pages else None

            header_tokens: set[str] = set()
            for col in df.columns:
                header_tokens |= tokenize(normalize_col_name(col))

            first_row_tokens: set[str] = set()
            if df.shape[0] > 0:
                row_text = " ".join(str(x) for x in df.iloc[0].tolist())
                first_row_tokens = tokenize(row_text)

            # Vertical position hints (0 = top of page, 1 = bottom) — only
            # computed when layout hints are enabled and provenance exists.
            vert_center, vert_top, vert_bottom = None, None, None
            if cfg.use_layout_hint and prov:
                page_height = _resolve_page_height(prov, doc)
                vert_center, vert_top, vert_bottom = _compute_vertical_positions(
                    prov, page_height=page_height
                )

            raw_columns = [str(c) for c in df.columns]
            numeric_like_cols = is_numeric_like_colnames(raw_columns)

            is_header_orphan = _detect_header_orphan(df, is_headerless, cfg.max_orphan_rows)

            # A small fragment whose first row contains a number is a
            # candidate tail of a table started on a previous page.
            is_data_orphan = (
                df.shape[0] > 0
                and df.shape[0] <= cfg.max_data_orphan_rows
                and first_row_has_number(df)
            )

            tables_meta.append(
                TableMeta(
                    idx=idx,
                    df=df,
                    start_page=start_page,
                    pages=pages,
                    width=df.shape[1],
                    header_tokens=header_tokens,
                    first_row_tokens=first_row_tokens,
                    raw_columns=raw_columns,
                    vert_center=vert_center,
                    vert_top=vert_top,
                    vert_bottom=vert_bottom,
                    is_header_orphan=is_header_orphan,
                    is_data_orphan=is_data_orphan,
                    numeric_like_cols=numeric_like_cols,
                    row_count=df.shape[0],
                    continuation_content=continuation_content,
                    is_headerless=is_headerless,
                )
            )

        if skipped:
            log.warning(
                f"Extracted {len(tables_meta)}/{total} tables "
                f"({skipped} skipped — originals preserved)"
            )

        return tables_meta

    def inject(self, doc: DoclingDocument, logical_tables: list[LogicalTable]) -> DoclingDocument:
        """
        Modify the DoclingDocument in-place with merged table data.

        Only modifies tables that were actually merged (multiple fragments).
        Single-page tables retain their original Docling structure.
        If injection fails, fields modified by this adapter are restored before
        the exception is re-raised so callers using ``raise_on_error=False`` do
        not receive a half-stitched document.
        """
        log.info("Starting DoclingDocument injection...")

        refs_to_remove: set[str] = set()
        # FIX: initialize the pruned-reference counter before the merge loop.
        # Previously it was created and reset *inside* the loop, so (a) the
        # final log line raised NameError when no logical table had multiple
        # members, and (b) the reported count covered only the last logical
        # table instead of the whole run.
        removed_count = 0

        # Snapshots of everything this adapter mutates, for rollback.
        table_snapshots = {
            idx: {
                "data": getattr(table, "data", None),
                "prov": copy.copy(getattr(table, "prov", None)),
            }
            for idx, table in enumerate(doc.tables)
        }
        body_children_snapshot = (
            list(doc.body.children)
            if getattr(doc, "body", None) is not None and hasattr(doc.body, "children")
            else None
        )
        group_children_snapshots = {
            idx: list(group.children)
            for idx, group in enumerate(getattr(doc, "groups", []) or [])
            if hasattr(group, "children")
        }

        def restore_snapshots():
            # Roll back table data/prov and the body/group children lists.
            for idx, snap in table_snapshots.items():
                if idx >= len(doc.tables):
                    continue
                doc.tables[idx].data = snap["data"]
                doc.tables[idx].prov = copy.copy(snap["prov"])

            if body_children_snapshot is not None:
                doc.body.children = list(body_children_snapshot)

            for idx, children in group_children_snapshots.items():
                groups = getattr(doc, "groups", []) or []
                if idx < len(groups) and hasattr(groups[idx], "children"):
                    groups[idx].children = list(children)

        def traverse_and_prune(group_node: Any):
            # Drop children that point at merged-away satellites; recurse
            # into nested groups referenced as "#/groups/N".
            nonlocal removed_count
            if not hasattr(group_node, "children"):
                return

            new_children = []
            for child_ref in group_node.children:
                ptr = _get_ref_pointer(child_ref)

                if not ptr:
                    new_children.append(child_ref)
                    continue

                if ptr in refs_to_remove:
                    removed_count += 1
                    continue

                new_children.append(child_ref)

                if ptr.startswith("#/groups/"):
                    try:
                        group_idx = int(ptr.split("/")[-1])
                        if group_idx < len(doc.groups):
                            traverse_and_prune(doc.groups[group_idx])
                    except (ValueError, IndexError):
                        pass

            group_node.children = new_children

        try:
            for lt in logical_tables:
                if not lt.members:
                    continue

                if len(lt.members) == 1:
                    log.debug(
                        f"Skipping single-table {lt.members[0]} - preserving original structure"
                    )
                    continue

                anchor_idx = lt.members[0]
                anchor_table = doc.tables[anchor_idx]

                log.info(
                    f"Injecting Logical Table {lt.logical_index} into Anchor Table {anchor_idx} "
                    f"(merged from {len(lt.members)} fragments)"
                )

                original_data = getattr(anchor_table, "data", None)

                anchor_table.data = _dataframe_to_docling_data(
                    lt.df,
                    original_data=original_data,
                )

                for satellite_idx in lt.members[1:]:
                    satellite_table = doc.tables[satellite_idx]

                    # Fold the satellite's provenance into the anchor so the
                    # merged table still spans all of its source pages.
                    if satellite_table.prov:
                        if anchor_table.prov is None:
                            anchor_table.prov = []

                        if isinstance(satellite_table.prov, list):
                            if isinstance(anchor_table.prov, list):
                                anchor_table.prov.extend(satellite_table.prov)
                            else:
                                anchor_table.prov = [anchor_table.prov] + satellite_table.prov
                        else:
                            if isinstance(anchor_table.prov, list):
                                anchor_table.prov.append(satellite_table.prov)
                            else:
                                anchor_table.prov = [anchor_table.prov, satellite_table.prov]

                    refs_to_remove.add(satellite_table.self_ref)

                    # Clear the satellite in place so downstream code iterating
                    # doc.tables directly doesn't see stale fragment content.
                    # We don't pop the entry because self_refs are position-based
                    # (`#/tables/N` = list index N) — removing an element would
                    # shift every subsequent self_ref and body reference. The
                    # satellite becomes an empty shell, still present but
                    # without data or prov.
                    satellite_table.data = TableData(
                        num_rows=0, num_cols=0, table_cells=[], grid=[]
                    )
                    satellite_table.prov = []

            # Prune satellite references once, after all logical tables have
            # been merged — refs_to_remove is the union across the whole run,
            # so a single traversal removes everything (previously each loop
            # iteration re-traversed the full hierarchy).
            if doc.body:
                traverse_and_prune(doc.body)
        except Exception:
            restore_snapshots()
            raise

        log.info(f"Injection complete. Pruned {removed_count} satellite table references.")
        return doc