table-stitcher 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table_stitcher/__init__.py +340 -0
- table_stitcher/adapters/README.md +173 -0
- table_stitcher/adapters/__init__.py +11 -0
- table_stitcher/adapters/base.py +42 -0
- table_stitcher/adapters/docling.py +797 -0
- table_stitcher/merger.py +979 -0
- table_stitcher/models.py +145 -0
- table_stitcher/py.typed +0 -0
- table_stitcher-0.3.0.dist-info/METADATA +392 -0
- table_stitcher-0.3.0.dist-info/RECORD +12 -0
- table_stitcher-0.3.0.dist-info/WHEEL +4 -0
- table_stitcher-0.3.0.dist-info/licenses/LICENSE +21 -0
table_stitcher/merger.py
ADDED
|
@@ -0,0 +1,979 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core merge engine for table-stitcher.
|
|
3
|
+
|
|
4
|
+
This module is parser-agnostic. It operates exclusively on TableMeta objects
|
|
5
|
+
and pandas DataFrames — it never touches parser-native document objects.
|
|
6
|
+
|
|
7
|
+
Key Principles:
|
|
8
|
+
1. Sequential merging: Headerless fragments only merge with immediate predecessor
|
|
9
|
+
2. Width matching: Same column count = same table structure (primary signal)
|
|
10
|
+
3. Spillover detection: 1-column fragments are cell overflow, not new tables
|
|
11
|
+
4. New table detection: Fragments with non-matching headers are separate tables
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
import unicodedata
|
|
17
|
+
from collections import defaultdict
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from typing import Any, Optional
|
|
20
|
+
|
|
21
|
+
import pandas as pd
|
|
22
|
+
|
|
23
|
+
from .models import LogicalTable, MergeTrace, MultiPageConfig, TableMeta
|
|
24
|
+
|
|
25
|
+
log = logging.getLogger(__name__)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# -------------------------------------------------------------------
|
|
29
|
+
# 1. UTILITY FUNCTIONS
|
|
30
|
+
# -------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def normalize_col_name(col: Any) -> str:
    """Return ``col`` as a comparison key: stringified, trimmed and lowercased."""
    text = str(col)
    return text.strip().lower()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Scripts where each character is semantically its own token, because the
|
|
39
|
+
# script doesn't use whitespace between words (CJK family, Thai, Lao, Khmer,
|
|
40
|
+
# Myanmar, Tibetan). Per-character Jaccard works for similarity comparison:
|
|
41
|
+
# identical headers produce identical character sets; unrelated headers
|
|
42
|
+
# rarely cross the ~60% overlap required to hit the strict threshold.
|
|
43
|
+
#
|
|
44
|
+
# This list is bounded — Unicode regularly adds new scripts, but almost all
|
|
45
|
+
# new scripts are whitespace-using (and therefore handled as words without
|
|
46
|
+
# a code change). Only the separator-less family needs enumeration.
|
|
47
|
+
_SEPARATORLESS_SCRIPTS: set[str] = {
|
|
48
|
+
"Han", # Chinese / Japanese kanji / Korean hanja
|
|
49
|
+
"Hiragana",
|
|
50
|
+
"Katakana",
|
|
51
|
+
"Hangul",
|
|
52
|
+
"Thai",
|
|
53
|
+
"Lao",
|
|
54
|
+
"Khmer",
|
|
55
|
+
"Myanmar",
|
|
56
|
+
"Tibetan",
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
# Map a substring of the Unicode character name to a script tag. Unicode
|
|
60
|
+
# character names are standardized and frozen, so this mapping is stable
|
|
61
|
+
# across Python and Unicode releases.
|
|
62
|
+
_NAME_TO_SCRIPT: list[tuple[str, str]] = [
|
|
63
|
+
("CJK", "Han"),
|
|
64
|
+
("KANGXI", "Han"), # e.g. U+2F49 "KANGXI RADICAL MOON"
|
|
65
|
+
("HIRAGANA", "Hiragana"),
|
|
66
|
+
("KATAKANA", "Katakana"),
|
|
67
|
+
("HANGUL", "Hangul"),
|
|
68
|
+
("THAI", "Thai"),
|
|
69
|
+
("LAO", "Lao"),
|
|
70
|
+
("KHMER", "Khmer"),
|
|
71
|
+
("MYANMAR", "Myanmar"),
|
|
72
|
+
("TIBETAN", "Tibetan"),
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _script_of(ch: str) -> Optional[str]:
    """
    Look up the script tag of a single character.

    Returns the tag of the first ``_NAME_TO_SCRIPT`` entry whose substring
    occurs in the character's Unicode name, or ``None`` when the character
    is ASCII, has no Unicode name, or matches no entry.
    """
    # ASCII never maps to a listed script; skip the name lookup entirely.
    if ord(ch) < 128:
        return None

    char_name = unicodedata.name(ch, "")
    if char_name == "":
        return None

    return next(
        (script for marker, script in _NAME_TO_SCRIPT if marker in char_name),
        None,
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def tokenize(text: str) -> set[str]:
    """
    Extract tokens for Jaccard similarity comparison — script-aware.

    Rules, all structural (no language models, no external dependencies):

    - Characters whose script is in ``_SEPARATORLESS_SCRIPTS``: each
      character is its own token, so identical headers produce identical
      token sets.
    - Other alphabetic characters (Latin, Cyrillic, Greek, Arabic, Hebrew,
      Devanagari, Tamil, ...): grouped into words and lowercased.
    - Combining marks (Unicode general category ``M*``) that follow a letter
      of an open word stay attached to that word. ``str.isalpha()`` is false
      for them, so without this rule words written with vowel signs or
      diacritics (Devanagari, Tamil, vocalised Arabic, decomposed accented
      Latin) would be cut into meaningless fragments.
    - Digits, punctuation, whitespace and stray combining marks: ignored —
      they only act as word boundaries.

    Mixed-script text produces the union of both token sets.

    Args:
        text: Any value; it is converted with ``str()`` before scanning.

    Returns:
        The set of tokens found in ``text`` (empty when there are none).
    """
    tokens: set[str] = set()
    buf: list[str] = []

    def flush() -> None:
        # Emit the word collected so far, if any, and start a new one.
        if buf:
            tokens.add("".join(buf).lower())
            buf.clear()

    for ch in str(text):
        # Check script BEFORE isalpha: Kangxi radicals (U+2F00–U+2FDF) and
        # some CJK compatibility characters are classed as symbols, not
        # letters, but still belong to Han script for tokenization purposes.
        if _script_of(ch) in _SEPARATORLESS_SCRIPTS:
            flush()
            tokens.add(ch)
        elif ch.isalpha() or (buf and unicodedata.category(ch)[0] == "M"):
            # A letter, or a combining mark continuing the current word.
            buf.append(ch)
        else:
            # Non-letter boundary (digit, punctuation, whitespace).
            flush()

    flush()
    return tokens
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def jaccard(a: set[str], b: set[str]) -> float:
    """
    Jaccard similarity ``|a ∩ b| / |a ∪ b|`` of two token sets.

    Returns 0.0 when both sets are empty (no evidence of similarity).
    """
    combined = a | b
    if not combined:
        return 0.0
    return len(a & b) / len(combined)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def is_numeric_like_colnames(cols: list[Any]) -> bool:
    """
    Tell whether column labels look auto-generated rather than real headers.

    A label counts as auto-generated when, after trimming and lowercasing,
    it consists only of digits or starts with ``"unnamed:"``. The result is
    True when at least 70% of the labels qualify; an empty list gives False.
    """
    if not cols:
        return False

    def looks_generated(col: Any) -> bool:
        label = str(col).strip().lower()
        if re.fullmatch(r"\d+", label):
            return True
        return label.startswith("unnamed:")

    hits = sum(1 for col in cols if looks_generated(col))
    return hits / len(cols) >= 0.7
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def first_row_has_number(df: pd.DataFrame) -> bool:
    """Return True when any cell of the first row contains a digit; False for a frame with no rows."""
    if not df.shape[0]:
        return False
    first_row = df.iloc[0].tolist()
    # A digit in any single cell is equivalent to a digit in the joined row text.
    return any(re.search(r"\d", str(cell)) for cell in first_row)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def is_empty_value(val: Any) -> bool:
    """
    Check whether a single cell value is empty/null.

    Empty means: ``None``, a blank or whitespace-only string, or any scalar
    that pandas regards as missing (float NaN, NumPy floating NaN,
    ``pd.NA``, ``pd.NaT``). Containers and every other value are non-empty.

    Args:
        val: The cell value to inspect.

    Returns:
        True when the value carries no content, False otherwise.
    """
    if val is None:
        return True
    if isinstance(val, str):
        return val.strip() == ""

    # pd.isna yields a boolean for scalars and an array/frame for
    # containers. Only a scalar boolean answer counts, so containers never
    # trigger an ambiguous truth-value error and are reported as non-empty.
    missing = pd.isna(val)
    return bool(pd.api.types.is_bool(missing) and missing)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def clean_malformed_header(col: str) -> str:
    """
    Collapse a duplicated header of the form ``'Name.Name'`` to ``'Name'``.

    The label is stringified and trimmed. When it contains exactly one dot
    and the two sides are equal ignoring case and surrounding whitespace,
    the trimmed left side is returned; otherwise the trimmed label is
    returned unchanged.
    """
    text = str(col).strip()
    left, dot, right = text.partition(".")

    # No dot at all, or more than one dot: nothing to collapse.
    if not dot or "." in right:
        return text

    left, right = left.strip(), right.strip()
    if left.lower() == right.lower():
        return left
    return text
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def clean_all_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels have each been passed through ``clean_malformed_header``."""
    result = df.copy()
    result.columns = list(map(clean_malformed_header, df.columns))
    return result
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def _pair_signals(tA: TableMeta, tB: TableMeta, cfg: MultiPageConfig) -> dict[str, Any]:
    """
    Gather the parser-neutral signals that explain a merge decision.

    The returned dict describes the (left, right) fragment pair: start
    pages and their difference (``None`` when either page is unknown),
    widths and absolute width difference, the headerless / orphan flags of
    both sides, Jaccard similarity of header tokens and of first-row
    tokens, and whether the layout hint indicates a continuation.
    """
    left_page, right_page = tA.start_page, tB.start_page
    pages_known = left_page is not None and right_page is not None

    signals: dict[str, Any] = {
        "left_page": left_page,
        "right_page": right_page,
        "page_gap": right_page - left_page if pages_known else None,
        "left_width": tA.width,
        "right_width": tB.width,
        "width_diff": abs(tA.width - tB.width),
    }
    # Boolean flags, recorded side by side for each kind.
    signals["left_headerless"] = tA.is_headerless
    signals["right_headerless"] = tB.is_headerless
    signals["left_header_orphan"] = tA.is_header_orphan
    signals["right_header_orphan"] = tB.is_header_orphan
    signals["left_data_orphan"] = tA.is_data_orphan
    signals["right_data_orphan"] = tB.is_data_orphan
    # Content and layout evidence.
    signals["header_similarity"] = jaccard(tA.header_tokens, tB.header_tokens)
    signals["row_similarity"] = jaccard(tA.first_row_tokens, tB.first_row_tokens)
    signals["layout_continuation"] = layout_suggests_continuation(tA, tB, cfg)
    return signals
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def _trace_pair(
    tA: TableMeta,
    tB: TableMeta,
    cfg: MultiPageConfig,
    merged: bool,
    reason: str,
    warnings: Optional[list[str]] = None,
) -> MergeTrace:
    """
    Create the ``MergeTrace`` record for one fragment pair.

    The trace carries both fragment indices, the decision and its reason,
    the signals from ``_pair_signals`` and the warnings (an empty list when
    none, or an empty list, were supplied).
    """
    pair_warnings = warnings if warnings else []
    pair_signals = _pair_signals(tA, tB, cfg)
    return MergeTrace(
        left_idx=tA.idx,
        right_idx=tB.idx,
        merged=merged,
        reason=reason,
        signals=pair_signals,
        warnings=pair_warnings,
    )
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
# -------------------------------------------------------------------
|
|
235
|
+
# 2. UNION-FIND DATA STRUCTURE
|
|
236
|
+
# -------------------------------------------------------------------
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
class UnionFind:
    """Disjoint-set forest over the integers ``0..n-1``, used to group table fragments."""

    def __init__(self, n: int):
        # Every element starts as the root of its own singleton set.
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x: int) -> int:
        """Return the root of ``x``'s set, re-pointing every visited node at that root."""
        # Pass 1: walk up to the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Pass 2: compress the path just walked.
        node = x
        while self.parent[node] != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node
        return root

    def union(self, a: int, b: int) -> bool:
        """Join the sets of ``a`` and ``b``; return False when they were already joined."""
        root_a, root_b = self.find(a), self.find(b)
        if root_a == root_b:
            return False

        # The root with the larger rank stays on top; ties keep ``a``'s root.
        if self.rank[root_a] >= self.rank[root_b]:
            winner, loser = root_a, root_b
        else:
            winner, loser = root_b, root_a

        self.parent[loser] = winner
        if self.rank[winner] == self.rank[loser]:
            self.rank[winner] += 1
        return True
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# -------------------------------------------------------------------
|
|
264
|
+
# 3. MERGE DECISION LOGIC
|
|
265
|
+
# -------------------------------------------------------------------
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def layout_suggests_continuation(tA: TableMeta, tB: TableMeta, cfg: MultiPageConfig) -> bool:
    """
    Decide from vertical placement alone whether tB may continue tA.

    Coordinates are normalized: 0 is the top of the page, 1 the bottom.
    A continuation is suggested only when both hold:

    - tA ends near the bottom of its page (``vert_bottom >= cfg.bottom_band_min``)
    - tB starts near the top of its page (``vert_top <= cfg.top_band_max``)

    Always False when ``cfg.use_layout_hint`` is off or either coordinate
    is missing.
    """
    if not cfg.use_layout_hint:
        return False

    bottom_of_a, top_of_b = tA.vert_bottom, tB.vert_top
    if bottom_of_a is None or top_of_b is None:
        return False

    ends_low = bottom_of_a >= cfg.bottom_band_min
    starts_high = top_of_b <= cfg.top_band_max
    return ends_low and starts_high
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _both_have_unique_header_tokens(tA: TableMeta, tB: TableMeta) -> bool:
    """
    True when each header token set holds at least one token the other lacks.

    That pattern marks *parallel* tables that merely share domain
    vocabulary (e.g. clinical studies sharing patient/age/sex but differing
    in the outcome column) rather than one table split over pages. A real
    continuation has identical headers or tB ⊆ tA: a parser may lose
    columns on page 2, but it cannot invent header tokens absent from
    page 1. When both sides contribute tokens of their own, header
    similarity alone is unsafe and layout corroboration is required.

    Returns False when either token set is empty.
    """
    left, right = tA.header_tokens, tB.header_tokens
    if not (left and right):
        return False

    only_left = left - right
    only_right = right - left
    return bool(only_left) and bool(only_right)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def should_force_orphan_merge(h: TableMeta, d: TableMeta, cfg: MultiPageConfig) -> tuple[bool, str]:
    """
    Decide whether a header orphan ``h`` and a data orphan ``d`` must merge.

    Returns ``(True, reason)`` when ``h`` is a header orphan, ``d`` is a
    data orphan, both pages are known, the page distance ``d - h`` does not
    exceed ``cfg.max_page_gap`` and the width difference does not exceed
    ``cfg.max_width_difference``. ``reason`` is ``"orphans"``, with
    ``"+layout"`` appended when the layout hint also agrees. In every other
    case the result is ``(False, "")``.
    """
    rejected = (False, "")

    if h.start_page is None or d.start_page is None:
        return rejected

    page_distance = d.start_page - h.start_page
    if page_distance > cfg.max_page_gap:
        return rejected

    if abs(h.width - d.width) > cfg.max_width_difference:
        return rejected

    layout_agrees = layout_suggests_continuation(h, d, cfg)
    if not (h.is_header_orphan and d.is_data_orphan):
        return rejected

    reason = "orphans+layout" if layout_agrees else "orphans"
    return True, reason
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def is_spillover_fragment(tA: TableMeta, tB: TableMeta, cfg: MultiPageConfig) -> bool:
    """
    Tell whether tB is cell overflow ("spillover") from tA.

    Structural requirements, all mandatory:

    - tB is headerless and exactly one column wide
    - tA has more than one column
    - both start pages are known and tB starts on the page right after tA

    The next-page requirement is intentionally independent of
    ``cfg.max_page_gap``: overflow lands on the very next page, while a
    1-column fragment several pages later is almost certainly an unrelated
    small table.

    When ``cfg.spillover_require_content_check`` is off, the structural
    match is the answer. When it is on, tB must additionally have at least
    one row and either its first cell looks like a URL or a ticket id
    (``ABC-123``), or ``tB.row_count`` is at most 2.
    """
    structural_match = tB.is_headerless and tB.width == 1 and tA.width > 1
    if not structural_match:
        return False
    if tA.start_page is None or tB.start_page is None:
        return False
    if tB.start_page - tA.start_page != 1:
        return False

    if not cfg.spillover_require_content_check:
        return True

    if tB.df.shape[0] == 0:
        return False

    raw_first = str(tB.df.iloc[0, 0])
    lowered = raw_first.lower()
    if "http" in lowered or "://" in lowered:
        return True
    # The ticket pattern is matched against the original casing.
    if re.search(r"[A-Z]+-\d+", raw_first):
        return True
    return tB.row_count <= 2
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# -------------------------------------------------------------------
|
|
365
|
+
# 4. TABLE BUILDING (Post-Merge)
|
|
366
|
+
# -------------------------------------------------------------------
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def stitch_split_cells(df: pd.DataFrame, separator: str = "\n") -> pd.DataFrame:
    """
    Fold continuation rows back into the row they overflowed from.

    A continuation row has exactly one non-empty cell, and that cell is not
    in column 0. Its text is appended to the same column of the preceding
    kept row, joined with ``separator``, or simply stored when that cell is
    empty. URL-like text is instead routed to the last column whose label
    contains "content", "ref", "desc", "link" or "url", falling back to the
    final column.

    All access is positional: label-based access breaks on duplicate column
    names (common with rowspan/colspan parsers), where ``df[col]`` yields a
    sub-DataFrame instead of a scalar.

    A frame with at most one row is returned as-is; otherwise a new frame
    with the same column labels is built.
    """
    total_rows = df.shape[0]
    if total_rows <= 1:
        return df

    columns = list(df.columns)

    # Destination for URL-like continuations; it depends only on the labels.
    url_hints = ("content", "ref", "desc", "link", "url")
    url_columns = [
        pos
        for pos, label in enumerate(columns)
        if any(hint in str(label).lower() for hint in url_hints)
    ]
    url_target = url_columns[-1] if url_columns else len(columns) - 1

    out_rows = []
    cursor = 0
    while cursor < total_rows:
        current = df.iloc[cursor].tolist()
        lookahead = cursor + 1

        # Absorb every continuation row that directly follows ``current``.
        while lookahead < total_rows:
            candidate = df.iloc[lookahead].tolist()
            filled = [pos for pos, cell in enumerate(candidate) if not is_empty_value(cell)]

            # Column 0 holds the record identifier (participant ID, row
            # label, ...). If it is filled, the row is a new record or a
            # section row, never an overflow of the previous cell.
            if len(filled) != 1 or not is_empty_value(candidate[0]):
                break

            source_pos = filled[0]
            fragment = str(candidate[source_pos]).strip()
            looks_like_url = "://" in fragment or fragment.lower().startswith("http")
            dest = url_target if looks_like_url else source_pos

            existing = current[dest]
            if is_empty_value(existing):
                current[dest] = fragment
            else:
                current[dest] = str(existing).rstrip() + separator + fragment.lstrip()
            lookahead += 1

        out_rows.append(current)
        cursor = lookahead

    return pd.DataFrame(out_rows, columns=columns)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
_VALID_WIDTH_OVERFLOW_POLICIES = {"preserve_extra", "warn_drop", "fail", "merge_tail"}
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def _pad_narrow(df: pd.DataFrame, canonical_cols: list[str]) -> pd.DataFrame:
|
|
441
|
+
"""Right-pad a narrower fragment with empty ``_pad_N`` columns."""
|
|
442
|
+
df_copy = df.copy()
|
|
443
|
+
for k in range(df.shape[1], len(canonical_cols)):
|
|
444
|
+
df_copy[f"_pad_{k}"] = ""
|
|
445
|
+
df_copy.columns = canonical_cols
|
|
446
|
+
df_copy.attrs["table_stitcher_warnings"] = []
|
|
447
|
+
return df_copy
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
def _overflow_fail(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    ``fail`` overflow policy: refuse a fragment wider than the canonical width.

    Never returns. Always raises ``ValueError`` naming the fragment index,
    its start page, both widths and the labels of the surplus columns.
    """
    canonical_width = len(canonical_cols)
    surplus_labels = [str(label) for label in df.columns[canonical_width:]]
    frag_idx = getattr(source_meta, "idx", None)
    frag_page = getattr(source_meta, "start_page", None)
    raise ValueError(
        f"Fragment idx={frag_idx} page={frag_page} has {df.shape[1]} columns, "
        f"wider than canonical width {canonical_width}; extra columns: {surplus_labels}"
    )
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def _overflow_warn_drop(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    ``warn_drop`` overflow policy: cut surplus trailing columns and report it.

    The warning text is logged at WARNING level and also stored as the only
    entry of ``attrs["table_stitcher_warnings"]`` on the returned copy,
    which keeps the first ``len(canonical_cols)`` columns relabelled to
    ``canonical_cols``.
    """
    keep = len(canonical_cols)
    surplus_count = df.shape[1] - keep
    surplus_labels = [str(label) for label in df.columns[keep:]]
    frag_idx = getattr(source_meta, "idx", None)
    frag_page = getattr(source_meta, "start_page", None)

    warning = (
        f"Dropped {surplus_count} trailing column(s) from fragment "
        f"idx={frag_idx} page={frag_page} "
        f"to fit canonical width {keep}; dropped columns: {surplus_labels}"
    )
    log.warning("align_dataframe_to_header: %s", warning)

    trimmed = df.iloc[:, :keep].copy()
    trimmed.columns = canonical_cols
    trimmed.attrs["table_stitcher_warnings"] = [warning]
    return trimmed
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
def _overflow_merge_tail(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    ``merge_tail`` overflow policy: fold surplus cells into the last canonical column.

    For every row, the non-empty cells beyond the canonical width are
    stringified, trimmed and joined with ``cfg.stitch_separator``. The
    joined text replaces the last canonical cell when that cell is empty,
    and is appended to it (after the separator) otherwise.
    """
    width = len(canonical_cols)
    folded_rows = []

    for _, series in df.iterrows():
        cells = list(series.tolist())
        head = cells[:width]
        # Defensive padding; a no-op whenever the row is at least ``width`` wide.
        head.extend([""] * (width - len(head)))

        overflow = [str(cell).strip() for cell in cells[width:] if not is_empty_value(cell)]
        if overflow and canonical_cols:
            joined = cfg.stitch_separator.join(overflow)
            last_cell = head[width - 1]
            if is_empty_value(last_cell):
                head[width - 1] = joined
            else:
                head[width - 1] = str(last_cell).rstrip() + cfg.stitch_separator + joined

        folded_rows.append(head)

    result = pd.DataFrame(folded_rows, columns=canonical_cols)
    result.attrs["table_stitcher_warnings"] = []
    return result
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _overflow_preserve_extra(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    ``preserve_extra`` overflow policy (default, lossless).

    Surplus columns are kept and renamed ``_extra_N_<origname>``, where N is
    the zero-based offset past the canonical width and ``<origname>`` is the
    trimmed original label (``column`` when blank). A numeric suffix
    (``_1``, ``_2``, ...) is appended as long as the name is already taken.
    """
    result = df.copy()
    taken = {str(label) for label in canonical_cols}
    extra_names: list[str] = []

    for offset, label in enumerate(df.columns[len(canonical_cols) :]):
        stem = f"_extra_{offset}_{str(label).strip() or 'column'}"
        name = stem
        attempt = 0
        while name in taken:
            attempt += 1
            name = f"{stem}_{attempt}"
        taken.add(name)
        extra_names.append(name)

    result.columns = canonical_cols + extra_names
    result.attrs["table_stitcher_warnings"] = []
    return result
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
# Dispatch table: overflow policy name -> handler for fragments wider than
# the canonical width. Every handler takes (df, canonical_cols, source_meta, cfg).
_WIDTH_OVERFLOW_HANDLERS = {
    "preserve_extra": _overflow_preserve_extra,  # keep surplus as _extra_N_* columns
    "warn_drop": _overflow_warn_drop,  # drop surplus columns and log a warning
    "fail": _overflow_fail,  # raise ValueError
    "merge_tail": _overflow_merge_tail,  # fold surplus cells into the last column
}
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def align_dataframe_to_header(
    df: pd.DataFrame,
    canonical_cols: list[str],
    source_meta: TableMeta,
    cfg: MultiPageConfig,
) -> pd.DataFrame:
    """
    Fit a fragment to the canonical column structure.

    - Narrower than canonical: right-padded with empty columns.
    - Same width: copied and relabelled.
    - Wider than canonical: handled according to ``cfg.width_overflow_policy``:

      - ``preserve_extra`` (default): add trailing ``_extra_N_<origname>`` columns.
      - ``warn_drop``: drop trailing columns and log a warning.
      - ``fail``: raise ``ValueError``.
      - ``merge_tail``: append trailing values into the final canonical cell.

    Raises:
        ValueError: When the configured policy is not a known one (checked
            before looking at widths), or when the ``fail`` policy meets a
            wider fragment.
    """
    policy = cfg.width_overflow_policy
    if policy not in _VALID_WIDTH_OVERFLOW_POLICIES:
        raise ValueError(
            "width_overflow_policy must be one of "
            f"{sorted(_VALID_WIDTH_OVERFLOW_POLICIES)}, got {policy!r}"
        )

    actual_width = df.shape[1]
    target_width = len(canonical_cols)

    if actual_width < target_width:
        return _pad_narrow(df, canonical_cols)

    if actual_width > target_width:
        handler = _WIDTH_OVERFLOW_HANDLERS[policy]
        return handler(df, canonical_cols, source_meta, cfg)

    # Widths agree: relabel a copy and attach an empty warnings list.
    relabelled = df.copy()
    relabelled.columns = canonical_cols
    relabelled.attrs["table_stitcher_warnings"] = []
    return relabelled
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
def _build_orphan_merged_table(
    header_idx: int, all_members: list[int], meta_by_idx: dict[int, TableMeta]
) -> tuple[pd.DataFrame, set[int], list[str]]:
    """
    Build the merged table for a group whose anchor is a header orphan.

    The header comes from the first row of the header fragment, or from its
    column labels when that fragment has no rows. If a data fragment is
    wider than the header, the header is extended with ``col_K`` labels.
    Rows of all other members are appended in ``all_members`` order, padded
    or truncated to the common width.

    Continuation content of a member is appended to the header labels while
    no data row exists yet, and to the last data row afterwards.

    Missing data cells (``None`` / float NaN) are written as ``""`` instead
    of the literal strings ``"None"`` / ``"nan"`` that ``str()`` would give.

    Args:
        header_idx: Index of the header-orphan fragment.
        all_members: Indices of every fragment in the group.
        meta_by_idx: Lookup from fragment index to its metadata.

    Returns:
        ``(merged_df, pages, warnings)``: the merged frame, the union of the
        members' pages, and an always-empty warnings list.
    """
    h_meta = meta_by_idx[header_idx]

    if h_meta.df.shape[0] == 0:
        header_cells = [str(c) for c in h_meta.df.columns]
    else:
        header_cells = [str(x) for x in h_meta.df.iloc[0].tolist()]

    data_members = [m for m in all_members if m != header_idx]
    max_w = max([len(header_cells)] + [meta_by_idx[m].width for m in data_members])
    canonical_cols = header_cells + [f"col_{k}" for k in range(len(header_cells), max_w)]

    def cell_text(value: Any) -> str:
        # Null cells become empty text rather than "None"/"nan".
        if value is None or (isinstance(value, float) and pd.isna(value)):
            return ""
        return str(value)

    rows = []
    prev = h_meta

    for m_idx in data_members:
        m = meta_by_idx[m_idx]

        if m.continuation_content and not rows and prev.is_header_orphan:
            # Nothing but the header so far: the overflow belongs to header cells.
            for cc in m.continuation_content:
                if cc["col_idx"] < len(canonical_cols):
                    canonical_cols[cc["col_idx"]] += " " + cc["value"]
        elif m.continuation_content and rows:
            # Overflow of the last data row already collected.
            for cc in m.continuation_content:
                if cc["col_idx"] < max_w:
                    rows[-1][cc["col_idx"]] += "\n" + cc["value"]

        for _, r in m.df.iterrows():
            vals = [cell_text(v) for v in r.tolist()]
            vals += [""] * (max_w - len(vals))
            rows.append(vals[:max_w])

        prev = m

    return (
        pd.DataFrame(rows, columns=canonical_cols),
        set().union(*(meta_by_idx[i].pages for i in all_members)),
        [],
    )
|
|
615
|
+
|
|
616
|
+
|
|
617
|
+
def _build_generic_merged_table(
    members: list[int], meta_by_idx: dict[int, TableMeta], cfg: MultiPageConfig
) -> tuple[pd.DataFrame, set[int], list[str]]:
    """
    Build the merged table for the general (non-orphan) case.

    The first member supplies the base rows and the canonical columns. Each
    later member is aligned to the current canonical columns with
    ``align_dataframe_to_header`` and concatenated; missing values are
    filled with ``""``. The canonical columns are refreshed after every
    concat because an alignment may have added columns.

    Before a member is appended, its continuation content is folded into the
    last row merged so far, provided that row exists and the page distance
    to the previous member does not exceed ``cfg.max_page_gap``. Content is
    appended after ``cfg.stitch_separator`` when the target cell has text,
    and stored directly when the target cell is empty. It is never dropped,
    and the target cell is stringified first, so a non-string cell cannot
    raise ``TypeError``.

    Args:
        members: Fragment indices in merge order; the first is the base.
        meta_by_idx: Lookup from fragment index to its metadata.
        cfg: Merge configuration.

    Returns:
        ``(merged_df, pages, warnings)``: the merged frame, the union of the
        members' pages, and the warnings collected from alignment.
    """
    base = meta_by_idx[members[0]]
    merged_df = base.df.copy()
    canonical_cols = [str(c) for c in base.df.columns]
    merged_pages = set(base.pages)
    warnings: list[str] = []
    prev = base

    for idx in members[1:]:
        m = meta_by_idx[idx]

        if m.continuation_content and merged_df.shape[0] > 0:
            if (min(m.pages or [0]) - max(prev.pages or [0])) <= cfg.max_page_gap:
                for cc in m.continuation_content:
                    col = cc["col_idx"]
                    if col >= merged_df.shape[1]:
                        continue
                    current = merged_df.iloc[-1, col]
                    addition = str(cc["value"])
                    if is_empty_value(current):
                        # Empty target: keep the content instead of losing it.
                        merged_df.iloc[-1, col] = addition
                    else:
                        merged_df.iloc[-1, col] = str(current) + cfg.stitch_separator + addition

        aligned = align_dataframe_to_header(m.df, canonical_cols, m, cfg)
        warnings.extend(aligned.attrs.get("table_stitcher_warnings", []))
        merged_df = pd.concat([merged_df, aligned], ignore_index=True).fillna("")
        canonical_cols = [str(c) for c in merged_df.columns]
        merged_pages.update(m.pages)
        prev = m

    return merged_df, merged_pages, warnings
|
|
647
|
+
|
|
648
|
+
|
|
649
|
+
# -------------------------------------------------------------------
|
|
650
|
+
# 5. MAIN MERGE FUNCTION
|
|
651
|
+
#
|
|
652
|
+
# The main `merge_multipage_tables` function reads as four named phases:
|
|
653
|
+
# setup → Pass 1 (sequential) → Pass 2 (orphan repair) → build results.
|
|
654
|
+
# Each phase is a helper that takes `_MergeState` plus cfg; state holds
|
|
655
|
+
# the cross-phase data (union-find, index maps, traces).
|
|
656
|
+
# -------------------------------------------------------------------
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
@dataclass
class _MergeState:
    """Mutable bundle of cross-phase data shared by the phases of merge_multipage_tables."""

    # Disjoint-set over positions; _init_merge_state sizes it to len(tables_meta).
    uf: UnionFind
    # The fragments exactly as handed to the merge.
    tables_meta: list[TableMeta]
    # Fragment idx -> its TableMeta.
    meta_by_idx: dict[int, TableMeta]
    # Fragment idx -> position in ``tables_meta``; needed because idx values
    # may be non-contiguous.
    orig_to_pos: dict[int, int]
    # Fragments ordered by (start_page or 0, idx).
    sorted_tables: list[TableMeta]
    # Every fragment idx present in ``tables_meta``.
    extracted_indices: set[int]
    # Starts empty. NOTE(review): presumably maps a spillover fragment to the
    # fragment it spills into — confirm against the code that fills it.
    spillover_targets: dict[int, int] = field(default_factory=dict)
    # MergeTrace records collected during the merge; starts empty.
    decision_traces: list[MergeTrace] = field(default_factory=list)
|
|
671
|
+
|
|
672
|
+
|
|
673
|
+
def _init_merge_state(tables_meta: list[TableMeta]) -> _MergeState:
    """
    Create the ``_MergeState`` for one merge invocation.

    Fragment ``idx`` values may be non-contiguous (extraction can fail for
    some tables), so lookups from idx to list position and from idx to
    metadata are built here alongside the union-find and the page-ordered
    fragment list.
    """

    def document_order(meta: TableMeta) -> tuple[int, int]:
        # Fragments with an unknown page sort as page 0; idx breaks ties.
        return (meta.start_page or 0, meta.idx)

    position_of: dict[int, int] = {}
    by_idx: dict[int, TableMeta] = {}
    for position, meta in enumerate(tables_meta):
        position_of[meta.idx] = position
        by_idx[meta.idx] = meta

    return _MergeState(
        uf=UnionFind(len(tables_meta)),
        tables_meta=tables_meta,
        meta_by_idx=by_idx,
        orig_to_pos=position_of,
        sorted_tables=sorted(tables_meta, key=document_order),
        extracted_indices=set(by_idx),
    )
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
def _classify_sequential_pair(
    tA: TableMeta,
    tB: TableMeta,
    cfg: MultiPageConfig,
) -> tuple[bool, str, bool, list[str]]:
    """
    Classify one pair of fragments that are neighbours in document order.

    The result is ``(should_merge, reason, is_spillover, warnings)``. Nothing
    is unioned or traced here; the caller does that bookkeeping. The
    ``warnings`` element is an empty list on every path of this function.

    Checks run in a fixed order and the first one that fires decides:
    page adjacency, spillover, right-hand header orphan, width guards,
    header-orphan-to-headerless, headerless continuation, and finally
    repeated-header similarity.
    """

    def accept(why: str, spillover: bool = False) -> tuple[bool, str, bool, list[str]]:
        return True, why, spillover, []

    def reject(why: str) -> tuple[bool, str, bool, list[str]]:
        return False, why, False, []

    # --- Page adjacency: tB must start 1..max_page_gap pages after tA ---
    if tA.start_page is None or tB.start_page is None:
        return reject("missing_page")
    pages_apart = tB.start_page - tA.start_page
    if pages_apart < 1 or pages_apart > cfg.max_page_gap:
        return reject("page_gap_out_of_range")

    # --- Spillover comes before the width guards because a 1-column
    #     overflow fragment legitimately follows an N-column table ---
    if is_spillover_fragment(tA, tB, cfg):
        return accept("spillover", spillover=True)

    # --- A header orphan on the right opens the *next* table ---
    if tB.is_header_orphan:
        return reject("right_header_orphan_starts_next_table")

    # --- Width guards ---
    width_diff = abs(tA.width - tB.width)
    if cfg.require_same_width and width_diff > 0:
        return reject("require_same_width")
    if width_diff > cfg.max_width_difference:
        return reject("width_difference_too_large")

    # --- Header orphan followed by headerless data: the data fragment's
    #     width is trusted (parsers may drop empty header cells) ---
    if tA.is_header_orphan and tB.is_headerless:
        return accept("header_orphan_to_headerless")

    # --- Headerless continuation ---
    if tB.is_headerless:
        same_width = tA.width == tB.width
        # If tA is headerless too, equal width alone would match any two
        # unrelated same-width tables, so layout has to agree as well.
        if same_width and (not tA.is_headerless or layout_suggests_continuation(tA, tB, cfg)):
            return accept("headerless_width_match")
        within_tolerance = width_diff <= cfg.headerless_width_tolerance
        if within_tolerance and layout_suggests_continuation(tA, tB, cfg):
            return accept("headerless_width_drift_layout")
        if jaccard(tA.first_row_tokens, tB.first_row_tokens) >= cfg.row_sim_threshold:
            return accept("row_similarity")
        return reject("headerless_no_signal")

    # --- Repeated-header continuation ---
    header_sim = jaccard(tA.header_tokens, tB.header_tokens)
    layout = layout_suggests_continuation(tA, tB, cfg)

    if header_sim >= cfg.header_sim_strict:
        # High similarity is normally sufficient. When each side also has
        # tokens the other lacks, these look like parallel tables sharing a
        # vocabulary rather than a continuation, so layout must confirm.
        looks_parallel = _both_have_unique_header_tokens(tA, tB) and not layout
        if looks_parallel:
            return reject("header_similarity_strict_disjoint_tokens")
        return accept("header_similarity_strict")

    if header_sim >= cfg.header_sim_loose and layout:
        return accept("header_similarity_loose_layout")
    return reject("header_similarity_too_low")
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def _pass1_sequential_merge(state: _MergeState, cfg: MultiPageConfig) -> None:
    """
    First pass: visit each pair of neighbours in ``state.sorted_tables`` and
    union the pair when ``_classify_sequential_pair`` says so.

    A MergeTrace is appended to ``state.decision_traces`` for every pair
    visited, whether or not it merges. Spillover merges are additionally
    noted in ``state.spillover_targets`` (right idx -> left idx).
    """
    ordered = state.sorted_tables
    for tA, tB in zip(ordered, ordered[1:]):
        # Continuity guard: an index strictly between the two that was never
        # extracted means an unknown fragment sits between them, so the pair
        # is not considered for merging.
        if tB.idx - tA.idx > 1:
            between = set(range(tA.idx + 1, tB.idx))
            if not between.issubset(state.extracted_indices):
                missing = sorted(between - state.extracted_indices)
                log.debug(
                    f"Skipping pair {tA.idx}->{tB.idx}: "
                    f"unextracted table(s) {set(missing)} between them"
                )
                skipped = _trace_pair(
                    tA,
                    tB,
                    cfg,
                    False,
                    "unextracted_table_between",
                    [f"unextracted table indices between pair: {missing}"],
                )
                state.decision_traces.append(skipped)
                continue

        verdict = _classify_sequential_pair(tA, tB, cfg)
        should_merge, reason, is_spillover, warnings = verdict
        state.decision_traces.append(_trace_pair(tA, tB, cfg, should_merge, reason, warnings))

        if should_merge:
            if is_spillover:
                state.spillover_targets[tB.idx] = tA.idx
            left_pos = state.orig_to_pos[tA.idx]
            right_pos = state.orig_to_pos[tB.idx]
            state.uf.union(left_pos, right_pos)
            log.debug(f"Merge ({reason}): Table {tB.idx} -> Table {tA.idx}")
|
|
808
|
+
|
|
809
|
+
|
|
810
|
+
def _pass2_orphan_repair(state: _MergeState, cfg: MultiPageConfig) -> None:
    """
    Second pass: look at fragment pairs that sit 1..``cfg.max_page_gap`` pages
    apart and are not yet in the same union-find set, and union those that
    ``should_force_orphan_merge`` approves.

    Fragments without a start page are ignored. A pair is skipped when any
    index strictly between the two is absent from
    ``state.extracted_indices``. A MergeTrace is recorded only for pairs
    that actually merge.
    """
    page_map: dict[int, list[int]] = defaultdict(list)
    for meta in state.tables_meta:
        if meta.start_page is not None:
            page_map[meta.start_page].append(meta.idx)

    for page, earlier in page_map.items():
        for off in range(1, cfg.max_page_gap + 1):
            # .get() never inserts, so the map is not resized mid-iteration.
            later = page_map.get(page + off)
            if later is None:
                continue
            for i in earlier:
                for j in later:
                    posI = state.orig_to_pos[i]
                    posJ = state.orig_to_pos[j]
                    if state.uf.find(posI) == state.uf.find(posJ):
                        continue

                    # Continuity guard: every index strictly between the two
                    # must have been extracted. For neighbouring indices the
                    # range is empty and the check passes trivially.
                    lo, hi = min(i, j), max(i, j)
                    gap_indices = set(range(lo + 1, hi))
                    if not gap_indices.issubset(state.extracted_indices):
                        missing = sorted(gap_indices - state.extracted_indices)
                        log.debug(
                            f"Skipping orphan pair {i}->{j}: "
                            f"unextracted table(s) {set(missing)} between them"
                        )
                        continue

                    tA = state.meta_by_idx[i]
                    tB = state.meta_by_idx[j]
                    should, reason = should_force_orphan_merge(tA, tB, cfg)
                    if not should:
                        continue
                    state.uf.union(posI, posJ)
                    state.decision_traces.append(
                        _trace_pair(tA, tB, cfg, True, reason or "orphans")
                    )
                    log.debug(f"Orphan merge ({reason}): Table {j} -> Table {i}")
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
def _apply_spillover(
    df: pd.DataFrame,
    pgs: set[int],
    spillover_members: list[int],
    meta_by_idx: dict[int, TableMeta],
    cfg: MultiPageConfig,
) -> None:
    """
    Append each spillover fragment's text to the bottom-right cell of ``df``.

    Both ``df`` and ``pgs`` are modified in place; nothing is returned.

    For every idx in ``spillover_members`` the first column of that
    fragment's DataFrame is read top to bottom, missing and blank cells are
    dropped, and the remaining values are joined with
    ``cfg.stitch_separator``. The joined text is appended to the last
    row / last column of ``df`` (or replaces it when that cell is empty or
    missing), and the fragment's pages are added to ``pgs``.

    A fragment is skipped, leaving ``df`` and ``pgs`` untouched, when either
    frame has no rows or no columns, or when the fragment yields no text.

    Raises:
        KeyError: if an idx in ``spillover_members`` is not in ``meta_by_idx``.
    """
    for spill_idx in spillover_members:
        spill_meta = meta_by_idx[spill_idx]
        spill_df = spill_meta.df

        # Nothing to read from, or no cell to write into. Checking the column
        # axis as well avoids an IndexError from iloc on a zero-width frame.
        if 0 in spill_df.shape or 0 in df.shape:
            continue

        pieces: list[str] = []
        for value in spill_df.iloc[:, 0]:
            # Missing cells (NaN / None / NaT) carry no content; without this
            # check str() would stitch the literal text "nan" or "None".
            if pd.isna(value):
                continue
            text = str(value)
            if text.strip():
                pieces.append(text)
        if not pieces:
            continue
        spill_content = cfg.stitch_separator.join(pieces)

        last_row_idx = df.shape[0] - 1
        last_col_idx = df.shape[1] - 1
        raw_val = df.iloc[last_row_idx, last_col_idx]
        current_val = "" if pd.isna(raw_val) else str(raw_val).strip()
        if current_val:
            df.iloc[last_row_idx, last_col_idx] = (
                current_val + cfg.stitch_separator + spill_content
            )
        else:
            df.iloc[last_row_idx, last_col_idx] = spill_content
        pgs.update(spill_meta.pages)
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
def _build_logical_tables(state: _MergeState, cfg: MultiPageConfig) -> list[LogicalTable]:
    """
    Turn each union-find group in ``state`` into one ``LogicalTable``.

    Per group: members are ordered by (start page, idx) and split into
    spillover and normal fragments. The normal fragments are combined via
    the orphan-anchored builder when one of them is a header orphan,
    otherwise via the generic builder. Spillover text is then stitched in,
    split cells are stitched when the table spans more than one page, and
    headers are cleaned. Traces whose two sides both belong to the group are
    attached, together with their warnings and the builder's warnings.

    Groups containing no normal fragment produce no output.
    """

    def document_order(member_idx: int) -> tuple[int, int]:
        return (state.meta_by_idx[member_idx].start_page or 0, member_idx)

    by_root: dict[int, list[int]] = defaultdict(list)
    for meta in state.tables_meta:
        root = state.uf.find(state.orig_to_pos[meta.idx])
        by_root[root].append(meta.idx)

    results: list[LogicalTable] = []
    for idx, grouped in enumerate(by_root.values()):
        members = sorted(grouped, key=document_order)

        normal_members: list[int] = []
        spillover_members: list[int] = []
        for m in members:
            if m in state.spillover_targets:
                spillover_members.append(m)
            else:
                normal_members.append(m)
        if not normal_members:
            continue

        # The first header orphan (if any) anchors the merged table.
        header_orphan_idx = next(
            (m for m in normal_members if state.meta_by_idx[m].is_header_orphan),
            None,
        )
        if header_orphan_idx is None:
            df, pgs, build_warnings = _build_generic_merged_table(
                normal_members, state.meta_by_idx, cfg
            )
        else:
            df, pgs, build_warnings = _build_orphan_merged_table(
                header_orphan_idx, normal_members, state.meta_by_idx
            )

        _apply_spillover(df, pgs, spillover_members, state.meta_by_idx, cfg)

        if len(pgs) > 1:
            df = stitch_split_cells(df, cfg.stitch_separator)
        df = clean_all_headers(df)

        # Keep only the decisions made between two fragments of this group.
        member_set = set(members)
        group_traces = [
            trace
            for trace in state.decision_traces
            if trace.left_idx in member_set and trace.right_idx in member_set
        ]
        merge_reasons = [trace.reason for trace in group_traces if trace.merged]
        group_warnings = [
            *build_warnings,
            *(warning for trace in group_traces for warning in trace.warnings),
        ]

        results.append(
            LogicalTable(
                idx,
                members,
                sorted(pgs),
                df,
                merge_reason="+".join(merge_reasons),
                merge_traces=group_traces,
                warnings=group_warnings,
            )
        )

    return results
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def merge_multipage_tables(
    tables_meta: list[TableMeta],
    cfg: MultiPageConfig,
) -> list[LogicalTable]:
    """
    Merge table fragments into logical tables.

    An empty ``tables_meta`` yields an empty list. Otherwise the work is
    done in four steps:

    1. ``_init_merge_state`` builds the shared state for this run.
    2. ``_pass1_sequential_merge`` unions document-order neighbours
       according to ``_classify_sequential_pair``.
    3. ``_pass2_orphan_repair`` unions header/data orphan pairs that the
       first pass did not join.
    4. ``_build_logical_tables`` converts the resulting groups into
       ``LogicalTable`` objects, which is what this function returns.
    """
    if not tables_meta:
        return []

    state = _init_merge_state(tables_meta)
    # Both passes mutate ``state`` in place; their order matters.
    for run_pass in (_pass1_sequential_merge, _pass2_orphan_repair):
        run_pass(state, cfg)
    return _build_logical_tables(state, cfg)
|