table-stitcher 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,979 @@
1
+ """
2
+ Core merge engine for table-stitcher.
3
+
4
+ This module is parser-agnostic. It operates exclusively on TableMeta objects
5
+ and pandas DataFrames — it never touches parser-native document objects.
6
+
7
+ Key Principles:
8
+ 1. Sequential merging: Headerless fragments only merge with immediate predecessor
9
+ 2. Width matching: Same column count = same table structure (primary signal)
10
+ 3. Spillover detection: 1-column fragments are cell overflow, not new tables
11
+ 4. New table detection: Fragments with non-matching headers are separate tables
12
+ """
13
+
14
+ import logging
15
+ import re
16
+ import unicodedata
17
+ from collections import defaultdict
18
+ from dataclasses import dataclass, field
19
+ from typing import Any, Optional
20
+
21
+ import pandas as pd
22
+
23
+ from .models import LogicalTable, MergeTrace, MultiPageConfig, TableMeta
24
+
25
+ log = logging.getLogger(__name__)
26
+
27
+
28
+ # -------------------------------------------------------------------
29
+ # 1. UTILITY FUNCTIONS
30
+ # -------------------------------------------------------------------
31
+
32
+
33
def normalize_col_name(col: Any) -> str:
    """Return ``col`` as text, trimmed of surrounding whitespace and lowercased."""
    as_text = str(col)
    trimmed = as_text.strip()
    return trimmed.lower()
36
+
37
+
38
# Script tags for writing systems that do not put whitespace between words
# (the CJK family, Thai, Lao, Khmer, Myanmar, Tibetan). For these, every
# character is treated as its own token, and similarity becomes a
# per-character Jaccard: identical headers yield identical character sets,
# while unrelated headers rarely reach the overlap the strict threshold needs.
#
# The set is deliberately closed. Scripts that separate words with whitespace
# need no entry here because they are tokenized as words; only the
# separator-less family has to be listed explicitly.
_SEPARATORLESS_SCRIPTS: set[str] = set(
    (
        "Han",  # Chinese / Japanese kanji / Korean hanja
        "Hiragana",
        "Katakana",
        "Hangul",
        "Thai",
        "Lao",
        "Khmer",
        "Myanmar",
        "Tibetan",
    )
)
58
+
59
# Ordered (name fragment, script tag) pairs. A character belongs to the first
# pair whose fragment occurs in its Unicode character name, so the order of
# this list is significant. Unicode character names are standardized and
# frozen, which keeps the mapping stable across Python and Unicode releases.
_NAME_TO_SCRIPT: list[tuple[str, str]] = [
    ("CJK", "Han"),
    ("KANGXI", "Han"),  # e.g. U+2F49 "KANGXI RADICAL MOON"
    # For the remaining scripts the name fragment is the upper-cased tag.
    *(
        (script.upper(), script)
        for script in (
            "Hiragana",
            "Katakana",
            "Hangul",
            "Thai",
            "Lao",
            "Khmer",
            "Myanmar",
            "Tibetan",
        )
    ),
]
74
+
75
+
76
def _script_of(ch: str) -> Optional[str]:
    """
    Look up the script tag of a single character.

    Returns the tag of the first ``_NAME_TO_SCRIPT`` entry whose fragment
    occurs in the character's Unicode name. Returns None for ASCII, for code
    points without a name, and for names that match no entry.
    """
    # ASCII never belongs to a listed script; skip the name lookup entirely.
    if ord(ch) >= 128:
        char_name = unicodedata.name(ch, "")
        if char_name:
            return next(
                (tag for fragment, tag in _NAME_TO_SCRIPT if fragment in char_name),
                None,
            )
    return None
87
+
88
+
89
def tokenize(text: str) -> set[str]:
    """
    Extract tokens for Jaccard similarity comparison — script-aware.

    Rules, all structural (no language models, no external dependencies):

    - Characters whose script tag is in ``_SEPARATORLESS_SCRIPTS``: each
      character is its own token.
    - Other alphabetic characters: grouped into runs and lowercased; each
      run is one token.
    - Combining marks (Unicode category ``M*``) that directly follow a
      letter run stay inside that run. Vowel signs, viramas and diacritics
      are not ``isalpha()``, so without this rule words in scripts such as
      Devanagari, Tamil or pointed Arabic/Hebrew — and decomposed (NFD)
      Latin — would be chopped into fragments at every mark.
    - Everything else (digits, punctuation, whitespace, stray marks): a
      boundary only; it ends the current run and produces no token.

    Mixed-script text produces the union of both kinds of tokens.

    Args:
        text: Value to tokenize; it is converted with ``str()`` first.

    Returns:
        The set of distinct tokens found in ``text``.
    """
    tokens: set[str] = set()
    word: list[str] = []

    def flush() -> None:
        # Emit the pending letter run (if any) as one lowercased token.
        if word:
            tokens.add("".join(word).lower())
            word.clear()

    for ch in str(text):
        # Check script BEFORE isalpha: some characters of the separator-less
        # scripts are classed as symbols, not letters, but must still be
        # tokenized per character.
        if _script_of(ch) in _SEPARATORLESS_SCRIPTS:
            flush()
            tokens.add(ch)
        elif ch.isalpha():
            word.append(ch)
        elif word and unicodedata.category(ch).startswith("M"):
            # A combining mark attached to the current word is part of it.
            word.append(ch)
        else:
            # Non-letter boundary (digit, punctuation, whitespace) — flush.
            flush()
    flush()
    return tokens
128
+
129
+
130
def jaccard(a: set[str], b: set[str]) -> float:
    """Return |a ∩ b| / |a ∪ b|, or 0.0 when there is nothing to compare."""
    if not (a or b):
        return 0.0
    shared = a & b
    combined = a | b
    if not combined:
        return 0.0
    return len(shared) / len(combined)
137
+
138
+
139
def is_numeric_like_colnames(cols: list[Any]) -> bool:
    """
    Report whether the column names look auto-generated.

    A name counts as auto-generated when, after trimming and lowercasing, it
    is all digits or starts with ``unnamed:``. The result is True when at
    least 70% of the names qualify; an empty list yields False.
    """
    if not cols:
        return False

    def looks_generated(name: Any) -> bool:
        label = str(name).strip().lower()
        if re.fullmatch(r"\d+", label):
            return True
        return label.startswith("unnamed:")

    generated = sum(1 for name in cols if looks_generated(name))
    return generated / len(cols) >= 0.7
151
+
152
+
153
def first_row_has_number(df: pd.DataFrame) -> bool:
    """Return True when any cell of the first row, rendered as text, contains a digit."""
    if len(df) == 0:
        return False
    first_row = df.iloc[0].tolist()
    return any(re.search(r"\d", str(cell)) is not None for cell in first_row)
159
+
160
+
161
def is_empty_value(val: Any) -> bool:
    """
    Check if a value is empty/null.

    Empty means: ``None``, a string that is blank after stripping, or any
    scalar that pandas regards as missing (``float('nan')``, ``pd.NA``,
    ``pd.NaT``, NumPy NaN scalars of any width). Containers are never
    considered empty by this check.

    Args:
        val: A single cell value.

    Returns:
        True when the value carries no content, False otherwise.
    """
    if val is None:
        return True
    if isinstance(val, str):
        return val.strip() == ""
    # pd.isna() returns an array for list-likes, so only probe true scalars.
    return bool(pd.api.types.is_scalar(val) and pd.isna(val))
170
+
171
+
172
def clean_malformed_header(col: Any) -> str:
    """
    Fix headers like 'Name.Name' -> 'Name'.

    Only a header with exactly one dot whose two halves are equal (ignoring
    case and surrounding whitespace) is collapsed, and the first half is
    returned. A header whose halves are both empty (a lone ``"."``) is left
    untouched rather than collapsed to an empty name.

    Args:
        col: Column label; converted with ``str()`` and stripped first.

    Returns:
        The cleaned header text.
    """
    text = str(col).strip()
    left, dot, right = text.partition(".")
    # `dot` is empty when there is no dot; a dot remaining in `right` means
    # the header has more than one dot and is not the duplicated pattern.
    if dot and "." not in right:
        first, second = left.strip(), right.strip()
        if first and first.lower() == second.lower():
            return first
    return text
180
+
181
+
182
def clean_all_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels have each been passed through ``clean_malformed_header``."""
    renamed = df.copy()
    renamed.columns = list(map(clean_malformed_header, df.columns))
    return renamed
188
+
189
+
190
def _pair_signals(tA: TableMeta, tB: TableMeta, cfg: MultiPageConfig) -> dict[str, Any]:
    """
    Gather the parser-neutral signals recorded with a merge decision.

    The returned dict describes the (left, right) pair: start pages and their
    gap (None when either page is unknown), widths and their absolute
    difference, the headerless/orphan flags of both sides, header and
    first-row Jaccard similarity, and the layout-continuation verdict.
    """
    left_page, right_page = tA.start_page, tB.start_page
    pages_known = left_page is not None and right_page is not None
    gap = right_page - left_page if pages_known else None

    left_width, right_width = tA.width, tB.width

    return {
        "left_page": left_page,
        "right_page": right_page,
        "page_gap": gap,
        "left_width": left_width,
        "right_width": right_width,
        "width_diff": abs(left_width - right_width),
        "left_headerless": tA.is_headerless,
        "right_headerless": tB.is_headerless,
        "left_header_orphan": tA.is_header_orphan,
        "right_header_orphan": tB.is_header_orphan,
        "left_data_orphan": tA.is_data_orphan,
        "right_data_orphan": tB.is_data_orphan,
        "header_similarity": jaccard(tA.header_tokens, tB.header_tokens),
        "row_similarity": jaccard(tA.first_row_tokens, tB.first_row_tokens),
        "layout_continuation": layout_suggests_continuation(tA, tB, cfg),
    }
213
+
214
+
215
def _trace_pair(
    tA: TableMeta,
    tB: TableMeta,
    cfg: MultiPageConfig,
    merged: bool,
    reason: str,
    warnings: Optional[list[str]] = None,
) -> MergeTrace:
    """
    Create the MergeTrace record for one evaluated pair.

    The trace carries both fragment indices, the decision and its reason,
    the signals from ``_pair_signals``, and the warnings (an empty list when
    none are supplied).
    """
    pair_warnings = warnings if warnings else []
    signals = _pair_signals(tA, tB, cfg)
    return MergeTrace(
        left_idx=tA.idx,
        right_idx=tB.idx,
        merged=merged,
        reason=reason,
        signals=signals,
        warnings=pair_warnings,
    )
232
+
233
+
234
+ # -------------------------------------------------------------------
235
+ # 2. UNION-FIND DATA STRUCTURE
236
+ # -------------------------------------------------------------------
237
+
238
+
239
class UnionFind:
    """Disjoint-set forest over the integers ``0 .. n-1``, used to group table fragments."""

    def __init__(self, n: int):
        # Every element starts as the root of its own singleton set.
        self.parent = list(range(n))
        self.rank = [0] * n

    def find(self, x: int) -> int:
        """Return the representative of ``x``'s set, compressing the path walked."""
        # First walk: locate the root.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]
        # Second walk: point every node on the path straight at the root.
        while self.parent[x] != root:
            self.parent[x], x = root, self.parent[x]
        return root

    def union(self, a: int, b: int) -> bool:
        """Merge the sets of ``a`` and ``b``; return False if they were already one set."""
        root_a, root_b = self.find(a), self.find(b)
        if root_a == root_b:
            return False
        # The root with the higher rank absorbs the other; ties keep root_a.
        if self.rank[root_a] >= self.rank[root_b]:
            winner, loser = root_a, root_b
        else:
            winner, loser = root_b, root_a
        self.parent[loser] = winner
        if self.rank[winner] == self.rank[loser]:
            self.rank[winner] += 1
        return True
261
+
262
+
263
+ # -------------------------------------------------------------------
264
+ # 3. MERGE DECISION LOGIC
265
+ # -------------------------------------------------------------------
266
+
267
+
268
def layout_suggests_continuation(tA: TableMeta, tB: TableMeta, cfg: MultiPageConfig) -> bool:
    """
    Decide from vertical position alone whether tB looks like tA continued.

    Coordinates are normalized: 0 is the top of the page, 1 the bottom.
    The answer is True only when tA ends near the bottom of its page
    (``vert_bottom >= cfg.bottom_band_min``) and tB starts near the top of
    its page (``vert_top <= cfg.top_band_max``). It is always False when
    layout hints are disabled or either position is unknown.
    """
    if not cfg.use_layout_hint:
        return False

    bottom_of_a, top_of_b = tA.vert_bottom, tB.vert_top
    if bottom_of_a is None or top_of_b is None:
        return False

    return bottom_of_a >= cfg.bottom_band_min and top_of_b <= cfg.top_band_max
287
+
288
+
289
def _both_have_unique_header_tokens(tA: TableMeta, tB: TableMeta) -> bool:
    """
    Report whether each header token set contains a token the other lacks.

    Mutually exclusive tokens are the signature of *parallel* tables that
    merely share vocabulary, not of one table split across pages: a true
    continuation repeats the header or carries a subset of it, because a
    parser can drop columns on the next page but cannot invent new header
    tokens. When this returns True, header similarity alone is not trusted
    and the caller asks for layout corroboration. Returns False when either
    side has no header tokens.
    """
    left, right = tA.header_tokens, tB.header_tokens
    if left and right:
        only_left = left - right
        only_right = right - left
        return len(only_left) > 0 and len(only_right) > 0
    return False
305
+
306
+
307
def should_force_orphan_merge(h: TableMeta, d: TableMeta, cfg: MultiPageConfig) -> tuple[bool, str]:
    """
    Decide whether header orphan ``h`` and data orphan ``d`` belong together.

    Returns ``(True, reason)`` when both pages are known, ``d`` starts at
    most ``cfg.max_page_gap`` pages after ``h``, their widths differ by at
    most ``cfg.max_width_difference``, and the orphan flags match. The reason
    is ``"orphans"``, with ``"+layout"`` appended when the layout also
    suggests continuation. Otherwise returns ``(False, "")``.
    """
    rejected = (False, "")

    if h.start_page is None or d.start_page is None:
        return rejected
    if (d.start_page - h.start_page) > cfg.max_page_gap:
        return rejected
    if abs(h.width - d.width) > cfg.max_width_difference:
        return rejected
    if not (h.is_header_orphan and d.is_data_orphan):
        return rejected

    suffix = "+layout" if layout_suggests_continuation(h, d, cfg) else ""
    return True, "orphans" + suffix
320
+
321
+
322
def is_spillover_fragment(tA: TableMeta, tB: TableMeta, cfg: MultiPageConfig) -> bool:
    """
    Decide whether tB is cell overflow ("spillover") from tA.

    Structural requirements, all mandatory:
    - tB is headerless and exactly one column wide
    - tA has more than one column
    - both start pages are known and tB starts on the page **immediately**
      after tA (gap of exactly 1)

    The page constraint is intentionally independent of ``cfg.max_page_gap``:
    overflow lands on the very next page, whereas a one-column fragment
    several pages later is far more likely an unrelated small table.

    When ``cfg.spillover_require_content_check`` is off, the structural match
    is sufficient. When it is on, tB must also be non-empty and its first
    cell must look like continuation content (URL-like text, a ticket-style
    ``ABC-123`` pattern) or tB must have at most two rows.
    """
    structural_match = tB.is_headerless and tB.width == 1 and tA.width > 1
    if not structural_match:
        return False
    if tA.start_page is None or tB.start_page is None:
        return False
    if tB.start_page - tA.start_page != 1:
        return False

    if not cfg.spillover_require_content_check:
        return True

    if tB.df.shape[0] == 0:
        return False

    raw_first = str(tB.df.iloc[0, 0])
    lowered = raw_first.lower()
    if "http" in lowered or "://" in lowered:
        return True
    # The ticket pattern is case-sensitive, so it runs on the un-lowered text.
    if re.search(r"[A-Z]+-\d+", raw_first):
        return True
    return tB.row_count <= 2
362
+
363
+
364
+ # -------------------------------------------------------------------
365
+ # 4. TABLE BUILDING (Post-Merge)
366
+ # -------------------------------------------------------------------
367
+
368
+
369
def stitch_split_cells(df: pd.DataFrame, separator: str = "\n") -> pd.DataFrame:
    """
    Fold continuation rows back into the row they overflowed from.

    A row is a continuation when exactly one of its cells is non-empty and
    its first cell is empty. Its text is appended (joined by ``separator``)
    to the same column of the preceding kept row — except URL-like text,
    which goes to the last column whose name mentions content/ref/desc/
    link/url, or to the final column when no such column exists.

    All access is positional: label-based indexing breaks on duplicate
    column names, where ``df[col]`` yields a sub-DataFrame instead of a
    scalar. Frames with at most one row are returned unchanged (same object).
    """
    total_rows = df.shape[0]
    if total_rows <= 1:
        return df

    columns = list(df.columns)
    url_hints = ["content", "ref", "desc", "link", "url"]
    url_columns = [
        pos
        for pos, label in enumerate(columns)
        if any(hint in str(label).lower() for hint in url_hints)
    ]
    # Destination for URL-like continuations; depends only on column names.
    url_target = url_columns[-1] if url_columns else len(columns) - 1

    output_rows = []
    cursor = 0
    while cursor < total_rows:
        current = df.iloc[cursor].tolist()
        cursor += 1

        # Absorb every directly following continuation row into `current`.
        while cursor < total_rows:
            candidate = df.iloc[cursor].tolist()
            filled = [pos for pos, cell in enumerate(candidate) if not is_empty_value(cell)]

            # Not a continuation: several (or no) cells filled, or the first
            # column — the record identifier — is filled, meaning a new record
            # or a category/section row.
            if len(filled) != 1 or not is_empty_value(candidate[0]):
                break

            source_pos = filled[0]
            fragment = str(candidate[source_pos]).strip()

            looks_like_url = "://" in fragment or fragment.lower().startswith("http")
            target_pos = url_target if looks_like_url else source_pos

            existing = current[target_pos]
            if is_empty_value(existing):
                current[target_pos] = fragment
            else:
                current[target_pos] = str(existing).rstrip() + separator + fragment.lstrip()
            cursor += 1

        output_rows.append(current)

    return pd.DataFrame(output_rows, columns=columns)
435
+
436
+
437
# Accepted values of ``cfg.width_overflow_policy``.
_VALID_WIDTH_OVERFLOW_POLICIES = set(
    (
        "fail",
        "merge_tail",
        "preserve_extra",
        "warn_drop",
    )
)
438
+
439
+
440
def _pad_narrow(df: pd.DataFrame, canonical_cols: list[str]) -> pd.DataFrame:
    """
    Widen a too-narrow fragment to the canonical width.

    Works on a copy: empty-string ``_pad_N`` columns are appended on the
    right, then all columns are relabelled to ``canonical_cols``. The result
    carries an empty ``table_stitcher_warnings`` list in ``attrs``.
    """
    padded = df.copy()
    filler_names = [f"_pad_{pos}" for pos in range(df.shape[1], len(canonical_cols))]
    for filler in filler_names:
        padded[filler] = ""
    padded.columns = canonical_cols
    padded.attrs["table_stitcher_warnings"] = []
    return padded
448
+
449
+
450
def _overflow_fail(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    Overflow handler for the ``fail`` policy: never returns.

    Raises:
        ValueError: always, naming the fragment (idx and page when
            available), both widths, and the surplus column labels.
    """
    canonical_width = len(canonical_cols)
    surplus = [str(label) for label in df.columns[canonical_width:]]
    fragment_idx = getattr(source_meta, "idx", None)
    fragment_page = getattr(source_meta, "start_page", None)
    raise ValueError(
        f"Fragment idx={fragment_idx} "
        f"page={fragment_page} has {df.shape[1]} columns, "
        f"wider than canonical width {canonical_width}; extra columns: {surplus}"
    )
459
+
460
+
461
def _overflow_warn_drop(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    Overflow handler for the ``warn_drop`` policy.

    Keeps only the leading canonical-width columns, relabels them, logs a
    warning describing what was dropped, and stores that same warning in the
    result's ``attrs["table_stitcher_warnings"]``.
    """
    canonical_width = len(canonical_cols)
    surplus_count = df.shape[1] - canonical_width
    surplus_labels = [str(label) for label in df.columns[canonical_width:]]
    fragment_idx = getattr(source_meta, "idx", None)
    fragment_page = getattr(source_meta, "start_page", None)

    warning = (
        f"Dropped {surplus_count} trailing column(s) from fragment "
        f"idx={fragment_idx} "
        f"page={fragment_page} "
        f"to fit canonical width {canonical_width}; dropped columns: {surplus_labels}"
    )
    log.warning("align_dataframe_to_header: %s", warning)

    trimmed = df.iloc[:, :canonical_width].copy()
    trimmed.columns = canonical_cols
    trimmed.attrs["table_stitcher_warnings"] = [warning]
    return trimmed
478
+
479
+
480
def _overflow_merge_tail(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    Overflow handler for the ``merge_tail`` policy.

    For every row, the non-empty cells beyond the canonical width are
    stripped, joined with ``cfg.stitch_separator`` and folded into the last
    canonical cell (appended after the existing text, or replacing it when
    that cell is empty). The result has exactly the canonical columns and an
    empty warnings list in ``attrs``.
    """
    width = len(canonical_cols)
    folded_rows = []

    for _, series in df.iterrows():
        cells = list(series.tolist())
        kept = cells[:width]
        overflow = [str(cell).strip() for cell in cells[width:] if not is_empty_value(cell)]
        # Pad defensively so a short row still spans every canonical column.
        kept.extend([""] * (width - len(kept)))

        if overflow and canonical_cols:
            joined = cfg.stitch_separator.join(overflow)
            tail_cell = kept[width - 1]
            if is_empty_value(tail_cell):
                kept[width - 1] = joined
            else:
                kept[width - 1] = str(tail_cell).rstrip() + cfg.stitch_separator + joined

        folded_rows.append(kept)

    result = pd.DataFrame(folded_rows, columns=canonical_cols)
    result.attrs["table_stitcher_warnings"] = []
    return result
504
+
505
+
506
def _overflow_preserve_extra(
    df: pd.DataFrame, canonical_cols: list[str], source_meta: TableMeta, cfg: MultiPageConfig
) -> pd.DataFrame:
    """
    Overflow handler for the ``preserve_extra`` policy (lossless).

    The leading columns take the canonical names; every surplus column is
    kept and renamed ``_extra_N_<origname>`` (``column`` stands in for a
    blank original name). A numeric suffix is added when a name is already
    taken, so the resulting labels never clash.
    """
    result = df.copy()
    taken = {str(label) for label in canonical_cols}
    extra_names: list[str] = []

    surplus = df.columns[len(canonical_cols) :]
    for offset, original in enumerate(surplus):
        stem = f"_extra_{offset}_{str(original).strip() or 'column'}"
        name = stem
        bump = 1
        while name in taken:
            name = f"{stem}_{bump}"
            bump += 1
        taken.add(name)
        extra_names.append(name)

    result.columns = canonical_cols + extra_names
    result.attrs["table_stitcher_warnings"] = []
    return result
525
+
526
+
527
# Dispatch table: width-overflow policy name -> handler for fragments wider
# than the canonical header. All handlers share the signature
# (df, canonical_cols, source_meta, cfg).
_WIDTH_OVERFLOW_HANDLERS = dict(
    preserve_extra=_overflow_preserve_extra,
    warn_drop=_overflow_warn_drop,
    fail=_overflow_fail,
    merge_tail=_overflow_merge_tail,
)
533
+
534
+
535
def align_dataframe_to_header(
    df: pd.DataFrame,
    canonical_cols: list[str],
    source_meta: TableMeta,
    cfg: MultiPageConfig,
) -> pd.DataFrame:
    """
    Fit a fragment to the canonical column structure.

    - Narrower than canonical: right-padded with empty columns.
    - Same width: copied and relabelled.
    - Wider: handled according to ``cfg.width_overflow_policy``:

      - ``preserve_extra`` (default): add trailing ``_extra_N_<origname>`` columns.
      - ``warn_drop``: drop trailing columns and log a warning.
      - ``fail``: raise ``ValueError``.
      - ``merge_tail``: append trailing values into the final canonical cell.

    Raises:
        ValueError: if the configured policy is not a known one (checked
            before any width comparison), or from the ``fail`` handler.
    """
    policy = cfg.width_overflow_policy
    if policy not in _VALID_WIDTH_OVERFLOW_POLICIES:
        raise ValueError(
            "width_overflow_policy must be one of "
            f"{sorted(_VALID_WIDTH_OVERFLOW_POLICIES)}, got {policy!r}"
        )

    actual_width = df.shape[1]
    target_width = len(canonical_cols)

    if actual_width < target_width:
        return _pad_narrow(df, canonical_cols)

    if actual_width > target_width:
        handler = _WIDTH_OVERFLOW_HANDLERS[policy]
        return handler(df, canonical_cols, source_meta, cfg)

    # Widths agree: relabel a copy and attach an empty warnings list.
    relabelled = df.copy()
    relabelled.columns = canonical_cols
    relabelled.attrs["table_stitcher_warnings"] = []
    return relabelled
571
+
572
+
573
def _build_orphan_merged_table(
    header_idx: int, all_members: list[int], meta_by_idx: dict[int, TableMeta]
) -> tuple[pd.DataFrame, set[int], list[str]]:
    """
    Assemble a merged table whose anchor fragment is a header orphan.

    The header comes from the anchor's first row (or its column labels when
    it has no rows) and is extended with ``col_K`` names up to the widest
    member. Each other member contributes its rows as strings, padded or
    truncated to that width. Continuation content of a member is appended to
    the header cells while no data row exists yet (and the previous fragment
    is a header orphan), otherwise to the last data row.

    Returns:
        ``(merged_df, pages, warnings)`` — the union of all members' pages
        and an always-empty warnings list.
    """
    header_meta = meta_by_idx[header_idx]
    header_df = header_meta.df
    if header_df.shape[0] == 0:
        header_source = header_df.columns
    else:
        header_source = header_df.iloc[0].tolist()
    header_cells = [str(cell) for cell in header_source]

    followers = [member for member in all_members if member != header_idx]
    widths = [len(header_cells)]
    widths.extend(meta_by_idx[member].width for member in followers)
    max_w = max(widths)

    canonical_cols = list(header_cells)
    canonical_cols.extend(f"col_{k}" for k in range(len(header_cells), max_w))

    body_rows = []
    previous = header_meta

    for member in followers:
        fragment = meta_by_idx[member]
        carried = fragment.continuation_content

        if carried:
            if body_rows:
                # Data already exists: the content continues the last row.
                for item in carried:
                    if item["col_idx"] < max_w:
                        body_rows[-1][item["col_idx"]] += "\n" + item["value"]
            elif previous.is_header_orphan:
                # No data yet: the content continues the header itself.
                for item in carried:
                    if item["col_idx"] < len(canonical_cols):
                        canonical_cols[item["col_idx"]] += " " + item["value"]

        for _, series in fragment.df.iterrows():
            cells = [str(value) for value in series.tolist()]
            cells += [""] * (max_w - len(cells))
            body_rows.append(cells[:max_w])

        previous = fragment

    merged_pages = set()
    for member in all_members:
        merged_pages.update(meta_by_idx[member].pages)

    return pd.DataFrame(body_rows, columns=canonical_cols), merged_pages, []
615
+
616
+
617
def _build_generic_merged_table(
    members: list[int], meta_by_idx: dict[int, TableMeta], cfg: MultiPageConfig
) -> tuple[pd.DataFrame, set[int], list[str]]:
    """
    Build merged table for the general case.

    The first member is the base: its columns define the canonical header.
    Every later member is aligned to that header and appended; the canonical
    header is refreshed after each append because alignment may add columns.

    Before a member is appended, its continuation content (if any) is
    stitched onto the last merged row, provided the member lies within
    ``cfg.max_page_gap`` of the previous one. Content aimed at a missing or
    blank cell is skipped. The target cell is converted to text before the
    content is appended, so a NaN/None cell or a numeric cell no longer
    raises ``TypeError`` on ``+=``.

    Args:
        members: Fragment indices of one merge group, in merge order.
        meta_by_idx: Lookup from fragment index to its TableMeta.
        cfg: Merge configuration.

    Returns:
        ``(merged_df, pages, warnings)`` — the merged frame, the union of
        the members' pages, and the warnings collected during alignment.
    """
    base = meta_by_idx[members[0]]
    merged_df = base.df.copy()
    canonical_cols = [str(c) for c in base.df.columns]
    merged_pages = set(base.pages)
    warnings: list[str] = []
    prev = base

    for idx in members[1:]:
        m = meta_by_idx[idx]

        if m.continuation_content and merged_df.shape[0] > 0:
            page_gap = min(m.pages or [0]) - max(prev.pages or [0])
            if page_gap <= cfg.max_page_gap:
                for cc in m.continuation_content:
                    col_idx = cc["col_idx"]
                    if col_idx >= merged_df.shape[1]:
                        continue
                    current = merged_df.iloc[-1, col_idx]
                    # Judge emptiness on the raw cell, not its str() form:
                    # str(NaN) == "nan" would look like real text.
                    cell_missing = is_empty_value(current) or (
                        pd.api.types.is_scalar(current) and pd.isna(current)
                    )
                    if cell_missing:
                        continue
                    merged_df.iloc[-1, col_idx] = (
                        str(current) + cfg.stitch_separator + cc["value"]
                    )

        aligned = align_dataframe_to_header(m.df, canonical_cols, m, cfg)
        warnings.extend(aligned.attrs.get("table_stitcher_warnings", []))
        merged_df = pd.concat([merged_df, aligned], ignore_index=True).fillna("")
        canonical_cols = [str(c) for c in merged_df.columns]
        merged_pages.update(m.pages)
        prev = m

    return merged_df, merged_pages, warnings
647
+
648
+
649
+ # -------------------------------------------------------------------
650
+ # 5. MAIN MERGE FUNCTION
651
+ #
652
+ # The main `merge_multipage_tables` function reads as four named phases:
653
+ # setup → Pass 1 (sequential) → Pass 2 (orphan repair) → build results.
654
+ # Each phase is a helper that takes `_MergeState` plus cfg; state holds
655
+ # the cross-phase data (union-find, index maps, traces).
656
+ # -------------------------------------------------------------------
657
+
658
+
659
@dataclass
class _MergeState:
    """Mutable state passed between the phases of merge_multipage_tables."""

    # Union-find over *positional* indices (positions in ``tables_meta``),
    # not over the original ``idx`` values.
    uf: UnionFind
    # The fragments as handed to the merge, in their original order.
    tables_meta: list[TableMeta]
    # Original fragment ``idx`` -> its TableMeta.
    meta_by_idx: dict[int, TableMeta]
    # Original fragment ``idx`` -> position in ``tables_meta``; bridges
    # possibly non-contiguous ``idx`` values to union-find slots.
    orig_to_pos: dict[int, int]
    # Fragments ordered by (start page, idx); a missing page sorts as 0.
    sorted_tables: list[TableMeta]
    # Every ``idx`` present in ``tables_meta``; used by the continuity guards
    # to detect fragments that are absent between two candidates.
    extracted_indices: set[int]
    # Spillover fragment ``idx`` -> ``idx`` of the table it overflowed from
    # (filled in by Pass 1).
    spillover_targets: dict[int, int] = field(default_factory=dict)
    # MergeTrace records appended by the merge passes.
    decision_traces: list[MergeTrace] = field(default_factory=list)
671
+
672
+
673
def _init_merge_state(tables_meta: list[TableMeta]) -> _MergeState:
    """
    Prepare the shared state for one merge run.

    Fragment ``idx`` values may have gaps when extraction failed for some
    tables, so alongside the idx-keyed lookup a positional map is built to
    give each fragment a dense slot in the union-find.
    """
    by_idx: dict[int, TableMeta] = {}
    position_of: dict[int, int] = {}
    for position, table in enumerate(tables_meta):
        position_of[table.idx] = position
        by_idx[table.idx] = table

    def document_order(table: TableMeta) -> tuple[int, int]:
        # Fragments without a page sort as page 0; idx breaks ties.
        return (table.start_page or 0, table.idx)

    return _MergeState(
        uf=UnionFind(len(tables_meta)),
        tables_meta=tables_meta,
        meta_by_idx=by_idx,
        orig_to_pos=position_of,
        sorted_tables=sorted(tables_meta, key=document_order),
        extracted_indices=set(by_idx),
    )
686
+
687
+
688
def _classify_sequential_pair(
    tA: TableMeta,
    tB: TableMeta,
    cfg: MultiPageConfig,
) -> tuple[bool, str, bool, list[str]]:
    """
    Decide whether two adjacent-in-document-order fragments should merge.

    Returns ``(should_merge, reason, is_spillover, warnings)``. The caller
    handles the actual union and trace bookkeeping; this function is pure
    logic over the pair's signals. Keeping it pure makes every merge
    decision independently reviewable.

    The checks run in a fixed order and the first one that applies decides:
    page adjacency, spillover, right-side header orphan, width guards,
    header-orphan + headerless pairing, headerless continuation, and finally
    repeated-header similarity. ``reason`` names the rule that decided.
    Every return path in this function yields an empty ``warnings`` list.
    """
    # --- Page-adjacency guard ---
    if tA.start_page is None or tB.start_page is None:
        return False, "missing_page", False, []
    page_gap = tB.start_page - tA.start_page
    # A gap below 1 means same page (or reversed order): never a continuation.
    if page_gap < 1 or page_gap > cfg.max_page_gap:
        return False, "page_gap_out_of_range", False, []

    # --- Spillover (checked before width guards since spillover can cross
    # width boundaries legitimately: 1-col fragment follows N-col table) ---
    if is_spillover_fragment(tA, tB, cfg):
        return True, "spillover", True, []

    # --- Right-side header orphan starts a new table, not a continuation ---
    if tB.is_header_orphan:
        return False, "right_header_orphan_starts_next_table", False, []

    # --- Width guards ---
    width_diff = abs(tA.width - tB.width)
    if cfg.require_same_width and width_diff > 0:
        return False, "require_same_width", False, []
    if width_diff > cfg.max_width_difference:
        return False, "width_difference_too_large", False, []

    # --- Header orphan on the left + headerless data on the right:
    # trust the data fragment's width (header orphans often have
    # truncated widths from empty cells dropped by the parser). ---
    if tA.is_header_orphan and tB.is_headerless:
        return True, "header_orphan_to_headerless", False, []

    # --- Headerless continuation ---
    if tB.is_headerless:
        if tA.width == tB.width:
            # When tA also has no real header, width alone is not enough —
            # two independent same-width tables would always match. Require
            # layout (tA near page bottom → tB near page top) to confirm the
            # table actually overflowed onto the next page.
            if not tA.is_headerless or layout_suggests_continuation(tA, tB, cfg):
                return True, "headerless_width_match", False, []
        # Small width drift is tolerated only with layout corroboration.
        if width_diff <= cfg.headerless_width_tolerance and layout_suggests_continuation(
            tA, tB, cfg
        ):
            return True, "headerless_width_drift_layout", False, []
        # Last resort for headerless fragments: similar first-row content.
        if jaccard(tA.first_row_tokens, tB.first_row_tokens) >= cfg.row_sim_threshold:
            return True, "row_similarity", False, []
        return False, "headerless_no_signal", False, []

    # --- Repeated-header continuation ---
    header_sim = jaccard(tA.header_tokens, tB.header_tokens)
    layout = layout_suggests_continuation(tA, tB, cfg)

    if header_sim >= cfg.header_sim_strict:
        # Strict path normally trusts similarity alone. But when both sides
        # carry unique tokens, we're seeing parallel tables sharing domain
        # vocabulary (clinical studies, quarterly reports) — a continuation
        # would have identical headers or tB ⊆ tA. Demand layout in that case.
        if _both_have_unique_header_tokens(tA, tB) and not layout:
            return False, "header_similarity_strict_disjoint_tokens", False, []
        return True, "header_similarity_strict", False, []
    # Below the strict threshold, similarity must be backed by layout.
    if header_sim >= cfg.header_sim_loose and layout:
        return True, "header_similarity_loose_layout", False, []
    return False, "header_similarity_too_low", False, []
762
+
763
+
764
def _pass1_sequential_merge(state: _MergeState, cfg: MultiPageConfig) -> None:
    """
    Pass 1: examine each pair of neighbours in document order and union the
    pairs that ``_classify_sequential_pair`` approves.

    A MergeTrace is recorded for every pair, merged or not, so the whole
    decision stream can be audited. Spillover merges are additionally noted
    in ``state.spillover_targets``.
    """
    ordered = state.sorted_tables
    for tA, tB in zip(ordered, ordered[1:]):
        # Continuity guard: an index between tA and tB that was never
        # extracted means an unknown fragment sits between them, and merging
        # across it risks false positives.
        missing = sorted(
            gap_idx
            for gap_idx in range(tA.idx + 1, tB.idx)
            if gap_idx not in state.extracted_indices
        )
        if missing:
            log.debug(
                f"Skipping pair {tA.idx}->{tB.idx}: "
                f"unextracted table(s) {set(missing)} between them"
            )
            skipped = _trace_pair(
                tA,
                tB,
                cfg,
                False,
                "unextracted_table_between",
                [f"unextracted table indices between pair: {missing}"],
            )
            state.decision_traces.append(skipped)
            continue

        should_merge, reason, is_spillover, warnings = _classify_sequential_pair(tA, tB, cfg)
        state.decision_traces.append(_trace_pair(tA, tB, cfg, should_merge, reason, warnings))

        if should_merge:
            if is_spillover:
                state.spillover_targets[tB.idx] = tA.idx
            state.uf.union(state.orig_to_pos[tA.idx], state.orig_to_pos[tB.idx])
            log.debug(f"Merge ({reason}): Table {tB.idx} -> Table {tA.idx}")
808
+
809
+
810
def _pass2_orphan_repair(state: _MergeState, cfg: MultiPageConfig) -> None:
    """
    Pass 2: join header orphans with data orphans that Pass 1 could not
    pair because they are not neighbours in document order.

    Every fragment on a page is compared with every fragment on each of the
    next ``cfg.max_page_gap`` pages. Pairs already in the same group are
    skipped, as are pairs with an unextracted table index between them.
    A trace is recorded only for pairs that are merged.
    """
    members_by_page: dict[int, list[int]] = defaultdict(list)
    for table in state.tables_meta:
        if table.start_page is not None:
            members_by_page[table.start_page].append(table.idx)

    for page, left_members in members_by_page.items():
        for offset in range(1, cfg.max_page_gap + 1):
            # .get() on purpose: plain indexing would insert empty pages
            # into the defaultdict while it is being iterated.
            right_members = members_by_page.get(page + offset)
            if right_members is None:
                continue

            for i in left_members:
                for j in right_members:
                    posI, posJ = state.orig_to_pos[i], state.orig_to_pos[j]
                    if state.uf.find(posI) == state.uf.find(posJ):
                        continue

                    # Same continuity guard as Pass 1.
                    lo, hi = min(i, j), max(i, j)
                    missing = sorted(
                        gap_idx
                        for gap_idx in range(lo + 1, hi)
                        if gap_idx not in state.extracted_indices
                    )
                    if missing:
                        log.debug(
                            f"Skipping orphan pair {i}->{j}: "
                            f"unextracted table(s) {set(missing)} between them"
                        )
                        continue

                    tA, tB = state.meta_by_idx[i], state.meta_by_idx[j]
                    should, reason = should_force_orphan_merge(tA, tB, cfg)
                    if not should:
                        continue

                    state.uf.union(posI, posJ)
                    state.decision_traces.append(
                        _trace_pair(tA, tB, cfg, True, reason or "orphans")
                    )
                    log.debug(f"Orphan merge ({reason}): Table {j} -> Table {i}")
851
+
852
+
853
+ def _apply_spillover(
854
+ df: pd.DataFrame,
855
+ pgs: set[int],
856
+ spillover_members: list[int],
857
+ meta_by_idx: dict[int, TableMeta],
858
+ cfg: MultiPageConfig,
859
+ ) -> None:
860
+ """
861
+ Stitch each spillover fragment's content into the last cell of df
862
+ (in-place). Extracted for readability — the build phase would
863
+ otherwise nest this loop four levels deep.
864
+ """
865
+ for spill_idx in spillover_members:
866
+ spill_meta = meta_by_idx[spill_idx]
867
+ if spill_meta.df.shape[0] == 0 or df.shape[0] == 0:
868
+ continue
869
+ spill_content = cfg.stitch_separator.join(
870
+ str(spill_meta.df.iloc[r, 0])
871
+ for r in range(spill_meta.df.shape[0])
872
+ if str(spill_meta.df.iloc[r, 0]).strip()
873
+ )
874
+ if not spill_content:
875
+ continue
876
+
877
+ last_row_idx = df.shape[0] - 1
878
+ last_col_idx = df.shape[1] - 1
879
+ raw_val = df.iloc[last_row_idx, last_col_idx]
880
+ current_val = "" if pd.isna(raw_val) else str(raw_val).strip()
881
+ if current_val:
882
+ df.iloc[last_row_idx, last_col_idx] = current_val + cfg.stitch_separator + spill_content
883
+ else:
884
+ df.iloc[last_row_idx, last_col_idx] = spill_content
885
+ pgs.update(spill_meta.pages)
886
+
887
+
888
+ def _build_logical_tables(state: _MergeState, cfg: MultiPageConfig) -> list[LogicalTable]:
889
+ """
890
+ Collapse the union-find groups into a list of LogicalTable objects.
891
+ Handles spillover application, orphan-anchor vs generic build paths,
892
+ post-merge cell stitching, and attaches per-group merge traces.
893
+ """
894
+ groups: dict[int, list[int]] = defaultdict(list)
895
+ for t in state.tables_meta:
896
+ groups[state.uf.find(state.orig_to_pos[t.idx])].append(t.idx)
897
+
898
+ results: list[LogicalTable] = []
899
+ for idx, members in enumerate(groups.values()):
900
+ members = sorted(members, key=lambda x: (state.meta_by_idx[x].start_page or 0, x))
901
+
902
+ normal_members = [m for m in members if m not in state.spillover_targets]
903
+ spillover_members = [m for m in members if m in state.spillover_targets]
904
+ if not normal_members:
905
+ continue
906
+
907
+ header_orphan_idx = next(
908
+ (m for m in normal_members if state.meta_by_idx[m].is_header_orphan),
909
+ None,
910
+ )
911
+ if header_orphan_idx is not None:
912
+ df, pgs, build_warnings = _build_orphan_merged_table(
913
+ header_orphan_idx, normal_members, state.meta_by_idx
914
+ )
915
+ else:
916
+ df, pgs, build_warnings = _build_generic_merged_table(
917
+ normal_members, state.meta_by_idx, cfg
918
+ )
919
+
920
+ _apply_spillover(df, pgs, spillover_members, state.meta_by_idx, cfg)
921
+
922
+ if len(pgs) > 1:
923
+ df = stitch_split_cells(df, cfg.stitch_separator)
924
+ df = clean_all_headers(df)
925
+
926
+ member_set = set(members)
927
+ group_traces = [
928
+ tr
929
+ for tr in state.decision_traces
930
+ if tr.left_idx in member_set and tr.right_idx in member_set
931
+ ]
932
+ merge_reasons = [tr.reason for tr in group_traces if tr.merged]
933
+ group_warnings = list(build_warnings)
934
+ for tr in group_traces:
935
+ group_warnings.extend(tr.warnings)
936
+
937
+ results.append(
938
+ LogicalTable(
939
+ idx,
940
+ members,
941
+ sorted(pgs),
942
+ df,
943
+ merge_reason="+".join(merge_reasons),
944
+ merge_traces=group_traces,
945
+ warnings=group_warnings,
946
+ )
947
+ )
948
+
949
+ return results
950
+
951
+
952
def merge_multipage_tables(
    tables_meta: list[TableMeta],
    cfg: MultiPageConfig,
) -> list[LogicalTable]:
    """
    Merge table fragments into logical tables.

    An empty ``tables_meta`` yields an empty list without running any phase.
    Otherwise the engine runs these phases in order:

    1. **Setup** (``_init_merge_state``) — create the shared merge state
       from the fragments.
    2. **Sequential merge** (``_pass1_sequential_merge``) — first union pass
       over the state.
    3. **Orphan repair** (``_pass2_orphan_repair``) — second union pass for
       pairs the first pass did not join.
    4. **Build results** (``_build_logical_tables``) — collapse the
       union-find groups into the returned ``LogicalTable`` objects.

    Returns the list produced by ``_build_logical_tables``.
    """
    logical_tables: list[LogicalTable] = []
    if tables_meta:
        state = _init_merge_state(tables_meta)
        for run_pass in (_pass1_sequential_merge, _pass2_orphan_repair):
            run_pass(state, cfg)
        logical_tables = _build_logical_tables(state, cfg)
    return logical_tables