table2rules 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,645 @@
1
+ import re
2
+
3
+ from bs4 import BeautifulSoup, NavigableString, Tag
4
+
5
+ from .spans import assert_grid_size, clamped_span
6
+
7
+
8
def get_top_level_rows(table):
    """Return the ``<tr>`` elements that belong directly to *table*.

    Every ``<tr>`` below *table* is collected, then only those whose nearest
    enclosing ``<table>`` is *table* itself (identity comparison) are kept,
    which drops rows that live inside nested tables. Document order is
    preserved.
    """
    return [
        candidate
        for candidate in table.find_all("tr")
        if candidate.find_parent("table") is table
    ]
21
+
22
+
23
def _build_logical_grid(rows):
    """Lay DOM rows out on a logical grid that honours rowspan/colspan.

    Returns a ``(grid, origin_cells, max_cols)`` tuple:

    * ``grid[r][c]`` is a dict ``{"nonempty": bool, "origin": (r0, c0),
      "rs": int, "cs": int}`` describing the cell that covers position
      ``(r, c)``. Positions no cell covers keep a placeholder that is empty,
      is its own origin, and has spans of 1.
    * ``origin_cells[(r0, c0)]`` is the DOM cell whose top-left corner sits at
      that position.
    * ``max_cols`` is the logical width of the table.

    ``assert_grid_size`` is called for every cell during the sizing pass, so
    an over-wide layout is rejected before the grid is allocated.
    """
    row_count = len(rows)

    # Pass 1: size the grid. Only occupancy matters here, so a set of
    # (row, col) positions is enough.
    taken = set()
    max_cols = 0
    for r_idx, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"], recursive=False):
            # Skip positions already claimed by a rowspan from an earlier row.
            while (r_idx, col) in taken:
                col += 1
            rs = clamped_span(cell.get("rowspan", 1))
            cs = clamped_span(cell.get("colspan", 1))
            assert_grid_size(row_count, col + cs)
            # A rowspan never extends past the last row of the table.
            depth = min(rs, row_count - r_idx)
            for dr in range(depth):
                for dc in range(cs):
                    taken.add((r_idx + dr, col + dc))
            col += cs
        max_cols = max(max_cols, col)

    # Pass 2: allocate placeholders, then stamp each cell over its extent.
    grid = []
    for r in range(row_count):
        grid.append(
            [{"nonempty": False, "origin": (r, c), "rs": 1, "cs": 1} for c in range(max_cols)]
        )

    origin_cells = {}
    stamped = set()
    for r_idx, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"], recursive=False):
            while col < max_cols and (r_idx, col) in stamped:
                col += 1
            if col >= max_cols:
                break
            rs = clamped_span(cell.get("rowspan", 1))
            cs = clamped_span(cell.get("colspan", 1))
            has_text = bool(cell.get_text(strip=True))
            origin = (r_idx, col)
            origin_cells[origin] = cell
            for tr in range(r_idx, r_idx + min(rs, row_count - r_idx)):
                for tc in range(col, col + cs):
                    if tr < row_count and tc < max_cols:
                        # Fresh dict per position so entries never alias.
                        grid[tr][tc] = {
                            "nonempty": has_text,
                            "origin": origin,
                            "rs": rs,
                            "cs": cs,
                        }
                        stamped.add((tr, tc))
            col += cs
    return grid, origin_cells, max_cols
81
+
82
+
83
def detect_header_block(rows):
    """Infer the leading header block of a table from its shape alone.

    The rule is purely structural: it looks only at spans and at which
    logical cells contain text, never at what the text says.

    Definitions (all on the colspan/rowspan-expanded grid):

    * A *divider* is a row with at most one non-empty logical cell. Dividers
      are ignored by every statistic below.
    * The *first data row* is the first row ``r`` such that
        - logical column 0 is non-empty (a row label),
        - the row has at least two non-empty logical cells,
        - every position ``(r, c)`` is the origin of its own cell (nothing
          is a rowspan copy from above), and
        - every cell in the row has ``rowspan == colspan == 1``.
      Cells are NOT required to all be non-empty: real body rows are often
      gappy, and the span/col-0 conditions are what separate them from
      header rows.
    * *Header rows* are the non-divider rows above the first data row;
      *body rows* are the non-divider rows from the first data row down.
    * A *stub column* is a column that is empty in every header row and
      non-empty in a strict majority of body rows (more filled body rows
      than empty ones). Strict majority, rather than "every body row",
      admits an unlabeled trailing summary row.

    Detection fails when a column is empty in every header row but does not
    qualify as a stub column: the header and body regions then disagree
    about what that column is.

    Consequences for common shapes:

    * Dense tables whose first row has no spans and a non-empty col 0: row 0
      is itself the first data row, so nothing is detected.
    * Multi-row headers with colspan group labels: the span-bearing rows are
      not data rows, so the header block ends at the first span-free row.
    * Headers with an empty top-left cell: the empty col 0 disqualifies the
      header row as a data row, and col 0 is accepted as a stub column.

    Returns:
        ``(first_data_idx, stub_cols, origin_cells, grid)`` on success, where
        ``first_data_idx`` is the index of the first data row, ``stub_cols``
        is a set of column indices, and ``origin_cells`` / ``grid`` are the
        structures produced by ``_build_logical_grid``. ``None`` when the
        table has fewer than three rows, has zero logical width, has no data
        row or its first row is the data row, has no header rows, or fails
        the stub-column consistency check.

    Any exception raised by ``_build_logical_grid`` (it runs a grid-size
    check) propagates unchanged.
    """
    n = len(rows)
    if n < 3:
        return None

    grid, origin_cells, max_cols = _build_logical_grid(rows)
    if max_cols == 0:
        return None

    # Non-empty logical cells per row; used for the divider test throughout.
    nonempty_counts = [sum(1 for cell in row if cell["nonempty"]) for row in grid]

    first_data_idx = None
    for r, row in enumerate(grid):
        if not row[0]["nonempty"] or nonempty_counts[r] < 2:
            continue
        # Each grid row has exactly max_cols entries, so this covers the
        # whole logical width of the row.
        if all(
            cell["origin"] == (r, c) and cell["rs"] == 1 and cell["cs"] == 1
            for c, cell in enumerate(row)
        ):
            first_data_idx = r
            break

    # No data row at all, or row 0 is already a data row: nothing to promote.
    if first_data_idx is None or first_data_idx == 0:
        return None

    header_row_indices = [r for r in range(first_data_idx) if nonempty_counts[r] >= 2]
    if not header_row_indices:
        return None

    # Never empty: the first data row has >= 2 non-empty cells by definition.
    body_rows = [grid[r] for r in range(first_data_idx, n) if nonempty_counts[r] >= 2]

    stub_cols = set()
    for c in range(max_cols):
        if any(grid[r][c]["nonempty"] for r in header_row_indices):
            continue
        # Column is empty in every header row: it must be a stub column.
        # The comparison is a plain count (filled vs. empty), not a ratio.
        filled = sum(1 for body_row in body_rows if body_row[c]["nonempty"])
        empty = len(body_rows) - filled
        if filled <= empty:
            # Header and body disagree geometrically about this column.
            return None
        stub_cols.add(c)

    return first_data_idx, stub_cols, origin_cells, grid
221
+
222
+
223
def simple_repair(html: str) -> str:
    """Apply targeted structural repairs to the first ``<table>`` in *html*.

    Repairs run in the order below (numbers are the historical "Fix N"
    labels used throughout this module):

    0.  Normalise mismatched cell closers (``<td>..</th>``, ``<th>..</td>``).
    9.  Replace nested tables with their flattened text.
    1.  Move a full-width single-cell title row into ``<caption>``.
    1b. Mark full-width ``<th>`` divider rows ``scope="rowgroup"`` and, when
        any exist, promote non-empty col-0 ``<td>`` cells of the other rows
        to ``<th scope="row">``.
    4.  Promote the structurally detected header block to ``<th>`` and
        stub-column body cells to row / rowgroup headers.
    7.  Wrap the leading header-like rows in ``<thead>``.
    8.  Under a multi-row ``<thead>``, promote first-column cells of the
        first ``<tbody>`` to ``<th scope="row">``.
    2.  Promote a col-spanning leading ``<td>`` of a ``<tfoot>`` row to
        ``<th scope="colgroup">``.
    3.  Move wide single-cell "legend" / "footnote" rows into ``<tfoot>``.

    Fix 6 (merging "hanging" description rows into the row above) is
    intentionally disabled: a row with text only in its first cell is far
    more often a section divider than a wrapped label, and merging dividers
    corrupts the neighbouring data rows.

    Args:
        html: Markup containing the table to repair.

    Returns:
        The serialised repaired document. When no ``<table>`` is found, or
        the table has no rows of its own, the input is returned with only
        Fix 0 applied.

    Any exception raised by ``detect_header_block`` propagates unchanged.
    """
    # --- Fix 0: repair mismatched opening/closing cell tags ---
    # <td ...>text</th> and <th ...>text</td> make html.parser nest the
    # following sibling cells inside the unclosed element. [^<]* restricts
    # the match to plain-text content so it never spans across tags.
    html = re.sub(r"(<td\b[^>]*>)([^<]*)</th>", r"\1\2</td>", html)
    html = re.sub(r"(<th\b[^>]*>)([^<]*)</td>", r"\1\2</th>", html)

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table")
    if not isinstance(table, Tag):
        return html

    _inline_nested_tables(table)
    if not get_top_level_rows(table):
        return html

    # Each step re-reads the table's rows, so a step that adds, removes or
    # moves rows cannot leave a later step holding a stale row list.
    _move_title_row_to_caption(soup, table)
    _mark_rowgroup_dividers(table)
    _promote_header_block(table)
    _wrap_leading_headers_in_thead(soup, table)
    _promote_row_headers_under_multirow_thead(table)
    _repair_footer_rows(soup, table)

    return str(soup)


def _row_cells(row):
    """Return the ``<td>`` / ``<th>`` elements that are direct children of *row*."""
    return row.find_all(["td", "th"], recursive=False)


def _inline_nested_tables(table: Tag) -> None:
    """Fix 9: replace every table nested inside *table* with flattened text.

    Each nested table becomes one text node: cells of a row are joined with
    ``", "`` and rows with ``"; "``; empty cells and empty rows are dropped.
    """
    # find_all yields document order, i.e. a table before the tables nested
    # inside it. Walking the result backwards flattens the innermost tables
    # first, so an outer table sees its inner table as plain text instead of
    # also picking up the inner table's <tr> elements as rows of its own
    # (which would emit the inner text twice).
    for nested in reversed(table.find_all("table")):
        if not isinstance(nested, Tag):
            continue
        lines = []
        for row in nested.find_all("tr"):
            if not isinstance(row, Tag):
                continue
            texts = [cell.get_text(strip=True) for cell in _row_cells(row)]
            if any(texts):
                lines.append(", ".join(t for t in texts if t))
        nested.replace_with(NavigableString("; ".join(lines)))


def _move_title_row_to_caption(soup: BeautifulSoup, table: Tag) -> None:
    """Fix 1: turn a leading full-width single-cell row into the ``<caption>``."""
    rows = get_top_level_rows(table)
    title_idx = next((i for i, row in enumerate(rows) if _row_cells(row)), None)
    if title_idx is None:
        return
    cells = _row_cells(rows[title_idx])

    # The first row is a title iff it is a single cell whose colspan covers
    # the full width (>= 2) of the rows after it. Width is counted in DOM
    # cells per row.
    max_later_width = max(
        (len(_row_cells(row)) for row in rows[title_idx + 1 :]), default=0
    )
    if (
        len(cells) != 1
        or max_later_width < 2
        or clamped_span(cells[0].get("colspan", 1)) < max_later_width
    ):
        return

    # Two or more rows sharing the "single full-width <th>" shape are a
    # section-divider series (e.g. "Personal" / "Work"), not a title.
    # Promoting just the first one to a caption would single out one label
    # arbitrarily, so leave the series alone.
    full_width_th_rows = 0
    for row in rows:
        row_cells = _row_cells(row)
        if (
            len(row_cells) == 1
            and row_cells[0].name == "th"
            and clamped_span(row_cells[0].get("colspan", 1)) >= max_later_width
        ):
            full_width_th_rows += 1
    if full_width_th_rows >= 2:
        return

    title_text = cells[0].get_text(strip=True)
    caption = table.find("caption")
    if isinstance(caption, Tag):
        caption.string = title_text
    else:
        caption = soup.new_tag("caption")
        caption.string = title_text
        table.insert(0, caption)
    # Drop the title row together with any cell-less rows that precede it.
    for row in rows[: title_idx + 1]:
        row.decompose()


def _mark_rowgroup_dividers(table: Tag) -> None:
    """Fix 1b: mark full-width ``<th>`` rows as row-group labels.

    A ``<tr>`` whose only cell is a ``<th>`` with a colspan covering the
    widest row is kept and marked ``scope="rowgroup"`` rather than deleted,
    so the group label is not silently lost. If any divider is found, col 0
    of the remaining rows is taken to be the row-label column: non-empty
    ``<td>`` cells there (outside ``<thead>``) become ``<th scope="row">``.
    """
    rows = get_top_level_rows(table)
    max_width = max((len(_row_cells(row)) for row in rows), default=0)
    if max_width < 2:
        return

    # Track dividers by id(): bs4 tags compare by content, so two dividers
    # with identical markup would otherwise be indistinguishable. The ids
    # stay valid because `rows` keeps every row object alive.
    divider_ids = set()
    for row in rows:
        cells = _row_cells(row)
        if len(cells) != 1 or cells[0].name != "th":
            continue
        if clamped_span(cells[0].get("colspan", 1)) >= max_width:
            cells[0]["scope"] = "rowgroup"
            divider_ids.add(id(row))
    if not divider_ids:
        return

    for row in rows:
        if id(row) in divider_ids or row.find_parent("thead") is not None:
            continue
        cells = _row_cells(row)
        if not cells:
            continue
        first = cells[0]
        if first.name == "td" and first.get_text(strip=True):
            first.name = "th"
            if not first.get("scope"):
                first["scope"] = "row"


def _promote_header_block(table: Tag) -> None:
    """Fix 4: promote the detected header block and stub-column labels.

    Skipped when the table already has a ``<thead>`` or when
    ``detect_header_block`` finds no header block. Otherwise:

    * every ``<td>`` in a non-divider row of the header region becomes
      ``<th>`` (so Fix 7 can wrap those rows in ``<thead>``);
    * stub-column body cells become ``<th scope="row">``;
    * a row whose single non-empty logical cell sits in a stub column
      becomes a ``<th scope="rowgroup">`` label.
    """
    if table.find("thead") is not None:
        return
    rows = get_top_level_rows(table)
    if not rows:
        return
    detection = detect_header_block(rows)
    if detection is None:
        return
    first_data_idx, stub_cols, origin_cells, grid = detection

    # Columns holding text, per logical (span-expanded) row. Using the
    # logical view means a single DOM cell with a wide colspan counts as a
    # multi-cell row rather than being mistaken for a divider.
    nonempty_cols = [
        [c for c, cell in enumerate(grid_row) if cell["nonempty"]] for grid_row in grid
    ]
    last = min(len(rows), len(grid))

    # Header region: promote non-divider rows; dividers and empty rows are
    # left alone so they break the contiguous-<th> chain Fix 7 looks for.
    for r_idx in range(first_data_idx):
        if len(nonempty_cols[r_idx]) <= 1:
            continue
        for cell in _row_cells(rows[r_idx]):
            if cell.name == "td":
                cell.name = "th"

    # Stub columns: promote body cells at their origin. Cells whose origin
    # lies in the header region (rowspan reaching down) are skipped.
    for c in stub_cols:
        for r_idx in range(first_data_idx, last):
            origin = grid[r_idx][c]["origin"]
            if origin[0] < first_data_idx:
                continue
            cell = origin_cells.get(origin)
            if cell is None:
                continue
            if cell.name == "td":
                cell.name = "th"
                if not cell.get("scope"):
                    cell["scope"] = "row"

    # Row-group dividers. Start at the end of the contiguous run of
    # promoted header rows, so a divider sitting inside the header region
    # is treated as a body row-group marker rather than a header cell.
    thead_end = 0
    while thead_end < first_data_idx and len(nonempty_cols[thead_end]) >= 2:
        thead_end += 1

    for r_idx in range(thead_end, last):
        cols = nonempty_cols[r_idx]
        if len(cols) != 1 or cols[0] not in stub_cols:
            continue
        origin = grid[r_idx][cols[0]]["origin"]
        if origin[0] < thead_end:
            continue
        cell = origin_cells.get(origin)
        if cell is None:
            continue
        if cell.name == "td":
            cell.name = "th"
        # Set unconditionally: the stub-column pass above may already have
        # turned this cell into <th scope="row">, and a group label is an
        # ancestor of the rows below it, not a peer row label.
        cell["scope"] = "rowgroup"


def _wrap_leading_headers_in_thead(soup: BeautifulSoup, table: Tag) -> None:
    """Fix 7: wrap the leading run of header-like rows in a new ``<thead>``.

    A row is header-like when every cell is a ``<th>`` or has no text. The
    run ends at the first row that is not header-like, at a cell-less row
    once a row with cells has been seen, or at a row carrying a
    ``<th scope="rowgroup">`` (a body section divider). Nothing is wrapped
    if the table already has a ``<thead>`` or if the run would cover every
    row.
    """
    if table.find("thead") is not None:
        return
    rows = get_top_level_rows(table)

    header_rows = []
    seen_non_empty = False
    for row in rows:
        cells = _row_cells(row)
        if not cells:
            if seen_non_empty:
                break  # cell-less row after the header block ends it
            continue  # leading cell-less rows are skipped, never wrapped
        seen_non_empty = True

        if any(cell.name == "th" and cell.get("scope") == "rowgroup" for cell in cells):
            break
        if not all(cell.name == "th" or not cell.get_text(strip=True) for cell in cells):
            break
        header_rows.append(row)

    if not header_rows or len(header_rows) >= len(rows):
        return

    thead = soup.new_tag("thead")
    caption = table.find("caption")
    if isinstance(caption, Tag):
        caption.insert_after(thead)
    else:
        table.insert(0, thead)
    for row in header_rows:
        row.extract()
        thead.append(row)


def _promote_row_headers_under_multirow_thead(table: Tag) -> None:
    """Fix 8: promote first-column body cells when ``<thead>`` has several rows.

    Only the first ``<tbody>`` is visited. The first DOM cell of a row is
    promoted to ``<th scope="row">`` unless the first column of that row is
    covered by a rowspan from a row above.
    """
    thead = table.find("thead")
    if not isinstance(thead, Tag) or len(thead.find_all("tr", recursive=False)) <= 1:
        return
    tbody = table.find("tbody")
    if not isinstance(tbody, Tag):
        return

    active_rowspan = 0  # rows still covered by a first-column rowspan
    for row in tbody.find_all("tr", recursive=False):
        if not isinstance(row, Tag):
            continue
        if active_rowspan > 0:
            # This row's first DOM cell sits at logical col > 0.
            active_rowspan -= 1
            continue
        cells = _row_cells(row)
        if not cells:
            continue
        first = cells[0]
        if not isinstance(first, Tag):
            continue
        if first.name == "td":
            first.name = "th"
            first["scope"] = "row"
        # Track the rowspan even when the cell was already a <th> (e.g.
        # promoted by Fix 4), so the counter stays in sync with the grid.
        rowspan = clamped_span(first.get("rowspan"))
        if rowspan > 1:
            active_rowspan = rowspan - 1


def _repair_footer_rows(soup: BeautifulSoup, table: Tag) -> None:
    """Fixes 2 and 3: repair ``<tfoot>`` headers and relocate legend rows."""
    for row in get_top_level_rows(table):
        cells = _row_cells(row)
        if not cells:
            continue
        lead = cells[0]

        if row.find_parent("tfoot") is not None:
            # Fix 2: a col-spanning leading <td> in the footer is a label
            # (e.g. a totals caption), so mark it as a header.
            if lead.name == "td" and clamped_span(lead.get("colspan", 1)) > 1:
                lead.name = "th"
                lead["scope"] = "colgroup"
            continue

        # Fix 3: a single cell spanning >= 3 columns whose text mentions
        # "legend" or "footnote" belongs in the footer.
        if len(cells) != 1:
            continue
        text = lead.get_text(strip=True).lower()
        if "legend" not in text and "footnote" not in text:
            continue
        if clamped_span(lead.get("colspan", 1)) < 3:
            continue
        tfoot = table.find("tfoot")
        if not isinstance(tfoot, Tag):
            tfoot = soup.new_tag("tfoot")
            table.append(tfoot)
        row.extract()
        tfoot.append(row)
table2rules/spans.py ADDED
@@ -0,0 +1,36 @@
1
+ """Shared span limits for table expansion.
2
+
3
+ Both repair and parsing expand ``rowspan`` / ``colspan`` into logical grid
4
+ positions. Keep all coercion and size checks here so adversarial markup is
5
+ bounded before either phase allocates span-derived structures.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .errors import TableTooLargeError
11
+
12
# Hard limits that bound span expansion for hostile or malformed HTML.
# Ordinary tables stay far below both values.
MAX_SPAN = 1_000
MAX_GRID_CELLS = 1_000_000
15
+
16
+
17
def clamped_span(raw) -> int:
    """Coerce a raw rowspan/colspan attribute to a safe int in [1, MAX_SPAN].

    Args:
        raw: The attribute value as read from the markup; any object is
            accepted (string, number, ``None``, ...).

    Returns:
        ``int(raw)`` clamped to ``[1, MAX_SPAN]``, or ``1`` when *raw* cannot
        be converted to an integer at all. This function never raises for a
        bad value.
    """
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other non-numeric objects.
        # ValueError: non-integer text, NaN.
        # OverflowError: infinite floats.
        return 1
    if value < 1:
        return 1
    return min(value, MAX_SPAN)
28
+
29
+
30
def assert_grid_size(rows: int, cols: int) -> None:
    """Reject a logical grid whose cell count exceeds ``MAX_GRID_CELLS``.

    Args:
        rows: Number of rows in the expanded grid.
        cols: Number of columns in the expanded grid.

    Raises:
        TableTooLargeError: If ``rows * cols`` is greater than
            ``MAX_GRID_CELLS``.
    """
    total_cells = rows * cols
    if total_cells <= MAX_GRID_CELLS:
        return
    raise TableTooLargeError(
        f"expanded grid would be {rows} x {cols} = {total_cells} cells (cap {MAX_GRID_CELLS})"
    )