table2rules 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table2rules/__init__.py +51 -0
- table2rules/__main__.py +85 -0
- table2rules/_core.py +351 -0
- table2rules/cleanup.py +61 -0
- table2rules/errors.py +17 -0
- table2rules/exporters/__init__.py +33 -0
- table2rules/exporters/base.py +41 -0
- table2rules/exporters/rules.py +66 -0
- table2rules/grid_parser.py +487 -0
- table2rules/maze_pathfinder.py +166 -0
- table2rules/models.py +26 -0
- table2rules/py.typed +0 -0
- table2rules/quality_gate.py +186 -0
- table2rules/report.py +155 -0
- table2rules/simple_repair.py +645 -0
- table2rules/spans.py +36 -0
- table2rules-0.4.0.dist-info/METADATA +332 -0
- table2rules-0.4.0.dist-info/RECORD +22 -0
- table2rules-0.4.0.dist-info/WHEEL +5 -0
- table2rules-0.4.0.dist-info/entry_points.txt +2 -0
- table2rules-0.4.0.dist-info/licenses/LICENSE +21 -0
- table2rules-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
4
|
+
|
|
5
|
+
from .spans import assert_grid_size, clamped_span
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_top_level_rows(table):
    """Return the ``<tr>`` elements that belong directly to ``table``.

    Every ``<tr>`` below ``table`` is considered, but a row is kept only when
    its nearest enclosing ``<table>`` is ``table`` itself (identity check), so
    rows of nested tables are left out. Document order is preserved.
    """
    return [
        candidate
        for candidate in table.find_all("tr")
        if candidate.find_parent("table") is table
    ]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _build_logical_grid(rows):
    """Lay the DOM rows out on a logical grid that honours rowspan/colspan.

    Two passes are made over ``rows``. The first only measures the logical
    width (and lets ``assert_grid_size`` reject oversized layouts before
    anything is allocated); the second fills the grid.

    Returns a ``(grid, origin_cells, max_cols)`` tuple where

    * ``grid[r][c]`` is a dict ``{"nonempty": bool, "origin": (r0, c0),
      "rs": int, "cs": int}``; positions no cell covers keep the default
      ``nonempty=False``, ``origin=(r, c)``, ``rs=cs=1``;
    * ``origin_cells[(r0, c0)]`` is the DOM cell that originates at that
      logical position;
    * ``max_cols`` is the logical width of the table.
    """
    row_count = len(rows)

    # Pass 1: place every cell to discover the logical width.
    taken = set()
    max_cols = 0
    for r_idx, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"], recursive=False):
            # Skip positions already claimed by a rowspan from an earlier row.
            while (r_idx, col) in taken:
                col += 1
            rs = clamped_span(cell.get("rowspan", 1))
            cs = clamped_span(cell.get("colspan", 1))
            assert_grid_size(row_count, col + cs)
            # A rowspan never reaches past the last row of the table.
            reach = min(rs, row_count - r_idx)
            taken.update(
                (r_idx + dr, col + dc) for dr in range(reach) for dc in range(cs)
            )
            col += cs
        max_cols = max(max_cols, col)

    # Pass 2: allocate the grid with defaults, then paint each cell's footprint.
    grid = []
    for r in range(row_count):
        grid.append(
            [{"nonempty": False, "origin": (r, c), "rs": 1, "cs": 1} for c in range(max_cols)]
        )

    origin_cells = {}
    painted = set()
    for r_idx, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"], recursive=False):
            while col < max_cols and (r_idx, col) in painted:
                col += 1
            if col >= max_cols:
                break
            rs = clamped_span(cell.get("rowspan", 1))
            cs = clamped_span(cell.get("colspan", 1))
            has_text = bool(cell.get_text(strip=True))
            origin_cells[(r_idx, col)] = cell
            last_row = r_idx + min(rs, row_count - r_idx)
            last_col = min(col + cs, max_cols)
            for tr in range(r_idx, last_row):
                for tc in range(col, last_col):
                    # Each position gets its own dict; "rs"/"cs" record the
                    # clamped attribute values of the originating cell.
                    grid[tr][tc] = {
                        "nonempty": has_text,
                        "origin": (r_idx, col),
                        "rs": rs,
                        "cs": cs,
                    }
                    painted.add((tr, tc))
            col += cs
    return grid, origin_cells, max_cols
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def detect_header_block(rows):
    """Infer the header block of a headless table from its structure alone.

    No content analysis, percentage thresholds, or dataset-specific
    heuristics are used; only the colspan/rowspan-expanded grid shape.

    Definitions (as implemented):

    1. A *divider* row has at most one non-empty logical cell. Dividers are a
       separate structural class and are ignored when classifying header and
       body rows.

    2. The *first data row* is the first row ``r`` such that
         - logical col 0 is non-empty (typical row label),
         - at least two logical cells are non-empty (not a divider),
         - every logical position ``(r, c)`` is an origin cell (no rowspan
           copy from above), and
         - every one of those cells has rowspan == colspan == 1.
       Cells other than col 0 may be empty: real body rows in financial
       tables are often gappy, so "every cell non-empty" is deliberately not
       required.

    3. *Header rows* are the non-divider rows above the first data row;
       *body rows* are the non-divider rows from the first data row down.

    4. A *stub column* is a column that is empty in every header row AND
       non-empty in a strict majority of body rows (more filled than empty —
       a count comparison, not a ratio). The majority form tolerates e.g. an
       unlabeled trailing totals row while still refusing a sparsely filled
       data column.

    Rule: the header block is the prefix ``[0 .. k-1]`` where ``k`` is the
    index of the first data row. Detection is rejected (``None``) when

    * the table has fewer than 3 rows or zero logical columns,
    * no first data row exists, or it is row 0,
    * the prefix contains no header row, or
    * some column is empty in every header row yet fails the stub test —
      header and body then disagree about what that column is.

    Consequences:

    * Dense first-row-header tables: row 0 already satisfies the data-row
      shape, so ``k == 0`` and nothing is promoted; without spans or empties
      row 0 is structurally indistinguishable from the rows below it.
    * Multi-row headers with colspan group labels: span-bearing rows cannot
      be data rows, so the first data row follows the header block.
    * Headers with an empty row-stub label: col 0 is empty in the header and
      filled in the body, so it is a stub column and the empty is permitted.

    Divider rows inside ``[0 .. k-1]`` do not count as header rows but do not
    block detection either; they are left for the caller to handle.

    Returns ``(k, stub_cols, origin_cells, grid)`` on success, else ``None``.
    ``stub_cols`` is a set of logical column indices; ``origin_cells`` and
    ``grid`` are the values produced by ``_build_logical_grid``.
    """
    n = len(rows)
    if n < 3:
        return None

    grid, origin_cells, max_cols = _build_logical_grid(rows)
    if max_cols == 0:
        return None

    # Computed once: the divider / non-divider split is consulted repeatedly.
    nonempty_counts = [sum(1 for cell in row if cell["nonempty"]) for row in grid]

    first_data_idx = None
    for r, row in enumerate(grid):
        if not row[0]["nonempty"]:
            continue
        if nonempty_counts[r] < 2:
            continue
        # Any span, or any position owned by a cell from another row/column,
        # marks the row as header-shaped rather than data-shaped.
        is_clean = all(
            cell["origin"] == (r, c) and cell["rs"] == 1 and cell["cs"] == 1
            for c, cell in enumerate(row)
        )
        if is_clean:
            first_data_idx = r
            break

    if first_data_idx is None or first_data_idx == 0:
        return None

    # Header rows = non-divider rows in the header region.
    header_row_indices = [r for r in range(first_data_idx) if nonempty_counts[r] >= 2]
    if not header_row_indices:
        return None

    # Body rows = non-divider rows from first_data_idx down.
    body_rows = [grid[r] for r in range(first_data_idx, n) if nonempty_counts[r] >= 2]
    if not body_rows:
        return None

    # Classify every column that no header row labels: it must be a stub
    # column (filled in a strict majority of body rows). "Empty in every
    # header" alone would mis-label data columns that a group header happens
    # not to cover, so a column failing the majority test rejects the whole
    # detection instead of being silently ignored.
    stub_cols = set()
    for c in range(max_cols):
        if any(grid[r][c]["nonempty"] for r in header_row_indices):
            continue
        filled = sum(1 for body_row in body_rows if body_row[c]["nonempty"])
        empty = len(body_rows) - filled
        if filled > empty:
            stub_cols.add(c)
        else:
            return None

    return first_data_idx, stub_cols, origin_cells, grid
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def simple_repair(html: str) -> str:
    """Apply targeted structural repairs to the first ``<table>`` in ``html``.

    Repairs, in execution order (numbering is historical):

      0.  Normalise mismatched cell closers (``<td>..</th>`` and vice versa).
      9.  Inline nested tables as flattened text.
      1.  Move a full-width title row into ``<caption>``.
      1b. Mark full-width single-``<th>`` rows as ``scope="rowgroup"`` dividers
          and promote col-0 ``<td>`` labels to ``<th scope="row">``.
      4.  Structural header-block promotion (see ``detect_header_block``).
      7.  Wrap the leading header-like rows in ``<thead>``.
      8.  Promote first-column body cells to row headers under a multi-row
          ``<thead>``.
      2.  Promote a spanning first ``<td>`` in ``<tfoot>`` rows to
          ``<th scope="colgroup">``.
      3.  Move full-width "legend"/"footnote" rows into ``<tfoot>``.

    The former Fix 6 (merging "hanging" description rows) is intentionally
    disabled.

    Returns the serialised repaired document. If no ``<table>`` is found, or
    the table has no top-level rows, the input is returned with only Fix 0
    applied.
    """
    # --- Fix 0: Repair mismatched opening/closing tags ---
    # <td ...>text</th> and <th ...>text</td> cause html.parser to nest
    # subsequent sibling cells inside the unclosed element. Normalise the
    # closing tag to match its opener. [^<]* restricts the match to plain-text
    # content so it never spans across tags.
    html = re.sub(r"(<td\b[^>]*>)([^<]*)</th>", r"\1\2</td>", html)
    html = re.sub(r"(<th\b[^>]*>)([^<]*)</td>", r"\1\2</th>", html)

    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table")
    if not isinstance(table, Tag):
        return html

    # --- Fix 9: Inline nested tables ---
    # Replace <table> elements inside cells with their flattened text so the
    # outer grid parser sees clean content instead of nested markup.
    # find_all() yields document order (outer before inner); walking it in
    # reverse flattens the innermost tables first. Otherwise a table nested
    # two levels deep would contribute its rows twice: once through the
    # enclosing cell's text and once as rows of the outer nested table.
    for nested in reversed(table.find_all("table")):
        if not isinstance(nested, Tag):
            continue
        lines = []
        for row in nested.find_all("tr"):
            if not isinstance(row, Tag):
                continue
            cells = row.find_all(["td", "th"], recursive=False)
            texts = [c.get_text(strip=True) for c in cells]
            if any(texts):
                lines.append(", ".join(t for t in texts if t))
        nested.replace_with(NavigableString("; ".join(lines)))

    actual_rows = get_top_level_rows(table)
    if not actual_rows:
        return html

    # --- Fix 1: Move title row to caption ---
    first_meaningful_row = None
    first_meaningful_row_index = 0
    for idx, row in enumerate(actual_rows):
        if row.find_all(["td", "th"], recursive=False):
            first_meaningful_row = row
            first_meaningful_row_index = idx
            break

    if first_meaningful_row is not None:
        cells = first_meaningful_row.find_all(["td", "th"], recursive=False)
        # Treat the first row as a title iff it is a single cell whose colspan
        # covers the full width of the remaining rows (width >= 2). This
        # captures 2-col and 3-col tables correctly without over-promoting.
        later_widths = [
            len(r.find_all(["td", "th"], recursive=False))
            for r in actual_rows[first_meaningful_row_index + 1 :]
        ]
        max_later_width = max(later_widths, default=0)
        first_cell_span = clamped_span(cells[0].get("colspan", 1))
        # If two or more rows share the "single full-width <th>" shape they
        # are a section-divider series (e.g. <tbody> groups labelled
        # "Personal" / "Work"), not a title. Promoting only the first to a
        # caption would treat one label of the series differently from its
        # siblings.
        sibling_full_width_count = 0
        for r in actual_rows:
            r_cells = r.find_all(["td", "th"], recursive=False)
            if len(r_cells) != 1:
                continue
            only = r_cells[0]
            if only.name != "th":
                continue
            if clamped_span(only.get("colspan", 1)) >= max_later_width and max_later_width >= 2:
                sibling_full_width_count += 1
        is_section_divider_series = sibling_full_width_count >= 2
        if (
            len(cells) == 1
            and max_later_width >= 2
            and first_cell_span >= max_later_width
            and not is_section_divider_series
        ):
            title_text = cells[0].get_text(strip=True)
            caption = table.find("caption")
            if isinstance(caption, Tag):
                # NOTE: an existing caption's content is replaced by the title.
                caption.string = title_text
            else:
                new_caption = soup.new_tag("caption")
                new_caption.string = title_text
                table.insert(0, new_caption)
            # Drop the title row together with any cell-less rows before it.
            for i in range(first_meaningful_row_index + 1):
                actual_rows[i].decompose()
            actual_rows = get_top_level_rows(table)

    # --- Fix 1b: Mark section-divider rows as scope="rowgroup" ---
    # A <tr> whose sole cell is a <th> with colspan covering the grid width is
    # structurally a row-group label (e.g. "Personal" / "Work" delimiting
    # attribute-value blocks, or "Operating Expenses" / "Non-Operating Items"
    # partitioning a P&L statement). The cell is marked rather than deleted:
    # the earlier behaviour (row.decompose()) silently destroyed the group
    # label with no signal that context had been lost (issue #1). Downstream
    # consumers are expected to treat <th scope="rowgroup"> as an ancestor
    # label rather than a column header.
    rowgroup_divider_rows: list = []
    if actual_rows:
        row_widths = [len(r.find_all(["td", "th"], recursive=False)) for r in actual_rows]
        max_width = max(row_widths, default=0)
        if max_width >= 2:
            for row in list(actual_rows):
                cells = row.find_all(["td", "th"], recursive=False)
                if len(cells) != 1:
                    continue
                cell = cells[0]
                if cell.name != "th":
                    continue
                colspan = clamped_span(cell.get("colspan", 1))
                if colspan >= max_width:
                    cell["scope"] = "rowgroup"
                    rowgroup_divider_rows.append(row)
            actual_rows = get_top_level_rows(table)

    # If any rowgroup dividers were marked, col 0 of the surrounding rows is
    # the row-label column by structural implication (the divider partitions
    # row identities, which must live in some column; the canonical placement
    # is col 0). Promote non-empty <td> col-0 cells outside <thead> and
    # outside divider rows to <th scope="row"> so a row-header walk can pick
    # up both the row label and its rowgroup ancestor.
    if rowgroup_divider_rows:
        # Compare by identity: rows with equal markup must not be confused.
        divider_set = {id(r) for r in rowgroup_divider_rows}
        for row in actual_rows:
            if id(row) in divider_set:
                continue
            if row.find_parent("thead") is not None:
                continue
            cells = row.find_all(["td", "th"], recursive=False)
            if not cells:
                continue
            first = cells[0]
            if first.name == "td" and first.get_text(strip=True):
                first.name = "th"
                if not first.get("scope"):
                    first["scope"] = "row"

    # --- Fix 4: Structural header-block promotion ---
    # Only for tables without a <thead>. detect_header_block decides, from
    # grid shape alone, where the header region ends and which columns are
    # row-stub columns; it returns None for dense first-row-header tables, in
    # which case nothing is promoted here.
    #
    # Header-region rows (non-divider) become <th> so Fix 7 can wrap them in
    # <thead>. Stub-column body cells become <th scope="row"> so dimensional
    # columns are recognised even in single-row-header tables (Fix 8 only
    # covers multi-row <thead>).
    if not table.find("thead") and actual_rows:
        detection = detect_header_block(actual_rows)
        if detection is not None:
            first_data_idx, stub_cols, origin_cells, grid = detection
            # Promote header-region rows to <th>, skipping dividers and empty
            # rows (they intentionally break the contiguous-<th> chain that
            # Fix 7 follows). The *logical* non-empty count (colspan-expanded)
            # is used so that a single DOM cell with a wide colspan — e.g. a
            # "(Dollars in thousands)" sub-header — counts as a multi-cell
            # row rather than a divider.
            for r_idx in range(first_data_idx):
                logical_nonempty = sum(1 for cell in grid[r_idx] if cell["nonempty"])
                if logical_nonempty <= 1:
                    continue
                row = actual_rows[r_idx]
                for cell in row.find_all(["td", "th"], recursive=False):
                    if cell.name == "td":
                        cell.name = "th"

            # Promote stub-column body cells to <th scope="row"> at origin.
            for c in stub_cols:
                for r_idx in range(first_data_idx, len(actual_rows)):
                    if r_idx >= len(grid):
                        break
                    origin = grid[r_idx][c]["origin"]
                    # A span copy hanging down from the header region is not
                    # a body cell.
                    if origin[0] < first_data_idx:
                        continue
                    cell = origin_cells.get(origin)
                    if cell is None:
                        continue
                    if cell.name == "td":
                        cell.name = "th"
                        if not cell.get("scope"):
                            cell["scope"] = "row"

            # Row-group divider promotion. A row with exactly one non-empty
            # logical cell whose column is a stub column is a row-group header
            # for the body rows that follow until the next such divider — the
            # year-label pattern ("2014" row between Q1–Q4 blocks). Marking it
            # <th scope="rowgroup"> distinguishes *ancestor* labels (spanning
            # several rows) from the *peer* <th scope="row"> labels above.
            #
            # Start from the end of the contiguous promoted-header prefix
            # (what Fix 7 will wrap into <thead>) so that a divider sitting
            # inside the header region is classified as a body rowgroup
            # marker, not a thead cell.
            thead_end = 0
            for r in range(first_data_idx):
                if sum(1 for c in grid[r] if c["nonempty"]) >= 2:
                    thead_end = r + 1
                else:
                    break
            for r_idx in range(thead_end, len(actual_rows)):
                if r_idx >= len(grid):
                    break
                row = grid[r_idx]
                non_empty_cols = [c for c in range(len(row)) if row[c]["nonempty"]]
                if len(non_empty_cols) != 1:
                    continue
                only_col = non_empty_cols[0]
                if only_col not in stub_cols:
                    continue
                origin = row[only_col]["origin"]
                if origin[0] < thead_end:
                    continue
                cell = origin_cells.get(origin)
                if cell is None:
                    continue
                if cell.name == "td":
                    cell.name = "th"
                # Unconditional: the stub-column pass above may already have
                # turned this cell into <th scope="row">.
                cell["scope"] = "rowgroup"

    # --- Fix 7: Wrap header rows in <thead> ---
    # If the table lacks <thead>, collect the contiguous leading rows that are
    # "header-like" (every cell is <th> or empty) and wrap them in <thead> so
    # downstream logic can rely on <thead> membership to find column headers.
    if not table.find("thead") and actual_rows:
        header_rows = []
        seen_non_empty = False

        for row in actual_rows:
            cells = row.find_all(["td", "th"], recursive=False)

            if not cells:
                if seen_non_empty:
                    # Cell-less row after the header block ends detection.
                    break
                # Leading cell-less rows are ignored, not treated as headers.
                continue

            seen_non_empty = True

            # A row carrying a <th scope="rowgroup"> is a body section divider
            # (Fix 1b / Fix 4 marker), not a thead row. Stop here so the
            # divider stays in the body; pulling it into <thead> would
            # mis-classify it as a column header.
            is_rowgroup_divider = any(
                cell.name == "th" and cell.get("scope") == "rowgroup" for cell in cells
            )
            if is_rowgroup_divider:
                break

            is_header_like = all(
                cell.name == "th" or not cell.get_text(strip=True) for cell in cells
            )
            if not is_header_like:
                # Stop at the first non-header row.
                break
            header_rows.append(row)

        # Only wrap if header rows were found and they are not ALL the rows.
        if header_rows and len(header_rows) < len(actual_rows):
            new_thead = soup.new_tag("thead")

            # Insert thead at the start of the table (after caption if present).
            caption = table.find("caption")
            if isinstance(caption, Tag):
                caption.insert_after(new_thead)
            else:
                table.insert(0, new_thead)

            for row in header_rows:
                row.extract()
                new_thead.append(row)

            actual_rows = get_top_level_rows(table)

    # --- Fix 8: Promote row headers based on <thead> structure ---
    # A multi-row <thead> (hierarchical column headers) suggests a dimensional
    # table whose first column holds row identifiers. Promote the first DOM
    # cell of each row in the first <tbody> to <th scope="row">, skipping rows
    # whose first column is covered by a rowspan from a previous row.
    thead = table.find("thead")
    if isinstance(thead, Tag):
        header_depth = len(thead.find_all("tr", recursive=False))

        if header_depth > 1:
            tbody = table.find("tbody")
            if isinstance(tbody, Tag):
                active_rowspan = 0  # rows still covered by a first-column rowspan

                for row in tbody.find_all("tr", recursive=False):
                    if not isinstance(row, Tag):
                        continue
                    cells = row.find_all(["td", "th"], recursive=False)

                    if active_rowspan > 0:
                        # First column is covered by a rowspan from above.
                        active_rowspan -= 1
                        continue
                    if not cells:
                        continue

                    first = cells[0]
                    if not isinstance(first, Tag):
                        continue
                    if first.name == "td":
                        first.name = "th"
                        first["scope"] = "row"
                    # Track the rowspan even when the cell was already a <th>
                    # (Fix 4 may have pre-promoted it); otherwise a cell at
                    # logical col > 0 in a following row would be mistaken for
                    # the first-column cell.
                    rowspan = clamped_span(first.get("rowspan"))
                    if rowspan > 1:
                        active_rowspan = rowspan - 1

    # --- Fix 6: Merge "Hanging" Description Rows (disabled) ---
    # The pattern it targeted — text in only the first cell, following a fully
    # populated row — is far more often a SECTION DIVIDER (scientific /
    # financial sub-group header) than a wrapped label, and merging dividers
    # corrupts adjacent data rows (observed on PubTabNet, HiTab). The safer
    # default is to not merge; the case where merging helped (wrapped
    # descriptions on narrow tables) has not resurfaced.

    # --- Fix 2, 3: Iterate remaining rows ---
    actual_rows = get_top_level_rows(table)

    for row in actual_rows:
        cells = row.find_all(["td", "th"], recursive=False)
        if not cells:
            continue

        # --- Fix 2: Fix <tfoot> row headers ---
        if row.find_parent("tfoot") and cells[0].name == "td":
            if clamped_span(cells[0].get("colspan", 1)) > 1:
                cells[0].name = "th"
                cells[0]["scope"] = "colgroup"

        # --- Fix 3: Move footer legends to tfoot ---
        if not row.find_parent("tfoot") and len(cells) == 1:
            text = cells[0].get_text(strip=True).lower()
            if "legend" in text or "footnote" in text:
                colspan = clamped_span(cells[0].get("colspan", 1))
                if colspan >= 3:
                    tfoot = table.find("tfoot")
                    if not isinstance(tfoot, Tag):
                        tfoot = soup.new_tag("tfoot")
                        table.append(tfoot)
                    row.extract()
                    tfoot.append(row)

    return str(soup)
|
table2rules/spans.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Shared span limits for table expansion.
|
|
2
|
+
|
|
3
|
+
Both repair and parsing expand ``rowspan`` / ``colspan`` into logical grid
|
|
4
|
+
positions. Keep all coercion and size checks here so adversarial markup is
|
|
5
|
+
bounded before either phase allocates span-derived structures.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .errors import TableTooLargeError
|
|
11
|
+
|
|
12
|
+
# Guards against adversarial HTML. Normal tables never approach these.
MAX_SPAN = 1000
MAX_GRID_CELLS = 1_000_000


def clamped_span(raw) -> int:
    """Coerce a raw rowspan/colspan attribute to a safe int in [1, MAX_SPAN].

    ``raw`` may be anything an attribute lookup can yield (a string, an int,
    ``None``, ...). Values that ``int()`` cannot convert fall back to 1, as do
    values below 1; values above ``MAX_SPAN`` are capped at ``MAX_SPAN``.
    This function never raises for unparsable input.
    """
    try:
        value = int(raw)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers int(float("inf")): an infinite span is just
        # another unparsable value, so it gets the same fallback.
        return 1
    return min(max(value, 1), MAX_SPAN)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def assert_grid_size(rows: int, cols: int) -> None:
    """Reject a logical grid whose cell count exceeds ``MAX_GRID_CELLS``.

    Returns ``None`` when ``rows * cols`` is within the cap; otherwise raises
    ``TableTooLargeError`` describing the offending shape.
    """
    total_cells = rows * cols
    if total_cells <= MAX_GRID_CELLS:
        return
    raise TableTooLargeError(
        f"expanded grid would be {rows} x {cols} = {total_cells} cells (cap {MAX_GRID_CELLS})"
    )
|