table2rules 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- table2rules/__init__.py +51 -0
- table2rules/__main__.py +85 -0
- table2rules/_core.py +351 -0
- table2rules/cleanup.py +61 -0
- table2rules/errors.py +17 -0
- table2rules/exporters/__init__.py +33 -0
- table2rules/exporters/base.py +41 -0
- table2rules/exporters/rules.py +66 -0
- table2rules/grid_parser.py +487 -0
- table2rules/maze_pathfinder.py +166 -0
- table2rules/models.py +26 -0
- table2rules/py.typed +0 -0
- table2rules/quality_gate.py +186 -0
- table2rules/report.py +155 -0
- table2rules/simple_repair.py +645 -0
- table2rules/spans.py +36 -0
- table2rules-0.4.0.dist-info/METADATA +332 -0
- table2rules-0.4.0.dist-info/RECORD +22 -0
- table2rules-0.4.0.dist-info/WHEEL +5 -0
- table2rules-0.4.0.dist-info/entry_points.txt +2 -0
- table2rules-0.4.0.dist-info/licenses/LICENSE +21 -0
- table2rules-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Rules exporter — the native table2rules format.
|
|
2
|
+
|
|
3
|
+
One self-contained rule per line:
|
|
4
|
+
|
|
5
|
+
<row-path> | <col-path>: <value>
|
|
6
|
+
|
|
7
|
+
Where row-path and col-path join nested header levels with ' > '.
|
|
8
|
+
Full header ancestry on every line so an LLM never loses column
|
|
9
|
+
context across rows. Informed by TIDE (ICLR 2025) and ASTRA (2025).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import List
|
|
15
|
+
|
|
16
|
+
from ..models import LogicRule
|
|
17
|
+
|
|
18
|
+
# Separator between nested header levels inside one path.
PATH_SEP = " > "
# Separator between the row path and the column path (also used for flat rows).
ROW_COL_SEP = " | "


class RulesExporter:
    """Exporter that renders rules as ``<row-path> | <col-path>: <value>`` lines."""

    name = "rules"

    def export_rules(self, rules: List[LogicRule]) -> List[str]:
        """Render ``rules`` as one text line per rule, in position order.

        Rules whose formatted line is empty are skipped. De-duplication is
        origin-aware: a line is dropped only when the most recent line emitted
        for the same non-``None`` ``origin`` is identical (one source cell
        expanded over several positions via rowspan/colspan). Two different
        source cells that happen to render identically (e.g. two rows each
        with ``Qty: 1``) are both kept, because dropping either would silently
        lose data.

        Args:
            rules: Objects exposing ``position``, ``origin``, ``outcome``,
                ``row_headers`` and ``col_headers``.

        Returns:
            The formatted lines, sorted by ``position``.
        """
        ordered = sorted(rules, key=lambda r: r.position)
        lines: List[str] = []
        # Maps an origin to the last line emitted for it.
        seen_by_origin: dict = {}
        for rule in ordered:
            line = self._format_rule(rule)
            if not line:
                continue
            origin = rule.origin
            if origin is not None:
                if seen_by_origin.get(origin) == line:
                    continue
                seen_by_origin[origin] = line
            lines.append(line)
        return lines

    def export_flat(self, cell_rows: List[List[str]]) -> List[str]:
        """Join each row's cells with ``ROW_COL_SEP``; drop all-empty rows.

        Used when no header information is available, so the rows cannot be
        expressed as header-qualified rules.
        """
        return [ROW_COL_SEP.join(row) for row in cell_rows if any(row)]

    @staticmethod
    def _join_path(headers) -> str:
        """Join the non-blank, stripped header levels with ``PATH_SEP``.

        ``None`` (for the whole sequence or a single level) is treated as
        "no header" rather than raising.
        """
        stripped = (h.strip() for h in (headers or ()) if h)
        return PATH_SEP.join(h for h in stripped if h)

    @staticmethod
    def _format_rule(rule: LogicRule) -> str:
        """Format one rule as a single line; return ``""`` for a blank value.

        The line shape depends on which header paths are non-empty:
        ``row | col: value``, ``col: value``, ``row: value`` or just ``value``.
        A ``None`` outcome is treated like an empty one.
        """
        value = (rule.outcome or "").strip()
        if not value:
            return ""
        row_path = RulesExporter._join_path(rule.row_headers)
        col_path = RulesExporter._join_path(rule.col_headers)

        if row_path and col_path:
            return f"{row_path}{ROW_COL_SEP}{col_path}: {value}"
        if col_path:
            return f"{col_path}: {value}"
        if row_path:
            return f"{row_path}: {value}"
        return value
|
|
@@ -0,0 +1,487 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from typing import Any, Dict, List, Optional, cast
|
|
3
|
+
|
|
4
|
+
from bs4 import NavigableString, Tag
|
|
5
|
+
|
|
6
|
+
from .simple_repair import detect_header_block
|
|
7
|
+
from .spans import assert_grid_size, clamped_span
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def clean_text(text: str) -> str:
    """Normalize raw cell text into a single clean line.

    Steps, in order: decode the two most common leftover HTML entities,
    strip residual tags, repair doubled dollar signs before a digit, drop a
    stray ``$`` glued to the end of a word, and collapse whitespace.

    Args:
        text: Raw text; a falsy value yields ``""``.

    Returns:
        The cleaned text with no leading/trailing whitespace and single
        spaces between tokens.
    """
    if not text:
        return ""

    # Basic HTML entity cleanup. "&nbsp;" is decoded before "&amp;" so that an
    # escaped "&amp;nbsp;" is decoded once (to the literal "&nbsp;"), not twice.
    text = text.replace("&nbsp;", " ")
    text = text.replace("&amp;", "&")

    # Strip residual HTML tags if any slipped through
    text = re.sub(r"<[^>]+>", " ", text)

    # 1) Fix double dollar patterns like "$$200,000" or "S$$3,000"
    #    Turn them into "$200,000" or "S$3,000"
    text = re.sub(r"\bS\$\$(\d)", r"S$\1", text)
    text = re.sub(r"\$\$(\d)", r"$\1", text)

    # 2) Remove a trailing standalone "$" after a word/number
    #    e.g. "per Sickness$" -> "per Sickness"
    text = re.sub(r"(\w)\$(\s|$)", r"\1\2", text)

    # Collapse whitespace (\s is Unicode-aware, so a raw U+00A0 collapses too)
    text = re.sub(r"\s+", " ", text).strip()

    return text
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _is_textualish(text: str) -> bool:
|
|
37
|
+
"""Return True when text carries alphabetic descriptor content.
|
|
38
|
+
|
|
39
|
+
Uses ``str.isalpha`` on individual characters so the check is Unicode-aware
|
|
40
|
+
— a cell containing any letter in any writing system (Latin, Cyrillic, CJK,
|
|
41
|
+
Arabic, Devanagari, etc.) counts as textual. This is the single content
|
|
42
|
+
signal the parser relies on; the alphabetic-vs-numeric distinction is
|
|
43
|
+
universal across writing systems ("letters label, digits measure").
|
|
44
|
+
"""
|
|
45
|
+
if not text:
|
|
46
|
+
return False
|
|
47
|
+
return any(ch.isalpha() for ch in text)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def get_row_cells(row, table) -> List:
    """Collect the ``<td>``/``<th>`` cells that logically belong to ``row``.

    Every ``td``/``th`` descendant of ``row`` is considered, so cells that
    ended up nested inside a sibling cell (broken closing tags) are still
    found. A candidate is kept only when its nearest ``<tr>`` ancestor is
    ``row`` itself and its nearest ``<table>`` ancestor is ``table`` itself,
    which filters out cells of nested tables.
    """
    owned = []
    for candidate in row.find_all(["td", "th"]):
        if candidate.find_parent("tr") is not row:
            continue
        if candidate.find_parent("table") is not table:
            continue
        owned.append(candidate)
    return owned
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_cell_text(cell) -> str:
    """Return the cleaned text that belongs directly to ``cell``.

    Walks every string node under ``cell`` and keeps only those whose nearest
    ``<td>``/``<th>`` ancestor is ``cell`` itself. Text of malformed nested
    sibling cells (which can appear after HTML recovery) therefore stays with
    those cells instead of being duplicated here. HTML comments are skipped:
    they are string nodes in the parse tree but are not visible cell content.

    The kept fragments are joined with single spaces and passed through
    ``clean_text``.
    """
    # bs4 is already a dependency of this module; imported locally so the
    # module-level import line does not have to change.
    from bs4 import Comment

    parts: List[str] = []
    for node in cell.descendants:
        if not isinstance(node, NavigableString):
            continue
        # Comment subclasses NavigableString; its content is markup, not text.
        if isinstance(node, Comment):
            continue

        text = str(node).strip()
        if not text:
            continue

        parent = node.parent
        if parent is None:
            continue

        nearest_cell: Optional[Tag]
        if parent.name in ("td", "th"):
            nearest_cell = parent
        else:
            ancestor = parent.find_parent(["td", "th"])
            nearest_cell = ancestor if isinstance(ancestor, Tag) else None

        if nearest_cell is cell:
            parts.append(text)

    return clean_text(" ".join(parts))
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
    """Expand an HTML ``<table>`` into a dense row-by-column grid of cell dicts.

    Only rows and cells owned by ``table`` itself are used; nested tables are
    ignored. ``<thead>``/``<tfoot>`` membership is likewise resolved against
    ``table`` itself, so a section belonging to a nested or an enclosing table
    never influences this table's header detection.

    Phases:
        1. Walk the rows to find the logical column count, honouring
           rowspan/colspan.
        2. Allocate an empty ``rows x max_cols`` grid.
        3. Fill the grid. The origin position of a cell gets the full cell
           dict; every other position covered by its span gets a copy with
           ``rowspan``/``colspan`` of 1, ``is_span_copy=True`` and
           ``origin=(row, col)``.
        3.5. (tables with ``<thead>`` only) Promote dimensional/descriptor
           body columns from ``td`` to ``th`` with ``scope="row"``.
        4. Fill remaining holes with empty ``td`` placeholders (these carry
           only ``text``, ``type``, ``rowspan``, ``colspan``, ``has_thead``).

    Full cell dicts carry: ``text``, ``type``, ``rowspan``, ``colspan``,
    ``scope``, ``is_footer``, ``is_thead``, ``has_thead``, ``is_header_row``
    and ``header_depth``.

    Args:
        table: The ``<table>`` element to parse.

    Returns:
        The grid, or ``[]`` when the table has no rows of its own or no
        columns.

    Note:
        ``assert_grid_size`` is called on the inferred dimensions; presumably
        it raises when the grid would be too large — confirm in ``spans``.
    """

    def _in_own_section(node, section_name: str) -> bool:
        # True when node sits in a <thead>/<tfoot> that belongs to *this*
        # table (same ownership rule already applied to rows and cells).
        section = node.find_parent(section_name)
        return section is not None and section.find_parent("table") is table

    # 1. Get all top-level rows
    all_rows_in_dom = table.find_all("tr")
    actual_rows = []
    for row in all_rows_in_dom:
        if row.find_parent("table") is table:
            actual_rows.append(row)
    if not actual_rows:
        return []

    # --- UNIVERSAL HEADER LOGIC ---

    data_start_row_idx = 0
    # Only a <thead> owned by this table counts; table.find("thead") alone
    # would also match the <thead> of a table nested inside one of our cells.
    thead = next(
        (t for t in table.find_all("thead") if t.find_parent("table") is table),
        None,
    )
    has_thead = isinstance(thead, Tag)

    if isinstance(thead, Tag):
        # --- Logic for tables WITH <thead> ---
        data_start_row_idx = len(thead.find_all("tr", recursive=False))

    else:
        # --- Logic for "Headless" tables (NO <thead>) ---

        # Step 1: Prefer an explicit header row that uses <th>.
        # A column-header row has <th> cells whose scope is NOT 'row':
        # scope='row' marks a row-stub header (row label), not a
        # column label, so those cells cannot make a row qualify as the
        # primary column-header row. Otherwise a single mid-body
        # <th scope='row'> summary row (e.g. an explicit-markup totals
        # line like <tr><th scope="row">Total</th>...</tr>) would be
        # mistaken for the table header.
        def _is_col_header_cell(cell):
            return cell.name == "th" and cell.get("scope") != "row"

        header_row_idx = None
        for idx, row in enumerate(actual_rows):
            cells = get_row_cells(row, table)
            if not cells or len(cells) == 1:
                # Skip empty or title rows
                continue

            has_col_th = any(_is_col_header_cell(cell) for cell in cells)
            all_col_th_or_empty = all(
                _is_col_header_cell(cell) or not cell.get_text(strip=True) for cell in cells
            )

            if has_col_th and all_col_th_or_empty:
                header_row_idx = idx
                break

        if header_row_idx is not None:
            # We have a clear header row made of <th>
            main_header_row_idx = header_row_idx

            # If some cells in this header row span multiple body rows,
            # use the maximum rowspan to determine how many header rows exist.
            cells = get_row_cells(actual_rows[main_header_row_idx], table)
            header_row_span = max(clamped_span(cell.get("rowspan", 1)) for cell in cells) or 1

            # Data starts after the header row span
            data_start_row_idx = main_header_row_idx + header_row_span

        else:
            # Step 2: Fallback – no explicit <th> header row.
            # Re-use the original rowspan-based heuristic on the first cell.
            main_header_row_idx = 0
            header_row_span = 1
            found_header_row = False

            for idx, row in enumerate(actual_rows):
                cells = get_row_cells(row, table)
                if not cells or len(cells) == 1:
                    continue

                first_cell_rowspan = clamped_span(cells[0].get("rowspan", 1))
                if first_cell_rowspan > 1:
                    main_header_row_idx = idx
                    header_row_span = first_cell_rowspan
                    found_header_row = True
                    break

            if found_header_row:
                data_start_row_idx = main_header_row_idx + header_row_span
            else:
                # Step 3: structural fallback via the same universal rule
                # that simple_repair.Fix 4 uses. This path runs only when
                # simple_repair didn't promote (e.g. upstream repairs left
                # the table in a shape Fix 4 couldn't normalize) — it
                # keeps the parser aligned with the repair's structural
                # definition rather than reintroducing a separate
                # "row 0 all non-empty" heuristic. If no header block is
                # confidently identified, leave data_start_row_idx = 0 so
                # every row is treated as data and the confidence gate
                # decides whether to emit rules or flat.
                detection = detect_header_block(actual_rows)
                if detection is not None:
                    data_start_row_idx = detection[0]
                else:
                    data_start_row_idx = 0

    # --- END HEADER HEURISTIC ---

    # --- KEY-VALUE TABLE DETECTION ---
    # Detect simple key-value tables (no thead, 2 columns, th+td pattern)
    # This prevents row headers from being treated as column headers
    is_key_value_table = False
    if not has_thead:
        # Check if ALL rows follow the key-value pattern
        is_key_value_table = True
        for row in actual_rows:
            cells = get_row_cells(row, table)
            # Skip empty rows
            if not cells:
                continue
            # Must have exactly 2 cells
            if len(cells) != 2:
                is_key_value_table = False
                break
            # First must be th, second must be td
            if cells[0].name != "th" or cells[1].name != "td":
                is_key_value_table = False
                break
            # No colspan/rowspan (keep it simple)
            if (
                clamped_span(cells[0].get("colspan", 1)) > 1
                or clamped_span(cells[0].get("rowspan", 1)) > 1
            ):
                is_key_value_table = False
                break
            if (
                clamped_span(cells[1].get("colspan", 1)) > 1
                or clamped_span(cells[1].get("rowspan", 1)) > 1
            ):
                is_key_value_table = False
                break
    # Key-value tables have no header rows — every row is data.
    if is_key_value_table:
        data_start_row_idx = 0
    # --- END KEY-VALUE DETECTION ---

    # Phase 1: Calculate dimensions
    max_cols = 0
    # Keys are (row, col) positions already claimed by some cell's span.
    occupied = {}

    for row_idx, row in enumerate(actual_rows):
        cells = get_row_cells(row, table)
        logical_col = 0

        for cell in cells:
            # Skip positions claimed by a rowspan from an earlier row.
            while (row_idx, logical_col) in occupied:
                logical_col += 1
            rowspan = clamped_span(cell.get("rowspan", 1))
            colspan = clamped_span(cell.get("colspan", 1))
            assert_grid_size(len(actual_rows), logical_col + colspan)
            # A rowspan never extends past the last real row.
            for r in range(min(rowspan, len(actual_rows) - row_idx)):
                for c in range(colspan):
                    occupied[(row_idx + r, logical_col + c)] = True
            logical_col += colspan
        max_cols = max(max_cols, logical_col)

    if max_cols == 0:
        return []

    assert_grid_size(len(actual_rows), max_cols)

    # Clamp inferred structure to valid ranges
    data_start_row_idx = max(0, min(data_start_row_idx, len(actual_rows)))

    # Phase 2: Create empty grid
    grid: List[List[Optional[Dict[str, Any]]]] = [
        [None for _ in range(max_cols)] for _ in range(len(actual_rows))
    ]

    # Phase 3: Fill grid
    for row_idx, row in enumerate(actual_rows):
        cells = get_row_cells(row, table)
        logical_col = 0

        # A row is a "header row" if it's before the data start (headless only)
        is_header_row = (row_idx < data_start_row_idx) and not has_thead

        for cell in cells:
            while logical_col < max_cols and grid[row_idx][logical_col] is not None:
                logical_col += 1
            if logical_col >= max_cols:
                break

            rowspan = clamped_span(cell.get("rowspan", 1))
            colspan = clamped_span(cell.get("colspan", 1))

            # Universal cell_type logic
            cell_type = cell.name

            # Heuristic 1: header row (for headless)
            # Skip this for key-value tables - they don't have header rows
            if is_header_row and cell.name == "td" and not is_key_value_table:
                cell_type = "th"

            # Section membership, scoped to this table's own <tfoot>/<thead>.
            is_footer = _in_own_section(cell, "tfoot")
            is_thead = _in_own_section(cell, "thead")

            # Heuristic 1b: <td> cells inside <thead> are structural headers
            # regardless of tag. Word / Markdown-to-HTML converters, and many
            # CMS outputs, emit <thead><tr><td><b>Header</b></td></tr></thead>
            # — the <thead> wrapper is the authoritative signal. Promote to
            # <th> so downstream header-walking treats these as column
            # headers.
            if cell.name == "td" and is_thead:
                cell_type = "th"

            # Override scope for key-value tables
            cell_scope = cell.get("scope")
            if is_key_value_table and logical_col == 0 and cell_type == "th":
                cell_scope = "row"

            cell_data = {
                "text": extract_cell_text(cell),
                "type": cell_type,
                "rowspan": rowspan,
                "colspan": colspan,
                "scope": cell_scope,
                "is_footer": is_footer,
                "is_thead": is_thead,
                "has_thead": has_thead,
                "is_header_row": is_header_row,
                "header_depth": data_start_row_idx if has_thead else 0,
            }

            for r_offset in range(min(rowspan, len(grid) - row_idx)):
                for c_offset in range(colspan):
                    target_row = row_idx + r_offset
                    target_col = logical_col + c_offset
                    if target_row < len(grid) and target_col < max_cols:
                        if r_offset == 0 and c_offset == 0:
                            grid[target_row][target_col] = cell_data
                        else:
                            # Independent copy for each spanned position,
                            # pointing back at the origin cell.
                            span_ref = {
                                "text": cell_data["text"],
                                "type": cell_data["type"],
                                "rowspan": 1,
                                "colspan": 1,
                                "scope": cell_data.get("scope"),
                                "is_footer": cell_data["is_footer"],
                                "is_thead": cell_data["is_thead"],
                                "has_thead": cell_data["has_thead"],
                                "is_header_row": cell_data["is_header_row"],
                                "is_span_copy": True,
                                "origin": (row_idx, logical_col),
                                "header_depth": cell_data.get("header_depth", 0),
                            }
                            grid[target_row][target_col] = span_ref
            logical_col += colspan

    # Phase 3.5: Promote dimensional body columns to row headers.
    #
    # Universal structural signals:
    #   A) Rowspan signal: leading body columns with rowspan>1 origins are
    #      dimensional/grouping columns.
    #   B) Unlabeled descriptor signal: a body column has non-empty cells but
    #      no non-empty <thead> header text at that column and is text-like.
    #
    # Signal B is guarded to avoid over-promotion: only contiguous descriptor
    # columns from the left edge (or directly after a promoted descriptor
    # column) are promoted.
    if has_thead and data_start_row_idx < len(grid):
        # --- Per-column stats over body origins ---
        body_nonempty = [0] * max_cols
        body_textual = [0] * max_cols
        body_th = [0] * max_cols
        has_thead_text = [False] * max_cols

        for c in range(max_cols):
            for r in range(len(grid)):
                cell = grid[r][c]
                if not cell or cell.get("is_span_copy"):
                    continue
                txt = (cell.get("text") or "").strip()
                if cell.get("is_thead", False):
                    if txt:
                        has_thead_text[c] = True
                    continue
                if r < data_start_row_idx:
                    continue
                if cell.get("is_footer", False):
                    continue
                if not txt:
                    continue
                body_nonempty[c] += 1
                if cell.get("type") == "th":
                    body_th[c] += 1
                if _is_textualish(txt):
                    body_textual[c] += 1

        def _descriptor_like(col: int) -> bool:
            # A column is "descriptor-like" iff strictly more of its
            # non-empty body cells are textual than non-textual. Using a
            # strict-majority count (integer comparison) instead of a
            # fixed ratio keeps the rule deterministic and aligned with
            # the stub-column rule in simple_repair.detect_header_block.
            n = body_nonempty[col]
            if n < 2:
                return False
            return body_textual[col] * 2 > n

        # --- Signal A: rowspan-driven leading dimensional columns ---
        body_dimensional = []
        for c in range(max_cols):
            col_has_rowspan = False
            for r in range(data_start_row_idx, len(grid)):
                cell = grid[r][c]
                if cell and not cell.get("is_span_copy") and cell.get("rowspan", 1) > 1:
                    col_has_rowspan = True
                    break
            if col_has_rowspan:
                body_dimensional.append(c)
            else:
                break  # Stop at first non-dimensional column

        # --- Signal A cap: full-depth header columns (multi-row headers) ---
        if data_start_row_idx >= 2:
            full_depth_count = 0
            for c in range(max_cols):
                cell = grid[0][c]
                if (
                    cell
                    and cell.get("is_thead")
                    and not cell.get("is_span_copy")
                    and cell.get("rowspan", 1) == data_start_row_idx
                ):
                    full_depth_count += 1
                else:
                    break  # Stop at first non-full-depth header
            # Cap body signal at the header-derived count
            body_dimensional = body_dimensional[:full_depth_count]

        promote_cols = set()
        if len(body_dimensional) >= 2:
            promote_cols.update(body_dimensional)
        elif len(body_dimensional) == 1:
            # A lone rowspan column is promoted only when it is unlabeled.
            c0 = body_dimensional[0]
            if not has_thead_text[c0]:
                promote_cols.add(c0)

        # Seed only strong pre-promotions from upstream repair on unlabeled
        # columns. A single summary-row <th> should not turn a labeled data
        # column into a row-header column.
        for c in range(max_cols):
            if (
                body_nonempty[c] >= 2
                and not has_thead_text[c]
                and (body_th[c] / max(1, body_nonempty[c])) >= 0.60
            ):
                promote_cols.add(c)

        # --- Signal B: unlabeled descriptor columns ---
        for c in range(max_cols):
            if c in promote_cols:
                continue
            if has_thead_text[c]:
                continue
            if not _descriptor_like(c):
                continue

            left_is_dimensional = (c == 0) or ((c - 1) in promote_cols)
            left_is_descriptor = (c > 0) and _descriptor_like(c - 1)
            if not (left_is_dimensional or left_is_descriptor):
                continue
            promote_cols.add(c)

        if promote_cols:
            for c in sorted(promote_cols):
                for r in range(data_start_row_idx, len(grid)):
                    cell = grid[r][c]
                    if not cell or cell.get("is_footer", False):
                        continue
                    if cell["type"] == "td" and (cell.get("text") or "").strip():
                        cell["type"] = "th"
                        if not cell.get("scope"):
                            cell["scope"] = "row"

    # Phase 4: Fill gaps
    for r in range(len(grid)):
        for c in range(max_cols):
            if grid[r][c] is None:
                grid[r][c] = {
                    "text": "",
                    "type": "td",
                    "rowspan": 1,
                    "colspan": 1,
                    "has_thead": has_thead,
                }

    # Phase 4 guarantees every cell is non-None; cast for the public type.
    return cast(List[List[Dict[str, Any]]], grid)
|