table2rules 0.4.0__tar.gz → 0.5.0__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- {table2rules-0.4.0/src/table2rules.egg-info → table2rules-0.5.0}/PKG-INFO +1 -1
- {table2rules-0.4.0 → table2rules-0.5.0}/pyproject.toml +1 -1
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/grid_parser.py +5 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/simple_repair.py +133 -1
- {table2rules-0.4.0 → table2rules-0.5.0/src/table2rules.egg-info}/PKG-INFO +1 -1
- {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_public_api.py +49 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/LICENSE +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/README.md +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/setup.cfg +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/__init__.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/__main__.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/_core.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/cleanup.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/errors.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/exporters/__init__.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/exporters/base.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/exporters/rules.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/maze_pathfinder.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/models.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/py.typed +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/quality_gate.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/report.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/spans.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/SOURCES.txt +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/dependency_links.txt +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/entry_points.txt +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/requires.txt +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/top_level.txt +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_correctness_oracle.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_determinism.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_regression_golds.py +0 -0
- {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_robustness_mutations.py +0 -0
|
@@ -78,6 +78,11 @@ def extract_cell_text(cell) -> str:
|
|
|
78
78
|
if parent is None:
|
|
79
79
|
continue
|
|
80
80
|
|
|
81
|
+
# Skip text content of <style> and <script> tags embedded in cells
|
|
82
|
+
# (Wikipedia injects inline <style> blocks for multi-column templates).
|
|
83
|
+
if isinstance(parent, Tag) and parent.name in ("style", "script"):
|
|
84
|
+
continue
|
|
85
|
+
|
|
81
86
|
nearest_cell: Optional[Tag]
|
|
82
87
|
if parent.name in ("td", "th"):
|
|
83
88
|
nearest_cell = parent
|
|
@@ -122,6 +122,46 @@ def detect_header_block(rows):
|
|
|
122
122
|
[0..k-1] range and are left as-is so the downstream thead-wrap naturally
|
|
123
123
|
excludes them via Fix 7's contiguous-<th> chain.
|
|
124
124
|
|
|
125
|
+
Two further structural witnesses extend "clean data row" to disqualify
|
|
126
|
+
rows that look header-shaped relative to the body:
|
|
127
|
+
|
|
128
|
+
* **Fuller-than-body**: row r's non-empty cell count is strictly greater
|
|
129
|
+
than the minimum non-empty count of the non-divider rows below it,
|
|
130
|
+
AND every column where row r is non-empty has at least one body
|
|
131
|
+
row that fills the same column. A row that fills more columns
|
|
132
|
+
than at least one body row is naming columns the body sometimes
|
|
133
|
+
leaves empty — the structural signature of a column-header row
|
|
134
|
+
above an implicit-rowspan group-label column, a multi-stub
|
|
135
|
+
indentation pyramid, or an alternating coefficient/std-error
|
|
136
|
+
layout. Comparing to the minimum (rather than median or mean)
|
|
137
|
+
is intentional: the structural distinction is "exists a body
|
|
138
|
+
row with fewer non-empty cells than row r," and strictly more
|
|
139
|
+
central statistics misfire on tables where a slim majority of body
|
|
140
|
+
rows match row r's fullness. The body-coverage clause excludes
|
|
141
|
+
receipts whose row 0 is a 4-cell line item followed by 2-cell
|
|
142
|
+
totals — there cols 2 and 3 are filled only in row 0, the body
|
|
143
|
+
never uses them, and promoting row 0 to header would erase the
|
|
144
|
+
line-item data. Uniform-dense tables stay out because their min
|
|
145
|
+
equals row r's count.
|
|
146
|
+
|
|
147
|
+
* **Cell-type inversion**: row r contains at least one corner-stub
|
|
148
|
+
``<td>`` — a column where row r is ``<td>`` while the body majority
|
|
149
|
+
is ``<th>`` — and a strict majority of compared columns invert.
|
|
150
|
+
The corner-stub clause is load-bearing: an all-``<th>`` row above
|
|
151
|
+
an all-``<td>`` body inverts every column too, but that's the
|
|
152
|
+
*normal* header pattern (and Fix 7 already wraps it in ``<thead>``
|
|
153
|
+
via the contiguous-``<th>`` chain). Inversion is only the right
|
|
154
|
+
witness when row r has a stray ``<td>`` corner cell that the body
|
|
155
|
+
makes a ``<th scope="row">`` (e.g., header row ``[td, th, th]``
|
|
156
|
+
above body rows ``[th, td, td]``). Universal: cell tags are
|
|
157
|
+
markup, not content.
|
|
158
|
+
|
|
159
|
+
Both extended witnesses apply only at r == 0 — the literal first
|
|
160
|
+
row. Beyond row 0, "fuller than the body below" describes regular
|
|
161
|
+
data rows (regression group labels, alternating coefficient/std-err
|
|
162
|
+
pairs) and "first row with col 0 non-empty" is already the existing
|
|
163
|
+
stub-column header pattern handled by the rest of the function.
|
|
164
|
+
|
|
125
165
|
Returns (k, stub_cols, origin_cells, grid) on success, or None.
|
|
126
166
|
"""
|
|
127
167
|
n = len(rows)
|
|
@@ -132,6 +172,85 @@ def detect_header_block(rows):
|
|
|
132
172
|
if max_cols == 0:
|
|
133
173
|
return None
|
|
134
174
|
|
|
175
|
+
# Pre-compute per-row non-empty counts (logical, colspan-expanded).
|
|
176
|
+
nonempty_counts = [sum(1 for c in row if c["nonempty"]) for row in grid]
|
|
177
|
+
|
|
178
|
+
def _body_min_nonempty(after_r: int) -> int:
|
|
179
|
+
"""Minimum non-empty count among non-divider rows strictly after
|
|
180
|
+
``after_r``. Returns -1 when no qualifying body row exists
|
|
181
|
+
(caller treats that as "no body to compare against" and skips
|
|
182
|
+
the witness).
|
|
183
|
+
"""
|
|
184
|
+
counts = [nonempty_counts[r2] for r2 in range(after_r + 1, n) if nonempty_counts[r2] >= 2]
|
|
185
|
+
if not counts:
|
|
186
|
+
return -1
|
|
187
|
+
return min(counts)
|
|
188
|
+
|
|
189
|
+
def _row_cols_covered_by_body(r: int) -> bool:
|
|
190
|
+
"""True iff every column where row r is non-empty has at least one
|
|
191
|
+
non-divider body row below that fills the same column. Excludes
|
|
192
|
+
receipts whose row 0 fills columns the body never uses (line
|
|
193
|
+
item over totals)."""
|
|
194
|
+
for c in range(max_cols):
|
|
195
|
+
if not grid[r][c]["nonempty"]:
|
|
196
|
+
continue
|
|
197
|
+
covered = False
|
|
198
|
+
for r2 in range(r + 1, n):
|
|
199
|
+
if nonempty_counts[r2] < 2:
|
|
200
|
+
continue
|
|
201
|
+
if grid[r2][c]["nonempty"]:
|
|
202
|
+
covered = True
|
|
203
|
+
break
|
|
204
|
+
if not covered:
|
|
205
|
+
return False
|
|
206
|
+
return True
|
|
207
|
+
|
|
208
|
+
def _is_inverted_relative_to_body(r: int) -> bool:
|
|
209
|
+
"""True iff row r contains a corner-stub ``<td>`` and inverts the
|
|
210
|
+
body majority at a strict majority of compared columns. Only
|
|
211
|
+
origin cells with non-empty text on both sides participate.
|
|
212
|
+
|
|
213
|
+
The corner-stub clause requires ≥1 column where row r is ``<td>``
|
|
214
|
+
but body majority is ``<th>``. Without it, an all-``<th>`` row
|
|
215
|
+
above an all-``<td>`` body would also "invert" every column — but
|
|
216
|
+
that's the normal header pattern, already handled by Fix 7's
|
|
217
|
+
contiguous-``<th>`` thead wrap.
|
|
218
|
+
"""
|
|
219
|
+
has_corner_stub = False
|
|
220
|
+
inverted = 0
|
|
221
|
+
checked = 0
|
|
222
|
+
for c in range(max_cols):
|
|
223
|
+
row_cell = grid[r][c]
|
|
224
|
+
if row_cell["origin"] != (r, c) or not row_cell["nonempty"]:
|
|
225
|
+
continue
|
|
226
|
+
row_origin = origin_cells.get((r, c))
|
|
227
|
+
if row_origin is None or row_origin.name not in ("td", "th"):
|
|
228
|
+
continue
|
|
229
|
+
row_tag = row_origin.name
|
|
230
|
+
|
|
231
|
+
td_count = 0
|
|
232
|
+
th_count = 0
|
|
233
|
+
for r2 in range(r + 1, n):
|
|
234
|
+
body_cell = grid[r2][c]
|
|
235
|
+
if body_cell["origin"] != (r2, c) or not body_cell["nonempty"]:
|
|
236
|
+
continue
|
|
237
|
+
body_origin = origin_cells.get((r2, c))
|
|
238
|
+
if body_origin is None:
|
|
239
|
+
continue
|
|
240
|
+
if body_origin.name == "td":
|
|
241
|
+
td_count += 1
|
|
242
|
+
elif body_origin.name == "th":
|
|
243
|
+
th_count += 1
|
|
244
|
+
if td_count + th_count == 0:
|
|
245
|
+
continue
|
|
246
|
+
body_majority = "td" if td_count >= th_count else "th"
|
|
247
|
+
checked += 1
|
|
248
|
+
if body_majority != row_tag:
|
|
249
|
+
inverted += 1
|
|
250
|
+
if row_tag == "td" and body_majority == "th":
|
|
251
|
+
has_corner_stub = True
|
|
252
|
+
return has_corner_stub and checked > 0 and inverted * 2 > checked
|
|
253
|
+
|
|
135
254
|
# Find the first data row. A data row is one whose shape has no
|
|
136
255
|
# structural feature that would mark it as a header:
|
|
137
256
|
# - col 0 is non-empty (typical row-label)
|
|
@@ -140,6 +259,13 @@ def detect_header_block(rows):
|
|
|
140
259
|
# (no rowspan copy from above pulling header content into body)
|
|
141
260
|
# - every origin cell at row r has rowspan == colspan == 1
|
|
142
261
|
# (no colspan group or rowspan marker signaling header role)
|
|
262
|
+
# Plus, only at r == 0 (the literal first row):
|
|
263
|
+
# - row 0 is not strictly fuller than at least one body row below
|
|
264
|
+
# while every column it fills is covered by some body row
|
|
265
|
+
# (the fuller-than-body witness — see docstring)
|
|
266
|
+
# - row 0's cell tags are not inverted relative to body majority,
|
|
267
|
+
# with at least one corner-stub ``<td>`` versus body ``<th>``
|
|
268
|
+
# (the cell-type-inversion witness — see docstring)
|
|
143
269
|
#
|
|
144
270
|
# The "every cell non-empty" requirement was dropped intentionally:
|
|
145
271
|
# real body rows in financial tables are often gappy (a cell has a
|
|
@@ -153,7 +279,7 @@ def detect_header_block(rows):
|
|
|
153
279
|
row = grid[r]
|
|
154
280
|
if not row[0]["nonempty"]:
|
|
155
281
|
continue
|
|
156
|
-
nonempty_count = sum(1 for c in row if c["nonempty"])
|
|
282
|
+
nonempty_count = nonempty_counts[r]
|
|
157
283
|
if nonempty_count < 2:
|
|
158
284
|
continue
|
|
159
285
|
is_clean = True
|
|
@@ -165,6 +291,12 @@ def detect_header_block(rows):
|
|
|
165
291
|
if cell["rs"] != 1 or cell["cs"] != 1:
|
|
166
292
|
is_clean = False
|
|
167
293
|
break
|
|
294
|
+
if is_clean and r == 0:
|
|
295
|
+
body_min = _body_min_nonempty(r)
|
|
296
|
+
if body_min >= 0 and nonempty_count > body_min and _row_cols_covered_by_body(r):
|
|
297
|
+
is_clean = False
|
|
298
|
+
if is_clean and _is_inverted_relative_to_body(r):
|
|
299
|
+
is_clean = False
|
|
168
300
|
if is_clean:
|
|
169
301
|
first_data_idx = r
|
|
170
302
|
break
|
|
@@ -452,3 +452,52 @@ def test_reasons_by_severity_partitions_catalogue() -> None:
|
|
|
452
452
|
def test_reasons_by_severity_has_expected_buckets() -> None:
|
|
453
453
|
# Renaming a bucket is a breaking change — guard it.
|
|
454
454
|
assert set(REASONS_BY_SEVERITY) == {"defensive", "confidence", "input"}
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
# --- Cell text extraction: inline <style> / <script> noise -----------------
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
def test_inline_style_tag_excluded_from_cell_text() -> None:
|
|
461
|
+
"""Inline <style> blocks injected by Wikipedia templates must not appear
|
|
462
|
+
in emitted rule values. Regression for the CSS-noise silent failure."""
|
|
463
|
+
html = """
|
|
464
|
+
<table>
|
|
465
|
+
<thead>
|
|
466
|
+
<tr><th>District</th><th>Talukas</th></tr>
|
|
467
|
+
</thead>
|
|
468
|
+
<tbody>
|
|
469
|
+
<tr>
|
|
470
|
+
<th scope="row">Bagalkot</th>
|
|
471
|
+
<td><style>.div-col{column-width:30em}</style>Badami Bagalkot Bilagi</td>
|
|
472
|
+
</tr>
|
|
473
|
+
</tbody>
|
|
474
|
+
</table>
|
|
475
|
+
"""
|
|
476
|
+
text, report = process_tables_with_stats(html, strict=False)
|
|
477
|
+
|
|
478
|
+
assert report.tables[0].render_mode == "rules"
|
|
479
|
+
assert ".div-col" not in text
|
|
480
|
+
assert "column-width" not in text
|
|
481
|
+
assert "Badami" in text
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
def test_inline_script_tag_excluded_from_cell_text() -> None:
|
|
485
|
+
"""Inline <script> blocks must not bleed into emitted rule values."""
|
|
486
|
+
html = """
|
|
487
|
+
<table>
|
|
488
|
+
<thead>
|
|
489
|
+
<tr><th>Region</th><th>Population</th></tr>
|
|
490
|
+
</thead>
|
|
491
|
+
<tbody>
|
|
492
|
+
<tr>
|
|
493
|
+
<th scope="row">South</th>
|
|
494
|
+
<td><script>var x=1;</script>4,200,000</td>
|
|
495
|
+
</tr>
|
|
496
|
+
</tbody>
|
|
497
|
+
</table>
|
|
498
|
+
"""
|
|
499
|
+
text, report = process_tables_with_stats(html, strict=False)
|
|
500
|
+
|
|
501
|
+
assert report.tables[0].render_mode == "rules"
|
|
502
|
+
assert "var x" not in text
|
|
503
|
+
assert "4,200,000" in text
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|