table2rules 0.4.0__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.4.0/src/table2rules.egg-info → table2rules-0.5.0}/PKG-INFO +1 -1
  2. {table2rules-0.4.0 → table2rules-0.5.0}/pyproject.toml +1 -1
  3. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/grid_parser.py +5 -0
  4. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/simple_repair.py +133 -1
  5. {table2rules-0.4.0 → table2rules-0.5.0/src/table2rules.egg-info}/PKG-INFO +1 -1
  6. {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_public_api.py +49 -0
  7. {table2rules-0.4.0 → table2rules-0.5.0}/LICENSE +0 -0
  8. {table2rules-0.4.0 → table2rules-0.5.0}/README.md +0 -0
  9. {table2rules-0.4.0 → table2rules-0.5.0}/setup.cfg +0 -0
  10. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/__init__.py +0 -0
  11. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/__main__.py +0 -0
  12. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/_core.py +0 -0
  13. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/cleanup.py +0 -0
  14. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/errors.py +0 -0
  15. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/exporters/__init__.py +0 -0
  16. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/exporters/base.py +0 -0
  17. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/exporters/rules.py +0 -0
  18. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/maze_pathfinder.py +0 -0
  19. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/models.py +0 -0
  20. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/py.typed +0 -0
  21. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/quality_gate.py +0 -0
  22. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/report.py +0 -0
  23. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules/spans.py +0 -0
  24. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/SOURCES.txt +0 -0
  25. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/dependency_links.txt +0 -0
  26. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/entry_points.txt +0 -0
  27. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/requires.txt +0 -0
  28. {table2rules-0.4.0 → table2rules-0.5.0}/src/table2rules.egg-info/top_level.txt +0 -0
  29. {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_correctness_oracle.py +0 -0
  30. {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_determinism.py +0 -0
  31. {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.4.0 → table2rules-0.5.0}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.4.0"
7
+ version = "0.5.0"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -78,6 +78,11 @@ def extract_cell_text(cell) -> str:
78
78
  if parent is None:
79
79
  continue
80
80
 
81
+ # Skip text content of <style> and <script> tags embedded in cells
82
+ # (Wikipedia injects inline <style> blocks for multi-column templates).
83
+ if isinstance(parent, Tag) and parent.name in ("style", "script"):
84
+ continue
85
+
81
86
  nearest_cell: Optional[Tag]
82
87
  if parent.name in ("td", "th"):
83
88
  nearest_cell = parent
@@ -122,6 +122,46 @@ def detect_header_block(rows):
122
122
  [0..k-1] range and are left as-is so the downstream thead-wrap naturally
123
123
  excludes them via Fix 7's contiguous-<th> chain.
124
124
 
125
+ Two further structural witnesses extend "clean data row" to disqualify
126
+ rows that look header-shaped relative to the body:
127
+
128
+ * **Fuller-than-body**: row r's non-empty cell count is strictly greater
129
+ than the minimum non-empty count of the non-divider rows below it,
130
+ AND every column where row r is non-empty has at least one body
131
+ row that fills the same column. A row that fills more columns
132
+ than at least one body row is naming columns the body sometimes
133
+ leaves empty — the structural signature of a column-header row
134
+ above an implicit-rowspan group-label column, a multi-stub
135
+ indentation pyramid, or an alternating coefficient/std-error
136
+ layout. Comparing to the minimum (rather than median or mean)
137
+ is intentional: the structural distinction is "exists a body
138
+ row with fewer non-empty cells than row r," and strictly more
139
+ central statistics misfire on tables where a slim majority of body
140
+ rows match row r's fullness. The body-coverage clause excludes
141
+ receipts whose row 0 is a 4-cell line item followed by 2-cell
142
+ totals — there cols 2 and 3 are filled only in row 0, the body
143
+ never uses them, and promoting row 0 to header would erase the
144
+ line-item data. Uniform-dense tables stay out because their min
145
+ equals row r's count.
146
+
147
+ * **Cell-type inversion**: row r contains at least one corner-stub
148
+ ``<td>`` — a column where row r is ``<td>`` while the body majority
149
+ is ``<th>`` — and a strict majority of compared columns invert.
150
+ The corner-stub clause is load-bearing: an all-``<th>`` row above
151
+ an all-``<td>`` body inverts every column too, but that's the
152
+ *normal* header pattern (and Fix 7 already wraps it in ``<thead>``
153
+ via the contiguous-``<th>`` chain). Inversion is only the right
154
+ witness when row r has a stray ``<td>`` corner cell that the body
155
+ makes a ``<th scope="row">`` (e.g., header row ``[td, th, th]``
156
+ above body rows ``[th, td, td]``). Universal: cell tags are
157
+ markup, not content.
158
+
159
+ Both extended witnesses apply only at r == 0 — the literal first
160
+ row. Beyond row 0, "fuller than the body below" describes regular
161
+ data rows (regression group labels, alternating coefficient/std-err
162
+ pairs) and "first row with col 0 non-empty" is already the existing
163
+ stub-column header pattern handled by the rest of the function.
164
+
125
165
  Returns (k, stub_cols, origin_cells, grid) on success, or None.
126
166
  """
127
167
  n = len(rows)
@@ -132,6 +172,85 @@ def detect_header_block(rows):
132
172
  if max_cols == 0:
133
173
  return None
134
174
 
175
+ # Pre-compute per-row non-empty counts (logical, colspan-expanded).
176
+ nonempty_counts = [sum(1 for c in row if c["nonempty"]) for row in grid]
177
+
178
+ def _body_min_nonempty(after_r: int) -> int:
179
+ """Minimum non-empty count among non-divider rows strictly after
180
+ ``after_r``. Returns -1 when no qualifying body row exists
181
+ (caller treats that as "no body to compare against" and skips
182
+ the witness).
183
+ """
184
+ counts = [nonempty_counts[r2] for r2 in range(after_r + 1, n) if nonempty_counts[r2] >= 2]
185
+ if not counts:
186
+ return -1
187
+ return min(counts)
188
+
189
+ def _row_cols_covered_by_body(r: int) -> bool:
190
+ """True iff every column where row r is non-empty has at least one
191
+ non-divider body row below that fills the same column. Excludes
192
+ receipts whose row 0 fills columns the body never uses (line
193
+ item over totals)."""
194
+ for c in range(max_cols):
195
+ if not grid[r][c]["nonempty"]:
196
+ continue
197
+ covered = False
198
+ for r2 in range(r + 1, n):
199
+ if nonempty_counts[r2] < 2:
200
+ continue
201
+ if grid[r2][c]["nonempty"]:
202
+ covered = True
203
+ break
204
+ if not covered:
205
+ return False
206
+ return True
207
+
208
+ def _is_inverted_relative_to_body(r: int) -> bool:
209
+ """True iff row r contains a corner-stub ``<td>`` and inverts the
210
+ body majority at a strict majority of compared columns. Only
211
+ origin cells with non-empty text on both sides participate.
212
+
213
+ The corner-stub clause requires ≥1 column where row r is ``<td>``
214
+ but body majority is ``<th>``. Without it, an all-``<th>`` row
215
+ above an all-``<td>`` body would also "invert" every column — but
216
+ that's the normal header pattern, already handled by Fix 7's
217
+ contiguous-``<th>`` thead wrap.
218
+ """
219
+ has_corner_stub = False
220
+ inverted = 0
221
+ checked = 0
222
+ for c in range(max_cols):
223
+ row_cell = grid[r][c]
224
+ if row_cell["origin"] != (r, c) or not row_cell["nonempty"]:
225
+ continue
226
+ row_origin = origin_cells.get((r, c))
227
+ if row_origin is None or row_origin.name not in ("td", "th"):
228
+ continue
229
+ row_tag = row_origin.name
230
+
231
+ td_count = 0
232
+ th_count = 0
233
+ for r2 in range(r + 1, n):
234
+ body_cell = grid[r2][c]
235
+ if body_cell["origin"] != (r2, c) or not body_cell["nonempty"]:
236
+ continue
237
+ body_origin = origin_cells.get((r2, c))
238
+ if body_origin is None:
239
+ continue
240
+ if body_origin.name == "td":
241
+ td_count += 1
242
+ elif body_origin.name == "th":
243
+ th_count += 1
244
+ if td_count + th_count == 0:
245
+ continue
246
+ body_majority = "td" if td_count >= th_count else "th"
247
+ checked += 1
248
+ if body_majority != row_tag:
249
+ inverted += 1
250
+ if row_tag == "td" and body_majority == "th":
251
+ has_corner_stub = True
252
+ return has_corner_stub and checked > 0 and inverted * 2 > checked
253
+
135
254
  # Find the first data row. A data row is one whose shape has no
136
255
  # structural feature that would mark it as a header:
137
256
  # - col 0 is non-empty (typical row-label)
@@ -140,6 +259,13 @@ def detect_header_block(rows):
140
259
  # (no rowspan copy from above pulling header content into body)
141
260
  # - every origin cell at row r has rowspan == colspan == 1
142
261
  # (no colspan group or rowspan marker signaling header role)
262
+ # Plus, only at r == 0 (the literal first row):
263
+ # - row 0 is not strictly fuller than at least one body row below
264
+ # while every column it fills is covered by some body row
265
+ # (the fuller-than-body witness — see docstring)
266
+ # - row 0's cell tags are not inverted relative to body majority,
267
+ # with at least one corner-stub ``<td>`` versus body ``<th>``
268
+ # (the cell-type-inversion witness — see docstring)
143
269
  #
144
270
  # The "every cell non-empty" requirement was dropped intentionally:
145
271
  # real body rows in financial tables are often gappy (a cell has a
@@ -153,7 +279,7 @@ def detect_header_block(rows):
153
279
  row = grid[r]
154
280
  if not row[0]["nonempty"]:
155
281
  continue
156
- nonempty_count = sum(1 for c in row if c["nonempty"])
282
+ nonempty_count = nonempty_counts[r]
157
283
  if nonempty_count < 2:
158
284
  continue
159
285
  is_clean = True
@@ -165,6 +291,12 @@ def detect_header_block(rows):
165
291
  if cell["rs"] != 1 or cell["cs"] != 1:
166
292
  is_clean = False
167
293
  break
294
+ if is_clean and r == 0:
295
+ body_min = _body_min_nonempty(r)
296
+ if body_min >= 0 and nonempty_count > body_min and _row_cols_covered_by_body(r):
297
+ is_clean = False
298
+ if is_clean and _is_inverted_relative_to_body(r):
299
+ is_clean = False
168
300
  if is_clean:
169
301
  first_data_idx = r
170
302
  break
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.4.0
3
+ Version: 0.5.0
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -452,3 +452,52 @@ def test_reasons_by_severity_partitions_catalogue() -> None:
452
452
  def test_reasons_by_severity_has_expected_buckets() -> None:
453
453
  # Renaming a bucket is a breaking change — guard it.
454
454
  assert set(REASONS_BY_SEVERITY) == {"defensive", "confidence", "input"}
455
+
456
+
457
+ # --- Cell text extraction: inline <style> / <script> noise -----------------
458
+
459
+
460
+ def test_inline_style_tag_excluded_from_cell_text() -> None:
461
+ """Inline <style> blocks injected by Wikipedia templates must not appear
462
+ in emitted rule values. Regression for the CSS-noise silent failure."""
463
+ html = """
464
+ <table>
465
+ <thead>
466
+ <tr><th>District</th><th>Talukas</th></tr>
467
+ </thead>
468
+ <tbody>
469
+ <tr>
470
+ <th scope="row">Bagalkot</th>
471
+ <td><style>.div-col{column-width:30em}</style>Badami Bagalkot Bilagi</td>
472
+ </tr>
473
+ </tbody>
474
+ </table>
475
+ """
476
+ text, report = process_tables_with_stats(html, strict=False)
477
+
478
+ assert report.tables[0].render_mode == "rules"
479
+ assert ".div-col" not in text
480
+ assert "column-width" not in text
481
+ assert "Badami" in text
482
+
483
+
484
+ def test_inline_script_tag_excluded_from_cell_text() -> None:
485
+ """Inline <script> blocks must not bleed into emitted rule values."""
486
+ html = """
487
+ <table>
488
+ <thead>
489
+ <tr><th>Region</th><th>Population</th></tr>
490
+ </thead>
491
+ <tbody>
492
+ <tr>
493
+ <th scope="row">South</th>
494
+ <td><script>var x=1;</script>4,200,000</td>
495
+ </tr>
496
+ </tbody>
497
+ </table>
498
+ """
499
+ text, report = process_tables_with_stats(html, strict=False)
500
+
501
+ assert report.tables[0].render_mode == "rules"
502
+ assert "var x" not in text
503
+ assert "4,200,000" in text
File without changes
File without changes
File without changes