table2rules 0.5.1__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.5.1/src/table2rules.egg-info → table2rules-0.6.0}/PKG-INFO +1 -1
  2. {table2rules-0.5.1 → table2rules-0.6.0}/pyproject.toml +1 -1
  3. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/_core.py +94 -1
  4. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/grid_parser.py +22 -0
  5. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/maze_pathfinder.py +67 -37
  6. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/simple_repair.py +33 -1
  7. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/spans.py +15 -0
  8. {table2rules-0.5.1 → table2rules-0.6.0/src/table2rules.egg-info}/PKG-INFO +1 -1
  9. {table2rules-0.5.1 → table2rules-0.6.0}/LICENSE +0 -0
  10. {table2rules-0.5.1 → table2rules-0.6.0}/README.md +0 -0
  11. {table2rules-0.5.1 → table2rules-0.6.0}/setup.cfg +0 -0
  12. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/__init__.py +0 -0
  13. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/__main__.py +0 -0
  14. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/cleanup.py +0 -0
  15. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/errors.py +0 -0
  16. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/exporters/__init__.py +0 -0
  17. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/exporters/base.py +0 -0
  18. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/exporters/rules.py +0 -0
  19. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/models.py +0 -0
  20. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/py.typed +0 -0
  21. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/quality_gate.py +0 -0
  22. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules/report.py +0 -0
  23. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules.egg-info/SOURCES.txt +0 -0
  24. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules.egg-info/dependency_links.txt +0 -0
  25. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules.egg-info/entry_points.txt +0 -0
  26. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules.egg-info/requires.txt +0 -0
  27. {table2rules-0.5.1 → table2rules-0.6.0}/src/table2rules.egg-info/top_level.txt +0 -0
  28. {table2rules-0.5.1 → table2rules-0.6.0}/tests/test_correctness_oracle.py +0 -0
  29. {table2rules-0.5.1 → table2rules-0.6.0}/tests/test_determinism.py +0 -0
  30. {table2rules-0.5.1 → table2rules-0.6.0}/tests/test_public_api.py +0 -0
  31. {table2rules-0.5.1 → table2rules-0.6.0}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.5.1 → table2rules-0.6.0}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.5.1"
7
+ version = "0.6.0"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -12,6 +12,7 @@ from .models import LogicRule
12
12
  from .quality_gate import GateResult, assess_confidence
13
13
  from .report import RenderMode, RenderReport, TableReport
14
14
  from .simple_repair import simple_repair
15
+ from .spans import is_full_width_note
15
16
 
16
17
 
17
18
  def _split_compound_tables(soup) -> None:
@@ -155,6 +156,22 @@ def _build_rules(grid) -> List[LogicRule]:
155
156
  colspan = cell.get("colspan", 1)
156
157
  outcome_norm = cell["text"].strip().lower()
157
158
 
159
+ # A wide <td> that reaches the last column AND covers a majority of
160
+ # the grid's columns is structurally a full-width note/description
161
+ # (e.g. a benefit name "Accidental death and permanent disability"
162
+ # or "If the departure of your public transport is delayed…"
163
+ # spanning the whole value region), not a per-column value. We still
164
+ # emit at every spanned position — so the gate detects an
165
+ # overlapping-span corruption (a rowspan intruding into the note's
166
+ # row) as a conflict and fails open to flat — but attribute every
167
+ # position to the *origin* column's header path. The exporter's
168
+ # origin-aware dedup then collapses the identical lines to one,
169
+ # instead of stamping the sentence under each plan×cover header.
170
+ # Legitimate narrow spans (a right-edge colspan=2 amount covering
171
+ # INDIVIDUAL+FAMILY of one plan) fail the majority test and keep
172
+ # their genuine per-column attribution.
173
+ note = is_full_width_note(col_idx, colspan, n_cols)
174
+
158
175
  for r_offset in range(rowspan):
159
176
  for c_offset in range(colspan):
160
177
  target_row = row_idx + r_offset
@@ -163,7 +180,8 @@ def _build_rules(grid) -> List[LogicRule]:
163
180
  if target_row >= len(grid) or target_col >= len(grid[0]):
164
181
  continue
165
182
 
166
- row_headers, col_headers = find_headers_for_cell(grid, target_row, target_col)
183
+ header_col = col_idx if note else target_col
184
+ row_headers, col_headers = find_headers_for_cell(grid, target_row, header_col)
167
185
 
168
186
  rules.append(
169
187
  LogicRule(
@@ -229,6 +247,78 @@ def _build_rules(grid) -> List[LogicRule]:
229
247
  return rules
230
248
 
231
249
 
250
+ def _mark_rowgroup_bands(grid) -> None:
251
+ """Promote value-region-wide body dividers to ``<th scope="rowgroup">`` so
252
+ the maze threads them into each value line's row path as bounded, nested
253
+ row-group ancestors — the row-side counterpart of the multi-level column
254
+ header path.
255
+
256
+ A candidate is a body cell whose span reaches the last column and covers a
257
+ majority of the grid (``is_full_width_note`` geometry): a section band
258
+ (full width) or a group header / description spanning the value region. A
259
+ candidate is promoted only when its extent contains at least one real data
260
+ row — so a standalone trailing note (which groups nothing) is left as a
261
+ note and still emitted, never stranded as an empty-extent rowgroup. Nested
262
+ candidates are bounded by colspan: a band's extent ends at the next
263
+ candidate whose span is equal or wider. Cells already marked
264
+ ``scope="rowgroup"`` by the source are honored as-is.
265
+ """
266
+ if not grid or not grid[0]:
267
+ return
268
+ n_cols = len(grid[0])
269
+ # Column-header texts. A full-width body cell that merely repeats a column
270
+ # header (a units caption like "(In thousands, except per share data)"
271
+ # reprinted between sections) is an annotation, not a row-group divider —
272
+ # promoting it would stamp it onto every row path (where it is already noise
273
+ # in the column path). Exclude such echoes.
274
+ header_texts = {
275
+ (cell.get("text") or "").strip().lower()
276
+ for row in grid
277
+ for cell in row
278
+ if cell and cell.get("is_thead") and (cell.get("text") or "").strip()
279
+ }
280
+ candidates = [] # (row, col, colspan)
281
+ for r in range(len(grid)):
282
+ for c in range(n_cols):
283
+ cell = grid[r][c]
284
+ if not cell or cell.get("is_span_copy"):
285
+ continue
286
+ if cell.get("is_thead") or cell.get("is_header_row"):
287
+ continue
288
+ text = (cell.get("text") or "").strip()
289
+ if not text:
290
+ continue
291
+ if text.lower() in header_texts:
292
+ continue
293
+ if is_full_width_note(c, cell.get("colspan", 1), n_cols):
294
+ candidates.append((r, c, cell.get("colspan", 1)))
295
+ candidate_rows = {r for (r, _c, _cs) in candidates}
296
+
297
+ def _next_band_below(after_row: int, min_colspan: int) -> int:
298
+ for rr in range(after_row + 1, len(grid)):
299
+ if any(r == rr and cs >= min_colspan for (r, _c, cs) in candidates):
300
+ return rr
301
+ return len(grid)
302
+
303
+ for r, c, cs in candidates:
304
+ extent_end = _next_band_below(r, cs) - 1
305
+ has_data_row = False
306
+ for rr in range(r + 1, extent_end + 1):
307
+ if rr in candidate_rows:
308
+ continue
309
+ if any(
310
+ grid[rr][cc]["type"] == "td" and (grid[rr][cc].get("text") or "").strip()
311
+ for cc in range(n_cols)
312
+ ):
313
+ has_data_row = True
314
+ break
315
+ if not has_data_row:
316
+ continue
317
+ for cc in range(c, min(c + cs, n_cols)):
318
+ grid[r][cc]["type"] = "th"
319
+ grid[r][cc]["scope"] = "rowgroup"
320
+
321
+
232
322
  def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
233
323
  """Runs the full pipeline and returns rules plus the gate verdict.
234
324
 
@@ -246,6 +336,7 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
246
336
  if not grid:
247
337
  return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
248
338
 
339
+ _mark_rowgroup_bands(grid)
249
340
  rules = clean_rules(_build_rules(grid))
250
341
  gate = assess_confidence(grid, rules)
251
342
  if not gate.ok:
@@ -302,6 +393,8 @@ def _run(
302
393
  table_index = 0
303
394
 
304
395
  for table in all_tables:
396
+ if not isinstance(table, Tag):
397
+ continue
305
398
  # Skip nested tables — they're folded into their parent's cell text.
306
399
  if table.find_parent("table"):
307
400
  continue
@@ -465,6 +465,28 @@ def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
465
465
  continue
466
466
  promote_cols.add(c)
467
467
 
468
+ # --- Signal C: stub-dimension columns under the leftmost top-level
469
+ # header group ---
470
+ # When the leftmost top-level (row 0) header spans more than one column
471
+ # AND a distinct value-header group exists to its right, that leftmost
472
+ # group is the row-label dimension (e.g. a "SECTION" header spanning the
473
+ # rownum and person-class columns, beside a "MAXIMUM LIMIT" value group).
474
+ # Its descriptor columns are row labels even though they carry thead text
475
+ # — promoting them threads the row identity (the person-class) into each
476
+ # value line, not just the leading rownum, mirroring the column path.
477
+ top0 = grid[0][0] if grid and grid[0] else None
478
+ if top0 and top0.get("is_thead"):
479
+ stub_origin = top0.get("origin", (0, 0)) if top0.get("is_span_copy") else (0, 0)
480
+ stub_cell = grid[stub_origin[0]][stub_origin[1]]
481
+ stub_width = stub_cell.get("colspan", 1) if stub_cell else 1
482
+ has_value_group_right = stub_width < max_cols and any(
483
+ has_thead_text[c] for c in range(stub_width, max_cols)
484
+ )
485
+ if stub_width >= 2 and has_value_group_right:
486
+ for c in range(stub_width):
487
+ if _descriptor_like(c):
488
+ promote_cols.add(c)
489
+
468
490
  if promote_cols:
469
491
  for c in sorted(promote_cols):
470
492
  for r in range(data_start_row_idx, len(grid)):
@@ -116,51 +116,81 @@ def find_headers_for_cell(
116
116
  if scope == "row":
117
117
  continue
118
118
 
119
- # Locate the origin for scope and rowspan lookup.
119
+ # scope='rowgroup' bands are handled uniformly in Step 4 (which also
120
+ # reaches bands spanning the data column when the row-label is
121
+ # empty), so they can be ordered across columns by nesting level.
122
+ if scope == "rowgroup":
123
+ continue
124
+
125
+ # Non-scope-rowgroup <th> cells outside thead are only accepted from
126
+ # the explicit header block (headless tables where header detection
127
+ # promoted a row).
128
+ if not cell.get("is_header_row", False):
129
+ continue
130
+
131
+ # Locate the origin for dedup.
120
132
  if cell.get("is_span_copy", False):
121
133
  origin = cell.get("origin", (r, header_col))
122
- origin_cell = grid[origin[0]][origin[1]]
123
134
  else:
124
135
  origin = (r, header_col)
125
- origin_cell = cell
126
-
127
- if scope == "rowgroup":
128
- # A rowgroup header ancestors rows within its extent:
129
- # rowspan > 1 → extent = [origin_row, origin_row + rowspan - 1]
130
- # (the rowspan itself bounds the group, as in
131
- # a <th scope="rowgroup" rowspan="2"> pattern)
132
- # rowspan == 1 → extent = [origin_row, next_rowgroup - 1]
133
- # (a single-cell divider row like a FinTabNet
134
- # year label runs until the next such divider
135
- # in the same column)
136
- origin_row, origin_col = origin
137
- origin_rowspan = origin_cell.get("rowspan", 1)
138
- if origin_rowspan > 1:
139
- extent_end = origin_row + origin_rowspan - 1
140
- else:
141
- extent_end = len(grid) - 1
142
- for rr in range(origin_row + 1, len(grid)):
143
- other = grid[rr][origin_col]
144
- if (
145
- other
146
- and not other.get("is_span_copy", False)
147
- and other.get("scope") == "rowgroup"
148
- ):
149
- extent_end = rr - 1
150
- break
151
- if row > extent_end:
152
- continue
153
- else:
154
- # Non-scope-rowgroup <th> cells outside thead are only
155
- # accepted from the explicit header block (headless
156
- # tables where the header detection promoted a row).
157
- is_header_row = cell.get("is_header_row", False)
158
- if not is_header_row:
159
- continue
160
136
 
161
137
  if origin not in seen_origins:
162
138
  seen_origins.add(origin)
163
139
  # Insert at the beginning to maintain hierarchy
164
140
  row_headers.insert(row_header_columns.index(header_col), cell["text"])
165
141
 
142
+ # --- 4. Row-group bands ---
143
+ # A band / group header ancestors the data rows within its extent. Bands are
144
+ # collected from the data cell's own column AND every row-label column: the
145
+ # own column reaches bands that span the value region even when this row's
146
+ # label cell is empty (which would otherwise drop the band, e.g. an
147
+ # unlabeled continuation row under a group divider); the row-label columns
148
+ # reach narrow stub-column dividers (a FinTabNet year label). Extent is
149
+ # bounded by COLSPAN — a band ends at the next band whose span is equal or
150
+ # wider — so a narrower inner group header does not close an outer one.
151
+ # Bands are ordered topmost-first (origin row ascending) and prepended, so
152
+ # the row path reads outer-band > inner-group > row-labels, mirroring the
153
+ # multi-level column path.
154
+ bands: List[Tuple[int, str]] = [] # (origin_row, text)
155
+ for scan_col in [col, *row_header_columns]:
156
+ for r in range(row - 1, -1, -1):
157
+ cell = grid[r][scan_col]
158
+ if not cell or not cell.get("text", "").strip():
159
+ continue
160
+ if cell["type"] != "th" or cell.get("is_thead", False):
161
+ continue
162
+ if cell.get("scope") != "rowgroup":
163
+ continue
164
+ if cell.get("is_span_copy", False):
165
+ origin = cell.get("origin", (r, scan_col))
166
+ origin_cell = grid[origin[0]][origin[1]]
167
+ else:
168
+ origin = (r, scan_col)
169
+ origin_cell = cell
170
+ if origin in seen_origins:
171
+ continue
172
+ origin_row, origin_col = origin
173
+ my_colspan = origin_cell.get("colspan", 1)
174
+ origin_rowspan = origin_cell.get("rowspan", 1)
175
+ if origin_rowspan > 1:
176
+ extent_end = origin_row + origin_rowspan - 1
177
+ else:
178
+ extent_end = len(grid) - 1
179
+ for rr in range(origin_row + 1, len(grid)):
180
+ other = grid[rr][origin_col]
181
+ if (
182
+ other
183
+ and not other.get("is_span_copy", False)
184
+ and other.get("scope") == "rowgroup"
185
+ and other.get("colspan", 1) >= my_colspan
186
+ ):
187
+ extent_end = rr - 1
188
+ break
189
+ if row > extent_end:
190
+ continue
191
+ seen_origins.add(origin)
192
+ bands.append((origin_row, cell["text"]))
193
+ bands.sort(key=lambda b: b[0])
194
+ row_headers[:0] = [text for _row, text in bands]
195
+
166
196
  return row_headers, col_headers
@@ -301,6 +301,29 @@ def detect_header_block(rows):
301
301
  first_data_idx = r
302
302
  break
303
303
 
304
+ # Full-width section dividers cap the header. A row whose only non-empty
305
+ # content is a single DOM cell spanning the whole width (e.g. a benefits
306
+ # schedule "1. PERSONAL ACCIDENT" <td colspan="8"> row) reads, under the
307
+ # colspan-expanded non-empty count used above, as a full multi-cell header
308
+ # row — so without this the header sweep swallows the divider *and* the
309
+ # body rows between it and the first clean data row, bleeding them onto
310
+ # every line as fabricated column headers. When such dividers form a series
311
+ # (>= 2) they are body section dividers, not a one-off header subtitle like
312
+ # "(Dollars in thousands)"; the header ends at the first one. They stay in
313
+ # the body as plain cells (rendered as full-width notes downstream).
314
+ full_width_divider_idxs = []
315
+ for r in range(n):
316
+ origins = {grid[r][c]["origin"] for c in range(max_cols) if grid[r][c]["nonempty"]}
317
+ if len(origins) != 1:
318
+ continue
319
+ (orow, ocol) = next(iter(origins))
320
+ if grid[orow][ocol]["cs"] >= max_cols:
321
+ full_width_divider_idxs.append(r)
322
+ if len(full_width_divider_idxs) >= 2:
323
+ first_divider = full_width_divider_idxs[0]
324
+ if first_divider > 0 and (first_data_idx is None or first_divider < first_data_idx):
325
+ first_data_idx = first_divider
326
+
304
327
  if first_data_idx is None or first_data_idx == 0:
305
328
  return None
306
329
 
@@ -720,7 +743,16 @@ def simple_repair(html: str) -> str:
720
743
  # counter stays in sync with the grid, otherwise a cell
721
744
  # at logical col > 0 in a subsequent row would be
722
745
  # mistaken for the first-column cell.
723
- if first.name == "td":
746
+ #
747
+ # A row whose single cell spans multiple columns is a
748
+ # section divider / full-width note, not a row label —
749
+ # promoting it to <th scope="row"> strands it (it has no
750
+ # value column to anchor a rule, so it vanishes). Leave it
751
+ # a <td> so it is emitted once as a full-width note.
752
+ is_full_width_single = (
753
+ len(cells) == 1 and clamped_span(first.get("colspan")) > 1
754
+ )
755
+ if first.name == "td" and not is_full_width_single:
724
756
  first.name = "th"
725
757
  first["scope"] = "row"
726
758
  rowspan = clamped_span(first.get("rowspan"))
@@ -27,6 +27,21 @@ def clamped_span(raw) -> int:
27
27
  return value
28
28
 
29
29
 
30
+ def is_full_width_note(col_idx: int, colspan: int, n_cols: int) -> bool:
31
+ """True when a wide data cell is structurally a full-width note/description.
32
+
33
+ A ``<td>`` that reaches the last column AND spans a majority of the grid's
34
+ columns (e.g. a benefit name or a "If the departure…" sentence spanning the
35
+ whole value region of a plan×cover matrix) is a description, not a
36
+ per-column value. Such a cell must collapse to a single rule rather than fan
37
+ out across every spanned column — and the confidence gate must count it as a
38
+ single candidate position to match. Legitimate narrow spans (a right-edge
39
+ ``colspan=2`` amount covering two sub-columns of one group) fail the majority
40
+ test and keep their per-column fan-out.
41
+ """
42
+ return colspan > 1 and (col_idx + colspan == n_cols) and (colspan * 2 > n_cols)
43
+
44
+
30
45
  def assert_grid_size(rows: int, cols: int) -> None:
31
46
  """Raise if a logical grid shape would exceed the configured cell cap."""
32
47
  total_cells = rows * cols
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.1
3
+ Version: 0.6.0
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes