PyPI - table2rules - Version diff - 0.5.2.tar.gz → 0.6.0.tar.gz - Mend

table2rules 0.5.2.tar.gz → 0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{table2rules-0.5.2/src/table2rules.egg-info → table2rules-0.6.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: table2rules
-Version: 0.5.2
+Version: 0.6.0
 Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
 Author: PebbleRoad Pte Ltd
 License-Expression: MIT

{table2rules-0.5.2 → table2rules-0.6.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "table2rules"
-version = "0.5.2"
+version = "0.6.0"
 description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
 readme = "README.md"
 license = "MIT"

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/_core.py RENAMED Viewed

@@ -247,6 +247,78 @@ def _build_rules(grid) -> List[LogicRule]:
     return rules
+def _mark_rowgroup_bands(grid) -> None:
+    """Promote value-region-wide body dividers to ``<th scope="rowgroup">`` so
+    the maze threads them into each value line's row path as bounded, nested
+    row-group ancestors — the row-side counterpart of the multi-level column
+    header path.
+    A candidate is a body cell whose span reaches the last column and covers a
+    majority of the grid (``is_full_width_note`` geometry): a section band
+    (full width) or a group header / description spanning the value region. A
+    candidate is promoted only when its extent contains at least one real data
+    row — so a standalone trailing note (which groups nothing) is left as a
+    note and still emitted, never stranded as an empty-extent rowgroup. Nested
+    candidates are bounded by colspan: a band's extent ends at the next
+    candidate whose span is equal or wider. Cells already marked
+    ``scope="rowgroup"`` by the source are honored as-is.
+    """
+    if not grid or not grid[0]:
+        return
+    n_cols = len(grid[0])
+    # Column-header texts. A full-width body cell that merely repeats a column
+    # header (a units caption like "(In thousands, except per share data)"
+    # reprinted between sections) is an annotation, not a row-group divider —
+    # promoting it would stamp it onto every row path (where it is already noise
+    # in the column path). Exclude such echoes.
+    header_texts = {
+        (cell.get("text") or "").strip().lower()
+        for row in grid
+        for cell in row
+        if cell and cell.get("is_thead") and (cell.get("text") or "").strip()
+    }
+    candidates = []  # (row, col, colspan)
+    for r in range(len(grid)):
+        for c in range(n_cols):
+            cell = grid[r][c]
+            if not cell or cell.get("is_span_copy"):
+                continue
+            if cell.get("is_thead") or cell.get("is_header_row"):
+                continue
+            text = (cell.get("text") or "").strip()
+            if not text:
+                continue
+            if text.lower() in header_texts:
+                continue
+            if is_full_width_note(c, cell.get("colspan", 1), n_cols):
+                candidates.append((r, c, cell.get("colspan", 1)))
+    candidate_rows = {r for (r, _c, _cs) in candidates}
+    def _next_band_below(after_row: int, min_colspan: int) -> int:
+        for rr in range(after_row + 1, len(grid)):
+            if any(r == rr and cs >= min_colspan for (r, _c, cs) in candidates):
+                return rr
+        return len(grid)
+    for r, c, cs in candidates:
+        extent_end = _next_band_below(r, cs) - 1
+        has_data_row = False
+        for rr in range(r + 1, extent_end + 1):
+            if rr in candidate_rows:
+                continue
+            if any(
+                grid[rr][cc]["type"] == "td" and (grid[rr][cc].get("text") or "").strip()
+                for cc in range(n_cols)
+            ):
+                has_data_row = True
+                break
+        if not has_data_row:
+            continue
+        for cc in range(c, min(c + cs, n_cols)):
+            grid[r][cc]["type"] = "th"
+            grid[r][cc]["scope"] = "rowgroup"
 def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
     """Runs the full pipeline and returns rules plus the gate verdict.
@@ -264,6 +336,7 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
     if not grid:
         return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
+    _mark_rowgroup_bands(grid)
     rules = clean_rules(_build_rules(grid))
     gate = assess_confidence(grid, rules)
     if not gate.ok:

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/grid_parser.py RENAMED Viewed

@@ -465,6 +465,28 @@ def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
                 continue
             promote_cols.add(c)
+        # --- Signal C: stub-dimension columns under the leftmost top-level
+        # header group ---
+        # When the leftmost top-level (row 0) header spans more than one column
+        # AND a distinct value-header group exists to its right, that leftmost
+        # group is the row-label dimension (e.g. a "SECTION" header spanning the
+        # rownum and person-class columns, beside a "MAXIMUM LIMIT" value group).
+        # Its descriptor columns are row labels even though they carry thead text
+        # — promoting them threads the row identity (the person-class) into each
+        # value line, not just the leading rownum, mirroring the column path.
+        top0 = grid[0][0] if grid and grid[0] else None
+        if top0 and top0.get("is_thead"):
+            stub_origin = top0.get("origin", (0, 0)) if top0.get("is_span_copy") else (0, 0)
+            stub_cell = grid[stub_origin[0]][stub_origin[1]]
+            stub_width = stub_cell.get("colspan", 1) if stub_cell else 1
+            has_value_group_right = stub_width < max_cols and any(
+                has_thead_text[c] for c in range(stub_width, max_cols)
+            )
+            if stub_width >= 2 and has_value_group_right:
+                for c in range(stub_width):
+                    if _descriptor_like(c):
+                        promote_cols.add(c)
         if promote_cols:
             for c in sorted(promote_cols):
                 for r in range(data_start_row_idx, len(grid)):

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/maze_pathfinder.py RENAMED Viewed

@@ -116,51 +116,81 @@ def find_headers_for_cell(
             if scope == "row":
                 continue
-            # Locate the origin for scope and rowspan lookup.
+            # scope='rowgroup' bands are handled uniformly in Step 4 (which also
+            # reaches bands spanning the data column when the row-label is
+            # empty), so they can be ordered across columns by nesting level.
+            if scope == "rowgroup":
+                continue
+            # Non-scope-rowgroup <th> cells outside thead are only accepted from
+            # the explicit header block (headless tables where header detection
+            # promoted a row).
+            if not cell.get("is_header_row", False):
+                continue
+            # Locate the origin for dedup.
             if cell.get("is_span_copy", False):
                 origin = cell.get("origin", (r, header_col))
-                origin_cell = grid[origin[0]][origin[1]]
             else:
                 origin = (r, header_col)
-                origin_cell = cell
-            if scope == "rowgroup":
-                # A rowgroup header ancestors rows within its extent:
-                #   rowspan > 1  → extent = [origin_row, origin_row + rowspan - 1]
-                #                  (the rowspan itself bounds the group, as in
-                #                  a <th scope="rowgroup" rowspan="2"> pattern)
-                #   rowspan == 1 → extent = [origin_row, next_rowgroup - 1]
-                #                  (a single-cell divider row like a FinTabNet
-                #                  year label runs until the next such divider
-                #                  in the same column)
-                origin_row, origin_col = origin
-                origin_rowspan = origin_cell.get("rowspan", 1)
-                if origin_rowspan > 1:
-                    extent_end = origin_row + origin_rowspan - 1
-                else:
-                    extent_end = len(grid) - 1
-                    for rr in range(origin_row + 1, len(grid)):
-                        other = grid[rr][origin_col]
-                        if (
-                            other
-                            and not other.get("is_span_copy", False)
-                            and other.get("scope") == "rowgroup"
-                        ):
-                            extent_end = rr - 1
-                            break
-                if row > extent_end:
-                    continue
-            else:
-                # Non-scope-rowgroup <th> cells outside thead are only
-                # accepted from the explicit header block (headless
-                # tables where the header detection promoted a row).
-                is_header_row = cell.get("is_header_row", False)
-                if not is_header_row:
-                    continue
             if origin not in seen_origins:
                 seen_origins.add(origin)
                 # Insert at the beginning to maintain hierarchy
                 row_headers.insert(row_header_columns.index(header_col), cell["text"])
+    # --- 4. Row-group bands ---
+    # A band / group header ancestors the data rows within its extent. Bands are
+    # collected from the data cell's own column AND every row-label column: the
+    # own column reaches bands that span the value region even when this row's
+    # label cell is empty (which would otherwise drop the band, e.g. an
+    # unlabeled continuation row under a group divider); the row-label columns
+    # reach narrow stub-column dividers (a FinTabNet year label). Extent is
+    # bounded by COLSPAN — a band ends at the next band whose span is equal or
+    # wider — so a narrower inner group header does not close an outer one.
+    # Bands are ordered topmost-first (origin row ascending) and prepended, so
+    # the row path reads outer-band > inner-group > row-labels, mirroring the
+    # multi-level column path.
+    bands: List[Tuple[int, str]] = []  # (origin_row, text)
+    for scan_col in [col, *row_header_columns]:
+        for r in range(row - 1, -1, -1):
+            cell = grid[r][scan_col]
+            if not cell or not cell.get("text", "").strip():
+                continue
+            if cell["type"] != "th" or cell.get("is_thead", False):
+                continue
+            if cell.get("scope") != "rowgroup":
+                continue
+            if cell.get("is_span_copy", False):
+                origin = cell.get("origin", (r, scan_col))
+                origin_cell = grid[origin[0]][origin[1]]
+            else:
+                origin = (r, scan_col)
+                origin_cell = cell
+            if origin in seen_origins:
+                continue
+            origin_row, origin_col = origin
+            my_colspan = origin_cell.get("colspan", 1)
+            origin_rowspan = origin_cell.get("rowspan", 1)
+            if origin_rowspan > 1:
+                extent_end = origin_row + origin_rowspan - 1
+            else:
+                extent_end = len(grid) - 1
+                for rr in range(origin_row + 1, len(grid)):
+                    other = grid[rr][origin_col]
+                    if (
+                        other
+                        and not other.get("is_span_copy", False)
+                        and other.get("scope") == "rowgroup"
+                        and other.get("colspan", 1) >= my_colspan
+                    ):
+                        extent_end = rr - 1
+                        break
+            if row > extent_end:
+                continue
+            seen_origins.add(origin)
+            bands.append((origin_row, cell["text"]))
+    bands.sort(key=lambda b: b[0])
+    row_headers[:0] = [text for _row, text in bands]
     return row_headers, col_headers

{table2rules-0.5.2 → table2rules-0.6.0/src/table2rules.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: table2rules
-Version: 0.5.2
+Version: 0.6.0
 Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
 Author: PebbleRoad Pte Ltd
 License-Expression: MIT

{table2rules-0.5.2 → table2rules-0.6.0}/LICENSE RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/README.md RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/setup.cfg RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/__init__.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/__main__.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/cleanup.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/errors.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/exporters/__init__.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/exporters/base.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/exporters/rules.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/models.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/py.typed RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/quality_gate.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/report.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/simple_repair.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/spans.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/entry_points.txt RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/requires.txt RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/top_level.txt RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/tests/test_correctness_oracle.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/tests/test_determinism.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/tests/test_public_api.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/tests/test_regression_golds.py RENAMED Viewed

File without changes

{table2rules-0.5.2 → table2rules-0.6.0}/tests/test_robustness_mutations.py RENAMED Viewed

File without changes

table2rules 0.5.2.tar.gz → 0.6.0.tar.gz

table2rules 0.5.2.tar.gz → 0.6.0.tar.gz