PyPI - table2rules - Versions diffs - 0.5.0__tar.gz → 0.5.2__tar.gz - Mend

table2rules 0.5.0.tar.gz → 0.5.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

{table2rules-0.5.0/src/table2rules.egg-info → table2rules-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: table2rules
-Version: 0.5.0
+Version: 0.5.2
 Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
 Author: PebbleRoad Pte Ltd
 License-Expression: MIT

{table2rules-0.5.0 → table2rules-0.5.2}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "table2rules"
-version = "0.5.0"
+version = "0.5.2"
 description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
 readme = "README.md"
 license = "MIT"

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/_core.py RENAMED Viewed

@@ -12,6 +12,7 @@ from .models import LogicRule
 from .quality_gate import GateResult, assess_confidence
 from .report import RenderMode, RenderReport, TableReport
 from .simple_repair import simple_repair
+from .spans import is_full_width_note
 def _split_compound_tables(soup) -> None:
@@ -122,9 +123,18 @@ def _extract_cell_rows(table_html: str) -> List[List[str]]:
 def _build_rules(grid) -> List[LogicRule]:
     """Walk the parsed grid and emit one LogicRule per data cell position."""
     rules: List[LogicRule] = []
+    n_cols = len(grid[0])
+    # Rows that carry a *real* value — used below to decide which rows need
+    # label-only preservation. A value that merely echoes its own column header
+    # (a de-spanned or page-break-repeated header cell) carries no independent
+    # data and is dropped downstream by clean_rules; it must not mask an
+    # otherwise label-only row. Tracked at the value's *target* positions so a
+    # rowspan-filled value correctly marks every row it covers.
+    rows_with_value: set = set()
     for row_idx in range(len(grid)):
-        for col_idx in range(len(grid[0])):
+        for col_idx in range(n_cols):
             cell = grid[row_idx][col_idx]
             # Only <td> cells are data cells
@@ -144,6 +154,23 @@ def _build_rules(grid) -> List[LogicRule]:
             rowspan = cell.get("rowspan", 1)
             colspan = cell.get("colspan", 1)
+            outcome_norm = cell["text"].strip().lower()
+            # A wide <td> that reaches the last column AND covers a majority of
+            # the grid's columns is structurally a full-width note/description
+            # (e.g. a benefit name "Accidental death and permanent disability"
+            # or "If the departure of your public transport is delayed…"
+            # spanning the whole value region), not a per-column value. We still
+            # emit at every spanned position — so the gate detects an
+            # overlapping-span corruption (a rowspan intruding into the note's
+            # row) as a conflict and fails open to flat — but attribute every
+            # position to the *origin* column's header path. The exporter's
+            # origin-aware dedup then collapses the identical lines to one,
+            # instead of stamping the sentence under each plan×cover header.
+            # Legitimate narrow spans (a right-edge colspan=2 amount covering
+            # INDIVIDUAL+FAMILY of one plan) fail the majority test and keep
+            # their genuine per-column attribution.
+            note = is_full_width_note(col_idx, colspan, n_cols)
             for r_offset in range(rowspan):
                 for c_offset in range(colspan):
@@ -153,7 +180,8 @@ def _build_rules(grid) -> List[LogicRule]:
                     if target_row >= len(grid) or target_col >= len(grid[0]):
                         continue
-                    row_headers, col_headers = find_headers_for_cell(grid, target_row, target_col)
+                    header_col = col_idx if note else target_col
+                    row_headers, col_headers = find_headers_for_cell(grid, target_row, header_col)
                     rules.append(
                         LogicRule(
@@ -166,6 +194,56 @@ def _build_rules(grid) -> List[LogicRule]:
                         )
                     )
+                    is_header_echo = outcome_norm in {h.strip().lower() for h in col_headers}
+                    if not is_header_echo:
+                        rows_with_value.add(target_row)
+    # Label-only preservation: a body row whose row-header label is present but
+    # which carries no independent value would otherwise vanish entirely — the
+    # data loop above emits nothing usable for it. This is how de-spanned
+    # section headers arrive when an OCR/HTML pipeline drops the original
+    # ``colspan``: the value column is either empty (a benefits-schedule title
+    # row "2. Public transport double indemnity") or repeats the column header
+    # (a "24. COVID-19 Coverage Extension | Sum Insured" row, whose echoed value
+    # clean_rules strips, taking the label with it). It is structurally
+    # indistinguishable from a leaf row with a genuinely missing value, so we
+    # preserve the label verbatim rather than fabricate a section breadcrumb.
+    for row_idx in range(len(grid)):
+        if row_idx in rows_with_value:
+            continue
+        # Anchor the rule at the row's data column so it satisfies the quality
+        # gate's "rules originate from <td>" invariant. A row with no <td> at
+        # all is a true full-width <th colspan> divider — already handled as a
+        # row-group ancestor upstream — so we leave it alone.
+        anchor_col = next((c for c in range(n_cols) if grid[row_idx][c]["type"] == "td"), None)
+        if anchor_col is None:
+            continue
+        label_parts: List[str] = []
+        for col_idx in range(n_cols):
+            cell = grid[row_idx][col_idx]
+            if cell["type"] != "th":
+                continue
+            if cell.get("is_thead", False) or cell.get("is_header_row", False):
+                continue
+            if cell.get("is_span_copy", False):
+                continue
+            text = (cell.get("text") or "").strip()
+            if not text:
+                continue
+            label_parts.append(text)
+        if not label_parts:
+            continue
+        rules.append(
+            LogicRule(
+                outcome=" > ".join(label_parts),
+                position=(row_idx, anchor_col),
+                row_headers=(),
+                col_headers=(),
+                origin=(row_idx, anchor_col),
+                is_label=True,
+            )
+        )
     return rules
@@ -242,6 +320,8 @@ def _run(
     table_index = 0
     for table in all_tables:
+        if not isinstance(table, Tag):
+            continue
         # Skip nested tables — they're folded into their parent's cell text.
         if table.find_parent("table"):
             continue

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/models.py RENAMED Viewed

@@ -12,6 +12,12 @@ class LogicRule:
     col_headers: Tuple[str, ...] = ()
     origin: Optional[Tuple[int, int]] = None
     is_footer: bool = False
+    # A label-preservation rule: the row carried a label but no independent
+    # value (empty value column, or a value that merely echoes the column
+    # header). The label is preserved verbatim as the outcome with no header
+    # relationship. The confidence gate treats these as pass-through, not a
+    # parser-confidence signal.
+    is_label: bool = False
     def to_string(self) -> str:
         """Descriptive format for Graph-RAG: '<rows> → <cols>: <value>'."""

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/quality_gate.py RENAMED Viewed

@@ -74,6 +74,13 @@ def assess_confidence(grid: List[List[Dict]], rules: List[LogicRule]) -> GateRes
     if not candidates:
         return GateResult(ok=False, score=0.0, reasons=["no_candidate_data_cells"])
+    # Score only value rules. Label-preservation rules (a row's label kept
+    # visible when it carries no independent value) have no header relationship
+    # by design — they are pass-through, not a parser-confidence signal, so they
+    # neither help nor hurt the gate. check_invariants above still validates
+    # them (a valid <td> anchor, non-empty outcome).
+    rules = [r for r in rules if not r.is_label]
     rule_positions = {rule.position for rule in rules}
     coverage = len(rule_positions) / max(1, len(candidates))

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/simple_repair.py RENAMED Viewed

@@ -301,6 +301,29 @@ def detect_header_block(rows):
             first_data_idx = r
             break
+    # Full-width section dividers cap the header. A row whose only non-empty
+    # content is a single DOM cell spanning the whole width (e.g. a benefits
+    # schedule "1. PERSONAL ACCIDENT" <td colspan="8"> row) reads, under the
+    # colspan-expanded non-empty count used above, as a full multi-cell header
+    # row — so without this the header sweep swallows the divider *and* the
+    # body rows between it and the first clean data row, bleeding them onto
+    # every line as fabricated column headers. When such dividers form a series
+    # (>= 2) they are body section dividers, not a one-off header subtitle like
+    # "(Dollars in thousands)"; the header ends at the first one. They stay in
+    # the body as plain cells (rendered as full-width notes downstream).
+    full_width_divider_idxs = []
+    for r in range(n):
+        origins = {grid[r][c]["origin"] for c in range(max_cols) if grid[r][c]["nonempty"]}
+        if len(origins) != 1:
+            continue
+        (orow, ocol) = next(iter(origins))
+        if grid[orow][ocol]["cs"] >= max_cols:
+            full_width_divider_idxs.append(r)
+    if len(full_width_divider_idxs) >= 2:
+        first_divider = full_width_divider_idxs[0]
+        if first_divider > 0 and (first_data_idx is None or first_divider < first_data_idx):
+            first_data_idx = first_divider
     if first_data_idx is None or first_data_idx == 0:
         return None
@@ -720,7 +743,16 @@ def simple_repair(html: str) -> str:
                     # counter stays in sync with the grid, otherwise a cell
                     # at logical col > 0 in a subsequent row would be
                     # mistaken for the first-column cell.
-                    if first.name == "td":
+                    #
+                    # A row whose single cell spans multiple columns is a
+                    # section divider / full-width note, not a row label —
+                    # promoting it to <th scope="row"> strands it (it has no
+                    # value column to anchor a rule, so it vanishes). Leave it
+                    # a <td> so it is emitted once as a full-width note.
+                    is_full_width_single = (
+                        len(cells) == 1 and clamped_span(first.get("colspan")) > 1
+                    )
+                    if first.name == "td" and not is_full_width_single:
                         first.name = "th"
                         first["scope"] = "row"
                     rowspan = clamped_span(first.get("rowspan"))

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/spans.py RENAMED Viewed

@@ -27,6 +27,21 @@ def clamped_span(raw) -> int:
     return value
+def is_full_width_note(col_idx: int, colspan: int, n_cols: int) -> bool:
+    """True when a wide data cell is structurally a full-width note/description.
+    A ``<td>`` that reaches the last column AND spans a majority of the grid's
+    columns (e.g. a benefit name or a "If the departure…" sentence spanning the
+    whole value region of a plan×cover matrix) is a description, not a
+    per-column value. Such a cell must collapse to a single rule rather than fan
+    out across every spanned column — and the confidence gate must count it as a
+    single candidate position to match. Legitimate narrow spans (a right-edge
+    ``colspan=2`` amount covering two sub-columns of one group) fail the majority
+    test and keep their per-column fan-out.
+    """
+    return colspan > 1 and (col_idx + colspan == n_cols) and (colspan * 2 > n_cols)
 def assert_grid_size(rows: int, cols: int) -> None:
     """Raise if a logical grid shape would exceed the configured cell cap."""
     total_cells = rows * cols

{table2rules-0.5.0 → table2rules-0.5.2/src/table2rules.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: table2rules
-Version: 0.5.0
+Version: 0.5.2
 Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
 Author: PebbleRoad Pte Ltd
 License-Expression: MIT

{table2rules-0.5.0 → table2rules-0.5.2}/tests/test_correctness_oracle.py RENAMED Viewed

@@ -188,6 +188,14 @@ def test_correctness_oracle(case: tuple[Path, Path]) -> None:
     matched = 0
     emitted_lines = [l for l in output.splitlines() if l.strip()]
     for line in emitted_lines:
+        # Label-preservation lines reproduce a whole source cell verbatim (a
+        # de-spanned section header whose value column is empty, e.g.
+        # "Segments: (1)"). Such a cell may itself contain ": ", which the
+        # rule-line parser would misread as a key/value split. A line equal to
+        # a full source cell is faithful preservation, not misattribution.
+        if source_tokens and _norm(line) in source_tokens:
+            matched += 1
+            continue
         parsed = _parse_rule_line(line, source_tokens)
         if parsed is None:
             continue

{table2rules-0.5.0 → table2rules-0.5.2}/tests/test_regression_golds.py RENAMED Viewed

@@ -1,13 +1,20 @@
-"""Regression layer — byte-for-byte gold matching on hand-authored fixtures.
-Each .md file beneath tests/{adversarial,structured,headerless,smoke,
-regression}/ is a fixture containing HTML table markup. For every fixture
-we run process_tables_to_text and assert the output matches the committed
-gold file under benchmarks/gold/<format>/.
+"""Regression layer — byte-for-byte gold matching on every fixture.
+Each .md file beneath tests/ (except top-level docs) is a fixture containing
+HTML table markup. For every fixture we run process_tables_to_text and assert
+the output matches the committed gold file under benchmarks/gold/<format>/.
+This covers both the hand-authored fixtures AND the real-world corpus
+(tests/realworld/). The two suites play complementary roles: the correctness
+and robustness layers (test_correctness_oracle / test_robustness_mutations)
+assert the output is *right* (no fabricated content, correct attribution,
+stable under mutation); this layer asserts the output does not *change* unless
+a human regenerates the golds. Together they catch a silent-drop regression —
+where the parser quietly stops emitting real content — which neither the
+oracle (it only guards against fabrication) nor an un-asserted benchmark gold
+could catch on its own. See tests/README.md.
 This is the strictest of the three test layers — catches any output drift.
-See tests/README.md for the relationship to the correctness and robustness
-suites.
 Refresh gold outputs by running:  python scripts/benchmark.py --update-gold
 """
@@ -27,15 +34,14 @@ GOLD_DIR = ROOT / "benchmarks" / "gold" / DEFAULT_FORMAT
 def _discover_cases() -> list[Path]:
-    # Real-world fixtures (tests/realworld/) are checked against per-fixture
-    # oracle triples, not frozen gold text — see test_correctness_oracle.py.
-    # Top-level docs like README.md are not fixtures.
-    skip_prefixes = {"realworld"}
+    # Every fixture beneath tests/ is byte-checked, including the real-world
+    # corpus (tests/realworld/) — frozen gold text is the tripwire that makes
+    # any output change visible. Top-level docs like tests/README.md are not
+    # fixtures and are excluded.
     return [
         p
         for p in sorted(TESTS_DIR.rglob("*.md"))
-        if not (skip_prefixes & set(p.relative_to(TESTS_DIR).parts))
-        and p.parent != TESTS_DIR  # exclude tests/README.md etc.
+        if p.parent != TESTS_DIR  # exclude tests/README.md, tests/failing_table.md
     ]

{table2rules-0.5.0 → table2rules-0.5.2}/tests/test_robustness_mutations.py RENAMED Viewed

@@ -95,13 +95,22 @@ def _parse_rule_line(line: str, source_tokens: frozenset[str] = frozenset()):
     return row_path, col_path, value
-def _classify(output: str) -> str:
+def _classify(output: str, source_tokens: frozenset[str] = frozenset()) -> str:
     lines = [l for l in output.splitlines() if l.strip()]
     if not lines:
         return "EMPTY"
     if any("<table" in l for l in lines):
         return "PASSTHROUGH"
-    rule_shaped = sum(1 for l in lines if _parse_rule_line(l) is not None)
+    # A label-preservation line reproduces a whole source cell verbatim (a
+    # de-spanned/echoed section header kept visible). It is not key/value
+    # shaped, but it is legitimate rules-mode output — count it as such so a
+    # table of rules plus section labels stays RULES rather than degrading to
+    # MIXED (which would skip the precision check below).
+    rule_shaped = sum(
+        1
+        for l in lines
+        if _parse_rule_line(l) is not None or (source_tokens and _norm(l) in source_tokens)
+    )
     if rule_shaped == len(lines):
         return "RULES"
     if rule_shaped == 0:
@@ -345,7 +354,7 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
     mutated_html = mutator(html, rng)
     output = process_tables_to_text(mutated_html)
-    tier = _classify(output)
+    tier = _classify(output, source_tokens)
     if tier in {"PASSTHROUGH", "FLAT", "EMPTY", "MIXED"}:
         # Safe fallback; not a precision failure.
         pytest.skip(f"tier={tier} after mutation={mutation_name!r}")
@@ -363,6 +372,14 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
     for line in output.splitlines():
         if not line.strip():
             continue
+        # Label-preservation lines reproduce a whole source cell verbatim
+        # (a de-spanned section header whose value column is empty, e.g.
+        # "Segments: (1)"). The cell text itself may contain ": ", which the
+        # rule-line parser would misread as a key/value split. A line equal to
+        # a full source cell is faithful preservation, not fabrication — the
+        # contract is "no invented content", and there is none here.
+        if source_tokens and _norm(line) in source_tokens:
+            continue
         parsed = _parse_rule_line(line, source_tokens)
         if parsed is None:
             continue

{table2rules-0.5.0 → table2rules-0.5.2}/LICENSE RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/README.md RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/setup.cfg RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/__init__.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/__main__.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/cleanup.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/errors.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/exporters/__init__.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/exporters/base.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/exporters/rules.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/grid_parser.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/maze_pathfinder.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/py.typed RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/report.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/entry_points.txt RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/requires.txt RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/top_level.txt RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/tests/test_determinism.py RENAMED Viewed

File without changes

{table2rules-0.5.0 → table2rules-0.5.2}/tests/test_public_api.py RENAMED Viewed

File without changes

table2rules 0.5.0__tar.gz → 0.5.2__tar.gz

table2rules 0.5.0.tar.gz → 0.5.2.tar.gz