PyPI - table2rules - Versions diffs - 0.4.0__py3-none-any.whl - Mend

table2rules 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

table2rules/__init__.py +51 -0
table2rules/__main__.py +85 -0
table2rules/_core.py +351 -0
table2rules/cleanup.py +61 -0
table2rules/errors.py +17 -0
table2rules/exporters/__init__.py +33 -0
table2rules/exporters/base.py +41 -0
table2rules/exporters/rules.py +66 -0
table2rules/grid_parser.py +487 -0
table2rules/maze_pathfinder.py +166 -0
table2rules/models.py +26 -0
table2rules/py.typed +0 -0
table2rules/quality_gate.py +186 -0
table2rules/report.py +155 -0
table2rules/simple_repair.py +645 -0
table2rules/spans.py +36 -0
table2rules-0.4.0.dist-info/METADATA +332 -0
table2rules-0.4.0.dist-info/RECORD +22 -0
table2rules-0.4.0.dist-info/WHEEL +5 -0
table2rules-0.4.0.dist-info/entry_points.txt +2 -0
table2rules-0.4.0.dist-info/licenses/LICENSE +21 -0
table2rules-0.4.0.dist-info/top_level.txt +1 -0

table2rules/maze_pathfinder.py ADDED Viewed

@@ -0,0 +1,166 @@
+from typing import Dict, List, Tuple
+def find_headers_for_cell(
+    grid: List[List[Dict]], row: int, col: int
+) -> Tuple[List[str], List[str]]:
+    """
+    Navigate the maze from this cell to find all headers.
+    Returns row_headers and col_headers separately.
+    Rules:
+    1. Walk LEFT on same row - collect all <th> cells
+    2. Walk UP from data cell's column - collect all <th> cells
+    3. Walk UP from each row header's column - collect the ancestor
+       headers of those row headers (they are added to row_headers)
+    Args:
+        grid: Rows of cell dicts; falsy entries and cells whose "text" is
+            blank are skipped. Keys read here: "text", "type", "scope",
+            "is_span_copy", "origin", "colspan", "rowspan", "is_thead",
+            "is_header_row", plus "has_thead" on grid[0][0] only.
+        row: Row index of the cell whose headers are wanted.
+        col: Column index of that cell.
+    Returns:
+        A (row_headers, col_headers) tuple of header texts. Row headers
+        found on the same row are ordered left-to-right and column headers
+        top-to-bottom. Returns ([], []) when the grid or its first row is
+        empty.
+    """
+    if not grid or not grid[0]:
+        return [], []
+    row_headers = []
+    col_headers = []
+    # Origins (row, col) already emitted; shared by all three passes so a
+    # spanning header is collected once even when several copies are met.
+    seen_origins = set()
+    row_header_columns = []  # Track which columns have row headers
+    # Get table properties from the first cell
+    # NOTE(review): unlike the loops below, grid[0][0] is not checked for
+    # being falsy before .get() is called — confirm the parser always
+    # fills that slot with a dict.
+    has_thead = grid[0][0].get("has_thead", False)
+    # --- 1. Walk LEFT on same row ---
+    for c in range(col - 1, -1, -1):
+        cell = grid[row][c]
+        if not cell or not cell.get("text", "").strip():
+            continue
+        if cell["type"] == "th":
+            # A span copy is identified by the cell it was copied from.
+            if cell.get("is_span_copy", False):
+                origin = cell.get("origin", (row, c))
+            else:
+                origin = (row, c)
+            if origin not in seen_origins:
+                seen_origins.add(origin)
+                row_headers.append(cell["text"])
+                row_header_columns.append(c)  # Remember this column
+    # Collected right-to-left; flip both lists to left-to-right order.
+    row_headers.reverse()
+    row_header_columns.reverse()
+    # --- 2. Walk UP - collect headers for the data cell itself ---
+    for r in range(row - 1, -1, -1):
+        cell = grid[r][col]
+        if not cell or not cell.get("text", "").strip():
+            continue
+        if cell["type"] == "th":
+            # Universal "Walk UP" Logic:
+            # If a <thead> exists, only accept headers from it.
+            if has_thead and not cell.get("is_thead", False):
+                continue
+            # Skip row-scoped headers
+            scope = cell.get("scope", "")
+            if scope in ("row", "rowgroup"):
+                continue
+            if cell.get("is_span_copy", False):
+                # For a copy, scope and colspan are read from the origin cell.
+                origin = cell.get("origin", (r, col))
+                origin_row, origin_col = origin
+                origin_cell = grid[origin_row][origin_col]
+                origin_scope = origin_cell.get("scope", "")
+                if origin_scope in ("row", "rowgroup"):
+                    continue
+                colspan = origin_cell.get("colspan", 1)
+                # Accept the copy only if the origin's colspan covers col.
+                if origin_col <= col < origin_col + colspan:
+                    if origin not in seen_origins:
+                        seen_origins.add(origin)
+                        col_headers.append(cell["text"])
+            else:
+                origin = (r, col)
+                if origin not in seen_origins:
+                    seen_origins.add(origin)
+                    col_headers.append(cell["text"])
+    # Collected bottom-up; flip to top-to-bottom order.
+    col_headers.reverse()
+    # --- 3. Walk UP from each row header column ---
+    # Find ancestor headers for the row headers (e.g. "Region" for "North").
+    # Peer row labels are skipped via the scope='row' check below; group
+    # ancestors are bounded by their rowspan/divider extent. No text-level
+    # check is needed.
+    for header_col in row_header_columns:
+        for r in range(row - 1, -1, -1):
+            cell = grid[r][header_col]
+            if not cell or not cell.get("text", "").strip():
+                continue
+            if cell["type"] != "th":
+                continue
+            # Never include <thead> cells in row header context —
+            # thead cells are column headers, not row-header hierarchy.
+            if cell.get("is_thead", False):
+                continue
+            scope = cell.get("scope", "")
+            # Skip column-scoped headers — they name the column.
+            if scope in ("col", "colgroup"):
+                continue
+            # scope='row' = peer row label (not an ancestor). Skip.
+            if scope == "row":
+                continue
+            # Locate the origin for scope and rowspan lookup.
+            if cell.get("is_span_copy", False):
+                origin = cell.get("origin", (r, header_col))
+                origin_cell = grid[origin[0]][origin[1]]
+            else:
+                origin = (r, header_col)
+                origin_cell = cell
+            if scope == "rowgroup":
+                # A rowgroup header ancestors rows within its extent:
+                #   rowspan > 1  → extent = [origin_row, origin_row + rowspan - 1]
+                #                  (the rowspan itself bounds the group, as in
+                #                  a <th scope="rowgroup" rowspan="2"> pattern)
+                #   rowspan == 1 → extent = [origin_row, next_rowgroup - 1]
+                #                  (a single-cell divider row like a FinTabNet
+                #                  year label runs until the next such divider
+                #                  in the same column)
+                origin_row, origin_col = origin
+                origin_rowspan = origin_cell.get("rowspan", 1)
+                if origin_rowspan > 1:
+                    extent_end = origin_row + origin_rowspan - 1
+                else:
+                    # Default: the group runs to the last grid row unless a
+                    # later non-copy rowgroup cell in this column ends it.
+                    extent_end = len(grid) - 1
+                    for rr in range(origin_row + 1, len(grid)):
+                        other = grid[rr][origin_col]
+                        if (
+                            other
+                            and not other.get("is_span_copy", False)
+                            and other.get("scope") == "rowgroup"
+                        ):
+                            extent_end = rr - 1
+                            break
+                # The target row lies below this group's extent: not an ancestor.
+                if row > extent_end:
+                    continue
+            else:
+                # Non-scope-rowgroup <th> cells outside thead are only
+                # accepted from the explicit header block (headless
+                # tables where the header detection promoted a row).
+                is_header_row = cell.get("is_header_row", False)
+                if not is_header_row:
+                    continue
+            if origin not in seen_origins:
+                seen_origins.add(origin)
+                # Insert the ancestor at header_col's position within
+                # row_header_columns so it precedes the header it groups.
+                # NOTE(review): row_headers grows with every insert while
+                # row_header_columns does not, so for the second and later
+                # header columns this index may no longer line up with that
+                # column's own header text — confirm the intended ordering.
+                row_headers.insert(row_header_columns.index(header_col), cell["text"])
+    return row_headers, col_headers

table2rules/models.py ADDED Viewed

@@ -0,0 +1,26 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional, Tuple
+@dataclass(frozen=True)
+class LogicRule:
+    outcome: str
+    position: Tuple[int, int]
+    row_headers: Tuple[str, ...] = ()
+    col_headers: Tuple[str, ...] = ()
+    origin: Optional[Tuple[int, int]] = None
+    is_footer: bool = False
+    def to_string(self) -> str:
+        """Descriptive format for Graph-RAG: '<rows> → <cols>: <value>'."""
+        parts = []
+        if self.row_headers:
+            parts.append(" | ".join(self.row_headers))
+        if self.col_headers:
+            parts.append(" | ".join(self.col_headers))
+        if not parts:
+            return f"value: {self.outcome}"
+        context = parts[0] if len(parts) == 1 else f"{parts[0]} → {parts[1]}"
+        return f"{context}: {self.outcome}"

table2rules/py.typed ADDED Viewed

File without changes

table2rules/quality_gate.py ADDED Viewed

@@ -0,0 +1,186 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+from .models import LogicRule
+@dataclass
+class GateResult:
+    # Result of the quality gate as returned by assess_confidence().
+    # True only when the gate accepted the parse.
+    ok: bool
+    # Weighted confidence score; 0.0 on the early hard-fail paths.
+    score: float
+    # Reason codes explaining a failure; empty when nothing was flagged.
+    reasons: List[str]
+def _candidate_data_positions(grid: List[List[Dict]]) -> List[Tuple[int, int]]:
+    positions: List[Tuple[int, int]] = []
+    for r, row in enumerate(grid):
+        for c, cell in enumerate(row):
+            if not cell:
+                continue
+            if cell.get("type") != "td":
+                continue
+            if cell.get("is_thead", False) or cell.get("is_header_row", False):
+                continue
+            if not str(cell.get("text", "")).strip():
+                continue
+            positions.append((r, c))
+    return positions
+def check_invariants(grid: List[List[Dict]], rules: List[LogicRule]) -> Tuple[bool, List[str]]:
+    reasons: List[str] = []
+    if not grid or not grid[0]:
+        reasons.append("empty_grid")
+        return False, reasons
+    rows = len(grid)
+    cols = len(grid[0])
+    for rule in rules:
+        r, c = rule.position
+        if not (0 <= r < rows and 0 <= c < cols):
+            reasons.append("position_out_of_bounds")
+            continue
+        cell = grid[r][c]
+        if cell.get("type") != "td":
+            reasons.append("non_td_rule_cell")
+        if cell.get("is_thead", False) or cell.get("is_header_row", False):
+            reasons.append("header_cell_emitted")
+        rule_outcome = (rule.outcome or "").strip()
+        if not rule_outcome:
+            reasons.append("empty_rule_outcome")
+        if any(not h.strip() for h in (rule.row_headers + rule.col_headers)):
+            reasons.append("empty_header_text")
+    return len(reasons) == 0, sorted(set(reasons))
+def assess_confidence(grid: List[List[Dict]], rules: List[LogicRule]) -> GateResult:
+    """
+    Conservative fail-open gate:
+    - Hard-fail on invariant violations.
+    - Soft score combines data coverage and header attachment.
+    Args:
+        grid: Parsed table grid (rows of cell dicts, possibly with empty slots).
+        rules: Rules generated from that grid.
+    Returns:
+        A GateResult. ``ok`` is True only when the score is at least 0.45
+        and no reason code was recorded. On an invariant violation or when
+        the grid has no candidate data cells the score is 0.0.
+    """
+    ok, inv_reasons = check_invariants(grid, rules)
+    if not ok:
+        return GateResult(ok=False, score=0.0, reasons=inv_reasons)
+    candidates = _candidate_data_positions(grid)
+    if not candidates:
+        return GateResult(ok=False, score=0.0, reasons=["no_candidate_data_cells"])
+    # Coverage = distinct rule positions over candidate data cells.
+    rule_positions = {rule.position for rule in rules}
+    coverage = len(rule_positions) / max(1, len(candidates))
+    with_headers = sum(1 for rule in rules if rule.row_headers or rule.col_headers)
+    header_ratio = with_headers / max(1, len(rules))
+    # Penalize duplicate positions and conflicting outcomes at the same position.
+    pos_outcomes: Dict[Tuple[int, int], set] = {}
+    for rule in rules:
+        pos_outcomes.setdefault(rule.position, set()).add(rule.outcome.strip())
+    unique_positions = len(pos_outcomes)
+    duplicate_ratio = (len(rules) - unique_positions) / max(1, len(rules))
+    conflicting_positions = sum(1 for values in pos_outcomes.values() if len(values) > 1)
+    conflict_ratio = conflicting_positions / max(1, unique_positions)
+    # Penalize noisy self-echo headers (header identical to value).
+    # The comparison is case-insensitive and covers row AND column headers.
+    self_echo = 0
+    for rule in rules:
+        outcome = rule.outcome.strip().lower()
+        headers = [h.strip().lower() for h in (rule.row_headers + rule.col_headers)]
+        if outcome and outcome in headers:
+            self_echo += 1
+    echo_ratio = self_echo / max(1, len(rules))
+    # Shape-heuristic header checks (numeric / placeholder) only apply when
+    # the headers could have been MISIDENTIFIED by the parser. Cells that
+    # the source explicitly placed inside <thead> are authoritative —
+    # financial reports legitimately label columns with years like "2024",
+    # and the gate must not second-guess source-authored <th>. Skip the
+    # shape heuristics for tables that have any <thead> cell in the grid.
+    has_source_thead = any(cell.get("is_thead", False) for row in grid for cell in row if cell)
+    # Penalize numeric column headers — real headers are text labels, not values.
+    # A column header like "25.000" or "· 12,000" signals the first row was
+    # data, not a header.  Strip common currency/bullet noise before checking.
+    #
+    # Flag a RULE only when the ENTIRE column-header stack is numeric. Multi-
+    # level headers where the bottom level is numeric (e.g. a year label
+    # '2018' under a text group 'Year Ended December 31,') are legitimate
+    # financial / statistical / sports tables and must not trigger the guard.
+    import re
+    def _is_numeric_token(h: str) -> bool:
+        # Digits only once whitespace, currency signs, bullets, signs and
+        # separators have been removed; an empty remainder is not numeric.
+        stripped = re.sub(r"[\s\$€£¥·•\-\+,.]", "", h.strip())
+        return bool(stripped) and stripped.isdigit()
+    def _is_placeholder_token(h: str) -> bool:
+        # Made up solely of underscores, dashes, dots and whitespace.
+        return bool(re.match(r"^[_\-.\s]+$", h.strip()))
+    rules_all_numeric_col = 0
+    rules_all_placeholder_col = 0
+    rules_with_col_headers = 0
+    for rule in rules:
+        if not rule.col_headers:
+            continue
+        rules_with_col_headers += 1
+        if all(_is_numeric_token(h) for h in rule.col_headers):
+            rules_all_numeric_col += 1
+        if all(_is_placeholder_token(h) for h in rule.col_headers):
+            rules_all_placeholder_col += 1
+    numeric_header_ratio = rules_all_numeric_col / max(1, rules_with_col_headers)
+    placeholder_header_ratio = rules_all_placeholder_col / max(1, rules_with_col_headers)
+    # Weighted sum; the weights add up to 1.0.
+    score = (
+        (0.45 * coverage)
+        + (0.30 * header_ratio)
+        + (0.10 * (1.0 - echo_ratio))
+        + (0.10 * (1.0 - duplicate_ratio))
+        + (0.05 * (1.0 - conflict_ratio))
+    )
+    reasons: List[str] = []
+    if coverage < 0.60:
+        reasons.append("low_coverage")
+    # Structural invariant for rules mode: every rule must carry at least
+    # one header. A rule with zero headers is indistinguishable from flat
+    # cell text — rules format implies a header relationship that doesn't
+    # exist if no header was found. Fires universally, not on a threshold.
+    if len(rules) > 0 and header_ratio < 1.0:
+        reasons.append("low_header_attachment")
+    if echo_ratio > 0.50:
+        reasons.append("high_self_echo")
+    # One logical grid position must not carry multiple source cells. A valid
+    # rowspan/colspan expands one origin across many positions, but two origins
+    # at the same position means the source geometry overlaps. Fail open instead
+    # of emitting a rule for an ambiguous slot.
+    if duplicate_ratio > 0:
+        reasons.append("high_duplicate_positions")
+    if conflict_ratio > 0:
+        reasons.append("high_position_conflict")
+    if not has_source_thead and numeric_header_ratio > 0.30:
+        reasons.append("numeric_column_headers")
+    if not has_source_thead and placeholder_header_ratio > 0.30:
+        reasons.append("placeholder_column_headers")
+    # Detect column-header coverage gaps: the table has column headers for some
+    # rules but not for others. This happens when the header rows cover fewer
+    # columns than the data rows (e.g. a multi-level header that forgets to
+    # allocate a slot for the row-label column, shifting all column labels one
+    # position and leaving the rightmost data column without any label).
+    # Only fires when the table has at least some column headers — a table with
+    # only row headers is fine (zero column headers everywhere is intentional).
+    # (rules_with_col counts the same rules as rules_with_col_headers above.)
+    rules_with_col = sum(1 for r in rules if r.col_headers)
+    rules_without_col = sum(1 for r in rules if not r.col_headers)
+    if rules_with_col > 0 and rules_without_col > 0:
+        reasons.append("partial_column_coverage")
+    # Keep threshold modest so we only fail on clearly weak parses.
+    # Any recorded reason fails the gate regardless of the score.
+    gate_ok = score >= 0.45 and not reasons
+    return GateResult(ok=gate_ok, score=score, reasons=reasons)

table2rules/report.py ADDED Viewed

@@ -0,0 +1,155 @@
+"""Per-table observability types returned by ``process_tables_with_stats``.
+Downstream integrators use these to answer "did this table convert cleanly,
+and if not, why?" without having to re-parse the output or sample by hand.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from itertools import chain
+from typing import Dict, FrozenSet, Iterable, Literal, Optional, Tuple
+# The four values ``render_mode`` can take. Order = descending output quality.
+#
+#   rules       — gate passed; exporter-native output (one rule per line).
+#   flat        — gate failed; header-free pipe-joined cell rows.
+#   passthrough — neither rules nor flat produced anything; raw HTML emitted.
+#   skipped     — input was refused before fallback output (currently too large).
+#
+# Adding new values is a minor-version bump; renaming/removing is breaking.
+# Static type alias only: the constants below are ordinary ``str`` values at
+# runtime, so they compare equal to the bare literals.
+RenderMode = Literal["rules", "flat", "passthrough", "skipped"]
+# Symbolic constants for integrators who'd rather not sprinkle magic strings
+# through policy code. ``t.render_mode == RENDER_MODE_RULES`` is equivalent to
+# ``t.render_mode == "rules"``; use whichever reads better at the call site.
+RENDER_MODE_RULES: RenderMode = "rules"
+RENDER_MODE_FLAT: RenderMode = "flat"
+RENDER_MODE_PASSTHROUGH: RenderMode = "passthrough"
+RENDER_MODE_SKIPPED: RenderMode = "skipped"
+# Stable catalogue of every ``reasons`` string a ``TableReport`` may contain.
+# This is part of the semver contract: additions are minor-version bumps;
+# renames or removals are breaking. Integrators who switch on ``reasons`` can
+# lint against ``REASONS.keys()`` to catch typos.
+REASONS: Dict[str, str] = {
+    # --- Gate invariants (structural) ---
+    "empty_grid": "Parsed grid was empty or zero-dimensional.",
+    "position_out_of_bounds": "A generated rule's position fell outside the grid bounds.",
+    "non_td_rule_cell": "A rule was generated from a non-<td> cell.",
+    "header_cell_emitted": "A rule was generated from a cell flagged as a header.",
+    "empty_rule_outcome": "A rule's outcome text was empty.",
+    "empty_header_text": "A rule had at least one header with empty text.",
+    # --- Gate confidence (statistical) ---
+    "no_candidate_data_cells": "The table had no non-empty data cells.",
+    "low_coverage": "Fewer than 60% of data cells produced a rule.",
+    "low_header_attachment": "At least one rule lacked any header context; rules mode requires every rule to carry at least one header.",
+    "high_self_echo": "More than 50% of rules repeat a column header as their value.",
+    "high_duplicate_positions": "At least one logical grid position produced multiple rules.",
+    "high_position_conflict": "At least one logical grid position carried conflicting outcomes.",
+    "numeric_column_headers": "More than 30% of rules have all-numeric column headers — likely a data row misread as a header.",
+    "placeholder_column_headers": "More than 30% of rules have placeholder-only column headers (underscores, dashes).",
+    "partial_column_coverage": "The table has column headers for some rules but not for others — the header rows do not fully cover all data columns. Common cause: a multi-level header that does not reserve a column for the row-label, shifting all column labels one position to the right.",
+    # --- Report-level signals ---
+    "input_too_large": "Expanded grid exceeded the safety cap; the table was skipped.",
+    "processing_error": "The parser raised an exception and ``strict=False`` swallowed it; see ``TableReport.error``.",
+}
+# Operational severity grouping for the codes in ``REASONS``.
+#
+#   defensive  — structural invariants on the library's own output. Should
+#                never fire in production; if you see one, file an issue.
+#   confidence — soft gate signals for low-quality parses. Expected on
+#                real-world input; tune alerting against these.
+#   input      — signals that the caller handed table2rules bad data. The
+#                fix is upstream, not in this library.
+#
+# Exposing this grouping lets integrators auto-populate metrics dashboards
+# and switch statements without hardcoding the buckets from the docs. Every
+# key in ``REASONS`` appears in exactly one bucket — enforced by tests.
+# Bucket name -> immutable set of reason codes belonging to that bucket.
+# NOTE(review): "processing_error" sits in the "input" bucket although its
+# REASONS description speaks of a parser exception — confirm that grouping
+# is intended.
+REASONS_BY_SEVERITY: Dict[str, FrozenSet[str]] = {
+    "defensive": frozenset(
+        {
+            "empty_grid",
+            "position_out_of_bounds",
+            "non_td_rule_cell",
+            "header_cell_emitted",
+            "empty_rule_outcome",
+            "empty_header_text",
+        }
+    ),
+    "confidence": frozenset(
+        {
+            "no_candidate_data_cells",
+            "low_coverage",
+            "low_header_attachment",
+            "high_self_echo",
+            "high_duplicate_positions",
+            "high_position_conflict",
+            "numeric_column_headers",
+            "placeholder_column_headers",
+            "partial_column_coverage",
+        }
+    ),
+    "input": frozenset(
+        {
+            "input_too_large",
+            "processing_error",
+        }
+    ),
+}
+@dataclass(frozen=True)
+class TableReport:
+    """Observability record for a single top-level table in the input HTML.
+    ``text`` carries the rendered output for *this* table only — the same lines
+    that contributed to the concatenated string returned alongside the report.
+    Callers passing whole-document HTML in can read ``report.tables[i].text``
+    to keep per-table provenance instead of having to split the flat blob.
+    ``caption`` is the text of the table's ``<caption>`` element when present,
+    otherwise ``None``. Only direct ``<caption>`` children are read; the HTML
+    ``id`` attribute, surrounding headings, and other content-derived names
+    are intentionally ignored — ``table_index`` remains the only stable
+    positional identifier.
+    Other fields:
+    ``render_mode`` is one of the ``RenderMode`` literals ("rules", "flat",
+    "passthrough", "skipped"). ``reasons`` holds reason codes; the ``REASONS``
+    mapping in this module is the catalogue of every code that may appear.
+    ``error`` defaults to ``None``; the ``processing_error`` entry in
+    ``REASONS`` points readers at this field. ``gate_ok`` and ``gate_score``
+    presumably mirror the quality gate's result — confirm against the caller
+    that builds the report.
+    """
+    table_index: int
+    render_mode: RenderMode
+    gate_ok: bool
+    gate_score: float
+    reasons: Tuple[str, ...]
+    error: Optional[str] = None
+    caption: Optional[str] = None
+    text: str = ""
+@dataclass(frozen=True)
+class RenderReport:
+    """Aggregate of per-table reports for a single ``process_tables_*`` call."""
+    tables: Tuple[TableReport, ...] = ()
+    @property
+    def tables_rendered(self) -> int:
+        """Count of tables whose output reached the final string in any mode."""
+        return sum(1 for t in self.tables if t.render_mode != "skipped")
+    @property
+    def tables_flagged(self) -> int:
+        """Count of tables that did NOT produce clean rules output."""
+        return sum(1 for t in self.tables if t.render_mode != "rules")
+    @classmethod
+    def merge(cls, reports: Iterable["RenderReport"]) -> "RenderReport":
+        """Concatenate multiple reports (e.g. from a batch of documents).
+        Per-report ``table_index`` values are preserved as-is — they refer to
+        positions within each original call. If you need cross-call identity,
+        track it alongside the reports yourself.
+        """
+        return cls(tables=tuple(chain.from_iterable(r.tables for r in reports)))