table2rules 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,166 @@
1
+ from typing import Dict, List, Tuple
2
+
3
+
4
def find_headers_for_cell(
    grid: List[List[Dict]], row: int, col: int
) -> Tuple[List[str], List[str]]:
    """
    Collect the header texts that apply to the cell at ``grid[row][col]``.

    Each grid slot is either a falsy placeholder (``None`` / empty dict) or a
    dict. The keys read here are ``"type"``, ``"text"``, ``"is_span_copy"``,
    ``"origin"``, ``"scope"``, ``"colspan"``, ``"rowspan"``, ``"is_thead"``,
    ``"is_header_row"`` and, on the top-left cell only, ``"has_thead"``.

    Rules:
    1. Walk LEFT on the same row - collect all non-empty <th> cells.
    2. Walk UP from the data cell's column - collect all non-empty <th>
       cells that are not row/rowgroup scoped (and, when the table has a
       <thead>, only cells flagged ``is_thead``).
    3. Walk UP from each row header's column - collect ancestor headers
       (rowgroup headers whose extent covers ``row``, or unscoped <th>
       cells flagged ``is_header_row``).

    A spanned header is counted once: span copies are de-duplicated through
    their ``origin`` coordinate.

    Args:
        grid: Expanded table grid, indexed ``grid[row][col]``.
        row: Row index of the cell being described.
        col: Column index of the cell being described.

    Returns:
        ``(row_headers, col_headers)``. Both are ``[]`` when the grid is
        empty or zero-width.
    """
    if not grid or not grid[0]:
        return [], []

    row_headers: List[str] = []
    col_headers: List[str] = []
    seen_origins = set()
    row_header_columns: List[int] = []  # Columns that contributed a row header

    # Table properties are read from the first cell. That slot can be an
    # empty placeholder (every other access in this function guards with
    # ``if not cell``), so fall back to "no <thead>" instead of raising.
    has_thead = (grid[0][0] or {}).get("has_thead", False)

    # --- 1. Walk LEFT on same row ---
    for c in range(col - 1, -1, -1):
        cell = grid[row][c]
        if not cell or not cell.get("text", "").strip():
            continue
        if cell["type"] != "th":
            continue

        if cell.get("is_span_copy", False):
            origin = cell.get("origin", (row, c))
        else:
            origin = (row, c)

        if origin not in seen_origins:
            seen_origins.add(origin)
            row_headers.append(cell["text"])
            row_header_columns.append(c)

    # The walk ran right-to-left; restore left-to-right reading order.
    row_headers.reverse()
    row_header_columns.reverse()

    # --- 2. Walk UP - collect headers for the data cell itself ---
    for r in range(row - 1, -1, -1):
        cell = grid[r][col]
        if not cell or not cell.get("text", "").strip():
            continue
        if cell["type"] != "th":
            continue

        # If a <thead> exists, only accept headers from it.
        if has_thead and not cell.get("is_thead", False):
            continue

        # Skip row-scoped headers.
        if cell.get("scope", "") in ("row", "rowgroup"):
            continue

        if cell.get("is_span_copy", False):
            origin = cell.get("origin", (r, col))
            origin_row, origin_col = origin
            # Fall back to the copy itself if the origin slot is empty.
            origin_cell = grid[origin_row][origin_col] or cell

            if origin_cell.get("scope", "") in ("row", "rowgroup"):
                continue

            # Only accept the copy when the origin's colspan really covers
            # the data cell's column.
            colspan = origin_cell.get("colspan", 1)
            if not (origin_col <= col < origin_col + colspan):
                continue
        else:
            origin = (r, col)

        if origin not in seen_origins:
            seen_origins.add(origin)
            col_headers.append(cell["text"])

    # The walk ran bottom-to-top; restore top-to-bottom reading order.
    col_headers.reverse()

    # --- 3. Walk UP from each row header column ---
    # Find ancestor headers for the row headers (e.g. "Region" for "North").
    # Peer row labels are skipped via the scope='row' check below; group
    # ancestors are bounded by their rowspan/divider extent.
    #
    # NOTE(review): ``insert_at`` is the header column's index in
    # ``row_header_columns`` and is not shifted for ancestors inserted for
    # earlier columns, so with several row-header columns the ancestors of a
    # later column land before the labels of earlier columns. Behaviour kept
    # as-is; confirm this ordering is intended.
    for insert_at, header_col in enumerate(row_header_columns):
        for r in range(row - 1, -1, -1):
            cell = grid[r][header_col]
            if not cell or not cell.get("text", "").strip():
                continue
            if cell["type"] != "th":
                continue

            # Never include <thead> cells in row header context -
            # thead cells are column headers, not row-header hierarchy.
            if cell.get("is_thead", False):
                continue

            scope = cell.get("scope", "")

            # Column-scoped headers name the column; scope='row' is a peer
            # row label, not an ancestor. Skip both.
            if scope in ("col", "colgroup", "row"):
                continue

            # Locate the origin for de-duplication and rowspan lookup.
            if cell.get("is_span_copy", False):
                origin = cell.get("origin", (r, header_col))
                origin_cell = grid[origin[0]][origin[1]] or cell
            else:
                origin = (r, header_col)
                origin_cell = cell

            if scope == "rowgroup":
                # A rowgroup header only ancestors rows inside its extent.
                if row > _rowgroup_extent_end(grid, origin, origin_cell):
                    continue
            elif not cell.get("is_header_row", False):
                # Non-rowgroup <th> cells outside thead are only accepted
                # from the explicit header block (cells flagged
                # ``is_header_row``).
                continue

            if origin not in seen_origins:
                seen_origins.add(origin)
                # Each further-up ancestor is inserted at the same index, so
                # outer ancestors end up before inner ones.
                row_headers.insert(insert_at, cell["text"])

    return row_headers, col_headers


def _rowgroup_extent_end(
    grid: List[List[Dict]], origin: Tuple[int, int], origin_cell: Dict
) -> int:
    """Return the last row index covered by the rowgroup header at ``origin``.

    rowspan > 1  -> the rowspan itself bounds the group:
                    ``origin_row + rowspan - 1``.
    rowspan == 1 -> a single-cell divider row runs until the row before the
                    next non-copy ``scope="rowgroup"`` cell in the same
                    column, or to the end of the grid if there is none.
    """
    origin_row, origin_col = origin
    rowspan = origin_cell.get("rowspan", 1)
    if rowspan > 1:
        return origin_row + rowspan - 1

    for rr in range(origin_row + 1, len(grid)):
        other = grid[rr][origin_col]
        if (
            other
            and not other.get("is_span_copy", False)
            and other.get("scope") == "rowgroup"
        ):
            return rr - 1
    return len(grid) - 1
table2rules/models.py ADDED
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional, Tuple
5
+
6
+
7
@dataclass(frozen=True)
class LogicRule:
    """One extracted cell value together with the headers that describe it."""

    outcome: str
    position: Tuple[int, int]
    row_headers: Tuple[str, ...] = ()
    col_headers: Tuple[str, ...] = ()
    origin: Optional[Tuple[int, int]] = None
    is_footer: bool = False

    def to_string(self) -> str:
        """Render the rule as '<rows> → <cols>: <value>' for Graph-RAG.

        Header levels are joined with " | ". When only one header axis is
        present the arrow is omitted; when neither is present the result is
        'value: <value>'.
        """
        # ``None`` marks an absent axis, so an axis whose joined text happens
        # to be empty is still treated as present.
        row_part = " | ".join(self.row_headers) if self.row_headers else None
        col_part = " | ".join(self.col_headers) if self.col_headers else None

        if row_part is None and col_part is None:
            return f"value: {self.outcome}"
        if row_part is not None and col_part is not None:
            return f"{row_part} → {col_part}: {self.outcome}"

        only_part = col_part if row_part is None else row_part
        return f"{only_part}: {self.outcome}"
table2rules/py.typed ADDED
File without changes
@@ -0,0 +1,186 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Dict, List, Tuple
5
+
6
+ from .models import LogicRule
7
+
8
+
9
@dataclass
class GateResult:
    """Outcome of the confidence gate, as built by ``assess_confidence``."""

    # True only when the score reached the gate threshold and no reason fired.
    ok: bool
    # Weighted confidence score; 0.0 on the early-exit paths (invariant
    # violation or no candidate data cells).
    score: float
    # Reason codes explaining a failed gate; empty when nothing fired.
    reasons: List[str]
14
+
15
+
16
def _candidate_data_positions(grid: List[List[Dict]]) -> List[Tuple[int, int]]:
    """Return ``(row, col)`` for every non-empty, non-header ``<td>`` cell.

    Positions are listed in row-major order. A cell is excluded when the slot
    is empty, its ``type`` is not ``"td"``, it is flagged ``is_thead`` or
    ``is_header_row``, or its text is blank after stripping.
    """

    def _is_data_cell(cell) -> bool:
        if not cell or cell.get("type") != "td":
            return False
        flagged_as_header = cell.get("is_thead", False) or cell.get("is_header_row", False)
        if flagged_as_header:
            return False
        return bool(str(cell.get("text", "")).strip())

    return [
        (row_idx, col_idx)
        for row_idx, cells in enumerate(grid)
        for col_idx, cell in enumerate(cells)
        if _is_data_cell(cell)
    ]
30
+
31
+
32
def check_invariants(grid: List[List[Dict]], rules: List[LogicRule]) -> Tuple[bool, List[str]]:
    """Check structural invariants of ``rules`` against the grid they came from.

    Possible reason codes:
        ``empty_grid``              grid is empty or its first row is empty
        ``position_out_of_bounds``  a rule's position is outside the grid
        ``non_td_rule_cell``        a rule points at a slot that is not a <td>
                                    (including an empty slot)
        ``header_cell_emitted``     a rule points at a thead / header-row cell
        ``empty_rule_outcome``      a rule's outcome is blank
        ``empty_header_text``       a rule carries a blank header

    Args:
        grid: Expanded table grid, indexed ``grid[row][col]``; slots may be
            falsy placeholders.
        rules: Rules to validate. Only ``position``, ``outcome``,
            ``row_headers`` and ``col_headers`` are read.

    Returns:
        ``(ok, reasons)`` where ``reasons`` is sorted and de-duplicated and
        ``ok`` is True exactly when it is empty.
    """
    if not grid or not grid[0]:
        return False, ["empty_grid"]

    reasons = set()
    rows = len(grid)
    cols = len(grid[0])

    for rule in rules:
        r, c = rule.position
        # Bound the column by the actual row length as well as the nominal
        # width, so a ragged grid is reported instead of raising IndexError.
        if not (0 <= r < rows and 0 <= c < min(cols, len(grid[r]))):
            reasons.add("position_out_of_bounds")
            continue

        # Slots may be empty placeholders; an empty slot is reported as a
        # non-<td> cell rather than crashing on ``None.get``.
        cell = grid[r][c] or {}
        if cell.get("type") != "td":
            reasons.add("non_td_rule_cell")
        if cell.get("is_thead", False) or cell.get("is_header_row", False):
            reasons.add("header_cell_emitted")

        if not (rule.outcome or "").strip():
            reasons.add("empty_rule_outcome")

        # Unpack instead of ``+`` so list and tuple header sequences can mix.
        if any(not h.strip() for h in (*rule.row_headers, *rule.col_headers)):
            reasons.add("empty_header_text")

    return not reasons, sorted(reasons)
61
+
62
+
63
def assess_confidence(grid: List[List[Dict]], rules: List[LogicRule]) -> GateResult:
    """
    Conservative fail-open gate:
    - Hard-fail on invariant violations.
    - Soft score combines data coverage and header attachment, with smaller
      weights for self-echo, duplicate positions and conflicting outcomes.

    Args:
        grid: Expanded table grid, indexed ``grid[row][col]``.
        rules: Rules generated from ``grid``.

    Returns:
        A ``GateResult``. ``ok`` is True only when the score is at least 0.45
        and no reason code fired. On an invariant violation, or when the grid
        has no candidate data cells, the score is 0.0.
    """
    # Function-scope import, kept local as in the original; hoisted to the
    # top of the function so it is not buried mid-body.
    import re

    ok, inv_reasons = check_invariants(grid, rules)
    if not ok:
        return GateResult(ok=False, score=0.0, reasons=inv_reasons)

    candidates = _candidate_data_positions(grid)
    if not candidates:
        return GateResult(ok=False, score=0.0, reasons=["no_candidate_data_cells"])

    # Shared denominator for per-rule ratios; max() guards the empty-rules case.
    rule_count = max(1, len(rules))

    rule_positions = {rule.position for rule in rules}
    coverage = len(rule_positions) / max(1, len(candidates))

    with_headers = sum(1 for rule in rules if rule.row_headers or rule.col_headers)
    header_ratio = with_headers / rule_count

    # Penalize duplicate positions and conflicting outcomes at the same position.
    pos_outcomes: Dict[Tuple[int, int], set] = {}
    for rule in rules:
        pos_outcomes.setdefault(rule.position, set()).add(rule.outcome.strip())
    unique_positions = len(pos_outcomes)
    duplicate_ratio = (len(rules) - unique_positions) / rule_count
    conflicting_positions = sum(1 for values in pos_outcomes.values() if len(values) > 1)
    conflict_ratio = conflicting_positions / max(1, unique_positions)

    # Penalize noisy self-echo headers (header identical to value).
    self_echo = 0
    for rule in rules:
        outcome = rule.outcome.strip().lower()
        # Unpack instead of ``+`` so list and tuple header sequences can mix.
        headers = [h.strip().lower() for h in (*rule.row_headers, *rule.col_headers)]
        if outcome and outcome in headers:
            self_echo += 1
    echo_ratio = self_echo / rule_count

    # Shape-heuristic header checks (numeric / placeholder) only apply when
    # the headers could have been MISIDENTIFIED by the parser. Cells that
    # the source explicitly placed inside <thead> are authoritative -
    # financial reports legitimately label columns with years like "2024",
    # and the gate must not second-guess source-authored <th>. Skip the
    # shape heuristics for tables that have any <thead> cell in the grid.
    has_source_thead = any(cell.get("is_thead", False) for row in grid for cell in row if cell)

    # Penalize numeric column headers - real headers are text labels, not values.
    # A column header like "25.000" or "· 12,000" signals the first row was
    # data, not a header. Strip common currency/bullet noise before checking.
    #
    # Flag a RULE only when the ENTIRE column-header stack is numeric. Multi-
    # level headers where the bottom level is numeric (e.g. a year label
    # '2018' under a text group 'Year Ended December 31,') are legitimate
    # financial / statistical / sports tables and must not trigger the guard.
    numeric_noise = re.compile(r"[\s\$€£¥·•\-\+,.]")
    placeholder_only = re.compile(r"^[_\-.\s]+$")

    def _is_numeric_token(h: str) -> bool:
        stripped = numeric_noise.sub("", h.strip())
        return bool(stripped) and stripped.isdigit()

    def _is_placeholder_token(h: str) -> bool:
        return bool(placeholder_only.match(h.strip()))

    rules_all_numeric_col = 0
    rules_all_placeholder_col = 0
    rules_with_col_headers = 0
    for rule in rules:
        if not rule.col_headers:
            continue
        rules_with_col_headers += 1
        if all(_is_numeric_token(h) for h in rule.col_headers):
            rules_all_numeric_col += 1
        if all(_is_placeholder_token(h) for h in rule.col_headers):
            rules_all_placeholder_col += 1
    col_header_rule_count = max(1, rules_with_col_headers)
    numeric_header_ratio = rules_all_numeric_col / col_header_rule_count
    placeholder_header_ratio = rules_all_placeholder_col / col_header_rule_count

    score = (
        (0.45 * coverage)
        + (0.30 * header_ratio)
        + (0.10 * (1.0 - echo_ratio))
        + (0.10 * (1.0 - duplicate_ratio))
        + (0.05 * (1.0 - conflict_ratio))
    )

    reasons: List[str] = []
    if coverage < 0.60:
        reasons.append("low_coverage")
    # Structural invariant for rules mode: every rule must carry at least
    # one header. A rule with zero headers is indistinguishable from flat
    # cell text - rules format implies a header relationship that doesn't
    # exist if no header was found. Fires universally, not on a threshold.
    if len(rules) > 0 and header_ratio < 1.0:
        reasons.append("low_header_attachment")
    if echo_ratio > 0.50:
        reasons.append("high_self_echo")
    # One logical grid position must not carry multiple source cells. A valid
    # rowspan/colspan expands one origin across many positions, but two origins
    # at the same position means the source geometry overlaps. Fail open instead
    # of emitting a rule for an ambiguous slot.
    if duplicate_ratio > 0:
        reasons.append("high_duplicate_positions")
    if conflict_ratio > 0:
        reasons.append("high_position_conflict")
    if not has_source_thead and numeric_header_ratio > 0.30:
        reasons.append("numeric_column_headers")
    if not has_source_thead and placeholder_header_ratio > 0.30:
        reasons.append("placeholder_column_headers")

    # Detect column-header coverage gaps: the table has column headers for some
    # rules but not for others. This happens when the header rows cover fewer
    # columns than the data rows (e.g. a multi-level header that forgets to
    # allocate a slot for the row-label column, shifting all column labels one
    # position and leaving the rightmost data column without any label).
    # Only fires when the table has at least some column headers - a table with
    # only row headers is fine (zero column headers everywhere is intentional).
    rules_without_col_headers = len(rules) - rules_with_col_headers
    if rules_with_col_headers > 0 and rules_without_col_headers > 0:
        reasons.append("partial_column_coverage")

    # Keep threshold modest so we only fail on clearly weak parses.
    gate_ok = score >= 0.45 and not reasons
    return GateResult(ok=gate_ok, score=score, reasons=reasons)
table2rules/report.py ADDED
@@ -0,0 +1,155 @@
1
+ """Per-table observability types returned by ``process_tables_with_stats``.
2
+
3
+ Downstream integrators use these to answer "did this table convert cleanly,
4
+ and if not, why?" without having to re-parse the output or sample by hand.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from itertools import chain
11
+ from typing import Dict, FrozenSet, Iterable, Literal, Optional, Tuple
12
+
13
+ # The four values ``render_mode`` can take. Order = descending output quality.
14
+ #
15
+ # rules — gate passed; exporter-native output (one rule per line).
16
+ # flat — gate failed; header-free pipe-joined cell rows.
17
+ # passthrough — neither rules nor flat produced anything; raw HTML emitted.
18
+ # skipped — input was refused before fallback output (currently too large).
19
+ #
20
+ # Adding new values is a minor-version bump; renaming/removing is breaking.
21
# Closed set of string values accepted for ``TableReport.render_mode``.
RenderMode = Literal["rules", "flat", "passthrough", "skipped"]

# Symbolic constants for integrators who'd rather not sprinkle magic strings
# through policy code. ``t.render_mode == RENDER_MODE_RULES`` is equivalent to
# ``t.render_mode == "rules"``; use whichever reads better at the call site.
# Each constant holds exactly one of the ``RenderMode`` literals.
RENDER_MODE_RULES: RenderMode = "rules"
RENDER_MODE_FLAT: RenderMode = "flat"
RENDER_MODE_PASSTHROUGH: RenderMode = "passthrough"
RENDER_MODE_SKIPPED: RenderMode = "skipped"
30
+
31
+
32
+ # Stable catalogue of every ``reasons`` string a ``TableReport`` may contain.
33
+ # This is part of the semver contract: additions are minor-version bumps;
34
+ # renames or removals are breaking. Integrators who switch on ``reasons`` can
35
+ # lint against ``REASONS.keys()`` to catch typos.
36
# Maps each reason code to a one-sentence, human-readable description.
# The keys are the contract; the descriptions mirror what the gate checks.
REASONS: Dict[str, str] = {
    # --- Gate invariants (structural) ---
    "empty_grid": "Parsed grid was empty or zero-dimensional.",
    "position_out_of_bounds": "A generated rule's position fell outside the grid bounds.",
    "non_td_rule_cell": "A rule was generated from a non-<td> cell.",
    "header_cell_emitted": "A rule was generated from a cell flagged as a header.",
    "empty_rule_outcome": "A rule's outcome text was empty.",
    "empty_header_text": "A rule had at least one header with empty text.",
    # --- Gate confidence (statistical) ---
    "no_candidate_data_cells": "The table had no non-empty data cells.",
    "low_coverage": "Fewer than 60% of data cells produced a rule.",
    "low_header_attachment": "At least one rule lacked any header context; rules mode requires every rule to carry at least one header.",
    # The gate compares the outcome against row AND column headers.
    "high_self_echo": "More than 50% of rules repeat one of their own row or column headers as their value.",
    "high_duplicate_positions": "At least one logical grid position produced multiple rules.",
    "high_position_conflict": "At least one logical grid position carried conflicting outcomes.",
    # Both ratios below are measured over rules that have column headers.
    "numeric_column_headers": "More than 30% of rules that carry column headers have all-numeric column headers — likely a data row misread as a header.",
    "placeholder_column_headers": "More than 30% of rules that carry column headers have placeholder-only column headers (underscores, dashes, dots).",
    "partial_column_coverage": "The table has column headers for some rules but not for others — the header rows do not fully cover all data columns. Common cause: a multi-level header that does not reserve a column for the row-label, shifting all column labels one position to the right.",
    # --- Report-level signals ---
    "input_too_large": "Expanded grid exceeded the safety cap; the table was skipped.",
    "processing_error": "The parser raised an exception and ``strict=False`` swallowed it; see ``TableReport.error``.",
}
58
+
59
+
60
+ # Operational severity grouping for the codes in ``REASONS``.
61
+ #
62
+ # defensive — structural invariants on the library's own output. Should
63
+ # never fire in production; if you see one, file an issue.
64
+ # confidence — soft gate signals for low-quality parses. Expected on
65
+ # real-world input; tune alerting against these.
66
+ # input — signals that the caller handed table2rules bad data. The
67
+ # fix is upstream, not in this library.
68
+ #
69
+ # Exposing this grouping lets integrators auto-populate metrics dashboards
70
+ # and switch statements without hardcoding the buckets from the docs. Every
71
+ # key in ``REASONS`` appears in exactly one bucket — enforced by tests.
72
# Severity bucket -> immutable set of the reason codes that belong to it.
REASONS_BY_SEVERITY: Dict[str, FrozenSet[str]] = {
    # Structural invariants on the library's own output.
    "defensive": frozenset(
        (
            "empty_grid",
            "position_out_of_bounds",
            "non_td_rule_cell",
            "header_cell_emitted",
            "empty_rule_outcome",
            "empty_header_text",
        )
    ),
    # Soft gate signals for low-quality parses.
    "confidence": frozenset(
        (
            "no_candidate_data_cells",
            "low_coverage",
            "low_header_attachment",
            "high_self_echo",
            "high_duplicate_positions",
            "high_position_conflict",
            "numeric_column_headers",
            "placeholder_column_headers",
            "partial_column_coverage",
        )
    ),
    # Signals attributed to the data handed in by the caller.
    "input": frozenset(("input_too_large", "processing_error")),
}
103
+
104
+
105
@dataclass(frozen=True)
class TableReport:
    """Observability record for a single top-level table in the input HTML.

    Instances are immutable (frozen dataclass).

    ``text`` carries the rendered output for *this* table only — the same lines
    that contributed to the concatenated string returned alongside the report.
    Callers passing whole-document HTML in can read ``report.tables[i].text``
    to keep per-table provenance instead of having to split the flat blob.

    ``caption`` is the text of the table's ``<caption>`` element when present,
    otherwise ``None``. Only direct ``<caption>`` children are read; the HTML
    ``id`` attribute, surrounding headings, and other content-derived names
    are intentionally ignored — ``table_index`` remains the only stable
    positional identifier.
    """

    # Position of the table within the call that produced this report.
    table_index: int
    # One of the ``RenderMode`` literals: "rules", "flat", "passthrough", "skipped".
    render_mode: RenderMode
    # Gate verdict and score for this table.
    # NOTE(review): presumably copied from GateResult.ok / GateResult.score — confirm.
    gate_ok: bool
    gate_score: float
    # Reason codes for this table; the ``REASONS`` catalogue lists every code
    # that may appear here.
    reasons: Tuple[str, ...]
    # Error detail accompanying the "processing_error" reason; None by default.
    error: Optional[str] = None
    # Text of the table's <caption>, or None when absent.
    caption: Optional[str] = None
    # Rendered output for this table only; empty string by default.
    text: str = ""
129
+
130
+
131
@dataclass(frozen=True)
class RenderReport:
    """Immutable bundle of the per-table reports from one ``process_tables_*`` call."""

    tables: Tuple[TableReport, ...] = ()

    @property
    def tables_rendered(self) -> int:
        """Number of tables whose render mode is anything but ``"skipped"``."""
        rendered = [table for table in self.tables if table.render_mode != "skipped"]
        return len(rendered)

    @property
    def tables_flagged(self) -> int:
        """Number of tables whose render mode is anything but ``"rules"``."""
        flagged = [table for table in self.tables if table.render_mode != "rules"]
        return len(flagged)

    @classmethod
    def merge(cls, reports: Iterable["RenderReport"]) -> "RenderReport":
        """Build one report holding every table of ``reports``, in order.

        ``table_index`` values are carried over untouched, so they still
        refer to positions within the call that produced each table. Track
        cross-call identity separately if you need it.
        """
        combined = chain.from_iterable(report.tables for report in reports)
        return cls(tables=tuple(combined))