table2rules 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,51 @@
1
+ """table2rules — convert HTML tables to flat, LLM-friendly rules."""
2
+
3
+ from importlib.metadata import PackageNotFoundError
4
+ from importlib.metadata import version as _pkg_version
5
+
6
+ from ._core import process_table, process_tables_to_text, process_tables_with_stats
7
+ from .errors import Table2RulesError, TableTooLargeError
8
+ from .exporters import (
9
+ Exporter,
10
+ RulesExporter,
11
+ available_exporters,
12
+ register_exporter,
13
+ )
14
+ from .models import LogicRule
15
+ from .report import (
16
+ REASONS,
17
+ REASONS_BY_SEVERITY,
18
+ RENDER_MODE_FLAT,
19
+ RENDER_MODE_PASSTHROUGH,
20
+ RENDER_MODE_RULES,
21
+ RENDER_MODE_SKIPPED,
22
+ RenderReport,
23
+ TableReport,
24
+ )
25
+
26
+ # Expose the installed distribution's version as ``__version__``. When no
+ # distribution metadata for "table2rules" can be found, fall back to a
+ # placeholder so that importing the package never fails on the lookup.
+ try:
27
+ __version__ = _pkg_version("table2rules")
28
+ except PackageNotFoundError:
29
+ __version__ = "0.0.0+unknown"
30
+
31
+ __all__ = [
32
+ "__version__",
33
+ "LogicRule",
34
+ "process_table",
35
+ "process_tables_to_text",
36
+ "process_tables_with_stats",
37
+ "RenderReport",
38
+ "TableReport",
39
+ "REASONS",
40
+ "REASONS_BY_SEVERITY",
41
+ "RENDER_MODE_RULES",
42
+ "RENDER_MODE_FLAT",
43
+ "RENDER_MODE_PASSTHROUGH",
44
+ "RENDER_MODE_SKIPPED",
45
+ "Table2RulesError",
46
+ "TableTooLargeError",
47
+ "Exporter",
48
+ "RulesExporter",
49
+ "available_exporters",
50
+ "register_exporter",
51
+ ]
@@ -0,0 +1,85 @@
1
+ """CLI entry point: python -m table2rules [input] [-o output]"""
2
+
3
+ import argparse
4
+ import sys
5
+
6
+ from . import __version__
7
+ from ._core import process_tables_to_text, process_tables_with_stats
8
+ from .errors import Table2RulesError
9
+ from .exporters import DEFAULT_FORMAT, available_exporters
10
+
11
+
12
def main() -> None:
    """Run the table2rules command-line interface.

    Parses the command line, reads HTML from the input file (or stdin when
    the input is ``-``), converts it with the selected exporter and writes
    the result to the ``--output`` file (or stdout when it is ``-``).

    With ``--strict`` the conversion goes through
    ``process_tables_with_stats(..., strict=True)``; otherwise it uses
    ``process_tables_to_text``.

    Exits with status 1, after printing an ``error: ...`` line to stderr,
    when the input cannot be read or decoded, when conversion raises
    ``Table2RulesError``, or when the output cannot be written.
    """
    parser = argparse.ArgumentParser(
        prog="table2rules",
        description="Convert HTML tables to flat, LLM-friendly rules.",
    )
    parser.add_argument(
        "input",
        nargs="?",
        default="-",
        help="Input file (default: stdin)",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="-",
        help="Output file (default: stdout)",
    )
    parser.add_argument(
        "-f",
        "--format",
        default=DEFAULT_FORMAT,
        choices=available_exporters(),
        help=f"Output exporter (default: {DEFAULT_FORMAT})",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="Fail on parse errors or oversized tables instead of degrading.",
    )
    parser.add_argument(
        "-V",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    args = parser.parse_args()

    # Read input
    if args.input == "-":
        html = sys.stdin.read()
    else:
        try:
            with open(args.input, "r", encoding="utf-8") as f:
                html = f.read()
        except FileNotFoundError:
            print(f"error: file not found: {args.input}", file=sys.stderr)
            sys.exit(1)
        except IsADirectoryError:
            print(f"error: is a directory: {args.input}", file=sys.stderr)
            sys.exit(1)
        except (OSError, UnicodeDecodeError) as e:
            # Permission problems, other I/O failures and non-UTF-8 input
            # get the same clean one-line error instead of a traceback.
            print(f"error: cannot read {args.input}: {e}", file=sys.stderr)
            sys.exit(1)

    try:
        if args.strict:
            result, _ = process_tables_with_stats(html, format=args.format, strict=True)
        else:
            result = process_tables_to_text(html, format=args.format)
    except Table2RulesError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)

    # Write output
    if args.output == "-":
        sys.stdout.write(result)
    else:
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(result)
        except OSError as e:
            # OSError covers PermissionError and IsADirectoryError as well as
            # a missing parent directory (FileNotFoundError) or a full disk.
            print(f"error: cannot write to {args.output}: {e}", file=sys.stderr)
            sys.exit(1)


if __name__ == "__main__":
    main()
table2rules/_core.py ADDED
@@ -0,0 +1,351 @@
1
+ import logging
2
+ from typing import List, Tuple, Union
3
+
4
+ from bs4 import BeautifulSoup, Tag
5
+
6
+ from .cleanup import clean_rules
7
+ from .errors import TableTooLargeError
8
+ from .exporters import DEFAULT_FORMAT, Exporter, get_exporter
9
+ from .grid_parser import clean_text, parse_table_to_grid
10
+ from .maze_pathfinder import find_headers_for_cell
11
+ from .models import LogicRule
12
+ from .quality_gate import GateResult, assess_confidence
13
+ from .report import RenderMode, RenderReport, TableReport
14
+ from .simple_repair import simple_repair
15
+
16
+
17
+ def _split_compound_tables(soup) -> None:
18
+ """Split tables with mid-body header resets into separate tables.
19
+
20
+ Detects all-<th> rows in the body that redefine column names (e.g. OCR
21
+ page-break repeats where "Sales" becomes "Returns"). Each section gets
22
+ its own <table> so it is parsed with the correct headers.
23
+
24
+ Operates on the raw soup BEFORE simple_repair to avoid false positives
25
+ from summary rows promoted to <th>.
+
+ Mutates ``soup`` in place and returns ``None``. A table that is split is
+ replaced by one new attribute-less ``<table>`` per section, and the
+ original table is then removed with ``decompose()``, so anything still
+ attached to it at that point (rows before the first detected header row,
+ a ``<caption>``) is discarded with it. Tables with fewer than three
+ direct rows, fewer than two detected boundaries, or any section lacking
+ a multi-cell row that contains a ``<td>`` are left untouched.
26
+ """
27
+ for table in list(soup.find_all("table")):
28
+ rows = [r for r in table.find_all("tr") if r.find_parent("table") is table]
29
+ if len(rows) < 3:
30
+ continue
31
+
32
+ # Find all-th rows in source markup, but only treat them as
33
+ # split points when data rows appear in between. Consecutive
34
+ # all-th rows (top or post-data) form a single multi-row header,
35
+ # not separate boundaries. Summary rows that simple_repair later
36
+ # promotes to <th> (e.g. "Total", "Subtotal") must not be treated
37
+ # as boundaries either.
38
+ SUMMARY_LABELS = {"total", "subtotal", "sub total", "grand total"}
39
+ header_indices: List[int] = []
40
+ seen_data_row = False
41
+ prev_was_header_row = False
42
+ for idx, row in enumerate(rows):
43
+ cells = row.find_all(["td", "th"], recursive=False)
44
+ if not cells:
45
+ continue
46
+ all_th = len(cells) >= 2 and all(c.name == "th" for c in cells)
47
+ looks_like_summary = any(
48
+ c.get_text(strip=True).lower() in SUMMARY_LABELS for c in cells
49
+ )
50
+ if all_th and not looks_like_summary:
51
+ non_empty = sum(1 for c in cells if c.get_text(strip=True))
52
+ if non_empty >= len(cells) // 2:
53
+ if not seen_data_row:
54
+ # Part of the initial header block
55
+ if not header_indices:
56
+ header_indices.append(idx)
57
+ elif not prev_was_header_row:
58
+ # Genuine reset (previous content row was data, not
59
+ # another header row continuing a multi-row header)
60
+ header_indices.append(idx)
61
+ prev_was_header_row = True
62
+ elif len(cells) >= 2:
63
+ # Only multi-cell non-th rows count as data.
64
+ # Single-cell rows (titles, captions) don't flip the flag.
65
+ seen_data_row = True
66
+ prev_was_header_row = False
67
+ else:
68
+ prev_was_header_row = False
69
+
70
+ if len(header_indices) < 2:
71
+ continue
72
+
73
+ boundaries = header_indices + [len(rows)]
74
+ for i in range(len(header_indices)):
75
+ section_rows = rows[boundaries[i] : boundaries[i + 1]]
76
+ has_data = any(
77
+ len(r.find_all(["td", "th"], recursive=False)) >= 2
78
+ and any(c.name == "td" for c in r.find_all(["td", "th"], recursive=False))
79
+ for r in section_rows
80
+ )
81
+ if not has_data:
82
+ break
83
+ else:
84
+ sections_html = []
85
+ for i in range(len(header_indices)):
86
+ start = boundaries[i]
87
+ end = boundaries[i + 1]
88
+ section_rows = rows[start:end]
89
+ new_table = soup.new_tag("table")
90
+ for row in section_rows:
91
+ row.extract()
92
+ new_table.append(row)
93
+ sections_html.append(new_table)
94
+ for section in reversed(sections_html):
95
+ table.insert_after(section)
96
+ table.decompose()
97
+
98
+
99
def _extract_cell_rows(table_html: str) -> List[List[str]]:
    """Collect cleaned cell text row by row, ignoring header semantics.

    This is the fallback input used when a table could not be rendered as
    rules. Only rows that belong directly to the first ``<table>`` in
    ``table_html`` are read (rows of nested tables are ignored), and rows
    whose cells are all empty after cleaning are dropped.

    Returns:
        One list of cell strings per kept row; ``[]`` when no table is
        found or when anything goes wrong while parsing.
    """
    try:
        first_table = BeautifulSoup(table_html, "html.parser").find("table")
        if not isinstance(first_table, Tag):
            return []

        collected: List[List[str]] = []
        for tr in first_table.find_all("tr"):
            # Keep only real tags owned by this table, not by a nested one.
            if not isinstance(tr, Tag) or tr.find_parent("table") is not first_table:
                continue
            row_texts = [
                clean_text(cell.get_text(" ", strip=True))
                for cell in tr.find_all(["td", "th"], recursive=False)
            ]
            if any(row_texts):
                collected.append(row_texts)
        return collected
    except Exception:
        # Best-effort fallback: never let this path raise.
        return []
120
+
121
+
122
def _build_rules(grid) -> List[LogicRule]:
    """Emit one ``LogicRule`` for every grid position covered by a data cell.

    Args:
        grid: parsed table grid, indexed as ``grid[row][col]``, where each
            entry is a dict describing the cell. The width is taken from
            the first row.

    Returns:
        Rules in row-major order of their origin cell. A cell with
        ``rowspan``/``colspan`` produces one rule per covered position that
        lies inside the grid, each carrying the headers found for that
        position and the origin cell's coordinates.
    """
    rules: List[LogicRule] = []
    n_rows = len(grid)
    if n_rows == 0:
        return rules
    n_cols = len(grid[0])

    for origin_row in range(n_rows):
        for origin_col in range(n_cols):
            cell = grid[origin_row][origin_col]

            # Skip anything that is not an originating, non-empty <td>
            # outside header rows; span copies are handled from their origin.
            if (
                cell["type"] != "td"
                or cell.get("is_thead", False)
                or cell.get("is_header_row", False)
                or not cell.get("text", "").strip()
                or cell.get("is_span_copy", False)
            ):
                continue

            rowspan = cell.get("rowspan", 1)
            colspan = cell.get("colspan", 1)
            # Clamp the spanned area to the grid so out-of-range positions
            # are never visited.
            row_stop = min(origin_row + rowspan, n_rows)
            col_stop = min(origin_col + colspan, n_cols)

            for target_row in range(origin_row, row_stop):
                for target_col in range(origin_col, col_stop):
                    row_headers, col_headers = find_headers_for_cell(
                        grid, target_row, target_col
                    )
                    rules.append(
                        LogicRule(
                            outcome=cell["text"],
                            position=(target_row, target_col),
                            is_footer=cell.get("is_footer", False),
                            row_headers=tuple(row_headers),
                            col_headers=tuple(col_headers),
                            origin=(origin_row, origin_col),
                        )
                    )

    return rules
170
+
171
+
172
def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
    """Run repair, grid parsing, rule building and the quality gate.

    Returns:
        ``(rules, gate)``. ``rules`` is ``[]`` whenever the gate verdict is
        not ok. When no ``<table>`` is found after repair, or the parsed
        grid is empty, the gate is a failing result with reason
        ``"empty_grid"``.

    Raises:
        Nothing is caught here: per the original contract,
        ``TableTooLargeError`` (adversarial span values) and other parse
        errors propagate so the caller can decide whether to swallow them.
    """
    table = BeautifulSoup(simple_repair(table_html), "html.parser").find("table")
    grid = parse_table_to_grid(table) if isinstance(table, Tag) else None
    if not grid:
        return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])

    rules = clean_rules(_build_rules(grid))
    gate = assess_confidence(grid, rules)
    return (rules if gate.ok else []), gate
194
+
195
+
196
def process_table(table_html: str, *, strict: bool = False) -> List[LogicRule]:
    """Convert one table into rules, one per cell position.

    Args:
        table_html: HTML string containing a single ``<table>``.
        strict: when ``True``, any exception raised while processing
            (parse errors, ``TableTooLargeError``) is re-raised. The
            default ``False`` is fail-open: the failure is logged at debug
            level and ``[]`` is returned.

    Returns:
        The rules for the table, or ``[]`` on gate failure (and, when not
        strict, on any processing error). Use
        :func:`process_tables_with_stats` to tell those cases apart.
    """
    try:
        rules, _gate = _process_table_with_gate(table_html)
    except Exception:
        if not strict:
            logging.debug("process_table failed on input, returning empty", exc_info=True)
            return []
        raise
    return rules
215
+
216
+
217
def _run(
    html_content: str,
    format: Union[str, Exporter],
    collect_report: bool,
    strict: bool,
) -> Tuple[str, RenderReport]:
    """Render every top-level table in ``html_content`` (shared engine).

    Each top-level table is rendered in one of four modes:

    - ``"rules"``: the pipeline produced rules, exported via ``export_rules``;
    - ``"skipped"``: the table raised ``TableTooLargeError``; nothing is emitted;
    - ``"flat"``: no rules, but ``export_flat`` produced lines from raw cells;
    - ``"passthrough"``: nothing else worked; the table's HTML is emitted as is.

    Args:
        html_content: raw HTML, possibly containing several tables.
        format: exporter name or exporter instance (resolved up front, so
            an unknown name fails even for empty input).
        collect_report: when true, build one ``TableReport`` per top-level
            table; otherwise an empty ``RenderReport`` is returned.
        strict: when true, exceptions from table processing are re-raised
            instead of triggering the fallback modes.

    Returns:
        ``(text, report)`` where ``text`` joins all emitted chunks with
        newlines. Empty input or input without tables yields
        ``("", RenderReport())``.
    """
    exporter = get_exporter(format)

    if not html_content:
        return "", RenderReport()

    soup = BeautifulSoup(html_content, "html.parser")
    if not soup.find_all("table"):
        return "", RenderReport()

    # Split compound tables (mid-body header resets) on the raw markup,
    # i.e. before any per-table repair runs, so repaired summary rows are
    # not mistaken for header resets.
    _split_compound_tables(soup)

    output_chunks: List[str] = []
    reports: List[TableReport] = []

    for table in soup.find_all("table"):
        # Nested tables are folded into their parent's cell text.
        if table.find_parent("table"):
            continue

        table_html = str(table)
        rules: List[LogicRule] = []
        gate: GateResult = GateResult(ok=False, score=0.0, reasons=[])
        too_large = False
        error_msg = None

        try:
            rules, gate = _process_table_with_gate(table_html)
        except Exception as exc:
            if strict:
                raise
            if isinstance(exc, TableTooLargeError):
                too_large = True
                error_msg = str(exc)
            else:
                logging.debug("table processing failed; falling back", exc_info=True)
                error_msg = f"{type(exc).__name__}: {exc}"

        render_mode: RenderMode
        table_chunks: List[str] = []
        if rules:
            render_mode = "rules"
            table_chunks = list(exporter.export_rules(rules))
        elif too_large:
            # Emit nothing for span-bomb input: skipping entirely gives
            # downstream consumers the clearest signal.
            render_mode = "skipped"
        else:
            cell_rows = _extract_cell_rows(table_html)
            flat_lines = exporter.export_flat(cell_rows) if cell_rows else []
            if flat_lines:
                render_mode = "flat"
                table_chunks = list(flat_lines)
            else:
                render_mode = "passthrough"
                table_chunks = [table_html]
        output_chunks.extend(table_chunks)

        if collect_report:
            if too_large:
                leading = ("input_too_large",)
            elif error_msg is not None:
                leading = ("processing_error",)
            else:
                leading = ()
            caption_tag = table.find("caption", recursive=False)
            caption_text = (clean_text(caption_tag.get_text()) if caption_tag else "") or None
            reports.append(
                TableReport(
                    # One report per top-level table, so the count so far
                    # is this table's index.
                    table_index=len(reports),
                    render_mode=render_mode,
                    gate_ok=gate.ok,
                    gate_score=gate.score,
                    reasons=leading + tuple(gate.reasons),
                    error=error_msg,
                    caption=caption_text,
                    text="\n".join(table_chunks),
                )
            )

    text = "\n".join(output_chunks)
    if collect_report:
        return text, RenderReport(tables=tuple(reports))
    return text, RenderReport()
314
+
315
+
316
def process_tables_to_text(
    html_content: str,
    format: Union[str, Exporter] = DEFAULT_FORMAT,
) -> str:
    """Render the tables in ``html_content`` as text (fail-open, no report).

    Args:
        html_content: raw HTML containing one or more <table> elements.
        format: exporter name (e.g. ``"rules"``) or an ``Exporter``
            instance. Defaults to ``DEFAULT_FORMAT``.

    Returns:
        The rendered text only; processing errors are never raised for
        individual tables because the engine runs with ``strict=False``.
    """
    return _run(html_content, format, False, False)[0]
329
+
330
+
331
def process_tables_with_stats(
    html_content: str,
    *,
    format: Union[str, Exporter] = DEFAULT_FORMAT,
    strict: bool = False,
) -> Tuple[str, RenderReport]:
    """Render the tables in ``html_content`` and report how each one fared.

    Args:
        html_content: raw HTML containing one or more <table> elements.
        format: exporter name or an ``Exporter`` instance.
        strict: when ``True``, parse errors and ``TableTooLargeError`` are
            re-raised instead of being absorbed by the fallback modes.
            Handy in development and tests; leave it ``False`` for
            production pipelines that process untrusted input.

    Returns:
        ``(formatted text, RenderReport)``. The report holds one
        ``TableReport`` per top-level table, in input order, with the gate
        verdict, the render mode actually used and any captured error
        message.
    """
    return _run(html_content, format, True, strict)
table2rules/cleanup.py ADDED
@@ -0,0 +1,61 @@
1
+ import re
2
+ from dataclasses import replace
3
+ from typing import List, Tuple
4
+
5
+ from .models import LogicRule
6
+
7
+
8
def clean_rules(rules: List[LogicRule]) -> List[LogicRule]:
    """Post-process freshly built rules.

    Three clean-ups are applied:

    1. Footer rules that look like notes, footnotes or legends are dropped.
    2. Self-echo rules, whose value equals one of their own column headers
       (ignoring case and surrounding whitespace), are dropped.
    3. Surviving rules get their row and column header paths de-duplicated.

    Returns:
        A new list; kept rules are copies with de-duplicated headers.
    """
    kept: List[LogicRule] = []

    for rule in rules:
        if rule.is_footer:
            lowered = rule.outcome.lower()

            is_annotation = (
                lowered.startswith(("note:", "footnote"))
                or "legend:" in lowered
                or "months indicate" in lowered
            )
            if is_annotation:
                continue

            # Footer text shaped like "<number> <word> ... <number> <word>".
            if re.search(r"^\d+\s+\w+.*?\d+\s+\w+", lowered):
                continue

        # Self-echo: body rows repeating the header text (OCR artifacts,
        # page-break header repeats) carry no information.
        if rule.col_headers:
            value = rule.outcome.strip().lower()
            if any(value == header.strip().lower() for header in rule.col_headers):
                continue

        deduped = replace(
            rule,
            row_headers=deduplicate_headers(rule.row_headers),
            col_headers=deduplicate_headers(rule.col_headers),
        )
        kept.append(deduped)

    return kept
51
+
52
+
53
def deduplicate_headers(headers: Tuple[str, ...]) -> Tuple[str, ...]:
    """Return ``headers`` without exact repeats, keeping first-seen order."""
    # dict keys are unique and remember insertion order.
    return tuple(dict.fromkeys(headers))
table2rules/errors.py ADDED
@@ -0,0 +1,17 @@
1
+ """Public exception types for table2rules."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class Table2RulesError(Exception):
    """Root of the table2rules exception hierarchy.

    Catch this to handle every error type the package defines.
    """
8
+
9
+
10
class TableTooLargeError(Table2RulesError):
    """Signals that expanding a table's spans would exceed the configured caps.

    The usual cause is malformed or adversarial HTML, for example a cell
    declaring ``rowspan=99999`` / ``colspan=99999``. When handling
    untrusted input, treat this as a cue to skip or degrade the table
    rather than hang on allocation.
    """
@@ -0,0 +1,33 @@
1
+ """Pluggable output exporters for table2rules.
2
+
3
+ Built-in:
4
+ - "rules" (default): one rule per line, full header paths — the native
5
+ table2rules format.
6
+
7
+ Third parties can register custom exporters:
8
+
9
+ from table2rules.exporters import Exporter, register_exporter
10
+
11
+ class MyExporter:
12
+ name = "mine"
13
+ def export_rules(self, rules): ...
14
+ def export_flat(self, cell_rows): ...
15
+
16
+ register_exporter(MyExporter())
17
+ """
18
+
19
+ from .base import Exporter, available_exporters, get_exporter, register_exporter
20
+ from .rules import RulesExporter
21
+
22
+ register_exporter(RulesExporter())
23
+
24
+ DEFAULT_FORMAT = "rules"
25
+
26
+ __all__ = [
27
+ "Exporter",
28
+ "RulesExporter",
29
+ "DEFAULT_FORMAT",
30
+ "available_exporters",
31
+ "get_exporter",
32
+ "register_exporter",
33
+ ]
@@ -0,0 +1,41 @@
1
+ """Exporter protocol and registry for table2rules.
2
+
3
+ Exporters turn a list of LogicRule objects (one per cell) into a list of
4
+ output lines. Third parties can add new formats by implementing the Exporter
5
+ protocol and calling register_exporter().
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import Dict, List, Protocol
11
+
12
+ from ..models import LogicRule
13
+
14
+
15
class Exporter(Protocol):
    """Structural interface every output exporter must satisfy."""

    # Key under which the exporter is stored in the registry.
    name: str

    def export_rules(self, rules: List[LogicRule]) -> List[str]:
        """Turn a table's rules into output lines."""
        ...

    def export_flat(self, cell_rows: List[List[str]]) -> List[str]:
        """Turn raw per-row cell text into output lines."""
        ...
21
+
22
+
23
+ _REGISTRY: Dict[str, Exporter] = {}
24
+
25
+
26
def register_exporter(exporter: Exporter) -> None:
    """Store ``exporter`` in the registry under ``exporter.name``.

    An exporter already registered under the same name is replaced.
    """
    key = exporter.name
    _REGISTRY[key] = exporter
28
+
29
+
30
def get_exporter(name_or_instance) -> Exporter:
    """Resolve an exporter from a registered name or pass an instance through.

    Args:
        name_or_instance: a registered exporter name, or any non-string
            object, which is returned unchanged (it is not validated).

    Raises:
        ValueError: if a string is given that is not a registered name.
    """
    if not isinstance(name_or_instance, str):
        return name_or_instance
    if name_or_instance in _REGISTRY:
        return _REGISTRY[name_or_instance]
    raise ValueError(
        f"unknown exporter {name_or_instance!r}; registered: {sorted(_REGISTRY)}"
    )
38
+
39
+
40
def available_exporters() -> List[str]:
    """Return the names of all registered exporters in sorted order."""
    names = list(_REGISTRY)
    names.sort()
    return names