table2rules 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.5.0/src/table2rules.egg-info → table2rules-0.5.2}/PKG-INFO +1 -1
  2. {table2rules-0.5.0 → table2rules-0.5.2}/pyproject.toml +1 -1
  3. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/_core.py +82 -2
  4. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/models.py +6 -0
  5. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/quality_gate.py +7 -0
  6. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/simple_repair.py +33 -1
  7. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/spans.py +15 -0
  8. {table2rules-0.5.0 → table2rules-0.5.2/src/table2rules.egg-info}/PKG-INFO +1 -1
  9. {table2rules-0.5.0 → table2rules-0.5.2}/tests/test_correctness_oracle.py +8 -0
  10. {table2rules-0.5.0 → table2rules-0.5.2}/tests/test_regression_golds.py +20 -14
  11. {table2rules-0.5.0 → table2rules-0.5.2}/tests/test_robustness_mutations.py +20 -3
  12. {table2rules-0.5.0 → table2rules-0.5.2}/LICENSE +0 -0
  13. {table2rules-0.5.0 → table2rules-0.5.2}/README.md +0 -0
  14. {table2rules-0.5.0 → table2rules-0.5.2}/setup.cfg +0 -0
  15. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/__init__.py +0 -0
  16. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/__main__.py +0 -0
  17. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/cleanup.py +0 -0
  18. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/errors.py +0 -0
  19. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/exporters/__init__.py +0 -0
  20. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/exporters/base.py +0 -0
  21. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/exporters/rules.py +0 -0
  22. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/grid_parser.py +0 -0
  23. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/maze_pathfinder.py +0 -0
  24. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/py.typed +0 -0
  25. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules/report.py +0 -0
  26. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/SOURCES.txt +0 -0
  27. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/dependency_links.txt +0 -0
  28. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/entry_points.txt +0 -0
  29. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/requires.txt +0 -0
  30. {table2rules-0.5.0 → table2rules-0.5.2}/src/table2rules.egg-info/top_level.txt +0 -0
  31. {table2rules-0.5.0 → table2rules-0.5.2}/tests/test_determinism.py +0 -0
  32. {table2rules-0.5.0 → table2rules-0.5.2}/tests/test_public_api.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.5.0"
7
+ version = "0.5.2"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -12,6 +12,7 @@ from .models import LogicRule
12
12
  from .quality_gate import GateResult, assess_confidence
13
13
  from .report import RenderMode, RenderReport, TableReport
14
14
  from .simple_repair import simple_repair
15
+ from .spans import is_full_width_note
15
16
 
16
17
 
17
18
  def _split_compound_tables(soup) -> None:
@@ -122,9 +123,18 @@ def _extract_cell_rows(table_html: str) -> List[List[str]]:
122
123
  def _build_rules(grid) -> List[LogicRule]:
123
124
  """Walk the parsed grid and emit one LogicRule per data cell position."""
124
125
  rules: List[LogicRule] = []
126
+ n_cols = len(grid[0])
127
+
128
+ # Rows that carry a *real* value — used below to decide which rows need
129
+ # label-only preservation. A value that merely echoes its own column header
130
+ # (a de-spanned or page-break-repeated header cell) carries no independent
131
+ # data and is dropped downstream by clean_rules; it must not mask an
132
+ # otherwise label-only row. Tracked at the value's *target* positions so a
133
+ # rowspan-filled value correctly marks every row it covers.
134
+ rows_with_value: set = set()
125
135
 
126
136
  for row_idx in range(len(grid)):
127
- for col_idx in range(len(grid[0])):
137
+ for col_idx in range(n_cols):
128
138
  cell = grid[row_idx][col_idx]
129
139
 
130
140
  # Only <td> cells are data cells
@@ -144,6 +154,23 @@ def _build_rules(grid) -> List[LogicRule]:
144
154
 
145
155
  rowspan = cell.get("rowspan", 1)
146
156
  colspan = cell.get("colspan", 1)
157
+ outcome_norm = cell["text"].strip().lower()
158
+
159
+ # A wide <td> that reaches the last column AND covers a majority of
160
+ # the grid's columns is structurally a full-width note/description
161
+ # (e.g. a benefit name "Accidental death and permanent disability"
162
+ # or "If the departure of your public transport is delayed…"
163
+ # spanning the whole value region), not a per-column value. We still
164
+ # emit at every spanned position — so the gate detects an
165
+ # overlapping-span corruption (a rowspan intruding into the note's
166
+ # row) as a conflict and fails open to flat — but attribute every
167
+ # position to the *origin* column's header path. The exporter's
168
+ # origin-aware dedup then collapses the identical lines to one,
169
+ # instead of stamping the sentence under each plan×cover header.
170
+ # Legitimate narrow spans (a right-edge colspan=2 amount covering
171
+ # INDIVIDUAL+FAMILY of one plan) fail the majority test and keep
172
+ # their genuine per-column attribution.
173
+ note = is_full_width_note(col_idx, colspan, n_cols)
147
174
 
148
175
  for r_offset in range(rowspan):
149
176
  for c_offset in range(colspan):
@@ -153,7 +180,8 @@ def _build_rules(grid) -> List[LogicRule]:
153
180
  if target_row >= len(grid) or target_col >= len(grid[0]):
154
181
  continue
155
182
 
156
- row_headers, col_headers = find_headers_for_cell(grid, target_row, target_col)
183
+ header_col = col_idx if note else target_col
184
+ row_headers, col_headers = find_headers_for_cell(grid, target_row, header_col)
157
185
 
158
186
  rules.append(
159
187
  LogicRule(
@@ -166,6 +194,56 @@ def _build_rules(grid) -> List[LogicRule]:
166
194
  )
167
195
  )
168
196
 
197
+ is_header_echo = outcome_norm in {h.strip().lower() for h in col_headers}
198
+ if not is_header_echo:
199
+ rows_with_value.add(target_row)
200
+
201
+ # Label-only preservation: a body row whose row-header label is present but
202
+ # which carries no independent value would otherwise vanish entirely — the
203
+ # data loop above emits nothing usable for it. This is how de-spanned
204
+ # section headers arrive when an OCR/HTML pipeline drops the original
205
+ # ``colspan``: the value column is either empty (a benefits-schedule title
206
+ # row "2. Public transport double indemnity") or repeats the column header
207
+ # (a "24. COVID-19 Coverage Extension | Sum Insured" row, whose echoed value
208
+ # clean_rules strips, taking the label with it). It is structurally
209
+ # indistinguishable from a leaf row with a genuinely missing value, so we
210
+ # preserve the label verbatim rather than fabricate a section breadcrumb.
211
+ for row_idx in range(len(grid)):
212
+ if row_idx in rows_with_value:
213
+ continue
214
+ # Anchor the rule at the row's data column so it satisfies the quality
215
+ # gate's "rules originate from <td>" invariant. A row with no <td> at
216
+ # all is a true full-width <th colspan> divider — already handled as a
217
+ # row-group ancestor upstream — so we leave it alone.
218
+ anchor_col = next((c for c in range(n_cols) if grid[row_idx][c]["type"] == "td"), None)
219
+ if anchor_col is None:
220
+ continue
221
+ label_parts: List[str] = []
222
+ for col_idx in range(n_cols):
223
+ cell = grid[row_idx][col_idx]
224
+ if cell["type"] != "th":
225
+ continue
226
+ if cell.get("is_thead", False) or cell.get("is_header_row", False):
227
+ continue
228
+ if cell.get("is_span_copy", False):
229
+ continue
230
+ text = (cell.get("text") or "").strip()
231
+ if not text:
232
+ continue
233
+ label_parts.append(text)
234
+ if not label_parts:
235
+ continue
236
+ rules.append(
237
+ LogicRule(
238
+ outcome=" > ".join(label_parts),
239
+ position=(row_idx, anchor_col),
240
+ row_headers=(),
241
+ col_headers=(),
242
+ origin=(row_idx, anchor_col),
243
+ is_label=True,
244
+ )
245
+ )
246
+
169
247
  return rules
170
248
 
171
249
 
@@ -242,6 +320,8 @@ def _run(
242
320
  table_index = 0
243
321
 
244
322
  for table in all_tables:
323
+ if not isinstance(table, Tag):
324
+ continue
245
325
  # Skip nested tables — they're folded into their parent's cell text.
246
326
  if table.find_parent("table"):
247
327
  continue
@@ -12,6 +12,12 @@ class LogicRule:
12
12
  col_headers: Tuple[str, ...] = ()
13
13
  origin: Optional[Tuple[int, int]] = None
14
14
  is_footer: bool = False
15
+ # A label-preservation rule: the row carried a label but no independent
16
+ # value (empty value column, or a value that merely echoes the column
17
+ # header). The label is preserved verbatim as the outcome with no header
18
+ # relationship. The confidence gate treats these as pass-through, not a
19
+ # parser-confidence signal.
20
+ is_label: bool = False
15
21
 
16
22
  def to_string(self) -> str:
17
23
  """Descriptive format for Graph-RAG: '<rows> → <cols>: <value>'."""
@@ -74,6 +74,13 @@ def assess_confidence(grid: List[List[Dict]], rules: List[LogicRule]) -> GateRes
74
74
  if not candidates:
75
75
  return GateResult(ok=False, score=0.0, reasons=["no_candidate_data_cells"])
76
76
 
77
+ # Score only value rules. Label-preservation rules (a row's label kept
78
+ # visible when it carries no independent value) have no header relationship
79
+ # by design — they are pass-through, not a parser-confidence signal, so they
80
+ # neither help nor hurt the gate. check_invariants above still validates
81
+ # them (a valid <td> anchor, non-empty outcome).
82
+ rules = [r for r in rules if not r.is_label]
83
+
77
84
  rule_positions = {rule.position for rule in rules}
78
85
  coverage = len(rule_positions) / max(1, len(candidates))
79
86
 
@@ -301,6 +301,29 @@ def detect_header_block(rows):
301
301
  first_data_idx = r
302
302
  break
303
303
 
304
+ # Full-width section dividers cap the header. A row whose only non-empty
305
+ # content is a single DOM cell spanning the whole width (e.g. a benefits
306
+ # schedule "1. PERSONAL ACCIDENT" <td colspan="8"> row) reads, under the
307
+ # colspan-expanded non-empty count used above, as a full multi-cell header
308
+ # row — so without this the header sweep swallows the divider *and* the
309
+ # body rows between it and the first clean data row, bleeding them onto
310
+ # every line as fabricated column headers. When such dividers form a series
311
+ # (>= 2) they are body section dividers, not a one-off header subtitle like
312
+ # "(Dollars in thousands)"; the header ends at the first one. They stay in
313
+ # the body as plain cells (rendered as full-width notes downstream).
314
+ full_width_divider_idxs = []
315
+ for r in range(n):
316
+ origins = {grid[r][c]["origin"] for c in range(max_cols) if grid[r][c]["nonempty"]}
317
+ if len(origins) != 1:
318
+ continue
319
+ (orow, ocol) = next(iter(origins))
320
+ if grid[orow][ocol]["cs"] >= max_cols:
321
+ full_width_divider_idxs.append(r)
322
+ if len(full_width_divider_idxs) >= 2:
323
+ first_divider = full_width_divider_idxs[0]
324
+ if first_divider > 0 and (first_data_idx is None or first_divider < first_data_idx):
325
+ first_data_idx = first_divider
326
+
304
327
  if first_data_idx is None or first_data_idx == 0:
305
328
  return None
306
329
 
@@ -720,7 +743,16 @@ def simple_repair(html: str) -> str:
720
743
  # counter stays in sync with the grid, otherwise a cell
721
744
  # at logical col > 0 in a subsequent row would be
722
745
  # mistaken for the first-column cell.
723
- if first.name == "td":
746
+ #
747
+ # A row whose single cell spans multiple columns is a
748
+ # section divider / full-width note, not a row label —
749
+ # promoting it to <th scope="row"> strands it (it has no
750
+ # value column to anchor a rule, so it vanishes). Leave it
751
+ # a <td> so it is emitted once as a full-width note.
752
+ is_full_width_single = (
753
+ len(cells) == 1 and clamped_span(first.get("colspan")) > 1
754
+ )
755
+ if first.name == "td" and not is_full_width_single:
724
756
  first.name = "th"
725
757
  first["scope"] = "row"
726
758
  rowspan = clamped_span(first.get("rowspan"))
@@ -27,6 +27,21 @@ def clamped_span(raw) -> int:
27
27
  return value
28
28
 
29
29
 
30
+ def is_full_width_note(col_idx: int, colspan: int, n_cols: int) -> bool:
31
+ """True when a wide data cell is structurally a full-width note/description.
32
+
33
+ A ``<td>`` that reaches the last column AND spans a majority of the grid's
34
+ columns (e.g. a benefit name or a "If the departure…" sentence spanning the
35
+ whole value region of a plan×cover matrix) is a description, not a
36
+ per-column value. Such a cell must collapse to a single rule rather than fan
37
+ out across every spanned column — and the confidence gate must count it as a
38
+ single candidate position to match. Legitimate narrow spans (a right-edge
39
+ ``colspan=2`` amount covering two sub-columns of one group) fail the majority
40
+ test and keep their per-column fan-out.
41
+ """
42
+ return colspan > 1 and (col_idx + colspan == n_cols) and (colspan * 2 > n_cols)
43
+
44
+
30
45
  def assert_grid_size(rows: int, cols: int) -> None:
31
46
  """Raise if a logical grid shape would exceed the configured cell cap."""
32
47
  total_cells = rows * cols
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -188,6 +188,14 @@ def test_correctness_oracle(case: tuple[Path, Path]) -> None:
188
188
  matched = 0
189
189
  emitted_lines = [l for l in output.splitlines() if l.strip()]
190
190
  for line in emitted_lines:
191
+ # Label-preservation lines reproduce a whole source cell verbatim (a
192
+ # de-spanned section header whose value column is empty, e.g.
193
+ # "Segments: (1)"). Such a cell may itself contain ": ", which the
194
+ # rule-line parser would misread as a key/value split. A line equal to
195
+ # a full source cell is faithful preservation, not misattribution.
196
+ if source_tokens and _norm(line) in source_tokens:
197
+ matched += 1
198
+ continue
191
199
  parsed = _parse_rule_line(line, source_tokens)
192
200
  if parsed is None:
193
201
  continue
@@ -1,13 +1,20 @@
1
- """Regression layer — byte-for-byte gold matching on hand-authored fixtures.
2
-
3
- Each .md file beneath tests/{adversarial,structured,headerless,smoke,
4
- regression}/ is a fixture containing HTML table markup. For every fixture
5
- we run process_tables_to_text and assert the output matches the committed
6
- gold file under benchmarks/gold/<format>/.
1
+ """Regression layer — byte-for-byte gold matching on every fixture.
2
+
3
+ Each .md file beneath tests/ (except top-level docs) is a fixture containing
4
+ HTML table markup. For every fixture we run process_tables_to_text and assert
5
+ the output matches the committed gold file under benchmarks/gold/<format>/.
6
+
7
+ This covers both the hand-authored fixtures AND the real-world corpus
8
+ (tests/realworld/). The two suites play complementary roles: the correctness
9
+ and robustness layers (test_correctness_oracle / test_robustness_mutations)
10
+ assert the output is *right* (no fabricated content, correct attribution,
11
+ stable under mutation); this layer asserts the output does not *change* unless
12
+ a human regenerates the golds. Together they catch a silent-drop regression —
13
+ where the parser quietly stops emitting real content — which neither the
14
+ oracle (it only guards against fabrication) nor an un-asserted benchmark gold
15
+ could catch on its own. See tests/README.md.
7
16
 
8
17
  This is the strictest of the three test layers — catches any output drift.
9
- See tests/README.md for the relationship to the correctness and robustness
10
- suites.
11
18
 
12
19
  Refresh gold outputs by running: python scripts/benchmark.py --update-gold
13
20
  """
@@ -27,15 +34,14 @@ GOLD_DIR = ROOT / "benchmarks" / "gold" / DEFAULT_FORMAT
27
34
 
28
35
 
29
36
  def _discover_cases() -> list[Path]:
30
- # Real-world fixtures (tests/realworld/) are checked against per-fixture
31
- # oracle triples, not frozen gold text — see test_correctness_oracle.py.
32
- # Top-level docs like README.md are not fixtures.
33
- skip_prefixes = {"realworld"}
37
+ # Every fixture beneath tests/ is byte-checked, including the real-world
38
+ # corpus (tests/realworld/) — frozen gold text is the tripwire that makes
39
+ # any output change visible. Top-level docs like tests/README.md are not
40
+ # fixtures and are excluded.
34
41
  return [
35
42
  p
36
43
  for p in sorted(TESTS_DIR.rglob("*.md"))
37
- if not (skip_prefixes & set(p.relative_to(TESTS_DIR).parts))
38
- and p.parent != TESTS_DIR # exclude tests/README.md etc.
44
+ if p.parent != TESTS_DIR # exclude tests/README.md, tests/failing_table.md
39
45
  ]
40
46
 
41
47
 
@@ -95,13 +95,22 @@ def _parse_rule_line(line: str, source_tokens: frozenset[str] = frozenset()):
95
95
  return row_path, col_path, value
96
96
 
97
97
 
98
- def _classify(output: str) -> str:
98
+ def _classify(output: str, source_tokens: frozenset[str] = frozenset()) -> str:
99
99
  lines = [l for l in output.splitlines() if l.strip()]
100
100
  if not lines:
101
101
  return "EMPTY"
102
102
  if any("<table" in l for l in lines):
103
103
  return "PASSTHROUGH"
104
- rule_shaped = sum(1 for l in lines if _parse_rule_line(l) is not None)
104
+ # A label-preservation line reproduces a whole source cell verbatim (a
105
+ # de-spanned/echoed section header kept visible). It is not key/value
106
+ # shaped, but it is legitimate rules-mode output — count it as such so a
107
+ # table of rules plus section labels stays RULES rather than degrading to
108
+ # MIXED (which would skip the precision check below).
109
+ rule_shaped = sum(
110
+ 1
111
+ for l in lines
112
+ if _parse_rule_line(l) is not None or (source_tokens and _norm(l) in source_tokens)
113
+ )
105
114
  if rule_shaped == len(lines):
106
115
  return "RULES"
107
116
  if rule_shaped == 0:
@@ -345,7 +354,7 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
345
354
  mutated_html = mutator(html, rng)
346
355
 
347
356
  output = process_tables_to_text(mutated_html)
348
- tier = _classify(output)
357
+ tier = _classify(output, source_tokens)
349
358
  if tier in {"PASSTHROUGH", "FLAT", "EMPTY", "MIXED"}:
350
359
  # Safe fallback; not a precision failure.
351
360
  pytest.skip(f"tier={tier} after mutation={mutation_name!r}")
@@ -363,6 +372,14 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
363
372
  for line in output.splitlines():
364
373
  if not line.strip():
365
374
  continue
375
+ # Label-preservation lines reproduce a whole source cell verbatim
376
+ # (a de-spanned section header whose value column is empty, e.g.
377
+ # "Segments: (1)"). The cell text itself may contain ": ", which the
378
+ # rule-line parser would misread as a key/value split. A line equal to
379
+ # a full source cell is faithful preservation, not fabrication — the
380
+ # contract is "no invented content", and there is none here.
381
+ if source_tokens and _norm(line) in source_tokens:
382
+ continue
366
383
  parsed = _parse_rule_line(line, source_tokens)
367
384
  if parsed is None:
368
385
  continue
File without changes
File without changes
File without changes