table2rules 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {table2rules-0.5.0/src/table2rules.egg-info → table2rules-0.5.1}/PKG-INFO +1 -1
  2. {table2rules-0.5.0 → table2rules-0.5.1}/pyproject.toml +1 -1
  3. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/_core.py +61 -1
  4. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/models.py +6 -0
  5. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/quality_gate.py +7 -0
  6. {table2rules-0.5.0 → table2rules-0.5.1/src/table2rules.egg-info}/PKG-INFO +1 -1
  7. {table2rules-0.5.0 → table2rules-0.5.1}/tests/test_correctness_oracle.py +8 -0
  8. {table2rules-0.5.0 → table2rules-0.5.1}/tests/test_regression_golds.py +20 -14
  9. {table2rules-0.5.0 → table2rules-0.5.1}/tests/test_robustness_mutations.py +20 -3
  10. {table2rules-0.5.0 → table2rules-0.5.1}/LICENSE +0 -0
  11. {table2rules-0.5.0 → table2rules-0.5.1}/README.md +0 -0
  12. {table2rules-0.5.0 → table2rules-0.5.1}/setup.cfg +0 -0
  13. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/__init__.py +0 -0
  14. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/__main__.py +0 -0
  15. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/cleanup.py +0 -0
  16. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/errors.py +0 -0
  17. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/exporters/__init__.py +0 -0
  18. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/exporters/base.py +0 -0
  19. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/exporters/rules.py +0 -0
  20. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/grid_parser.py +0 -0
  21. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/maze_pathfinder.py +0 -0
  22. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/py.typed +0 -0
  23. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/report.py +0 -0
  24. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/simple_repair.py +0 -0
  25. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules/spans.py +0 -0
  26. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules.egg-info/SOURCES.txt +0 -0
  27. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules.egg-info/dependency_links.txt +0 -0
  28. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules.egg-info/entry_points.txt +0 -0
  29. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules.egg-info/requires.txt +0 -0
  30. {table2rules-0.5.0 → table2rules-0.5.1}/src/table2rules.egg-info/top_level.txt +0 -0
  31. {table2rules-0.5.0 → table2rules-0.5.1}/tests/test_determinism.py +0 -0
  32. {table2rules-0.5.0 → table2rules-0.5.1}/tests/test_public_api.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.5.0"
7
+ version = "0.5.1"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -122,9 +122,18 @@ def _extract_cell_rows(table_html: str) -> List[List[str]]:
122
122
  def _build_rules(grid) -> List[LogicRule]:
123
123
  """Walk the parsed grid and emit one LogicRule per data cell position."""
124
124
  rules: List[LogicRule] = []
125
+ n_cols = len(grid[0])
126
+
127
+ # Rows that carry a *real* value — used below to decide which rows need
128
+ # label-only preservation. A value that merely echoes its own column header
129
+ # (a de-spanned or page-break-repeated header cell) carries no independent
130
+ # data and is dropped downstream by clean_rules; it must not mask an
131
+ # otherwise label-only row. Tracked at the value's *target* positions so a
132
+ # rowspan-filled value correctly marks every row it covers.
133
+ rows_with_value: set = set()
125
134
 
126
135
  for row_idx in range(len(grid)):
127
- for col_idx in range(len(grid[0])):
136
+ for col_idx in range(n_cols):
128
137
  cell = grid[row_idx][col_idx]
129
138
 
130
139
  # Only <td> cells are data cells
@@ -144,6 +153,7 @@ def _build_rules(grid) -> List[LogicRule]:
144
153
 
145
154
  rowspan = cell.get("rowspan", 1)
146
155
  colspan = cell.get("colspan", 1)
156
+ outcome_norm = cell["text"].strip().lower()
147
157
 
148
158
  for r_offset in range(rowspan):
149
159
  for c_offset in range(colspan):
@@ -166,6 +176,56 @@ def _build_rules(grid) -> List[LogicRule]:
166
176
  )
167
177
  )
168
178
 
179
+ is_header_echo = outcome_norm in {h.strip().lower() for h in col_headers}
180
+ if not is_header_echo:
181
+ rows_with_value.add(target_row)
182
+
183
+ # Label-only preservation: a body row whose row-header label is present but
184
+ # which carries no independent value would otherwise vanish entirely — the
185
+ # data loop above emits nothing usable for it. This is how de-spanned
186
+ # section headers arrive when an OCR/HTML pipeline drops the original
187
+ # ``colspan``: the value column is either empty (a benefits-schedule title
188
+ # row "2. Public transport double indemnity") or repeats the column header
189
+ # (a "24. COVID-19 Coverage Extension | Sum Insured" row, whose echoed value
190
+ # clean_rules strips, taking the label with it). It is structurally
191
+ # indistinguishable from a leaf row with a genuinely missing value, so we
192
+ # preserve the label verbatim rather than fabricate a section breadcrumb.
193
+ for row_idx in range(len(grid)):
194
+ if row_idx in rows_with_value:
195
+ continue
196
+ # Anchor the rule at the row's data column so it satisfies the quality
197
+ # gate's "rules originate from <td>" invariant. A row with no <td> at
198
+ # all is a true full-width <th colspan> divider — already handled as a
199
+ # row-group ancestor upstream — so we leave it alone.
200
+ anchor_col = next((c for c in range(n_cols) if grid[row_idx][c]["type"] == "td"), None)
201
+ if anchor_col is None:
202
+ continue
203
+ label_parts: List[str] = []
204
+ for col_idx in range(n_cols):
205
+ cell = grid[row_idx][col_idx]
206
+ if cell["type"] != "th":
207
+ continue
208
+ if cell.get("is_thead", False) or cell.get("is_header_row", False):
209
+ continue
210
+ if cell.get("is_span_copy", False):
211
+ continue
212
+ text = (cell.get("text") or "").strip()
213
+ if not text:
214
+ continue
215
+ label_parts.append(text)
216
+ if not label_parts:
217
+ continue
218
+ rules.append(
219
+ LogicRule(
220
+ outcome=" > ".join(label_parts),
221
+ position=(row_idx, anchor_col),
222
+ row_headers=(),
223
+ col_headers=(),
224
+ origin=(row_idx, anchor_col),
225
+ is_label=True,
226
+ )
227
+ )
228
+
169
229
  return rules
170
230
 
171
231
 
@@ -12,6 +12,12 @@ class LogicRule:
12
12
  col_headers: Tuple[str, ...] = ()
13
13
  origin: Optional[Tuple[int, int]] = None
14
14
  is_footer: bool = False
15
+ # A label-preservation rule: the row carried a label but no independent
16
+ # value (empty value column, or a value that merely echoes the column
17
+ # header). The label is preserved verbatim as the outcome with no header
18
+ # relationship. The confidence gate treats these as pass-through, not a
19
+ # parser-confidence signal.
20
+ is_label: bool = False
15
21
 
16
22
  def to_string(self) -> str:
17
23
  """Descriptive format for Graph-RAG: '<rows> → <cols>: <value>'."""
@@ -74,6 +74,13 @@ def assess_confidence(grid: List[List[Dict]], rules: List[LogicRule]) -> GateRes
74
74
  if not candidates:
75
75
  return GateResult(ok=False, score=0.0, reasons=["no_candidate_data_cells"])
76
76
 
77
+ # Score only value rules. Label-preservation rules (a row's label kept
78
+ # visible when it carries no independent value) have no header relationship
79
+ # by design — they are pass-through, not a parser-confidence signal, so they
80
+ # neither help nor hurt the gate. check_invariants above still validates
81
+ # them (a valid <td> anchor, non-empty outcome).
82
+ rules = [r for r in rules if not r.is_label]
83
+
77
84
  rule_positions = {rule.position for rule in rules}
78
85
  coverage = len(rule_positions) / max(1, len(candidates))
79
86
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -188,6 +188,14 @@ def test_correctness_oracle(case: tuple[Path, Path]) -> None:
188
188
  matched = 0
189
189
  emitted_lines = [l for l in output.splitlines() if l.strip()]
190
190
  for line in emitted_lines:
191
+ # Label-preservation lines reproduce a whole source cell verbatim (a
192
+ # de-spanned section header whose value column is empty, e.g.
193
+ # "Segments: (1)"). Such a cell may itself contain ": ", which the
194
+ # rule-line parser would misread as a key/value split. A line equal to
195
+ # a full source cell is faithful preservation, not misattribution.
196
+ if source_tokens and _norm(line) in source_tokens:
197
+ matched += 1
198
+ continue
191
199
  parsed = _parse_rule_line(line, source_tokens)
192
200
  if parsed is None:
193
201
  continue
@@ -1,13 +1,20 @@
1
- """Regression layer — byte-for-byte gold matching on hand-authored fixtures.
2
-
3
- Each .md file beneath tests/{adversarial,structured,headerless,smoke,
4
- regression}/ is a fixture containing HTML table markup. For every fixture
5
- we run process_tables_to_text and assert the output matches the committed
6
- gold file under benchmarks/gold/<format>/.
1
+ """Regression layer — byte-for-byte gold matching on every fixture.
2
+
3
+ Each .md file beneath tests/ (except top-level docs) is a fixture containing
4
+ HTML table markup. For every fixture we run process_tables_to_text and assert
5
+ the output matches the committed gold file under benchmarks/gold/<format>/.
6
+
7
+ This covers both the hand-authored fixtures AND the real-world corpus
8
+ (tests/realworld/). The two suites play complementary roles: the correctness
9
+ and robustness layers (test_correctness_oracle / test_robustness_mutations)
10
+ assert the output is *right* (no fabricated content, correct attribution,
11
+ stable under mutation); this layer asserts the output does not *change* unless
12
+ a human regenerates the golds. Together they catch a silent-drop regression —
13
+ where the parser quietly stops emitting real content — which neither the
14
+ oracle (it only guards against fabrication) nor an un-asserted benchmark gold
15
+ could catch on its own. See tests/README.md.
7
16
 
8
17
  This is the strictest of the three test layers — catches any output drift.
9
- See tests/README.md for the relationship to the correctness and robustness
10
- suites.
11
18
 
12
19
  Refresh gold outputs by running: python scripts/benchmark.py --update-gold
13
20
  """
@@ -27,15 +34,14 @@ GOLD_DIR = ROOT / "benchmarks" / "gold" / DEFAULT_FORMAT
27
34
 
28
35
 
29
36
  def _discover_cases() -> list[Path]:
30
- # Real-world fixtures (tests/realworld/) are checked against per-fixture
31
- # oracle triples, not frozen gold text — see test_correctness_oracle.py.
32
- # Top-level docs like README.md are not fixtures.
33
- skip_prefixes = {"realworld"}
37
+ # Every fixture beneath tests/ is byte-checked, including the real-world
38
+ # corpus (tests/realworld/) — frozen gold text is the tripwire that makes
39
+ # any output change visible. Top-level docs like tests/README.md are not
40
+ # fixtures and are excluded.
34
41
  return [
35
42
  p
36
43
  for p in sorted(TESTS_DIR.rglob("*.md"))
37
- if not (skip_prefixes & set(p.relative_to(TESTS_DIR).parts))
38
- and p.parent != TESTS_DIR # exclude tests/README.md etc.
44
+ if p.parent != TESTS_DIR # exclude tests/README.md, tests/failing_table.md
39
45
  ]
40
46
 
41
47
 
@@ -95,13 +95,22 @@ def _parse_rule_line(line: str, source_tokens: frozenset[str] = frozenset()):
95
95
  return row_path, col_path, value
96
96
 
97
97
 
98
- def _classify(output: str) -> str:
98
+ def _classify(output: str, source_tokens: frozenset[str] = frozenset()) -> str:
99
99
  lines = [l for l in output.splitlines() if l.strip()]
100
100
  if not lines:
101
101
  return "EMPTY"
102
102
  if any("<table" in l for l in lines):
103
103
  return "PASSTHROUGH"
104
- rule_shaped = sum(1 for l in lines if _parse_rule_line(l) is not None)
104
+ # A label-preservation line reproduces a whole source cell verbatim (a
105
+ # de-spanned/echoed section header kept visible). It is not key/value
106
+ # shaped, but it is legitimate rules-mode output — count it as such so a
107
+ # table of rules plus section labels stays RULES rather than degrading to
108
+ # MIXED (which would skip the precision check below).
109
+ rule_shaped = sum(
110
+ 1
111
+ for l in lines
112
+ if _parse_rule_line(l) is not None or (source_tokens and _norm(l) in source_tokens)
113
+ )
105
114
  if rule_shaped == len(lines):
106
115
  return "RULES"
107
116
  if rule_shaped == 0:
@@ -345,7 +354,7 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
345
354
  mutated_html = mutator(html, rng)
346
355
 
347
356
  output = process_tables_to_text(mutated_html)
348
- tier = _classify(output)
357
+ tier = _classify(output, source_tokens)
349
358
  if tier in {"PASSTHROUGH", "FLAT", "EMPTY", "MIXED"}:
350
359
  # Safe fallback; not a precision failure.
351
360
  pytest.skip(f"tier={tier} after mutation={mutation_name!r}")
@@ -363,6 +372,14 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
363
372
  for line in output.splitlines():
364
373
  if not line.strip():
365
374
  continue
375
+ # Label-preservation lines reproduce a whole source cell verbatim
376
+ # (a de-spanned section header whose value column is empty, e.g.
377
+ # "Segments: (1)"). The cell text itself may contain ": ", which the
378
+ # rule-line parser would misread as a key/value split. A line equal to
379
+ # a full source cell is faithful preservation, not fabrication — the
380
+ # contract is "no invented content", and there is none here.
381
+ if source_tokens and _norm(line) in source_tokens:
382
+ continue
366
383
  parsed = _parse_rule_line(line, source_tokens)
367
384
  if parsed is None:
368
385
  continue
File without changes
File without changes
File without changes