table2rules 0.4.1__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {table2rules-0.4.1/src/table2rules.egg-info → table2rules-0.5.1}/PKG-INFO +1 -1
- {table2rules-0.4.1 → table2rules-0.5.1}/pyproject.toml +1 -1
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/_core.py +61 -1
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/models.py +6 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/quality_gate.py +7 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/simple_repair.py +133 -1
- {table2rules-0.4.1 → table2rules-0.5.1/src/table2rules.egg-info}/PKG-INFO +1 -1
- {table2rules-0.4.1 → table2rules-0.5.1}/tests/test_correctness_oracle.py +8 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/tests/test_regression_golds.py +20 -14
- {table2rules-0.4.1 → table2rules-0.5.1}/tests/test_robustness_mutations.py +20 -3
- {table2rules-0.4.1 → table2rules-0.5.1}/LICENSE +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/README.md +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/setup.cfg +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/__init__.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/__main__.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/cleanup.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/errors.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/exporters/__init__.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/exporters/base.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/exporters/rules.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/grid_parser.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/maze_pathfinder.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/py.typed +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/report.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules/spans.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules.egg-info/SOURCES.txt +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules.egg-info/dependency_links.txt +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules.egg-info/entry_points.txt +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules.egg-info/requires.txt +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/src/table2rules.egg-info/top_level.txt +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/tests/test_determinism.py +0 -0
- {table2rules-0.4.1 → table2rules-0.5.1}/tests/test_public_api.py +0 -0
|
@@ -122,9 +122,18 @@ def _extract_cell_rows(table_html: str) -> List[List[str]]:
|
|
|
122
122
|
def _build_rules(grid) -> List[LogicRule]:
|
|
123
123
|
"""Walk the parsed grid and emit one LogicRule per data cell position."""
|
|
124
124
|
rules: List[LogicRule] = []
|
|
125
|
+
n_cols = len(grid[0])
|
|
126
|
+
|
|
127
|
+
# Rows that carry a *real* value — used below to decide which rows need
|
|
128
|
+
# label-only preservation. A value that merely echoes its own column header
|
|
129
|
+
# (a de-spanned or page-break-repeated header cell) carries no independent
|
|
130
|
+
# data and is dropped downstream by clean_rules; it must not mask an
|
|
131
|
+
# otherwise label-only row. Tracked at the value's *target* positions so a
|
|
132
|
+
# rowspan-filled value correctly marks every row it covers.
|
|
133
|
+
rows_with_value: set = set()
|
|
125
134
|
|
|
126
135
|
for row_idx in range(len(grid)):
|
|
127
|
-
for col_idx in range(
|
|
136
|
+
for col_idx in range(n_cols):
|
|
128
137
|
cell = grid[row_idx][col_idx]
|
|
129
138
|
|
|
130
139
|
# Only <td> cells are data cells
|
|
@@ -144,6 +153,7 @@ def _build_rules(grid) -> List[LogicRule]:
|
|
|
144
153
|
|
|
145
154
|
rowspan = cell.get("rowspan", 1)
|
|
146
155
|
colspan = cell.get("colspan", 1)
|
|
156
|
+
outcome_norm = cell["text"].strip().lower()
|
|
147
157
|
|
|
148
158
|
for r_offset in range(rowspan):
|
|
149
159
|
for c_offset in range(colspan):
|
|
@@ -166,6 +176,56 @@ def _build_rules(grid) -> List[LogicRule]:
|
|
|
166
176
|
)
|
|
167
177
|
)
|
|
168
178
|
|
|
179
|
+
is_header_echo = outcome_norm in {h.strip().lower() for h in col_headers}
|
|
180
|
+
if not is_header_echo:
|
|
181
|
+
rows_with_value.add(target_row)
|
|
182
|
+
|
|
183
|
+
# Label-only preservation: a body row whose row-header label is present but
|
|
184
|
+
# which carries no independent value would otherwise vanish entirely — the
|
|
185
|
+
# data loop above emits nothing usable for it. This is how de-spanned
|
|
186
|
+
# section headers arrive when an OCR/HTML pipeline drops the original
|
|
187
|
+
# ``colspan``: the value column is either empty (a benefits-schedule title
|
|
188
|
+
# row "2. Public transport double indemnity") or repeats the column header
|
|
189
|
+
# (a "24. COVID-19 Coverage Extension | Sum Insured" row, whose echoed value
|
|
190
|
+
# clean_rules strips, taking the label with it). It is structurally
|
|
191
|
+
# indistinguishable from a leaf row with a genuinely missing value, so we
|
|
192
|
+
# preserve the label verbatim rather than fabricate a section breadcrumb.
|
|
193
|
+
for row_idx in range(len(grid)):
|
|
194
|
+
if row_idx in rows_with_value:
|
|
195
|
+
continue
|
|
196
|
+
# Anchor the rule at the row's data column so it satisfies the quality
|
|
197
|
+
# gate's "rules originate from <td>" invariant. A row with no <td> at
|
|
198
|
+
# all is a true full-width <th colspan> divider — already handled as a
|
|
199
|
+
# row-group ancestor upstream — so we leave it alone.
|
|
200
|
+
anchor_col = next((c for c in range(n_cols) if grid[row_idx][c]["type"] == "td"), None)
|
|
201
|
+
if anchor_col is None:
|
|
202
|
+
continue
|
|
203
|
+
label_parts: List[str] = []
|
|
204
|
+
for col_idx in range(n_cols):
|
|
205
|
+
cell = grid[row_idx][col_idx]
|
|
206
|
+
if cell["type"] != "th":
|
|
207
|
+
continue
|
|
208
|
+
if cell.get("is_thead", False) or cell.get("is_header_row", False):
|
|
209
|
+
continue
|
|
210
|
+
if cell.get("is_span_copy", False):
|
|
211
|
+
continue
|
|
212
|
+
text = (cell.get("text") or "").strip()
|
|
213
|
+
if not text:
|
|
214
|
+
continue
|
|
215
|
+
label_parts.append(text)
|
|
216
|
+
if not label_parts:
|
|
217
|
+
continue
|
|
218
|
+
rules.append(
|
|
219
|
+
LogicRule(
|
|
220
|
+
outcome=" > ".join(label_parts),
|
|
221
|
+
position=(row_idx, anchor_col),
|
|
222
|
+
row_headers=(),
|
|
223
|
+
col_headers=(),
|
|
224
|
+
origin=(row_idx, anchor_col),
|
|
225
|
+
is_label=True,
|
|
226
|
+
)
|
|
227
|
+
)
|
|
228
|
+
|
|
169
229
|
return rules
|
|
170
230
|
|
|
171
231
|
|
|
@@ -12,6 +12,12 @@ class LogicRule:
|
|
|
12
12
|
col_headers: Tuple[str, ...] = ()
|
|
13
13
|
origin: Optional[Tuple[int, int]] = None
|
|
14
14
|
is_footer: bool = False
|
|
15
|
+
# A label-preservation rule: the row carried a label but no independent
|
|
16
|
+
# value (empty value column, or a value that merely echoes the column
|
|
17
|
+
# header). The label is preserved verbatim as the outcome with no header
|
|
18
|
+
# relationship. The confidence gate treats these as pass-through, not a
|
|
19
|
+
# parser-confidence signal.
|
|
20
|
+
is_label: bool = False
|
|
15
21
|
|
|
16
22
|
def to_string(self) -> str:
|
|
17
23
|
"""Descriptive format for Graph-RAG: '<rows> → <cols>: <value>'."""
|
|
@@ -74,6 +74,13 @@ def assess_confidence(grid: List[List[Dict]], rules: List[LogicRule]) -> GateRes
|
|
|
74
74
|
if not candidates:
|
|
75
75
|
return GateResult(ok=False, score=0.0, reasons=["no_candidate_data_cells"])
|
|
76
76
|
|
|
77
|
+
# Score only value rules. Label-preservation rules (a row's label kept
|
|
78
|
+
# visible when it carries no independent value) have no header relationship
|
|
79
|
+
# by design — they are pass-through, not a parser-confidence signal, so they
|
|
80
|
+
# neither help nor hurt the gate. check_invariants above still validates
|
|
81
|
+
# them (a valid <td> anchor, non-empty outcome).
|
|
82
|
+
rules = [r for r in rules if not r.is_label]
|
|
83
|
+
|
|
77
84
|
rule_positions = {rule.position for rule in rules}
|
|
78
85
|
coverage = len(rule_positions) / max(1, len(candidates))
|
|
79
86
|
|
|
@@ -122,6 +122,46 @@ def detect_header_block(rows):
|
|
|
122
122
|
[0..k-1] range and are left as-is so the downstream thead-wrap naturally
|
|
123
123
|
excludes them via Fix 7's contiguous-<th> chain.
|
|
124
124
|
|
|
125
|
+
Two further structural witnesses extend "clean data row" to disqualify
|
|
126
|
+
rows that look header-shaped relative to the body:
|
|
127
|
+
|
|
128
|
+
* **Fuller-than-body**: row r's non-empty cell count is strictly greater
|
|
129
|
+
than the minimum non-empty count of the non-divider rows below it,
|
|
130
|
+
AND every column where row r is non-empty has at least one body
|
|
131
|
+
row that fills the same column. A row that fills more columns
|
|
132
|
+
than at least one body row is naming columns the body sometimes
|
|
133
|
+
leaves empty — the structural signature of a column-header row
|
|
134
|
+
above an implicit-rowspan group-label column, a multi-stub
|
|
135
|
+
indentation pyramid, or an alternating coefficient/std-error
|
|
136
|
+
layout. Comparing to the minimum (rather than median or mean)
|
|
137
|
+
is intentional: the structural distinction is "exists a body
|
|
138
|
+
row with fewer non-empty cells than row r," and more
|
|
139
|
+
central statistics misfire on tables where a slim majority of body
|
|
140
|
+
rows match row r's fullness. The body-coverage clause excludes
|
|
141
|
+
receipts whose row 0 is a 4-cell line item followed by 2-cell
|
|
142
|
+
totals — there cols 2 and 3 are filled only in row 0, the body
|
|
143
|
+
never uses them, and promoting row 0 to header would erase the
|
|
144
|
+
line-item data. Uniform-dense tables stay out because their min
|
|
145
|
+
equals row r's count.
|
|
146
|
+
|
|
147
|
+
* **Cell-type inversion**: row r contains at least one corner-stub
|
|
148
|
+
``<td>`` — a column where row r is ``<td>`` while the body majority
|
|
149
|
+
is ``<th>`` — and a strict majority of compared columns invert.
|
|
150
|
+
The corner-stub clause is load-bearing: an all-``<th>`` row above
|
|
151
|
+
an all-``<td>`` body inverts every column too, but that's the
|
|
152
|
+
*normal* header pattern (and Fix 7 already wraps it in ``<thead>``
|
|
153
|
+
via the contiguous-``<th>`` chain). Inversion is only the right
|
|
154
|
+
witness when row r has a stray ``<td>`` corner cell that the body
|
|
155
|
+
makes a ``<th scope="row">`` (e.g., header row ``[td, th, th]``
|
|
156
|
+
above body rows ``[th, td, td]``). Universal: cell tags are
|
|
157
|
+
markup, not content.
|
|
158
|
+
|
|
159
|
+
Both extended witnesses apply only at r == 0 — the literal first
|
|
160
|
+
row. Beyond row 0, "fuller than the body below" describes regular
|
|
161
|
+
data rows (regression group labels, alternating coefficient/std-err
|
|
162
|
+
pairs) and "first row with col 0 non-empty" is already the existing
|
|
163
|
+
stub-column header pattern handled by the rest of the function.
|
|
164
|
+
|
|
125
165
|
Returns (k, stub_cols, origin_cells, grid) on success, or None.
|
|
126
166
|
"""
|
|
127
167
|
n = len(rows)
|
|
@@ -132,6 +172,85 @@ def detect_header_block(rows):
|
|
|
132
172
|
if max_cols == 0:
|
|
133
173
|
return None
|
|
134
174
|
|
|
175
|
+
# Pre-compute per-row non-empty counts (logical, colspan-expanded).
|
|
176
|
+
nonempty_counts = [sum(1 for c in row if c["nonempty"]) for row in grid]
|
|
177
|
+
|
|
178
|
+
def _body_min_nonempty(after_r: int) -> int:
    """Smallest non-empty cell count among the rows below ``after_r``.

    Only rows strictly after ``after_r`` that have at least two non-empty
    cells take part; rows with fewer are treated as dividers and ignored.
    Returns -1 when no row qualifies, which the caller reads as "there is
    no body to compare against" and skips the witness.
    """
    # Index row by row (rather than slicing) so the lookup pattern matches
    # the per-row counts computed once for the whole grid.
    below_counts = (nonempty_counts[below] for below in range(after_r + 1, n))
    return min((cnt for cnt in below_counts if cnt >= 2), default=-1)
|
|
188
|
+
|
|
189
|
+
def _row_cols_covered_by_body(r: int) -> bool:
    """Report whether the body reuses every column that row ``r`` fills.

    For each column where row ``r`` is non-empty, some row below it with
    at least two non-empty cells (i.e. not a divider) must also be
    non-empty in that column. This keeps out receipts whose first row
    fills columns the body never uses (a line item above totals).
    """

    def _filled_below(col: int) -> bool:
        # True when any non-divider row under ``r`` has content in ``col``.
        return any(
            grid[below][col]["nonempty"]
            for below in range(r + 1, n)
            if nonempty_counts[below] >= 2
        )

    return all(
        _filled_below(col)
        for col in range(max_cols)
        if grid[r][col]["nonempty"]
    )
|
|
207
|
+
|
|
208
|
+
def _is_inverted_relative_to_body(r: int) -> bool:
    """Detect a row whose cell tags are flipped relative to the body below.

    A column is compared only when row ``r`` holds a non-empty origin cell
    tagged ``td`` or ``th`` there, and at least one non-empty origin cell
    tagged ``td`` or ``th`` exists in the same column below. The body's
    majority tag for the column is ``td`` on a tie.

    Returns True only when both conditions hold:

    * at least one compared column is a corner stub (row ``r`` is ``td``
      while the body majority is ``th``), and
    * strictly more than half of the compared columns disagree with the
      body majority.

    The corner-stub requirement keeps an all-``th`` row above an all-``td``
    body (the normal header pattern) from counting as an inversion.
    """
    compared = 0
    flipped = 0
    corner_stub_seen = False

    for col in range(max_cols):
        head = grid[r][col]
        # Span copies and empty cells carry no tag evidence of their own.
        if not (head["origin"] == (r, col) and head["nonempty"]):
            continue
        head_el = origin_cells.get((r, col))
        if head_el is None or head_el.name not in ("td", "th"):
            continue
        head_tag = head_el.name

        # Tally the tags of the non-empty origin cells under this column.
        tally = {"td": 0, "th": 0}
        for below in range(r + 1, n):
            body = grid[below][col]
            if body["origin"] != (below, col) or not body["nonempty"]:
                continue
            body_el = origin_cells.get((below, col))
            if body_el is not None and body_el.name in tally:
                tally[body_el.name] += 1

        if tally["td"] + tally["th"] == 0:
            # Nothing below to compare against; the column does not vote.
            continue

        majority = "td" if tally["td"] >= tally["th"] else "th"
        compared += 1
        if majority == head_tag:
            continue
        flipped += 1
        # A mismatch with a ``td`` head means the body majority is ``th``.
        if head_tag == "td":
            corner_stub_seen = True

    return corner_stub_seen and compared > 0 and flipped * 2 > compared
|
|
253
|
+
|
|
135
254
|
# Find the first data row. A data row is one whose shape has no
|
|
136
255
|
# structural feature that would mark it as a header:
|
|
137
256
|
# - col 0 is non-empty (typical row-label)
|
|
@@ -140,6 +259,13 @@ def detect_header_block(rows):
|
|
|
140
259
|
# (no rowspan copy from above pulling header content into body)
|
|
141
260
|
# - every origin cell at row r has rowspan == colspan == 1
|
|
142
261
|
# (no colspan group or rowspan marker signaling header role)
|
|
262
|
+
# Plus, only at r == 0 (the literal first row):
|
|
263
|
+
# - row 0 is not strictly fuller than at least one body row below
|
|
264
|
+
# while every column it fills is covered by some body row
|
|
265
|
+
# (the fuller-than-body witness — see docstring)
|
|
266
|
+
# - row 0's cell tags are not inverted relative to body majority,
|
|
267
|
+
# with at least one corner-stub ``<td>`` versus body ``<th>``
|
|
268
|
+
# (the cell-type-inversion witness — see docstring)
|
|
143
269
|
#
|
|
144
270
|
# The "every cell non-empty" requirement was dropped intentionally:
|
|
145
271
|
# real body rows in financial tables are often gappy (a cell has a
|
|
@@ -153,7 +279,7 @@ def detect_header_block(rows):
|
|
|
153
279
|
row = grid[r]
|
|
154
280
|
if not row[0]["nonempty"]:
|
|
155
281
|
continue
|
|
156
|
-
nonempty_count =
|
|
282
|
+
nonempty_count = nonempty_counts[r]
|
|
157
283
|
if nonempty_count < 2:
|
|
158
284
|
continue
|
|
159
285
|
is_clean = True
|
|
@@ -165,6 +291,12 @@ def detect_header_block(rows):
|
|
|
165
291
|
if cell["rs"] != 1 or cell["cs"] != 1:
|
|
166
292
|
is_clean = False
|
|
167
293
|
break
|
|
294
|
+
if is_clean and r == 0:
|
|
295
|
+
body_min = _body_min_nonempty(r)
|
|
296
|
+
if body_min >= 0 and nonempty_count > body_min and _row_cols_covered_by_body(r):
|
|
297
|
+
is_clean = False
|
|
298
|
+
if is_clean and _is_inverted_relative_to_body(r):
|
|
299
|
+
is_clean = False
|
|
168
300
|
if is_clean:
|
|
169
301
|
first_data_idx = r
|
|
170
302
|
break
|
|
@@ -188,6 +188,14 @@ def test_correctness_oracle(case: tuple[Path, Path]) -> None:
|
|
|
188
188
|
matched = 0
|
|
189
189
|
emitted_lines = [l for l in output.splitlines() if l.strip()]
|
|
190
190
|
for line in emitted_lines:
|
|
191
|
+
# Label-preservation lines reproduce a whole source cell verbatim (a
|
|
192
|
+
# de-spanned section header whose value column is empty, e.g.
|
|
193
|
+
# "Segments: (1)"). Such a cell may itself contain ": ", which the
|
|
194
|
+
# rule-line parser would misread as a key/value split. A line equal to
|
|
195
|
+
# a full source cell is faithful preservation, not misattribution.
|
|
196
|
+
if source_tokens and _norm(line) in source_tokens:
|
|
197
|
+
matched += 1
|
|
198
|
+
continue
|
|
191
199
|
parsed = _parse_rule_line(line, source_tokens)
|
|
192
200
|
if parsed is None:
|
|
193
201
|
continue
|
|
@@ -1,13 +1,20 @@
|
|
|
1
|
-
"""Regression layer — byte-for-byte gold matching on
|
|
2
|
-
|
|
3
|
-
Each .md file beneath tests/
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
1
|
+
"""Regression layer — byte-for-byte gold matching on every fixture.
|
|
2
|
+
|
|
3
|
+
Each .md file beneath tests/ (except top-level docs) is a fixture containing
|
|
4
|
+
HTML table markup. For every fixture we run process_tables_to_text and assert
|
|
5
|
+
the output matches the committed gold file under benchmarks/gold/<format>/.
|
|
6
|
+
|
|
7
|
+
This covers both the hand-authored fixtures AND the real-world corpus
|
|
8
|
+
(tests/realworld/). The two suites play complementary roles: the correctness
|
|
9
|
+
and robustness layers (test_correctness_oracle / test_robustness_mutations)
|
|
10
|
+
assert the output is *right* (no fabricated content, correct attribution,
|
|
11
|
+
stable under mutation); this layer asserts the output does not *change* unless
|
|
12
|
+
a human regenerates the golds. Together they catch a silent-drop regression —
|
|
13
|
+
where the parser quietly stops emitting real content — which neither the
|
|
14
|
+
oracle (it only guards against fabrication) nor an un-asserted benchmark gold
|
|
15
|
+
could catch on its own. See tests/README.md.
|
|
7
16
|
|
|
8
17
|
This is the strictest of the three test layers — catches any output drift.
|
|
9
|
-
See tests/README.md for the relationship to the correctness and robustness
|
|
10
|
-
suites.
|
|
11
18
|
|
|
12
19
|
Refresh gold outputs by running: python scripts/benchmark.py --update-gold
|
|
13
20
|
"""
|
|
@@ -27,15 +34,14 @@ GOLD_DIR = ROOT / "benchmarks" / "gold" / DEFAULT_FORMAT
|
|
|
27
34
|
|
|
28
35
|
|
|
29
36
|
def _discover_cases() -> list[Path]:
    """Return every fixture path under the tests directory, sorted.

    All ``*.md`` files found recursively beneath ``TESTS_DIR`` are
    fixtures, including the real-world corpus, so that the frozen gold
    text makes any output change visible. Files sitting directly in
    ``TESTS_DIR`` (top-level docs such as tests/README.md or
    tests/failing_table.md) are not fixtures and are left out.
    """
    cases = []
    for candidate in sorted(TESTS_DIR.rglob("*.md")):
        if candidate.parent == TESTS_DIR:
            # Top-level markdown is documentation, not a fixture.
            continue
        cases.append(candidate)
    return cases
|
|
40
46
|
|
|
41
47
|
|
|
@@ -95,13 +95,22 @@ def _parse_rule_line(line: str, source_tokens: frozenset[str] = frozenset()):
|
|
|
95
95
|
return row_path, col_path, value
|
|
96
96
|
|
|
97
97
|
|
|
98
|
-
def _classify(output: str) -> str:
|
|
98
|
+
def _classify(output: str, source_tokens: frozenset[str] = frozenset()) -> str:
|
|
99
99
|
lines = [l for l in output.splitlines() if l.strip()]
|
|
100
100
|
if not lines:
|
|
101
101
|
return "EMPTY"
|
|
102
102
|
if any("<table" in l for l in lines):
|
|
103
103
|
return "PASSTHROUGH"
|
|
104
|
-
|
|
104
|
+
# A label-preservation line reproduces a whole source cell verbatim (a
|
|
105
|
+
# de-spanned/echoed section header kept visible). It is not key/value
|
|
106
|
+
# shaped, but it is legitimate rules-mode output — count it as such so a
|
|
107
|
+
# table of rules plus section labels stays RULES rather than degrading to
|
|
108
|
+
# MIXED (which would skip the precision check below).
|
|
109
|
+
rule_shaped = sum(
|
|
110
|
+
1
|
|
111
|
+
for l in lines
|
|
112
|
+
if _parse_rule_line(l) is not None or (source_tokens and _norm(l) in source_tokens)
|
|
113
|
+
)
|
|
105
114
|
if rule_shaped == len(lines):
|
|
106
115
|
return "RULES"
|
|
107
116
|
if rule_shaped == 0:
|
|
@@ -345,7 +354,7 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
|
|
|
345
354
|
mutated_html = mutator(html, rng)
|
|
346
355
|
|
|
347
356
|
output = process_tables_to_text(mutated_html)
|
|
348
|
-
tier = _classify(output)
|
|
357
|
+
tier = _classify(output, source_tokens)
|
|
349
358
|
if tier in {"PASSTHROUGH", "FLAT", "EMPTY", "MIXED"}:
|
|
350
359
|
# Safe fallback; not a precision failure.
|
|
351
360
|
pytest.skip(f"tier={tier} after mutation={mutation_name!r}")
|
|
@@ -363,6 +372,14 @@ def test_robustness_under_mutation(case: tuple[Path, Path], mutation_name: str)
|
|
|
363
372
|
for line in output.splitlines():
|
|
364
373
|
if not line.strip():
|
|
365
374
|
continue
|
|
375
|
+
# Label-preservation lines reproduce a whole source cell verbatim
|
|
376
|
+
# (a de-spanned section header whose value column is empty, e.g.
|
|
377
|
+
# "Segments: (1)"). The cell text itself may contain ": ", which the
|
|
378
|
+
# rule-line parser would misread as a key/value split. A line equal to
|
|
379
|
+
# a full source cell is faithful preservation, not fabrication — the
|
|
380
|
+
# contract is "no invented content", and there is none here.
|
|
381
|
+
if source_tokens and _norm(line) in source_tokens:
|
|
382
|
+
continue
|
|
366
383
|
parsed = _parse_rule_line(line, source_tokens)
|
|
367
384
|
if parsed is None:
|
|
368
385
|
continue
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|