table2rules 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {table2rules-0.6.0/src/table2rules.egg-info → table2rules-0.6.2}/PKG-INFO +1 -1
- {table2rules-0.6.0 → table2rules-0.6.2}/pyproject.toml +1 -1
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/_core.py +148 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/maze_pathfinder.py +24 -2
- {table2rules-0.6.0 → table2rules-0.6.2/src/table2rules.egg-info}/PKG-INFO +1 -1
- {table2rules-0.6.0 → table2rules-0.6.2}/LICENSE +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/README.md +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/setup.cfg +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/__init__.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/__main__.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/cleanup.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/errors.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/exporters/__init__.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/exporters/base.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/exporters/rules.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/grid_parser.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/models.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/py.typed +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/quality_gate.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/report.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/simple_repair.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/spans.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/SOURCES.txt +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/dependency_links.txt +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/entry_points.txt +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/requires.txt +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/top_level.txt +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_correctness_oracle.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_determinism.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_public_api.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_regression_golds.py +0 -0
- {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_robustness_mutations.py +0 -0
|
@@ -211,6 +211,11 @@ def _build_rules(grid) -> List[LogicRule]:
|
|
|
211
211
|
for row_idx in range(len(grid)):
|
|
212
212
|
if row_idx in rows_with_value:
|
|
213
213
|
continue
|
|
214
|
+
# A label-only row promoted to a row-group ancestor (scope="rowgroup")
|
|
215
|
+
# is threaded into the value lines beneath it — emitting it here too
|
|
216
|
+
# would duplicate it as an orphan label.
|
|
217
|
+
if any(grid[row_idx][c].get("scope") == "rowgroup" for c in range(n_cols)):
|
|
218
|
+
continue
|
|
214
219
|
# Anchor the rule at the row's data column so it satisfies the quality
|
|
215
220
|
# gate's "rules originate from <td>" invariant. A row with no <td> at
|
|
216
221
|
# all is a true full-width <th colspan> divider — already handled as a
|
|
@@ -319,6 +324,148 @@ def _mark_rowgroup_bands(grid) -> None:
|
|
|
319
324
|
grid[r][cc]["scope"] = "rowgroup"
|
|
320
325
|
|
|
321
326
|
|
|
327
|
+
def _mark_label_only_rowgroups(grid) -> None:
|
|
328
|
+
"""Promote *label-only rows* to ``<th scope="rowgroup">`` so the maze threads
|
|
329
|
+
them into each value line's row path, the row-side counterpart of the
|
|
330
|
+
full-width band handled by :func:`_mark_rowgroup_bands`.
|
|
331
|
+
|
|
332
|
+
A label-only row is a body row whose value (``<td>``) columns are all empty
|
|
333
|
+
while a leading label column carries text — the ``Label | Value`` form
|
|
334
|
+
pervasive in financial/insurance schedules (``9. Trip Cancellation | (empty)``
|
|
335
|
+
above its value rows). Unlike a full-width band the label cell does *not*
|
|
336
|
+
span the value region; the other columns are simply empty, so
|
|
337
|
+
``is_full_width_note`` geometry never sees it. Without this pass the row is
|
|
338
|
+
emitted as an orphaned ``is_label`` rule and the values beneath it lose their
|
|
339
|
+
group identity.
|
|
340
|
+
|
|
341
|
+
Detection is geometric, not flag-based: a row with no value-bearing ``<td>``
|
|
342
|
+
but exactly one non-empty body ``<th>`` label source cell. (Row-label
|
|
343
|
+
columns are already promoted to ``<th scope="row">`` upstream — Signal A/B/C
|
|
344
|
+
in grid_parser and simple_repair — so "no non-empty ``<td>``" means "no
|
|
345
|
+
value".)
|
|
346
|
+
|
|
347
|
+
The single-label-cell requirement is what separates a group header from a
|
|
348
|
+
data row whose *designated* value columns merely happen to be empty. A genuine
|
|
349
|
+
group header carries one title ("9. Trip Cancellation", possibly spanning the
|
|
350
|
+
first N>1 columns via one ``colspan`` cell). A data row whose value columns
|
|
351
|
+
are blank ("Average: | 80.2 | 10.7 | 3.3", or a summary row under a header
|
|
352
|
+
that over-promoted numeric columns to row labels) spreads several distinct
|
|
353
|
+
values across its label cells — threading those as a group path would invent
|
|
354
|
+
a breadcrumb and misattribute it to the rows below. Such rows stay on the
|
|
355
|
+
``is_label`` preservation path, unchanged.
|
|
356
|
+
|
|
357
|
+
Stacking and extent (no content-aware level inference):
|
|
358
|
+
|
|
359
|
+
* A maximal run of *consecutive* label-only rows forms one header stack. Its
|
|
360
|
+
members are threaded as nested ancestors in row order — a title followed by
|
|
361
|
+
a description (``10. Travel Delay`` then ``If the departure…``) both land in
|
|
362
|
+
the path, title first.
|
|
363
|
+
* A stack's extent runs from just below the stack down to the row before the
|
|
364
|
+
next stack OR the next full-width band, whichever comes first — so a group
|
|
365
|
+
never leaks into the next line-item or across a section divider.
|
|
366
|
+
* A stack is promoted only when its extent holds a real value row (parity
|
|
367
|
+
with the full-width-note guard): a trailing label that groups nothing is
|
|
368
|
+
left for the ``is_label`` preservation path, never stranded as an
|
|
369
|
+
empty-extent rowgroup.
|
|
370
|
+
|
|
371
|
+
The stored ``rowgroup_extent_end`` is what the maze honors for these bands;
|
|
372
|
+
full-width bands keep their colspan-bounded extent. The two compose: a
|
|
373
|
+
section band (wider) and a label-only group (narrower) nest consistently.
|
|
374
|
+
"""
|
|
375
|
+
if not grid or not grid[0]:
|
|
376
|
+
return
|
|
377
|
+
n_rows = len(grid)
|
|
378
|
+
n_cols = len(grid[0])
|
|
379
|
+
|
|
380
|
+
def _is_body_row(r: int) -> bool:
|
|
381
|
+
return not any(
|
|
382
|
+
grid[r][c].get("is_thead") or grid[r][c].get("is_header_row") for c in range(n_cols)
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
def _has_value(r: int) -> bool:
|
|
386
|
+
return any(
|
|
387
|
+
grid[r][c]["type"] == "td" and (grid[r][c].get("text") or "").strip()
|
|
388
|
+
for c in range(n_cols)
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
def _label_cols(r: int) -> List[int]:
|
|
392
|
+
cols: List[int] = []
|
|
393
|
+
for c in range(n_cols):
|
|
394
|
+
cell = grid[r][c]
|
|
395
|
+
if cell.get("is_thead") or cell.get("is_header_row"):
|
|
396
|
+
continue
|
|
397
|
+
if cell["type"] != "th":
|
|
398
|
+
continue
|
|
399
|
+
if cell.get("is_span_copy"):
|
|
400
|
+
# A span copy of a label cell originating in this same row is
|
|
401
|
+
# part of a multi-column label (label spans the first N>1
|
|
402
|
+
# columns); promote it too. A span copy reaching down from a
|
|
403
|
+
# row above is not a label of this row.
|
|
404
|
+
origin = cell.get("origin", (r, c))
|
|
405
|
+
if origin[0] != r:
|
|
406
|
+
continue
|
|
407
|
+
if not (grid[origin[0]][origin[1]].get("text") or "").strip():
|
|
408
|
+
continue
|
|
409
|
+
elif not (cell.get("text") or "").strip():
|
|
410
|
+
continue
|
|
411
|
+
cols.append(c)
|
|
412
|
+
return cols
|
|
413
|
+
|
|
414
|
+
def _single_label_origin(r: int) -> bool:
|
|
415
|
+
# A group header is exactly one label source cell (a title, possibly
|
|
416
|
+
# colspan'd). More than one distinct non-empty label cell means a data
|
|
417
|
+
# row, not a divider — do not thread it.
|
|
418
|
+
origins = set()
|
|
419
|
+
for c in _label_cols(r):
|
|
420
|
+
cell = grid[r][c]
|
|
421
|
+
origins.add(cell.get("origin", (r, c)) if cell.get("is_span_copy") else (r, c))
|
|
422
|
+
return len(origins) == 1
|
|
423
|
+
|
|
424
|
+
# A row already carrying a rowgroup cell (a full-width band promoted above,
|
|
425
|
+
# or a source scope="rowgroup") is a boundary, not a label-only candidate.
|
|
426
|
+
band_rows = {
|
|
427
|
+
r for r in range(n_rows) for c in range(n_cols) if grid[r][c].get("scope") == "rowgroup"
|
|
428
|
+
}
|
|
429
|
+
|
|
430
|
+
is_label_row = [
|
|
431
|
+
_is_body_row(r)
|
|
432
|
+
and r not in band_rows
|
|
433
|
+
and not _has_value(r)
|
|
434
|
+
and bool(_label_cols(r))
|
|
435
|
+
and _single_label_origin(r)
|
|
436
|
+
for r in range(n_rows)
|
|
437
|
+
]
|
|
438
|
+
|
|
439
|
+
r = 0
|
|
440
|
+
while r < n_rows:
|
|
441
|
+
if not is_label_row[r]:
|
|
442
|
+
r += 1
|
|
443
|
+
continue
|
|
444
|
+
# Gather the maximal consecutive run of label-only rows.
|
|
445
|
+
s_start = r
|
|
446
|
+
while r + 1 < n_rows and is_label_row[r + 1]:
|
|
447
|
+
r += 1
|
|
448
|
+
s_end = r
|
|
449
|
+
r += 1 # advance past the stack for the outer loop
|
|
450
|
+
|
|
451
|
+
# Extent: down to the row before the next boundary (next label stack or
|
|
452
|
+
# full-width band). Bounded by a value row's presence.
|
|
453
|
+
extent_end = n_rows - 1
|
|
454
|
+
for rr in range(s_end + 1, n_rows):
|
|
455
|
+
if is_label_row[rr] or rr in band_rows:
|
|
456
|
+
extent_end = rr - 1
|
|
457
|
+
break
|
|
458
|
+
has_data_row = any(_has_value(rr) for rr in range(s_end + 1, extent_end + 1))
|
|
459
|
+
if not has_data_row:
|
|
460
|
+
continue
|
|
461
|
+
|
|
462
|
+
for rr in range(s_start, s_end + 1):
|
|
463
|
+
for c in _label_cols(rr):
|
|
464
|
+
grid[rr][c]["type"] = "th"
|
|
465
|
+
grid[rr][c]["scope"] = "rowgroup"
|
|
466
|
+
grid[rr][c]["rowgroup_extent_end"] = extent_end
|
|
467
|
+
|
|
468
|
+
|
|
322
469
|
def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
|
|
323
470
|
"""Runs the full pipeline and returns rules plus the gate verdict.
|
|
324
471
|
|
|
@@ -337,6 +484,7 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
|
|
|
337
484
|
return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
|
|
338
485
|
|
|
339
486
|
_mark_rowgroup_bands(grid)
|
|
487
|
+
_mark_label_only_rowgroups(grid)
|
|
340
488
|
rules = clean_rules(_build_rules(grid))
|
|
341
489
|
gate = assess_confidence(grid, rules)
|
|
342
490
|
if not gate.ok:
|
|
@@ -151,8 +151,18 @@ def find_headers_for_cell(
|
|
|
151
151
|
# Bands are ordered topmost-first (origin row ascending) and prepended, so
|
|
152
152
|
# the row path reads outer-band > inner-group > row-labels, mirroring the
|
|
153
153
|
# multi-level column path.
|
|
154
|
+
#
|
|
155
|
+
# A *label-only* band (one carrying an explicit ``rowgroup_extent_end``)
|
|
156
|
+
# groups a ROW RANGE, so it must reach every value row in its extent
|
|
157
|
+
# regardless of which column its single label cell sits in — e.g. a numbered
|
|
158
|
+
# schedule whose group header is in the line-number/stub column while the
|
|
159
|
+
# sub-rows leave that column empty and carry their identity in a different
|
|
160
|
+
# column. Such bands are therefore scanned across ALL columns. Full-width and
|
|
161
|
+
# source ``scope="rowgroup"`` bands keep the column-restricted scan (own
|
|
162
|
+
# column + row-label columns) so unrelated stub dividers don't cross-attach.
|
|
163
|
+
own_cols = {col, *row_header_columns}
|
|
154
164
|
bands: List[Tuple[int, str]] = [] # (origin_row, text)
|
|
155
|
-
for scan_col in [
|
|
165
|
+
for scan_col in range(len(grid[0])):
|
|
156
166
|
for r in range(row - 1, -1, -1):
|
|
157
167
|
cell = grid[r][scan_col]
|
|
158
168
|
if not cell or not cell.get("text", "").strip():
|
|
@@ -167,12 +177,24 @@ def find_headers_for_cell(
|
|
|
167
177
|
else:
|
|
168
178
|
origin = (r, scan_col)
|
|
169
179
|
origin_cell = cell
|
|
180
|
+
# A column-restricted band (no stored extent) is only honored from
|
|
181
|
+
# the value's own column or a row-label column; a label-only band
|
|
182
|
+
# (stored extent) is honored from any column.
|
|
183
|
+
if origin_cell.get("rowgroup_extent_end") is None and scan_col not in own_cols:
|
|
184
|
+
continue
|
|
170
185
|
if origin in seen_origins:
|
|
171
186
|
continue
|
|
172
187
|
origin_row, origin_col = origin
|
|
173
188
|
my_colspan = origin_cell.get("colspan", 1)
|
|
174
189
|
origin_rowspan = origin_cell.get("rowspan", 1)
|
|
175
|
-
|
|
190
|
+
stored_extent = origin_cell.get("rowgroup_extent_end")
|
|
191
|
+
if stored_extent is not None:
|
|
192
|
+
# Label-only bands carry an explicit extent (the run of value
|
|
193
|
+
# rows they group, bounded by the next stack or section band),
|
|
194
|
+
# because their colspan=1 label cannot encode nesting depth the
|
|
195
|
+
# way a full-width band's width does.
|
|
196
|
+
extent_end = stored_extent
|
|
197
|
+
elif origin_rowspan > 1:
|
|
176
198
|
extent_end = origin_row + origin_rowspan - 1
|
|
177
199
|
else:
|
|
178
200
|
extent_end = len(grid) - 1
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|