table2rules 0.6.2__tar.gz → 0.6.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {table2rules-0.6.2/src/table2rules.egg-info → table2rules-0.6.3}/PKG-INFO +1 -1
- {table2rules-0.6.2 → table2rules-0.6.3}/pyproject.toml +1 -1
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/_core.py +67 -16
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/grid_parser.py +18 -0
- {table2rules-0.6.2 → table2rules-0.6.3/src/table2rules.egg-info}/PKG-INFO +1 -1
- {table2rules-0.6.2 → table2rules-0.6.3}/LICENSE +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/README.md +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/setup.cfg +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/__init__.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/__main__.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/cleanup.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/errors.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/exporters/__init__.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/exporters/base.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/exporters/rules.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/maze_pathfinder.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/models.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/py.typed +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/quality_gate.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/report.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/simple_repair.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/spans.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/SOURCES.txt +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/dependency_links.txt +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/entry_points.txt +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/requires.txt +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/top_level.txt +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_correctness_oracle.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_determinism.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_public_api.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_regression_golds.py +0 -0
- {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_robustness_mutations.py +0 -0
|
@@ -411,15 +411,33 @@ def _mark_label_only_rowgroups(grid) -> None:
|
|
|
411
411
|
cols.append(c)
|
|
412
412
|
return cols
|
|
413
413
|
|
|
414
|
-
def
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
414
|
+
def _cell_text(r: int, c: int) -> str:
    """Return the whitespace-stripped text of grid cell (r, c).

    When the cell is flagged ``is_span_copy`` the text is read from the cell
    at its ``origin`` coordinates instead (falling back to (r, c) itself when
    no ``origin`` key is present). A missing or falsy ``text`` yields "".
    """
    source = grid[r][c]
    if source.get("is_span_copy"):
        # Redirect to the cell this copy was expanded from.
        anchor = source.get("origin", (r, c))
        source = grid[anchor[0]][anchor[1]]
    raw = source.get("text") or ""
    return raw.strip()
|
|
420
|
+
|
|
421
|
+
def _is_numeric_only(text: str) -> bool:
|
|
422
|
+
# No alphabetic character but at least one digit — a bare item number
|
|
423
|
+
# ("3.", "10"), reusing the parser's universal "letters label, digits
|
|
424
|
+
# measure" signal. A group title carries text; a mis-promoted value cell
|
|
425
|
+
# is a number.
|
|
426
|
+
return (
|
|
427
|
+
bool(text) and not any(ch.isalpha() for ch in text) and any(ch.isdigit() for ch in text)
|
|
428
|
+
)
|
|
429
|
+
|
|
430
|
+
def _title_like(r: int) -> bool:
    """Decide whether row *r* looks like a group-header title.

    A title row may carry at most one numeric-only label cell (a leading item
    number such as "10 | Travel delay"). Two or more numeric label cells
    indicate a data row whose value columns happen to be empty, which must
    not be threaded as a rowgroup title. A row for which ``_label_cols``
    returns nothing is never title-like.
    """
    label_columns = _label_cols(r)
    if not label_columns:
        return False
    numeric_labels = 0
    for col in label_columns:
        if _is_numeric_only(_cell_text(r, col)):
            numeric_labels += 1
    return numeric_labels <= 1
|
|
423
441
|
|
|
424
442
|
# A row already carrying a rowgroup cell (a full-width band promoted above,
|
|
425
443
|
# or a source scope="rowgroup") is a boundary, not a label-only candidate.
|
|
@@ -432,7 +450,7 @@ def _mark_label_only_rowgroups(grid) -> None:
|
|
|
432
450
|
and r not in band_rows
|
|
433
451
|
and not _has_value(r)
|
|
434
452
|
and bool(_label_cols(r))
|
|
435
|
-
and
|
|
453
|
+
and _title_like(r)
|
|
436
454
|
for r in range(n_rows)
|
|
437
455
|
]
|
|
438
456
|
|
|
@@ -448,23 +466,56 @@ def _mark_label_only_rowgroups(grid) -> None:
|
|
|
448
466
|
s_end = r
|
|
449
467
|
r += 1 # advance past the stack for the outer loop
|
|
450
468
|
|
|
451
|
-
#
|
|
452
|
-
#
|
|
469
|
+
# Absorb a run of full-width band rows immediately following the title
|
|
470
|
+
# stack (a description band under the title) into this header block —
|
|
471
|
+
# they are nested members, not a boundary. Without this the title's
|
|
472
|
+
# extent would terminate at the band and the title would be dropped (the
|
|
473
|
+
# narrow-title-then-full-width-description shape).
|
|
474
|
+
header_end = s_end
|
|
475
|
+
while header_end + 1 < n_rows and (header_end + 1) in band_rows:
|
|
476
|
+
header_end += 1
|
|
477
|
+
|
|
478
|
+
# Extent: from the first row after the header block to the row before the
|
|
479
|
+
# next group start — the next title, or a full-width band that begins a
|
|
480
|
+
# new section (one appearing AFTER a value row). A band absorbed above is
|
|
481
|
+
# part of this header, not a boundary.
|
|
453
482
|
extent_end = n_rows - 1
|
|
454
|
-
|
|
455
|
-
|
|
483
|
+
saw_value = False
|
|
484
|
+
for rr in range(header_end + 1, n_rows):
|
|
485
|
+
if is_label_row[rr]:
|
|
456
486
|
extent_end = rr - 1
|
|
457
487
|
break
|
|
458
|
-
|
|
459
|
-
|
|
488
|
+
if rr in band_rows and saw_value:
|
|
489
|
+
extent_end = rr - 1
|
|
490
|
+
break
|
|
491
|
+
if _has_value(rr):
|
|
492
|
+
saw_value = True
|
|
493
|
+
value_rows = [rr for rr in range(header_end + 1, extent_end + 1) if _has_value(rr)]
|
|
494
|
+
if not value_rows:
|
|
460
495
|
continue
|
|
461
496
|
|
|
497
|
+
# Promote each title cell, EXCLUDING a key column whose (column, text)
|
|
498
|
+
# repeats on a value row of the group — a repeating item-number/key
|
|
499
|
+
# already threads via the value rows' own labels; promoting it again
|
|
500
|
+
# would duplicate it in the path. The remaining cells are the title.
|
|
462
501
|
for rr in range(s_start, s_end + 1):
|
|
463
502
|
for c in _label_cols(rr):
|
|
503
|
+
text = _cell_text(rr, c)
|
|
504
|
+
if any(_cell_text(vr, c) == text for vr in value_rows):
|
|
505
|
+
continue
|
|
464
506
|
grid[rr][c]["type"] = "th"
|
|
465
507
|
grid[rr][c]["scope"] = "rowgroup"
|
|
466
508
|
grid[rr][c]["rowgroup_extent_end"] = extent_end
|
|
467
509
|
|
|
510
|
+
# Bound the absorbed description band(s) by the same extent so a
|
|
511
|
+
# full-width description does not leak past the next narrow title (its
|
|
512
|
+
# colspan is wider, so the maze's colspan rule would not close it).
|
|
513
|
+
for rr in range(s_end + 1, header_end + 1):
|
|
514
|
+
for c in range(n_cols):
|
|
515
|
+
cell = grid[rr][c]
|
|
516
|
+
if cell.get("scope") == "rowgroup":
|
|
517
|
+
cell["rowgroup_extent_end"] = extent_end
|
|
518
|
+
|
|
468
519
|
|
|
469
520
|
def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
|
|
470
521
|
"""Runs the full pipeline and returns rules plus the gate verdict.
|
|
@@ -487,6 +487,24 @@ def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
|
|
|
487
487
|
if _descriptor_like(c):
|
|
488
488
|
promote_cols.add(c)
|
|
489
489
|
|
|
490
|
+
# --- Signal D: stub column in a 2-column Label|Value schedule ---
|
|
491
|
+
# In a two-column table the left column is the row-label/stub and the
|
|
492
|
+
# right column is its value — even when col 0 carries a thead header,
|
|
493
|
+
# which Signals A/C/B all skip (they need a multi-row/rowspan header or
|
|
494
|
+
# an unlabeled column). This is the single-row-thead
|
|
495
|
+
# "Benefit | Maximum limit (S$)" schedule shape. Scoped to exactly two
|
|
496
|
+
# columns so multi-column property tables (where col 0 is one data field
|
|
497
|
+
# among several) are untouched; col 0 must be descriptor-like and col 1
|
|
498
|
+
# must carry values, so a two-column all-text table is left alone.
|
|
499
|
+
if (
|
|
500
|
+
max_cols == 2
|
|
501
|
+
and 0 not in promote_cols
|
|
502
|
+
and _descriptor_like(0)
|
|
503
|
+
and body_nonempty[1] >= 1
|
|
504
|
+
and not _descriptor_like(1)
|
|
505
|
+
):
|
|
506
|
+
promote_cols.add(0)
|
|
507
|
+
|
|
490
508
|
if promote_cols:
|
|
491
509
|
for c in sorted(promote_cols):
|
|
492
510
|
for r in range(data_start_row_idx, len(grid)):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|