table2rules 0.6.2__tar.gz → 0.6.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.6.2/src/table2rules.egg-info → table2rules-0.6.3}/PKG-INFO +1 -1
  2. {table2rules-0.6.2 → table2rules-0.6.3}/pyproject.toml +1 -1
  3. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/_core.py +67 -16
  4. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/grid_parser.py +18 -0
  5. {table2rules-0.6.2 → table2rules-0.6.3/src/table2rules.egg-info}/PKG-INFO +1 -1
  6. {table2rules-0.6.2 → table2rules-0.6.3}/LICENSE +0 -0
  7. {table2rules-0.6.2 → table2rules-0.6.3}/README.md +0 -0
  8. {table2rules-0.6.2 → table2rules-0.6.3}/setup.cfg +0 -0
  9. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/__init__.py +0 -0
  10. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/__main__.py +0 -0
  11. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/cleanup.py +0 -0
  12. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/errors.py +0 -0
  13. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/exporters/__init__.py +0 -0
  14. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/exporters/base.py +0 -0
  15. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/exporters/rules.py +0 -0
  16. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/maze_pathfinder.py +0 -0
  17. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/models.py +0 -0
  18. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/py.typed +0 -0
  19. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/quality_gate.py +0 -0
  20. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/report.py +0 -0
  21. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/simple_repair.py +0 -0
  22. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules/spans.py +0 -0
  23. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/SOURCES.txt +0 -0
  24. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/dependency_links.txt +0 -0
  25. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/entry_points.txt +0 -0
  26. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/requires.txt +0 -0
  27. {table2rules-0.6.2 → table2rules-0.6.3}/src/table2rules.egg-info/top_level.txt +0 -0
  28. {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_correctness_oracle.py +0 -0
  29. {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_determinism.py +0 -0
  30. {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_public_api.py +0 -0
  31. {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.6.2 → table2rules-0.6.3}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.6.2
3
+ Version: 0.6.3
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.6.2"
7
+ version = "0.6.3"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -411,15 +411,33 @@ def _mark_label_only_rowgroups(grid) -> None:
411
411
  cols.append(c)
412
412
  return cols
413
413
 
414
- def _single_label_origin(r: int) -> bool:
415
- # A group header is exactly one label source cell (a title, possibly
416
- # colspan'd). More than one distinct non-empty label cell means a data
417
- # row, not a divider — do not thread it.
418
- origins = set()
419
- for c in _label_cols(r):
420
- cell = grid[r][c]
421
- origins.add(cell.get("origin", (r, c)) if cell.get("is_span_copy") else (r, c))
422
- return len(origins) == 1
414
+ def _cell_text(r: int, c: int) -> str:
415
+ cell = grid[r][c]
416
+ if cell.get("is_span_copy"):
417
+ o = cell.get("origin", (r, c))
418
+ return (grid[o[0]][o[1]].get("text") or "").strip()
419
+ return (cell.get("text") or "").strip()
420
+
421
+ def _is_numeric_only(text: str) -> bool:
422
+ # No alphabetic character but at least one digit — a bare item number
423
+ # ("3.", "10"), reusing the parser's universal "letters label, digits
424
+ # measure" signal. A group title carries text; a mis-promoted value cell
425
+ # is a number.
426
+ return (
427
+ bool(text) and not any(ch.isalpha() for ch in text) and any(ch.isdigit() for ch in text)
428
+ )
429
+
430
+ def _title_like(r: int) -> bool:
431
+ # A group-header title carries at most ONE numeric-only label cell (a
432
+ # leading item number, e.g. "10 | Travel delay" or "3. | Permanent loss
433
+ # of:"). Two or more numeric label cells means a data row whose value
434
+ # columns merely happen to be empty (e.g. a header that over-promoted
435
+ # numeric columns to row labels, "Average: | 80.2 | 10.7 | 3.3") —
436
+ # threading it would invent a breadcrumb, so it stays an is_label.
437
+ cols = _label_cols(r)
438
+ if not cols:
439
+ return False
440
+ return sum(1 for c in cols if _is_numeric_only(_cell_text(r, c))) <= 1
423
441
 
424
442
  # A row already carrying a rowgroup cell (a full-width band promoted above,
425
443
  # or a source scope="rowgroup") is a boundary, not a label-only candidate.
@@ -432,7 +450,7 @@ def _mark_label_only_rowgroups(grid) -> None:
432
450
  and r not in band_rows
433
451
  and not _has_value(r)
434
452
  and bool(_label_cols(r))
435
- and _single_label_origin(r)
453
+ and _title_like(r)
436
454
  for r in range(n_rows)
437
455
  ]
438
456
 
@@ -448,23 +466,56 @@ def _mark_label_only_rowgroups(grid) -> None:
448
466
  s_end = r
449
467
  r += 1 # advance past the stack for the outer loop
450
468
 
451
- # Extent: down to the row before the next boundary (next label stack or
452
- # full-width band). Bounded by a value row's presence.
469
+ # Absorb a run of full-width band rows immediately following the title
470
+ # stack (a description band under the title) into this header block —
471
+ # they are nested members, not a boundary. Without this the title's
472
+ # extent would terminate at the band and the title would be dropped (the
473
+ # narrow-title-then-full-width-description shape).
474
+ header_end = s_end
475
+ while header_end + 1 < n_rows and (header_end + 1) in band_rows:
476
+ header_end += 1
477
+
478
+ # Extent: from the first row after the header block to the row before the
479
+ # next group start — the next title, or a full-width band that begins a
480
+ # new section (one appearing AFTER a value row). A band absorbed above is
481
+ # part of this header, not a boundary.
453
482
  extent_end = n_rows - 1
454
- for rr in range(s_end + 1, n_rows):
455
- if is_label_row[rr] or rr in band_rows:
483
+ saw_value = False
484
+ for rr in range(header_end + 1, n_rows):
485
+ if is_label_row[rr]:
456
486
  extent_end = rr - 1
457
487
  break
458
- has_data_row = any(_has_value(rr) for rr in range(s_end + 1, extent_end + 1))
459
- if not has_data_row:
488
+ if rr in band_rows and saw_value:
489
+ extent_end = rr - 1
490
+ break
491
+ if _has_value(rr):
492
+ saw_value = True
493
+ value_rows = [rr for rr in range(header_end + 1, extent_end + 1) if _has_value(rr)]
494
+ if not value_rows:
460
495
  continue
461
496
 
497
+ # Promote each title cell, EXCLUDING a key column whose (column, text)
498
+ # repeats on a value row of the group — a repeating item-number/key
499
+ # already threads via the value rows' own labels; promoting it again
500
+ # would duplicate it in the path. The remaining cells are the title.
462
501
  for rr in range(s_start, s_end + 1):
463
502
  for c in _label_cols(rr):
503
+ text = _cell_text(rr, c)
504
+ if any(_cell_text(vr, c) == text for vr in value_rows):
505
+ continue
464
506
  grid[rr][c]["type"] = "th"
465
507
  grid[rr][c]["scope"] = "rowgroup"
466
508
  grid[rr][c]["rowgroup_extent_end"] = extent_end
467
509
 
510
+ # Bound the absorbed description band(s) by the same extent so a
511
+ # full-width description does not leak past the next narrow title (its
512
+ # colspan is wider, so the maze's colspan rule would not close it).
513
+ for rr in range(s_end + 1, header_end + 1):
514
+ for c in range(n_cols):
515
+ cell = grid[rr][c]
516
+ if cell.get("scope") == "rowgroup":
517
+ cell["rowgroup_extent_end"] = extent_end
518
+
468
519
 
469
520
  def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
470
521
  """Runs the full pipeline and returns rules plus the gate verdict.
@@ -487,6 +487,24 @@ def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
487
487
  if _descriptor_like(c):
488
488
  promote_cols.add(c)
489
489
 
490
+ # --- Signal D: stub column in a 2-column Label|Value schedule ---
491
+ # In a two-column table the left column is the row-label/stub and the
492
+ # right column is its value — even when col 0 carries a thead header,
493
+ # which Signals A/C/B all skip (they need a multi-row/rowspan header or
494
+ # an unlabeled column). This is the single-row-thead
495
+ # "Benefit | Maximum limit (S$)" schedule shape. Scoped to exactly two
496
+ # columns so multi-column property tables (where col 0 is one data field
497
+ # among several) are untouched; col 0 must be descriptor-like and col 1
498
+ # must carry values, so a two-column all-text table is left alone.
499
+ if (
500
+ max_cols == 2
501
+ and 0 not in promote_cols
502
+ and _descriptor_like(0)
503
+ and body_nonempty[1] >= 1
504
+ and not _descriptor_like(1)
505
+ ):
506
+ promote_cols.add(0)
507
+
490
508
  if promote_cols:
491
509
  for c in sorted(promote_cols):
492
510
  for r in range(data_start_row_idx, len(grid)):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.6.2
3
+ Version: 0.6.3
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes