table2rules 0.6.0__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.6.0/src/table2rules.egg-info → table2rules-0.6.1}/PKG-INFO +1 -1
  2. {table2rules-0.6.0 → table2rules-0.6.1}/pyproject.toml +1 -1
  3. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/_core.py +148 -0
  4. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/maze_pathfinder.py +8 -1
  5. {table2rules-0.6.0 → table2rules-0.6.1/src/table2rules.egg-info}/PKG-INFO +1 -1
  6. {table2rules-0.6.0 → table2rules-0.6.1}/LICENSE +0 -0
  7. {table2rules-0.6.0 → table2rules-0.6.1}/README.md +0 -0
  8. {table2rules-0.6.0 → table2rules-0.6.1}/setup.cfg +0 -0
  9. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/__init__.py +0 -0
  10. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/__main__.py +0 -0
  11. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/cleanup.py +0 -0
  12. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/errors.py +0 -0
  13. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/exporters/__init__.py +0 -0
  14. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/exporters/base.py +0 -0
  15. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/exporters/rules.py +0 -0
  16. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/grid_parser.py +0 -0
  17. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/models.py +0 -0
  18. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/py.typed +0 -0
  19. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/quality_gate.py +0 -0
  20. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/report.py +0 -0
  21. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/simple_repair.py +0 -0
  22. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules/spans.py +0 -0
  23. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules.egg-info/SOURCES.txt +0 -0
  24. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules.egg-info/dependency_links.txt +0 -0
  25. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules.egg-info/entry_points.txt +0 -0
  26. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules.egg-info/requires.txt +0 -0
  27. {table2rules-0.6.0 → table2rules-0.6.1}/src/table2rules.egg-info/top_level.txt +0 -0
  28. {table2rules-0.6.0 → table2rules-0.6.1}/tests/test_correctness_oracle.py +0 -0
  29. {table2rules-0.6.0 → table2rules-0.6.1}/tests/test_determinism.py +0 -0
  30. {table2rules-0.6.0 → table2rules-0.6.1}/tests/test_public_api.py +0 -0
  31. {table2rules-0.6.0 → table2rules-0.6.1}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.6.0 → table2rules-0.6.1}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.6.0"
7
+ version = "0.6.1"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -211,6 +211,11 @@ def _build_rules(grid) -> List[LogicRule]:
211
211
  for row_idx in range(len(grid)):
212
212
  if row_idx in rows_with_value:
213
213
  continue
214
+ # A label-only row promoted to a row-group ancestor (scope="rowgroup")
215
+ # is threaded into the value lines beneath it — emitting it here too
216
+ # would duplicate it as an orphan label.
217
+ if any(grid[row_idx][c].get("scope") == "rowgroup" for c in range(n_cols)):
218
+ continue
214
219
  # Anchor the rule at the row's data column so it satisfies the quality
215
220
  # gate's "rules originate from <td>" invariant. A row with no <td> at
216
221
  # all is a true full-width <th colspan> divider — already handled as a
@@ -319,6 +324,148 @@ def _mark_rowgroup_bands(grid) -> None:
319
324
  grid[r][cc]["scope"] = "rowgroup"
320
325
 
321
326
 
327
+ def _mark_label_only_rowgroups(grid) -> None:
328
+ """Promote *label-only rows* to ``<th scope="rowgroup">`` so the maze threads
329
+ them into each value line's row path, the row-side counterpart of the
330
+ full-width band handled by :func:`_mark_rowgroup_bands`.
331
+
332
+ A label-only row is a body row whose value (``<td>``) columns are all empty
333
+ while a leading label column carries text — the ``Label | Value`` form
334
+ pervasive in financial/insurance schedules (``9. Trip Cancellation | (empty)``
335
+ above its value rows). Unlike a full-width band the label cell does *not*
336
+ span the value region; the other columns are simply empty, so
337
+ ``is_full_width_note`` geometry never sees it. Without this pass the row is
338
+ emitted as an orphaned ``is_label`` rule and the values beneath it lose their
339
+ group identity.
340
+
341
+ Detection is geometric, not flag-based: a row with no value-bearing ``<td>``
342
+ but exactly one non-empty body ``<th>`` label source cell. (Row-label
343
+ columns are already promoted to ``<th scope="row">`` upstream — Signal A/B/C
344
+ in grid_parser and simple_repair — so "no non-empty ``<td>``" means "no
345
+ value".)
346
+
347
+ The single-label-cell requirement is what separates a group header from a
348
+ data row whose *designated* value columns merely happen to be empty. A genuine
349
+ group header carries one title ("9. Trip Cancellation", possibly spanning the
350
+ first N>1 columns via one ``colspan`` cell). A data row whose value columns
351
+ are blank ("Average: | 80.2 | 10.7 | 3.3", or a summary row under a header
352
+ that over-promoted numeric columns to row labels) spreads several distinct
353
+ values across its label cells — threading those as a group path would invent
354
+ a breadcrumb and misattribute it to the rows below. Such rows stay on the
355
+ ``is_label`` preservation path, unchanged.
356
+
357
+ Stacking and extent (no content-aware level inference):
358
+
359
+ * A maximal run of *consecutive* label-only rows forms one header stack. Its
360
+ members are threaded as nested ancestors in row order — a title followed by
361
+ a description (``10. Travel Delay`` then ``If the departure…``) both land in
362
+ the path, title first.
363
+ * A stack's extent runs from just below the stack down to the row before the
364
+ next stack OR the next full-width band, whichever comes first — so a group
365
+ never leaks into the next line-item or across a section divider.
366
+ * A stack is promoted only when its extent holds a real value row (parity
367
+ with the full-width-note guard): a trailing label that groups nothing is
368
+ left for the ``is_label`` preservation path, never stranded as an
369
+ empty-extent rowgroup.
370
+
371
+ The stored ``rowgroup_extent_end`` is what the maze honors for these bands;
372
+ full-width bands keep their colspan-bounded extent. The two compose: a
373
+ section band (wider) and a label-only group (narrower) nest consistently.
374
+ """
375
+ if not grid or not grid[0]:
376
+ return
377
+ n_rows = len(grid)
378
+ n_cols = len(grid[0])
379
+
380
+ def _is_body_row(r: int) -> bool:
381
+ return not any(
382
+ grid[r][c].get("is_thead") or grid[r][c].get("is_header_row") for c in range(n_cols)
383
+ )
384
+
385
+ def _has_value(r: int) -> bool:
386
+ return any(
387
+ grid[r][c]["type"] == "td" and (grid[r][c].get("text") or "").strip()
388
+ for c in range(n_cols)
389
+ )
390
+
391
+ def _label_cols(r: int) -> List[int]:
392
+ cols: List[int] = []
393
+ for c in range(n_cols):
394
+ cell = grid[r][c]
395
+ if cell.get("is_thead") or cell.get("is_header_row"):
396
+ continue
397
+ if cell["type"] != "th":
398
+ continue
399
+ if cell.get("is_span_copy"):
400
+ # A span copy of a label cell originating in this same row is
401
+ # part of a multi-column label (label spans the first N>1
402
+ # columns); promote it too. A span copy reaching down from a
403
+ # row above is not a label of this row.
404
+ origin = cell.get("origin", (r, c))
405
+ if origin[0] != r:
406
+ continue
407
+ if not (grid[origin[0]][origin[1]].get("text") or "").strip():
408
+ continue
409
+ elif not (cell.get("text") or "").strip():
410
+ continue
411
+ cols.append(c)
412
+ return cols
413
+
414
+ def _single_label_origin(r: int) -> bool:
415
+ # A group header is exactly one label source cell (a title, possibly
416
+ # colspan'd). More than one distinct non-empty label cell means a data
417
+ # row, not a divider — do not thread it.
418
+ origins = set()
419
+ for c in _label_cols(r):
420
+ cell = grid[r][c]
421
+ origins.add(cell.get("origin", (r, c)) if cell.get("is_span_copy") else (r, c))
422
+ return len(origins) == 1
423
+
424
+ # A row already carrying a rowgroup cell (a full-width band promoted above,
425
+ # or a source scope="rowgroup") is a boundary, not a label-only candidate.
426
+ band_rows = {
427
+ r for r in range(n_rows) for c in range(n_cols) if grid[r][c].get("scope") == "rowgroup"
428
+ }
429
+
430
+ is_label_row = [
431
+ _is_body_row(r)
432
+ and r not in band_rows
433
+ and not _has_value(r)
434
+ and bool(_label_cols(r))
435
+ and _single_label_origin(r)
436
+ for r in range(n_rows)
437
+ ]
438
+
439
+ r = 0
440
+ while r < n_rows:
441
+ if not is_label_row[r]:
442
+ r += 1
443
+ continue
444
+ # Gather the maximal consecutive run of label-only rows.
445
+ s_start = r
446
+ while r + 1 < n_rows and is_label_row[r + 1]:
447
+ r += 1
448
+ s_end = r
449
+ r += 1 # advance past the stack for the outer loop
450
+
451
+ # Extent: down to the row before the next boundary (next label stack or
452
+ # full-width band). Bounded by a value row's presence.
453
+ extent_end = n_rows - 1
454
+ for rr in range(s_end + 1, n_rows):
455
+ if is_label_row[rr] or rr in band_rows:
456
+ extent_end = rr - 1
457
+ break
458
+ has_data_row = any(_has_value(rr) for rr in range(s_end + 1, extent_end + 1))
459
+ if not has_data_row:
460
+ continue
461
+
462
+ for rr in range(s_start, s_end + 1):
463
+ for c in _label_cols(rr):
464
+ grid[rr][c]["type"] = "th"
465
+ grid[rr][c]["scope"] = "rowgroup"
466
+ grid[rr][c]["rowgroup_extent_end"] = extent_end
467
+
468
+
322
469
  def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
323
470
  """Runs the full pipeline and returns rules plus the gate verdict.
324
471
 
@@ -337,6 +484,7 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
337
484
  return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
338
485
 
339
486
  _mark_rowgroup_bands(grid)
487
+ _mark_label_only_rowgroups(grid)
340
488
  rules = clean_rules(_build_rules(grid))
341
489
  gate = assess_confidence(grid, rules)
342
490
  if not gate.ok:
@@ -172,7 +172,14 @@ def find_headers_for_cell(
172
172
  origin_row, origin_col = origin
173
173
  my_colspan = origin_cell.get("colspan", 1)
174
174
  origin_rowspan = origin_cell.get("rowspan", 1)
175
- if origin_rowspan > 1:
175
+ stored_extent = origin_cell.get("rowgroup_extent_end")
176
+ if stored_extent is not None:
177
+ # Label-only bands carry an explicit extent (the run of value
178
+ # rows they group, bounded by the next stack or section band),
179
+ # because their colspan=1 label cannot encode nesting depth the
180
+ # way a full-width band's width does.
181
+ extent_end = stored_extent
182
+ elif origin_rowspan > 1:
176
183
  extent_end = origin_row + origin_rowspan - 1
177
184
  else:
178
185
  extent_end = len(grid) - 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.6.0
3
+ Version: 0.6.1
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes