table2rules 0.6.0__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.6.0/src/table2rules.egg-info → table2rules-0.6.2}/PKG-INFO +1 -1
  2. {table2rules-0.6.0 → table2rules-0.6.2}/pyproject.toml +1 -1
  3. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/_core.py +148 -0
  4. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/maze_pathfinder.py +24 -2
  5. {table2rules-0.6.0 → table2rules-0.6.2/src/table2rules.egg-info}/PKG-INFO +1 -1
  6. {table2rules-0.6.0 → table2rules-0.6.2}/LICENSE +0 -0
  7. {table2rules-0.6.0 → table2rules-0.6.2}/README.md +0 -0
  8. {table2rules-0.6.0 → table2rules-0.6.2}/setup.cfg +0 -0
  9. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/__init__.py +0 -0
  10. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/__main__.py +0 -0
  11. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/cleanup.py +0 -0
  12. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/errors.py +0 -0
  13. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/exporters/__init__.py +0 -0
  14. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/exporters/base.py +0 -0
  15. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/exporters/rules.py +0 -0
  16. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/grid_parser.py +0 -0
  17. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/models.py +0 -0
  18. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/py.typed +0 -0
  19. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/quality_gate.py +0 -0
  20. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/report.py +0 -0
  21. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/simple_repair.py +0 -0
  22. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules/spans.py +0 -0
  23. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/SOURCES.txt +0 -0
  24. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/dependency_links.txt +0 -0
  25. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/entry_points.txt +0 -0
  26. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/requires.txt +0 -0
  27. {table2rules-0.6.0 → table2rules-0.6.2}/src/table2rules.egg-info/top_level.txt +0 -0
  28. {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_correctness_oracle.py +0 -0
  29. {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_determinism.py +0 -0
  30. {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_public_api.py +0 -0
  31. {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.6.0 → table2rules-0.6.2}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.6.0"
7
+ version = "0.6.2"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -211,6 +211,11 @@ def _build_rules(grid) -> List[LogicRule]:
211
211
  for row_idx in range(len(grid)):
212
212
  if row_idx in rows_with_value:
213
213
  continue
214
+ # A label-only row promoted to a row-group ancestor (scope="rowgroup")
215
+ # is threaded into the value lines beneath it — emitting it here too
216
+ # would duplicate it as an orphan label.
217
+ if any(grid[row_idx][c].get("scope") == "rowgroup" for c in range(n_cols)):
218
+ continue
214
219
  # Anchor the rule at the row's data column so it satisfies the quality
215
220
  # gate's "rules originate from <td>" invariant. A row with no <td> at
216
221
  # all is a true full-width <th colspan> divider — already handled as a
@@ -319,6 +324,148 @@ def _mark_rowgroup_bands(grid) -> None:
319
324
  grid[r][cc]["scope"] = "rowgroup"
320
325
 
321
326
 
327
def _mark_label_only_rowgroups(grid) -> None:
    """Promote label-only body rows to ``scope="rowgroup"`` header cells, in place.

    A *label-only row* is a row that satisfies all of the following:

    * no cell in it is flagged ``is_thead`` or ``is_header_row`` (a body row);
    * no cell in it already has ``scope == "rowgroup"`` (such rows are
      boundaries, not candidates);
    * it has no ``<td>`` with non-blank text (no value);
    * its non-blank ``<th>`` cells all trace back to exactly ONE source cell —
      a single title, possibly widened across columns by span copies whose
      ``origin`` lies in the same row.

    The single-source requirement separates a group title
    ("9. Trip Cancellation") from a data row whose value columns merely
    happen to be blank ("Average: | 80.2 | 10.7"): a row with several
    distinct label cells is left untouched by this pass.

    Stacking and extent:

    * A maximal run of consecutive label-only rows forms one stack; every
      member of the stack receives the same extent.
    * The extent runs from the row below the stack down to the row before the
      next label-only row or the next row that already carries a
      ``scope="rowgroup"`` cell, whichever comes first (or the last grid row).
    * A stack is promoted only when at least one row in its extent has a
      non-blank ``<td>``; otherwise its cells are not modified.

    Promotion sets ``scope = "rowgroup"`` and ``rowgroup_extent_end`` (the
    inclusive last row index of the extent) on each label cell of the stack.

    Args:
        grid: Row-major list of rows, each a list of cell dicts. Cells are
            read via the keys ``type``, ``text``, ``is_thead``,
            ``is_header_row``, ``is_span_copy``, ``origin`` and ``scope``.
            The width is taken from the first row.

    Returns:
        None. ``grid`` is mutated in place; an empty grid is a no-op.
    """
    if not grid or not grid[0]:
        return
    n_rows = len(grid)
    n_cols = len(grid[0])

    def _text(cell) -> str:
        # ``text`` may be missing or None; both count as blank.
        return (cell.get("text") or "").strip()

    def _is_header_cell(cell) -> bool:
        return bool(cell.get("is_thead") or cell.get("is_header_row"))

    def _is_body_row(r: int) -> bool:
        return not any(_is_header_cell(cell) for cell in grid[r][:n_cols])

    def _has_value(r: int) -> bool:
        return any(cell["type"] == "td" and _text(cell) for cell in grid[r][:n_cols])

    def _label_cols(r: int) -> List[int]:
        """Columns of row *r* holding a non-blank body ``<th>`` label."""
        cols: List[int] = []
        for c, cell in enumerate(grid[r][:n_cols]):
            if _is_header_cell(cell) or cell["type"] != "th":
                continue
            if cell.get("is_span_copy"):
                origin = cell.get("origin", (r, c))
                # A copy reaching down from a row above is not a label of
                # this row; a copy of a same-row label is part of a
                # multi-column title and is judged by its origin's text.
                if origin[0] != r:
                    continue
                source = grid[origin[0]][origin[1]]
            else:
                source = cell
            if _text(source):
                cols.append(c)
        return cols

    def _source_cells(r: int, cols: List[int]) -> set:
        """Distinct origin coordinates behind the label columns of row *r*."""
        return {
            grid[r][c].get("origin", (r, c)) if grid[r][c].get("is_span_copy") else (r, c)
            for c in cols
        }

    # Rows already carrying a rowgroup cell act as extent boundaries.
    band_rows = {
        r
        for r in range(n_rows)
        if any(cell.get("scope") == "rowgroup" for cell in grid[r][:n_cols])
    }

    # Classify every row once, remembering the label columns of each
    # label-only row so they are not recomputed at promotion time. The
    # promotion below never changes the keys this classification reads.
    label_cols_by_row = {}
    for r in range(n_rows):
        if not _is_body_row(r) or r in band_rows or _has_value(r):
            continue
        cols = _label_cols(r)
        if cols and len(_source_cells(r, cols)) == 1:
            label_cols_by_row[r] = cols

    r = 0
    while r < n_rows:
        if r not in label_cols_by_row:
            r += 1
            continue

        # Gather the maximal consecutive run of label-only rows.
        stack_start = r
        while r + 1 < n_rows and (r + 1) in label_cols_by_row:
            r += 1
        stack_end = r
        r += 1  # resume the outer scan just past the stack

        # The extent stops before the next label-only row or rowgroup row.
        extent_end = n_rows - 1
        for below in range(stack_end + 1, n_rows):
            if below in label_cols_by_row or below in band_rows:
                extent_end = below - 1
                break

        # A stack that groups no value row is left unpromoted.
        if not any(_has_value(below) for below in range(stack_end + 1, extent_end + 1)):
            continue

        for member in range(stack_start, stack_end + 1):
            for c in label_cols_by_row[member]:
                # Label columns are ``th`` by construction, so only the
                # scope and the explicit extent need to be written.
                cell = grid[member][c]
                cell["scope"] = "rowgroup"
                cell["rowgroup_extent_end"] = extent_end
467
+
468
+
322
469
  def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
323
470
  """Runs the full pipeline and returns rules plus the gate verdict.
324
471
 
@@ -337,6 +484,7 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
337
484
  return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
338
485
 
339
486
  _mark_rowgroup_bands(grid)
487
+ _mark_label_only_rowgroups(grid)
340
488
  rules = clean_rules(_build_rules(grid))
341
489
  gate = assess_confidence(grid, rules)
342
490
  if not gate.ok:
@@ -151,8 +151,18 @@ def find_headers_for_cell(
151
151
  # Bands are ordered topmost-first (origin row ascending) and prepended, so
152
152
  # the row path reads outer-band > inner-group > row-labels, mirroring the
153
153
  # multi-level column path.
154
+ #
155
+ # A *label-only* band (one carrying an explicit ``rowgroup_extent_end``)
156
+ # groups a ROW RANGE, so it must reach every value row in its extent
157
+ # regardless of which column its single label cell sits in — e.g. a numbered
158
+ # schedule whose group header is in the line-number/stub column while the
159
+ # sub-rows leave that column empty and carry their identity in a different
160
+ # column. Such bands are therefore scanned across ALL columns. Full-width and
161
+ # source ``scope="rowgroup"`` bands keep the column-restricted scan (own
162
+ # column + row-label columns) so unrelated stub dividers don't cross-attach.
163
+ own_cols = {col, *row_header_columns}
154
164
  bands: List[Tuple[int, str]] = [] # (origin_row, text)
155
- for scan_col in [col, *row_header_columns]:
165
+ for scan_col in range(len(grid[0])):
156
166
  for r in range(row - 1, -1, -1):
157
167
  cell = grid[r][scan_col]
158
168
  if not cell or not cell.get("text", "").strip():
@@ -167,12 +177,24 @@ def find_headers_for_cell(
167
177
  else:
168
178
  origin = (r, scan_col)
169
179
  origin_cell = cell
180
+ # A column-restricted band (no stored extent) is only honored from
181
+ # the value's own column or a row-label column; a label-only band
182
+ # (stored extent) is honored from any column.
183
+ if origin_cell.get("rowgroup_extent_end") is None and scan_col not in own_cols:
184
+ continue
170
185
  if origin in seen_origins:
171
186
  continue
172
187
  origin_row, origin_col = origin
173
188
  my_colspan = origin_cell.get("colspan", 1)
174
189
  origin_rowspan = origin_cell.get("rowspan", 1)
175
- if origin_rowspan > 1:
190
+ stored_extent = origin_cell.get("rowgroup_extent_end")
191
+ if stored_extent is not None:
192
+ # Label-only bands carry an explicit extent (the run of value
193
+ # rows they group, bounded by the next stack or section band),
194
+ # because their colspan=1 label cannot encode nesting depth the
195
+ # way a full-width band's width does.
196
+ extent_end = stored_extent
197
+ elif origin_rowspan > 1:
176
198
  extent_end = origin_row + origin_rowspan - 1
177
199
  else:
178
200
  extent_end = len(grid) - 1
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes