table2rules 0.5.2__tar.gz → 0.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {table2rules-0.5.2/src/table2rules.egg-info → table2rules-0.6.1}/PKG-INFO +1 -1
  2. {table2rules-0.5.2 → table2rules-0.6.1}/pyproject.toml +1 -1
  3. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/_core.py +221 -0
  4. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/grid_parser.py +22 -0
  5. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/maze_pathfinder.py +74 -37
  6. {table2rules-0.5.2 → table2rules-0.6.1/src/table2rules.egg-info}/PKG-INFO +1 -1
  7. {table2rules-0.5.2 → table2rules-0.6.1}/LICENSE +0 -0
  8. {table2rules-0.5.2 → table2rules-0.6.1}/README.md +0 -0
  9. {table2rules-0.5.2 → table2rules-0.6.1}/setup.cfg +0 -0
  10. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/__init__.py +0 -0
  11. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/__main__.py +0 -0
  12. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/cleanup.py +0 -0
  13. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/errors.py +0 -0
  14. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/exporters/__init__.py +0 -0
  15. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/exporters/base.py +0 -0
  16. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/exporters/rules.py +0 -0
  17. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/models.py +0 -0
  18. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/py.typed +0 -0
  19. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/quality_gate.py +0 -0
  20. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/report.py +0 -0
  21. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/simple_repair.py +0 -0
  22. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules/spans.py +0 -0
  23. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules.egg-info/SOURCES.txt +0 -0
  24. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules.egg-info/dependency_links.txt +0 -0
  25. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules.egg-info/entry_points.txt +0 -0
  26. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules.egg-info/requires.txt +0 -0
  27. {table2rules-0.5.2 → table2rules-0.6.1}/src/table2rules.egg-info/top_level.txt +0 -0
  28. {table2rules-0.5.2 → table2rules-0.6.1}/tests/test_correctness_oracle.py +0 -0
  29. {table2rules-0.5.2 → table2rules-0.6.1}/tests/test_determinism.py +0 -0
  30. {table2rules-0.5.2 → table2rules-0.6.1}/tests/test_public_api.py +0 -0
  31. {table2rules-0.5.2 → table2rules-0.6.1}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.5.2 → table2rules-0.6.1}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.2
3
+ Version: 0.6.1
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.5.2"
7
+ version = "0.6.1"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -211,6 +211,11 @@ def _build_rules(grid) -> List[LogicRule]:
211
211
  for row_idx in range(len(grid)):
212
212
  if row_idx in rows_with_value:
213
213
  continue
214
+ # A label-only row promoted to a row-group ancestor (scope="rowgroup")
215
+ # is threaded into the value lines beneath it — emitting it here too
216
+ # would duplicate it as an orphan label.
217
+ if any(grid[row_idx][c].get("scope") == "rowgroup" for c in range(n_cols)):
218
+ continue
214
219
  # Anchor the rule at the row's data column so it satisfies the quality
215
220
  # gate's "rules originate from <td>" invariant. A row with no <td> at
216
221
  # all is a true full-width <th colspan> divider — already handled as a
@@ -247,6 +252,220 @@ def _build_rules(grid) -> List[LogicRule]:
247
252
  return rules
248
253
 
249
254
 
255
+ def _mark_rowgroup_bands(grid) -> None:
256
+ """Promote value-region-wide body dividers to ``<th scope="rowgroup">`` so
257
+ the maze threads them into each value line's row path as bounded, nested
258
+ row-group ancestors — the row-side counterpart of the multi-level column
259
+ header path.
260
+
261
+ A candidate is a body cell whose span reaches the last column and covers a
262
+ majority of the grid (``is_full_width_note`` geometry): a section band
263
+ (full width) or a group header / description spanning the value region. A
264
+ candidate is promoted only when its extent contains at least one real data
265
+ row — so a standalone trailing note (which groups nothing) is left as a
266
+ note and still emitted, never stranded as an empty-extent rowgroup. Nested
267
+ candidates are bounded by colspan: a band's extent ends at the next
268
+ candidate whose span is equal or wider. Cells already marked
269
+ ``scope="rowgroup"`` by the source are honored as-is.
270
+ """
271
+ if not grid or not grid[0]:
272
+ return
273
+ n_cols = len(grid[0])
274
+ # Column-header texts. A full-width body cell that merely repeats a column
275
+ # header (a units caption like "(In thousands, except per share data)"
276
+ # reprinted between sections) is an annotation, not a row-group divider —
277
+ # promoting it would stamp it onto every row path (where it is already noise
278
+ # in the column path). Exclude such echoes.
279
+ header_texts = {
280
+ (cell.get("text") or "").strip().lower()
281
+ for row in grid
282
+ for cell in row
283
+ if cell and cell.get("is_thead") and (cell.get("text") or "").strip()
284
+ }
285
+ candidates = [] # (row, col, colspan)
286
+ for r in range(len(grid)):
287
+ for c in range(n_cols):
288
+ cell = grid[r][c]
289
+ if not cell or cell.get("is_span_copy"):
290
+ continue
291
+ if cell.get("is_thead") or cell.get("is_header_row"):
292
+ continue
293
+ text = (cell.get("text") or "").strip()
294
+ if not text:
295
+ continue
296
+ if text.lower() in header_texts:
297
+ continue
298
+ if is_full_width_note(c, cell.get("colspan", 1), n_cols):
299
+ candidates.append((r, c, cell.get("colspan", 1)))
300
+ candidate_rows = {r for (r, _c, _cs) in candidates}
301
+
302
+ def _next_band_below(after_row: int, min_colspan: int) -> int:
303
+ for rr in range(after_row + 1, len(grid)):
304
+ if any(r == rr and cs >= min_colspan for (r, _c, cs) in candidates):
305
+ return rr
306
+ return len(grid)
307
+
308
+ for r, c, cs in candidates:
309
+ extent_end = _next_band_below(r, cs) - 1
310
+ has_data_row = False
311
+ for rr in range(r + 1, extent_end + 1):
312
+ if rr in candidate_rows:
313
+ continue
314
+ if any(
315
+ grid[rr][cc]["type"] == "td" and (grid[rr][cc].get("text") or "").strip()
316
+ for cc in range(n_cols)
317
+ ):
318
+ has_data_row = True
319
+ break
320
+ if not has_data_row:
321
+ continue
322
+ for cc in range(c, min(c + cs, n_cols)):
323
+ grid[r][cc]["type"] = "th"
324
+ grid[r][cc]["scope"] = "rowgroup"
325
+
326
+
327
+ def _mark_label_only_rowgroups(grid) -> None:
328
+ """Promote *label-only rows* to ``<th scope="rowgroup">`` so the maze threads
329
+ them into each value line's row path, the row-side counterpart of the
330
+ full-width band handled by :func:`_mark_rowgroup_bands`.
331
+
332
+ A label-only row is a body row whose value (``<td>``) columns are all empty
333
+ while a leading label column carries text — the ``Label | Value`` form
334
+ pervasive in financial/insurance schedules (``9. Trip Cancellation | (empty)``
335
+ above its value rows). Unlike a full-width band the label cell does *not*
336
+ span the value region; the other columns are simply empty, so
337
+ ``is_full_width_note`` geometry never sees it. Without this pass the row is
338
+ emitted as an orphaned ``is_label`` rule and the values beneath it lose their
339
+ group identity.
340
+
341
+ Detection is geometric, not flag-based: a row with no value-bearing ``<td>``
342
+ but exactly one non-empty body ``<th>`` label source cell. (Row-label
343
+ columns are already promoted to ``<th scope="row">`` upstream — Signal A/B/C
344
+ in grid_parser and simple_repair — so "no non-empty ``<td>``" means "no
345
+ value".)
346
+
347
+ The single-label-cell requirement is what separates a group header from a
348
+ data row whose *designated* value columns merely happen to be empty. A genuine
349
+ group header carries one title ("9. Trip Cancellation", possibly spanning the
350
+ first N>1 columns via one ``colspan`` cell). A data row whose value columns
351
+ are blank ("Average: | 80.2 | 10.7 | 3.3", or a summary row under a header
352
+ that over-promoted numeric columns to row labels) spreads several distinct
353
+ values across its label cells — threading those as a group path would invent
354
+ a breadcrumb and misattribute it to the rows below. Such rows stay on the
355
+ ``is_label`` preservation path, unchanged.
356
+
357
+ Stacking and extent (no content-aware level inference):
358
+
359
+ * A maximal run of *consecutive* label-only rows forms one header stack. Its
360
+ members are threaded as nested ancestors in row order — a title followed by
361
+ a description (``10. Travel Delay`` then ``If the departure…``) both land in
362
+ the path, title first.
363
+ * A stack's extent runs from just below the stack down to the row before the
364
+ next stack OR the next full-width band, whichever comes first — so a group
365
+ never leaks into the next line-item or across a section divider.
366
+ * A stack is promoted only when its extent holds a real value row (parity
367
+ with the full-width-note guard): a trailing label that groups nothing is
368
+ left for the ``is_label`` preservation path, never stranded as an
369
+ empty-extent rowgroup.
370
+
371
+ The stored ``rowgroup_extent_end`` is what the maze honors for these bands;
372
+ full-width bands keep their colspan-bounded extent. The two compose: a
373
+ section band (wider) and a label-only group (narrower) nest consistently.
374
+ """
375
+ if not grid or not grid[0]:
376
+ return
377
+ n_rows = len(grid)
378
+ n_cols = len(grid[0])
379
+
380
+ def _is_body_row(r: int) -> bool:
381
+ return not any(
382
+ grid[r][c].get("is_thead") or grid[r][c].get("is_header_row") for c in range(n_cols)
383
+ )
384
+
385
+ def _has_value(r: int) -> bool:
386
+ return any(
387
+ grid[r][c]["type"] == "td" and (grid[r][c].get("text") or "").strip()
388
+ for c in range(n_cols)
389
+ )
390
+
391
+ def _label_cols(r: int) -> List[int]:
392
+ cols: List[int] = []
393
+ for c in range(n_cols):
394
+ cell = grid[r][c]
395
+ if cell.get("is_thead") or cell.get("is_header_row"):
396
+ continue
397
+ if cell["type"] != "th":
398
+ continue
399
+ if cell.get("is_span_copy"):
400
+ # A span copy of a label cell originating in this same row is
401
+ # part of a multi-column label (label spans the first N>1
402
+ # columns); promote it too. A span copy reaching down from a
403
+ # row above is not a label of this row.
404
+ origin = cell.get("origin", (r, c))
405
+ if origin[0] != r:
406
+ continue
407
+ if not (grid[origin[0]][origin[1]].get("text") or "").strip():
408
+ continue
409
+ elif not (cell.get("text") or "").strip():
410
+ continue
411
+ cols.append(c)
412
+ return cols
413
+
414
+ def _single_label_origin(r: int) -> bool:
415
+ # A group header is exactly one label source cell (a title, possibly
416
+ # colspan'd). More than one distinct non-empty label cell means a data
417
+ # row, not a divider — do not thread it.
418
+ origins = set()
419
+ for c in _label_cols(r):
420
+ cell = grid[r][c]
421
+ origins.add(cell.get("origin", (r, c)) if cell.get("is_span_copy") else (r, c))
422
+ return len(origins) == 1
423
+
424
+ # A row already carrying a rowgroup cell (a full-width band promoted above,
425
+ # or a source scope="rowgroup") is a boundary, not a label-only candidate.
426
+ band_rows = {
427
+ r for r in range(n_rows) for c in range(n_cols) if grid[r][c].get("scope") == "rowgroup"
428
+ }
429
+
430
+ is_label_row = [
431
+ _is_body_row(r)
432
+ and r not in band_rows
433
+ and not _has_value(r)
434
+ and bool(_label_cols(r))
435
+ and _single_label_origin(r)
436
+ for r in range(n_rows)
437
+ ]
438
+
439
+ r = 0
440
+ while r < n_rows:
441
+ if not is_label_row[r]:
442
+ r += 1
443
+ continue
444
+ # Gather the maximal consecutive run of label-only rows.
445
+ s_start = r
446
+ while r + 1 < n_rows and is_label_row[r + 1]:
447
+ r += 1
448
+ s_end = r
449
+ r += 1 # advance past the stack for the outer loop
450
+
451
+ # Extent: down to the row before the next boundary (next label stack or
452
+ # full-width band). Bounded by a value row's presence.
453
+ extent_end = n_rows - 1
454
+ for rr in range(s_end + 1, n_rows):
455
+ if is_label_row[rr] or rr in band_rows:
456
+ extent_end = rr - 1
457
+ break
458
+ has_data_row = any(_has_value(rr) for rr in range(s_end + 1, extent_end + 1))
459
+ if not has_data_row:
460
+ continue
461
+
462
+ for rr in range(s_start, s_end + 1):
463
+ for c in _label_cols(rr):
464
+ grid[rr][c]["type"] = "th"
465
+ grid[rr][c]["scope"] = "rowgroup"
466
+ grid[rr][c]["rowgroup_extent_end"] = extent_end
467
+
468
+
250
469
  def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
251
470
  """Runs the full pipeline and returns rules plus the gate verdict.
252
471
 
@@ -264,6 +483,8 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
264
483
  if not grid:
265
484
  return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
266
485
 
486
+ _mark_rowgroup_bands(grid)
487
+ _mark_label_only_rowgroups(grid)
267
488
  rules = clean_rules(_build_rules(grid))
268
489
  gate = assess_confidence(grid, rules)
269
490
  if not gate.ok:
@@ -465,6 +465,28 @@ def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
465
465
  continue
466
466
  promote_cols.add(c)
467
467
 
468
+ # --- Signal C: stub-dimension columns under the leftmost top-level
469
+ # header group ---
470
+ # When the leftmost top-level (row 0) header spans more than one column
471
+ # AND a distinct value-header group exists to its right, that leftmost
472
+ # group is the row-label dimension (e.g. a "SECTION" header spanning the
473
+ # rownum and person-class columns, beside a "MAXIMUM LIMIT" value group).
474
+ # Its descriptor columns are row labels even though they carry thead text
475
+ # — promoting them threads the row identity (the person-class) into each
476
+ # value line, not just the leading rownum, mirroring the column path.
477
+ top0 = grid[0][0] if grid and grid[0] else None
478
+ if top0 and top0.get("is_thead"):
479
+ stub_origin = top0.get("origin", (0, 0)) if top0.get("is_span_copy") else (0, 0)
480
+ stub_cell = grid[stub_origin[0]][stub_origin[1]]
481
+ stub_width = stub_cell.get("colspan", 1) if stub_cell else 1
482
+ has_value_group_right = stub_width < max_cols and any(
483
+ has_thead_text[c] for c in range(stub_width, max_cols)
484
+ )
485
+ if stub_width >= 2 and has_value_group_right:
486
+ for c in range(stub_width):
487
+ if _descriptor_like(c):
488
+ promote_cols.add(c)
489
+
468
490
  if promote_cols:
469
491
  for c in sorted(promote_cols):
470
492
  for r in range(data_start_row_idx, len(grid)):
@@ -116,51 +116,88 @@ def find_headers_for_cell(
116
116
  if scope == "row":
117
117
  continue
118
118
 
119
- # Locate the origin for scope and rowspan lookup.
119
+ # scope='rowgroup' bands are handled uniformly in Step 4 (which also
120
+ # reaches bands spanning the data column when the row-label is
121
+ # empty), so they can be ordered across columns by nesting level.
122
+ if scope == "rowgroup":
123
+ continue
124
+
125
+ # Non-scope-rowgroup <th> cells outside thead are only accepted from
126
+ # the explicit header block (headless tables where header detection
127
+ # promoted a row).
128
+ if not cell.get("is_header_row", False):
129
+ continue
130
+
131
+ # Locate the origin for dedup.
120
132
  if cell.get("is_span_copy", False):
121
133
  origin = cell.get("origin", (r, header_col))
122
- origin_cell = grid[origin[0]][origin[1]]
123
134
  else:
124
135
  origin = (r, header_col)
125
- origin_cell = cell
126
-
127
- if scope == "rowgroup":
128
- # A rowgroup header ancestors rows within its extent:
129
- # rowspan > 1 → extent = [origin_row, origin_row + rowspan - 1]
130
- # (the rowspan itself bounds the group, as in
131
- # a <th scope="rowgroup" rowspan="2"> pattern)
132
- # rowspan == 1 → extent = [origin_row, next_rowgroup - 1]
133
- # (a single-cell divider row like a FinTabNet
134
- # year label runs until the next such divider
135
- # in the same column)
136
- origin_row, origin_col = origin
137
- origin_rowspan = origin_cell.get("rowspan", 1)
138
- if origin_rowspan > 1:
139
- extent_end = origin_row + origin_rowspan - 1
140
- else:
141
- extent_end = len(grid) - 1
142
- for rr in range(origin_row + 1, len(grid)):
143
- other = grid[rr][origin_col]
144
- if (
145
- other
146
- and not other.get("is_span_copy", False)
147
- and other.get("scope") == "rowgroup"
148
- ):
149
- extent_end = rr - 1
150
- break
151
- if row > extent_end:
152
- continue
153
- else:
154
- # Non-scope-rowgroup <th> cells outside thead are only
155
- # accepted from the explicit header block (headless
156
- # tables where the header detection promoted a row).
157
- is_header_row = cell.get("is_header_row", False)
158
- if not is_header_row:
159
- continue
160
136
 
161
137
  if origin not in seen_origins:
162
138
  seen_origins.add(origin)
163
139
  # Insert at the beginning to maintain hierarchy
164
140
  row_headers.insert(row_header_columns.index(header_col), cell["text"])
165
141
 
142
+ # --- 4. Row-group bands ---
143
+ # A band / group header ancestors the data rows within its extent. Bands are
144
+ # collected from the data cell's own column AND every row-label column: the
145
+ # own column reaches bands that span the value region even when this row's
146
+ # label cell is empty (which would otherwise drop the band, e.g. an
147
+ # unlabeled continuation row under a group divider); the row-label columns
148
+ # reach narrow stub-column dividers (a FinTabNet year label). Extent is
149
+ # bounded by COLSPAN — a band ends at the next band whose span is equal or
150
+ # wider — so a narrower inner group header does not close an outer one.
151
+ # Bands are ordered topmost-first (origin row ascending) and prepended, so
152
+ # the row path reads outer-band > inner-group > row-labels, mirroring the
153
+ # multi-level column path.
154
+ bands: List[Tuple[int, str]] = [] # (origin_row, text)
155
+ for scan_col in [col, *row_header_columns]:
156
+ for r in range(row - 1, -1, -1):
157
+ cell = grid[r][scan_col]
158
+ if not cell or not cell.get("text", "").strip():
159
+ continue
160
+ if cell["type"] != "th" or cell.get("is_thead", False):
161
+ continue
162
+ if cell.get("scope") != "rowgroup":
163
+ continue
164
+ if cell.get("is_span_copy", False):
165
+ origin = cell.get("origin", (r, scan_col))
166
+ origin_cell = grid[origin[0]][origin[1]]
167
+ else:
168
+ origin = (r, scan_col)
169
+ origin_cell = cell
170
+ if origin in seen_origins:
171
+ continue
172
+ origin_row, origin_col = origin
173
+ my_colspan = origin_cell.get("colspan", 1)
174
+ origin_rowspan = origin_cell.get("rowspan", 1)
175
+ stored_extent = origin_cell.get("rowgroup_extent_end")
176
+ if stored_extent is not None:
177
+ # Label-only bands carry an explicit extent (the run of value
178
+ # rows they group, bounded by the next stack or section band),
179
+ # because their colspan=1 label cannot encode nesting depth the
180
+ # way a full-width band's width does.
181
+ extent_end = stored_extent
182
+ elif origin_rowspan > 1:
183
+ extent_end = origin_row + origin_rowspan - 1
184
+ else:
185
+ extent_end = len(grid) - 1
186
+ for rr in range(origin_row + 1, len(grid)):
187
+ other = grid[rr][origin_col]
188
+ if (
189
+ other
190
+ and not other.get("is_span_copy", False)
191
+ and other.get("scope") == "rowgroup"
192
+ and other.get("colspan", 1) >= my_colspan
193
+ ):
194
+ extent_end = rr - 1
195
+ break
196
+ if row > extent_end:
197
+ continue
198
+ seen_origins.add(origin)
199
+ bands.append((origin_row, cell["text"]))
200
+ bands.sort(key=lambda b: b[0])
201
+ row_headers[:0] = [text for _row, text in bands]
202
+
166
203
  return row_headers, col_headers
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.2
3
+ Version: 0.6.1
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes