table2rules 0.5.2__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {table2rules-0.5.2/src/table2rules.egg-info → table2rules-0.6.0}/PKG-INFO +1 -1
  2. {table2rules-0.5.2 → table2rules-0.6.0}/pyproject.toml +1 -1
  3. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/_core.py +73 -0
  4. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/grid_parser.py +22 -0
  5. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/maze_pathfinder.py +67 -37
  6. {table2rules-0.5.2 → table2rules-0.6.0/src/table2rules.egg-info}/PKG-INFO +1 -1
  7. {table2rules-0.5.2 → table2rules-0.6.0}/LICENSE +0 -0
  8. {table2rules-0.5.2 → table2rules-0.6.0}/README.md +0 -0
  9. {table2rules-0.5.2 → table2rules-0.6.0}/setup.cfg +0 -0
  10. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/__init__.py +0 -0
  11. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/__main__.py +0 -0
  12. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/cleanup.py +0 -0
  13. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/errors.py +0 -0
  14. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/exporters/__init__.py +0 -0
  15. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/exporters/base.py +0 -0
  16. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/exporters/rules.py +0 -0
  17. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/models.py +0 -0
  18. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/py.typed +0 -0
  19. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/quality_gate.py +0 -0
  20. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/report.py +0 -0
  21. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/simple_repair.py +0 -0
  22. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules/spans.py +0 -0
  23. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/SOURCES.txt +0 -0
  24. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/dependency_links.txt +0 -0
  25. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/entry_points.txt +0 -0
  26. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/requires.txt +0 -0
  27. {table2rules-0.5.2 → table2rules-0.6.0}/src/table2rules.egg-info/top_level.txt +0 -0
  28. {table2rules-0.5.2 → table2rules-0.6.0}/tests/test_correctness_oracle.py +0 -0
  29. {table2rules-0.5.2 → table2rules-0.6.0}/tests/test_determinism.py +0 -0
  30. {table2rules-0.5.2 → table2rules-0.6.0}/tests/test_public_api.py +0 -0
  31. {table2rules-0.5.2 → table2rules-0.6.0}/tests/test_regression_golds.py +0 -0
  32. {table2rules-0.5.2 → table2rules-0.6.0}/tests/test_robustness_mutations.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.2
3
+ Version: 0.6.0
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "table2rules"
7
- version = "0.5.2"
7
+ version = "0.6.0"
8
8
  description = "Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding."
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -247,6 +247,78 @@ def _build_rules(grid) -> List[LogicRule]:
247
247
  return rules
248
248
 
249
249
 
250
def _mark_rowgroup_bands(grid) -> None:
    """Promote value-region-wide body dividers to ``<th scope="rowgroup">``.

    Promoted cells are rewritten in place (``type`` becomes ``"th"`` and
    ``scope`` becomes ``"rowgroup"``) so the maze can thread them into each
    value line's row path as bounded, nested row-group ancestors — the
    row-side counterpart of the multi-level column header path.

    A candidate is a non-empty body cell (not a span copy, not in thead, not
    in a promoted header row) whose position and colspan satisfy
    ``is_full_width_note``: a section band or a group header / description
    spanning the value region. A candidate is promoted only when its extent
    contains at least one real data row, so a standalone trailing note (which
    groups nothing) is left as a note and never stranded as an empty-extent
    rowgroup. Nested candidates are bounded by colspan: a band's extent ends
    at the next candidate whose span is equal or wider. Cells already marked
    ``scope="rowgroup"`` by the source are not reset by this function.

    Args:
        grid: Row-major list of rows; each entry is a cell dict or a falsy
            placeholder. Column count is taken from the first row.

    Returns:
        None. ``grid`` is mutated in place.
    """
    if not grid or not grid[0]:
        return
    n_rows = len(grid)
    n_cols = len(grid[0])

    def _text(cell) -> str:
        # Normalised cell text; a missing or None "text" counts as empty.
        return (cell.get("text") or "").strip()

    # Column-header texts. A full-width body cell that merely repeats a column
    # header (a units caption like "(In thousands, except per share data)"
    # reprinted between sections) is an annotation, not a row-group divider —
    # promoting it would stamp it onto every row path. Exclude such echoes.
    header_texts = {
        _text(cell).lower()
        for row in grid
        for cell in row
        if cell and cell.get("is_thead") and _text(cell)
    }

    candidates = []  # (row, col, colspan)
    for r, row in enumerate(grid):
        # Slice instead of indexing so a row shorter than row 0 cannot raise.
        for c, cell in enumerate(row[:n_cols]):
            if not cell or cell.get("is_span_copy"):
                continue
            if cell.get("is_thead") or cell.get("is_header_row"):
                continue
            text = _text(cell)
            if not text or text.lower() in header_texts:
                continue
            colspan = cell.get("colspan", 1)
            if is_full_width_note(c, colspan, n_cols):
                candidates.append((r, c, colspan))
    if not candidates:
        return

    # Widest candidate span per row: one dict lookup answers both "is this a
    # candidate row?" and "does this row close a band of width cs?".
    widest_by_row = {}
    for r, _c, cs in candidates:
        widest_by_row[r] = max(cs, widest_by_row.get(r, 0))

    def _is_data_row(row) -> bool:
        # A real data row has at least one non-empty <td>; empty placeholder
        # cells are skipped rather than indexed.
        return any(
            cell and cell.get("type") == "td" and _text(cell)
            for cell in row[:n_cols]
        )

    for r, c, cs in candidates:
        has_data_row = False
        for rr in range(r + 1, n_rows):
            band_span = widest_by_row.get(rr)
            if band_span is not None:
                if band_span >= cs:
                    # An equal-or-wider band closes this candidate's extent.
                    break
                # A narrower nested band is neither data nor a boundary.
                continue
            if _is_data_row(grid[rr]):
                has_data_row = True
                break
        if not has_data_row:
            continue
        # Mark the origin cell and its span copies across the covered columns.
        for cell in grid[r][c:min(c + cs, n_cols)]:
            if cell:
                cell["type"] = "th"
                cell["scope"] = "rowgroup"
320
+
321
+
250
322
  def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResult]:
251
323
  """Runs the full pipeline and returns rules plus the gate verdict.
252
324
 
@@ -264,6 +336,7 @@ def _process_table_with_gate(table_html: str) -> Tuple[List[LogicRule], GateResu
264
336
  if not grid:
265
337
  return [], GateResult(ok=False, score=0.0, reasons=["empty_grid"])
266
338
 
339
+ _mark_rowgroup_bands(grid)
267
340
  rules = clean_rules(_build_rules(grid))
268
341
  gate = assess_confidence(grid, rules)
269
342
  if not gate.ok:
@@ -465,6 +465,28 @@ def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
465
465
  continue
466
466
  promote_cols.add(c)
467
467
 
468
+ # --- Signal C: stub-dimension columns under the leftmost top-level
469
+ # header group ---
470
+ # When the leftmost top-level (row 0) header spans more than one column
471
+ # AND a distinct value-header group exists to its right, that leftmost
472
+ # group is the row-label dimension (e.g. a "SECTION" header spanning the
473
+ # rownum and person-class columns, beside a "MAXIMUM LIMIT" value group).
474
+ # Its descriptor columns are row labels even though they carry thead text
475
+ # — promoting them threads the row identity (the person-class) into each
476
+ # value line, not just the leading rownum, mirroring the column path.
477
+ top0 = grid[0][0] if grid and grid[0] else None
478
+ if top0 and top0.get("is_thead"):
479
+ stub_origin = top0.get("origin", (0, 0)) if top0.get("is_span_copy") else (0, 0)
480
+ stub_cell = grid[stub_origin[0]][stub_origin[1]]
481
+ stub_width = stub_cell.get("colspan", 1) if stub_cell else 1
482
+ has_value_group_right = stub_width < max_cols and any(
483
+ has_thead_text[c] for c in range(stub_width, max_cols)
484
+ )
485
+ if stub_width >= 2 and has_value_group_right:
486
+ for c in range(stub_width):
487
+ if _descriptor_like(c):
488
+ promote_cols.add(c)
489
+
468
490
  if promote_cols:
469
491
  for c in sorted(promote_cols):
470
492
  for r in range(data_start_row_idx, len(grid)):
@@ -116,51 +116,81 @@ def find_headers_for_cell(
116
116
  if scope == "row":
117
117
  continue
118
118
 
119
- # Locate the origin for scope and rowspan lookup.
119
+ # scope='rowgroup' bands are handled uniformly in Step 4 (which also
120
+ # reaches bands spanning the data column when the row-label is
121
+ # empty), so they can be ordered across columns by nesting level.
122
+ if scope == "rowgroup":
123
+ continue
124
+
125
+ # Non-scope-rowgroup <th> cells outside thead are only accepted from
126
+ # the explicit header block (headless tables where header detection
127
+ # promoted a row).
128
+ if not cell.get("is_header_row", False):
129
+ continue
130
+
131
+ # Locate the origin for dedup.
120
132
  if cell.get("is_span_copy", False):
121
133
  origin = cell.get("origin", (r, header_col))
122
- origin_cell = grid[origin[0]][origin[1]]
123
134
  else:
124
135
  origin = (r, header_col)
125
- origin_cell = cell
126
-
127
- if scope == "rowgroup":
128
- # A rowgroup header ancestors rows within its extent:
129
- # rowspan > 1 → extent = [origin_row, origin_row + rowspan - 1]
130
- # (the rowspan itself bounds the group, as in
131
- # a <th scope="rowgroup" rowspan="2"> pattern)
132
- # rowspan == 1 → extent = [origin_row, next_rowgroup - 1]
133
- # (a single-cell divider row like a FinTabNet
134
- # year label runs until the next such divider
135
- # in the same column)
136
- origin_row, origin_col = origin
137
- origin_rowspan = origin_cell.get("rowspan", 1)
138
- if origin_rowspan > 1:
139
- extent_end = origin_row + origin_rowspan - 1
140
- else:
141
- extent_end = len(grid) - 1
142
- for rr in range(origin_row + 1, len(grid)):
143
- other = grid[rr][origin_col]
144
- if (
145
- other
146
- and not other.get("is_span_copy", False)
147
- and other.get("scope") == "rowgroup"
148
- ):
149
- extent_end = rr - 1
150
- break
151
- if row > extent_end:
152
- continue
153
- else:
154
- # Non-scope-rowgroup <th> cells outside thead are only
155
- # accepted from the explicit header block (headless
156
- # tables where the header detection promoted a row).
157
- is_header_row = cell.get("is_header_row", False)
158
- if not is_header_row:
159
- continue
160
136
 
161
137
  if origin not in seen_origins:
162
138
  seen_origins.add(origin)
163
139
  # Insert at the beginning to maintain hierarchy
164
140
  row_headers.insert(row_header_columns.index(header_col), cell["text"])
165
141
 
142
+ # --- 4. Row-group bands ---
143
+ # A band / group header ancestors the data rows within its extent. Bands are
144
+ # collected from the data cell's own column AND every row-label column: the
145
+ # own column reaches bands that span the value region even when this row's
146
+ # label cell is empty (which would otherwise drop the band, e.g. an
147
+ # unlabeled continuation row under a group divider); the row-label columns
148
+ # reach narrow stub-column dividers (a FinTabNet year label). Extent is
149
+ # bounded by COLSPAN — a band ends at the next band whose span is equal or
150
+ # wider — so a narrower inner group header does not close an outer one.
151
+ # Bands are ordered topmost-first (origin row ascending) and prepended, so
152
+ # the row path reads outer-band > inner-group > row-labels, mirroring the
153
+ # multi-level column path.
154
+ bands: List[Tuple[int, str]] = [] # (origin_row, text)
155
+ for scan_col in [col, *row_header_columns]:
156
+ for r in range(row - 1, -1, -1):
157
+ cell = grid[r][scan_col]
158
+ if not cell or not cell.get("text", "").strip():
159
+ continue
160
+ if cell["type"] != "th" or cell.get("is_thead", False):
161
+ continue
162
+ if cell.get("scope") != "rowgroup":
163
+ continue
164
+ if cell.get("is_span_copy", False):
165
+ origin = cell.get("origin", (r, scan_col))
166
+ origin_cell = grid[origin[0]][origin[1]]
167
+ else:
168
+ origin = (r, scan_col)
169
+ origin_cell = cell
170
+ if origin in seen_origins:
171
+ continue
172
+ origin_row, origin_col = origin
173
+ my_colspan = origin_cell.get("colspan", 1)
174
+ origin_rowspan = origin_cell.get("rowspan", 1)
175
+ if origin_rowspan > 1:
176
+ extent_end = origin_row + origin_rowspan - 1
177
+ else:
178
+ extent_end = len(grid) - 1
179
+ for rr in range(origin_row + 1, len(grid)):
180
+ other = grid[rr][origin_col]
181
+ if (
182
+ other
183
+ and not other.get("is_span_copy", False)
184
+ and other.get("scope") == "rowgroup"
185
+ and other.get("colspan", 1) >= my_colspan
186
+ ):
187
+ extent_end = rr - 1
188
+ break
189
+ if row > extent_end:
190
+ continue
191
+ seen_origins.add(origin)
192
+ bands.append((origin_row, cell["text"]))
193
+ bands.sort(key=lambda b: b[0])
194
+ row_headers[:0] = [text for _row, text in bands]
195
+
166
196
  return row_headers, col_headers
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: table2rules
3
- Version: 0.5.2
3
+ Version: 0.6.0
4
4
  Summary: Convert HTML tables to flat, LLM-friendly rules using spatial pathfinding.
5
5
  Author: PebbleRoad Pte Ltd
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes