table2rules 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ """Rules exporter — the native table2rules format.
2
+
3
+ One self-contained rule per line:
4
+
5
+ <row-path> | <col-path>: <value>
6
+
7
+ Where row-path and col-path join nested header levels with ' > '.
8
+ Full header ancestry on every line so an LLM never loses column
9
+ context across rows. Informed by TIDE (ICLR 2025) and ASTRA (2025).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import List
15
+
16
+ from ..models import LogicRule
17
+
18
+ PATH_SEP = " > "  # joins nested header levels inside one row path or column path
19
+ ROW_COL_SEP = " | "  # sits between the row path and the column path of a rule line
20
+
21
+
22
class RulesExporter:
    """Exporter for the native table2rules line format.

    Each rule becomes one self-contained line shaped like
    ``<row-path> | <col-path>: <value>``, where the paths join nested header
    levels with ``PATH_SEP``.
    """

    name = "rules"

    def export_rules(self, rules: List["LogicRule"]) -> List[str]:
        """Render *rules* as text lines in reading order.

        Rules are sorted by ``rule.position``. Rules that format to an empty
        string are skipped.

        Dedup is *origin-aware*: a single source cell expanded across
        multiple positions via rowspan/colspan can produce identical lines,
        which are collapsed to the first occurrence. Two different source
        cells that happen to render identically (e.g. two rows each with
        ``Qty: 1``) are both kept, because dropping either would silently
        lose data. Rules whose ``origin`` is ``None`` are never deduplicated.

        Args:
            rules: Rules exposing ``position``, ``origin``, ``outcome``,
                ``row_headers`` and ``col_headers``.

        Returns:
            The formatted lines, in position order.
        """
        ordered = sorted(rules, key=lambda r: r.position)
        lines: List[str] = []
        # origin -> every line already emitted for that source cell. A set is
        # required (rather than "the last line seen"): a cell spanning both
        # rows and columns yields lines in the order A, B, A, B, and the
        # repeats are not adjacent.
        emitted_by_origin: dict = {}
        for rule in ordered:
            line = self._format_rule(rule)
            if not line:
                continue
            origin = rule.origin
            if origin is not None:
                emitted = emitted_by_origin.setdefault(origin, set())
                if line in emitted:
                    continue
                emitted.add(line)
            lines.append(line)
        return lines

    def export_flat(self, cell_rows: List[List[str]]) -> List[str]:
        """Join each row's cells with ``" | "``, skipping all-empty rows.

        Used when no header information is available (gate failure), so the
        output carries no row/column paths.
        """
        return [" | ".join(row) for row in cell_rows if any(row)]

    @staticmethod
    def _format_rule(rule: "LogicRule") -> str:
        """Format one rule as a line; return ``""`` when its value is blank.

        Blank header entries are dropped and the rest are stripped before
        being joined with ``PATH_SEP``. Whichever of the row path and column
        path is non-empty is emitted; with neither, only the value is
        returned.
        """
        value = rule.outcome.strip()
        if not value:
            return ""
        row_path = PATH_SEP.join(h.strip() for h in rule.row_headers if h.strip())
        col_path = PATH_SEP.join(h.strip() for h in rule.col_headers if h.strip())

        if row_path and col_path:
            return f"{row_path}{ROW_COL_SEP}{col_path}: {value}"
        if col_path:
            return f"{col_path}: {value}"
        if row_path:
            return f"{row_path}: {value}"
        return value
@@ -0,0 +1,487 @@
1
+ import re
2
+ from typing import Any, Dict, List, Optional, cast
3
+
4
+ from bs4 import NavigableString, Tag
5
+
6
+ from .simple_repair import detect_header_block
7
+ from .spans import assert_grid_size, clamped_span
8
+
9
+
10
def clean_text(text: str) -> str:
    """Normalize raw cell text into a single clean line.

    Falsy input yields ``""``. Otherwise the ``&nbsp;`` and ``&amp;``
    entities are decoded, then a fixed sequence of regex substitutions is
    applied in order, and the result is stripped.
    """
    if not text:
        return ""

    # Basic HTML entity cleanup (only these two entities are handled).
    cleaned = text.replace("&nbsp;", " ").replace("&amp;", "&")

    # Order matters: each substitution sees the output of the previous one.
    substitutions = (
        # Strip residual HTML tags if any slipped through.
        (r"<[^>]+>", " "),
        # Double-dollar patterns: "S$$3,000" -> "S$3,000", "$$200,000" -> "$200,000".
        (r"\bS\$\$(\d)", r"S$\1"),
        (r"\$\$(\d)", r"$\1"),
        # Trailing standalone "$" after a word/number: "per Sickness$" -> "per Sickness".
        (r"(\w)\$(\s|$)", r"\1\2"),
        # Collapse every whitespace run to one space.
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()
34
+
35
+
36
+ def _is_textualish(text: str) -> bool:
37
+ """Return True when text carries alphabetic descriptor content.
38
+
39
+ Uses ``str.isalpha`` on individual characters so the check is Unicode-aware
40
+ — a cell containing any letter in any writing system (Latin, Cyrillic, CJK,
41
+ Arabic, Devanagari, etc.) counts as textual. This is the single content
42
+ signal the parser relies on; the alphabetic-vs-numeric distinction is
43
+ universal across writing systems ("letters label, digits measure").
44
+ """
45
+ if not text:
46
+ return False
47
+ return any(ch.isalpha() for ch in text)
48
+
49
+
50
def get_row_cells(row, table) -> List:
    """
    Collect the logical ``<td>``/``<th>`` cells of *row*.

    Every td/th found anywhere below the row is considered, so sibling cells
    that ended up nested because of broken closing tags are still returned.
    A candidate is kept only when its nearest ``<tr>`` ancestor is *row* and
    its nearest ``<table>`` ancestor is *table*, which drops nested-table cells.
    """
    logical_cells = []
    for candidate in row.find_all(["td", "th"]):
        if candidate.find_parent("tr") is not row:
            continue
        if candidate.find_parent("table") is not table:
            continue
        logical_cells.append(candidate)
    return logical_cells
61
+
62
+
63
def extract_cell_text(cell) -> str:
    """
    Extract the visible text of a logical cell.

    Walks every string node below *cell* and keeps those whose nearest
    ``<td>``/``<th>`` ancestor is *cell* itself, so text belonging to
    malformed nested sibling cells (which can appear after HTML recovery) is
    excluded. The kept fragments are stripped, joined with single spaces and
    passed through ``clean_text``.

    Comments, declarations, doctypes and processing instructions are skipped:
    Beautiful Soup models them as ``NavigableString`` subclasses, so a plain
    ``isinstance(node, NavigableString)`` check would otherwise leak markup
    such as ``<!-- note -->`` into the cell text.
    """
    # Function-scope import keeps this block self-contained; bs4 is already a
    # module-level dependency of this file.
    from bs4 import Comment, Declaration, Doctype, ProcessingInstruction

    non_content_types = (Comment, Declaration, Doctype, ProcessingInstruction)

    parts: List[str] = []
    for node in cell.descendants:
        if not isinstance(node, NavigableString):
            continue
        if isinstance(node, non_content_types):
            continue

        text = str(node).strip()
        if not text:
            continue

        parent = node.parent
        if parent is None:
            continue

        # Find the closest enclosing cell of this string node.
        nearest_cell: Optional[Tag]
        if parent.name in ("td", "th"):
            nearest_cell = parent
        else:
            ancestor = parent.find_parent(["td", "th"])
            nearest_cell = ancestor if isinstance(ancestor, Tag) else None

        # Text owned by a nested cell is not part of this cell.
        if nearest_cell is cell:
            parts.append(text)

    return clean_text(" ".join(parts))
92
+
93
+
94
def parse_table_to_grid(table: Tag) -> List[List[Dict[str, Any]]]:
    """Expand an HTML ``<table>`` into a rectangular grid of cell dicts.

    Rows and cells that belong to nested tables are ignored, and only a
    ``<thead>``/``<tfoot>`` owned by *table* itself is honoured. Every
    rowspan/colspan is expanded so each grid position holds a dict; positions
    covered by a span (other than its top-left corner) hold a copy flagged
    ``is_span_copy`` with ``origin`` set to the ``(row, col)`` of the source
    cell.

    Cells taken from the markup carry the keys ``text``, ``type``,
    ``rowspan``, ``colspan``, ``scope``, ``is_footer``, ``is_thead``,
    ``has_thead``, ``is_header_row`` and ``header_depth``. Positions that no
    cell reached are filled with an empty ``td`` placeholder holding only
    ``text``, ``type``, ``rowspan``, ``colspan`` and ``has_thead``.

    Args:
        table: The ``<table>`` element to parse.

    Returns:
        One list per table row, each with the same number of cell dicts; an
        empty list when the table has no rows of its own or no cells.
    """
    # 1. Get all top-level rows (rows of nested tables are skipped).
    actual_rows = [row for row in table.find_all("tr") if row.find_parent("table") is table]
    if not actual_rows:
        return []

    def _in_own_section(node, section_name: str) -> bool:
        # ``find_parent`` walks every ancestor, so when this table is itself
        # nested inside another table's <thead>/<tfoot> the outer section
        # would match. Only a section owned by *this* table counts.
        section = node.find_parent(section_name)
        return section is not None and section.find_parent("table") is table

    # --- UNIVERSAL HEADER LOGIC ---

    data_start_row_idx = 0
    # ``find`` is recursive and would happily return a nested table's
    # <thead>; restrict the lookup to sections whose table is this one.
    thead = next(
        (sec for sec in table.find_all("thead") if sec.find_parent("table") is table),
        None,
    )
    has_thead = thead is not None

    if thead is not None:
        # --- Logic for tables WITH <thead> ---
        data_start_row_idx = len(thead.find_all("tr", recursive=False))

    else:
        # --- Logic for "Headless" tables (NO <thead>) ---

        # Step 1: Prefer an explicit header row that uses <th>.
        # A column-header row has <th> cells whose scope is NOT 'row':
        # scope='row' marks a row-stub header (row label), not a column
        # label, so those cells cannot make a row qualify as the primary
        # column-header row. Otherwise a single mid-body <th scope='row'>
        # summary row (e.g. an explicit-markup totals line like
        # <tr><th scope="row">Total</th>...</tr>) would be mistaken for the
        # table header.
        def _is_col_header_cell(cell):
            return cell.name == "th" and cell.get("scope") != "row"

        header_row_idx = None
        for idx, row in enumerate(actual_rows):
            cells = get_row_cells(row, table)
            if not cells or len(cells) == 1:
                # Skip empty or title rows
                continue

            has_col_th = any(_is_col_header_cell(cell) for cell in cells)
            all_col_th_or_empty = all(
                _is_col_header_cell(cell) or not cell.get_text(strip=True) for cell in cells
            )

            if has_col_th and all_col_th_or_empty:
                header_row_idx = idx
                break

        if header_row_idx is not None:
            # We have a clear header row made of <th>
            main_header_row_idx = header_row_idx

            # If some cells in this header row span multiple body rows,
            # use the maximum rowspan to determine how many header rows exist.
            cells = get_row_cells(actual_rows[main_header_row_idx], table)
            header_row_span = max(clamped_span(cell.get("rowspan", 1)) for cell in cells) or 1

            # Data starts after the header row span
            data_start_row_idx = main_header_row_idx + header_row_span

        else:
            # Step 2: Fallback – no explicit <th> header row.
            # Re-use the original rowspan-based heuristic on the first cell.
            main_header_row_idx = 0
            header_row_span = 1
            found_header_row = False

            for idx, row in enumerate(actual_rows):
                cells = get_row_cells(row, table)
                if not cells or len(cells) == 1:
                    continue

                first_cell_rowspan = clamped_span(cells[0].get("rowspan", 1))
                if first_cell_rowspan > 1:
                    main_header_row_idx = idx
                    header_row_span = first_cell_rowspan
                    found_header_row = True
                    break

            if found_header_row:
                data_start_row_idx = main_header_row_idx + header_row_span
            else:
                # Step 3: structural fallback via the same universal rule
                # that simple_repair.Fix 4 uses. This path runs only when
                # simple_repair didn't promote (e.g. upstream repairs left
                # the table in a shape Fix 4 couldn't normalize) — it keeps
                # the parser aligned with the repair's structural definition
                # rather than reintroducing a separate "row 0 all non-empty"
                # heuristic. If no header block is confidently identified,
                # leave data_start_row_idx = 0 so every row is treated as
                # data and the confidence gate decides whether to emit rules
                # or flat.
                detection = detect_header_block(actual_rows)
                if detection is not None:
                    data_start_row_idx = detection[0]
                else:
                    data_start_row_idx = 0

    # --- END HEADER HEURISTIC ---

    # --- KEY-VALUE TABLE DETECTION ---
    # Detect simple key-value tables (no thead, 2 columns, th+td pattern).
    # This prevents row headers from being treated as column headers.
    is_key_value_table = False
    if not has_thead:
        # Check if ALL rows follow the key-value pattern
        is_key_value_table = True
        for row in actual_rows:
            cells = get_row_cells(row, table)
            # Skip empty rows
            if not cells:
                continue
            # Must have exactly 2 cells
            if len(cells) != 2:
                is_key_value_table = False
                break
            # First must be th, second must be td
            if cells[0].name != "th" or cells[1].name != "td":
                is_key_value_table = False
                break
            # No colspan/rowspan on either cell (keep it simple)
            if any(
                clamped_span(cell.get(attr, 1)) > 1
                for cell in cells
                for attr in ("colspan", "rowspan")
            ):
                is_key_value_table = False
                break
        # Key-value tables have no header rows — every row is data.
        if is_key_value_table:
            data_start_row_idx = 0
    # --- END KEY-VALUE DETECTION ---

    # Phase 1: Calculate dimensions
    max_cols = 0
    occupied = set()

    for row_idx, row in enumerate(actual_rows):
        cells = get_row_cells(row, table)
        logical_col = 0

        for cell in cells:
            # Skip positions already claimed by a rowspan from a row above.
            while (row_idx, logical_col) in occupied:
                logical_col += 1
            rowspan = clamped_span(cell.get("rowspan", 1))
            colspan = clamped_span(cell.get("colspan", 1))
            # NOTE(review): presumably rejects oversized grids — confirm in spans.py.
            assert_grid_size(len(actual_rows), logical_col + colspan)
            for r in range(min(rowspan, len(actual_rows) - row_idx)):
                for c in range(colspan):
                    occupied.add((row_idx + r, logical_col + c))
            logical_col += colspan
        max_cols = max(max_cols, logical_col)

    if max_cols == 0:
        return []

    assert_grid_size(len(actual_rows), max_cols)

    # Clamp inferred structure to valid ranges
    data_start_row_idx = max(0, min(data_start_row_idx, len(actual_rows)))

    # Phase 2: Create empty grid
    grid: List[List[Optional[Dict[str, Any]]]] = [
        [None for _ in range(max_cols)] for _ in range(len(actual_rows))
    ]

    # Phase 3: Fill grid
    for row_idx, row in enumerate(actual_rows):
        cells = get_row_cells(row, table)
        logical_col = 0

        # A row is a "header row" if it's before the data start (headless only)
        is_header_row = (row_idx < data_start_row_idx) and not has_thead

        for cell in cells:
            while logical_col < max_cols and grid[row_idx][logical_col] is not None:
                logical_col += 1
            if logical_col >= max_cols:
                break

            rowspan = clamped_span(cell.get("rowspan", 1))
            colspan = clamped_span(cell.get("colspan", 1))

            is_thead = _in_own_section(cell, "thead")
            is_footer = _in_own_section(cell, "tfoot")

            # Universal cell_type logic
            cell_type = cell.name

            # Heuristic 1: header row (for headless).
            # Skip this for key-value tables - they don't have header rows.
            if is_header_row and cell.name == "td" and not is_key_value_table:
                cell_type = "th"

            # Heuristic 1b: <td> cells inside <thead> are structural headers
            # regardless of tag. Word / Markdown-to-HTML converters, and many
            # CMS outputs, emit <thead><tr><td><b>Header</b></td></tr></thead>
            # — the <thead> wrapper is the authoritative signal. Promote to
            # <th> so downstream header-walking treats these as column
            # headers.
            if cell.name == "td" and is_thead:
                cell_type = "th"

            # Override scope for key-value tables
            cell_scope = cell.get("scope")
            if is_key_value_table and logical_col == 0 and cell_type == "th":
                cell_scope = "row"

            cell_data = {
                "text": extract_cell_text(cell),
                "type": cell_type,
                "rowspan": rowspan,
                "colspan": colspan,
                "scope": cell_scope,
                "is_footer": is_footer,
                "is_thead": is_thead,
                "has_thead": has_thead,
                "is_header_row": is_header_row,
                "header_depth": data_start_row_idx if has_thead else 0,
            }

            for r_offset in range(min(rowspan, len(grid) - row_idx)):
                for c_offset in range(colspan):
                    target_row = row_idx + r_offset
                    target_col = logical_col + c_offset
                    if target_row < len(grid) and target_col < max_cols:
                        if r_offset == 0 and c_offset == 0:
                            grid[target_row][target_col] = cell_data
                        else:
                            # A fresh 1x1 copy per covered position, pointing
                            # back at the source cell.
                            grid[target_row][target_col] = {
                                **cell_data,
                                "rowspan": 1,
                                "colspan": 1,
                                "is_span_copy": True,
                                "origin": (row_idx, logical_col),
                            }
            logical_col += colspan

    # Phase 3.5: Promote dimensional body columns to row headers.
    #
    # Universal structural signals:
    #   A) Rowspan signal: leading body columns with rowspan>1 origins are
    #      dimensional/grouping columns.
    #   B) Unlabeled descriptor signal: a body column has non-empty cells but
    #      no non-empty <thead> header text at that column and is text-like.
    #
    # Signal B is guarded to avoid over-promotion: only contiguous descriptor
    # columns from the left edge (or directly after a promoted descriptor
    # column) are promoted.
    if has_thead and data_start_row_idx < len(grid):
        # --- Per-column stats over body origins ---
        body_nonempty = [0] * max_cols
        body_textual = [0] * max_cols
        body_th = [0] * max_cols
        has_thead_text = [False] * max_cols

        for c in range(max_cols):
            for r in range(len(grid)):
                cell = grid[r][c]
                if not cell or cell.get("is_span_copy"):
                    continue
                txt = (cell.get("text") or "").strip()
                if cell.get("is_thead", False):
                    if txt:
                        has_thead_text[c] = True
                    continue
                if r < data_start_row_idx:
                    continue
                if cell.get("is_footer", False):
                    continue
                if not txt:
                    continue
                body_nonempty[c] += 1
                if cell.get("type") == "th":
                    body_th[c] += 1
                if _is_textualish(txt):
                    body_textual[c] += 1

        def _descriptor_like(col: int) -> bool:
            # A column is "descriptor-like" iff strictly more of its
            # non-empty body cells are textual than non-textual. Using a
            # strict-majority count (integer comparison) instead of a fixed
            # ratio keeps the rule deterministic and aligned with the
            # stub-column rule in simple_repair.detect_header_block.
            n = body_nonempty[col]
            if n < 2:
                return False
            return body_textual[col] * 2 > n

        # --- Signal A: rowspan-driven leading dimensional columns ---
        body_dimensional = []
        for c in range(max_cols):
            col_has_rowspan = False
            for r in range(data_start_row_idx, len(grid)):
                cell = grid[r][c]
                if cell and not cell.get("is_span_copy") and cell.get("rowspan", 1) > 1:
                    col_has_rowspan = True
                    break
            if col_has_rowspan:
                body_dimensional.append(c)
            else:
                break  # Stop at first non-dimensional column

        # --- Cap for Signal A: full-depth header columns (multi-row headers) ---
        if data_start_row_idx >= 2:
            full_depth_count = 0
            for c in range(max_cols):
                cell = grid[0][c]
                if (
                    cell
                    and cell.get("is_thead")
                    and not cell.get("is_span_copy")
                    and cell.get("rowspan", 1) == data_start_row_idx
                ):
                    full_depth_count += 1
                else:
                    break  # Stop at first non-full-depth header
            # Cap body signal at the header-derived count
            body_dimensional = body_dimensional[:full_depth_count]

        promote_cols = set()
        if len(body_dimensional) >= 2:
            promote_cols.update(body_dimensional)
        elif len(body_dimensional) == 1:
            c0 = body_dimensional[0]
            if not has_thead_text[c0]:
                promote_cols.add(c0)

        # Seed only strong pre-promotions from upstream repair on unlabeled
        # columns. A single summary-row <th> should not turn a labeled data
        # column into a row-header column.
        for c in range(max_cols):
            if (
                body_nonempty[c] >= 2
                and not has_thead_text[c]
                and (body_th[c] / max(1, body_nonempty[c])) >= 0.60
            ):
                promote_cols.add(c)

        # --- Signal B: unlabeled descriptor columns ---
        for c in range(max_cols):
            if c in promote_cols:
                continue
            if has_thead_text[c]:
                continue
            if not _descriptor_like(c):
                continue

            left_is_dimensional = (c == 0) or ((c - 1) in promote_cols)
            left_is_descriptor = (c > 0) and _descriptor_like(c - 1)
            if not (left_is_dimensional or left_is_descriptor):
                continue
            promote_cols.add(c)

        if promote_cols:
            for c in sorted(promote_cols):
                for r in range(data_start_row_idx, len(grid)):
                    cell = grid[r][c]
                    if not cell or cell.get("is_footer", False):
                        continue
                    if cell["type"] == "td" and (cell.get("text") or "").strip():
                        cell["type"] = "th"
                        if not cell.get("scope"):
                            cell["scope"] = "row"

    # Phase 4: Fill gaps
    for r in range(len(grid)):
        for c in range(max_cols):
            if grid[r][c] is None:
                grid[r][c] = {
                    "text": "",
                    "type": "td",
                    "rowspan": 1,
                    "colspan": 1,
                    "has_thead": has_thead,
                }

    # Phase 4 guarantees every cell is non-None; cast for the public type.
    return cast(List[List[Dict[str, Any]]], grid)