exstruct 0.2.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
exstruct/core/cells.py ADDED
@@ -0,0 +1,1039 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+ from collections.abc import Sequence
5
+ from decimal import Decimal, InvalidOperation
6
+ import logging
7
+ from pathlib import Path
8
+ import re
9
+
10
+ import numpy as np
11
+ from openpyxl import load_workbook
12
+ from openpyxl.utils import get_column_letter, range_boundaries
13
+ from openpyxl.worksheet.worksheet import Worksheet
14
+ import pandas as pd
15
+ import xlwings as xw
16
+
17
+ from ..models import CellRow
18
+
19
+ logger = logging.getLogger(__name__)
20
+ _warned_keys: set[str] = set()
21
+ XL_LINESTYLE_NONE = -4142
22
+ XL_INSIDE_VERTICAL = 11
23
+ XL_INSIDE_HORIZONTAL = 12
24
+ XL_EDGE_LEFT = 7
25
+ XL_EDGE_TOP = 8
26
+ XL_EDGE_BOTTOM = 9
27
+ XL_EDGE_RIGHT = 10
28
+ MatrixInput = Sequence[Sequence[object]] | Sequence[object]
29
+
30
+ # Detection tuning parameters (can be overridden via set_table_detection_params)
31
+ _DETECTION_CONFIG = {
32
+ "table_score_threshold": 0.35,
33
+ "density_min": 0.05,
34
+ "coverage_min": 0.2,
35
+ "min_nonempty_cells": 3,
36
+ }
37
+
38
+
39
def warn_once(key: str, message: str) -> None:
    """Emit ``message`` as a warning the first time ``key`` is seen.

    Subsequent calls with the same ``key`` do nothing.
    """
    if key in _warned_keys:
        return
    logger.warning(message)
    _warned_keys.add(key)
43
+
44
+
45
def extract_sheet_cells(file_path: Path) -> dict[str, list[CellRow]]:
    """Read every sheet with pandas and return its non-empty cells as CellRow items.

    Cells are read as strings; values that are empty after stripping whitespace
    are skipped, and rows left with no cells are omitted. Row numbers are
    1-based, column keys are zero-based indices rendered as strings.
    """
    frames = pd.read_excel(file_path, header=None, sheet_name=None, dtype=str)
    result: dict[str, list[CellRow]] = {}
    for sheet_name, frame in frames.items():
        sheet_rows: list[CellRow] = []
        records = frame.fillna("").itertuples(index=False, name=None)
        for excel_row, record in enumerate(records, start=1):
            cells: dict[str, int | float | str] = {}
            for col_idx, raw in enumerate(record):
                text = "" if raw is None else str(raw)
                if text.strip():
                    cells[str(col_idx)] = _coerce_numeric_preserve_format(text)
            if cells:
                sheet_rows.append(CellRow(r=excel_row, c=cells))
        result[sheet_name] = sheet_rows
    return result
64
+
65
+
66
def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]:
    """
    Extract cells and hyperlinks per sheet.

    Returns:
        {sheet_name: [CellRow(r=..., c=..., links={"col_index": url, ...}), ...]}

    Notes:
        - Uses pandas extraction for values (same filtering as extract_sheet_cells).
        - Collects hyperlinks via openpyxl, opened with read_only=False.
        - Links are mapped by zero-based column index string (e.g., "0") to
          hyperlink.target; cells without a target are ignored.
        - Rows without any link get links=None.
    """
    cell_rows = extract_sheet_cells(file_path)
    links_by_sheet: dict[str, dict[int, dict[str, str]]] = {}

    wb = load_workbook(file_path, data_only=True, read_only=False)
    try:
        for ws in wb.worksheets:
            sheet_links: dict[int, dict[str, str]] = {}
            for row in ws.iter_rows():
                for cell in row:
                    link = getattr(cell, "hyperlink", None)
                    target = getattr(link, "target", None) if link else None
                    if not target:
                        continue
                    # zero-based to align with extract_sheet_cells
                    col_str = str(cell.col_idx - 1)
                    sheet_links.setdefault(cell.row, {})[col_str] = target
            links_by_sheet[ws.title] = sheet_links
    finally:
        # Close the workbook even when reading a sheet raises.
        wb.close()

    merged: dict[str, list[CellRow]] = {}
    for sheet_name, rows in cell_rows.items():
        sheet_links = links_by_sheet.get(sheet_name, {})
        merged[sheet_name] = [
            CellRow(r=row.r, c=row.c, links=sheet_links.get(row.r, {}) or None)
            for row in rows
        ]
    return merged
105
+
106
+
107
def shrink_to_content(  # noqa: C901
    sheet: xw.Sheet,
    top: int,
    left: int,
    bottom: int,
    right: int,
    require_inside_border: bool = False,
    min_nonempty_ratio: float = 0.0,
) -> tuple[int, int, int, int]:
    """Trim a rectangle based on cell contents and optional border heuristics.

    The rectangle is trimmed in a fixed order: leading columns, leading rows,
    trailing columns, trailing rows. A row/column is trimmed when it is empty,
    when ``require_inside_border`` is set and it has no inside border, or when
    ``min_nonempty_ratio`` is positive and its share of non-empty cells is
    below that ratio.

    Args:
        sheet: Sheet-like object providing ``range(...).value`` (and ``api``
            when ``require_inside_border`` is True).
        top, left, bottom, right: 1-based inclusive bounds of the rectangle.
        require_inside_border: Also trim rows/columns lacking an inside border.
        min_nonempty_ratio: Minimum share of non-empty cells to keep a row/column.

    Returns:
        The trimmed ``(top, left, bottom, right)``. If everything is trimmed
        the bounds may cross (e.g. ``left > right``).
    """
    raw = sheet.range((top, left), (bottom, right)).value
    vals: list[list[object]]
    if raw is None:
        vals = []
    elif not isinstance(raw, list):
        # Single cell: scalar value.
        vals = [[raw]]
    elif raw and not isinstance(raw[0], list):
        # A flat list comes back for both a single row and a single column;
        # use the requested geometry to decide which one this is.
        if left == right and bottom > top:
            vals = [[v] for v in raw]
        else:
            vals = [raw]
    else:
        vals = raw
    rows_n = len(vals)
    cols_n = len(vals[0]) if rows_n else 0

    def is_empty_value(x: object) -> bool:
        return x is None or str(x).strip() == ""

    def row_empty(i: int) -> bool:
        return cols_n == 0 or all(is_empty_value(vals[i][j]) for j in range(cols_n))

    def col_empty(j: int) -> bool:
        return rows_n == 0 or all(is_empty_value(vals[i][j]) for i in range(rows_n))

    def row_nonempty_ratio(i: int) -> float:
        if cols_n == 0:
            return 0.0
        cnt = sum(1 for j in range(cols_n) if not is_empty_value(vals[i][j]))
        return cnt / cols_n

    def col_nonempty_ratio(j: int) -> float:
        if rows_n == 0:
            return 0.0
        cnt = sum(1 for i in range(rows_n) if not is_empty_value(vals[i][j]))
        return cnt / rows_n

    def column_has_inside_border(col_idx: int) -> bool:
        if not require_inside_border:
            return False
        try:
            for r in range(top, bottom + 1):
                ls = (
                    sheet.api.Cells(r, left + col_idx)
                    .Borders(XL_INSIDE_VERTICAL)
                    .LineStyle
                )
                if ls is not None and ls != XL_LINESTYLE_NONE:
                    return True
        except Exception:
            # Best effort: a failing COM lookup is treated as "no border".
            pass
        return False

    def row_has_inside_border(row_idx: int) -> bool:
        if not require_inside_border:
            return False
        try:
            for c in range(left, right + 1):
                ls = (
                    sheet.api.Cells(top + row_idx, c)
                    .Borders(XL_INSIDE_HORIZONTAL)
                    .LineStyle
                )
                if ls is not None and ls != XL_LINESTYLE_NONE:
                    return True
        except Exception:
            # Best effort: a failing COM lookup is treated as "no border".
            pass
        return False

    def should_trim_col(j: int) -> bool:
        if col_empty(j):
            return True
        if require_inside_border and not column_has_inside_border(j):
            return True
        return min_nonempty_ratio > 0.0 and col_nonempty_ratio(j) < min_nonempty_ratio

    def should_trim_row(i: int) -> bool:
        if row_empty(i):
            return True
        if require_inside_border and not row_has_inside_border(i):
            return True
        return min_nonempty_ratio > 0.0 and row_nonempty_ratio(i) < min_nonempty_ratio

    # Leading columns.
    while left <= right and cols_n > 0 and should_trim_col(0):
        for row_vals in vals:
            row_vals.pop(0)
        cols_n = len(vals[0]) if rows_n else 0
        left += 1
    # Leading rows.
    while top <= bottom and rows_n > 0 and should_trim_row(0):
        vals.pop(0)
        rows_n = len(vals)
        top += 1
    # Trailing columns.
    while left <= right and cols_n > 0 and should_trim_col(cols_n - 1):
        for row_vals in vals:
            row_vals.pop(cols_n - 1)
        cols_n = len(vals[0]) if rows_n else 0
        right -= 1
    # Trailing rows.
    while top <= bottom and rows_n > 0 and should_trim_row(rows_n - 1):
        vals.pop(rows_n - 1)
        rows_n = len(vals)
        bottom -= 1
    return top, left, bottom, right
235
+
236
+
237
def load_border_maps_xlsx(  # noqa: C901
    xlsx_path: Path, sheet_name: str
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int]:
    """Build boolean border maps for one worksheet using openpyxl.

    Args:
        xlsx_path: Workbook path passed to ``load_workbook``.
        sheet_name: Name of the worksheet to inspect.

    Returns:
        ``(has_border, top_edge, bottom_edge, left_edge, right_edge, max_row,
        max_col)``. Each array has shape ``(max_row + 1, max_col + 1)`` and is
        indexed ``[row, column]`` with the worksheet's own row/column numbers.

    Raises:
        KeyError: If ``sheet_name`` is not present in the workbook.
    """

    def edge_has_style(edge: object) -> bool:
        if edge is None:
            return False
        style = getattr(edge, "style", None)
        return style is not None and style != "none"

    wb = load_workbook(xlsx_path, data_only=True, read_only=False)
    try:
        if sheet_name not in wb.sheetnames:
            raise KeyError(f"Sheet '{sheet_name}' not found in {xlsx_path}")

        ws = wb[sheet_name]
        try:
            min_col, min_row, max_col, max_row = range_boundaries(
                ws.calculate_dimension()
            )
        except Exception:
            # Fall back to the sheet's reported extent when the dimension
            # string cannot be computed or parsed.
            min_col, min_row = 1, 1
            max_col, max_row = ws.max_column or 1, ws.max_row or 1

        shape = (max_row + 1, max_col + 1)
        has_border = np.zeros(shape, dtype=bool)
        top_edge = np.zeros(shape, dtype=bool)
        bottom_edge = np.zeros(shape, dtype=bool)
        left_edge = np.zeros(shape, dtype=bool)
        right_edge = np.zeros(shape, dtype=bool)

        for r in range(min_row, max_row + 1):
            for c in range(min_col, max_col + 1):
                b = getattr(ws.cell(row=r, column=c), "border", None)
                if b is None:
                    continue

                t = edge_has_style(b.top)
                btm = edge_has_style(b.bottom)
                left_border = edge_has_style(b.left)
                rgt = edge_has_style(b.right)
                if not (t or btm or left_border or rgt):
                    continue

                has_border[r, c] = True
                if t:
                    top_edge[r, c] = True
                if btm:
                    bottom_edge[r, c] = True
                if left_border:
                    left_edge[r, c] = True
                if rgt:
                    right_edge[r, c] = True
    finally:
        # Close the workbook on both the success and the error path.
        wb.close()

    return has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col
289
+
290
+
291
def _detect_border_clusters_numpy(
    has_border: np.ndarray, min_size: int
) -> list[tuple[int, int, int, int]]:
    """Return bounding boxes of 4-connected True regions using scipy labelling.

    Regions with fewer than ``min_size`` cells are dropped. Each box is
    ``(min_row, min_col, max_row, max_col)`` in array indices.
    """
    from scipy.ndimage import label

    four_connected = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
    labelled, n_components = label(
        has_border.astype(np.uint8), structure=four_connected
    )

    boxes: list[tuple[int, int, int, int]] = []
    for component_id in range(1, int(n_components) + 1):
        row_idx, col_idx = np.nonzero(labelled == component_id)
        if row_idx.size >= min_size:
            boxes.append(
                (
                    int(row_idx.min()),
                    int(col_idx.min()),
                    int(row_idx.max()),
                    int(col_idx.max()),
                )
            )
    return boxes
305
+
306
+
307
def _detect_border_clusters_python(
    has_border: np.ndarray, min_size: int
) -> list[tuple[int, int, int, int]]:
    """Pure-Python breadth-first search for 4-connected True regions.

    Returns one ``(min_row, min_col, max_row, max_col)`` box per region that
    contains at least ``min_size`` cells, in row-major discovery order.
    """
    n_rows, n_cols = has_border.shape
    seen = np.zeros_like(has_border, dtype=bool)
    neighbours = ((1, 0), (-1, 0), (0, 1), (0, -1))
    boxes: list[tuple[int, int, int, int]] = []

    for start_r in range(n_rows):
        for start_c in range(n_cols):
            if seen[start_r, start_c] or not has_border[start_r, start_c]:
                continue
            seen[start_r, start_c] = True
            pending = deque([(start_r, start_c)])
            members = [(start_r, start_c)]
            while pending:
                cur_r, cur_c = pending.popleft()
                for dr, dc in neighbours:
                    nr, nc = cur_r + dr, cur_c + dc
                    if not (0 <= nr < n_rows and 0 <= nc < n_cols):
                        continue
                    if seen[nr, nc] or not has_border[nr, nc]:
                        continue
                    seen[nr, nc] = True
                    pending.append((nr, nc))
                    members.append((nr, nc))
            if len(members) < min_size:
                continue
            member_rows, member_cols = zip(*members)
            boxes.append(
                (min(member_rows), min(member_cols), max(member_rows), max(member_cols))
            )
    return boxes
338
+
339
+
340
def detect_border_clusters(
    has_border: np.ndarray, min_size: int = 4
) -> list[tuple[int, int, int, int]]:
    """Find bounding boxes of connected bordered regions.

    Tries the scipy-based implementation first and falls back to the
    pure-Python BFS if it cannot be used. The warning now distinguishes a
    missing scipy from any other failure of the scipy path.

    Args:
        has_border: Boolean 2-D array marking bordered cells.
        min_size: Minimum number of cells a region needs to be reported.

    Returns:
        List of ``(min_row, min_col, max_row, max_col)`` boxes.
    """
    try:
        return _detect_border_clusters_numpy(has_border, min_size)
    except ImportError:
        warn_once(
            "scipy-missing",
            "scipy is not available. Falling back to pure-Python BFS for connected components, which may be significantly slower.",
        )
    except Exception as exc:
        # Still best effort, but do not mislabel the failure as a missing scipy.
        warn_once(
            "scipy-label-failed",
            f"scipy-based connected-component labelling failed ({exc!r}). Falling back to pure-Python BFS, which may be significantly slower.",
        )
    return _detect_border_clusters_python(has_border, min_size)
351
+
352
+
353
def _get_values_block(
    ws: Worksheet, top: int, left: int, bottom: int, right: int
) -> list[list[object]]:
    """Return the values of the given worksheet rectangle as a list of row lists."""
    row_iter = ws.iter_rows(
        min_row=top, max_row=bottom, min_col=left, max_col=right, values_only=True
    )
    return [list(cells) for cells in row_iter]
362
+
363
+
364
def _ensure_matrix(matrix: MatrixInput) -> list[list[object]]:
    """Coerce ``matrix`` into a list of row lists.

    If the first element is a non-string sequence, every element becomes a row
    (non-sequence or string elements become one-cell rows). Otherwise the
    whole input is treated as a single row. Empty input yields ``[]``.
    """

    def is_row(obj: object) -> bool:
        return isinstance(obj, Sequence) and not isinstance(
            obj, (str, bytes, bytearray)
        )

    items = list(matrix)
    if not items:
        return []
    if not is_row(items[0]):
        return [list(items)]
    return [list(item) if is_row(item) else [item] for item in items]
380
+
381
+
382
def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
    """
    Given a 2D matrix (list of rows), return (density, coverage).

    density: nonempty / total cells.
    coverage: area of tight bounding box of nonempty cells divided by total area.

    The total area is ``rows * widest_row``; cells missing from shorter rows
    count as empty, so ragged input cannot push density above 1. Returns
    ``(0.0, 0.0)`` when there are no cells or no non-empty cells.
    """
    normalized = _ensure_matrix(matrix)
    if not normalized:
        return 0.0, 0.0
    rows = len(normalized)
    # Use the widest row rather than only the first one.
    cols = max(len(row) for row in normalized)
    if cols == 0:
        return 0.0, 0.0

    nonempty_coords = [
        (i, j)
        for i, row in enumerate(normalized)
        for j, v in enumerate(row)
        if not (v is None or str(v).strip() == "")
    ]
    if not nonempty_coords:
        return 0.0, 0.0

    total = rows * cols
    density = len(nonempty_coords) / total

    ys = [p[0] for p in nonempty_coords]
    xs = [p[1] for p in nonempty_coords]
    bbox_h = max(ys) - min(ys) + 1
    bbox_w = max(xs) - min(xs) + 1
    coverage = (bbox_h * bbox_w) / total
    return density, coverage
415
+
416
+
417
def _is_plausible_table(matrix: MatrixInput) -> bool:
    """
    Heuristic: require at least 2 rows and 2 cols with meaningful data.

    - At least 2 rows have 2 or more non-empty cells
    - At least 2 columns have 2 or more non-empty cells
    """
    grid = _ensure_matrix(matrix)
    if not grid:
        return False

    n_rows = len(grid)
    n_cols = max(len(r) for r in grid)
    if n_rows < 2 or n_cols < 2:
        return False

    per_col = [0] * n_cols
    dense_rows = 0
    for row in grid:
        # Cells missing from a short row are simply not counted.
        filled_in_row = 0
        for j, value in enumerate(row):
            if value is None or str(value).strip() == "":
                continue
            filled_in_row += 1
            per_col[j] += 1
        if filled_in_row >= 2:
            dense_rows += 1

    dense_cols = sum(1 for n in per_col if n >= 2)
    return dense_rows >= 2 and dense_cols >= 2
446
+
447
+
448
def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, int, int, int]]:
    """Return bounding boxes of connected components of nonempty cells (4-neighbor).

    Boxes are ``(min_row, min_col, max_row, max_col)`` in zero-based matrix
    indices, listed in row-major discovery order. Cells missing from short
    rows are treated as empty.
    """
    if not matrix:
        return []
    n_rows = len(matrix)
    n_cols = max(len(r) for r in matrix)

    occupied = [
        [
            j < len(row) and not (row[j] is None or str(row[j]).strip() == "")
            for j in range(n_cols)
        ]
        for row in matrix
    ]
    seen = [[False] * n_cols for _ in range(n_rows)]
    steps = ((1, 0), (-1, 0), (0, 1), (0, -1))
    boxes: list[tuple[int, int, int, int]] = []

    for start_r in range(n_rows):
        for start_c in range(n_cols):
            if seen[start_r][start_c] or not occupied[start_r][start_c]:
                continue
            # Breadth-first flood fill from this unvisited occupied cell.
            seen[start_r][start_c] = True
            pending = deque([(start_r, start_c)])
            lo_r = hi_r = start_r
            lo_c = hi_c = start_c
            while pending:
                cur_r, cur_c = pending.popleft()
                for dr, dc in steps:
                    nr, nc = cur_r + dr, cur_c + dc
                    if not (0 <= nr < n_rows and 0 <= nc < n_cols):
                        continue
                    if seen[nr][nc] or not occupied[nr][nc]:
                        continue
                    seen[nr][nc] = True
                    pending.append((nr, nc))
                    lo_r, hi_r = min(lo_r, nr), max(hi_r, nr)
                    lo_c, hi_c = min(lo_c, nc), max(hi_c, nc)
            boxes.append((lo_r, lo_c, hi_r, hi_c))
    return boxes
489
+
490
+
491
def _normalize_matrix(matrix: object) -> list[list[object]]:
    """Turn an arbitrary range value into a list of row lists.

    ``None`` becomes ``[]``, non-string sequences go through ``_ensure_matrix``,
    and any other value becomes a single-cell matrix.
    """
    if matrix is None:
        return []
    is_text = isinstance(matrix, (str, bytes, bytearray))
    if isinstance(matrix, Sequence) and not is_text:
        return _ensure_matrix(matrix)
    return [[matrix]]
499
+
500
+
501
def _header_like_row(row: list[object]) -> bool:
    """Return True when a row looks like a header.

    Requires at least two non-empty cells, and the cells that do not look like
    an integer or decimal number must be at least as many as those that do.
    """
    texts = [str(v) for v in row if not (v is None or str(v).strip() == "")]
    if len(texts) < 2:
        return False
    numeric = sum(1 for s in texts if _INT_RE.match(s) or _FLOAT_RE.match(s))
    textual = len(texts) - numeric
    return textual >= numeric and textual >= 1
514
+
515
+
516
def _table_signal_score(matrix: Sequence[Sequence[object]]) -> float:
    """Score how table-like a block of values is.

    Starts from the cell density and adds 0.2 when one of the first two rows
    looks like a header, 0.1 when coverage exceeds 0.5, and 0.1 when at least
    two rows and two columns each hold two or more non-empty cells.
    """
    grid = _ensure_matrix(matrix)
    density, coverage = _table_density_metrics(grid)
    has_header = any(_header_like_row(r) for r in grid[:2])  # check first 2 rows

    width = max((len(r) if isinstance(r, list) else 1) for r in grid) if grid else 0
    per_col = [0] * width
    dense_rows = 0
    for row in grid:
        filled_cols = [
            j for j, v in enumerate(row) if not (v is None or str(v).strip() == "")
        ]
        for j in filled_cols:
            per_col[j] += 1
        if len(filled_cols) >= 2:
            dense_rows += 1
    dense_cols = sum(1 for n in per_col if n >= 2)

    score = density
    if has_header:
        score += 0.2
    if coverage > 0.5:
        score += 0.1
    score += 0.1 if (dense_rows >= 2 and dense_cols >= 2) else 0.0
    return score
545
+
546
+
547
def set_table_detection_params(
    *,
    table_score_threshold: float | None = None,
    density_min: float | None = None,
    coverage_min: float | None = None,
    min_nonempty_cells: int | None = None,
) -> None:
    """
    Configure table detection heuristics at runtime.

    Any parameter left as None keeps its current value.
    """
    overrides = {
        "table_score_threshold": table_score_threshold,
        "density_min": density_min,
        "coverage_min": coverage_min,
        "min_nonempty_cells": min_nonempty_cells,
    }
    for name, value in overrides.items():
        if value is not None:
            _DETECTION_CONFIG[name] = value
566
+
567
+
568
def shrink_to_content_openpyxl(  # noqa: C901
    ws: Worksheet,
    top: int,
    left: int,
    bottom: int,
    right: int,
    require_inside_border: bool,
    top_edge: np.ndarray,
    bottom_edge: np.ndarray,
    left_edge: np.ndarray,
    right_edge: np.ndarray,
    min_nonempty_ratio: float = 0.0,
) -> tuple[int, int, int, int]:
    """Trim a rectangle using border maps and, optionally, cell contents.

    Trimming order is leading columns, leading rows, trailing columns,
    trailing rows. An outer row/column is dropped when none of its cells has
    any border edge, when ``require_inside_border`` is set and it has no
    inside border, or when ``min_nonempty_ratio`` is positive and its share of
    non-empty values is below that ratio.

    The edge arrays are indexed ``[row, column]`` with worksheet coordinates.
    Returns the trimmed ``(top, left, bottom, right)``.

    NOTE(review): with ``require_inside_border=True`` the checks
    ``j_abs > left`` / ``i_abs > top`` can never hold for the current leading
    column/row, so the leading loops appear to trim the whole range — confirm
    this is intended before relying on that mode.
    """
    vals = _get_values_block(ws, top, left, bottom, right)
    rows_n = bottom - top + 1
    cols_n = right - left + 1

    def is_blank(value: object) -> bool:
        return value is None or str(value).strip() == ""

    def filled_share_of_row(i: int) -> float:
        if cols_n <= 0:
            return 0.0
        return sum(1 for v in vals[i] if not is_blank(v)) / cols_n

    def filled_share_of_col(j: int) -> float:
        if rows_n <= 0:
            return 0.0
        return sum(1 for i in range(rows_n) if not is_blank(vals[i][j])) / rows_n

    def cell_has_no_edge(r_abs: int, c_abs: int) -> bool:
        return not (
            top_edge[r_abs, c_abs]
            or bottom_edge[r_abs, c_abs]
            or left_edge[r_abs, c_abs]
            or right_edge[r_abs, c_abs]
        )

    def col_has_inside_border(j_abs: int) -> bool:
        if not require_inside_border:
            return False
        pairs = sum(
            1
            for r_abs in range(top, bottom + 1)
            if j_abs > left
            and right_edge[r_abs, j_abs - 1]
            and left_edge[r_abs, j_abs]
        )
        return pairs > 0

    def row_has_inside_border(i_abs: int) -> bool:
        if not require_inside_border:
            return False
        pairs = sum(
            1
            for c_abs in range(left, right + 1)
            if i_abs > top and bottom_edge[i_abs - 1, c_abs] and top_edge[i_abs, c_abs]
        )
        return pairs > 0

    def column_is_trimmable(j_abs: int, j_local: int) -> bool:
        if all(cell_has_no_edge(r_abs, j_abs) for r_abs in range(top, bottom + 1)):
            return True
        if require_inside_border and not col_has_inside_border(j_abs):
            return True
        return (
            min_nonempty_ratio > 0.0
            and filled_share_of_col(j_local) < min_nonempty_ratio
        )

    def row_is_trimmable(i_abs: int, i_local: int) -> bool:
        if all(cell_has_no_edge(i_abs, c_abs) for c_abs in range(left, right + 1)):
            return True
        if require_inside_border and not row_has_inside_border(i_abs):
            return True
        return (
            min_nonempty_ratio > 0.0
            and filled_share_of_row(i_local) < min_nonempty_ratio
        )

    # Leading columns.
    while left <= right and cols_n > 0 and column_is_trimmable(left, 0):
        for i in range(rows_n):
            vals[i].pop(0)
        cols_n -= 1
        left += 1
    # Leading rows.
    while top <= bottom and rows_n > 0 and row_is_trimmable(top, 0):
        vals.pop(0)
        rows_n -= 1
        top += 1
    # Trailing columns.
    while left <= right and cols_n > 0 and column_is_trimmable(right, cols_n - 1):
        for i in range(rows_n):
            vals[i].pop(cols_n - 1)
        cols_n -= 1
        right -= 1
    # Trailing rows.
    while top <= bottom and rows_n > 0 and row_is_trimmable(bottom, rows_n - 1):
        vals.pop(rows_n - 1)
        rows_n -= 1
        bottom -= 1
    return top, left, bottom, right
726
+
727
+
728
def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]:  # noqa: C901
    """Detect table-like ranges via COM: ListObjects first, then border clusters.

    ListObject addresses are reported first. Then every cell of the used range
    is probed for borders, 4-connected bordered cells are clustered (clusters
    under 4 cells are ignored), overlapping boxes are merged, each box is
    shrunk to its content, and its connected non-empty sub-blocks are kept
    when they pass the density/coverage, plausibility and score checks from
    ``_DETECTION_CONFIG``.

    Returns:
        Relative A1-style range addresses, without duplicates among the
        ListObject and border-derived results.
    """
    tables: list[str] = []
    try:
        for lo in sheet.api.ListObjects:
            tables.append(lo.Range.Address(RowAbsolute=False, ColumnAbsolute=False))
    except Exception:
        # Best effort: the border-based detection below still runs.
        logger.debug("ListObjects enumeration failed", exc_info=True)

    used = sheet.used_range
    max_row = used.last_cell.row
    max_col = used.last_cell.column

    def cell_has_any_border(r: int, c: int) -> bool:
        try:
            b = sheet.api.Cells(r, c).Borders
            for idx in (
                XL_EDGE_LEFT,
                XL_EDGE_TOP,
                XL_EDGE_RIGHT,
                XL_EDGE_BOTTOM,
                XL_INSIDE_VERTICAL,
                XL_INSIDE_HORIZONTAL,
            ):
                ls = b(idx).LineStyle
                if ls is not None and ls != XL_LINESTYLE_NONE:
                    try:
                        if getattr(b(idx), "Weight", 0) == 0:
                            continue
                    except Exception:
                        # Weight unavailable: trust the line style alone.
                        pass
                    return True
            return False
        except Exception:
            # A failing COM lookup is treated as "no border".
            return False

    # Index 0 is unused so the grid can be addressed with 1-based coordinates.
    grid = [[False] * (max_col + 1) for _ in range(max_row + 1)]
    for r in range(1, max_row + 1):
        for c in range(1, max_col + 1):
            if cell_has_any_border(r, c):
                grid[r][c] = True
    visited = [[False] * (max_col + 1) for _ in range(max_row + 1)]

    def dfs(sr: int, sc: int, acc: list[tuple[int, int]]) -> None:
        stack = [(sr, sc)]
        while stack:
            rr, cc = stack.pop()
            if not (1 <= rr <= max_row and 1 <= cc <= max_col):
                continue
            if visited[rr][cc] or not grid[rr][cc]:
                continue
            visited[rr][cc] = True
            acc.append((rr, cc))
            for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
                stack.append((rr + dr, cc + dc))

    border_rects: list[tuple[int, int, int, int]] = []
    for r in range(1, max_row + 1):
        for c in range(1, max_col + 1):
            if not grid[r][c] or visited[r][c]:
                continue
            cluster: list[tuple[int, int]] = []
            dfs(r, c, cluster)
            if len(cluster) < 4:
                continue
            rows = [rc[0] for rc in cluster]
            cols = [rc[1] for rc in cluster]
            border_rects.append((min(rows), min(cols), max(rows), max(cols)))

    def overlaps_for_merge(
        a: tuple[int, int, int, int], b: tuple[int, int, int, int]
    ) -> bool:
        # Do not merge if one rect fully contains the other (separate clusters like big frame vs small table)
        contains = (
            a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
        ) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
        if contains:
            return False
        return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])

    merged_rects: list[tuple[int, int, int, int]] = []
    for rect in sorted(border_rects):
        for i, ex in enumerate(merged_rects):
            if overlaps_for_merge(rect, ex):
                merged_rects[i] = (
                    min(rect[0], ex[0]),
                    min(rect[1], ex[1]),
                    max(rect[2], ex[2]),
                    max(rect[3], ex[3]),
                )
                break
        else:
            merged_rects.append(rect)

    # Seed with ListObject addresses so a border cluster matching an existing
    # table is not reported a second time.
    dedup: set[str] = set(tables)
    for top_row, left_col, bottom_row, right_col in merged_rects:
        top_row, left_col, bottom_row, right_col = shrink_to_content(
            sheet, top_row, left_col, bottom_row, right_col, require_inside_border=False
        )
        try:
            rng_vals = _normalize_matrix(
                sheet.range((top_row, left_col), (bottom_row, right_col)).value
            )
            nonempty = sum(
                1
                for row in rng_vals
                for v in (row if isinstance(row, list) else [row])
                if not (v is None or str(v).strip() == "")
            )
        except Exception:
            # Keep rng_vals defined so a failed read can never reuse stale
            # values from a previous rectangle.
            rng_vals = []
            nonempty = 0
        if nonempty < _DETECTION_CONFIG["min_nonempty_cells"]:
            continue
        for r0, c0, r1, c1 in _nonempty_clusters(rng_vals):
            sub = [row[c0 : c1 + 1] for row in rng_vals[r0 : r1 + 1]]
            density, coverage = _table_density_metrics(sub)
            if (
                density < _DETECTION_CONFIG["density_min"]
                and coverage < _DETECTION_CONFIG["coverage_min"]
            ):
                continue
            if not _is_plausible_table(sub):
                continue
            score = _table_signal_score(sub)
            if score < _DETECTION_CONFIG["table_score_threshold"]:
                continue
            addr = f"{xw.utils.col_name(left_col + c0)}{top_row + r0}:{xw.utils.col_name(left_col + c1)}{top_row + r1}"
            if addr not in dedup:
                dedup.add(addr)
                tables.append(addr)
    return tables
870
+
871
+
872
def detect_tables_openpyxl(  # noqa: C901
    xlsx_path: Path, sheet_name: str
) -> list[str]:
    """Detect table-like ranges in one worksheet using openpyxl only.

    Declared worksheet tables are reported first (by their ``ref``). Then
    bordered cells are clustered, overlapping boxes are merged, each box is
    shrunk to its bordered extent, and its connected non-empty sub-blocks are
    kept when they pass the density/coverage, plausibility and score checks
    from ``_DETECTION_CONFIG``.

    Args:
        xlsx_path: Workbook path passed to ``load_workbook``.
        sheet_name: Worksheet to analyse.

    Returns:
        A1-style range addresses, without duplicates among the declared tables
        and the border-derived results.
    """
    wb = load_workbook(
        xlsx_path,
        data_only=True,
        read_only=False,
    )
    try:
        ws = wb[sheet_name]
        tables: list[str] = []
        try:
            openpyxl_tables: list[object] = []
            if hasattr(ws, "tables") and ws.tables:
                if isinstance(ws.tables, dict):
                    openpyxl_tables = list(ws.tables.values())
                else:
                    openpyxl_tables = list(ws.tables)
            elif hasattr(ws, "_tables") and ws._tables:
                openpyxl_tables = list(ws._tables)
            for t in openpyxl_tables:
                addr = getattr(t, "ref", None)
                if addr:
                    tables.append(str(addr))
        except Exception:
            # Best effort: the border-based detection below still runs.
            logger.debug("Reading worksheet tables failed", exc_info=True)

        has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col = (
            load_border_maps_xlsx(xlsx_path, sheet_name)
        )
        rects = detect_border_clusters(has_border, min_size=4)

        def overlaps_for_merge(
            a: tuple[int, int, int, int], b: tuple[int, int, int, int]
        ) -> bool:
            # Rectangles where one fully contains the other are kept separate.
            contains = (
                a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
            ) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
            if contains:
                return False
            return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])

        merged_rects: list[tuple[int, int, int, int]] = []
        for rect in sorted(rects):
            for i, ex in enumerate(merged_rects):
                if overlaps_for_merge(rect, ex):
                    merged_rects[i] = (
                        min(rect[0], ex[0]),
                        min(rect[1], ex[1]),
                        max(rect[2], ex[2]),
                        max(rect[3], ex[3]),
                    )
                    break
            else:
                merged_rects.append(rect)

        # Seed with declared table refs so a border cluster matching a
        # declared table is not reported a second time.
        dedup: set[str] = set(tables)
        for top_row, left_col, bottom_row, right_col in merged_rects:
            top_row, left_col, bottom_row, right_col = shrink_to_content_openpyxl(
                ws,
                top_row,
                left_col,
                bottom_row,
                right_col,
                require_inside_border=False,
                top_edge=top_edge,
                bottom_edge=bottom_edge,
                left_edge=left_edge,
                right_edge=right_edge,
                min_nonempty_ratio=0.0,
            )
            vals_block = _normalize_matrix(
                _get_values_block(ws, top_row, left_col, bottom_row, right_col)
            )
            nonempty = sum(
                1
                for row in vals_block
                for v in row
                if not (v is None or str(v).strip() == "")
            )
            if nonempty < _DETECTION_CONFIG["min_nonempty_cells"]:
                continue
            for r0, c0, r1, c1 in _nonempty_clusters(vals_block):
                sub = [row[c0 : c1 + 1] for row in vals_block[r0 : r1 + 1]]
                density, coverage = _table_density_metrics(sub)
                if (
                    density < _DETECTION_CONFIG["density_min"]
                    and coverage < _DETECTION_CONFIG["coverage_min"]
                ):
                    continue
                if not _is_plausible_table(sub):
                    continue
                score = _table_signal_score(sub)
                if score < _DETECTION_CONFIG["table_score_threshold"]:
                    continue
                addr = f"{get_column_letter(left_col + c0)}{top_row + r0}:{get_column_letter(left_col + c1)}{top_row + r1}"
                if addr not in dedup:
                    dedup.add(addr)
                    tables.append(addr)
        return tables
    finally:
        # Close the workbook on both the success and the error path.
        wb.close()
974
+
975
+
976
def detect_tables(sheet: xw.Sheet) -> list[str]:
    """Detect tables on ``sheet``, choosing the backend from the workbook extension.

    ``.xlsx``/``.xlsm`` workbooks are analysed with openpyxl. ``.xls`` files,
    unknown or unavailable paths, and any openpyxl failure fall back to the
    COM-based detection, with a one-time warning.
    """
    try:
        excel_path: Path | None = Path(sheet.book.fullname)
    except Exception:
        excel_path = None

    suffix = excel_path.suffix.lower() if excel_path else ""

    if suffix not in (".xls", ".xlsx", ".xlsm"):
        warn_once(
            "unknown-ext-fallback",
            "Workbook path or extension is unavailable; falling back to COM-based detection (slower).",
        )
        return detect_tables_xlwings(sheet)

    if suffix == ".xls":
        warn_once(
            f"xls-fallback::{excel_path}",
            f"File '{excel_path.name}' is .xls (BIFF); openpyxl cannot read it. Falling back to COM-based detection (slower). Consider converting to .xlsx.",
        )
        return detect_tables_xlwings(sheet)

    # From here on the workbook is .xlsx or .xlsm.
    try:
        import openpyxl  # noqa: F401
    except Exception:
        warn_once(
            "openpyxl-missing",
            "openpyxl is not installed. Falling back to COM-based detection (slower).",
        )
        return detect_tables_xlwings(sheet)

    try:
        return detect_tables_openpyxl(excel_path, sheet.name)
    except Exception as e:
        warn_once(
            f"openpyxl-parse-fallback::{excel_path}::{sheet.name}",
            f"openpyxl failed to parse '{excel_path.name}' (sheet '{sheet.name}'): {e!r}. Falling back to COM-based detection (slower).",
        )
        return detect_tables_xlwings(sheet)
1014
+
1015
+
1016
+ _INT_RE = re.compile(r"^[+-]?\d+$")
1017
+ _FLOAT_RE = re.compile(r"^[+-]?\d*\.\d+$")
1018
+
1019
+
1020
+ def _coerce_numeric_preserve_format(val: str) -> int | float | str:
1021
+ """
1022
+ Convert numeric-looking strings to int/float while keeping precision.
1023
+ Integers stay int; decimals keep scale via Decimal before casting to float.
1024
+ """
1025
+ if _INT_RE.match(val):
1026
+ try:
1027
+ return int(val)
1028
+ except Exception:
1029
+ return val
1030
+ if _FLOAT_RE.match(val):
1031
+ try:
1032
+ dec = Decimal(val)
1033
+ exponent = int(dec.as_tuple().exponent)
1034
+ scale = max(1, -exponent)
1035
+ quantized = dec.quantize(Decimal("1." + "0" * scale))
1036
+ return float(quantized)
1037
+ except (InvalidOperation, Exception):
1038
+ return val
1039
+ return val