exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
exstruct/core/cells.py CHANGED
@@ -1,31 +1,33 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections import deque
4
- from collections.abc import Sequence
5
- from decimal import Decimal, InvalidOperation
6
- import logging
7
- from pathlib import Path
8
- import re
9
-
10
- import numpy as np
11
- from openpyxl import load_workbook
12
- from openpyxl.utils import get_column_letter, range_boundaries
13
- from openpyxl.worksheet.worksheet import Worksheet
14
- import pandas as pd
15
- import xlwings as xw
16
-
17
- from ..models import CellRow
18
-
19
- logger = logging.getLogger(__name__)
20
- _warned_keys: set[str] = set()
21
- XL_LINESTYLE_NONE = -4142
22
- XL_INSIDE_VERTICAL = 11
23
- XL_INSIDE_HORIZONTAL = 12
24
- XL_EDGE_LEFT = 7
25
- XL_EDGE_TOP = 8
26
- XL_EDGE_BOTTOM = 9
27
- XL_EDGE_RIGHT = 10
28
- MatrixInput = Sequence[Sequence[object]] | Sequence[object]
3
+ from collections import deque
4
+ from collections.abc import Callable, Sequence
5
+ from dataclasses import dataclass
6
+ from decimal import Decimal, InvalidOperation
7
+ import logging
8
+ from pathlib import Path
9
+ import re
10
+
11
+ import numpy as np
12
+ from openpyxl.styles.colors import Color
13
+ from openpyxl.utils import get_column_letter, range_boundaries
14
+ from openpyxl.worksheet.worksheet import Worksheet
15
+ import pandas as pd
16
+ import xlwings as xw
17
+
18
+ from ..models import CellRow, MergedCell
19
+ from .workbook import openpyxl_workbook
20
+
21
+ logger = logging.getLogger(__name__)
22
+ _warned_keys: set[str] = set()
23
+ XL_LINESTYLE_NONE = -4142
24
+ XL_INSIDE_VERTICAL = 11
25
+ XL_INSIDE_HORIZONTAL = 12
26
+ XL_EDGE_LEFT = 7
27
+ XL_EDGE_TOP = 8
28
+ XL_EDGE_BOTTOM = 9
29
+ XL_EDGE_RIGHT = 10
30
+ MatrixInput = Sequence[Sequence[object]] | Sequence[object]
29
31
 
30
32
  # Detection tuning parameters (can be overridden via set_table_detection_params)
31
33
  _DETECTION_CONFIG = {
@@ -34,6 +36,426 @@ _DETECTION_CONFIG = {
34
36
  "coverage_min": 0.2,
35
37
  "min_nonempty_cells": 3,
36
38
  }
39
+ _DEFAULT_BACKGROUND_HEX = "FFFFFF"
40
+ _XL_COLOR_NONE = -4142
41
+
42
+
43
+ # Use dataclasses for lightweight models
44
@dataclass(frozen=True)
class SheetColorsMap:
    """Cell coordinates of one worksheet grouped by background color.

    Attributes:
        sheet_name: Title of the worksheet the map belongs to.
        colors_map: Color key mapped to the list of ``(row, column)`` pairs
            that carry that background color.
    """

    sheet_name: str
    colors_map: dict[str, list[tuple[int, int]]]
50
+
51
+
52
@dataclass(frozen=True)
class WorkbookColorsMap:
    """Per-sheet background color maps for a whole workbook.

    Attributes:
        sheets: Worksheet name mapped to that sheet's color map.
    """

    sheets: dict[str, SheetColorsMap]

    def get_sheet(self, sheet_name: str) -> SheetColorsMap | None:
        """Look up the color map stored for one worksheet.

        Args:
            sheet_name: Name of the worksheet to look up.

        Returns:
            The stored SheetColorsMap, or None when no entry exists for the name.
        """
        if sheet_name in self.sheets:
            return self.sheets[sheet_name]
        return None
68
+
69
+
70
def extract_sheet_colors_map(
    file_path: Path, *, include_default_background: bool, ignore_colors: set[str] | None
) -> WorkbookColorsMap:
    """Build background color maps for every worksheet of a workbook file.

    The workbook is opened through ``openpyxl_workbook`` (``data_only=True``,
    ``read_only=False``) and each worksheet is scanned in turn.

    Args:
        file_path: Excel workbook path.
        include_default_background: Whether cells without an explicit fill are
            reported as the default (white) background.
        ignore_colors: Optional set of color keys to leave out of the result.

    Returns:
        WorkbookColorsMap keyed by worksheet title.
    """
    with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb:
        per_sheet: dict[str, SheetColorsMap] = {
            ws.title: _extract_sheet_colors(
                ws, include_default_background, ignore_colors
            )
            for ws in wb.worksheets
        }
    return WorkbookColorsMap(sheets=per_sheet)
92
+
93
+
94
def extract_sheet_colors_map_com(
    workbook: xw.Book,
    *,
    include_default_background: bool,
    ignore_colors: set[str] | None,
) -> WorkbookColorsMap:
    """Build background color maps for every worksheet via COM display formats.

    The workbook is prepared once up front, then each sheet is prepared and
    scanned individually, in the order ``workbook.sheets`` yields them.

    Args:
        workbook: xlwings workbook instance.
        include_default_background: Whether cells without an explicit fill are
            reported as the default (white) background.
        ignore_colors: Optional set of color keys to leave out of the result.

    Returns:
        WorkbookColorsMap keyed by sheet name.
    """
    _prepare_workbook_for_display_format(workbook)

    def _scan(sheet: xw.Sheet) -> SheetColorsMap:
        # Preparation must precede the scan so DisplayFormat is up to date.
        _prepare_sheet_for_display_format(sheet)
        return _extract_sheet_colors_com(
            sheet, include_default_background, ignore_colors
        )

    per_sheet: dict[str, SheetColorsMap] = {}
    for sheet in workbook.sheets:
        per_sheet[sheet.name] = _scan(sheet)
    return WorkbookColorsMap(sheets=per_sheet)
120
+
121
+
122
def _extract_sheet_colors(
    ws: Worksheet, include_default_background: bool, ignore_colors: set[str] | None
) -> SheetColorsMap:
    """Extract background colors for a single worksheet.

    Every cell inside the sheet's used range is inspected; cells whose
    background resolves to a color key are grouped under the normalized key.
    Coordinates are stored as ``(row, column - 1)``, i.e. the row exactly as
    openpyxl reports it and the column shifted down by one.

    Args:
        ws: Target worksheet.
        include_default_background: Whether to include default (white) backgrounds.
        ignore_colors: Optional set of color keys to ignore.

    Returns:
        SheetColorsMap for the worksheet.
    """
    min_row, min_col, max_row, max_col = _get_used_range_bounds(ws)
    colors_map: dict[str, list[tuple[int, int]]] = {}
    if min_row > max_row or min_col > max_col:
        # Inverted bounds are how _get_used_range_bounds reports an empty sheet.
        return SheetColorsMap(sheet_name=ws.title, colors_map=colors_map)

    ignore_set = _normalize_ignore_colors(ignore_colors)
    for row in ws.iter_rows(
        min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col
    ):
        for cell in row:
            color_key = _resolve_cell_background(cell, include_default_background)
            if color_key is None:
                continue
            normalized_key = _normalize_color_key(color_key)
            if _should_ignore_color(normalized_key, ignore_set):
                continue
            # openpyxl's MergedCell placeholders (the non-anchor cells of a
            # merged range) expose ``row``/``column`` but no ``col_idx``; fall
            # back to ``column`` so merged ranges do not raise AttributeError.
            col_idx = getattr(cell, "col_idx", None)
            if col_idx is None:
                col_idx = cell.column
            colors_map.setdefault(normalized_key, []).append(
                (cell.row, col_idx - 1)
            )
    return SheetColorsMap(sheet_name=ws.title, colors_map=colors_map)
155
+
156
+
157
def _extract_sheet_colors_com(
    sheet: xw.Sheet, include_default_background: bool, ignore_colors: set[str] | None
) -> SheetColorsMap:
    """Group the cells of one worksheet by displayed background color via COM.

    The scan runs from the first row/column of ``sheet.used_range`` (1 when the
    attribute is absent) to the row/column of its last cell. Coordinates are
    stored as ``(row, column - 1)``.

    Args:
        sheet: Target worksheet.
        include_default_background: Whether to include default (white) backgrounds.
        ignore_colors: Optional set of color keys to ignore.

    Returns:
        SheetColorsMap for the worksheet.
    """
    grouped: dict[str, list[tuple[int, int]]] = {}
    used = sheet.used_range
    first_row = int(getattr(used, "row", 1))
    first_col = int(getattr(used, "column", 1))
    last_row = used.last_cell.row
    last_col = used.last_cell.column

    if last_row > 0 and last_col > 0:
        ignored = _normalize_ignore_colors(ignore_colors)
        for row in range(first_row, last_row + 1):
            for col in range(first_col, last_col + 1):
                raw_key = _resolve_cell_background_com(
                    sheet, row, col, include_default_background
                )
                if raw_key is None:
                    continue
                key = _normalize_color_key(raw_key)
                if not _should_ignore_color(key, ignored):
                    grouped.setdefault(key, []).append((row, col - 1))

    return SheetColorsMap(sheet_name=sheet.name, colors_map=grouped)
192
+
193
+
194
def _get_used_range_bounds(ws: Worksheet) -> tuple[int, int, int, int]:
    """Compute the used-range bounds of a worksheet.

    An empty sheet is reported as ``(1, 1, 0, 0)``, i.e. with the maximum below
    the minimum. If the dimension lookup raises, the bounds fall back to
    ``ws.max_row`` / ``ws.max_column`` anchored at row 1, column 1.

    Args:
        ws: Target worksheet.

    Returns:
        Tuple of (min_row, min_col, max_row, max_col).
    """
    empty_bounds = (1, 1, 0, 0)
    try:
        if _is_effectively_empty_sheet(ws):
            return empty_bounds
        min_col, min_row, max_col, max_row = range_boundaries(
            ws.calculate_dimension()
        )
    except Exception:
        # Best-effort fallback when the dimension string cannot be obtained/parsed.
        last_row = ws.max_row or 0
        last_col = ws.max_column or 0
        if last_row == 0 or last_col == 0:
            return empty_bounds
        return 1, 1, last_row, last_col
    return min_row, min_col, max_row, max_col
215
+
216
+
217
+ def _is_effectively_empty_sheet(ws: Worksheet) -> bool:
218
+ """Check whether a worksheet has no content or styling.
219
+
220
+ Args:
221
+ ws: Target worksheet.
222
+
223
+ Returns:
224
+ True if the sheet has no meaningful content or style, otherwise False.
225
+ """
226
+ if ws.max_row != 1 or ws.max_column != 1:
227
+ return False
228
+ cell = ws.cell(row=1, column=1)
229
+ return cell.value is None and not cell.has_style
230
+
231
+
232
def _resolve_cell_background(
    cell: object, include_default_background: bool
) -> str | None:
    """Determine the background color key of a worksheet cell.

    Cells without a fill, or whose fill pattern is absent / ``"none"``, count
    as default background.

    Args:
        cell: Worksheet cell object.
        include_default_background: Whether default backgrounds are reported
            (as the default hex) instead of being excluded.

    Returns:
        Color key, or None when the cell is excluded or no key is available.
    """
    default_result = _DEFAULT_BACKGROUND_HEX if include_default_background else None

    fill = getattr(cell, "fill", None)
    if fill is None:
        return default_result

    if getattr(fill, "patternType", None) in (None, "none"):
        return default_result

    color_key = _resolve_fill_color_key(fill)
    is_default = color_key == _DEFAULT_BACKGROUND_HEX
    return None if is_default and not include_default_background else color_key
254
+
255
+
256
+ def _resolve_fill_color_key(fill: object) -> str | None:
257
+ """Normalize the foreground/background color of a fill.
258
+
259
+ Args:
260
+ fill: openpyxl fill object.
261
+
262
+ Returns:
263
+ Normalized color key or None when unavailable.
264
+ """
265
+ fg_color = getattr(fill, "fgColor", None)
266
+ if fg_color is not None:
267
+ fg_key = _color_to_key(fg_color)
268
+ if fg_key is not None:
269
+ return fg_key
270
+ bg_color = getattr(fill, "bgColor", None)
271
+ return _color_to_key(bg_color) if bg_color is not None else None
272
+
273
+
274
def _resolve_cell_background_com(
    sheet: xw.Sheet, row: int, col: int, include_default_background: bool
) -> str | None:
    """Determine a cell's background color key from its COM display format.

    An unreadable color and the "no color" sentinel both count as default
    background.

    Args:
        sheet: Target worksheet.
        row: 1-based row index.
        col: 1-based column index.
        include_default_background: Whether to include default (white) backgrounds.

    Returns:
        RGB hex color key, or None when the cell is excluded.
    """
    color_value = _get_display_format_color(sheet, row, col)
    if color_value is None or color_value == _XL_COLOR_NONE:
        if include_default_background:
            return _DEFAULT_BACKGROUND_HEX
        return None

    color_key = _excel_color_int_to_rgb_hex(color_value)
    if not include_default_background and color_key == _DEFAULT_BACKGROUND_HEX:
        return None
    return color_key
297
+
298
+
299
+ def _prepare_workbook_for_display_format(workbook: xw.Book) -> None:
300
+ """Prepare a workbook so DisplayFormat reflects conditional formatting.
301
+
302
+ Args:
303
+ workbook: xlwings workbook instance.
304
+ """
305
+ try:
306
+ # Force calculation to ensure DisplayFormat.Interior reflects conditional formatting rules
307
+ workbook.app.calculate()
308
+ except Exception:
309
+ return
310
+
311
+
312
+ def _prepare_sheet_for_display_format(sheet: xw.Sheet) -> None:
313
+ """Prepare a sheet so DisplayFormat reflects conditional formatting.
314
+
315
+ Args:
316
+ sheet: Target worksheet.
317
+ """
318
+ try:
319
+ # Activate sheet so DisplayFormat is available
320
+ sheet.api.Activate()
321
+ except Exception:
322
+ return
323
+ try:
324
+ # Calculate to apply conditional formatting to DisplayFormat
325
+ sheet.api.Calculate()
326
+ except Exception:
327
+ return
328
+
329
+
330
+ def _get_display_format_color(sheet: xw.Sheet, row: int, col: int) -> int | None:
331
+ """Read DisplayFormat.Interior.Color from COM.
332
+
333
+ Args:
334
+ sheet: Target worksheet.
335
+ row: 1-based row index.
336
+ col: 1-based column index.
337
+
338
+ Returns:
339
+ BGR integer color or None if unavailable.
340
+ """
341
+ try:
342
+ cell = sheet.api.Cells(row, col)
343
+ display_format = cell.DisplayFormat
344
+ interior = display_format.Interior
345
+ return int(interior.Color)
346
+ except Exception:
347
+ return None
348
+
349
+
350
+ def _excel_color_int_to_rgb_hex(color_value: int) -> str:
351
+ """Convert an Excel color integer into an RGB hex string.
352
+
353
+ Args:
354
+ color_value: Excel color integer from COM.
355
+
356
+ Returns:
357
+ RGB hex string (uppercase).
358
+ """
359
+ red = color_value & 0xFF
360
+ green = (color_value >> 8) & 0xFF
361
+ blue = (color_value >> 16) & 0xFF
362
+ return f"{red:02X}{green:02X}{blue:02X}"
363
+
364
+
365
+ def _normalize_color_key(color_key: str) -> str:
366
+ """Normalize a color key into a canonical representation.
367
+
368
+ Args:
369
+ color_key: Raw color key (hex or themed/indexed).
370
+
371
+ Returns:
372
+ Normalized color key.
373
+ """
374
+ trimmed = color_key.strip()
375
+ if not trimmed:
376
+ return ""
377
+ lowered = trimmed.lower()
378
+ if lowered.startswith(("theme:", "indexed:", "auto:")) or lowered == "auto":
379
+ return lowered
380
+ hex_key = trimmed.lstrip("#").upper()
381
+ if len(hex_key) == 8:
382
+ hex_key = hex_key[2:]
383
+ return hex_key
384
+
385
+
386
+ def _normalize_ignore_colors(ignore_colors: set[str] | None) -> set[str]:
387
+ """Normalize ignore color keys.
388
+
389
+ Args:
390
+ ignore_colors: Optional set of color keys to ignore.
391
+
392
+ Returns:
393
+ Normalized set of color keys.
394
+ """
395
+ if not ignore_colors:
396
+ return set()
397
+ normalized = {_normalize_color_key(color) for color in ignore_colors}
398
+ return {color for color in normalized if color}
399
+
400
+
401
+ def _should_ignore_color(color_key: str, ignore_colors: set[str]) -> bool:
402
+ """Check whether a color key should be ignored.
403
+
404
+ Args:
405
+ color_key: Normalized color key.
406
+ ignore_colors: Normalized ignore color set.
407
+
408
+ Returns:
409
+ True when the color key is ignored.
410
+ """
411
+ return color_key in ignore_colors
412
+
413
+
414
+ def _color_to_key(color: Color | object) -> str | None:
415
+ """Convert an openpyxl color object into a normalized key.
416
+
417
+ Args:
418
+ color: openpyxl color object.
419
+
420
+ Returns:
421
+ Normalized color key string or None when unavailable.
422
+ """
423
+ rgb = getattr(color, "rgb", None)
424
+ if rgb:
425
+ return _normalize_rgb(str(rgb))
426
+ color_type = getattr(color, "type", None)
427
+ if color_type == "theme":
428
+ theme = getattr(color, "theme", None)
429
+ tint = getattr(color, "tint", None)
430
+ theme_id = "unknown" if theme is None else str(theme)
431
+ if tint is None:
432
+ return f"theme:{theme_id}"
433
+ return f"theme:{theme_id}:{tint}"
434
+ if color_type == "indexed":
435
+ indexed = getattr(color, "indexed", None)
436
+ if indexed is not None:
437
+ return f"indexed:{indexed}"
438
+ if color_type == "auto":
439
+ auto = getattr(color, "auto", None)
440
+ return "auto" if auto is None else f"auto:{auto}"
441
+ return None
442
+
443
+
444
+ def _normalize_rgb(rgb: str) -> str:
445
+ """Normalize an RGB/ARGB string into 6-hex format.
446
+
447
+ Args:
448
+ rgb: Raw RGB/ARGB string from openpyxl.
449
+
450
+ Returns:
451
+ Normalized RGB hex string (uppercase, 6 chars when possible).
452
+ """
453
+ cleaned = rgb.strip().upper()
454
+ if cleaned.startswith("0X"):
455
+ cleaned = cleaned[2:]
456
+ if len(cleaned) == 8:
457
+ cleaned = cleaned[2:]
458
+ return cleaned
37
459
 
38
460
 
39
461
  def warn_once(key: str, message: str) -> None:
@@ -76,21 +498,21 @@ def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]:
76
498
  - Links are mapped by column index string (e.g., "0") to hyperlink.target.
77
499
  """
78
500
  cell_rows = extract_sheet_cells(file_path)
79
- wb = load_workbook(file_path, data_only=True, read_only=False)
80
501
  links_by_sheet: dict[str, dict[int, dict[str, str]]] = {}
81
- for ws in wb.worksheets:
82
- sheet_links: dict[int, dict[str, str]] = {}
83
- for row in ws.iter_rows():
84
- for cell in row:
85
- link = getattr(cell, "hyperlink", None)
86
- target = getattr(link, "target", None) if link else None
87
- if not target:
88
- continue
89
- col_str = str(
90
- cell.col_idx - 1
91
- ) # zero-based to align with extract_sheet_cells
92
- sheet_links.setdefault(cell.row, {})[col_str] = target
93
- links_by_sheet[ws.title] = sheet_links
502
+ with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb:
503
+ for ws in wb.worksheets:
504
+ sheet_links: dict[int, dict[str, str]] = {}
505
+ for row in ws.iter_rows():
506
+ for cell in row:
507
+ link = getattr(cell, "hyperlink", None)
508
+ target = getattr(link, "target", None) if link else None
509
+ if not target:
510
+ continue
511
+ col_str = str(
512
+ cell.col_idx - 1
513
+ ) # zero-based to align with extract_sheet_cells
514
+ sheet_links.setdefault(cell.row, {})[col_str] = target
515
+ links_by_sheet[ws.title] = sheet_links
94
516
 
95
517
  merged: dict[str, list[CellRow]] = {}
96
518
  for sheet_name, rows in cell_rows.items():
@@ -104,32 +526,67 @@ def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]:
104
526
  return merged
105
527
 
106
528
 
107
- def shrink_to_content( # noqa: C901
108
- sheet: xw.Sheet,
109
- top: int,
110
- left: int,
111
- bottom: int,
112
- right: int,
113
- require_inside_border: bool = False,
114
- min_nonempty_ratio: float = 0.0,
115
- ) -> tuple[int, int, int, int]:
116
- """Trim a rectangle based on cell contents and optional border heuristics."""
529
def extract_sheet_merged_cells(file_path: Path) -> dict[str, list[MergedCell]]:
    """Collect the merged cell ranges of every worksheet via openpyxl.

    Each range becomes a MergedCell whose rows are taken as reported, whose
    columns are shifted down by one, and whose value is the string form of the
    range's top-left cell (empty string when that cell has no value).

    Args:
        file_path: Excel workbook path.

    Returns:
        Mapping of sheet name to merged cell ranges; sheets without merged
        cells map to an empty list.
    """
    merged_by_sheet: dict[str, list[MergedCell]] = {}
    with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb:
        for ws in wb.worksheets:
            container = getattr(ws, "merged_cells", None)
            ranges = () if container is None else getattr(container, "ranges", [])
            sheet_results: list[MergedCell] = []
            for merged_range in ranges:
                min_col, min_row, max_col, max_row = range_boundaries(
                    str(merged_range)
                )
                anchor_value = ws.cell(row=min_row, column=min_col).value
                sheet_results.append(
                    MergedCell(
                        r1=min_row,
                        c1=min_col - 1,
                        r2=max_row,
                        c2=max_col - 1,
                        v="" if anchor_value is None else str(anchor_value),
                    )
                )
            merged_by_sheet[ws.title] = sheet_results
    return merged_by_sheet
562
+
563
+
564
+ def shrink_to_content( # noqa: C901
565
+ sheet: xw.Sheet,
566
+ top: int,
567
+ left: int,
568
+ bottom: int,
569
+ right: int,
570
+ require_inside_border: bool = False,
571
+ min_nonempty_ratio: float = 0.0,
572
+ ) -> tuple[int, int, int, int]:
573
+ """Trim a rectangle based on cell contents and optional border heuristics."""
117
574
  rng = sheet.range((top, left), (bottom, right))
118
575
  vals = rng.value
119
576
  if vals is None:
120
577
  vals = []
121
578
  if not isinstance(vals, list):
122
579
  vals = [[vals]]
123
- elif vals and not isinstance(vals[0], list):
124
- vals = [vals]
125
- rows_n = len(vals)
126
- cols_n = len(vals[0]) if rows_n else 0
127
-
128
- def to_str(x: object) -> str:
129
- return "" if x is None else str(x)
130
-
131
- def is_empty_value(x: object) -> bool:
132
- return to_str(x).strip() == ""
580
+ elif vals and not isinstance(vals[0], list):
581
+ vals = [vals]
582
+ rows_n = len(vals)
583
+ cols_n = len(vals[0]) if rows_n else 0
584
+
585
+ def to_str(x: object) -> str:
586
+ return "" if x is None else str(x)
587
+
588
+ def is_empty_value(x: object) -> bool:
589
+ return to_str(x).strip() == ""
133
590
 
134
591
  def row_empty(i: int) -> bool:
135
592
  return cols_n == 0 or all(is_empty_value(vals[i][j]) for j in range(cols_n))
@@ -149,11 +606,11 @@ def shrink_to_content( # noqa: C901
149
606
  cnt = sum(1 for i in range(rows_n) if not is_empty_value(vals[i][j]))
150
607
  return cnt / rows_n
151
608
 
152
- def column_has_inside_border(col_idx: int) -> bool:
153
- if not require_inside_border:
154
- return False
155
- try:
156
- for r in range(top, bottom + 1):
609
+ def column_has_inside_border(col_idx: int) -> bool:
610
+ if not require_inside_border:
611
+ return False
612
+ try:
613
+ for r in range(top, bottom + 1):
157
614
  ls = (
158
615
  sheet.api.Cells(r, left + col_idx)
159
616
  .Borders(XL_INSIDE_VERTICAL)
@@ -231,22 +688,28 @@ def shrink_to_content( # noqa: C901
231
688
  bottom -= 1
232
689
  else:
233
690
  break
234
- return top, left, bottom, right
235
-
236
-
237
- def load_border_maps_xlsx( # noqa: C901
238
- xlsx_path: Path, sheet_name: str
239
- ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int]:
240
- wb = load_workbook(xlsx_path, data_only=True, read_only=False)
241
- if sheet_name not in wb.sheetnames:
242
- wb.close()
243
- raise KeyError(f"Sheet '{sheet_name}' not found in {xlsx_path}")
244
-
245
- ws = wb[sheet_name]
246
- try:
247
- min_col, min_row, max_col, max_row = range_boundaries(ws.calculate_dimension())
248
- except Exception:
249
- min_col, min_row, max_col, max_row = 1, 1, ws.max_column or 1, ws.max_row or 1
691
+ return top, left, bottom, right
692
+
693
+
694
+ def load_border_maps_xlsx( # noqa: C901
695
+ xlsx_path: Path, sheet_name: str
696
+ ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int]:
697
+ with openpyxl_workbook(xlsx_path, data_only=True, read_only=False) as wb:
698
+ if sheet_name not in wb.sheetnames:
699
+ raise KeyError(f"Sheet '{sheet_name}' not found in {xlsx_path}")
700
+
701
+ ws = wb[sheet_name]
702
+ try:
703
+ min_col, min_row, max_col, max_row = range_boundaries(
704
+ ws.calculate_dimension()
705
+ )
706
+ except Exception:
707
+ min_col, min_row, max_col, max_row = (
708
+ 1,
709
+ 1,
710
+ ws.max_column or 1,
711
+ ws.max_row or 1,
712
+ )
250
713
 
251
714
  shape = (max_row + 1, max_col + 1)
252
715
  has_border = np.zeros(shape, dtype=bool)
@@ -255,11 +718,11 @@ def load_border_maps_xlsx( # noqa: C901
255
718
  left_edge = np.zeros(shape, dtype=bool)
256
719
  right_edge = np.zeros(shape, dtype=bool)
257
720
 
258
- def edge_has_style(edge: object) -> bool:
259
- if edge is None:
260
- return False
261
- style = getattr(edge, "style", None)
262
- return style is not None and style != "none"
721
+ def edge_has_style(edge: object) -> bool:
722
+ if edge is None:
723
+ return False
724
+ style = getattr(edge, "style", None)
725
+ return style is not None and style != "none"
263
726
 
264
727
  for r in range(min_row, max_row + 1):
265
728
  for c in range(min_col, max_col + 1):
@@ -270,134 +733,133 @@ def load_border_maps_xlsx( # noqa: C901
270
733
 
271
734
  t = edge_has_style(b.top)
272
735
  btm = edge_has_style(b.bottom)
273
- left_border = edge_has_style(b.left)
274
- rgt = edge_has_style(b.right)
275
-
276
- if t or btm or left_border or rgt:
277
- has_border[r, c] = True
278
- if t:
279
- top_edge[r, c] = True
280
- if btm:
281
- bottom_edge[r, c] = True
282
- if left_border:
283
- left_edge[r, c] = True
284
- if rgt:
285
- right_edge[r, c] = True
736
+ left_border = edge_has_style(b.left)
737
+ rgt = edge_has_style(b.right)
738
+
739
+ if t or btm or left_border or rgt:
740
+ has_border[r, c] = True
741
+ if t:
742
+ top_edge[r, c] = True
743
+ if btm:
744
+ bottom_edge[r, c] = True
745
+ if left_border:
746
+ left_edge[r, c] = True
747
+ if rgt:
748
+ right_edge[r, c] = True
286
749
 
287
- wb.close()
288
750
  return has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col
289
751
 
290
752
 
291
- def _detect_border_clusters_numpy(
292
- has_border: np.ndarray, min_size: int
293
- ) -> list[tuple[int, int, int, int]]:
294
- from scipy.ndimage import label
295
-
296
- structure = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
297
- lbl, num = label(has_border.astype(np.uint8), structure=structure)
298
- rects: list[tuple[int, int, int, int]] = []
299
- for k in range(1, int(num) + 1):
300
- ys, xs = np.where(lbl == k)
301
- if int(len(ys)) < min_size:
302
- continue
303
- rects.append((int(ys.min()), int(xs.min()), int(ys.max()), int(xs.max())))
304
- return rects
305
-
306
-
307
- def _detect_border_clusters_python(
308
- has_border: np.ndarray, min_size: int
309
- ) -> list[tuple[int, int, int, int]]:
310
- h, w = has_border.shape
311
- visited = np.zeros_like(has_border, dtype=bool)
312
- rects: list[tuple[int, int, int, int]] = []
313
- for r in range(h):
314
- for c in range(w):
315
- if not has_border[r, c] or visited[r, c]:
316
- continue
317
- q = deque([(r, c)])
318
- visited[r, c] = True
319
- ys = [r]
320
- xs = [c]
321
- while q:
322
- yy, xx = q.popleft()
323
- for dy, dx in ((1, 0), (-1, 0), (0, 1), (0, -1)):
324
- ny, nx = yy + dy, xx + dx
325
- if (
326
- 0 <= ny < h
327
- and 0 <= nx < w
328
- and has_border[ny, nx]
329
- and not visited[ny, nx]
330
- ):
331
- visited[ny, nx] = True
332
- q.append((ny, nx))
333
- ys.append(ny)
334
- xs.append(nx)
335
- if len(ys) >= min_size:
336
- rects.append((min(ys), min(xs), max(ys), max(xs)))
337
- return rects
338
-
339
-
340
- def detect_border_clusters(
341
- has_border: np.ndarray, min_size: int = 4
342
- ) -> list[tuple[int, int, int, int]]:
343
- try:
344
- return _detect_border_clusters_numpy(has_border, min_size)
345
- except Exception:
346
- warn_once(
347
- "scipy-missing",
348
- "scipy is not available. Falling back to pure-Python BFS for connected components, which may be significantly slower.",
349
- )
350
- return _detect_border_clusters_python(has_border, min_size)
351
-
352
-
353
- def _get_values_block(
354
- ws: Worksheet, top: int, left: int, bottom: int, right: int
355
- ) -> list[list[object]]:
356
- vals: list[list[object]] = []
357
- for row in ws.iter_rows(
358
- min_row=top, max_row=bottom, min_col=left, max_col=right, values_only=True
359
- ):
360
- vals.append(list(row))
361
- return vals
362
-
363
-
364
- def _ensure_matrix(matrix: MatrixInput) -> list[list[object]]:
365
- rows_seq = list(matrix)
366
- if not rows_seq:
367
- return []
368
- first = rows_seq[0]
369
- if isinstance(first, Sequence) and not isinstance(first, (str, bytes, bytearray)):
370
- normalized: list[list[object]] = []
371
- for row in rows_seq:
372
- if isinstance(row, Sequence) and not isinstance(
373
- row, (str, bytes, bytearray)
374
- ):
375
- normalized.append(list(row))
376
- else:
377
- normalized.append([row])
378
- return normalized
379
- return [list(rows_seq)]
380
-
381
-
382
- def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
383
- """
384
- Given a 2D matrix (list of rows), return (density, coverage).
385
- density: nonempty / total cells.
386
- coverage: area of tight bounding box of nonempty cells divided by total area.
387
- """
388
- normalized = _ensure_matrix(matrix)
389
- if not normalized:
390
- return 0.0, 0.0
391
- rows = len(normalized)
392
- cols = len(normalized[0]) if rows else 0
393
- if rows == 0 or cols == 0:
394
- return 0.0, 0.0
395
-
396
- nonempty_coords = []
397
- for i, row in enumerate(normalized):
398
- for j, v in enumerate(row):
399
- if not (v is None or str(v).strip() == ""):
400
- nonempty_coords.append((i, j))
753
+ def _detect_border_clusters_numpy(
754
+ has_border: np.ndarray, min_size: int
755
+ ) -> list[tuple[int, int, int, int]]:
756
+ from scipy.ndimage import label
757
+
758
+ structure = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
759
+ lbl, num = label(has_border.astype(np.uint8), structure=structure)
760
+ rects: list[tuple[int, int, int, int]] = []
761
+ for k in range(1, int(num) + 1):
762
+ ys, xs = np.where(lbl == k)
763
+ if int(len(ys)) < min_size:
764
+ continue
765
+ rects.append((int(ys.min()), int(xs.min()), int(ys.max()), int(xs.max())))
766
+ return rects
767
+
768
+
769
+ def _detect_border_clusters_python(
770
+ has_border: np.ndarray, min_size: int
771
+ ) -> list[tuple[int, int, int, int]]:
772
+ h, w = has_border.shape
773
+ visited = np.zeros_like(has_border, dtype=bool)
774
+ rects: list[tuple[int, int, int, int]] = []
775
+ for r in range(h):
776
+ for c in range(w):
777
+ if not has_border[r, c] or visited[r, c]:
778
+ continue
779
+ q = deque([(r, c)])
780
+ visited[r, c] = True
781
+ ys = [r]
782
+ xs = [c]
783
+ while q:
784
+ yy, xx = q.popleft()
785
+ for dy, dx in ((1, 0), (-1, 0), (0, 1), (0, -1)):
786
+ ny, nx = yy + dy, xx + dx
787
+ if (
788
+ 0 <= ny < h
789
+ and 0 <= nx < w
790
+ and has_border[ny, nx]
791
+ and not visited[ny, nx]
792
+ ):
793
+ visited[ny, nx] = True
794
+ q.append((ny, nx))
795
+ ys.append(ny)
796
+ xs.append(nx)
797
+ if len(ys) >= min_size:
798
+ rects.append((min(ys), min(xs), max(ys), max(xs)))
799
+ return rects
800
+
801
+
802
def detect_border_clusters(
    has_border: np.ndarray, min_size: int = 4
) -> list[tuple[int, int, int, int]]:
    """Detect bounding boxes of connected bordered regions.

    The scipy-based implementation is tried first. If it raises for any
    reason, a one-time warning is issued and the pure-Python BFS is used.

    Args:
        has_border: 2D boolean mask of cells that carry a border.
        min_size: Minimum number of cells a region needs to be reported.

    Returns:
        List of ``(min_row, min_col, max_row, max_col)`` boxes.
    """
    try:
        clusters = _detect_border_clusters_numpy(has_border, min_size)
    except Exception:
        warn_once(
            "scipy-missing",
            "scipy is not available. Falling back to pure-Python BFS for connected components, which may be significantly slower.",
        )
        clusters = _detect_border_clusters_python(has_border, min_size)
    return clusters
813
+
814
+
815
+ def _get_values_block(
816
+ ws: Worksheet, top: int, left: int, bottom: int, right: int
817
+ ) -> list[list[object]]:
818
+ vals: list[list[object]] = []
819
+ for row in ws.iter_rows(
820
+ min_row=top, max_row=bottom, min_col=left, max_col=right, values_only=True
821
+ ):
822
+ vals.append(list(row))
823
+ return vals
824
+
825
+
826
+ def _ensure_matrix(matrix: MatrixInput) -> list[list[object]]:
827
+ rows_seq = list(matrix)
828
+ if not rows_seq:
829
+ return []
830
+ first = rows_seq[0]
831
+ if isinstance(first, Sequence) and not isinstance(first, str | bytes | bytearray):
832
+ normalized: list[list[object]] = []
833
+ for row in rows_seq:
834
+ if isinstance(row, Sequence) and not isinstance(
835
+ row, str | bytes | bytearray
836
+ ):
837
+ normalized.append(list(row))
838
+ else:
839
+ normalized.append([row])
840
+ return normalized
841
+ return [list(rows_seq)]
842
+
843
+
844
+ def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
845
+ """
846
+ Given a 2D matrix (list of rows), return (density, coverage).
847
+ density: nonempty / total cells.
848
+ coverage: area of tight bounding box of nonempty cells divided by total area.
849
+ """
850
+ normalized = _ensure_matrix(matrix)
851
+ if not normalized:
852
+ return 0.0, 0.0
853
+ rows = len(normalized)
854
+ cols = len(normalized[0]) if rows else 0
855
+ if rows == 0 or cols == 0:
856
+ return 0.0, 0.0
857
+
858
+ nonempty_coords = []
859
+ for i, row in enumerate(normalized):
860
+ for j, v in enumerate(row):
861
+ if not (v is None or str(v).strip() == ""):
862
+ nonempty_coords.append((i, j))
401
863
 
402
864
  total = rows * cols
403
865
  if not nonempty_coords:
@@ -414,27 +876,29 @@ def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
414
876
  return density, coverage
415
877
 
416
878
 
417
- def _is_plausible_table(matrix: MatrixInput) -> bool:
418
- """
419
- Heuristic: require at least 2 rows and 2 cols with meaningful data.
420
- - At least 2 rows have 2 以上の非空セル
421
- - At least 2 columns have 2 以上の非空セル
422
- """
423
- normalized = _ensure_matrix(matrix)
424
- if not normalized:
425
- return False
426
-
427
- rows = len(normalized)
428
- cols = max((len(r) if isinstance(r, list) else 1) for r in normalized) if rows else 0
429
- if rows < 2 or cols < 2:
430
- return False
431
-
432
- row_counts: list[int] = []
433
- col_counts = [0] * cols
434
- for r in normalized:
435
- cnt = 0
436
- for j in range(cols):
437
- v = r[j] if j < len(r) else None
879
+ def _is_plausible_table(matrix: MatrixInput) -> bool:
880
+ """
881
+ Heuristic: require at least 2 rows and 2 cols with meaningful data.
882
+ - At least 2 rows have 2 以上の非空セル
883
+ - At least 2 columns have 2 以上の非空セル
884
+ """
885
+ normalized = _ensure_matrix(matrix)
886
+ if not normalized:
887
+ return False
888
+
889
+ rows = len(normalized)
890
+ cols = (
891
+ max((len(r) if isinstance(r, list) else 1) for r in normalized) if rows else 0
892
+ )
893
+ if rows < 2 or cols < 2:
894
+ return False
895
+
896
+ row_counts: list[int] = []
897
+ col_counts = [0] * cols
898
+ for r in normalized:
899
+ cnt = 0
900
+ for j in range(cols):
901
+ v = r[j] if j < len(r) else None
438
902
  if not (v is None or str(v).strip() == ""):
439
903
  cnt += 1
440
904
  col_counts[j] += 1
@@ -445,7 +909,9 @@ def _is_plausible_table(matrix: MatrixInput) -> bool:
445
909
  return rows_with_two >= 2 and cols_with_two >= 2
446
910
 
447
911
 
448
- def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, int, int, int]]:
912
+ def _nonempty_clusters(
913
+ matrix: Sequence[Sequence[object]],
914
+ ) -> list[tuple[int, int, int, int]]:
449
915
  """Return bounding boxes of connected components of nonempty cells (4-neighbor)."""
450
916
  if not matrix:
451
917
  return []
@@ -460,12 +926,12 @@ def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, in
460
926
  visited = [[False] * cols for _ in range(rows)]
461
927
  boxes: list[tuple[int, int, int, int]] = []
462
928
 
463
- def bfs(sr: int, sc: int) -> tuple[int, int, int, int]:
464
- q = deque([(sr, sc)])
465
- visited[sr][sc] = True
466
- ys = [sr]
467
- xs = [sc]
468
- while q:
929
+ def bfs(sr: int, sc: int) -> tuple[int, int, int, int]:
930
+ q = deque([(sr, sc)])
931
+ visited[sr][sc] = True
932
+ ys = [sr]
933
+ xs = [sc]
934
+ while q:
469
935
  r, c = q.popleft()
470
936
  for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
471
937
  nr, nc = r + dr, c + dc
@@ -488,17 +954,17 @@ def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, in
488
954
  return boxes
489
955
 
490
956
 
491
- def _normalize_matrix(matrix: object) -> list[list[object]]:
492
- if matrix is None:
493
- return []
494
- if isinstance(matrix, list):
495
- return _ensure_matrix(matrix)
496
- if isinstance(matrix, Sequence) and not isinstance(matrix, (str, bytes, bytearray)):
497
- return _ensure_matrix(matrix)
498
- return [[matrix]]
957
+ def _normalize_matrix(matrix: object) -> list[list[object]]:
958
+ if matrix is None:
959
+ return []
960
+ if isinstance(matrix, list):
961
+ return _ensure_matrix(matrix)
962
+ if isinstance(matrix, Sequence) and not isinstance(matrix, str | bytes | bytearray):
963
+ return _ensure_matrix(matrix)
964
+ return [[matrix]]
499
965
 
500
966
 
501
- def _header_like_row(row: list[object]) -> bool:
967
+ def _header_like_row(row: list[object]) -> bool:
502
968
  nonempty = [v for v in row if not (v is None or str(v).strip() == "")]
503
969
  if len(nonempty) < 2:
504
970
  return False
@@ -513,19 +979,21 @@ def _header_like_row(row: list[object]) -> bool:
513
979
  return str_like >= num_like and str_like >= 1
514
980
 
515
981
 
516
- def _table_signal_score(matrix: Sequence[Sequence[object]]) -> float:
517
- normalized = _ensure_matrix(matrix)
518
- density, coverage = _table_density_metrics(normalized)
519
- header = any(_header_like_row(r) for r in normalized[:2]) # check first 2 rows
982
+ def _table_signal_score(matrix: Sequence[Sequence[object]]) -> float:
983
+ normalized = _ensure_matrix(matrix)
984
+ density, coverage = _table_density_metrics(normalized)
985
+ header = any(_header_like_row(r) for r in normalized[:2]) # check first 2 rows
520
986
 
521
- rows = len(normalized)
522
- cols = max((len(r) if isinstance(r, list) else 1) for r in normalized) if rows else 0
523
- row_counts: list[int] = []
524
- col_counts = [0] * cols if cols else []
525
- for r in normalized:
526
- cnt = 0
527
- for j in range(cols):
528
- v = r[j] if j < len(r) else None
987
+ rows = len(normalized)
988
+ cols = (
989
+ max((len(r) if isinstance(r, list) else 1) for r in normalized) if rows else 0
990
+ )
991
+ row_counts: list[int] = []
992
+ col_counts = [0] * cols if cols else []
993
+ for r in normalized:
994
+ cnt = 0
995
+ for j in range(cols):
996
+ v = r[j] if j < len(r) else None
529
997
  if not (v is None or str(v).strip() == ""):
530
998
  cnt += 1
531
999
  if j < len(col_counts):
@@ -565,28 +1033,28 @@ def set_table_detection_params(
565
1033
  _DETECTION_CONFIG["min_nonempty_cells"] = min_nonempty_cells
566
1034
 
567
1035
 
568
- def shrink_to_content_openpyxl( # noqa: C901
569
- ws: Worksheet,
570
- top: int,
571
- left: int,
572
- bottom: int,
573
- right: int,
574
- require_inside_border: bool,
575
- top_edge: np.ndarray,
576
- bottom_edge: np.ndarray,
577
- left_edge: np.ndarray,
578
- right_edge: np.ndarray,
579
- min_nonempty_ratio: float = 0.0,
580
- ) -> tuple[int, int, int, int]:
581
- vals = _get_values_block(ws, top, left, bottom, right)
582
- rows_n = bottom - top + 1
583
- cols_n = right - left + 1
584
-
585
- def to_str(x: object) -> str:
586
- return "" if x is None else str(x)
587
-
588
- def is_empty_value(x: object) -> bool:
589
- return to_str(x).strip() == ""
1036
+ def shrink_to_content_openpyxl( # noqa: C901
1037
+ ws: Worksheet,
1038
+ top: int,
1039
+ left: int,
1040
+ bottom: int,
1041
+ right: int,
1042
+ require_inside_border: bool,
1043
+ top_edge: np.ndarray,
1044
+ bottom_edge: np.ndarray,
1045
+ left_edge: np.ndarray,
1046
+ right_edge: np.ndarray,
1047
+ min_nonempty_ratio: float = 0.0,
1048
+ ) -> tuple[int, int, int, int]:
1049
+ vals = _get_values_block(ws, top, left, bottom, right)
1050
+ rows_n = bottom - top + 1
1051
+ cols_n = right - left + 1
1052
+
1053
+ def to_str(x: object) -> str:
1054
+ return "" if x is None else str(x)
1055
+
1056
+ def is_empty_value(x: object) -> bool:
1057
+ return to_str(x).strip() == ""
590
1058
 
591
1059
  def row_nonempty_ratio_local(i: int) -> float:
592
1060
  if cols_n <= 0:
@@ -725,29 +1193,45 @@ def shrink_to_content_openpyxl( # noqa: C901
725
1193
  return top, left, bottom, right
726
1194
 
727
1195
 
728
- def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]: # noqa: C901
729
- """Detect table-like ranges via COM: ListObjects first, then border clusters."""
730
- tables: list[str] = []
1196
+ def _extract_listobject_tables(sheet: xw.Sheet) -> list[str]:
1197
+ """Extract table ranges from Excel ListObjects via COM.
1198
+
1199
+ Args:
1200
+ sheet: xlwings worksheet.
1201
+
1202
+ Returns:
1203
+ List of table ranges as Excel A1 strings.
1204
+ """
1205
+ tables: list[str] = []
731
1206
  try:
732
1207
  for lo in sheet.api.ListObjects:
733
1208
  rng = lo.Range
734
- top_row = int(rng.Row)
735
- left_col = int(rng.Column)
736
- bottom_row = top_row + int(rng.Rows.Count) - 1
737
- right_col = left_col + int(rng.Columns.Count) - 1
738
1209
  addr = rng.Address(RowAbsolute=False, ColumnAbsolute=False)
739
1210
  tables.append(addr)
740
- except Exception:
741
- pass
742
-
743
- used = sheet.used_range
744
- max_row = used.last_cell.row
745
- max_col = used.last_cell.column
746
-
747
- def cell_has_any_border(r: int, c: int) -> bool:
748
- try:
749
- b = sheet.api.Cells(r, c).Borders
750
- for idx in (
1211
+ except Exception:
1212
+ pass
1213
+ return tables
1214
+
1215
+
1216
+ def _detect_border_rectangles_xlwings(
1217
+ sheet: xw.Sheet,
1218
+ ) -> list[tuple[int, int, int, int]]:
1219
+ """Detect bordered rectangles in a sheet using COM border inspection.
1220
+
1221
+ Args:
1222
+ sheet: xlwings worksheet.
1223
+
1224
+ Returns:
1225
+ List of rectangles as (top_row, left_col, bottom_row, right_col).
1226
+ """
1227
+ used = sheet.used_range
1228
+ max_row = used.last_cell.row
1229
+ max_col = used.last_cell.column
1230
+
1231
+ def cell_has_any_border(r: int, c: int) -> bool:
1232
+ try:
1233
+ b = sheet.api.Cells(r, c).Borders
1234
+ for idx in (
751
1235
  XL_EDGE_LEFT,
752
1236
  XL_EDGE_TOP,
753
1237
  XL_EDGE_RIGHT,
@@ -768,211 +1252,243 @@ def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]: # noqa: C901
768
1252
  return False
769
1253
 
770
1254
  grid = [[False] * (max_col + 1) for _ in range(max_row + 1)]
771
- for r in range(1, max_row + 1):
772
- for c in range(1, max_col + 1):
773
- if cell_has_any_border(r, c):
774
- grid[r][c] = True
775
- visited = [[False] * (max_col + 1) for _ in range(max_row + 1)]
776
-
777
- def dfs(sr: int, sc: int, acc: list[tuple[int, int]]) -> None:
778
- stack = [(sr, sc)]
779
- while stack:
780
- rr, cc = stack.pop()
781
- if not (1 <= rr <= max_row and 1 <= cc <= max_col):
782
- continue
783
- if visited[rr][cc] or not grid[rr][cc]:
784
- continue
785
- visited[rr][cc] = True
786
- acc.append((rr, cc))
787
- for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
788
- stack.append((rr + dr, cc + dc))
789
-
790
- clusters: list[tuple[int, int, int, int]] = []
791
1255
  for r in range(1, max_row + 1):
792
1256
  for c in range(1, max_col + 1):
793
- if grid[r][c] and not visited[r][c]:
794
- cluster: list[tuple[int, int]] = []
795
- dfs(r, c, cluster)
796
- if len(cluster) < 4:
797
- continue
798
- rows = [rc[0] for rc in cluster]
799
- cols = [rc[1] for rc in cluster]
800
- top_row = min(rows)
801
- bottom_row = max(rows)
802
- left_col = min(cols)
803
- right_col = max(cols)
804
- clusters.append((top_row, left_col, bottom_row, right_col))
805
-
806
- def overlaps_for_merge(
807
- a: tuple[int, int, int, int], b: tuple[int, int, int, int]
808
- ) -> bool:
809
- # Do not merge if one rect fully contains the other (separate clusters like big frame vs small table)
810
- contains = (
811
- a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
812
- ) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
813
- if contains:
814
- return False
815
- return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
1257
+ if cell_has_any_border(r, c):
1258
+ grid[r][c] = True
1259
+ return _detect_border_rectangles(grid, min_size=4)
1260
+
1261
+
1262
def _detect_border_rectangles(
    has_border: np.ndarray | Sequence[Sequence[bool]], *, min_size: int
) -> list[tuple[int, int, int, int]]:
    """Find border rectangles in a boolean border-presence grid.

    Args:
        has_border: Boolean grid of border presence (array or nested sequences).
        min_size: Minimum cluster size to keep.

    Returns:
        List of rectangles as (top_row, left_col, bottom_row, right_col).
    """
    # Convert to a boolean ndarray so both input forms reach the detector
    # in the same shape.
    border_mask = np.asarray(has_border, dtype=bool)
    return detect_border_clusters(border_mask, min_size=min_size)
1275
+
1276
+
1277
+ def _merge_rectangles(
1278
+ rects: Sequence[tuple[int, int, int, int]],
1279
+ ) -> list[tuple[int, int, int, int]]:
1280
+ """Merge overlapping rectangles while preserving contained regions.
1281
+
1282
+ Args:
1283
+ rects: Sequence of rectangles (top, left, bottom, right).
1284
+
1285
+ Returns:
1286
+ Merged rectangles sorted by coordinates.
1287
+ """
817
1288
  merged_rects: list[tuple[int, int, int, int]] = []
818
- for rect in sorted(clusters):
1289
+ for rect in sorted(rects):
819
1290
  merged = False
820
- for i, ex in enumerate(merged_rects):
821
- if overlaps_for_merge(rect, ex):
1291
+ for i, existing in enumerate(merged_rects):
1292
+ if _rectangles_overlap_for_merge(rect, existing):
822
1293
  merged_rects[i] = (
823
- min(rect[0], ex[0]),
824
- min(rect[1], ex[1]),
825
- max(rect[2], ex[2]),
826
- max(rect[3], ex[3]),
1294
+ min(rect[0], existing[0]),
1295
+ min(rect[1], existing[1]),
1296
+ max(rect[2], existing[2]),
1297
+ max(rect[3], existing[3]),
827
1298
  )
828
1299
  merged = True
829
1300
  break
830
1301
  if not merged:
831
1302
  merged_rects.append(rect)
1303
+ return merged_rects
832
1304
 
833
- dedup: set[str] = set()
834
- for top_row, left_col, bottom_row, right_col in merged_rects:
835
- top_row, left_col, bottom_row, right_col = shrink_to_content(
836
- sheet, top_row, left_col, bottom_row, right_col, require_inside_border=False
837
- )
838
- try:
839
- rng_vals = sheet.range((top_row, left_col), (bottom_row, right_col)).value
840
- rng_vals = _normalize_matrix(rng_vals)
841
- nonempty = sum(
842
- 1
843
- for row in rng_vals
844
- for v in (row if isinstance(row, list) else [row])
845
- if not (v is None or str(v).strip() == "")
846
- )
847
- except Exception:
848
- nonempty = 0
849
- if nonempty < _DETECTION_CONFIG["min_nonempty_cells"]:
1305
+
1306
+ def _rectangles_overlap_for_merge(
1307
+ a: tuple[int, int, int, int], b: tuple[int, int, int, int]
1308
+ ) -> bool:
1309
+ """Return True when rectangles should be merged.
1310
+
1311
+ Args:
1312
+ a: First rectangle (top, left, bottom, right).
1313
+ b: Second rectangle (top, left, bottom, right).
1314
+
1315
+ Returns:
1316
+ True if rectangles overlap and neither fully contains the other.
1317
+ """
1318
+ contains = (a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]) or (
1319
+ b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3]
1320
+ )
1321
+ if contains:
1322
+ return False
1323
+ return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
1324
+
1325
+
1326
def _collect_table_candidates_from_values(
    values: Sequence[Sequence[object]],
    *,
    base_top: int,
    base_left: int,
    col_name: Callable[[int], str],
) -> list[str]:
    """Collect table candidates from a normalized value matrix.

    Args:
        values: Normalized matrix of cell values.
        base_top: Top row index of the matrix in worksheet coordinates (1-based).
        base_left: Left column index of the matrix in worksheet coordinates (1-based).
        col_name: Function to convert column index to Excel letters.

    Returns:
        List of detected table candidate range strings.
    """
    grid = [list(row) for row in values]
    if _count_nonempty_cells(grid) < _DETECTION_CONFIG["min_nonempty_cells"]:
        return []

    found: list[str] = []
    for r0, c0, r1, c1 in _nonempty_clusters(grid):
        sub = [row[c0 : c1 + 1] for row in grid[r0 : r1 + 1]]
        density, coverage = _table_density_metrics(sub)
        # Rejected only when BOTH metrics fall below their minimums.
        too_sparse = (
            density < _DETECTION_CONFIG["density_min"]
            and coverage < _DETECTION_CONFIG["coverage_min"]
        )
        if too_sparse:
            continue
        if not _is_plausible_table(sub):
            continue
        if _table_signal_score(sub) < _DETECTION_CONFIG["table_score_threshold"]:
            continue
        # Cluster offsets are matrix-relative; shift into worksheet coordinates.
        top_left = f"{col_name(base_left + c0)}{base_top + r0}"
        bottom_right = f"{col_name(base_left + c1)}{base_top + r1}"
        found.append(f"{top_left}:{bottom_right}")
    return found
1370
+
1371
+
1372
+ def _count_nonempty_cells(values: Sequence[Sequence[object]]) -> int:
1373
+ """Count non-empty cells in a normalized matrix.
870
1374
 
1375
+ Args:
1376
+ values: Normalized matrix of values.
871
1377
 
872
- def detect_tables_openpyxl( # noqa: C901
873
- xlsx_path: Path, sheet_name: str
874
- ) -> list[str]:
875
- wb = load_workbook(
876
- xlsx_path,
877
- data_only=True,
878
- read_only=False,
1378
+ Returns:
1379
+ Number of non-empty cells.
1380
+ """
1381
+ return sum(
1382
+ 1 for row in values for v in row if not (v is None or str(v).strip() == "")
879
1383
  )
880
- ws = wb[sheet_name]
881
- tables: list[str] = []
882
- try:
883
- openpyxl_tables: list[object] = []
884
- if hasattr(ws, "tables") and ws.tables:
885
- if isinstance(ws.tables, dict):
886
- openpyxl_tables = list(ws.tables.values())
887
- else:
888
- openpyxl_tables = list(ws.tables)
889
- elif hasattr(ws, "_tables") and ws._tables:
890
- openpyxl_tables = list(ws._tables)
891
- for t in openpyxl_tables:
892
- addr = getattr(t, "ref", None)
893
- if addr:
894
- tables.append(str(addr))
1384
+
1385
+
1386
+ def _extract_openpyxl_table_refs(ws: Worksheet) -> list[str]:
1387
+ """Extract table reference strings from an openpyxl worksheet.
1388
+
1389
+ Args:
1390
+ ws: Target worksheet.
1391
+
1392
+ Returns:
1393
+ List of table reference strings.
1394
+ """
1395
+ tables: list[str] = []
1396
+ try:
1397
+ openpyxl_tables: list[object] = []
1398
+ if hasattr(ws, "tables") and ws.tables:
1399
+ if isinstance(ws.tables, dict):
1400
+ openpyxl_tables = list(ws.tables.values())
1401
+ else:
1402
+ openpyxl_tables = list(ws.tables)
1403
+ elif hasattr(ws, "_tables") and ws._tables:
1404
+ openpyxl_tables = list(ws._tables)
1405
+ for t in openpyxl_tables:
1406
+ addr = getattr(t, "ref", None)
1407
+ if addr:
1408
+ tables.append(str(addr))
895
1409
  except Exception:
896
1410
  pass
1411
+ return tables
897
1412
 
898
- has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col = (
899
- load_border_maps_xlsx(xlsx_path, sheet_name)
900
- )
901
- rects = detect_border_clusters(has_border, min_size=4)
902
-
903
- def overlaps_for_merge(
904
- a: tuple[int, int, int, int], b: tuple[int, int, int, int]
905
- ) -> bool:
906
- contains = (
907
- a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
908
- ) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
909
- if contains:
910
- return False
911
- return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
912
1413
 
913
- merged_rects: list[tuple[int, int, int, int]] = []
914
- for rect in sorted(rects):
915
- merged = False
916
- for i, ex in enumerate(merged_rects):
917
- if overlaps_for_merge(rect, ex):
918
- merged_rects[i] = (
919
- min(rect[0], ex[0]),
920
- min(rect[1], ex[1]),
921
- max(rect[2], ex[2]),
922
- max(rect[3], ex[3]),
923
- )
924
- merged = True
925
- break
926
- if not merged:
927
- merged_rects.append(rect)
1414
def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]:
    """Detect table-like ranges via COM: ListObjects first, then border clusters.

    Args:
        sheet: xlwings worksheet.

    Returns:
        ListObject ranges followed by border-derived candidates, as A1 range
        strings without duplicates.
    """
    tables: list[str] = list(_extract_listobject_tables(sheet))
    seen: set[str] = set(tables)

    border_rects = _merge_rectangles(_detect_border_rectangles_xlwings(sheet))
    for rect in border_rects:
        top_row, left_col, bottom_row, right_col = shrink_to_content(
            sheet, *rect, require_inside_border=False
        )
        try:
            block = sheet.range((top_row, left_col), (bottom_row, right_col)).value
        except Exception as exc:
            logger.warning(
                "Failed to read range for table detection (%s). (%r)",
                sheet.name,
                exc,
            )
            continue
        if block is None:
            # Nothing readable in this rectangle.
            continue
        found = _collect_table_candidates_from_values(
            _normalize_matrix(block),
            base_top=top_row,
            base_left=left_col,
            col_name=xw.utils.col_name,
        )
        for addr in found:
            if addr in seen:
                continue
            seen.add(addr)
            tables.append(addr)
    return tables
974
1449
 
975
1450
 
1451
def detect_tables_openpyxl(xlsx_path: Path, sheet_name: str) -> list[str]:
    """Detect table-like ranges via openpyxl tables and border clusters.

    Args:
        xlsx_path: Path to the workbook file.
        sheet_name: Name of the worksheet to inspect.

    Returns:
        Declared table refs followed by border-derived candidates, as A1 range
        strings without duplicates.
    """
    with openpyxl_workbook(xlsx_path, data_only=True, read_only=False) as wb:
        ws = wb[sheet_name]
        tables = _extract_openpyxl_table_refs(ws)
        seen: set[str] = set(tables)

        # Only the border mask and the four edge maps are used below.
        (
            has_border,
            top_edge,
            bottom_edge,
            left_edge,
            right_edge,
            _max_row,
            _max_col,
        ) = load_border_maps_xlsx(xlsx_path, sheet_name)
        border_rects = _merge_rectangles(
            _detect_border_rectangles(has_border, min_size=4)
        )

        for rect_top, rect_left, rect_bottom, rect_right in border_rects:
            top_row, left_col, bottom_row, right_col = shrink_to_content_openpyxl(
                ws,
                rect_top,
                rect_left,
                rect_bottom,
                rect_right,
                require_inside_border=False,
                top_edge=top_edge,
                bottom_edge=bottom_edge,
                left_edge=left_edge,
                right_edge=right_edge,
                min_nonempty_ratio=0.0,
            )
            block = _get_values_block(ws, top_row, left_col, bottom_row, right_col)
            found = _collect_table_candidates_from_values(
                _normalize_matrix(block),
                base_top=top_row,
                base_left=left_col,
                col_name=get_column_letter,
            )
            for addr in found:
                if addr in seen:
                    continue
                seen.add(addr)
                tables.append(addr)
        return tables
1490
+
1491
+
976
1492
  def detect_tables(sheet: xw.Sheet) -> list[str]:
977
1493
  excel_path: Path | None = None
978
1494
  try:
@@ -1027,13 +1543,13 @@ def _coerce_numeric_preserve_format(val: str) -> int | float | str:
1027
1543
  return int(val)
1028
1544
  except Exception:
1029
1545
  return val
1030
- if _FLOAT_RE.match(val):
1031
- try:
1032
- dec = Decimal(val)
1033
- exponent = int(dec.as_tuple().exponent)
1034
- scale = max(1, -exponent)
1035
- quantized = dec.quantize(Decimal("1." + "0" * scale))
1036
- return float(quantized)
1037
- except (InvalidOperation, Exception):
1038
- return val
1546
+ if _FLOAT_RE.match(val):
1547
+ try:
1548
+ dec = Decimal(val)
1549
+ exponent = int(dec.as_tuple().exponent)
1550
+ scale = max(1, -exponent)
1551
+ quantized = dec.quantize(Decimal("1." + "0" * scale))
1552
+ return float(quantized)
1553
+ except (InvalidOperation, Exception):
1554
+ return val
1039
1555
  return val