exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exstruct/__init__.py +23 -12
- exstruct/cli/main.py +20 -0
- exstruct/core/backends/__init__.py +7 -0
- exstruct/core/backends/base.py +42 -0
- exstruct/core/backends/com_backend.py +230 -0
- exstruct/core/backends/openpyxl_backend.py +191 -0
- exstruct/core/cells.py +999 -483
- exstruct/core/charts.py +243 -241
- exstruct/core/integrate.py +42 -375
- exstruct/core/logging_utils.py +16 -0
- exstruct/core/modeling.py +87 -0
- exstruct/core/pipeline.py +749 -0
- exstruct/core/ranges.py +48 -0
- exstruct/core/shapes.py +282 -36
- exstruct/core/workbook.py +114 -0
- exstruct/engine.py +51 -123
- exstruct/errors.py +12 -1
- exstruct/io/__init__.py +130 -138
- exstruct/io/serialize.py +112 -0
- exstruct/models/__init__.py +58 -8
- exstruct/render/__init__.py +3 -7
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/METADATA +133 -18
- exstruct-0.3.2.dist-info/RECORD +30 -0
- exstruct-0.2.80.dist-info/RECORD +0 -20
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/WHEEL +0 -0
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/entry_points.txt +0 -0
exstruct/core/cells.py
CHANGED
|
@@ -1,31 +1,33 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from collections import deque
|
|
4
|
-
from collections.abc import Sequence
|
|
5
|
-
from
|
|
6
|
-
import
|
|
7
|
-
|
|
8
|
-
import
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
from openpyxl.
|
|
13
|
-
from openpyxl.
|
|
14
|
-
|
|
15
|
-
import
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
3
|
+
from collections import deque
|
|
4
|
+
from collections.abc import Callable, Sequence
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from decimal import Decimal, InvalidOperation
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from openpyxl.styles.colors import Color
|
|
13
|
+
from openpyxl.utils import get_column_letter, range_boundaries
|
|
14
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
15
|
+
import pandas as pd
|
|
16
|
+
import xlwings as xw
|
|
17
|
+
|
|
18
|
+
from ..models import CellRow, MergedCell
|
|
19
|
+
from .workbook import openpyxl_workbook
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
_warned_keys: set[str] = set()
|
|
23
|
+
XL_LINESTYLE_NONE = -4142
|
|
24
|
+
XL_INSIDE_VERTICAL = 11
|
|
25
|
+
XL_INSIDE_HORIZONTAL = 12
|
|
26
|
+
XL_EDGE_LEFT = 7
|
|
27
|
+
XL_EDGE_TOP = 8
|
|
28
|
+
XL_EDGE_BOTTOM = 9
|
|
29
|
+
XL_EDGE_RIGHT = 10
|
|
30
|
+
MatrixInput = Sequence[Sequence[object]] | Sequence[object]
|
|
29
31
|
|
|
30
32
|
# Detection tuning parameters (can be overridden via set_table_detection_params)
|
|
31
33
|
_DETECTION_CONFIG = {
|
|
@@ -34,6 +36,426 @@ _DETECTION_CONFIG = {
|
|
|
34
36
|
"coverage_min": 0.2,
|
|
35
37
|
"min_nonempty_cells": 3,
|
|
36
38
|
}
|
|
39
|
+
_DEFAULT_BACKGROUND_HEX = "FFFFFF"
|
|
40
|
+
_XL_COLOR_NONE = -4142
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Use dataclasses for lightweight models
|
|
44
|
+
@dataclass(frozen=True)
class SheetColorsMap:
    """Background color map for a single worksheet.

    Attributes:
        sheet_name: Name of the worksheet this map describes.
        colors_map: Mapping of color key to the list of (row, column)
            coordinates recorded for that color.
    """

    sheet_name: str
    colors_map: dict[str, list[tuple[int, int]]]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass(frozen=True)
class WorkbookColorsMap:
    """Background color maps for all worksheets in a workbook.

    Attributes:
        sheets: Mapping of worksheet name to that sheet's SheetColorsMap.
    """

    sheets: dict[str, SheetColorsMap]

    def get_sheet(self, sheet_name: str) -> SheetColorsMap | None:
        """Return the colors map for a sheet if available.

        Args:
            sheet_name: Target worksheet name.

        Returns:
            SheetColorsMap for the sheet, or None if missing.
        """
        return self.sheets.get(sheet_name)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def extract_sheet_colors_map(
    file_path: Path, *, include_default_background: bool, ignore_colors: set[str] | None
) -> WorkbookColorsMap:
    """Extract background colors for each worksheet.

    Args:
        file_path: Excel workbook path.
        include_default_background: Whether to include default (white) backgrounds
            within the used range.
        ignore_colors: Optional set of color keys to ignore.

    Returns:
        WorkbookColorsMap containing per-sheet color maps.
    """
    sheets: dict[str, SheetColorsMap] = {}
    # The workbook is only needed while the per-sheet maps are being built.
    with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb:
        for ws in wb.worksheets:
            sheets[ws.title] = _extract_sheet_colors(
                ws, include_default_background, ignore_colors
            )
    return WorkbookColorsMap(sheets=sheets)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def extract_sheet_colors_map_com(
    workbook: xw.Book,
    *,
    include_default_background: bool,
    ignore_colors: set[str] | None,
) -> WorkbookColorsMap:
    """Extract background colors for each worksheet via COM display formats.

    Args:
        workbook: xlwings workbook instance.
        include_default_background: Whether to include default (white) backgrounds
            within the used range.
        ignore_colors: Optional set of color keys to ignore.

    Returns:
        WorkbookColorsMap containing per-sheet color maps.
    """
    # Workbook-level preparation runs once, before any sheet is read.
    _prepare_workbook_for_display_format(workbook)
    sheets: dict[str, SheetColorsMap] = {}
    for sheet in workbook.sheets:
        # Each sheet is prepared individually right before its colors are read.
        _prepare_sheet_for_display_format(sheet)
        sheets[sheet.name] = _extract_sheet_colors_com(
            sheet, include_default_background, ignore_colors
        )
    return WorkbookColorsMap(sheets=sheets)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _extract_sheet_colors(
    ws: Worksheet, include_default_background: bool, ignore_colors: set[str] | None
) -> SheetColorsMap:
    """Extract background colors for a single worksheet.

    Coordinates are recorded as (row, column) with the row taken as-is from
    the cell (1-based in openpyxl) and the column shifted to zero-based.

    Args:
        ws: Target worksheet.
        include_default_background: Whether to include default (white) backgrounds.
        ignore_colors: Optional set of color keys to ignore.

    Returns:
        SheetColorsMap for the worksheet.
    """
    min_row, min_col, max_row, max_col = _get_used_range_bounds(ws)
    colors_map: dict[str, list[tuple[int, int]]] = {}
    # An inverted range is how the bounds helper signals an empty sheet.
    if min_row > max_row or min_col > max_col:
        return SheetColorsMap(sheet_name=ws.title, colors_map=colors_map)

    ignore_set = _normalize_ignore_colors(ignore_colors)
    for row in ws.iter_rows(
        min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col
    ):
        for cell in row:
            color_key = _resolve_cell_background(cell, include_default_background)
            if color_key is None:
                continue
            normalized_key = _normalize_color_key(color_key)
            if _should_ignore_color(normalized_key, ignore_set):
                continue
            # Use ``column`` rather than ``col_idx``: both are the numeric
            # column index on a regular Cell, but merged placeholder cells
            # (openpyxl MergedCell) only carry ``row``/``column``.
            # NOTE(review): relies on openpyxl >= 2.6 where ``column`` is an int.
            colors_map.setdefault(normalized_key, []).append(
                (cell.row, cell.column - 1)
            )
    return SheetColorsMap(sheet_name=ws.title, colors_map=colors_map)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _extract_sheet_colors_com(
    sheet: xw.Sheet, include_default_background: bool, ignore_colors: set[str] | None
) -> SheetColorsMap:
    """Extract background colors for a single worksheet via COM.

    Every cell from the used range's first row/column to its last cell is
    visited. Coordinates are stored as (row, column - 1), i.e. the row as
    iterated and the column shifted to zero-based.

    Args:
        sheet: Target worksheet.
        include_default_background: Whether to include default (white) backgrounds.
        ignore_colors: Optional set of color keys to ignore.

    Returns:
        SheetColorsMap for the worksheet.
    """
    colors_map: dict[str, list[tuple[int, int]]] = {}
    used = sheet.used_range
    # Fall back to 1 when the used range does not expose a start row/column.
    start_row = int(getattr(used, "row", 1))
    start_col = int(getattr(used, "column", 1))
    max_row = used.last_cell.row
    max_col = used.last_cell.column
    if max_row <= 0 or max_col <= 0:
        return SheetColorsMap(sheet_name=sheet.name, colors_map=colors_map)

    ignore_set = _normalize_ignore_colors(ignore_colors)
    for row in range(start_row, max_row + 1):
        for col in range(start_col, max_col + 1):
            color_key = _resolve_cell_background_com(
                sheet, row, col, include_default_background
            )
            if color_key is None:
                continue
            normalized_key = _normalize_color_key(color_key)
            if _should_ignore_color(normalized_key, ignore_set):
                continue
            colors_map.setdefault(normalized_key, []).append((row, col - 1))
    return SheetColorsMap(sheet_name=sheet.name, colors_map=colors_map)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _get_used_range_bounds(ws: Worksheet) -> tuple[int, int, int, int]:
    """Return used range bounds for a worksheet.

    An empty sheet is reported as the inverted range ``(1, 1, 0, 0)`` so that
    callers can detect it with ``min > max``.

    Args:
        ws: Target worksheet.

    Returns:
        Tuple of (min_row, min_col, max_row, max_col).
    """
    try:
        if _is_effectively_empty_sheet(ws):
            return 1, 1, 0, 0
        dim = ws.calculate_dimension()
        # range_boundaries yields column-first order; reorder to row-first.
        min_col, min_row, max_col, max_row = range_boundaries(dim)
        return min_row, min_col, max_row, max_col
    except Exception:
        # Best-effort fallback: derive the bounds from max_row/max_column.
        max_row = ws.max_row or 0
        max_col = ws.max_column or 0
        if max_row == 0 or max_col == 0:
            return 1, 1, 0, 0
        return 1, 1, max_row, max_col
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _is_effectively_empty_sheet(ws: Worksheet) -> bool:
|
|
218
|
+
"""Check whether a worksheet has no content or styling.
|
|
219
|
+
|
|
220
|
+
Args:
|
|
221
|
+
ws: Target worksheet.
|
|
222
|
+
|
|
223
|
+
Returns:
|
|
224
|
+
True if the sheet has no meaningful content or style, otherwise False.
|
|
225
|
+
"""
|
|
226
|
+
if ws.max_row != 1 or ws.max_column != 1:
|
|
227
|
+
return False
|
|
228
|
+
cell = ws.cell(row=1, column=1)
|
|
229
|
+
return cell.value is None and not cell.has_style
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _resolve_cell_background(
|
|
233
|
+
cell: object, include_default_background: bool
|
|
234
|
+
) -> str | None:
|
|
235
|
+
"""Resolve a cell's background color key.
|
|
236
|
+
|
|
237
|
+
Args:
|
|
238
|
+
cell: Worksheet cell object.
|
|
239
|
+
include_default_background: Whether to treat default fills as white.
|
|
240
|
+
|
|
241
|
+
Returns:
|
|
242
|
+
Normalized color key or None when excluded.
|
|
243
|
+
"""
|
|
244
|
+
fill = getattr(cell, "fill", None)
|
|
245
|
+
if fill is None:
|
|
246
|
+
return _DEFAULT_BACKGROUND_HEX if include_default_background else None
|
|
247
|
+
pattern_type = getattr(fill, "patternType", None)
|
|
248
|
+
if pattern_type in (None, "none"):
|
|
249
|
+
return _DEFAULT_BACKGROUND_HEX if include_default_background else None
|
|
250
|
+
color_key = _resolve_fill_color_key(fill)
|
|
251
|
+
if color_key == _DEFAULT_BACKGROUND_HEX and not include_default_background:
|
|
252
|
+
return None
|
|
253
|
+
return color_key
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _resolve_fill_color_key(fill: object) -> str | None:
|
|
257
|
+
"""Normalize the foreground/background color of a fill.
|
|
258
|
+
|
|
259
|
+
Args:
|
|
260
|
+
fill: openpyxl fill object.
|
|
261
|
+
|
|
262
|
+
Returns:
|
|
263
|
+
Normalized color key or None when unavailable.
|
|
264
|
+
"""
|
|
265
|
+
fg_color = getattr(fill, "fgColor", None)
|
|
266
|
+
if fg_color is not None:
|
|
267
|
+
fg_key = _color_to_key(fg_color)
|
|
268
|
+
if fg_key is not None:
|
|
269
|
+
return fg_key
|
|
270
|
+
bg_color = getattr(fill, "bgColor", None)
|
|
271
|
+
return _color_to_key(bg_color) if bg_color is not None else None
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _resolve_cell_background_com(
    sheet: xw.Sheet, row: int, col: int, include_default_background: bool
) -> str | None:
    """Resolve a cell's background color key via COM display format.

    Args:
        sheet: Target worksheet.
        row: 1-based row index.
        col: 1-based column index.
        include_default_background: Whether to include default (white) backgrounds.

    Returns:
        RGB hex color key, or None when excluded.
    """
    color_value = _get_display_format_color(sheet, row, col)
    # An unreadable color and the "no color" sentinel both mean "default".
    if color_value is None or color_value == _XL_COLOR_NONE:
        return _DEFAULT_BACKGROUND_HEX if include_default_background else None
    color_key = _excel_color_int_to_rgb_hex(color_value)
    if color_key == _DEFAULT_BACKGROUND_HEX and not include_default_background:
        return None
    return color_key
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _prepare_workbook_for_display_format(workbook: xw.Book) -> None:
|
|
300
|
+
"""Prepare a workbook so DisplayFormat reflects conditional formatting.
|
|
301
|
+
|
|
302
|
+
Args:
|
|
303
|
+
workbook: xlwings workbook instance.
|
|
304
|
+
"""
|
|
305
|
+
try:
|
|
306
|
+
# Force calculation to ensure DisplayFormat.Interior reflects conditional formatting rules
|
|
307
|
+
workbook.app.calculate()
|
|
308
|
+
except Exception:
|
|
309
|
+
return
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _prepare_sheet_for_display_format(sheet: xw.Sheet) -> None:
|
|
313
|
+
"""Prepare a sheet so DisplayFormat reflects conditional formatting.
|
|
314
|
+
|
|
315
|
+
Args:
|
|
316
|
+
sheet: Target worksheet.
|
|
317
|
+
"""
|
|
318
|
+
try:
|
|
319
|
+
# Activate sheet so DisplayFormat is available
|
|
320
|
+
sheet.api.Activate()
|
|
321
|
+
except Exception:
|
|
322
|
+
return
|
|
323
|
+
try:
|
|
324
|
+
# Calculate to apply conditional formatting to DisplayFormat
|
|
325
|
+
sheet.api.Calculate()
|
|
326
|
+
except Exception:
|
|
327
|
+
return
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _get_display_format_color(sheet: xw.Sheet, row: int, col: int) -> int | None:
|
|
331
|
+
"""Read DisplayFormat.Interior.Color from COM.
|
|
332
|
+
|
|
333
|
+
Args:
|
|
334
|
+
sheet: Target worksheet.
|
|
335
|
+
row: 1-based row index.
|
|
336
|
+
col: 1-based column index.
|
|
337
|
+
|
|
338
|
+
Returns:
|
|
339
|
+
BGR integer color or None if unavailable.
|
|
340
|
+
"""
|
|
341
|
+
try:
|
|
342
|
+
cell = sheet.api.Cells(row, col)
|
|
343
|
+
display_format = cell.DisplayFormat
|
|
344
|
+
interior = display_format.Interior
|
|
345
|
+
return int(interior.Color)
|
|
346
|
+
except Exception:
|
|
347
|
+
return None
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _excel_color_int_to_rgb_hex(color_value: int) -> str:
|
|
351
|
+
"""Convert an Excel color integer into an RGB hex string.
|
|
352
|
+
|
|
353
|
+
Args:
|
|
354
|
+
color_value: Excel color integer from COM.
|
|
355
|
+
|
|
356
|
+
Returns:
|
|
357
|
+
RGB hex string (uppercase).
|
|
358
|
+
"""
|
|
359
|
+
red = color_value & 0xFF
|
|
360
|
+
green = (color_value >> 8) & 0xFF
|
|
361
|
+
blue = (color_value >> 16) & 0xFF
|
|
362
|
+
return f"{red:02X}{green:02X}{blue:02X}"
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def _normalize_color_key(color_key: str) -> str:
|
|
366
|
+
"""Normalize a color key into a canonical representation.
|
|
367
|
+
|
|
368
|
+
Args:
|
|
369
|
+
color_key: Raw color key (hex or themed/indexed).
|
|
370
|
+
|
|
371
|
+
Returns:
|
|
372
|
+
Normalized color key.
|
|
373
|
+
"""
|
|
374
|
+
trimmed = color_key.strip()
|
|
375
|
+
if not trimmed:
|
|
376
|
+
return ""
|
|
377
|
+
lowered = trimmed.lower()
|
|
378
|
+
if lowered.startswith(("theme:", "indexed:", "auto:")) or lowered == "auto":
|
|
379
|
+
return lowered
|
|
380
|
+
hex_key = trimmed.lstrip("#").upper()
|
|
381
|
+
if len(hex_key) == 8:
|
|
382
|
+
hex_key = hex_key[2:]
|
|
383
|
+
return hex_key
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def _normalize_ignore_colors(ignore_colors: set[str] | None) -> set[str]:
|
|
387
|
+
"""Normalize ignore color keys.
|
|
388
|
+
|
|
389
|
+
Args:
|
|
390
|
+
ignore_colors: Optional set of color keys to ignore.
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
Normalized set of color keys.
|
|
394
|
+
"""
|
|
395
|
+
if not ignore_colors:
|
|
396
|
+
return set()
|
|
397
|
+
normalized = {_normalize_color_key(color) for color in ignore_colors}
|
|
398
|
+
return {color for color in normalized if color}
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def _should_ignore_color(color_key: str, ignore_colors: set[str]) -> bool:
|
|
402
|
+
"""Check whether a color key should be ignored.
|
|
403
|
+
|
|
404
|
+
Args:
|
|
405
|
+
color_key: Normalized color key.
|
|
406
|
+
ignore_colors: Normalized ignore color set.
|
|
407
|
+
|
|
408
|
+
Returns:
|
|
409
|
+
True when the color key is ignored.
|
|
410
|
+
"""
|
|
411
|
+
return color_key in ignore_colors
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _color_to_key(color: Color | object) -> str | None:
|
|
415
|
+
"""Convert an openpyxl color object into a normalized key.
|
|
416
|
+
|
|
417
|
+
Args:
|
|
418
|
+
color: openpyxl color object.
|
|
419
|
+
|
|
420
|
+
Returns:
|
|
421
|
+
Normalized color key string or None when unavailable.
|
|
422
|
+
"""
|
|
423
|
+
rgb = getattr(color, "rgb", None)
|
|
424
|
+
if rgb:
|
|
425
|
+
return _normalize_rgb(str(rgb))
|
|
426
|
+
color_type = getattr(color, "type", None)
|
|
427
|
+
if color_type == "theme":
|
|
428
|
+
theme = getattr(color, "theme", None)
|
|
429
|
+
tint = getattr(color, "tint", None)
|
|
430
|
+
theme_id = "unknown" if theme is None else str(theme)
|
|
431
|
+
if tint is None:
|
|
432
|
+
return f"theme:{theme_id}"
|
|
433
|
+
return f"theme:{theme_id}:{tint}"
|
|
434
|
+
if color_type == "indexed":
|
|
435
|
+
indexed = getattr(color, "indexed", None)
|
|
436
|
+
if indexed is not None:
|
|
437
|
+
return f"indexed:{indexed}"
|
|
438
|
+
if color_type == "auto":
|
|
439
|
+
auto = getattr(color, "auto", None)
|
|
440
|
+
return "auto" if auto is None else f"auto:{auto}"
|
|
441
|
+
return None
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _normalize_rgb(rgb: str) -> str:
|
|
445
|
+
"""Normalize an RGB/ARGB string into 6-hex format.
|
|
446
|
+
|
|
447
|
+
Args:
|
|
448
|
+
rgb: Raw RGB/ARGB string from openpyxl.
|
|
449
|
+
|
|
450
|
+
Returns:
|
|
451
|
+
Normalized RGB hex string (uppercase, 6 chars when possible).
|
|
452
|
+
"""
|
|
453
|
+
cleaned = rgb.strip().upper()
|
|
454
|
+
if cleaned.startswith("0X"):
|
|
455
|
+
cleaned = cleaned[2:]
|
|
456
|
+
if len(cleaned) == 8:
|
|
457
|
+
cleaned = cleaned[2:]
|
|
458
|
+
return cleaned
|
|
37
459
|
|
|
38
460
|
|
|
39
461
|
def warn_once(key: str, message: str) -> None:
|
|
@@ -76,21 +498,21 @@ def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]:
|
|
|
76
498
|
- Links are mapped by column index string (e.g., "0") to hyperlink.target.
|
|
77
499
|
"""
|
|
78
500
|
cell_rows = extract_sheet_cells(file_path)
|
|
79
|
-
wb = load_workbook(file_path, data_only=True, read_only=False)
|
|
80
501
|
links_by_sheet: dict[str, dict[int, dict[str, str]]] = {}
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
for
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
502
|
+
with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb:
|
|
503
|
+
for ws in wb.worksheets:
|
|
504
|
+
sheet_links: dict[int, dict[str, str]] = {}
|
|
505
|
+
for row in ws.iter_rows():
|
|
506
|
+
for cell in row:
|
|
507
|
+
link = getattr(cell, "hyperlink", None)
|
|
508
|
+
target = getattr(link, "target", None) if link else None
|
|
509
|
+
if not target:
|
|
510
|
+
continue
|
|
511
|
+
col_str = str(
|
|
512
|
+
cell.col_idx - 1
|
|
513
|
+
) # zero-based to align with extract_sheet_cells
|
|
514
|
+
sheet_links.setdefault(cell.row, {})[col_str] = target
|
|
515
|
+
links_by_sheet[ws.title] = sheet_links
|
|
94
516
|
|
|
95
517
|
merged: dict[str, list[CellRow]] = {}
|
|
96
518
|
for sheet_name, rows in cell_rows.items():
|
|
@@ -104,32 +526,67 @@ def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]:
|
|
|
104
526
|
return merged
|
|
105
527
|
|
|
106
528
|
|
|
107
|
-
def
|
|
108
|
-
sheet
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
529
|
+
def extract_sheet_merged_cells(file_path: Path) -> dict[str, list[MergedCell]]:
    """Extract merged cell ranges per sheet via openpyxl.

    Each merged range becomes one MergedCell whose rows are taken as-is from
    the range boundaries, whose columns are shifted to zero-based, and whose
    value is the string form of the range's top-left cell ("" when empty).

    Args:
        file_path: Excel workbook path.

    Returns:
        Mapping of sheet name to merged cell ranges.
    """
    merged_by_sheet: dict[str, list[MergedCell]] = {}
    with openpyxl_workbook(file_path, data_only=True, read_only=False) as wb:
        for ws in wb.worksheets:
            merged_ranges = getattr(ws, "merged_cells", None)
            if merged_ranges is None:
                merged_by_sheet[ws.title] = []
                continue
            results: list[MergedCell] = []
            for merged_range in getattr(merged_ranges, "ranges", []):
                # range_boundaries yields column-first order.
                min_col, min_row, max_col, max_row = range_boundaries(
                    str(merged_range)
                )
                cell_value = ws.cell(row=min_row, column=min_col).value
                value_str = "" if cell_value is None else str(cell_value)
                results.append(
                    MergedCell(
                        r1=min_row,
                        c1=min_col - 1,
                        r2=max_row,
                        c2=max_col - 1,
                        v=value_str,
                    )
                )
            merged_by_sheet[ws.title] = results
    return merged_by_sheet
|
|
562
|
+
|
|
563
|
+
|
|
564
|
+
def shrink_to_content( # noqa: C901
|
|
565
|
+
sheet: xw.Sheet,
|
|
566
|
+
top: int,
|
|
567
|
+
left: int,
|
|
568
|
+
bottom: int,
|
|
569
|
+
right: int,
|
|
570
|
+
require_inside_border: bool = False,
|
|
571
|
+
min_nonempty_ratio: float = 0.0,
|
|
572
|
+
) -> tuple[int, int, int, int]:
|
|
573
|
+
"""Trim a rectangle based on cell contents and optional border heuristics."""
|
|
117
574
|
rng = sheet.range((top, left), (bottom, right))
|
|
118
575
|
vals = rng.value
|
|
119
576
|
if vals is None:
|
|
120
577
|
vals = []
|
|
121
578
|
if not isinstance(vals, list):
|
|
122
579
|
vals = [[vals]]
|
|
123
|
-
elif vals and not isinstance(vals[0], list):
|
|
124
|
-
vals = [vals]
|
|
125
|
-
rows_n = len(vals)
|
|
126
|
-
cols_n = len(vals[0]) if rows_n else 0
|
|
127
|
-
|
|
128
|
-
def to_str(x: object) -> str:
|
|
129
|
-
return "" if x is None else str(x)
|
|
130
|
-
|
|
131
|
-
def is_empty_value(x: object) -> bool:
|
|
132
|
-
return to_str(x).strip() == ""
|
|
580
|
+
elif vals and not isinstance(vals[0], list):
|
|
581
|
+
vals = [vals]
|
|
582
|
+
rows_n = len(vals)
|
|
583
|
+
cols_n = len(vals[0]) if rows_n else 0
|
|
584
|
+
|
|
585
|
+
def to_str(x: object) -> str:
|
|
586
|
+
return "" if x is None else str(x)
|
|
587
|
+
|
|
588
|
+
def is_empty_value(x: object) -> bool:
|
|
589
|
+
return to_str(x).strip() == ""
|
|
133
590
|
|
|
134
591
|
def row_empty(i: int) -> bool:
|
|
135
592
|
return cols_n == 0 or all(is_empty_value(vals[i][j]) for j in range(cols_n))
|
|
@@ -149,11 +606,11 @@ def shrink_to_content( # noqa: C901
|
|
|
149
606
|
cnt = sum(1 for i in range(rows_n) if not is_empty_value(vals[i][j]))
|
|
150
607
|
return cnt / rows_n
|
|
151
608
|
|
|
152
|
-
def column_has_inside_border(col_idx: int) -> bool:
|
|
153
|
-
if not require_inside_border:
|
|
154
|
-
return False
|
|
155
|
-
try:
|
|
156
|
-
for r in range(top, bottom + 1):
|
|
609
|
+
def column_has_inside_border(col_idx: int) -> bool:
|
|
610
|
+
if not require_inside_border:
|
|
611
|
+
return False
|
|
612
|
+
try:
|
|
613
|
+
for r in range(top, bottom + 1):
|
|
157
614
|
ls = (
|
|
158
615
|
sheet.api.Cells(r, left + col_idx)
|
|
159
616
|
.Borders(XL_INSIDE_VERTICAL)
|
|
@@ -231,22 +688,28 @@ def shrink_to_content( # noqa: C901
|
|
|
231
688
|
bottom -= 1
|
|
232
689
|
else:
|
|
233
690
|
break
|
|
234
|
-
return top, left, bottom, right
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
def load_border_maps_xlsx( # noqa: C901
|
|
238
|
-
xlsx_path: Path, sheet_name: str
|
|
239
|
-
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int]:
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
691
|
+
return top, left, bottom, right
|
|
692
|
+
|
|
693
|
+
|
|
694
|
+
def load_border_maps_xlsx( # noqa: C901
|
|
695
|
+
xlsx_path: Path, sheet_name: str
|
|
696
|
+
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int]:
|
|
697
|
+
with openpyxl_workbook(xlsx_path, data_only=True, read_only=False) as wb:
|
|
698
|
+
if sheet_name not in wb.sheetnames:
|
|
699
|
+
raise KeyError(f"Sheet '{sheet_name}' not found in {xlsx_path}")
|
|
700
|
+
|
|
701
|
+
ws = wb[sheet_name]
|
|
702
|
+
try:
|
|
703
|
+
min_col, min_row, max_col, max_row = range_boundaries(
|
|
704
|
+
ws.calculate_dimension()
|
|
705
|
+
)
|
|
706
|
+
except Exception:
|
|
707
|
+
min_col, min_row, max_col, max_row = (
|
|
708
|
+
1,
|
|
709
|
+
1,
|
|
710
|
+
ws.max_column or 1,
|
|
711
|
+
ws.max_row or 1,
|
|
712
|
+
)
|
|
250
713
|
|
|
251
714
|
shape = (max_row + 1, max_col + 1)
|
|
252
715
|
has_border = np.zeros(shape, dtype=bool)
|
|
@@ -255,11 +718,11 @@ def load_border_maps_xlsx( # noqa: C901
|
|
|
255
718
|
left_edge = np.zeros(shape, dtype=bool)
|
|
256
719
|
right_edge = np.zeros(shape, dtype=bool)
|
|
257
720
|
|
|
258
|
-
def edge_has_style(edge: object) -> bool:
|
|
259
|
-
if edge is None:
|
|
260
|
-
return False
|
|
261
|
-
style = getattr(edge, "style", None)
|
|
262
|
-
return style is not None and style != "none"
|
|
721
|
+
def edge_has_style(edge: object) -> bool:
|
|
722
|
+
if edge is None:
|
|
723
|
+
return False
|
|
724
|
+
style = getattr(edge, "style", None)
|
|
725
|
+
return style is not None and style != "none"
|
|
263
726
|
|
|
264
727
|
for r in range(min_row, max_row + 1):
|
|
265
728
|
for c in range(min_col, max_col + 1):
|
|
@@ -270,134 +733,133 @@ def load_border_maps_xlsx( # noqa: C901
|
|
|
270
733
|
|
|
271
734
|
t = edge_has_style(b.top)
|
|
272
735
|
btm = edge_has_style(b.bottom)
|
|
273
|
-
left_border = edge_has_style(b.left)
|
|
274
|
-
rgt = edge_has_style(b.right)
|
|
275
|
-
|
|
276
|
-
if t or btm or left_border or rgt:
|
|
277
|
-
has_border[r, c] = True
|
|
278
|
-
if t:
|
|
279
|
-
top_edge[r, c] = True
|
|
280
|
-
if btm:
|
|
281
|
-
bottom_edge[r, c] = True
|
|
282
|
-
if left_border:
|
|
283
|
-
left_edge[r, c] = True
|
|
284
|
-
if rgt:
|
|
285
|
-
right_edge[r, c] = True
|
|
736
|
+
left_border = edge_has_style(b.left)
|
|
737
|
+
rgt = edge_has_style(b.right)
|
|
738
|
+
|
|
739
|
+
if t or btm or left_border or rgt:
|
|
740
|
+
has_border[r, c] = True
|
|
741
|
+
if t:
|
|
742
|
+
top_edge[r, c] = True
|
|
743
|
+
if btm:
|
|
744
|
+
bottom_edge[r, c] = True
|
|
745
|
+
if left_border:
|
|
746
|
+
left_edge[r, c] = True
|
|
747
|
+
if rgt:
|
|
748
|
+
right_edge[r, c] = True
|
|
286
749
|
|
|
287
|
-
wb.close()
|
|
288
750
|
return has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col
|
|
289
751
|
|
|
290
752
|
|
|
291
|
-
def _detect_border_clusters_numpy(
|
|
292
|
-
has_border: np.ndarray, min_size: int
|
|
293
|
-
) -> list[tuple[int, int, int, int]]:
|
|
294
|
-
from scipy.ndimage import label
|
|
295
|
-
|
|
296
|
-
structure = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
|
|
297
|
-
lbl, num = label(has_border.astype(np.uint8), structure=structure)
|
|
298
|
-
rects: list[tuple[int, int, int, int]] = []
|
|
299
|
-
for k in range(1, int(num) + 1):
|
|
300
|
-
ys, xs = np.where(lbl == k)
|
|
301
|
-
if int(len(ys)) < min_size:
|
|
302
|
-
continue
|
|
303
|
-
rects.append((int(ys.min()), int(xs.min()), int(ys.max()), int(xs.max())))
|
|
304
|
-
return rects
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
def _detect_border_clusters_python(
|
|
308
|
-
has_border: np.ndarray, min_size: int
|
|
309
|
-
) -> list[tuple[int, int, int, int]]:
|
|
310
|
-
h, w = has_border.shape
|
|
311
|
-
visited = np.zeros_like(has_border, dtype=bool)
|
|
312
|
-
rects: list[tuple[int, int, int, int]] = []
|
|
313
|
-
for r in range(h):
|
|
314
|
-
for c in range(w):
|
|
315
|
-
if not has_border[r, c] or visited[r, c]:
|
|
316
|
-
continue
|
|
317
|
-
q = deque([(r, c)])
|
|
318
|
-
visited[r, c] = True
|
|
319
|
-
ys = [r]
|
|
320
|
-
xs = [c]
|
|
321
|
-
while q:
|
|
322
|
-
yy, xx = q.popleft()
|
|
323
|
-
for dy, dx in ((1, 0), (-1, 0), (0, 1), (0, -1)):
|
|
324
|
-
ny, nx = yy + dy, xx + dx
|
|
325
|
-
if (
|
|
326
|
-
0 <= ny < h
|
|
327
|
-
and 0 <= nx < w
|
|
328
|
-
and has_border[ny, nx]
|
|
329
|
-
and not visited[ny, nx]
|
|
330
|
-
):
|
|
331
|
-
visited[ny, nx] = True
|
|
332
|
-
q.append((ny, nx))
|
|
333
|
-
ys.append(ny)
|
|
334
|
-
xs.append(nx)
|
|
335
|
-
if len(ys) >= min_size:
|
|
336
|
-
rects.append((min(ys), min(xs), max(ys), max(xs)))
|
|
337
|
-
return rects
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
def detect_border_clusters(
    has_border: np.ndarray, min_size: int = 4
) -> list[tuple[int, int, int, int]]:
    """Detect bounding boxes of connected bordered regions.

    The scipy-based implementation is tried first. If it raises for any
    reason, a one-time warning is issued and the pure-Python implementation
    is used instead.

    Args:
        has_border: 2D boolean array marking bordered cells.
        min_size: Minimum number of cells a region needs to be reported.

    Returns:
        List of (min_row, min_col, max_row, max_col) rectangles.
    """
    try:
        return _detect_border_clusters_numpy(has_border, min_size)
    except Exception:
        # Deliberately broad: any failure of the fast path triggers the fallback.
        warn_once(
            "scipy-missing",
            "scipy is not available. Falling back to pure-Python BFS for connected components, which may be significantly slower.",
        )
        return _detect_border_clusters_python(has_border, min_size)
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
def _get_values_block(
|
|
354
|
-
ws: Worksheet, top: int, left: int, bottom: int, right: int
|
|
355
|
-
) -> list[list[object]]:
|
|
356
|
-
vals: list[list[object]] = []
|
|
357
|
-
for row in ws.iter_rows(
|
|
358
|
-
min_row=top, max_row=bottom, min_col=left, max_col=right, values_only=True
|
|
359
|
-
):
|
|
360
|
-
vals.append(list(row))
|
|
361
|
-
return vals
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
def _ensure_matrix(matrix: MatrixInput) -> list[list[object]]:
|
|
365
|
-
rows_seq = list(matrix)
|
|
366
|
-
if not rows_seq:
|
|
367
|
-
return []
|
|
368
|
-
first = rows_seq[0]
|
|
369
|
-
if isinstance(first, Sequence) and not isinstance(first,
|
|
370
|
-
normalized: list[list[object]] = []
|
|
371
|
-
for row in rows_seq:
|
|
372
|
-
if isinstance(row, Sequence) and not isinstance(
|
|
373
|
-
row,
|
|
374
|
-
):
|
|
375
|
-
normalized.append(list(row))
|
|
376
|
-
else:
|
|
377
|
-
normalized.append([row])
|
|
378
|
-
return normalized
|
|
379
|
-
return [list(rows_seq)]
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
|
|
383
|
-
"""
|
|
384
|
-
Given a 2D matrix (list of rows), return (density, coverage).
|
|
385
|
-
density: nonempty / total cells.
|
|
386
|
-
coverage: area of tight bounding box of nonempty cells divided by total area.
|
|
387
|
-
"""
|
|
388
|
-
normalized = _ensure_matrix(matrix)
|
|
389
|
-
if not normalized:
|
|
390
|
-
return 0.0, 0.0
|
|
391
|
-
rows = len(normalized)
|
|
392
|
-
cols = len(normalized[0]) if rows else 0
|
|
393
|
-
if rows == 0 or cols == 0:
|
|
394
|
-
return 0.0, 0.0
|
|
395
|
-
|
|
396
|
-
nonempty_coords = []
|
|
397
|
-
for i, row in enumerate(normalized):
|
|
398
|
-
for j, v in enumerate(row):
|
|
399
|
-
if not (v is None or str(v).strip() == ""):
|
|
400
|
-
nonempty_coords.append((i, j))
|
|
753
|
+
def _detect_border_clusters_numpy(
|
|
754
|
+
has_border: np.ndarray, min_size: int
|
|
755
|
+
) -> list[tuple[int, int, int, int]]:
|
|
756
|
+
from scipy.ndimage import label
|
|
757
|
+
|
|
758
|
+
structure = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
|
|
759
|
+
lbl, num = label(has_border.astype(np.uint8), structure=structure)
|
|
760
|
+
rects: list[tuple[int, int, int, int]] = []
|
|
761
|
+
for k in range(1, int(num) + 1):
|
|
762
|
+
ys, xs = np.where(lbl == k)
|
|
763
|
+
if int(len(ys)) < min_size:
|
|
764
|
+
continue
|
|
765
|
+
rects.append((int(ys.min()), int(xs.min()), int(ys.max()), int(xs.max())))
|
|
766
|
+
return rects
|
|
767
|
+
|
|
768
|
+
|
|
769
|
+
def _detect_border_clusters_python(
|
|
770
|
+
has_border: np.ndarray, min_size: int
|
|
771
|
+
) -> list[tuple[int, int, int, int]]:
|
|
772
|
+
h, w = has_border.shape
|
|
773
|
+
visited = np.zeros_like(has_border, dtype=bool)
|
|
774
|
+
rects: list[tuple[int, int, int, int]] = []
|
|
775
|
+
for r in range(h):
|
|
776
|
+
for c in range(w):
|
|
777
|
+
if not has_border[r, c] or visited[r, c]:
|
|
778
|
+
continue
|
|
779
|
+
q = deque([(r, c)])
|
|
780
|
+
visited[r, c] = True
|
|
781
|
+
ys = [r]
|
|
782
|
+
xs = [c]
|
|
783
|
+
while q:
|
|
784
|
+
yy, xx = q.popleft()
|
|
785
|
+
for dy, dx in ((1, 0), (-1, 0), (0, 1), (0, -1)):
|
|
786
|
+
ny, nx = yy + dy, xx + dx
|
|
787
|
+
if (
|
|
788
|
+
0 <= ny < h
|
|
789
|
+
and 0 <= nx < w
|
|
790
|
+
and has_border[ny, nx]
|
|
791
|
+
and not visited[ny, nx]
|
|
792
|
+
):
|
|
793
|
+
visited[ny, nx] = True
|
|
794
|
+
q.append((ny, nx))
|
|
795
|
+
ys.append(ny)
|
|
796
|
+
xs.append(nx)
|
|
797
|
+
if len(ys) >= min_size:
|
|
798
|
+
rects.append((min(ys), min(xs), max(ys), max(xs)))
|
|
799
|
+
return rects
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def detect_border_clusters(
|
|
803
|
+
has_border: np.ndarray, min_size: int = 4
|
|
804
|
+
) -> list[tuple[int, int, int, int]]:
|
|
805
|
+
try:
|
|
806
|
+
return _detect_border_clusters_numpy(has_border, min_size)
|
|
807
|
+
except Exception:
|
|
808
|
+
warn_once(
|
|
809
|
+
"scipy-missing",
|
|
810
|
+
"scipy is not available. Falling back to pure-Python BFS for connected components, which may be significantly slower.",
|
|
811
|
+
)
|
|
812
|
+
return _detect_border_clusters_python(has_border, min_size)
|
|
813
|
+
|
|
814
|
+
|
|
815
|
+
def _get_values_block(
|
|
816
|
+
ws: Worksheet, top: int, left: int, bottom: int, right: int
|
|
817
|
+
) -> list[list[object]]:
|
|
818
|
+
vals: list[list[object]] = []
|
|
819
|
+
for row in ws.iter_rows(
|
|
820
|
+
min_row=top, max_row=bottom, min_col=left, max_col=right, values_only=True
|
|
821
|
+
):
|
|
822
|
+
vals.append(list(row))
|
|
823
|
+
return vals
|
|
824
|
+
|
|
825
|
+
|
|
826
|
+
def _ensure_matrix(matrix: MatrixInput) -> list[list[object]]:
|
|
827
|
+
rows_seq = list(matrix)
|
|
828
|
+
if not rows_seq:
|
|
829
|
+
return []
|
|
830
|
+
first = rows_seq[0]
|
|
831
|
+
if isinstance(first, Sequence) and not isinstance(first, str | bytes | bytearray):
|
|
832
|
+
normalized: list[list[object]] = []
|
|
833
|
+
for row in rows_seq:
|
|
834
|
+
if isinstance(row, Sequence) and not isinstance(
|
|
835
|
+
row, str | bytes | bytearray
|
|
836
|
+
):
|
|
837
|
+
normalized.append(list(row))
|
|
838
|
+
else:
|
|
839
|
+
normalized.append([row])
|
|
840
|
+
return normalized
|
|
841
|
+
return [list(rows_seq)]
|
|
842
|
+
|
|
843
|
+
|
|
844
|
+
def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
|
|
845
|
+
"""
|
|
846
|
+
Given a 2D matrix (list of rows), return (density, coverage).
|
|
847
|
+
density: nonempty / total cells.
|
|
848
|
+
coverage: area of tight bounding box of nonempty cells divided by total area.
|
|
849
|
+
"""
|
|
850
|
+
normalized = _ensure_matrix(matrix)
|
|
851
|
+
if not normalized:
|
|
852
|
+
return 0.0, 0.0
|
|
853
|
+
rows = len(normalized)
|
|
854
|
+
cols = len(normalized[0]) if rows else 0
|
|
855
|
+
if rows == 0 or cols == 0:
|
|
856
|
+
return 0.0, 0.0
|
|
857
|
+
|
|
858
|
+
nonempty_coords = []
|
|
859
|
+
for i, row in enumerate(normalized):
|
|
860
|
+
for j, v in enumerate(row):
|
|
861
|
+
if not (v is None or str(v).strip() == ""):
|
|
862
|
+
nonempty_coords.append((i, j))
|
|
401
863
|
|
|
402
864
|
total = rows * cols
|
|
403
865
|
if not nonempty_coords:
|
|
@@ -414,27 +876,29 @@ def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
|
|
|
414
876
|
return density, coverage
|
|
415
877
|
|
|
416
878
|
|
|
417
|
-
def _is_plausible_table(matrix: MatrixInput) -> bool:
|
|
418
|
-
"""
|
|
419
|
-
Heuristic: require at least 2 rows and 2 cols with meaningful data.
|
|
420
|
-
- At least 2 rows have 2 以上の非空セル
|
|
421
|
-
- At least 2 columns have 2 以上の非空セル
|
|
422
|
-
"""
|
|
423
|
-
normalized = _ensure_matrix(matrix)
|
|
424
|
-
if not normalized:
|
|
425
|
-
return False
|
|
426
|
-
|
|
427
|
-
rows = len(normalized)
|
|
428
|
-
cols =
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
879
|
+
def _is_plausible_table(matrix: MatrixInput) -> bool:
|
|
880
|
+
"""
|
|
881
|
+
Heuristic: require at least 2 rows and 2 cols with meaningful data.
|
|
882
|
+
- At least 2 rows have 2 以上の非空セル
|
|
883
|
+
- At least 2 columns have 2 以上の非空セル
|
|
884
|
+
"""
|
|
885
|
+
normalized = _ensure_matrix(matrix)
|
|
886
|
+
if not normalized:
|
|
887
|
+
return False
|
|
888
|
+
|
|
889
|
+
rows = len(normalized)
|
|
890
|
+
cols = (
|
|
891
|
+
max((len(r) if isinstance(r, list) else 1) for r in normalized) if rows else 0
|
|
892
|
+
)
|
|
893
|
+
if rows < 2 or cols < 2:
|
|
894
|
+
return False
|
|
895
|
+
|
|
896
|
+
row_counts: list[int] = []
|
|
897
|
+
col_counts = [0] * cols
|
|
898
|
+
for r in normalized:
|
|
899
|
+
cnt = 0
|
|
900
|
+
for j in range(cols):
|
|
901
|
+
v = r[j] if j < len(r) else None
|
|
438
902
|
if not (v is None or str(v).strip() == ""):
|
|
439
903
|
cnt += 1
|
|
440
904
|
col_counts[j] += 1
|
|
@@ -445,7 +909,9 @@ def _is_plausible_table(matrix: MatrixInput) -> bool:
|
|
|
445
909
|
return rows_with_two >= 2 and cols_with_two >= 2
|
|
446
910
|
|
|
447
911
|
|
|
448
|
-
def _nonempty_clusters(
|
|
912
|
+
def _nonempty_clusters(
|
|
913
|
+
matrix: Sequence[Sequence[object]],
|
|
914
|
+
) -> list[tuple[int, int, int, int]]:
|
|
449
915
|
"""Return bounding boxes of connected components of nonempty cells (4-neighbor)."""
|
|
450
916
|
if not matrix:
|
|
451
917
|
return []
|
|
@@ -460,12 +926,12 @@ def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, in
|
|
|
460
926
|
visited = [[False] * cols for _ in range(rows)]
|
|
461
927
|
boxes: list[tuple[int, int, int, int]] = []
|
|
462
928
|
|
|
463
|
-
def bfs(sr: int, sc: int) -> tuple[int, int, int, int]:
|
|
464
|
-
q = deque([(sr, sc)])
|
|
465
|
-
visited[sr][sc] = True
|
|
466
|
-
ys = [sr]
|
|
467
|
-
xs = [sc]
|
|
468
|
-
while q:
|
|
929
|
+
def bfs(sr: int, sc: int) -> tuple[int, int, int, int]:
|
|
930
|
+
q = deque([(sr, sc)])
|
|
931
|
+
visited[sr][sc] = True
|
|
932
|
+
ys = [sr]
|
|
933
|
+
xs = [sc]
|
|
934
|
+
while q:
|
|
469
935
|
r, c = q.popleft()
|
|
470
936
|
for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
|
|
471
937
|
nr, nc = r + dr, c + dc
|
|
@@ -488,17 +954,17 @@ def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, in
|
|
|
488
954
|
return boxes
|
|
489
955
|
|
|
490
956
|
|
|
491
|
-
def _normalize_matrix(matrix: object) -> list[list[object]]:
|
|
492
|
-
if matrix is None:
|
|
493
|
-
return []
|
|
494
|
-
if isinstance(matrix, list):
|
|
495
|
-
return _ensure_matrix(matrix)
|
|
496
|
-
if isinstance(matrix, Sequence) and not isinstance(matrix,
|
|
497
|
-
return _ensure_matrix(matrix)
|
|
498
|
-
return [[matrix]]
|
|
957
|
+
def _normalize_matrix(matrix: object) -> list[list[object]]:
|
|
958
|
+
if matrix is None:
|
|
959
|
+
return []
|
|
960
|
+
if isinstance(matrix, list):
|
|
961
|
+
return _ensure_matrix(matrix)
|
|
962
|
+
if isinstance(matrix, Sequence) and not isinstance(matrix, str | bytes | bytearray):
|
|
963
|
+
return _ensure_matrix(matrix)
|
|
964
|
+
return [[matrix]]
|
|
499
965
|
|
|
500
966
|
|
|
501
|
-
def _header_like_row(row: list[object]) -> bool:
|
|
967
|
+
def _header_like_row(row: list[object]) -> bool:
|
|
502
968
|
nonempty = [v for v in row if not (v is None or str(v).strip() == "")]
|
|
503
969
|
if len(nonempty) < 2:
|
|
504
970
|
return False
|
|
@@ -513,19 +979,21 @@ def _header_like_row(row: list[object]) -> bool:
|
|
|
513
979
|
return str_like >= num_like and str_like >= 1
|
|
514
980
|
|
|
515
981
|
|
|
516
|
-
def _table_signal_score(matrix: Sequence[Sequence[object]]) -> float:
|
|
517
|
-
normalized = _ensure_matrix(matrix)
|
|
518
|
-
density, coverage = _table_density_metrics(normalized)
|
|
519
|
-
header = any(_header_like_row(r) for r in normalized[:2]) # check first 2 rows
|
|
982
|
+
def _table_signal_score(matrix: Sequence[Sequence[object]]) -> float:
|
|
983
|
+
normalized = _ensure_matrix(matrix)
|
|
984
|
+
density, coverage = _table_density_metrics(normalized)
|
|
985
|
+
header = any(_header_like_row(r) for r in normalized[:2]) # check first 2 rows
|
|
520
986
|
|
|
521
|
-
rows = len(normalized)
|
|
522
|
-
cols =
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
987
|
+
rows = len(normalized)
|
|
988
|
+
cols = (
|
|
989
|
+
max((len(r) if isinstance(r, list) else 1) for r in normalized) if rows else 0
|
|
990
|
+
)
|
|
991
|
+
row_counts: list[int] = []
|
|
992
|
+
col_counts = [0] * cols if cols else []
|
|
993
|
+
for r in normalized:
|
|
994
|
+
cnt = 0
|
|
995
|
+
for j in range(cols):
|
|
996
|
+
v = r[j] if j < len(r) else None
|
|
529
997
|
if not (v is None or str(v).strip() == ""):
|
|
530
998
|
cnt += 1
|
|
531
999
|
if j < len(col_counts):
|
|
@@ -565,28 +1033,28 @@ def set_table_detection_params(
|
|
|
565
1033
|
_DETECTION_CONFIG["min_nonempty_cells"] = min_nonempty_cells
|
|
566
1034
|
|
|
567
1035
|
|
|
568
|
-
def shrink_to_content_openpyxl( # noqa: C901
|
|
569
|
-
ws: Worksheet,
|
|
570
|
-
top: int,
|
|
571
|
-
left: int,
|
|
572
|
-
bottom: int,
|
|
573
|
-
right: int,
|
|
574
|
-
require_inside_border: bool,
|
|
575
|
-
top_edge: np.ndarray,
|
|
576
|
-
bottom_edge: np.ndarray,
|
|
577
|
-
left_edge: np.ndarray,
|
|
578
|
-
right_edge: np.ndarray,
|
|
579
|
-
min_nonempty_ratio: float = 0.0,
|
|
580
|
-
) -> tuple[int, int, int, int]:
|
|
581
|
-
vals = _get_values_block(ws, top, left, bottom, right)
|
|
582
|
-
rows_n = bottom - top + 1
|
|
583
|
-
cols_n = right - left + 1
|
|
584
|
-
|
|
585
|
-
def to_str(x: object) -> str:
|
|
586
|
-
return "" if x is None else str(x)
|
|
587
|
-
|
|
588
|
-
def is_empty_value(x: object) -> bool:
|
|
589
|
-
return to_str(x).strip() == ""
|
|
1036
|
+
def shrink_to_content_openpyxl( # noqa: C901
|
|
1037
|
+
ws: Worksheet,
|
|
1038
|
+
top: int,
|
|
1039
|
+
left: int,
|
|
1040
|
+
bottom: int,
|
|
1041
|
+
right: int,
|
|
1042
|
+
require_inside_border: bool,
|
|
1043
|
+
top_edge: np.ndarray,
|
|
1044
|
+
bottom_edge: np.ndarray,
|
|
1045
|
+
left_edge: np.ndarray,
|
|
1046
|
+
right_edge: np.ndarray,
|
|
1047
|
+
min_nonempty_ratio: float = 0.0,
|
|
1048
|
+
) -> tuple[int, int, int, int]:
|
|
1049
|
+
vals = _get_values_block(ws, top, left, bottom, right)
|
|
1050
|
+
rows_n = bottom - top + 1
|
|
1051
|
+
cols_n = right - left + 1
|
|
1052
|
+
|
|
1053
|
+
def to_str(x: object) -> str:
|
|
1054
|
+
return "" if x is None else str(x)
|
|
1055
|
+
|
|
1056
|
+
def is_empty_value(x: object) -> bool:
|
|
1057
|
+
return to_str(x).strip() == ""
|
|
590
1058
|
|
|
591
1059
|
def row_nonempty_ratio_local(i: int) -> float:
|
|
592
1060
|
if cols_n <= 0:
|
|
@@ -725,29 +1193,45 @@ def shrink_to_content_openpyxl( # noqa: C901
|
|
|
725
1193
|
return top, left, bottom, right
|
|
726
1194
|
|
|
727
1195
|
|
|
728
|
-
def
|
|
729
|
-
"""
|
|
730
|
-
|
|
1196
|
+
def _extract_listobject_tables(sheet: xw.Sheet) -> list[str]:
|
|
1197
|
+
"""Extract table ranges from Excel ListObjects via COM.
|
|
1198
|
+
|
|
1199
|
+
Args:
|
|
1200
|
+
sheet: xlwings worksheet.
|
|
1201
|
+
|
|
1202
|
+
Returns:
|
|
1203
|
+
List of table ranges as Excel A1 strings.
|
|
1204
|
+
"""
|
|
1205
|
+
tables: list[str] = []
|
|
731
1206
|
try:
|
|
732
1207
|
for lo in sheet.api.ListObjects:
|
|
733
1208
|
rng = lo.Range
|
|
734
|
-
top_row = int(rng.Row)
|
|
735
|
-
left_col = int(rng.Column)
|
|
736
|
-
bottom_row = top_row + int(rng.Rows.Count) - 1
|
|
737
|
-
right_col = left_col + int(rng.Columns.Count) - 1
|
|
738
1209
|
addr = rng.Address(RowAbsolute=False, ColumnAbsolute=False)
|
|
739
1210
|
tables.append(addr)
|
|
740
|
-
except Exception:
|
|
741
|
-
pass
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
1211
|
+
except Exception:
|
|
1212
|
+
pass
|
|
1213
|
+
return tables
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
def _detect_border_rectangles_xlwings(
|
|
1217
|
+
sheet: xw.Sheet,
|
|
1218
|
+
) -> list[tuple[int, int, int, int]]:
|
|
1219
|
+
"""Detect bordered rectangles in a sheet using COM border inspection.
|
|
1220
|
+
|
|
1221
|
+
Args:
|
|
1222
|
+
sheet: xlwings worksheet.
|
|
1223
|
+
|
|
1224
|
+
Returns:
|
|
1225
|
+
List of rectangles as (top_row, left_col, bottom_row, right_col).
|
|
1226
|
+
"""
|
|
1227
|
+
used = sheet.used_range
|
|
1228
|
+
max_row = used.last_cell.row
|
|
1229
|
+
max_col = used.last_cell.column
|
|
1230
|
+
|
|
1231
|
+
def cell_has_any_border(r: int, c: int) -> bool:
|
|
1232
|
+
try:
|
|
1233
|
+
b = sheet.api.Cells(r, c).Borders
|
|
1234
|
+
for idx in (
|
|
751
1235
|
XL_EDGE_LEFT,
|
|
752
1236
|
XL_EDGE_TOP,
|
|
753
1237
|
XL_EDGE_RIGHT,
|
|
@@ -768,211 +1252,243 @@ def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]: # noqa: C901
|
|
|
768
1252
|
return False
|
|
769
1253
|
|
|
770
1254
|
grid = [[False] * (max_col + 1) for _ in range(max_row + 1)]
|
|
771
|
-
for r in range(1, max_row + 1):
|
|
772
|
-
for c in range(1, max_col + 1):
|
|
773
|
-
if cell_has_any_border(r, c):
|
|
774
|
-
grid[r][c] = True
|
|
775
|
-
visited = [[False] * (max_col + 1) for _ in range(max_row + 1)]
|
|
776
|
-
|
|
777
|
-
def dfs(sr: int, sc: int, acc: list[tuple[int, int]]) -> None:
|
|
778
|
-
stack = [(sr, sc)]
|
|
779
|
-
while stack:
|
|
780
|
-
rr, cc = stack.pop()
|
|
781
|
-
if not (1 <= rr <= max_row and 1 <= cc <= max_col):
|
|
782
|
-
continue
|
|
783
|
-
if visited[rr][cc] or not grid[rr][cc]:
|
|
784
|
-
continue
|
|
785
|
-
visited[rr][cc] = True
|
|
786
|
-
acc.append((rr, cc))
|
|
787
|
-
for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
|
|
788
|
-
stack.append((rr + dr, cc + dc))
|
|
789
|
-
|
|
790
|
-
clusters: list[tuple[int, int, int, int]] = []
|
|
791
1255
|
for r in range(1, max_row + 1):
|
|
792
1256
|
for c in range(1, max_col + 1):
|
|
793
|
-
if
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
left_col = min(cols)
|
|
803
|
-
right_col = max(cols)
|
|
804
|
-
clusters.append((top_row, left_col, bottom_row, right_col))
|
|
805
|
-
|
|
806
|
-
def overlaps_for_merge(
|
|
807
|
-
a: tuple[int, int, int, int], b: tuple[int, int, int, int]
|
|
808
|
-
) -> bool:
|
|
809
|
-
# Do not merge if one rect fully contains the other (separate clusters like big frame vs small table)
|
|
810
|
-
contains = (
|
|
811
|
-
a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
|
|
812
|
-
) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
|
|
813
|
-
if contains:
|
|
814
|
-
return False
|
|
815
|
-
return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
|
|
1257
|
+
if cell_has_any_border(r, c):
|
|
1258
|
+
grid[r][c] = True
|
|
1259
|
+
return _detect_border_rectangles(grid, min_size=4)
|
|
1260
|
+
|
|
1261
|
+
|
|
1262
|
+
def _detect_border_rectangles(
|
|
1263
|
+
has_border: np.ndarray | Sequence[Sequence[bool]], *, min_size: int
|
|
1264
|
+
) -> list[tuple[int, int, int, int]]:
|
|
1265
|
+
"""Detect border rectangles from a boolean grid.
|
|
816
1266
|
|
|
1267
|
+
Args:
|
|
1268
|
+
has_border: Boolean grid of border presence.
|
|
1269
|
+
min_size: Minimum cluster size to keep.
|
|
1270
|
+
|
|
1271
|
+
Returns:
|
|
1272
|
+
List of rectangles as (top_row, left_col, bottom_row, right_col).
|
|
1273
|
+
"""
|
|
1274
|
+
return detect_border_clusters(np.asarray(has_border, dtype=bool), min_size=min_size)
|
|
1275
|
+
|
|
1276
|
+
|
|
1277
|
+
def _merge_rectangles(
|
|
1278
|
+
rects: Sequence[tuple[int, int, int, int]],
|
|
1279
|
+
) -> list[tuple[int, int, int, int]]:
|
|
1280
|
+
"""Merge overlapping rectangles while preserving contained regions.
|
|
1281
|
+
|
|
1282
|
+
Args:
|
|
1283
|
+
rects: Sequence of rectangles (top, left, bottom, right).
|
|
1284
|
+
|
|
1285
|
+
Returns:
|
|
1286
|
+
Merged rectangles sorted by coordinates.
|
|
1287
|
+
"""
|
|
817
1288
|
merged_rects: list[tuple[int, int, int, int]] = []
|
|
818
|
-
for rect in sorted(
|
|
1289
|
+
for rect in sorted(rects):
|
|
819
1290
|
merged = False
|
|
820
|
-
for i,
|
|
821
|
-
if
|
|
1291
|
+
for i, existing in enumerate(merged_rects):
|
|
1292
|
+
if _rectangles_overlap_for_merge(rect, existing):
|
|
822
1293
|
merged_rects[i] = (
|
|
823
|
-
min(rect[0],
|
|
824
|
-
min(rect[1],
|
|
825
|
-
max(rect[2],
|
|
826
|
-
max(rect[3],
|
|
1294
|
+
min(rect[0], existing[0]),
|
|
1295
|
+
min(rect[1], existing[1]),
|
|
1296
|
+
max(rect[2], existing[2]),
|
|
1297
|
+
max(rect[3], existing[3]),
|
|
827
1298
|
)
|
|
828
1299
|
merged = True
|
|
829
1300
|
break
|
|
830
1301
|
if not merged:
|
|
831
1302
|
merged_rects.append(rect)
|
|
1303
|
+
return merged_rects
|
|
832
1304
|
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
1305
|
+
|
|
1306
|
+
def _rectangles_overlap_for_merge(
|
|
1307
|
+
a: tuple[int, int, int, int], b: tuple[int, int, int, int]
|
|
1308
|
+
) -> bool:
|
|
1309
|
+
"""Return True when rectangles should be merged.
|
|
1310
|
+
|
|
1311
|
+
Args:
|
|
1312
|
+
a: First rectangle (top, left, bottom, right).
|
|
1313
|
+
b: Second rectangle (top, left, bottom, right).
|
|
1314
|
+
|
|
1315
|
+
Returns:
|
|
1316
|
+
True if rectangles overlap and neither fully contains the other.
|
|
1317
|
+
"""
|
|
1318
|
+
contains = (a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]) or (
|
|
1319
|
+
b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3]
|
|
1320
|
+
)
|
|
1321
|
+
if contains:
|
|
1322
|
+
return False
|
|
1323
|
+
return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
|
|
1324
|
+
|
|
1325
|
+
|
|
1326
|
+
def _collect_table_candidates_from_values(
|
|
1327
|
+
values: Sequence[Sequence[object]],
|
|
1328
|
+
*,
|
|
1329
|
+
base_top: int,
|
|
1330
|
+
base_left: int,
|
|
1331
|
+
col_name: Callable[[int], str],
|
|
1332
|
+
) -> list[str]:
|
|
1333
|
+
"""Collect table candidates from a normalized value matrix.
|
|
1334
|
+
|
|
1335
|
+
Args:
|
|
1336
|
+
values: Normalized matrix of cell values.
|
|
1337
|
+
base_top: Top row index of the matrix in worksheet coordinates (1-based).
|
|
1338
|
+
base_left: Left column index of the matrix in worksheet coordinates (1-based).
|
|
1339
|
+
col_name: Function to convert column index to Excel letters.
|
|
1340
|
+
|
|
1341
|
+
Returns:
|
|
1342
|
+
List of detected table candidate range strings.
|
|
1343
|
+
"""
|
|
1344
|
+
normalized = [list(row) for row in values]
|
|
1345
|
+
nonempty = _count_nonempty_cells(normalized)
|
|
1346
|
+
if nonempty < _DETECTION_CONFIG["min_nonempty_cells"]:
|
|
1347
|
+
return []
|
|
1348
|
+
|
|
1349
|
+
results: list[str] = []
|
|
1350
|
+
clusters = _nonempty_clusters(normalized)
|
|
1351
|
+
for r0, c0, r1, c1 in clusters:
|
|
1352
|
+
sub = [row[c0 : c1 + 1] for row in normalized[r0 : r1 + 1]]
|
|
1353
|
+
density, coverage = _table_density_metrics(sub)
|
|
1354
|
+
if (
|
|
1355
|
+
density < _DETECTION_CONFIG["density_min"]
|
|
1356
|
+
and coverage < _DETECTION_CONFIG["coverage_min"]
|
|
1357
|
+
):
|
|
850
1358
|
continue
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
)
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
if addr not in dedup:
|
|
867
|
-
dedup.add(addr)
|
|
868
|
-
tables.append(addr)
|
|
869
|
-
return tables
|
|
1359
|
+
if not _is_plausible_table(sub):
|
|
1360
|
+
continue
|
|
1361
|
+
score = _table_signal_score(sub)
|
|
1362
|
+
if score < _DETECTION_CONFIG["table_score_threshold"]:
|
|
1363
|
+
continue
|
|
1364
|
+
addr = (
|
|
1365
|
+
f"{col_name(base_left + c0)}{base_top + r0}:"
|
|
1366
|
+
f"{col_name(base_left + c1)}{base_top + r1}"
|
|
1367
|
+
)
|
|
1368
|
+
results.append(addr)
|
|
1369
|
+
return results
|
|
1370
|
+
|
|
1371
|
+
|
|
1372
|
+
def _count_nonempty_cells(values: Sequence[Sequence[object]]) -> int:
|
|
1373
|
+
"""Count non-empty cells in a normalized matrix.
|
|
870
1374
|
|
|
1375
|
+
Args:
|
|
1376
|
+
values: Normalized matrix of values.
|
|
871
1377
|
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
data_only=True,
|
|
878
|
-
read_only=False,
|
|
1378
|
+
Returns:
|
|
1379
|
+
Number of non-empty cells.
|
|
1380
|
+
"""
|
|
1381
|
+
return sum(
|
|
1382
|
+
1 for row in values for v in row if not (v is None or str(v).strip() == "")
|
|
879
1383
|
)
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
1384
|
+
|
|
1385
|
+
|
|
1386
|
+
def _extract_openpyxl_table_refs(ws: Worksheet) -> list[str]:
|
|
1387
|
+
"""Extract table reference strings from an openpyxl worksheet.
|
|
1388
|
+
|
|
1389
|
+
Args:
|
|
1390
|
+
ws: Target worksheet.
|
|
1391
|
+
|
|
1392
|
+
Returns:
|
|
1393
|
+
List of table reference strings.
|
|
1394
|
+
"""
|
|
1395
|
+
tables: list[str] = []
|
|
1396
|
+
try:
|
|
1397
|
+
openpyxl_tables: list[object] = []
|
|
1398
|
+
if hasattr(ws, "tables") and ws.tables:
|
|
1399
|
+
if isinstance(ws.tables, dict):
|
|
1400
|
+
openpyxl_tables = list(ws.tables.values())
|
|
1401
|
+
else:
|
|
1402
|
+
openpyxl_tables = list(ws.tables)
|
|
1403
|
+
elif hasattr(ws, "_tables") and ws._tables:
|
|
1404
|
+
openpyxl_tables = list(ws._tables)
|
|
1405
|
+
for t in openpyxl_tables:
|
|
1406
|
+
addr = getattr(t, "ref", None)
|
|
1407
|
+
if addr:
|
|
1408
|
+
tables.append(str(addr))
|
|
895
1409
|
except Exception:
|
|
896
1410
|
pass
|
|
1411
|
+
return tables
|
|
897
1412
|
|
|
898
|
-
has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col = (
|
|
899
|
-
load_border_maps_xlsx(xlsx_path, sheet_name)
|
|
900
|
-
)
|
|
901
|
-
rects = detect_border_clusters(has_border, min_size=4)
|
|
902
|
-
|
|
903
|
-
def overlaps_for_merge(
|
|
904
|
-
a: tuple[int, int, int, int], b: tuple[int, int, int, int]
|
|
905
|
-
) -> bool:
|
|
906
|
-
contains = (
|
|
907
|
-
a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
|
|
908
|
-
) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
|
|
909
|
-
if contains:
|
|
910
|
-
return False
|
|
911
|
-
return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
|
|
912
1413
|
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
max(rect[2], ex[2]),
|
|
922
|
-
max(rect[3], ex[3]),
|
|
923
|
-
)
|
|
924
|
-
merged = True
|
|
925
|
-
break
|
|
926
|
-
if not merged:
|
|
927
|
-
merged_rects.append(rect)
|
|
1414
|
+
def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]:
|
|
1415
|
+
"""Detect table-like ranges via COM: ListObjects first, then border clusters."""
|
|
1416
|
+
tables: list[str] = []
|
|
1417
|
+
tables.extend(_extract_listobject_tables(sheet))
|
|
1418
|
+
|
|
1419
|
+
rects = _detect_border_rectangles_xlwings(sheet)
|
|
1420
|
+
merged_rects = _merge_rectangles(rects)
|
|
1421
|
+
dedup: set[str] = set(tables)
|
|
928
1422
|
|
|
929
|
-
dedup: set[str] = set()
|
|
930
1423
|
for top_row, left_col, bottom_row, right_col in merged_rects:
|
|
931
|
-
top_row, left_col, bottom_row, right_col =
|
|
932
|
-
|
|
933
|
-
top_row,
|
|
934
|
-
left_col,
|
|
935
|
-
bottom_row,
|
|
936
|
-
right_col,
|
|
937
|
-
require_inside_border=False,
|
|
938
|
-
top_edge=top_edge,
|
|
939
|
-
bottom_edge=bottom_edge,
|
|
940
|
-
left_edge=left_edge,
|
|
941
|
-
right_edge=right_edge,
|
|
942
|
-
min_nonempty_ratio=0.0,
|
|
943
|
-
)
|
|
944
|
-
vals_block = _get_values_block(ws, top_row, left_col, bottom_row, right_col)
|
|
945
|
-
vals_block = _normalize_matrix(vals_block)
|
|
946
|
-
nonempty = sum(
|
|
947
|
-
1
|
|
948
|
-
for row in vals_block
|
|
949
|
-
for v in row
|
|
950
|
-
if not (v is None or str(v).strip() == "")
|
|
1424
|
+
top_row, left_col, bottom_row, right_col = shrink_to_content(
|
|
1425
|
+
sheet, top_row, left_col, bottom_row, right_col, require_inside_border=False
|
|
951
1426
|
)
|
|
952
|
-
|
|
1427
|
+
rng_vals: object | None = None
|
|
1428
|
+
try:
|
|
1429
|
+
rng_vals = sheet.range((top_row, left_col), (bottom_row, right_col)).value
|
|
1430
|
+
except Exception as exc:
|
|
1431
|
+
logger.warning(
|
|
1432
|
+
"Failed to read range for table detection (%s). (%r)",
|
|
1433
|
+
sheet.name,
|
|
1434
|
+
exc,
|
|
1435
|
+
)
|
|
1436
|
+
if rng_vals is None:
|
|
953
1437
|
continue
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
):
|
|
962
|
-
continue
|
|
963
|
-
if not _is_plausible_table(sub):
|
|
964
|
-
continue
|
|
965
|
-
score = _table_signal_score(sub)
|
|
966
|
-
if score < _DETECTION_CONFIG["table_score_threshold"]:
|
|
967
|
-
continue
|
|
968
|
-
addr = f"{get_column_letter(left_col + c0)}{top_row + r0}:{get_column_letter(left_col + c1)}{top_row + r1}"
|
|
1438
|
+
candidates = _collect_table_candidates_from_values(
|
|
1439
|
+
_normalize_matrix(rng_vals),
|
|
1440
|
+
base_top=top_row,
|
|
1441
|
+
base_left=left_col,
|
|
1442
|
+
col_name=xw.utils.col_name,
|
|
1443
|
+
)
|
|
1444
|
+
for addr in candidates:
|
|
969
1445
|
if addr not in dedup:
|
|
970
1446
|
dedup.add(addr)
|
|
971
1447
|
tables.append(addr)
|
|
972
|
-
wb.close()
|
|
973
1448
|
return tables
|
|
974
1449
|
|
|
975
1450
|
|
|
1451
|
+
def detect_tables_openpyxl(xlsx_path: Path, sheet_name: str) -> list[str]:
|
|
1452
|
+
"""Detect table-like ranges via openpyxl tables and border clusters."""
|
|
1453
|
+
with openpyxl_workbook(xlsx_path, data_only=True, read_only=False) as wb:
|
|
1454
|
+
ws = wb[sheet_name]
|
|
1455
|
+
tables = _extract_openpyxl_table_refs(ws)
|
|
1456
|
+
|
|
1457
|
+
has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col = (
|
|
1458
|
+
load_border_maps_xlsx(xlsx_path, sheet_name)
|
|
1459
|
+
)
|
|
1460
|
+
rects = _detect_border_rectangles(has_border, min_size=4)
|
|
1461
|
+
merged_rects = _merge_rectangles(rects)
|
|
1462
|
+
dedup: set[str] = set(tables)
|
|
1463
|
+
|
|
1464
|
+
for top_row, left_col, bottom_row, right_col in merged_rects:
|
|
1465
|
+
top_row, left_col, bottom_row, right_col = shrink_to_content_openpyxl(
|
|
1466
|
+
ws,
|
|
1467
|
+
top_row,
|
|
1468
|
+
left_col,
|
|
1469
|
+
bottom_row,
|
|
1470
|
+
right_col,
|
|
1471
|
+
require_inside_border=False,
|
|
1472
|
+
top_edge=top_edge,
|
|
1473
|
+
bottom_edge=bottom_edge,
|
|
1474
|
+
left_edge=left_edge,
|
|
1475
|
+
right_edge=right_edge,
|
|
1476
|
+
min_nonempty_ratio=0.0,
|
|
1477
|
+
)
|
|
1478
|
+
vals_block = _get_values_block(ws, top_row, left_col, bottom_row, right_col)
|
|
1479
|
+
candidates = _collect_table_candidates_from_values(
|
|
1480
|
+
_normalize_matrix(vals_block),
|
|
1481
|
+
base_top=top_row,
|
|
1482
|
+
base_left=left_col,
|
|
1483
|
+
col_name=get_column_letter,
|
|
1484
|
+
)
|
|
1485
|
+
for addr in candidates:
|
|
1486
|
+
if addr not in dedup:
|
|
1487
|
+
dedup.add(addr)
|
|
1488
|
+
tables.append(addr)
|
|
1489
|
+
return tables
|
|
1490
|
+
|
|
1491
|
+
|
|
976
1492
|
def detect_tables(sheet: xw.Sheet) -> list[str]:
|
|
977
1493
|
excel_path: Path | None = None
|
|
978
1494
|
try:
|
|
@@ -1027,13 +1543,13 @@ def _coerce_numeric_preserve_format(val: str) -> int | float | str:
|
|
|
1027
1543
|
return int(val)
|
|
1028
1544
|
except Exception:
|
|
1029
1545
|
return val
|
|
1030
|
-
if _FLOAT_RE.match(val):
|
|
1031
|
-
try:
|
|
1032
|
-
dec = Decimal(val)
|
|
1033
|
-
exponent = int(dec.as_tuple().exponent)
|
|
1034
|
-
scale = max(1, -exponent)
|
|
1035
|
-
quantized = dec.quantize(Decimal("1." + "0" * scale))
|
|
1036
|
-
return float(quantized)
|
|
1037
|
-
except (InvalidOperation, Exception):
|
|
1038
|
-
return val
|
|
1546
|
+
if _FLOAT_RE.match(val):
|
|
1547
|
+
try:
|
|
1548
|
+
dec = Decimal(val)
|
|
1549
|
+
exponent = int(dec.as_tuple().exponent)
|
|
1550
|
+
scale = max(1, -exponent)
|
|
1551
|
+
quantized = dec.quantize(Decimal("1." + "0" * scale))
|
|
1552
|
+
return float(quantized)
|
|
1553
|
+
except (InvalidOperation, Exception):
|
|
1554
|
+
return val
|
|
1039
1555
|
return val
|