exstruct 0.2.80__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exstruct/__init__.py +387 -0
- exstruct/cli/availability.py +49 -0
- exstruct/cli/main.py +134 -0
- exstruct/core/__init__.py +0 -0
- exstruct/core/cells.py +1039 -0
- exstruct/core/charts.py +241 -0
- exstruct/core/integrate.py +388 -0
- exstruct/core/shapes.py +275 -0
- exstruct/engine.py +643 -0
- exstruct/errors.py +35 -0
- exstruct/io/__init__.py +555 -0
- exstruct/models/__init__.py +335 -0
- exstruct/models/maps.py +335 -0
- exstruct/models/types.py +8 -0
- exstruct/py.typed +0 -0
- exstruct/render/__init__.py +118 -0
- exstruct-0.2.80.dist-info/METADATA +435 -0
- exstruct-0.2.80.dist-info/RECORD +20 -0
- exstruct-0.2.80.dist-info/WHEEL +4 -0
- exstruct-0.2.80.dist-info/entry_points.txt +3 -0
exstruct/core/cells.py
ADDED
|
@@ -0,0 +1,1039 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import deque
|
|
4
|
+
from collections.abc import Sequence
|
|
5
|
+
from decimal import Decimal, InvalidOperation
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import re
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
from openpyxl import load_workbook
|
|
12
|
+
from openpyxl.utils import get_column_letter, range_boundaries
|
|
13
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import xlwings as xw
|
|
16
|
+
|
|
17
|
+
from ..models import CellRow
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
# Keys already reported through warn_once(), so each warning is emitted only once.
_warned_keys: set[str] = set()
# Excel COM constants used below with ``Borders(index).LineStyle``:
# the "no line" style value followed by the border-index values.
XL_LINESTYLE_NONE = -4142
XL_INSIDE_VERTICAL = 11
XL_INSIDE_HORIZONTAL = 12
XL_EDGE_LEFT = 7
XL_EDGE_TOP = 8
XL_EDGE_BOTTOM = 9
XL_EDGE_RIGHT = 10
# Input accepted by the matrix helpers: a sequence of rows, or one flat sequence.
MatrixInput = Sequence[Sequence[object]] | Sequence[object]

# Detection tuning parameters (can be overridden via set_table_detection_params)
_DETECTION_CONFIG = {
    "table_score_threshold": 0.35,
    "density_min": 0.05,
    "coverage_min": 0.2,
    # Candidate ranges with fewer non-empty cells than this are skipped.
    "min_nonempty_cells": 3,
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def warn_once(key: str, message: str) -> None:
    """Log ``message`` as a warning the first time ``key`` is seen.

    Later calls with the same ``key`` do nothing; seen keys are remembered in
    the module-level ``_warned_keys`` set.
    """
    if key in _warned_keys:
        return
    logger.warning(message)
    _warned_keys.add(key)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def extract_sheet_cells(file_path: Path) -> dict[str, list[CellRow]]:
    """Load every sheet with pandas and return its non-empty cells as CellRow lists.

    Each sheet is read as strings with no header row. Cells that are missing
    or whitespace-only are dropped, and rows left with no cells are omitted.
    ``CellRow.r`` is the 1-based row number; ``CellRow.c`` maps the zero-based
    column index (as a string) to the value returned by
    ``_coerce_numeric_preserve_format``.
    """
    frames = pd.read_excel(file_path, header=None, sheet_name=None, dtype=str)
    cells_by_sheet: dict[str, list[CellRow]] = {}
    for name, frame in frames.items():
        sheet_rows: list[CellRow] = []
        records = frame.fillna("").itertuples(index=False, name=None)
        for row_no, record in enumerate(records, start=1):
            texts = ("" if value is None else str(value) for value in record)
            kept: dict[str, int | float | str] = {
                str(col): _coerce_numeric_preserve_format(text)
                for col, text in enumerate(texts)
                if text.strip() != ""
            }
            if kept:
                sheet_rows.append(CellRow(r=row_no, c=kept))
        cells_by_sheet[name] = sheet_rows
    return cells_by_sheet
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def extract_sheet_cells_with_links(file_path: Path) -> dict[str, list[CellRow]]:
    """
    Extract cells and hyperlinks per sheet.

    Returns:
        {sheet_name: [CellRow(r=..., c=..., links={"col_index": url, ...}), ...]}
        ``links`` is None for rows that carry no hyperlink.

    Notes:
        - Uses pandas extraction for values (same filtering as extract_sheet_cells).
        - Collects hyperlinks via openpyxl (requires read_only=False because border maps/hyperlinks need full objects).
        - Links are mapped by column index string (e.g., "0") to hyperlink.target.
        - The workbook is closed even when reading the hyperlinks raises.
    """
    cell_rows = extract_sheet_cells(file_path)
    links_by_sheet: dict[str, dict[int, dict[str, str]]] = {}
    wb = load_workbook(file_path, data_only=True, read_only=False)
    try:
        for ws in wb.worksheets:
            sheet_links: dict[int, dict[str, str]] = {}
            for row in ws.iter_rows():
                for cell in row:
                    link = getattr(cell, "hyperlink", None)
                    target = getattr(link, "target", None) if link else None
                    if not target:
                        continue
                    # zero-based to align with extract_sheet_cells
                    col_str = str(cell.col_idx - 1)
                    sheet_links.setdefault(cell.row, {})[col_str] = target
            links_by_sheet[ws.title] = sheet_links
    finally:
        # Release the file handle regardless of how the scan ended.
        wb.close()

    merged: dict[str, list[CellRow]] = {}
    for sheet_name, rows in cell_rows.items():
        sheet_links = links_by_sheet.get(sheet_name, {})
        merged[sheet_name] = [
            CellRow(r=row.r, c=row.c, links=sheet_links.get(row.r, {}) or None)
            for row in rows
        ]
    return merged
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def shrink_to_content(  # noqa: C901
    sheet: xw.Sheet,
    top: int,
    left: int,
    bottom: int,
    right: int,
    require_inside_border: bool = False,
    min_nonempty_ratio: float = 0.0,
) -> tuple[int, int, int, int]:
    """Trim a rectangle based on cell contents and optional border heuristics.

    Columns are trimmed from the left, then rows from the top, then columns
    from the right, then rows from the bottom. An outer row/column is trimmed
    while any of these holds:

    - every cell in it is empty (``None`` or whitespace-only once stringified);
    - ``require_inside_border`` is true and no cell in it reports an inside
      border line style through ``sheet.api``;
    - ``min_nonempty_ratio`` is positive and the fraction of non-empty cells
      in it is below that ratio.

    Args:
        sheet: Sheet providing ``range(...).value`` and, for border checks, ``api``.
        top, left, bottom, right: Inclusive rectangle in sheet coordinates.
        require_inside_border: Also trim rows/columns without inside borders.
        min_nonempty_ratio: Minimum non-empty fraction to keep a row/column.

    Returns:
        The trimmed ``(top, left, bottom, right)``. When everything is trimmed
        the result can have ``left > right`` and/or ``top > bottom``.
    """
    vals = sheet.range((top, left), (bottom, right)).value
    if vals is None:
        vals = []
    elif not isinstance(vals, list):
        # Single cell: scalar value.
        vals = [[vals]]
    elif vals and not isinstance(vals[0], list):
        # A flat list is either one row or one column; the requested
        # coordinates tell which. Treating a column as a row would trim
        # along the wrong axis.
        if left == right and top != bottom:
            vals = [[v] for v in vals]
        else:
            vals = [vals]
    rows_n = len(vals)
    cols_n = len(vals[0]) if rows_n else 0

    def is_empty_value(x: object) -> bool:
        return x is None or str(x).strip() == ""

    def row_empty(i: int) -> bool:
        return cols_n == 0 or all(is_empty_value(vals[i][j]) for j in range(cols_n))

    def col_empty(j: int) -> bool:
        return rows_n == 0 or all(is_empty_value(vals[i][j]) for i in range(rows_n))

    def row_nonempty_ratio(i: int) -> float:
        if cols_n == 0:
            return 0.0
        cnt = sum(1 for j in range(cols_n) if not is_empty_value(vals[i][j]))
        return cnt / cols_n

    def col_nonempty_ratio(j: int) -> float:
        if rows_n == 0:
            return 0.0
        cnt = sum(1 for i in range(rows_n) if not is_empty_value(vals[i][j]))
        return cnt / rows_n

    def column_has_inside_border(col_idx: int) -> bool:
        # col_idx is relative to the current (already trimmed) left edge.
        if not require_inside_border:
            return False
        try:
            for r in range(top, bottom + 1):
                ls = (
                    sheet.api.Cells(r, left + col_idx)
                    .Borders(XL_INSIDE_VERTICAL)
                    .LineStyle
                )
                if ls is not None and ls != XL_LINESTYLE_NONE:
                    return True
        except Exception:
            # Best effort: a failing border probe counts as "no border".
            logger.debug("Inside-vertical border probe failed", exc_info=True)
        return False

    def row_has_inside_border(row_idx: int) -> bool:
        # row_idx is relative to the current (already trimmed) top edge.
        if not require_inside_border:
            return False
        try:
            for c in range(left, right + 1):
                ls = (
                    sheet.api.Cells(top + row_idx, c)
                    .Borders(XL_INSIDE_HORIZONTAL)
                    .LineStyle
                )
                if ls is not None and ls != XL_LINESTYLE_NONE:
                    return True
        except Exception:
            # Best effort: a failing border probe counts as "no border".
            logger.debug("Inside-horizontal border probe failed", exc_info=True)
        return False

    def should_trim_col(j: int) -> bool:
        if col_empty(j):
            return True
        if require_inside_border and not column_has_inside_border(j):
            return True
        return min_nonempty_ratio > 0.0 and col_nonempty_ratio(j) < min_nonempty_ratio

    def should_trim_row(i: int) -> bool:
        if row_empty(i):
            return True
        if require_inside_border and not row_has_inside_border(i):
            return True
        return min_nonempty_ratio > 0.0 and row_nonempty_ratio(i) < min_nonempty_ratio

    # Left edge.
    while left <= right and cols_n > 0 and should_trim_col(0):
        for row in vals:
            row.pop(0)
        cols_n = len(vals[0]) if rows_n else 0
        left += 1
    # Top edge.
    while top <= bottom and rows_n > 0 and should_trim_row(0):
        vals.pop(0)
        rows_n = len(vals)
        top += 1
    # Right edge.
    while left <= right and cols_n > 0 and should_trim_col(cols_n - 1):
        for row in vals:
            row.pop(cols_n - 1)
        cols_n = len(vals[0]) if rows_n else 0
        right -= 1
    # Bottom edge.
    while top <= bottom and rows_n > 0 and should_trim_row(rows_n - 1):
        vals.pop(rows_n - 1)
        rows_n = len(vals)
        bottom -= 1
    return top, left, bottom, right
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def load_border_maps_xlsx(  # noqa: C901
    xlsx_path: Path, sheet_name: str
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int]:
    """Build boolean border maps for one worksheet using openpyxl.

    Every returned array has shape ``(max_row + 1, max_col + 1)`` and is
    indexed as ``[row, column]`` with the sheet's 1-based coordinates, so
    index 0 on either axis is never set.

    Args:
        xlsx_path: Workbook to open.
        sheet_name: Name of the worksheet to scan.

    Returns:
        ``(has_border, top_edge, bottom_edge, left_edge, right_edge, max_row,
        max_col)``; ``has_border`` is true where any of the four edges has a
        style other than ``None``/``"none"``.

    Raises:
        KeyError: If ``sheet_name`` is not in the workbook.
    """
    wb = load_workbook(xlsx_path, data_only=True, read_only=False)
    try:
        if sheet_name not in wb.sheetnames:
            raise KeyError(f"Sheet '{sheet_name}' not found in {xlsx_path}")

        ws = wb[sheet_name]
        try:
            min_col, min_row, max_col, max_row = range_boundaries(ws.calculate_dimension())
        except Exception:
            # Best effort: fall back to the sheet's own max extents.
            min_col, min_row, max_col, max_row = 1, 1, ws.max_column or 1, ws.max_row or 1

        shape = (max_row + 1, max_col + 1)
        has_border = np.zeros(shape, dtype=bool)
        top_edge = np.zeros(shape, dtype=bool)
        bottom_edge = np.zeros(shape, dtype=bool)
        left_edge = np.zeros(shape, dtype=bool)
        right_edge = np.zeros(shape, dtype=bool)

        def edge_has_style(edge: object) -> bool:
            if edge is None:
                return False
            style = getattr(edge, "style", None)
            return style is not None and style != "none"

        for r in range(min_row, max_row + 1):
            for c in range(min_col, max_col + 1):
                b = getattr(ws.cell(row=r, column=c), "border", None)
                if b is None:
                    continue

                t = edge_has_style(b.top)
                btm = edge_has_style(b.bottom)
                left_border = edge_has_style(b.left)
                rgt = edge_has_style(b.right)

                top_edge[r, c] = t
                bottom_edge[r, c] = btm
                left_edge[r, c] = left_border
                right_edge[r, c] = rgt
                has_border[r, c] = t or btm or left_border or rgt
    finally:
        # Close the workbook on every path, including errors raised above.
        wb.close()
    return has_border, top_edge, bottom_edge, left_edge, right_edge, max_row, max_col
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _detect_border_clusters_numpy(
    has_border: np.ndarray, min_size: int
) -> list[tuple[int, int, int, int]]:
    """Return bounding boxes of 4-connected true regions using scipy.

    Args:
        has_border: 2D array; non-zero cells are part of a region.
        min_size: Regions with fewer cells than this are dropped.

    Returns:
        ``(min_row, min_col, max_row, max_col)`` per kept region, in label order.

    Raises:
        ImportError: If scipy is not installed.
    """
    from scipy.ndimage import find_objects, label

    structure = np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]], dtype=np.uint8)
    lbl, num = label(has_border.astype(np.uint8), structure=structure)
    # One pass for all region sizes and one for all bounding slices, instead
    # of rescanning the whole array once per label.
    sizes = np.bincount(lbl.ravel(), minlength=int(num) + 1)
    rects: list[tuple[int, int, int, int]] = []
    for k, bounds in enumerate(find_objects(lbl), start=1):
        if bounds is None or int(sizes[k]) < min_size:
            continue
        row_sl, col_sl = bounds
        rects.append(
            (int(row_sl.start), int(col_sl.start), int(row_sl.stop) - 1, int(col_sl.stop) - 1)
        )
    return rects
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _detect_border_clusters_python(
    has_border: np.ndarray, min_size: int
) -> list[tuple[int, int, int, int]]:
    """Pure-Python BFS equivalent of the scipy-based cluster detection.

    Scans the 2D array in row-major order, flood-fills each unvisited true
    cell over its 4 neighbours, and returns
    ``(min_row, min_col, max_row, max_col)`` for every region that has at
    least ``min_size`` cells.
    """
    n_rows, n_cols = has_border.shape
    seen = np.zeros_like(has_border, dtype=bool)
    steps = ((1, 0), (-1, 0), (0, 1), (0, -1))
    found: list[tuple[int, int, int, int]] = []
    for start_r in range(n_rows):
        for start_c in range(n_cols):
            if seen[start_r, start_c] or not has_border[start_r, start_c]:
                continue
            seen[start_r, start_c] = True
            pending = deque([(start_r, start_c)])
            size = 1
            r_lo = r_hi = start_r
            c_lo = c_hi = start_c
            while pending:
                cur_r, cur_c = pending.popleft()
                for step_r, step_c in steps:
                    nxt_r = cur_r + step_r
                    nxt_c = cur_c + step_c
                    if not (0 <= nxt_r < n_rows and 0 <= nxt_c < n_cols):
                        continue
                    if seen[nxt_r, nxt_c] or not has_border[nxt_r, nxt_c]:
                        continue
                    seen[nxt_r, nxt_c] = True
                    pending.append((nxt_r, nxt_c))
                    size += 1
                    r_lo = min(r_lo, nxt_r)
                    r_hi = max(r_hi, nxt_r)
                    c_lo = min(c_lo, nxt_c)
                    c_hi = max(c_hi, nxt_c)
            if size >= min_size:
                found.append((r_lo, c_lo, r_hi, c_hi))
    return found
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def detect_border_clusters(
    has_border: np.ndarray, min_size: int = 4
) -> list[tuple[int, int, int, int]]:
    """Return bounding boxes of connected bordered regions.

    Tries the scipy-based implementation first and falls back to the
    pure-Python BFS if it cannot be used. A one-time warning explains why the
    fallback was taken: scipy missing, or scipy present but failing.

    Args:
        has_border: 2D boolean map of bordered cells.
        min_size: Minimum number of cells for a region to be reported.

    Returns:
        ``(min_row, min_col, max_row, max_col)`` tuples.
    """
    try:
        return _detect_border_clusters_numpy(has_border, min_size)
    except ImportError:
        warn_once(
            "scipy-missing",
            "scipy is not available. Falling back to pure-Python BFS for connected components, which may be significantly slower.",
        )
    except Exception as exc:
        # Still best effort, but do not blame a missing scipy for other failures.
        warn_once(
            "scipy-label-failed",
            f"scipy-based connected component labeling failed ({exc!r}). Falling back to pure-Python BFS, which may be significantly slower.",
        )
    return _detect_border_clusters_python(has_border, min_size)
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _get_values_block(
    ws: Worksheet, top: int, left: int, bottom: int, right: int
) -> list[list[object]]:
    """Return the values of the given inclusive cell rectangle as a list of row lists."""
    row_iter = ws.iter_rows(
        min_row=top, max_row=bottom, min_col=left, max_col=right, values_only=True
    )
    return [list(values) for values in row_iter]
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _ensure_matrix(matrix: MatrixInput) -> list[list[object]]:
    """Coerce ``matrix`` into a list of row lists.

    - Empty input gives ``[]``.
    - If the first item is a non-string sequence, the input is treated as
      rows: sequence items are copied into lists and any other item becomes
      a one-element row.
    - Otherwise the whole input is treated as a single row.
    """

    def is_row_like(item: object) -> bool:
        return isinstance(item, Sequence) and not isinstance(item, (str, bytes, bytearray))

    items = list(matrix)
    if not items:
        return []
    if not is_row_like(items[0]):
        return [list(items)]
    return [list(item) if is_row_like(item) else [item] for item in items]
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _table_density_metrics(matrix: MatrixInput) -> tuple[float, float]:
    """
    Compute ``(density, coverage)`` for a 2D matrix (list of rows).

    density: number of non-empty cells divided by ``rows * len(first row)``.
    coverage: area of the tight bounding box around the non-empty cells
    divided by that same total.
    Returns ``(0.0, 0.0)`` when the matrix is empty, its first row is empty,
    or no cell is non-empty.
    """
    grid = _ensure_matrix(matrix)
    if not grid or not grid[0]:
        return 0.0, 0.0

    filled = [
        (r, c)
        for r, line in enumerate(grid)
        for c, cell in enumerate(line)
        if cell is not None and str(cell).strip() != ""
    ]
    if not filled:
        return 0.0, 0.0

    area = len(grid) * len(grid[0])
    row_ids = [r for r, _ in filled]
    col_ids = [c for _, c in filled]
    box_height = max(row_ids) - min(row_ids) + 1
    box_width = max(col_ids) - min(col_ids) + 1
    return len(filled) / area, (box_height * box_width) / area
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def _is_plausible_table(matrix: MatrixInput) -> bool:
    """
    Heuristic: require at least 2 rows and 2 cols with meaningful data.
    - At least 2 rows have 2 or more non-empty cells
    - At least 2 columns have 2 or more non-empty cells
    """
    grid = _ensure_matrix(matrix)
    if not grid:
        return False

    width = max(len(line) for line in grid)
    if len(grid) < 2 or width < 2:
        return False

    per_col = [0] * width
    per_row: list[int] = []
    for line in grid:
        # Cells missing from short rows count as empty, so only existing cells are visited.
        hits = [
            col
            for col, cell in enumerate(line)
            if cell is not None and str(cell).strip() != ""
        ]
        for col in hits:
            per_col[col] += 1
        per_row.append(len(hits))

    dense_rows = sum(1 for n in per_row if n >= 2)
    dense_cols = sum(1 for n in per_col if n >= 2)
    return dense_rows >= 2 and dense_cols >= 2
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def _nonempty_clusters(matrix: Sequence[Sequence[object]]) -> list[tuple[int, int, int, int]]:
    """Return bounding boxes of connected components of nonempty cells (4-neighbor).

    Boxes are ``(min_row, min_col, max_row, max_col)`` in zero-based matrix
    indices, listed in the row-major order in which each component is first met.
    """
    if not matrix:
        return []

    filled = [
        (r, c)
        for r, line in enumerate(matrix)
        for c, cell in enumerate(line)
        if cell is not None and str(cell).strip() != ""
    ]
    unvisited = set(filled)
    boxes: list[tuple[int, int, int, int]] = []
    for seed in filled:
        if seed not in unvisited:
            continue
        unvisited.discard(seed)
        frontier = deque([seed])
        r_lo = r_hi = seed[0]
        c_lo = c_hi = seed[1]
        while frontier:
            r, c = frontier.popleft()
            r_lo, r_hi = min(r_lo, r), max(r_hi, r)
            c_lo, c_hi = min(c_lo, c), max(c_hi, c)
            for neighbour in ((r + 1, c), (r - 1, c), (r, c + 1), (r, c - 1)):
                if neighbour in unvisited:
                    unvisited.remove(neighbour)
                    frontier.append(neighbour)
        boxes.append((r_lo, c_lo, r_hi, c_hi))
    return boxes
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def _normalize_matrix(matrix: object) -> list[list[object]]:
    """Turn an arbitrary range value into a list of row lists.

    ``None`` gives ``[]``; any non-string sequence (lists included) is passed
    to ``_ensure_matrix``; every other value becomes a 1x1 matrix.
    """
    if matrix is None:
        return []
    is_text = isinstance(matrix, (str, bytes, bytearray))
    if isinstance(matrix, Sequence) and not is_text:
        return _ensure_matrix(matrix)
    return [[matrix]]
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def _header_like_row(row: list[object]) -> bool:
    """Return True when ``row`` looks like a header row.

    A row qualifies when it has at least two non-empty cells and the cells
    that match neither ``_INT_RE`` nor ``_FLOAT_RE`` are at least as many as
    those that match one of them (and at least one).
    NOTE(review): both patterns are defined elsewhere in the module; they
    presumably match integer and float literals - confirm.
    """
    texts = [str(v) for v in row if v is not None and str(v).strip() != ""]
    if len(texts) < 2:
        return False
    numeric = sum(1 for text in texts if _INT_RE.match(text) or _FLOAT_RE.match(text))
    textual = len(texts) - numeric
    return textual >= numeric and textual >= 1
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def _table_signal_score(matrix: Sequence[Sequence[object]]) -> float:
    """Score how table-like ``matrix`` is.

    The score starts at the density from ``_table_density_metrics`` and adds:
    0.2 if either of the first two rows is header-like, 0.1 if coverage
    exceeds 0.5, and 0.1 if at least two rows and at least two columns each
    hold two or more non-empty cells.
    """
    grid = _ensure_matrix(matrix)
    density, coverage = _table_density_metrics(grid)
    has_header = any(_header_like_row(line) for line in grid[:2])  # check first 2 rows

    width = max((len(line) for line in grid), default=0)
    per_col = [0] * width
    per_row: list[int] = []
    for line in grid:
        hits = [
            col
            for col, cell in enumerate(line)
            if cell is not None and str(cell).strip() != ""
        ]
        for col in hits:
            per_col[col] += 1
        per_row.append(len(hits))
    dense_rows = sum(1 for n in per_row if n >= 2)
    dense_cols = sum(1 for n in per_col if n >= 2)
    structure_bonus = 0.1 if (dense_rows >= 2 and dense_cols >= 2) else 0.0

    score = density
    if has_header:
        score += 0.2
    if coverage > 0.5:
        score += 0.1
    score += structure_bonus
    return score
|
|
545
|
+
|
|
546
|
+
|
|
547
|
+
def set_table_detection_params(
    *,
    table_score_threshold: float | None = None,
    density_min: float | None = None,
    coverage_min: float | None = None,
    min_nonempty_cells: int | None = None,
) -> None:
    """
    Update the module-level table detection settings.

    Each keyword that is not None overwrites the entry of the same name in
    ``_DETECTION_CONFIG``; keywords left as None keep their current value.
    """
    overrides = {
        "table_score_threshold": table_score_threshold,
        "density_min": density_min,
        "coverage_min": coverage_min,
        "min_nonempty_cells": min_nonempty_cells,
    }
    for name, value in overrides.items():
        if value is not None:
            _DETECTION_CONFIG[name] = value
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def shrink_to_content_openpyxl(  # noqa: C901
    ws: Worksheet,
    top: int,
    left: int,
    bottom: int,
    right: int,
    require_inside_border: bool,
    top_edge: np.ndarray,
    bottom_edge: np.ndarray,
    left_edge: np.ndarray,
    right_edge: np.ndarray,
    min_nonempty_ratio: float = 0.0,
) -> tuple[int, int, int, int]:
    """Trim a rectangle using precomputed border maps and cell values.

    Columns are trimmed from the left, then rows from the top, then columns
    from the right, then rows from the bottom. An outer row/column is trimmed
    while any of these holds:

    - none of its cells has a border in any of the four edge maps;
    - ``require_inside_border`` is true and it has no inside border against
      its inner neighbour;
    - ``min_nonempty_ratio`` is positive and its fraction of non-empty values
      is below that ratio.

    The edge maps are indexed ``[row, column]`` with absolute sheet
    coordinates. Returns the trimmed ``(top, left, bottom, right)``.

    NOTE(review): the inside-border test requires an index strictly greater
    than the current ``left``/``top``, so with ``require_inside_border=True``
    the left-most column and top-most row are always trimmed - confirm this
    is intended.
    """
    vals = _get_values_block(ws, top, left, bottom, right)
    rows_n = bottom - top + 1
    cols_n = right - left + 1

    def is_blank(x: object) -> bool:
        return ("" if x is None else str(x)).strip() == ""

    def has_any_edge(r_abs: int, c_abs: int) -> bool:
        return (
            top_edge[r_abs, c_abs]
            or bottom_edge[r_abs, c_abs]
            or left_edge[r_abs, c_abs]
            or right_edge[r_abs, c_abs]
        )

    def col_is_borderless(c_abs: int) -> bool:
        return all(not has_any_edge(r_abs, c_abs) for r_abs in range(top, bottom + 1))

    def row_is_borderless(r_abs: int) -> bool:
        return all(not has_any_edge(r_abs, c_abs) for c_abs in range(left, right + 1))

    def row_fill_ratio(i: int) -> float:
        if cols_n <= 0:
            return 0.0
        filled = sum(1 for v in vals[i] if not is_blank(v))
        return filled / cols_n

    def col_fill_ratio(j: int) -> float:
        if rows_n <= 0:
            return 0.0
        filled = sum(1 for i in range(rows_n) if not is_blank(vals[i][j]))
        return filled / rows_n

    def col_has_inside_border(c_abs: int) -> bool:
        if not require_inside_border:
            return False
        pairs = sum(
            1
            for r_abs in range(top, bottom + 1)
            if c_abs > left and right_edge[r_abs, c_abs - 1] and left_edge[r_abs, c_abs]
        )
        return pairs > 0

    def row_has_inside_border(r_abs: int) -> bool:
        if not require_inside_border:
            return False
        pairs = sum(
            1
            for c_abs in range(left, right + 1)
            if r_abs > top and bottom_edge[r_abs - 1, c_abs] and top_edge[r_abs, c_abs]
        )
        return pairs > 0

    def drop_col(c_abs: int, j_rel: int) -> bool:
        # c_abs indexes the edge maps; j_rel indexes the trimmed value block.
        if col_is_borderless(c_abs):
            return True
        if require_inside_border and not col_has_inside_border(c_abs):
            return True
        return min_nonempty_ratio > 0.0 and col_fill_ratio(j_rel) < min_nonempty_ratio

    def drop_row(r_abs: int, i_rel: int) -> bool:
        # r_abs indexes the edge maps; i_rel indexes the trimmed value block.
        if row_is_borderless(r_abs):
            return True
        if require_inside_border and not row_has_inside_border(r_abs):
            return True
        return min_nonempty_ratio > 0.0 and row_fill_ratio(i_rel) < min_nonempty_ratio

    # Left edge.
    while left <= right and cols_n > 0 and drop_col(left, 0):
        for i in range(rows_n):
            vals[i].pop(0)
        cols_n -= 1
        left += 1
    # Top edge.
    while top <= bottom and rows_n > 0 and drop_row(top, 0):
        vals.pop(0)
        rows_n -= 1
        top += 1
    # Right edge.
    while left <= right and cols_n > 0 and drop_col(right, cols_n - 1):
        for i in range(rows_n):
            vals[i].pop(cols_n - 1)
        cols_n -= 1
        right -= 1
    # Bottom edge.
    while top <= bottom and rows_n > 0 and drop_row(bottom, rows_n - 1):
        vals.pop(rows_n - 1)
        rows_n -= 1
        bottom -= 1
    return top, left, bottom, right
|
|
726
|
+
|
|
727
|
+
|
|
728
|
+
def detect_tables_xlwings(sheet: xw.Sheet) -> list[str]: # noqa: C901
|
|
729
|
+
"""Detect table-like ranges via COM: ListObjects first, then border clusters."""
|
|
730
|
+
tables: list[str] = []
|
|
731
|
+
try:
|
|
732
|
+
for lo in sheet.api.ListObjects:
|
|
733
|
+
rng = lo.Range
|
|
734
|
+
top_row = int(rng.Row)
|
|
735
|
+
left_col = int(rng.Column)
|
|
736
|
+
bottom_row = top_row + int(rng.Rows.Count) - 1
|
|
737
|
+
right_col = left_col + int(rng.Columns.Count) - 1
|
|
738
|
+
addr = rng.Address(RowAbsolute=False, ColumnAbsolute=False)
|
|
739
|
+
tables.append(addr)
|
|
740
|
+
except Exception:
|
|
741
|
+
pass
|
|
742
|
+
|
|
743
|
+
used = sheet.used_range
|
|
744
|
+
max_row = used.last_cell.row
|
|
745
|
+
max_col = used.last_cell.column
|
|
746
|
+
|
|
747
|
+
def cell_has_any_border(r: int, c: int) -> bool:
|
|
748
|
+
try:
|
|
749
|
+
b = sheet.api.Cells(r, c).Borders
|
|
750
|
+
for idx in (
|
|
751
|
+
XL_EDGE_LEFT,
|
|
752
|
+
XL_EDGE_TOP,
|
|
753
|
+
XL_EDGE_RIGHT,
|
|
754
|
+
XL_EDGE_BOTTOM,
|
|
755
|
+
XL_INSIDE_VERTICAL,
|
|
756
|
+
XL_INSIDE_HORIZONTAL,
|
|
757
|
+
):
|
|
758
|
+
ls = b(idx).LineStyle
|
|
759
|
+
if ls is not None and ls != XL_LINESTYLE_NONE:
|
|
760
|
+
try:
|
|
761
|
+
if getattr(b(idx), "Weight", 0) == 0:
|
|
762
|
+
continue
|
|
763
|
+
except Exception:
|
|
764
|
+
pass
|
|
765
|
+
return True
|
|
766
|
+
return False
|
|
767
|
+
except Exception:
|
|
768
|
+
return False
|
|
769
|
+
|
|
770
|
+
grid = [[False] * (max_col + 1) for _ in range(max_row + 1)]
|
|
771
|
+
for r in range(1, max_row + 1):
|
|
772
|
+
for c in range(1, max_col + 1):
|
|
773
|
+
if cell_has_any_border(r, c):
|
|
774
|
+
grid[r][c] = True
|
|
775
|
+
visited = [[False] * (max_col + 1) for _ in range(max_row + 1)]
|
|
776
|
+
|
|
777
|
+
def dfs(sr: int, sc: int, acc: list[tuple[int, int]]) -> None:
|
|
778
|
+
stack = [(sr, sc)]
|
|
779
|
+
while stack:
|
|
780
|
+
rr, cc = stack.pop()
|
|
781
|
+
if not (1 <= rr <= max_row and 1 <= cc <= max_col):
|
|
782
|
+
continue
|
|
783
|
+
if visited[rr][cc] or not grid[rr][cc]:
|
|
784
|
+
continue
|
|
785
|
+
visited[rr][cc] = True
|
|
786
|
+
acc.append((rr, cc))
|
|
787
|
+
for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1)):
|
|
788
|
+
stack.append((rr + dr, cc + dc))
|
|
789
|
+
|
|
790
|
+
clusters: list[tuple[int, int, int, int]] = []
|
|
791
|
+
for r in range(1, max_row + 1):
|
|
792
|
+
for c in range(1, max_col + 1):
|
|
793
|
+
if grid[r][c] and not visited[r][c]:
|
|
794
|
+
cluster: list[tuple[int, int]] = []
|
|
795
|
+
dfs(r, c, cluster)
|
|
796
|
+
if len(cluster) < 4:
|
|
797
|
+
continue
|
|
798
|
+
rows = [rc[0] for rc in cluster]
|
|
799
|
+
cols = [rc[1] for rc in cluster]
|
|
800
|
+
top_row = min(rows)
|
|
801
|
+
bottom_row = max(rows)
|
|
802
|
+
left_col = min(cols)
|
|
803
|
+
right_col = max(cols)
|
|
804
|
+
clusters.append((top_row, left_col, bottom_row, right_col))
|
|
805
|
+
|
|
806
|
+
def overlaps_for_merge(
|
|
807
|
+
a: tuple[int, int, int, int], b: tuple[int, int, int, int]
|
|
808
|
+
) -> bool:
|
|
809
|
+
# Do not merge if one rect fully contains the other (separate clusters like big frame vs small table)
|
|
810
|
+
contains = (
|
|
811
|
+
a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
|
|
812
|
+
) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
|
|
813
|
+
if contains:
|
|
814
|
+
return False
|
|
815
|
+
return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])
|
|
816
|
+
|
|
817
|
+
merged_rects: list[tuple[int, int, int, int]] = []
|
|
818
|
+
for rect in sorted(clusters):
|
|
819
|
+
merged = False
|
|
820
|
+
for i, ex in enumerate(merged_rects):
|
|
821
|
+
if overlaps_for_merge(rect, ex):
|
|
822
|
+
merged_rects[i] = (
|
|
823
|
+
min(rect[0], ex[0]),
|
|
824
|
+
min(rect[1], ex[1]),
|
|
825
|
+
max(rect[2], ex[2]),
|
|
826
|
+
max(rect[3], ex[3]),
|
|
827
|
+
)
|
|
828
|
+
merged = True
|
|
829
|
+
break
|
|
830
|
+
if not merged:
|
|
831
|
+
merged_rects.append(rect)
|
|
832
|
+
|
|
833
|
+
dedup: set[str] = set()
|
|
834
|
+
for top_row, left_col, bottom_row, right_col in merged_rects:
|
|
835
|
+
top_row, left_col, bottom_row, right_col = shrink_to_content(
|
|
836
|
+
sheet, top_row, left_col, bottom_row, right_col, require_inside_border=False
|
|
837
|
+
)
|
|
838
|
+
try:
|
|
839
|
+
rng_vals = sheet.range((top_row, left_col), (bottom_row, right_col)).value
|
|
840
|
+
rng_vals = _normalize_matrix(rng_vals)
|
|
841
|
+
nonempty = sum(
|
|
842
|
+
1
|
|
843
|
+
for row in rng_vals
|
|
844
|
+
for v in (row if isinstance(row, list) else [row])
|
|
845
|
+
if not (v is None or str(v).strip() == "")
|
|
846
|
+
)
|
|
847
|
+
except Exception:
|
|
848
|
+
nonempty = 0
|
|
849
|
+
if nonempty < _DETECTION_CONFIG["min_nonempty_cells"]:
|
|
850
|
+
continue
|
|
851
|
+
clusters = _nonempty_clusters(rng_vals)
|
|
852
|
+
for r0, c0, r1, c1 in clusters:
|
|
853
|
+
sub = [row[c0 : c1 + 1] for row in rng_vals[r0 : r1 + 1]]
|
|
854
|
+
density, coverage = _table_density_metrics(sub)
|
|
855
|
+
if (
|
|
856
|
+
density < _DETECTION_CONFIG["density_min"]
|
|
857
|
+
and coverage < _DETECTION_CONFIG["coverage_min"]
|
|
858
|
+
):
|
|
859
|
+
continue
|
|
860
|
+
if not _is_plausible_table(sub):
|
|
861
|
+
continue
|
|
862
|
+
score = _table_signal_score(sub)
|
|
863
|
+
if score < _DETECTION_CONFIG["table_score_threshold"]:
|
|
864
|
+
continue
|
|
865
|
+
addr = f"{xw.utils.col_name(left_col + c0)}{top_row + r0}:{xw.utils.col_name(left_col + c1)}{top_row + r1}"
|
|
866
|
+
if addr not in dedup:
|
|
867
|
+
dedup.add(addr)
|
|
868
|
+
tables.append(addr)
|
|
869
|
+
return tables
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def detect_tables_openpyxl(  # noqa: C901
    xlsx_path: Path, sheet_name: str
) -> list[str]:
    """Detect table ranges on one worksheet using openpyxl.

    Declared worksheet tables are collected first on a best-effort basis.
    Border clusters are then merged, shrunk to their content and filtered by
    the thresholds in ``_DETECTION_CONFIG`` before being added.

    Args:
        xlsx_path: Workbook path passed to ``load_workbook`` and
            ``load_border_maps_xlsx``.
        sheet_name: Name of the worksheet to inspect.

    Returns:
        A1-style range addresses: declared table refs first, followed by the
        border-derived ranges in detection order.

    Raises:
        Exception: Anything raised while loading the workbook, looking up the
            sheet or running the detection helpers propagates to the caller;
            the workbook is closed before it does.
    """
    wb = load_workbook(
        xlsx_path,
        data_only=True,
        read_only=False,
    )
    # Everything after the load runs under try/finally so the workbook is
    # closed even when a later step raises.
    try:
        ws = wb[sheet_name]
        tables: list[str] = []
        try:
            openpyxl_tables: list[object] = []
            if hasattr(ws, "tables") and ws.tables:
                if isinstance(ws.tables, dict):
                    openpyxl_tables = list(ws.tables.values())
                else:
                    openpyxl_tables = list(ws.tables)
            elif hasattr(ws, "_tables") and ws._tables:
                openpyxl_tables = list(ws._tables)
            for t in openpyxl_tables:
                addr = getattr(t, "ref", None)
                if addr:
                    tables.append(str(addr))
        except Exception:
            # Declared tables are optional input: keep whatever was collected
            # and continue with border detection, but leave a trace.
            logger.debug(
                "Could not read declared tables for sheet %r",
                sheet_name,
                exc_info=True,
            )

        (
            has_border,
            top_edge,
            bottom_edge,
            left_edge,
            right_edge,
            _max_row,
            _max_col,
        ) = load_border_maps_xlsx(xlsx_path, sheet_name)
        rects = detect_border_clusters(has_border, min_size=4)

        def overlaps_for_merge(
            a: tuple[int, int, int, int], b: tuple[int, int, int, int]
        ) -> bool:
            # Rectangles are (top_row, left_col, bottom_row, right_col).
            # A rectangle fully containing the other is kept separate.
            contains = (
                a[0] <= b[0] and a[1] <= b[1] and a[2] >= b[2] and a[3] >= b[3]
            ) or (b[0] <= a[0] and b[1] <= a[1] and b[2] >= a[2] and b[3] >= a[3])
            if contains:
                return False
            return not (a[1] > b[3] or a[3] < b[1] or a[0] > b[2] or a[2] < b[0])

        # Single pass: each rectangle is folded into the first already-kept
        # rectangle it partially overlaps, otherwise it is kept as-is.
        merged_rects: list[tuple[int, int, int, int]] = []
        for rect in sorted(rects):
            merged = False
            for i, ex in enumerate(merged_rects):
                if overlaps_for_merge(rect, ex):
                    merged_rects[i] = (
                        min(rect[0], ex[0]),
                        min(rect[1], ex[1]),
                        max(rect[2], ex[2]),
                        max(rect[3], ex[3]),
                    )
                    merged = True
                    break
            if not merged:
                merged_rects.append(rect)

        # Tracks only the addresses produced by border detection below.
        dedup: set[str] = set()
        for top_row, left_col, bottom_row, right_col in merged_rects:
            top_row, left_col, bottom_row, right_col = shrink_to_content_openpyxl(
                ws,
                top_row,
                left_col,
                bottom_row,
                right_col,
                require_inside_border=False,
                top_edge=top_edge,
                bottom_edge=bottom_edge,
                left_edge=left_edge,
                right_edge=right_edge,
                min_nonempty_ratio=0.0,
            )
            vals_block = _get_values_block(ws, top_row, left_col, bottom_row, right_col)
            vals_block = _normalize_matrix(vals_block)
            # A cell counts as non-empty unless it is None or whitespace-only.
            nonempty = sum(
                1
                for row in vals_block
                for v in row
                if not (v is None or str(v).strip() == "")
            )
            if nonempty < _DETECTION_CONFIG["min_nonempty_cells"]:
                continue
            clusters = _nonempty_clusters(vals_block)
            for r0, c0, r1, c1 in clusters:
                sub = [row[c0 : c1 + 1] for row in vals_block[r0 : r1 + 1]]
                density, coverage = _table_density_metrics(sub)
                # Rejected only when BOTH metrics fall below their minimum.
                if (
                    density < _DETECTION_CONFIG["density_min"]
                    and coverage < _DETECTION_CONFIG["coverage_min"]
                ):
                    continue
                if not _is_plausible_table(sub):
                    continue
                score = _table_signal_score(sub)
                if score < _DETECTION_CONFIG["table_score_threshold"]:
                    continue
                # Cluster indices are offsets inside vals_block, so they are
                # added to the block's top-left sheet coordinates.
                addr = f"{get_column_letter(left_col + c0)}{top_row + r0}:{get_column_letter(left_col + c1)}{top_row + r1}"
                if addr not in dedup:
                    dedup.add(addr)
                    tables.append(addr)
        return tables
    finally:
        wb.close()
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
def detect_tables(sheet: xw.Sheet) -> list[str]:
    """Detect table ranges for *sheet*, choosing the backend by file extension.

    The workbook path is read from ``sheet.book.fullname``. ``.xlsx`` and
    ``.xlsm`` files go through ``detect_tables_openpyxl``; every other case
    (``.xls``, unreadable path, other extension, openpyxl import or parse
    failure) emits a one-time warning via ``warn_once`` and falls back to
    ``detect_tables_xlwings``.
    """
    try:
        workbook_path: Path | None = Path(sheet.book.fullname)
    except Exception:
        workbook_path = None

    # An unavailable path behaves like an unknown extension.
    extension = workbook_path.suffix.lower() if workbook_path else ""

    if extension == ".xls":
        warn_once(
            f"xls-fallback::{workbook_path}",
            f"File '{workbook_path.name}' is .xls (BIFF); openpyxl cannot read it. Falling back to COM-based detection (slower). Consider converting to .xlsx.",
        )
        return detect_tables_xlwings(sheet)

    if extension not in (".xlsx", ".xlsm"):
        warn_once(
            "unknown-ext-fallback",
            "Workbook path or extension is unavailable; falling back to COM-based detection (slower).",
        )
        return detect_tables_xlwings(sheet)

    # Probe for openpyxl before committing to the openpyxl-based path.
    try:
        import openpyxl  # noqa: F401
    except Exception:
        warn_once(
            "openpyxl-missing",
            "openpyxl is not installed. Falling back to COM-based detection (slower).",
        )
        return detect_tables_xlwings(sheet)

    try:
        return detect_tables_openpyxl(workbook_path, sheet.name)
    except Exception as exc:
        warn_once(
            f"openpyxl-parse-fallback::{workbook_path}::{sheet.name}",
            f"openpyxl failed to parse '{workbook_path.name}' (sheet '{sheet.name}'): {exc!r}. Falling back to COM-based detection (slower).",
        )
        return detect_tables_xlwings(sheet)
|
|
1014
|
+
|
|
1015
|
+
|
|
1016
|
+
_INT_RE = re.compile(r"^[+-]?\d+$")
|
|
1017
|
+
_FLOAT_RE = re.compile(r"^[+-]?\d*\.\d+$")
|
|
1018
|
+
|
|
1019
|
+
|
|
1020
|
+
def _coerce_numeric_preserve_format(val: str) -> int | float | str:
|
|
1021
|
+
"""
|
|
1022
|
+
Convert numeric-looking strings to int/float while keeping precision.
|
|
1023
|
+
Integers stay int; decimals keep scale via Decimal before casting to float.
|
|
1024
|
+
"""
|
|
1025
|
+
if _INT_RE.match(val):
|
|
1026
|
+
try:
|
|
1027
|
+
return int(val)
|
|
1028
|
+
except Exception:
|
|
1029
|
+
return val
|
|
1030
|
+
if _FLOAT_RE.match(val):
|
|
1031
|
+
try:
|
|
1032
|
+
dec = Decimal(val)
|
|
1033
|
+
exponent = int(dec.as_tuple().exponent)
|
|
1034
|
+
scale = max(1, -exponent)
|
|
1035
|
+
quantized = dec.quantize(Decimal("1." + "0" * scale))
|
|
1036
|
+
return float(quantized)
|
|
1037
|
+
except (InvalidOperation, Exception):
|
|
1038
|
+
return val
|
|
1039
|
+
return val
|