excel-region-extractor 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. excel_region_extractor-0.1.0/PKG-INFO +7 -0
  2. excel_region_extractor-0.1.0/README.md +170 -0
  3. excel_region_extractor-0.1.0/pyproject.toml +20 -0
  4. excel_region_extractor-0.1.0/setup.cfg +4 -0
  5. excel_region_extractor-0.1.0/src/excel_info_region/__init__.py +12 -0
  6. excel_region_extractor-0.1.0/src/excel_info_region/borders.py +362 -0
  7. excel_region_extractor-0.1.0/src/excel_info_region/cells.py +39 -0
  8. excel_region_extractor-0.1.0/src/excel_info_region/cli.py +32 -0
  9. excel_region_extractor-0.1.0/src/excel_info_region/components.py +132 -0
  10. excel_region_extractor-0.1.0/src/excel_info_region/config.py +12 -0
  11. excel_region_extractor-0.1.0/src/excel_info_region/extractor.py +144 -0
  12. excel_region_extractor-0.1.0/src/excel_info_region/image_export.py +94 -0
  13. excel_region_extractor-0.1.0/src/excel_info_region/image_regions.py +53 -0
  14. excel_region_extractor-0.1.0/src/excel_info_region/io.py +23 -0
  15. excel_region_extractor-0.1.0/src/excel_info_region/raw_drawing.py +385 -0
  16. excel_region_extractor-0.1.0/src/excel_info_region/runner.py +112 -0
  17. excel_region_extractor-0.1.0/src/excel_info_region/schema.py +76 -0
  18. excel_region_extractor-0.1.0/src/excel_info_region/visualize.py +620 -0
  19. excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/PKG-INFO +7 -0
  20. excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/SOURCES.txt +24 -0
  21. excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/dependency_links.txt +1 -0
  22. excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/entry_points.txt +3 -0
  23. excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/requires.txt +2 -0
  24. excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/top_level.txt +1 -0
  25. excel_region_extractor-0.1.0/tests/test_extractor_helpers.py +43 -0
  26. excel_region_extractor-0.1.0/tests/test_smoke.py +20 -0
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.4
2
+ Name: excel-region-extractor
3
+ Version: 0.1.0
4
+ Summary: Extract information-region bounding boxes from Excel sheets without semantic labels
5
+ Requires-Python: >=3.10
6
+ Requires-Dist: openpyxl>=3.1.0
7
+ Requires-Dist: Pillow>=10.0.0
@@ -0,0 +1,170 @@
1
+ # Excel Region Extractor
2
+
3
+ Extract Excel information-region ranges from workbook sheets.
4
+
5
+ The tool uses cell values, merged cells, borders, and embedded image anchors to write JSON outputs, optional overlay PNGs, and extracted embedded image files.
6
+
7
+ ## Install
8
+
9
+ From GitHub:
10
+
11
+ ```powershell
12
+ pip install git+https://github.com/LampSeeker/ExcelRegionExtractor.git
13
+ ```
14
+
15
+ For local development:
16
+
17
+ ```powershell
18
+ pip install -e .
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ Run all sheets:
24
+
25
+ ```powershell
26
+ excel-regions --workbook examples/synthetic_demo.xlsx --out outputs/all_sheets
27
+ ```
28
+
29
+ Run one sheet:
30
+
31
+ ```powershell
32
+ excel-regions --workbook examples/synthetic_demo.xlsx --sheet "Synthetic Demo" --out outputs/demo
33
+ ```
34
+
35
+ Write JSON and extracted embedded images without overlay PNG:
36
+
37
+ ```powershell
38
+ excel-regions --workbook examples/synthetic_demo.xlsx --out outputs/all_sheets --no-images
39
+ ```
40
+
41
+ Python API:
42
+
43
+ ```python
44
+ from excel_info_region import extract_workbook_info_regions
45
+ from excel_info_region.config import load_config
46
+
47
+ config = load_config("config/default.json")
48
+ result = extract_workbook_info_regions("examples/synthetic_demo.xlsx", config=config)
49
+ ```
50
+
51
+ ## Output
52
+
53
+ ```text
54
+ outputs/all_sheets/
55
+ info_regions_full.json
56
+ info_regions_summary.json
57
+
58
+ Synthetic Demo/
59
+ info_regions.json
60
+ info_regions.png
61
+ images/
62
+ IMG001_G4_I9_Image_1.png
63
+ ```
64
+
65
+ Sheet JSON:
66
+
67
+ ```json
68
+ {
69
+ "sheet_name": "Synthetic Demo",
70
+ "regions": [
71
+ "A1:H1",
72
+ "A3:D6",
73
+ "G4:I9",
74
+ "A9:E12"
75
+ ],
76
+ "images": [
77
+ {
78
+ "name": "Image 1",
79
+ "range_ref": "G4:I9",
80
+ "path": "images/IMG001_G4_I9_Image_1.png"
81
+ }
82
+ ]
83
+ }
84
+ ```
85
+
86
+ `regions` is the list of detected Excel ranges. `images` records extracted embedded image metadata and relative file paths.
87
+
88
+ Example overlay:
89
+
90
+ ![Synthetic Excel region overlay](docs/images/synthetic_demo_regions.png)
91
+
92
+ ## Processing Flow
93
+
94
+ ```text
95
+ Excel workbook
96
+ -> collect non-empty cells
97
+ -> expand non-empty merged cells to their full merged ranges
98
+ -> find occupied-cell connected components
99
+ -> expand ranges with border/table shells
100
+ -> merge adjacent regions by border contact
101
+ -> keep embedded image regions separate
102
+ -> write sheet JSON, workbook summary JSON, and optional overlay PNG
103
+ ```
104
+
105
+ Images are intentionally kept separate from cell connected components. This avoids over-merging drawings with nearby tables.
106
+
107
+ ## Configuration
108
+
109
+ Default config:
110
+
111
+ ```text
112
+ config/default.json
113
+ ```
114
+
115
+ Common options:
116
+
117
+ ```json
118
+ {
119
+ "include_values": true,
120
+ "include_merged_cells": true,
121
+ "include_images": true,
122
+ "include_grouped_drawing_images": true,
123
+ "use_borders": true,
124
+ "strong_borders_only": true,
125
+ "use_border_contact_merge": true,
126
+ "extract_embedded_images": true,
127
+ "embedded_image_dir": "images"
128
+ }
129
+ ```
130
+
131
+ Set a font path if Korean text is broken in overlay PNGs:
132
+
133
+ ```json
134
+ {
135
+ "visualization": {
136
+ "font_path": "C:/Windows/Fonts/malgun.ttf"
137
+ }
138
+ }
139
+ ```
140
+
141
+ `--no-images` skips overlay PNG generation. Embedded image extraction still runs when `extract_embedded_images` is `true`.
142
+
143
+ ## Project Structure
144
+
145
+ ```text
146
+ src/excel_info_region/
147
+ cli.py console entrypoint
148
+ runner.py writes JSON, overlay PNG, extracted images
149
+ extractor.py workbook/sheet orchestration
150
+ cells.py cell and merged-cell occupied logic
151
+ borders.py border expansion and border-contact merge
152
+ components.py connected components and bbox helpers
153
+ image_regions.py image anchors to region boxes
154
+ image_export.py embedded image extraction
155
+ raw_drawing.py raw xlsx DrawingML parsing
156
+ visualize.py overlay PNG renderer
157
+ ```
158
+
159
+ ## Development
160
+
161
+ ```powershell
162
+ pytest
163
+ excel-regions --workbook examples/synthetic_demo.xlsx --out outputs/all_sheets --no-images
164
+ ```
165
+
166
+ Run without `--no-images` when changing visualization or image extraction.
167
+
168
+ ## Notes
169
+
170
+ `openpyxl` does not calculate formulas. Overlay rendering uses `data_only=True`, so formula cells need cached values saved by Excel to show calculated results.
@@ -0,0 +1,20 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "excel-region-extractor"
7
+ version = "0.1.0"
8
+ description = "Extract information-region bounding boxes from Excel sheets without semantic labels"
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "openpyxl>=3.1.0",
12
+ "Pillow>=10.0.0"
13
+ ]
14
+
15
+ [project.scripts]
16
+ excel-regions = "excel_info_region.cli:main"
17
+ excel-info-regions = "excel_info_region.cli:main"
18
+
19
+ [tool.setuptools.packages.find]
20
+ where = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,12 @@
1
+ from .extractor import extract_info_regions_from_sheet, extract_workbook_info_regions
2
+ from .runner import run_and_write
3
+ from .schema import Box
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ __all__ = [
8
+ "Box",
9
+ "extract_info_regions_from_sheet",
10
+ "extract_workbook_info_regions",
11
+ "run_and_write",
12
+ ]
@@ -0,0 +1,362 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import deque
4
+ from typing import Any
5
+
6
+ from openpyxl.worksheet.worksheet import Worksheet
7
+
8
+ from .components import (
9
+ dedupe_boxes,
10
+ intersection_area,
11
+ overlap_ratio_on_axis,
12
+ union_find_groups,
13
+ )
14
+ from .schema import Box
15
+
16
+
17
def _side_has_style(side) -> bool:
    """Return True when *side* is not None and exposes a non-None ``style``.

    Objects without a ``style`` attribute are treated as unstyled.
    """
    if side is None:
        return False
    style = getattr(side, "style", None)
    return style is not None
19
+
20
+
21
def cell_has_border(cell, *, strong_only: bool = False) -> bool:
    """Report whether any of the four sides of ``cell.border`` is styled.

    Args:
        cell: Object exposing ``border`` with ``left``, ``right``, ``top`` and
            ``bottom`` attributes.
        strong_only: When true, only the heavier styles listed below count;
            otherwise any non-None style counts.

    Returns:
        True if at least one side qualifies, else False.
    """
    border = cell.border
    # All four sides are read up front, in the same order as before.
    candidates = (border.left, border.right, border.top, border.bottom)

    if strong_only:
        heavy_styles = {
            "medium",
            "thick",
            "double",
            "mediumDashed",
            "mediumDashDot",
            "mediumDashDotDot",
            "slantDashDot",
        }
        for side in candidates:
            # A missing side yields style None, which is never in the set.
            if getattr(side, "style", None) in heavy_styles:
                return True
        return False

    for side in candidates:
        if _side_has_style(side):
            return True
    return False
38
+
39
+
40
def collect_border_occupied(ws: Worksheet, bounds: Box | None, config: dict[str, Any]) -> set[tuple[int, int]]:
    """Collect ``(row, col)`` positions inside *bounds* that carry a border.

    Args:
        ws: Worksheet whose stored cells and merged ranges are inspected.
        bounds: Box limiting the search; ``None`` yields an empty set.
        config: Reads ``use_borders`` (default True), ``strong_borders_only``
            (default False) and ``include_merged_cells`` (default True).

    Returns:
        The set of bordered positions, all of them inside *bounds*.
    """
    if bounds is None:
        return set()
    if not config.get("use_borders", True):
        return set()

    strong_only = bool(config.get("strong_borders_only", False))

    occupied: set[tuple[int, int]] = {
        (row, col)
        for (row, col), cell in ws._cells.items()
        if bounds.contains(row, col) and cell_has_border(cell, strong_only=strong_only)
    }

    if not config.get("include_merged_cells", True):
        return occupied

    # Merged cells often store border information only on edge cells, so a
    # merged range counts as bordered when any cell on its perimeter does.
    for rng in ws.merged_cells.ranges:
        merged_box = Box(rng.min_row, rng.min_col, rng.max_row, rng.max_col)
        if not merged_box.intersects(bounds):
            continue

        perimeter = []
        for col in range(rng.min_col, rng.max_col + 1):
            perimeter.append(ws.cell(rng.min_row, col))
            perimeter.append(ws.cell(rng.max_row, col))
        for row in range(rng.min_row, rng.max_row + 1):
            perimeter.append(ws.cell(row, rng.min_col))
            perimeter.append(ws.cell(row, rng.max_col))

        if not any(cell_has_border(cell, strong_only=strong_only) for cell in perimeter):
            continue

        # Mark the whole merged range, clipped to the requested bounds.
        row_span = range(max(bounds.min_row, rng.min_row), min(bounds.max_row, rng.max_row) + 1)
        col_span = range(max(bounds.min_col, rng.min_col), min(bounds.max_col, rng.max_col) + 1)
        occupied.update((row, col) for row in row_span for col in col_span)

    return occupied
72
+
73
+
74
def should_expand_to_border_shell(value_box: Box, border_box: Box, config: dict[str, Any]) -> bool:
    """Decide whether *value_box* may be grown to include *border_box*.

    Each check can veto the expansion; they run in this order:

    1. the boxes must intersect;
    2. the intersection must cover at least
       ``border_expand_min_value_overlap`` (default 0.80) of the value box;
    3. the border box area must not exceed the value box area times
       ``border_expand_max_area_ratio`` (default 3.0);
    4. the border box may stick out of the value box by at most
       ``border_expand_max_extra_rows`` / ``border_expand_max_extra_cols``
       (default 3 each) on any side;
    5. the intersection must cover at least
       ``border_expand_min_border_overlap`` (default 0.10) of the border box.
    """
    shared = intersection_area(value_box, border_box)
    if shared <= 0:
        return False

    value_cover = shared / max(1, value_box.area)
    border_cover = shared / max(1, border_box.area)

    if value_cover < float(config.get("border_expand_min_value_overlap", 0.80)):
        return False

    # Border expansion is bbox correction, not section grouping.
    area_cap = value_box.area * float(config.get("border_expand_max_area_ratio", 3.0))
    if border_box.area > area_cap:
        return False

    row_limit = int(config.get("border_expand_max_extra_rows", 3))
    col_limit = int(config.get("border_expand_max_extra_cols", 3))
    # Largest overhang of the border box beyond the value box, per axis.
    row_overhang = max(
        0,
        value_box.min_row - border_box.min_row,
        border_box.max_row - value_box.max_row,
    )
    col_overhang = max(
        0,
        value_box.min_col - border_box.min_col,
        border_box.max_col - value_box.max_col,
    )
    if row_overhang > row_limit:
        return False
    if col_overhang > col_limit:
        return False

    # Avoid over-expanding a tiny value region into a large outlined region.
    return border_cover >= float(config.get("border_expand_min_border_overlap", 0.10))
104
+
105
+
106
def expand_cell_boxes_with_borders(
    cell_boxes: list[Box],
    border_boxes: list[Box],
    config: dict[str, Any],
) -> list[Box]:
    """Grow value boxes to the border boxes that enclose them.

    Each value box is unioned, in a single pass over *border_boxes*, with
    every border box accepted by ``should_expand_to_border_shell``; the check
    is made against the box as grown so far. When ``add_border_only_regions``
    is set, border boxes that contain or intersect no value box are appended
    too.

    Returns:
        *cell_boxes* itself when ``use_borders`` is off or there are no border
        boxes; otherwise the result of ``dedupe_boxes`` on the grown boxes.
    """
    if not config.get("use_borders", True):
        return cell_boxes
    if not border_boxes:
        return cell_boxes

    grown: list[Box] = []
    for seed in cell_boxes:
        box = seed
        for shell in border_boxes:
            if should_expand_to_border_shell(box, shell, config):
                box = box.union(shell)
        grown.append(box)

    if config.get("add_border_only_regions", False):
        for shell in border_boxes:
            for value_box in cell_boxes:
                if shell.contains_box(value_box) or intersection_area(shell, value_box) > 0:
                    break
            else:
                # No value box touches this border box: keep it as a region.
                grown.append(shell)

    return dedupe_boxes(grown)
132
+
133
+
134
# One border segment on the cell grid, one cell long. A horizontal segment is
# ("h", row_line, col) and a vertical segment is ("v", row, col_line).
BorderEdge = tuple[str, int, int]  # ("h", row_line, col) or ("v", row, col_line)
135
+
136
+
137
def cell_has_border_side(side, *, strong_only: bool = False) -> bool:
    """Report whether a single border side counts as drawn.

    Args:
        side: Border side object, or ``None``.
        strong_only: When true, the side's style must be one of the heavier
            styles listed below; otherwise any non-None style is enough.
    """
    if not _side_has_style(side):
        return False

    heavy_styles = {
        "medium",
        "thick",
        "double",
        "mediumDashed",
        "mediumDashDot",
        "mediumDashDotDot",
        "slantDashDot",
    }
    return (not strong_only) or getattr(side, "style", None) in heavy_styles
151
+
152
+
153
def collect_border_edges(ws: Worksheet, bounds: Box | None, config: dict[str, Any]) -> set[BorderEdge]:
    """Collect the border segments drawn on the sheet as ``BorderEdge`` tuples.

    A cell's top and bottom sides become ``("h", row, col)`` and
    ``("h", row + 1, col)``; its left and right sides become
    ``("v", row, col)`` and ``("v", row, col + 1)``.

    Args:
        ws: Worksheet whose stored cells and merged ranges are inspected.
        bounds: Box limiting which stored cells are read; ``None`` yields an
            empty set.
        config: Reads ``use_border_contact_merge`` (default False),
            ``border_contact_strong_only`` (default False) and
            ``include_merged_cells`` (default True).

    Returns:
        The set of collected edges.
    """
    if bounds is None:
        return set()
    if not config.get("use_border_contact_merge", False):
        return set()

    strong_only = bool(config.get("border_contact_strong_only", False))

    def qualifies(side) -> bool:
        # Any styled side counts, or only a heavy one when strong_only is set.
        return cell_has_border_side(side, strong_only=strong_only)

    edges: set[BorderEdge] = set()

    for (row, col), cell in ws._cells.items():
        if not bounds.contains(row, col):
            continue
        border = cell.border
        if qualifies(border.top):
            edges.add(("h", row, col))
        if qualifies(border.bottom):
            edges.add(("h", row + 1, col))
        if qualifies(border.left):
            edges.add(("v", row, col))
        if qualifies(border.right):
            edges.add(("v", row, col + 1))

    if not config.get("include_merged_cells", True):
        return edges

    for rng in ws.merged_cells.ranges:
        if not Box(rng.min_row, rng.min_col, rng.max_row, rng.max_col).intersects(bounds):
            continue

        # Only the outward-facing sides of the range's perimeter cells are
        # read here; the edges are not clipped to bounds.
        for col in range(rng.min_col, rng.max_col + 1):
            top_cell = ws.cell(rng.min_row, col)
            bottom_cell = ws.cell(rng.max_row, col)
            if qualifies(top_cell.border.top):
                edges.add(("h", rng.min_row, col))
            if qualifies(bottom_cell.border.bottom):
                edges.add(("h", rng.max_row + 1, col))

        for row in range(rng.min_row, rng.max_row + 1):
            left_cell = ws.cell(row, rng.min_col)
            right_cell = ws.cell(row, rng.max_col)
            if qualifies(left_cell.border.left):
                edges.add(("v", row, rng.min_col))
            if qualifies(right_cell.border.right):
                edges.add(("v", row, rng.max_col + 1))

    return edges
198
+
199
+
200
def edge_endpoints(edge: BorderEdge) -> tuple[tuple[int, int], tuple[int, int]]:
    """Return the two grid points joined by *edge*.

    Both points are built from the edge's two integers. An ``"h"`` edge ends
    one step further along the second integer; any other kind ends one step
    further along the first.
    """
    kind, first, second = edge
    start = (first, second)
    end = (first, second + 1) if kind == "h" else (first + 1, second)
    return start, end
205
+
206
+
207
def border_edge_components(edges: set[BorderEdge]) -> dict[BorderEdge, int]:
    """Label every edge with the id of its connected component.

    Two edges are connected when they share an endpoint. Component ids start
    at 1 and are handed out in sorted order of each component's smallest edge.

    Returns:
        Mapping from edge to component id; empty when *edges* is empty.
    """
    # Index: grid point -> edges that start or end there.
    incident: dict[tuple[int, int], list[BorderEdge]] = {}
    for edge in edges:
        for point in edge_endpoints(edge):
            incident.setdefault(point, []).append(edge)

    # The label map doubles as the visited set for the search below.
    labels: dict[BorderEdge, int] = {}
    next_id = 0

    for seed in sorted(edges):
        if seed in labels:
            continue
        next_id += 1
        labels[seed] = next_id
        frontier = deque((seed,))

        # Breadth-first search across shared endpoints.
        while frontier:
            current = frontier.popleft()
            for point in edge_endpoints(current):
                for neighbor in incident.get(point, ()):
                    if neighbor in labels:
                        continue
                    labels[neighbor] = next_id
                    frontier.append(neighbor)

    return labels
236
+
237
+
238
def perimeter_edges_by_side(box: Box, *, tolerance: int = 0) -> dict[str, set[BorderEdge]]:
    """List the border edges along each side of *box*.

    The box is first widened by *tolerance* cells in every direction, with
    its minimum row and column clamped to 1; the edges returned are those of
    the widened box.

    Returns:
        Dict with keys ``"top"``, ``"bottom"``, ``"left"``, ``"right"`` (in
        that order), each mapping to a set of ``BorderEdge`` tuples.
    """
    top = max(1, box.min_row - tolerance)
    left = max(1, box.min_col - tolerance)
    bottom = box.max_row + tolerance
    right = box.max_col + tolerance

    cols = range(left, right + 1)
    rows = range(top, bottom + 1)

    sides: dict[str, set[BorderEdge]] = {}
    sides["top"] = {("h", top, col) for col in cols}
    sides["bottom"] = {("h", bottom + 1, col) for col in cols}
    sides["left"] = {("v", row, left) for row in rows}
    sides["right"] = {("v", row, right + 1) for row in rows}
    return sides
250
+
251
+
252
def touched_border_component_sides(
    box: Box,
    edge_to_component: dict[BorderEdge, int],
    config: dict[str, Any],
) -> dict[int, set[str]]:
    """Find which sides of *box* lie on each border component.

    For every side of the box (widened by ``border_contact_tolerance_cells``,
    default 0) the perimeter edges present in *edge_to_component* are counted
    per component. A side counts as touched when that count reaches
    ``border_contact_min_edges_per_side``, which falls back to
    ``border_contact_min_edges`` and then to 1.

    Returns:
        Mapping from component id to the set of touched side names;
        components with no touched side are left out.
    """
    tolerance = int(config.get("border_contact_tolerance_cells", 0))
    fallback_threshold = config.get("border_contact_min_edges", 1)
    threshold = int(config.get("border_contact_min_edges_per_side", fallback_threshold))

    # component id -> side name -> number of perimeter edges on that component
    tallies: dict[int, dict[str, int]] = {}
    for side_name, side_edges in perimeter_edges_by_side(box, tolerance=tolerance).items():
        for edge in side_edges:
            component_id = edge_to_component.get(edge)
            if component_id is None:
                continue
            per_side = tallies.setdefault(component_id, {})
            per_side[side_name] = per_side.get(side_name, 0) + 1

    touched: dict[int, set[str]] = {}
    for component_id, per_side in tallies.items():
        qualifying = {name for name, hits in per_side.items() if hits >= threshold}
        if qualifying:
            touched[component_id] = qualifying
    return touched
286
+
287
+
288
def boxes_are_contact_merge_neighbors(a: Box, b: Box, config: dict[str, Any]) -> bool:
    """Decide whether two boxes are close enough to be merged.

    The first separation found decides the result, checked in this order:
    *a* above *b*, *b* above *a*, *a* left of *b*, *b* left of *a*. The
    number of cells between the boxes must be at most
    ``border_contact_merge_max_gap`` (default 1), and their overlap ratio on
    the other axis must be at least ``border_contact_merge_min_axis_overlap``
    (default 0.80). Boxes separated on neither axis are always neighbours.
    """
    max_gap = int(config.get("border_contact_merge_max_gap", 1))
    min_axis_overlap = float(config.get("border_contact_merge_min_axis_overlap", 0.80))

    # Pick the separating gap and the axis on which overlap must be measured.
    if a.max_row < b.min_row:
        gap, overlap_axis = b.min_row - a.max_row - 1, "col"
    elif b.max_row < a.min_row:
        gap, overlap_axis = a.min_row - b.max_row - 1, "col"
    elif a.max_col < b.min_col:
        gap, overlap_axis = b.min_col - a.max_col - 1, "row"
    elif b.max_col < a.min_col:
        gap, overlap_axis = a.min_col - b.max_col - 1, "row"
    else:
        return True

    if gap > max_gap:
        return False
    return overlap_ratio_on_axis(a, b, axis=overlap_axis) >= min_axis_overlap
307
+
308
+
309
def merge_boxes_by_border_contact(
    cell_boxes: list[Box],
    ws: Worksheet,
    bounds: Box | None,
    config: dict[str, Any],
) -> list[Box]:
    """Merge boxes that sit on the same connected border structure.

    Steps:
      1. Collect border edges and label their connected components.
      2. For each box, note the components it touches on at least
         ``border_contact_min_touched_sides`` sides (default 2).
      3. Boxes sharing a component become merge candidates when
         ``boxes_are_contact_merge_neighbors`` accepts the pair.
      4. Candidate pairs are grouped with ``union_find_groups``. A group is
         replaced by its union unless the union's area exceeds the summed
         member areas times ``border_contact_merge_max_area_ratio``
         (default 2.5), in which case the members are kept separate.

    Returns:
        *cell_boxes* itself when merging is disabled, fewer than two boxes
        are given, no edges are found or no pair qualifies; otherwise the
        result of ``dedupe_boxes`` on the merged boxes.
    """
    if not config.get("use_border_contact_merge", False):
        return cell_boxes
    if len(cell_boxes) < 2:
        return cell_boxes

    edges = collect_border_edges(ws, bounds, config)
    if not edges:
        return cell_boxes

    edge_to_component = border_edge_components(edges)
    min_touched_sides = int(config.get("border_contact_min_touched_sides", 2))

    # component id -> indices of the boxes touching it on enough sides
    boxes_on_component: dict[int, list[int]] = {}
    for idx, box in enumerate(cell_boxes):
        touched = touched_border_component_sides(box, edge_to_component, config)
        for component_id, sides in touched.items():
            if len(sides) >= min_touched_sides:
                boxes_on_component.setdefault(component_id, []).append(idx)

    candidate_pairs: set[tuple[int, int]] = set()
    for indices in boxes_on_component.values():
        ordered = sorted(set(indices))
        # Every unordered pair once, as (low, high); none for a single index.
        for pos, low in enumerate(ordered):
            for high in ordered[pos + 1:]:
                if boxes_are_contact_merge_neighbors(cell_boxes[low], cell_boxes[high], config):
                    candidate_pairs.add((low, high))

    if not candidate_pairs:
        return cell_boxes

    output: list[Box] = []
    groups = union_find_groups(list(range(len(cell_boxes))), sorted(candidate_pairs))
    for group in groups:
        if len(group) == 1:
            output.append(cell_boxes[group[0]])
            continue

        parts = [cell_boxes[idx] for idx in group]
        merged = parts[0]
        for part in parts[1:]:
            merged = merged.union(part)

        # Reject merges whose bounding box is mostly empty space.
        area_limit = sum(part.area for part in parts) * float(
            config.get("border_contact_merge_max_area_ratio", 2.5)
        )
        if merged.area > area_limit:
            output.extend(parts)
        else:
            output.append(merged)

    return dedupe_boxes(output)
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ from openpyxl.worksheet.worksheet import Worksheet
6
+
7
+ from .schema import Box
8
+
9
+
10
def is_non_empty(value: Any) -> bool:
    """Return True unless *value* is ``None`` or compares equal to ``""``.

    Whitespace-only strings, ``0`` and ``False`` all count as non-empty.
    """
    if value is None:
        return False
    return value != ""
12
+
13
+
14
def merged_boxes_with_values(ws: Worksheet) -> list[Box]:
    """Return a ``Box`` for every merged range whose top-left cell has a value.

    Only the cell at ``(min_row, min_col)`` of each range is checked, using
    ``is_non_empty``.
    """
    return [
        Box(rng.min_row, rng.min_col, rng.max_row, rng.max_col)
        for rng in ws.merged_cells.ranges
        if is_non_empty(ws.cell(rng.min_row, rng.min_col).value)
    ]
21
+
22
+
23
def collect_cell_occupied(ws: Worksheet, bounds: Box | None, config: dict[str, Any]) -> set[tuple[int, int]]:
    """Collect ``(row, col)`` positions inside *bounds* that hold content.

    Args:
        ws: Worksheet whose stored cells and merged ranges are inspected.
        bounds: Box limiting the search; ``None`` yields an empty set.
        config: Reads ``include_values`` (default True) to count non-empty
            cells and ``include_merged_cells`` (default True) to count every
            cell of a merged range whose top-left cell is non-empty.

    Returns:
        The set of occupied positions, all of them inside *bounds*.
    """
    if bounds is None:
        return set()

    occupied: set[tuple[int, int]] = set()

    if config.get("include_values", True):
        occupied.update(
            (row, col)
            for (row, col), cell in ws._cells.items()
            if bounds.contains(row, col) and is_non_empty(cell.value)
        )

    if config.get("include_merged_cells", True):
        for box in merged_boxes_with_values(ws):
            # Clip the merged range to the requested bounds.
            row_span = range(max(bounds.min_row, box.min_row), min(bounds.max_row, box.max_row) + 1)
            col_span = range(max(bounds.min_col, box.min_col), min(bounds.max_col, box.max_col) + 1)
            occupied.update((row, col) for row in row_span for col in col_span)

    return occupied
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from .runner import run_and_write
7
+
8
+
9
# Directory two levels above the one containing this file; with this module
# at src/excel_info_region/cli.py that is the directory holding ``src``.
# The CLI defaults below are resolved relative to it.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
10
+
11
+
12
def main() -> None:
    """Console entry point: parse options, run the extraction, print a summary.

    The parsed options are passed to ``run_and_write``. Afterwards one header
    line is printed, followed by one line per sheet with its region count,
    region list and image count.
    """
    parser = argparse.ArgumentParser(description="Extract Excel information regions only")
    # NOTE(review): every path default is anchored at PROJECT_ROOT, which is
    # derived from this file's location; that presumably only matches a
    # source checkout. Confirm the behaviour for an installed package.
    parser.add_argument("--workbook", default=str(PROJECT_ROOT / "examples" / "sample.xlsx"))
    parser.add_argument("--sheet", default=None)
    parser.add_argument("--config", default=str(PROJECT_ROOT / "config" / "default.json"))
    parser.add_argument("--out", default=str(PROJECT_ROOT / "outputs" / "info_regions"))
    parser.add_argument("--no-images", action="store_true", help="Skip PNG overlay generation")
    options = parser.parse_args()

    result = run_and_write(
        options.workbook,
        out_dir=options.out,
        sheet_name=options.sheet,
        config_path=options.config,
        write_images=not options.no_images,
    )

    sheets = result["sheets"]
    print(f"[extract_info_regions] sheets={len(sheets)} -> {options.out}")
    for sheet, data in sheets.items():
        # Use "regions" when present, else "info_regions", else an empty list.
        regions = data.get("regions", data.get("info_regions", []))
        image_count = len(data.get("images", []))
        print(f" - {sheet}: regions={len(regions)}, ranges={regions}, images={image_count}")