excel-region-extractor 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- excel_region_extractor-0.1.0/PKG-INFO +7 -0
- excel_region_extractor-0.1.0/README.md +170 -0
- excel_region_extractor-0.1.0/pyproject.toml +20 -0
- excel_region_extractor-0.1.0/setup.cfg +4 -0
- excel_region_extractor-0.1.0/src/excel_info_region/__init__.py +12 -0
- excel_region_extractor-0.1.0/src/excel_info_region/borders.py +362 -0
- excel_region_extractor-0.1.0/src/excel_info_region/cells.py +39 -0
- excel_region_extractor-0.1.0/src/excel_info_region/cli.py +32 -0
- excel_region_extractor-0.1.0/src/excel_info_region/components.py +132 -0
- excel_region_extractor-0.1.0/src/excel_info_region/config.py +12 -0
- excel_region_extractor-0.1.0/src/excel_info_region/extractor.py +144 -0
- excel_region_extractor-0.1.0/src/excel_info_region/image_export.py +94 -0
- excel_region_extractor-0.1.0/src/excel_info_region/image_regions.py +53 -0
- excel_region_extractor-0.1.0/src/excel_info_region/io.py +23 -0
- excel_region_extractor-0.1.0/src/excel_info_region/raw_drawing.py +385 -0
- excel_region_extractor-0.1.0/src/excel_info_region/runner.py +112 -0
- excel_region_extractor-0.1.0/src/excel_info_region/schema.py +76 -0
- excel_region_extractor-0.1.0/src/excel_info_region/visualize.py +620 -0
- excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/PKG-INFO +7 -0
- excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/SOURCES.txt +24 -0
- excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/dependency_links.txt +1 -0
- excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/entry_points.txt +3 -0
- excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/requires.txt +2 -0
- excel_region_extractor-0.1.0/src/excel_region_extractor.egg-info/top_level.txt +1 -0
- excel_region_extractor-0.1.0/tests/test_extractor_helpers.py +43 -0
- excel_region_extractor-0.1.0/tests/test_smoke.py +20 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# Excel Region Extractor
|
|
2
|
+
|
|
3
|
+
Extract Excel information-region ranges from workbook sheets.
|
|
4
|
+
|
|
5
|
+
The tool uses cell values, merged cells, borders, and embedded image anchors to write JSON outputs, optional overlay PNGs, and extracted embedded image files.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
From GitHub:
|
|
10
|
+
|
|
11
|
+
```powershell
|
|
12
|
+
pip install git+https://github.com/LampSeeker/ExcelRegionExtractor.git
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
For local development:
|
|
16
|
+
|
|
17
|
+
```powershell
|
|
18
|
+
pip install -e .
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
Run all sheets:
|
|
24
|
+
|
|
25
|
+
```powershell
|
|
26
|
+
excel-regions --workbook examples/synthetic_demo.xlsx --out outputs/all_sheets
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Run one sheet:
|
|
30
|
+
|
|
31
|
+
```powershell
|
|
32
|
+
excel-regions --workbook examples/synthetic_demo.xlsx --sheet "Synthetic Demo" --out outputs/demo
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Write JSON and extracted embedded images without the overlay PNG:
|
|
36
|
+
|
|
37
|
+
```powershell
|
|
38
|
+
excel-regions --workbook examples/synthetic_demo.xlsx --out outputs/all_sheets --no-images
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Python API:
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from excel_info_region import extract_workbook_info_regions
|
|
45
|
+
from excel_info_region.config import load_config
|
|
46
|
+
|
|
47
|
+
config = load_config("config/default.json")
|
|
48
|
+
result = extract_workbook_info_regions("examples/synthetic_demo.xlsx", config=config)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Output
|
|
52
|
+
|
|
53
|
+
```text
|
|
54
|
+
outputs/all_sheets/
|
|
55
|
+
info_regions_full.json
|
|
56
|
+
info_regions_summary.json
|
|
57
|
+
|
|
58
|
+
Synthetic Demo/
|
|
59
|
+
info_regions.json
|
|
60
|
+
info_regions.png
|
|
61
|
+
images/
|
|
62
|
+
IMG001_G4_I9_Image_1.png
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Sheet JSON:
|
|
66
|
+
|
|
67
|
+
```json
|
|
68
|
+
{
|
|
69
|
+
"sheet_name": "Synthetic Demo",
|
|
70
|
+
"regions": [
|
|
71
|
+
"A1:H1",
|
|
72
|
+
"A3:D6",
|
|
73
|
+
"G4:I9",
|
|
74
|
+
"A9:E12"
|
|
75
|
+
],
|
|
76
|
+
"images": [
|
|
77
|
+
{
|
|
78
|
+
"name": "Image 1",
|
|
79
|
+
"range_ref": "G4:I9",
|
|
80
|
+
"path": "images/IMG001_G4_I9_Image_1.png"
|
|
81
|
+
}
|
|
82
|
+
]
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
`regions` is the list of detected Excel ranges. `images` records extracted embedded image metadata and relative file paths.
|
|
87
|
+
|
|
88
|
+
Example overlay:
|
|
89
|
+
|
|
90
|
+

|
|
91
|
+
|
|
92
|
+
## Processing Flow
|
|
93
|
+
|
|
94
|
+
```text
|
|
95
|
+
Excel workbook
|
|
96
|
+
-> collect non-empty cells
|
|
97
|
+
-> expand non-empty merged cells to their full merged ranges
|
|
98
|
+
-> find occupied-cell connected components
|
|
99
|
+
-> expand ranges with border/table shells
|
|
100
|
+
-> merge adjacent regions by border contact
|
|
101
|
+
-> keep embedded image regions separate
|
|
102
|
+
-> write sheet JSON, workbook summary JSON, and optional overlay PNG
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Images are intentionally kept separate from cell connected components. This avoids over-merging drawings with nearby tables.
|
|
106
|
+
|
|
107
|
+
## Configuration
|
|
108
|
+
|
|
109
|
+
Default config:
|
|
110
|
+
|
|
111
|
+
```text
|
|
112
|
+
config/default.json
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Common options:
|
|
116
|
+
|
|
117
|
+
```json
|
|
118
|
+
{
|
|
119
|
+
"include_values": true,
|
|
120
|
+
"include_merged_cells": true,
|
|
121
|
+
"include_images": true,
|
|
122
|
+
"include_grouped_drawing_images": true,
|
|
123
|
+
"use_borders": true,
|
|
124
|
+
"strong_borders_only": true,
|
|
125
|
+
"use_border_contact_merge": true,
|
|
126
|
+
"extract_embedded_images": true,
|
|
127
|
+
"embedded_image_dir": "images"
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
Set a font path if Korean text is broken in overlay PNGs:
|
|
132
|
+
|
|
133
|
+
```json
|
|
134
|
+
{
|
|
135
|
+
"visualization": {
|
|
136
|
+
"font_path": "C:/Windows/Fonts/malgun.ttf"
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
`--no-images` skips overlay PNG generation. Embedded image extraction still runs when `extract_embedded_images` is `true`.
|
|
142
|
+
|
|
143
|
+
## Project Structure
|
|
144
|
+
|
|
145
|
+
```text
|
|
146
|
+
src/excel_info_region/
|
|
147
|
+
cli.py console entrypoint
|
|
148
|
+
runner.py writes JSON, overlay PNG, extracted images
|
|
149
|
+
extractor.py workbook/sheet orchestration
|
|
150
|
+
cells.py cell and merged-cell occupied logic
|
|
151
|
+
borders.py border expansion and border-contact merge
|
|
152
|
+
components.py connected components and bbox helpers
|
|
153
|
+
image_regions.py image anchors to region boxes
|
|
154
|
+
image_export.py embedded image extraction
|
|
155
|
+
raw_drawing.py raw xlsx DrawingML parsing
|
|
156
|
+
visualize.py overlay PNG renderer
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
## Development
|
|
160
|
+
|
|
161
|
+
```powershell
|
|
162
|
+
pytest
|
|
163
|
+
excel-regions --workbook examples/synthetic_demo.xlsx --out outputs/all_sheets --no-images
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
Run without `--no-images` when changing visualization or image extraction.
|
|
167
|
+
|
|
168
|
+
## Notes
|
|
169
|
+
|
|
170
|
+
`openpyxl` does not calculate formulas. Overlay rendering uses `data_only=True`, so formula cells need cached values saved by Excel to show calculated results.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "excel-region-extractor"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Extract information-region bounding boxes from Excel sheets without semantic labels"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"openpyxl>=3.1.0",
|
|
12
|
+
"Pillow>=10.0.0"
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
excel-regions = "excel_info_region.cli:main"
|
|
17
|
+
excel-info-regions = "excel_info_region.cli:main"
|
|
18
|
+
|
|
19
|
+
[tool.setuptools.packages.find]
|
|
20
|
+
where = ["src"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from .extractor import extract_info_regions_from_sheet, extract_workbook_info_regions
|
|
2
|
+
from .runner import run_and_write
|
|
3
|
+
from .schema import Box
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.0"
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Box",
|
|
9
|
+
"extract_info_regions_from_sheet",
|
|
10
|
+
"extract_workbook_info_regions",
|
|
11
|
+
"run_and_write",
|
|
12
|
+
]
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import deque
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
7
|
+
|
|
8
|
+
from .components import (
|
|
9
|
+
dedupe_boxes,
|
|
10
|
+
intersection_area,
|
|
11
|
+
overlap_ratio_on_axis,
|
|
12
|
+
union_find_groups,
|
|
13
|
+
)
|
|
14
|
+
from .schema import Box
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _side_has_style(side) -> bool:
    """Return True when *side* carries a border style.

    ``None``, objects without a ``style`` attribute, and objects whose
    ``style`` is ``None`` are all treated as unstyled.
    """
    # getattr with a default also handles side=None, so no separate guard is needed.
    return getattr(side, "style", None) is not None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def cell_has_border(cell, *, strong_only: bool = False) -> bool:
    """Report whether at least one of the four sides of *cell* has a border.

    Args:
        cell: Object exposing ``border`` with ``left``, ``right``, ``top``
            and ``bottom`` sides.
        strong_only: When true, only the heavier line styles listed below
            count; otherwise any non-``None`` style counts.

    Returns:
        True if any side qualifies, False otherwise.
    """
    border = cell.border
    side_styles = (
        getattr(side, "style", None)
        for side in (border.left, border.right, border.top, border.bottom)
    )

    if strong_only:
        heavy_styles = {
            "medium",
            "thick",
            "double",
            "mediumDashed",
            "mediumDashDot",
            "mediumDashDotDot",
            "slantDashDot",
        }
        return any(style in heavy_styles for style in side_styles)

    # Without the strong-only filter, any style at all is enough.
    return any(style is not None for style in side_styles)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def collect_border_occupied(ws: Worksheet, bounds: Box | None, config: dict[str, Any]) -> set[tuple[int, int]]:
    """Collect the ``(row, col)`` positions inside *bounds* that count as bordered.

    A position is included when its cell has a border, or when it lies in a
    merged range (clipped to *bounds*) whose perimeter has a bordered cell.

    Args:
        ws: Worksheet to scan.
        bounds: Region to restrict the scan to; ``None`` yields an empty set.
        config: Reads ``use_borders`` (default True), ``strong_borders_only``
            (default False) and ``include_merged_cells`` (default True).

    Returns:
        The set of bordered ``(row, col)`` positions.
    """
    if bounds is None or not config.get("use_borders", True):
        return set()

    strong_only = bool(config.get("strong_borders_only", False))

    occupied: set[tuple[int, int]] = {
        (row, col)
        for (row, col), cell in ws._cells.items()
        if bounds.contains(row, col) and cell_has_border(cell, strong_only=strong_only)
    }

    if not config.get("include_merged_cells", True):
        return occupied

    # Merged cells often store border information only on edge cells.
    for rng in ws.merged_cells.ranges:
        if not Box(rng.min_row, rng.min_col, rng.max_row, rng.max_col).intersects(bounds):
            continue

        perimeter = []
        for col in range(rng.min_col, rng.max_col + 1):
            perimeter += [ws.cell(rng.min_row, col), ws.cell(rng.max_row, col)]
        for row in range(rng.min_row, rng.max_row + 1):
            perimeter += [ws.cell(row, rng.min_col), ws.cell(row, rng.max_col)]

        if not any(cell_has_border(cell, strong_only=strong_only) for cell in perimeter):
            continue

        # Mark the whole merged range, clipped to the requested bounds.
        row_span = range(max(bounds.min_row, rng.min_row), min(bounds.max_row, rng.max_row) + 1)
        col_span = range(max(bounds.min_col, rng.min_col), min(bounds.max_col, rng.max_col) + 1)
        occupied.update((row, col) for row in row_span for col in col_span)

    return occupied
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def should_expand_to_border_shell(value_box: Box, border_box: Box, config: dict[str, Any]) -> bool:
    """Decide whether *value_box* may be grown to also cover *border_box*.

    All of the following must hold: the boxes intersect; the intersection
    covers enough of the value box; the border box is not too large relative
    to the value box; the border box does not stick out too many rows or
    columns on any side; and the intersection covers enough of the border box.

    Args:
        value_box: Box derived from occupied cells.
        border_box: Candidate bordered box.
        config: Reads the ``border_expand_*`` thresholds (defaults shown in
            the code below).

    Returns:
        True when every check passes.
    """
    shared = intersection_area(value_box, border_box)
    if shared <= 0:
        return False

    value_cover = shared / max(1, value_box.area)
    border_cover = shared / max(1, border_box.area)

    min_value_cover = float(config.get("border_expand_min_value_overlap", 0.80))
    if value_cover < min_value_cover:
        return False

    # Border expansion is bbox correction, not section grouping.
    area_cap = value_box.area * float(config.get("border_expand_max_area_ratio", 3.0))
    if border_box.area > area_cap:
        return False

    row_limit = int(config.get("border_expand_max_extra_rows", 3))
    col_limit = int(config.get("border_expand_max_extra_cols", 3))

    # How far the border box protrudes past the value box on each side.
    grow_up = max(0, value_box.min_row - border_box.min_row)
    grow_down = max(0, border_box.max_row - value_box.max_row)
    grow_left = max(0, value_box.min_col - border_box.min_col)
    grow_right = max(0, border_box.max_col - value_box.max_col)

    if grow_up > row_limit or grow_down > row_limit:
        return False
    if grow_left > col_limit or grow_right > col_limit:
        return False

    # Avoid over-expanding a tiny value region into a large outlined region.
    min_border_cover = float(config.get("border_expand_min_border_overlap", 0.10))
    return border_cover >= min_border_cover
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def expand_cell_boxes_with_borders(
    cell_boxes: list[Box],
    border_boxes: list[Box],
    config: dict[str, Any],
) -> list[Box]:
    """Grow each value box over the border boxes that qualify as its shell.

    Args:
        cell_boxes: Boxes derived from occupied cells.
        border_boxes: Boxes derived from bordered cells.
        config: Reads ``use_borders`` (default True) and
            ``add_border_only_regions`` (default False), and is forwarded to
            ``should_expand_to_border_shell``.

    Returns:
        ``cell_boxes`` unchanged when borders are disabled or there are no
        border boxes; otherwise the expanded boxes (plus, optionally, border
        boxes that touch no value box) passed through ``dedupe_boxes``.
    """
    if not (config.get("use_borders", True) and border_boxes):
        return cell_boxes

    grown: list[Box] = []
    for value_box in cell_boxes:
        shell = value_box
        # Each accepted union changes the box tested against the next border box.
        for border_box in border_boxes:
            if should_expand_to_border_shell(shell, border_box, config):
                shell = shell.union(border_box)
        grown.append(shell)

    if config.get("add_border_only_regions", False):

        def touches_value(border_box: Box) -> bool:
            return any(
                border_box.contains_box(value_box) or intersection_area(border_box, value_box) > 0
                for value_box in cell_boxes
            )

        grown.extend(border_box for border_box in border_boxes if not touches_value(border_box))

    return dedupe_boxes(grown)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
BorderEdge = tuple[str, int, int] # ("h", row_line, col) or ("v", row, col_line)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def cell_has_border_side(side, *, strong_only: bool = False) -> bool:
    """Report whether a single border *side* counts as drawn.

    Args:
        side: Border side object (or ``None``); its ``style`` attribute is read.
        strong_only: When true, only the heavier line styles listed below count.

    Returns:
        False for an unstyled side; otherwise True, or, with ``strong_only``,
        whether the style is one of the heavy styles.
    """
    style = getattr(side, "style", None)
    if style is None:
        return False

    return (not strong_only) or style in {
        "medium",
        "thick",
        "double",
        "mediumDashed",
        "mediumDashDot",
        "mediumDashDotDot",
        "slantDashDot",
    }
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def collect_border_edges(ws: Worksheet, bounds: Box | None, config: dict[str, Any]) -> set[BorderEdge]:
    """Collect unit border edges drawn on the sheet.

    Edges are ``("h", row_line, col)`` for horizontal segments and
    ``("v", row, col_line)`` for vertical ones. A cell's bottom/right border
    is recorded on line ``row + 1`` / ``col + 1``.

    Args:
        ws: Worksheet to scan.
        bounds: Region whose cells are scanned; ``None`` yields an empty set.
        config: Reads ``use_border_contact_merge`` (default False),
            ``border_contact_strong_only`` (default False) and
            ``include_merged_cells`` (default True).

    Returns:
        The set of border edges found.
    """
    if bounds is None or not config.get("use_border_contact_merge", False):
        return set()

    strong_only = bool(config.get("border_contact_strong_only", False))
    edges: set[BorderEdge] = set()

    def drawn(side) -> bool:
        # Styled side, additionally restricted to heavy styles when strong_only is set.
        return cell_has_border_side(side, strong_only=strong_only)

    for (row, col), cell in ws._cells.items():
        if not bounds.contains(row, col):
            continue
        border = cell.border
        if drawn(border.top):
            edges.add(("h", row, col))
        if drawn(border.bottom):
            edges.add(("h", row + 1, col))
        if drawn(border.left):
            edges.add(("v", row, col))
        if drawn(border.right):
            edges.add(("v", row, col + 1))

    if not config.get("include_merged_cells", True):
        return edges

    # For merged ranges touching the bounds, read only the outer sides of the perimeter cells.
    for rng in ws.merged_cells.ranges:
        if not Box(rng.min_row, rng.min_col, rng.max_row, rng.max_col).intersects(bounds):
            continue

        for col in range(rng.min_col, rng.max_col + 1):
            top_cell = ws.cell(rng.min_row, col)
            bottom_cell = ws.cell(rng.max_row, col)
            if drawn(top_cell.border.top):
                edges.add(("h", rng.min_row, col))
            if drawn(bottom_cell.border.bottom):
                edges.add(("h", rng.max_row + 1, col))

        for row in range(rng.min_row, rng.max_row + 1):
            left_cell = ws.cell(row, rng.min_col)
            right_cell = ws.cell(row, rng.max_col)
            if drawn(left_cell.border.left):
                edges.add(("v", row, rng.min_col))
            if drawn(right_cell.border.right):
                edges.add(("v", row, rng.max_col + 1))

    return edges
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def edge_endpoints(edge: BorderEdge) -> tuple[tuple[int, int], tuple[int, int]]:
    """Return the two end points of a unit border edge.

    The first point is the edge's own pair of coordinates. The second is one
    step further along the column for an ``"h"`` edge, and one step further
    along the row for any other kind.
    """
    kind, first, second = edge
    start = (first, second)
    end = (first, second + 1) if kind == "h" else (first + 1, second)
    return start, end
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def border_edge_components(edges: set[BorderEdge]) -> dict[BorderEdge, int]:
    """Label every edge with the id of the connected border line it belongs to.

    Two edges are connected when they share an end point. Component ids
    start at 1 and are handed out in sorted order of each component's
    smallest edge.

    Args:
        edges: Border edges to group.

    Returns:
        Mapping from each edge to its component id.
    """
    # Index edges by end point so neighbours can be looked up directly.
    edges_at_point: dict[tuple[int, int], list[BorderEdge]] = {}
    for edge in edges:
        first_point, second_point = edge_endpoints(edge)
        edges_at_point.setdefault(first_point, []).append(edge)
        edges_at_point.setdefault(second_point, []).append(edge)

    labels: dict[BorderEdge, int] = {}
    seen: set[BorderEdge] = set()
    next_id = 0

    for seed in sorted(edges):
        if seed in seen:
            continue

        next_id += 1
        seen.add(seed)
        labels[seed] = next_id
        frontier = deque([seed])

        # Breadth-first flood over edges that share an end point.
        while frontier:
            current = frontier.popleft()
            for point in edge_endpoints(current):
                for neighbour in edges_at_point.get(point, []):
                    if neighbour in seen:
                        continue
                    seen.add(neighbour)
                    labels[neighbour] = next_id
                    frontier.append(neighbour)

    return labels
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def perimeter_edges_by_side(box: Box, *, tolerance: int = 0) -> dict[str, set[BorderEdge]]:
    """List the unit edges on each side of *box*.

    Args:
        box: Box whose perimeter is enumerated.
        tolerance: Number of cells the box is widened by on every side before
            the perimeter is taken; the top/left are clamped at 1.

    Returns:
        Dict with keys ``"top"``, ``"bottom"``, ``"left"`` and ``"right"``,
        each holding that side's set of edges.
    """
    top = max(1, box.min_row - tolerance)
    left = max(1, box.min_col - tolerance)
    bottom = box.max_row + tolerance
    right = box.max_col + tolerance

    col_span = range(left, right + 1)
    row_span = range(top, bottom + 1)

    # Bottom and right edges sit on the line after the last row/column.
    return {
        "top": {("h", top, col) for col in col_span},
        "bottom": {("h", bottom + 1, col) for col in col_span},
        "left": {("v", row, left) for row in row_span},
        "right": {("v", row, right + 1) for row in row_span},
    }
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def touched_border_component_sides(
    box: Box,
    edge_to_component: dict[BorderEdge, int],
    config: dict[str, Any],
) -> dict[int, set[str]]:
    """Find which border components run along which sides of *box*.

    Args:
        box: Box whose perimeter is checked.
        edge_to_component: Mapping from border edge to component id.
        config: Reads ``border_contact_tolerance_cells`` (default 0) and
            ``border_contact_min_edges_per_side``, which falls back to
            ``border_contact_min_edges`` (default 1).

    Returns:
        Mapping from component id to the side names on which at least the
        minimum number of perimeter edges belong to that component.
        Components with no qualifying side are omitted.
    """
    tolerance = int(config.get("border_contact_tolerance_cells", 0))
    fallback_min_edges = config.get("border_contact_min_edges", 1)
    min_edges_per_side = int(config.get("border_contact_min_edges_per_side", fallback_min_edges))

    # component id -> side name -> number of perimeter edges on that side.
    hits: dict[int, dict[str, int]] = {}
    for side_name, side_edges in perimeter_edges_by_side(box, tolerance=tolerance).items():
        for edge in side_edges:
            component_id = edge_to_component.get(edge)
            if component_id is not None:
                per_side = hits.setdefault(component_id, {})
                per_side[side_name] = per_side.get(side_name, 0) + 1

    touched: dict[int, set[str]] = {}
    for component_id, per_side in hits.items():
        qualifying = {name for name, count in per_side.items() if count >= min_edges_per_side}
        if qualifying:
            touched[component_id] = qualifying
    return touched
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def boxes_are_contact_merge_neighbors(a: Box, b: Box, config: dict[str, Any]) -> bool:
    """Decide whether two boxes are close enough to be merged by border contact.

    If one box lies entirely above/below the other, the empty rows between
    them must not exceed the allowed gap and the ``"col"`` axis overlap ratio
    must reach the minimum. Otherwise, if one lies entirely left/right of the
    other, the same test is applied with the column gap and the ``"row"``
    axis. Boxes separated on neither axis are always neighbours.

    Args:
        a: First box.
        b: Second box.
        config: Reads ``border_contact_merge_max_gap`` (default 1) and
            ``border_contact_merge_min_axis_overlap`` (default 0.80).

    Returns:
        True when the boxes qualify as merge neighbours.
    """
    max_gap = int(config.get("border_contact_merge_max_gap", 1))
    min_axis_overlap = float(config.get("border_contact_merge_min_axis_overlap", 0.80))

    def close_and_aligned(gap: int, axis: str) -> bool:
        return gap <= max_gap and overlap_ratio_on_axis(a, b, axis=axis) >= min_axis_overlap

    # Row separation is checked first, so diagonal pairs are judged on the row gap.
    if a.max_row < b.min_row:
        return close_and_aligned(b.min_row - a.max_row - 1, "col")
    if b.max_row < a.min_row:
        return close_and_aligned(a.min_row - b.max_row - 1, "col")

    if a.max_col < b.min_col:
        return close_and_aligned(b.min_col - a.max_col - 1, "row")
    if b.max_col < a.min_col:
        return close_and_aligned(a.min_col - b.max_col - 1, "row")

    return True
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def merge_boxes_by_border_contact(
    cell_boxes: list[Box],
    ws: Worksheet,
    bounds: Box | None,
    config: dict[str, Any],
) -> list[Box]:
    """Merge boxes that touch the same connected border line and sit close together.

    Args:
        cell_boxes: Candidate boxes.
        ws: Worksheet the border edges are read from.
        bounds: Region passed to ``collect_border_edges``.
        config: Reads ``use_border_contact_merge`` (default False),
            ``border_contact_min_touched_sides`` (default 2) and
            ``border_contact_merge_max_area_ratio`` (default 2.5); also
            forwarded to the helper functions.

    Returns:
        ``cell_boxes`` unchanged when merging is disabled, fewer than two
        boxes are given, no border edges exist, or no pair qualifies.
        Otherwise the merged boxes passed through ``dedupe_boxes``.
    """
    if not config.get("use_border_contact_merge", False) or len(cell_boxes) < 2:
        return cell_boxes

    edges = collect_border_edges(ws, bounds, config)
    if not edges:
        return cell_boxes

    edge_to_component = border_edge_components(edges)
    min_touched_sides = int(config.get("border_contact_min_touched_sides", 2))

    # component id -> indices of boxes touching it on enough sides.
    boxes_on_component: dict[int, list[int]] = {}
    for box_idx, box in enumerate(cell_boxes):
        touched = touched_border_component_sides(box, edge_to_component, config)
        for component_id, sides in touched.items():
            if len(sides) >= min_touched_sides:
                boxes_on_component.setdefault(component_id, []).append(box_idx)

    candidate_pairs: set[tuple[int, int]] = set()
    for indices in boxes_on_component.values():
        ordered = sorted(set(indices))
        if len(ordered) < 2:
            continue
        for pos, a_idx in enumerate(ordered):
            for b_idx in ordered[pos + 1:]:
                if boxes_are_contact_merge_neighbors(cell_boxes[a_idx], cell_boxes[b_idx], config):
                    candidate_pairs.add((a_idx, b_idx))

    if not candidate_pairs:
        return cell_boxes

    output: list[Box] = []
    for group in union_find_groups(list(range(len(cell_boxes))), sorted(candidate_pairs)):
        members = [cell_boxes[idx] for idx in group]
        if len(members) == 1:
            output.append(members[0])
            continue

        merged = members[0]
        for other in members[1:]:
            merged = merged.union(other)

        # Reject merges whose bounding box is much larger than the boxes it combines.
        total_area = sum(member.area for member in members)
        max_area_ratio = float(config.get("border_contact_merge_max_area_ratio", 2.5))
        if merged.area > total_area * max_area_ratio:
            output.extend(members)
        else:
            output.append(merged)

    return dedupe_boxes(output)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from openpyxl.worksheet.worksheet import Worksheet
|
|
6
|
+
|
|
7
|
+
from .schema import Box
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def is_non_empty(value: Any) -> bool:
    """Return True unless *value* is ``None`` or the empty string.

    Whitespace-only strings and falsy values such as ``0`` count as non-empty.
    """
    if value is None:
        return False
    return value != ""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def merged_boxes_with_values(ws: Worksheet) -> list[Box]:
    """Return a box for every merged range whose top-left cell holds a value.

    Args:
        ws: Worksheet whose ``merged_cells.ranges`` are inspected.

    Returns:
        One ``Box`` per merged range with a non-empty top-left cell, in the
        order the ranges are iterated.
    """
    return [
        Box(rng.min_row, rng.min_col, rng.max_row, rng.max_col)
        for rng in ws.merged_cells.ranges
        if is_non_empty(ws.cell(rng.min_row, rng.min_col).value)
    ]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def collect_cell_occupied(ws: Worksheet, bounds: Box | None, config: dict[str, Any]) -> set[tuple[int, int]]:
    """Collect the ``(row, col)`` positions inside *bounds* that hold content.

    Args:
        ws: Worksheet to scan.
        bounds: Region to restrict the scan to; ``None`` yields an empty set.
        config: Reads ``include_values`` (default True) to include cells with
            a non-empty value, and ``include_merged_cells`` (default True) to
            include every cell of a value-bearing merged range.

    Returns:
        The set of occupied positions, clipped to *bounds*.
    """
    occupied: set[tuple[int, int]] = set()
    if bounds is None:
        return occupied

    if config.get("include_values", True):
        occupied.update(
            (row, col)
            for (row, col), cell in ws._cells.items()
            if bounds.contains(row, col) and is_non_empty(cell.value)
        )

    if config.get("include_merged_cells", True):
        for box in merged_boxes_with_values(ws):
            # Clip the merged range to the bounds before filling it in.
            row_span = range(max(bounds.min_row, box.min_row), min(bounds.max_row, box.max_row) + 1)
            col_span = range(max(bounds.min_col, box.min_col), min(bounds.max_col, box.max_col) + 1)
            occupied.update((row, col) for row in row_span for col in col_span)

    return occupied
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from .runner import run_and_write
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main(argv: list[str] | None = None) -> None:
    """Console entry point: parse options, run the extraction, print a summary.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the installed console
            scripts rely on; passing a list allows the CLI to be driven
            programmatically.
    """
    parser = argparse.ArgumentParser(description="Extract Excel information regions only")
    parser.add_argument(
        "--workbook",
        default=str(PROJECT_ROOT / "examples" / "sample.xlsx"),
        help="Path to the Excel workbook to process",
    )
    parser.add_argument("--sheet", default=None, help="Name of a single sheet to process")
    parser.add_argument(
        "--config",
        default=str(PROJECT_ROOT / "config" / "default.json"),
        help="Path to the JSON config file",
    )
    parser.add_argument(
        "--out",
        default=str(PROJECT_ROOT / "outputs" / "info_regions"),
        help="Output directory",
    )
    parser.add_argument("--no-images", action="store_true", help="Skip PNG overlay generation")
    args = parser.parse_args(argv)

    result = run_and_write(
        args.workbook,
        out_dir=args.out,
        sheet_name=args.sheet,
        config_path=args.config,
        write_images=not args.no_images,
    )

    print(f"[extract_info_regions] sheets={len(result['sheets'])} -> {args.out}")
    for sheet, data in result["sheets"].items():
        # Fall back to the "info_regions" key when "regions" is absent.
        regions = data.get("regions", data.get("info_regions", []))
        print(f"  - {sheet}: regions={len(regions)}, ranges={regions}, images={len(data.get('images', []))}")
|