xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,739 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Excel 레이아웃 및 객체 감지 모듈
|
|
3
|
+
|
|
4
|
+
엑셀 시트에서 실제 데이터가 있는 영역(layout)을 감지합니다.
|
|
5
|
+
개별 객체(테이블) 감지:
|
|
6
|
+
1. 테두리가 있는 영역을 먼저 개별 개체로 인식
|
|
7
|
+
2. 완전히 붙어있는 인접 개체들을 병합
|
|
8
|
+
3. 각 개체를 사각형 영역으로 반환
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Tuple, Optional, List, Set, Dict
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from collections import deque
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("document-processor")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class LayoutRange:
    """Rectangular cell range on a sheet, described by inclusive 1-based bounds."""
    min_row: int  # first row (1-based)
    max_row: int  # last row (1-based)
    min_col: int  # first column (1-based)
    max_col: int  # last column (1-based)

    def is_valid(self) -> bool:
        """Return True when every bound is positive and min <= max on both axes."""
        # `0 < min <= max` also implies that max is positive.
        return (0 < self.min_row <= self.max_row
                and 0 < self.min_col <= self.max_col)

    def row_count(self) -> int:
        """Return the number of rows covered by the range."""
        return 1 + self.max_row - self.min_row

    def col_count(self) -> int:
        """Return the number of columns covered by the range."""
        return 1 + self.max_col - self.min_col

    def cell_count(self) -> int:
        """Return the number of cells covered by the range."""
        rows = self.row_count()
        cols = self.col_count()
        return rows * cols

    def is_adjacent(self, other: 'LayoutRange') -> bool:
        """Return True when the two ranges share an edge (sides touch)."""
        # Side by side: row spans intersect and one range starts in the
        # column immediately after the other ends.
        rows_intersect = (self.min_row <= other.max_row
                          and self.max_row >= other.min_row)
        cols_touch = (self.max_col + 1 == other.min_col
                      or other.max_col + 1 == self.min_col)
        if rows_intersect and cols_touch:
            return True

        # Stacked: column spans intersect and one range starts in the row
        # immediately after the other ends.
        cols_intersect = (self.min_col <= other.max_col
                          and self.max_col >= other.min_col)
        rows_touch = (self.max_row + 1 == other.min_row
                      or other.max_row + 1 == self.min_row)
        return cols_intersect and rows_touch

    def merge_with(self, other: 'LayoutRange') -> 'LayoutRange':
        """Return a new range that is the bounding box of both ranges."""
        top = min(self.min_row, other.min_row)
        bottom = max(self.max_row, other.max_row)
        left = min(self.min_col, other.min_col)
        right = max(self.max_col, other.max_col)
        return LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right)

    def overlaps(self, other: 'LayoutRange') -> bool:
        """Return True when the two ranges have at least one cell in common."""
        return (self.max_row >= other.min_row
                and self.min_row <= other.max_row
                and self.max_col >= other.min_col
                and self.min_col <= other.max_col)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def layout_detect_range_xlsx(ws) -> Optional[LayoutRange]:
    """
    Detect the bounding box of the cells that hold data in an XLSX worksheet.

    Only the first 1000 rows and 100 columns are scanned. A cell counts as
    data when its value is not None and its string form is not blank.

    Args:
        ws: openpyxl Worksheet object

    Returns:
        LayoutRange (1-based, inclusive), or None when no data cell is found
        or when an error occurs during detection.
    """
    try:
        declared_rows = ws.max_row
        if declared_rows is None or declared_rows == 0:
            return None

        row_limit = min(declared_rows, 1000)
        col_limit = min(ws.max_column, 100) if ws.max_column else 100

        def has_data(r: int, c: int) -> bool:
            content = ws.cell(row=r, column=c).value
            return content is not None and bool(str(content).strip())

        def column_has_data(c: int, rows) -> bool:
            return any(has_data(r, c) for r in rows)

        def row_has_data(r: int, cols) -> bool:
            return any(has_data(r, c) for c in cols)

        # Leftmost column containing data (scan left -> right).
        min_col = next(
            (c for c in range(1, col_limit + 1)
             if column_has_data(c, range(1, row_limit + 1))),
            None,
        )
        if min_col is None:
            return None

        # Topmost row containing data (scan top -> bottom).
        min_row = next(
            (r for r in range(1, row_limit + 1)
             if row_has_data(r, range(min_col, col_limit + 1))),
            None,
        )
        if min_row is None:
            return None

        # Rightmost column containing data (scan right -> left); falls back
        # to min_col when nothing is found.
        max_col = next(
            (c for c in range(col_limit, min_col - 1, -1)
             if column_has_data(c, range(min_row, row_limit + 1))),
            min_col,
        )

        # Bottommost row containing data (scan bottom -> top); falls back
        # to min_row when nothing is found.
        max_row = next(
            (r for r in range(row_limit, min_row - 1, -1)
             if row_has_data(r, range(min_col, max_col + 1))),
            min_row,
        )

        layout = LayoutRange(min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col)
        logger.debug(f"Layout detected: rows {min_row}-{max_row}, cols {min_col}-{max_col}")
        return layout

    except Exception as e:
        logger.warning(f"Error detecting layout range: {e}")
        return None
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def layout_detect_range_xls(sheet) -> Optional[LayoutRange]:
    """
    Detect the bounding box of the cells that hold data in an XLS sheet.

    Only the first 1000 rows and 100 columns are scanned. The sheet is
    addressed with 0-based indices, while the returned range is 1-based.
    A cell whose lookup raises is treated as empty.

    Args:
        sheet: xlrd Sheet object

    Returns:
        LayoutRange (1-based, inclusive), or None when no data cell is found
        or when an error occurs during detection.
    """
    try:
        if sheet.nrows == 0 or sheet.ncols == 0:
            return None

        row_limit = min(sheet.nrows, 1000)
        col_limit = min(sheet.ncols, 100)

        def has_data(r0: int, c0: int) -> bool:
            # r0 / c0 are 0-based; any failure while reading counts as "empty".
            try:
                content = sheet.cell_value(r0, c0)
                return content is not None and bool(str(content).strip())
            except Exception:
                return False

        def column_has_data(c0: int, rows) -> bool:
            return any(has_data(r0, c0) for r0 in rows)

        def row_has_data(r0: int, cols) -> bool:
            return any(has_data(r0, c0) for c0 in cols)

        # Leftmost column containing data (scan left -> right), stored 1-based.
        min_col = next(
            (c0 + 1 for c0 in range(col_limit)
             if column_has_data(c0, range(row_limit))),
            None,
        )
        if min_col is None:
            return None

        # Topmost row containing data (scan top -> bottom), stored 1-based.
        min_row = next(
            (r0 + 1 for r0 in range(row_limit)
             if row_has_data(r0, range(min_col - 1, col_limit))),
            None,
        )
        if min_row is None:
            return None

        # Rightmost column containing data (scan right -> left); falls back
        # to min_col when nothing is found.
        max_col = next(
            (c0 + 1 for c0 in range(col_limit - 1, min_col - 2, -1)
             if column_has_data(c0, range(min_row - 1, row_limit))),
            min_col,
        )

        # Bottommost row containing data (scan bottom -> top); falls back
        # to min_row when nothing is found.
        max_row = next(
            (r0 + 1 for r0 in range(row_limit - 1, min_row - 2, -1)
             if row_has_data(r0, range(min_col - 1, max_col))),
            min_row,
        )

        layout = LayoutRange(min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col)
        logger.debug(f"XLS Layout detected: rows {min_row}-{max_row}, cols {min_col}-{max_col}")
        return layout

    except Exception as e:
        logger.warning(f"Error detecting XLS layout range: {e}")
        return None
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _has_border_xlsx(cell) -> bool:
    """Return True when an XLSX cell has a border on at least one of its four sides.

    A side counts as a border when it is not None and its ``style`` is neither
    None nor the string 'none'. Any error while inspecting the cell yields False.
    """
    try:
        border = cell.border
        if border is None:
            return False

        edges = (border.top, border.bottom, border.left, border.right)
        return any(
            edge is not None and edge.style is not None and edge.style != 'none'
            for edge in edges
        )
    except Exception:
        return False
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _detect_bordered_regions_xlsx(ws, layout: LayoutRange) -> List[LayoutRange]:
    """
    Detect the bordered regions of an XLSX worksheet.

    Cells with a border are grouped into 4-connected components with a BFS,
    and each component is reported as its bounding rectangle.

    Args:
        ws: openpyxl Worksheet object
        layout: range of the sheet to inspect

    Returns:
        One LayoutRange per connected group of bordered cells, in the order
        in which each group is first met when scanning top -> bottom,
        left -> right.
    """
    # Coordinates of every bordered cell inside the layout.
    bordered: Set[Tuple[int, int]] = {
        (r, c)
        for r in range(layout.min_row, layout.max_row + 1)
        for c in range(layout.min_col, layout.max_col + 1)
        if _has_border_xlsx(ws.cell(row=r, column=c))
    }
    if not bordered:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    # Tuples sort by row first, then column.
    for seed in sorted(bordered):
        if seed in seen:
            continue

        # Flood-fill from the seed, growing the bounding box as cells are visited.
        seen.add(seed)
        frontier = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while frontier:
            r, c = frontier.popleft()
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            # Up, down, left and right neighbours.
            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in bordered and nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _detect_value_regions_xlsx(ws, layout: LayoutRange, exclude_regions: List[LayoutRange]) -> List[LayoutRange]:
    """
    Detect the regions of an XLSX worksheet that hold values, skipping
    the cells that fall inside already-detected regions.

    A cell inside the layout is kept when it has a non-blank value itself,
    or when it belongs to a merged range whose top-left cell has a
    non-blank value. Kept cells are grouped into 4-connected components and
    each component is reported as its bounding rectangle.

    Args:
        ws: openpyxl Worksheet object
        layout: range of the sheet to inspect
        exclude_regions: regions to skip (the bordered regions found earlier)

    Returns:
        One LayoutRange per connected group of value cells.
    """
    def has_text(content) -> bool:
        return content is not None and bool(str(content).strip())

    def is_excluded(r: int, c: int) -> bool:
        return any(
            area.min_row <= r <= area.max_row and area.min_col <= c <= area.max_col
            for area in exclude_regions
        )

    # Map every cell of a merged range to that range's top-left (anchor) cell.
    anchor_of: Dict[Tuple[int, int], Tuple[int, int]] = {}
    for merged_range in ws.merged_cells.ranges:
        first_r, first_c = merged_range.min_row, merged_range.min_col
        last_r, last_c = merged_range.max_row, merged_range.max_col
        for r in range(first_r, last_r + 1):
            for c in range(first_c, last_c + 1):
                anchor_of[(r, c)] = (first_r, first_c)

    # Collect the coordinates of value-bearing cells outside the excluded regions.
    value_cells: Set[Tuple[int, int]] = set()
    for r in range(layout.min_row, layout.max_row + 1):
        for c in range(layout.min_col, layout.max_col + 1):
            if is_excluded(r, c):
                continue

            # Plain cell carrying its own value.
            if has_text(ws.cell(row=r, column=c).value):
                value_cells.add((r, c))
                continue

            # Empty cell that is part of a merged range: it counts when the
            # merged range's top-left cell holds a value.
            anchor = anchor_of.get((r, c))
            if anchor is not None:
                anchor_cell = ws.cell(row=anchor[0], column=anchor[1])
                if has_text(anchor_cell.value):
                    value_cells.add((r, c))

    if not value_cells:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    # Tuples sort by row first, then column.
    for seed in sorted(value_cells):
        if seed in seen:
            continue

        # Flood-fill from the seed, growing the bounding box as cells are visited.
        seen.add(seed)
        frontier = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while frontier:
            r, c = frontier.popleft()
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in value_cells and nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _merge_adjacent_regions(regions: List[LayoutRange]) -> List[LayoutRange]:
    """
    Merge regions that are directly adjacent to each other.

    Passes over the list are repeated until a full pass performs no merge.
    Within a pass, each surviving region absorbs every later region that is
    adjacent to its current (possibly already grown) bounding box.

    Args:
        regions: list of regions

    Returns:
        The merged list of regions. When the input has at most one element
        the input list itself is returned.
    """
    if len(regions) <= 1:
        return regions

    pending = list(regions)

    while True:
        changed = False
        absorbed = set()  # indices already merged into an earlier region this pass
        next_round = []

        for i, base in enumerate(pending):
            if i in absorbed:
                continue

            combined = base
            for j in range(i + 1, len(pending)):
                if j in absorbed:
                    continue
                candidate = pending[j]
                if combined.is_adjacent(candidate):
                    combined = combined.merge_with(candidate)
                    absorbed.add(j)
                    changed = True

            next_round.append(combined)

        pending = next_round
        if not changed:
            return pending
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def object_detect_xlsx(ws, layout: Optional[LayoutRange] = None) -> List[LayoutRange]:
    """
    Detect the individual objects (tables / data blocks) of an XLSX worksheet.

    Steps:
    1. Detect bordered regions first and treat each as an object.
    2. Detect value regions that lie outside the bordered regions.
    3. Merge objects that are directly adjacent.
    4. Sort the result top -> bottom, then left -> right.

    Args:
        ws: openpyxl Worksheet object
        layout: range to inspect (detected automatically when None)

    Returns:
        List of object regions; an empty list when nothing is found or when
        an error occurs.
    """
    try:
        if layout is None:
            layout = layout_detect_range_xlsx(ws)
        if layout is None:
            return []

        # Bordered regions come first; they are excluded from the value scan.
        bordered_regions = _detect_bordered_regions_xlsx(ws, layout)
        logger.debug(f"Detected {len(bordered_regions)} bordered regions")

        value_regions = _detect_value_regions_xlsx(ws, layout, bordered_regions)
        logger.debug(f"Detected {len(value_regions)} value regions (excluding bordered)")

        all_regions = [*bordered_regions, *value_regions]
        if not all_regions:
            return []

        merged_regions = _merge_adjacent_regions(all_regions)
        logger.debug(f"After merging: {len(merged_regions)} regions")

        # Order by top-left corner: row first, then column.
        sorted_regions = list(merged_regions)
        sorted_regions.sort(key=lambda region: (region.min_row, region.min_col))

        for i, obj in enumerate(sorted_regions):
            logger.debug(
                f" Object {i+1}: rows {obj.min_row}-{obj.max_row}, "
                f"cols {obj.min_col}-{obj.max_col} ({obj.cell_count()} cells)"
            )

        return sorted_regions

    except Exception as e:
        logger.warning(f"Error detecting objects in XLSX: {e}")
        return []
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _has_border_xls(sheet, wb, row_idx: int, col_idx: int) -> bool:
    """Return True when an XLS cell has a border on at least one side.

    ``row_idx`` and ``col_idx`` are 0-based. The cell's XF record is looked up
    through ``sheet.cell_xf_index`` and ``wb.xf_list``; a side counts as a
    border when its line style is truthy and greater than 0. Any error during
    the lookup yields False.
    """
    try:
        xf = wb.xf_list[sheet.cell_xf_index(row_idx, col_idx)]

        # Line styles of the top, bottom, left and right sides.
        line_styles = (
            xf.border.top_line_style,
            xf.border.bottom_line_style,
            xf.border.left_line_style,
            xf.border.right_line_style,
        )
        return any(style and style > 0 for style in line_styles)
    except Exception:
        return False
|
|
549
|
+
|
|
550
|
+
|
|
551
|
+
def _detect_bordered_regions_xls(sheet, wb, layout: LayoutRange) -> List[LayoutRange]:
    """
    Detect the bordered regions of an XLS sheet.

    Cells with a border are grouped into 4-connected components with a BFS,
    and each component is reported as its bounding rectangle.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: range of the sheet to inspect (1-based)

    Returns:
        List of bordered regions (1-based).
    """
    # The layout is 1-based while the border lookup takes 0-based indices.
    bordered: Set[Tuple[int, int]] = {
        (r, c)
        for r in range(layout.min_row, layout.max_row + 1)
        for c in range(layout.min_col, layout.max_col + 1)
        if _has_border_xls(sheet, wb, r - 1, c - 1)
    }
    if not bordered:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    # Tuples sort by row first, then column.
    for seed in sorted(bordered):
        if seed in seen:
            continue

        # Flood-fill from the seed, growing the bounding box as cells are visited.
        seen.add(seed)
        frontier = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while frontier:
            r, c = frontier.popleft()
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in bordered and nxt not in seen:
                    seen.add(nxt)
                    frontier.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
|
|
609
|
+
|
|
610
|
+
|
|
611
|
+
def _detect_value_regions_xls(sheet, layout: LayoutRange, exclude_regions: List[LayoutRange]) -> List[LayoutRange]:
    """
    Detect the regions of an XLS sheet that contain values (bordered regions excluded).

    Cells inside ``layout`` that are not covered by any of ``exclude_regions``
    and whose value is non-blank are collected. Cells that touch vertically or
    horizontally are grouped, and the bounding box of each group is returned.

    Args:
        sheet: xlrd Sheet object
        layout: Layout range to scan (1-based)
        exclude_regions: Regions to leave out of the scan

    Returns:
        List of value regions (1-based). Regions appear in the order in which
        their first cell is met when scanning row by row.
    """
    def is_in_excluded(row: int, col: int) -> bool:
        # True when (row, col) lies inside at least one excluded region.
        return any(
            region.min_row <= row <= region.max_row
            and region.min_col <= col <= region.max_col
            for region in exclude_regions
        )

    value_cells: Set[Tuple[int, int]] = set()
    unreadable_cells = 0

    for row_idx in range(layout.min_row, layout.max_row + 1):
        for col_idx in range(layout.min_col, layout.max_col + 1):
            if is_in_excluded(row_idx, col_idx):
                continue
            try:
                # XLS is 0-based
                value = sheet.cell_value(row_idx - 1, col_idx - 1)
                has_value = value is not None and bool(str(value).strip())
            except Exception:
                # Best effort: a cell that cannot be read is treated as empty,
                # but it is counted so the loss is visible in the log.
                unreadable_cells += 1
                continue
            if has_value:
                value_cells.add((row_idx, col_idx))

    if unreadable_cells:
        # One summary line instead of one message per cell.
        logger.debug(
            "XLS: Skipped %d unreadable cells while detecting value regions",
            unreadable_cells,
        )

    if not value_cells:
        return []

    visited: Set[Tuple[int, int]] = set()
    regions: List[LayoutRange] = []

    # Tuples sort by (row, col), so seeds are visited in row-major order.
    for start_cell in sorted(value_cells):
        if start_cell in visited:
            continue

        # Breadth-first flood fill over 4-connected value cells. Cells are
        # marked when enqueued so that none is queued more than once.
        visited.add(start_cell)
        queue = deque([start_cell])
        group_rows: List[int] = []
        group_cols: List[int] = []

        while queue:
            row, col = queue.popleft()
            group_rows.append(row)
            group_cols.append(col)

            for neighbor in ((row - 1, col), (row + 1, col), (row, col - 1), (row, col + 1)):
                if neighbor in value_cells and neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)

        # The region is the bounding box of the connected group.
        regions.append(
            LayoutRange(
                min_row=min(group_rows),
                max_row=max(group_rows),
                min_col=min(group_cols),
                max_col=max(group_cols),
            )
        )

    return regions
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
def object_detect_xls(sheet, wb, layout: Optional[LayoutRange] = None) -> List[LayoutRange]:
    """
    Detect the individual objects (tables / data blocks) of an XLS sheet.

    Steps:
    1. Detect bordered regions with ``_detect_bordered_regions_xls``
    2. Detect value regions outside those bordered regions
    3. Merge the combined regions with ``_merge_adjacent_regions``
    4. Sort the result top-to-bottom, then left-to-right

    Any exception raised along the way is logged as a warning and an empty
    list is returned instead.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: Layout range to scan (detected with
            ``layout_detect_range_xls`` when None)

    Returns:
        List of object regions (1-based coordinates)
    """
    def reading_order(region):
        # Sort key: upper rows first, then leftmost columns.
        return (region.min_row, region.min_col)

    try:
        scan_range = layout_detect_range_xls(sheet) if layout is None else layout
        if scan_range is None:
            return []

        # Step 1: bordered regions
        bordered_regions = _detect_bordered_regions_xls(sheet, wb, scan_range)
        logger.debug(f"XLS: Detected {len(bordered_regions)} bordered regions")

        # Step 2: value regions, skipping what the bordered regions cover
        value_regions = _detect_value_regions_xls(sheet, scan_range, bordered_regions)
        logger.debug(f"XLS: Detected {len(value_regions)} value regions (excluding bordered)")

        # Step 3: pool both kinds of region
        candidates = bordered_regions + value_regions
        if not candidates:
            return []

        # Step 4: merge adjacent regions
        merged_regions = _merge_adjacent_regions(candidates)
        logger.debug(f"XLS: After merging: {len(merged_regions)} regions")

        # Step 5: order top-to-bottom, then left-to-right
        ordered = sorted(merged_regions, key=reading_order)

        for i, obj in enumerate(ordered):
            logger.debug(
                f" XLS Object {i+1}: rows {obj.min_row}-{obj.max_row}, "
                f"cols {obj.min_col}-{obj.max_col} ({obj.cell_count()} cells)"
            )

        return ordered

    except Exception as e:
        logger.warning(f"Error detecting objects in XLS: {e}")
        return []
|