xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,739 @@
1
+ """
2
+ Excel 레이아웃 및 객체 감지 모듈
3
+
4
+ 엑셀 시트에서 실제 데이터가 있는 영역(layout)을 감지합니다.
5
+ 개별 객체(테이블) 감지:
6
+ 1. 테두리가 있는 영역을 먼저 개별 개체로 인식
7
+ 2. 완전히 붙어있는 인접 개체들을 병합
8
+ 3. 각 개체를 사각형 영역으로 반환
9
+ """
10
+
11
+ import logging
12
+ from typing import Tuple, Optional, List, Set, Dict
13
+ from dataclasses import dataclass
14
+ from collections import deque
15
+
16
+ logger = logging.getLogger("document-processor")
17
+
18
+
19
@dataclass
class LayoutRange:
    """Rectangular block of worksheet cells with inclusive, 1-based bounds."""

    min_row: int  # first row (1-based)
    max_row: int  # last row (1-based)
    min_col: int  # first column (1-based)
    max_col: int  # last column (1-based)

    def is_valid(self) -> bool:
        """Return True when all bounds are positive and min <= max on both axes."""
        bounds = (self.min_row, self.max_row, self.min_col, self.max_col)
        if not all(bound > 0 for bound in bounds):
            return False
        return self.min_row <= self.max_row and self.min_col <= self.max_col

    def row_count(self) -> int:
        """Number of rows covered by the range."""
        return 1 + self.max_row - self.min_row

    def col_count(self) -> int:
        """Number of columns covered by the range."""
        return 1 + self.max_col - self.min_col

    def cell_count(self) -> int:
        """Total number of cells covered by the range."""
        return self.row_count() * self.col_count()

    def is_adjacent(self, other: 'LayoutRange') -> bool:
        """Return True when the two ranges share an edge (sides touch directly).

        Two ranges are adjacent when their row spans intersect and their
        columns are consecutive, or when their column spans intersect and
        their rows are consecutive.
        """
        rows_intersect = (self.min_row <= other.max_row
                          and self.max_row >= other.min_row)
        cols_consecutive = (self.max_col + 1 == other.min_col
                            or other.max_col + 1 == self.min_col)
        if rows_intersect and cols_consecutive:
            return True

        cols_intersect = (self.min_col <= other.max_col
                          and self.max_col >= other.min_col)
        rows_consecutive = (self.max_row + 1 == other.min_row
                            or other.max_row + 1 == self.min_row)
        return cols_intersect and rows_consecutive

    def merge_with(self, other: 'LayoutRange') -> 'LayoutRange':
        """Return a new range that is the bounding box of both ranges."""
        top = min(self.min_row, other.min_row)
        bottom = max(self.max_row, other.max_row)
        left = min(self.min_col, other.min_col)
        right = max(self.max_col, other.max_col)
        return LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right)

    def overlaps(self, other: 'LayoutRange') -> bool:
        """Return True when the two ranges share at least one cell."""
        return (self.max_row >= other.min_row
                and self.min_row <= other.max_row
                and self.max_col >= other.min_col
                and self.min_col <= other.max_col)
73
+
74
+
75
def layout_detect_range_xlsx(ws) -> Optional[LayoutRange]:
    """
    Detect the bounding range of cells that actually hold data in an XLSX sheet.

    The scan is capped at the first 1000 rows and 100 columns. A cell counts
    as data when its value is not None and its string form is non-blank.

    Args:
        ws: openpyxl Worksheet object

    Returns:
        LayoutRange for the data area, or None when no data is found or an
        error occurs (the error is logged as a warning).
    """
    def filled(r: int, c: int) -> bool:
        # Same test the scans below rely on: non-None and non-blank text.
        cell = ws.cell(row=r, column=c)
        return cell.value is not None and bool(str(cell.value).strip())

    try:
        if ws.max_row is None or ws.max_row == 0:
            return None

        row_limit = min(ws.max_row, 1000)
        col_limit = min(ws.max_column, 100) if ws.max_column else 100

        # Left -> right: first column containing any data.
        min_col = next(
            (c for c in range(1, col_limit + 1)
             if any(filled(r, c) for r in range(1, row_limit + 1))),
            None,
        )
        if min_col is None:
            return None

        # Top -> bottom: first row containing data at or right of min_col.
        min_row = next(
            (r for r in range(1, row_limit + 1)
             if any(filled(r, c) for c in range(min_col, col_limit + 1))),
            None,
        )
        if min_row is None:
            return None

        # Right -> left: last column containing data at or below min_row.
        max_col = next(
            (c for c in range(col_limit, min_col - 1, -1)
             if any(filled(r, c) for r in range(min_row, row_limit + 1))),
            min_col,
        )

        # Bottom -> top: last row containing data between min_col and max_col.
        max_row = next(
            (r for r in range(row_limit, min_row - 1, -1)
             if any(filled(r, c) for c in range(min_col, max_col + 1))),
            min_row,
        )

        logger.debug(f"Layout detected: rows {min_row}-{max_row}, cols {min_col}-{max_col}")
        return LayoutRange(min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col)

    except Exception as e:
        logger.warning(f"Error detecting layout range: {e}")
        return None
156
+
157
+
158
def layout_detect_range_xls(sheet) -> Optional[LayoutRange]:
    """
    Detect the bounding range of cells that actually hold data in an XLS sheet.

    The scan is capped at the first 1000 rows and 100 columns. Cells are read
    with 0-based indices; the returned range is 1-based. A cell whose read
    raises is treated as empty.

    Args:
        sheet: xlrd Sheet object

    Returns:
        LayoutRange for the data area, or None when no data is found or an
        error occurs (the error is logged as a warning).
    """
    def filled(r0: int, c0: int) -> bool:
        # r0 / c0 are 0-based; any failure reading the cell means "no data".
        try:
            value = sheet.cell_value(r0, c0)
            return value is not None and bool(str(value).strip())
        except Exception:
            return False

    try:
        if sheet.nrows == 0 or sheet.ncols == 0:
            return None

        row_limit = min(sheet.nrows, 1000)
        col_limit = min(sheet.ncols, 100)

        # Left -> right: first column (0-based) containing any data.
        first_c = next(
            (c for c in range(col_limit)
             if any(filled(r, c) for r in range(row_limit))),
            None,
        )
        if first_c is None:
            return None

        # Top -> bottom: first row (0-based) with data at or right of first_c.
        first_r = next(
            (r for r in range(row_limit)
             if any(filled(r, c) for c in range(first_c, col_limit))),
            None,
        )
        if first_r is None:
            return None

        # Right -> left: last column with data at or below first_r.
        last_c = next(
            (c for c in range(col_limit - 1, first_c - 1, -1)
             if any(filled(r, c) for r in range(first_r, row_limit))),
            first_c,
        )

        # Bottom -> top: last row with data between first_c and last_c.
        last_r = next(
            (r for r in range(row_limit - 1, first_r - 1, -1)
             if any(filled(r, c) for c in range(first_c, last_c + 1))),
            first_r,
        )

        # Convert the 0-based scan results to 1-based bounds.
        min_row, max_row = first_r + 1, last_r + 1
        min_col, max_col = first_c + 1, last_c + 1

        logger.debug(f"XLS Layout detected: rows {min_row}-{max_row}, cols {min_col}-{max_col}")
        return LayoutRange(min_row=min_row, max_row=max_row, min_col=min_col, max_col=max_col)

    except Exception as e:
        logger.warning(f"Error detecting XLS layout range: {e}")
        return None
251
+
252
+
253
+ def _has_border_xlsx(cell) -> bool:
254
+ """XLSX 셀에 테두리가 있는지 확인 (상하좌우 중 하나라도)"""
255
+ try:
256
+ border = cell.border
257
+ if border is None:
258
+ return False
259
+
260
+ sides = [border.top, border.bottom, border.left, border.right]
261
+ for side in sides:
262
+ if side is not None and side.style is not None and side.style != 'none':
263
+ return True
264
+ return False
265
+ except Exception:
266
+ return False
267
+
268
+
269
def _detect_bordered_regions_xlsx(ws, layout: LayoutRange) -> List[LayoutRange]:
    """
    Detect bordered regions in an XLSX worksheet.

    Cells with a border are grouped by 4-neighbour connectivity (breadth-first
    search), and each group is reported as its bounding rectangle.

    Args:
        ws: openpyxl Worksheet object
        layout: range of the sheet to scan

    Returns:
        List of bounding rectangles, ordered by the top-most then left-most
        cell of each group.
    """
    # Coordinates of every bordered cell inside the layout (row-major scan).
    bordered: Set[Tuple[int, int]] = {
        (r, c)
        for r in range(layout.min_row, layout.max_row + 1)
        for c in range(layout.min_col, layout.max_col + 1)
        if _has_border_xlsx(ws.cell(row=r, column=c))
    }
    if not bordered:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    # Seeds are taken top -> bottom, then left -> right.
    for seed in sorted(bordered):
        if seed in seen:
            continue

        seen.add(seed)
        pending = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while pending:
            r, c = pending.popleft()
            # Grow the bounding box as the group is explored.
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in bordered and nxt not in seen:
                    seen.add(nxt)
                    pending.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
333
+
334
+
335
def _detect_value_regions_xlsx(ws, layout: LayoutRange, exclude_regions: List[LayoutRange]) -> List[LayoutRange]:
    """
    Detect regions of value-bearing cells in an XLSX worksheet.

    Cells inside any of ``exclude_regions`` are ignored. A cell without its
    own value still counts when it belongs to a merged range whose top-left
    cell holds a value. Qualifying cells are grouped by 4-neighbour
    connectivity and each group is reported as its bounding rectangle.

    Args:
        ws: openpyxl Worksheet object
        layout: range of the sheet to scan
        exclude_regions: regions to skip (already-detected bordered regions)

    Returns:
        List of bounding rectangles, ordered by the top-most then left-most
        cell of each group.
    """
    def excluded(r: int, c: int) -> bool:
        # True when (r, c) lies inside any region that must be skipped.
        return any(
            reg.min_row <= r <= reg.max_row and reg.min_col <= c <= reg.max_col
            for reg in exclude_regions
        )

    def has_text(cell) -> bool:
        # A value counts only if it is non-None and non-blank as text.
        return cell.value is not None and bool(str(cell.value).strip())

    # Map every cell of a merged range to that range's top-left (anchor) cell.
    anchor_of: Dict[Tuple[int, int], Tuple[int, int]] = {}
    for span in ws.merged_cells.ranges:
        first_r, first_c = span.min_row, span.min_col
        last_r, last_c = span.max_row, span.max_col
        for r in range(first_r, last_r + 1):
            for c in range(first_c, last_c + 1):
                anchor_of[(r, c)] = (first_r, first_c)

    # Collect value cells outside the excluded regions.
    occupied: Set[Tuple[int, int]] = set()
    for r in range(layout.min_row, layout.max_row + 1):
        for c in range(layout.min_col, layout.max_col + 1):
            if excluded(r, c):
                continue

            if has_text(ws.cell(row=r, column=c)):
                occupied.add((r, c))
                continue

            # Empty cell inside a merged range: it inherits the anchor's value.
            anchor = anchor_of.get((r, c))
            if anchor is not None and has_text(ws.cell(row=anchor[0], column=anchor[1])):
                occupied.add((r, c))

    if not occupied:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    for seed in sorted(occupied):
        if seed in seen:
            continue

        seen.add(seed)
        pending = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while pending:
            r, c = pending.popleft()
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in occupied and nxt not in seen:
                    seen.add(nxt)
                    pending.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
425
+
426
+
427
def _merge_adjacent_regions(regions: List[LayoutRange]) -> List[LayoutRange]:
    """
    Merge regions that are directly adjacent to each other.

    Passes are repeated until a full pass performs no merge. Within a pass,
    each surviving region absorbs every later region adjacent to its current
    (possibly already grown) bounding box.

    Args:
        regions: list of regions

    Returns:
        List of merged regions. The input list itself is returned when it
        holds zero or one region.
    """
    if len(regions) <= 1:
        return regions

    pool = list(regions)

    while True:
        absorbed = set()  # indices already folded into an earlier region
        survivors = []

        for i, base in enumerate(pool):
            if i in absorbed:
                continue

            combined = base
            for j in range(i + 1, len(pool)):
                if j in absorbed:
                    continue
                candidate = pool[j]
                if combined.is_adjacent(candidate):
                    combined = combined.merge_with(candidate)
                    absorbed.add(j)

            survivors.append(combined)

        pool = survivors
        # A pass without any merge means the result is stable.
        if not absorbed:
            return pool
470
+
471
+
472
def object_detect_xlsx(ws, layout: Optional[LayoutRange] = None) -> List[LayoutRange]:
    """
    Detect individual objects (tables / data blocks) in an XLSX worksheet.

    Algorithm:
        1. Detect bordered regions as individual objects
        2. Detect value regions that lie outside the bordered regions
        3. Merge objects that are directly adjacent
        4. Return them sorted top -> bottom, then left -> right

    Args:
        ws: openpyxl Worksheet object
        layout: range to scan (auto-detected when None)

    Returns:
        List of object regions; empty when nothing is found or an error
        occurs (the error is logged as a warning).
    """
    try:
        layout = layout_detect_range_xlsx(ws) if layout is None else layout
        if layout is None:
            return []

        # Step 1: bordered regions.
        bordered_regions = _detect_bordered_regions_xlsx(ws, layout)
        logger.debug(f"Detected {len(bordered_regions)} bordered regions")

        # Step 2: value regions, skipping whatever the borders already cover.
        value_regions = _detect_value_regions_xlsx(ws, layout, bordered_regions)
        logger.debug(f"Detected {len(value_regions)} value regions (excluding bordered)")

        # Step 3: pool both kinds of region.
        candidates = [*bordered_regions, *value_regions]
        if not candidates:
            return []

        # Step 4: merge neighbours.
        merged_regions = _merge_adjacent_regions(candidates)
        logger.debug(f"After merging: {len(merged_regions)} regions")

        # Step 5: order top -> bottom, then left -> right.
        ordered = sorted(merged_regions, key=lambda region: (region.min_row, region.min_col))

        for i, obj in enumerate(ordered):
            logger.debug(
                f"  Object {i+1}: rows {obj.min_row}-{obj.max_row}, "
                f"cols {obj.min_col}-{obj.max_col} ({obj.cell_count()} cells)"
            )

        return ordered

    except Exception as e:
        logger.warning(f"Error detecting objects in XLSX: {e}")
        return []
527
+
528
+
529
+ def _has_border_xls(sheet, wb, row_idx: int, col_idx: int) -> bool:
530
+ """XLS 셀에 테두리가 있는지 확인 (0-based 인덱스)"""
531
+ try:
532
+ xf_index = sheet.cell_xf_index(row_idx, col_idx)
533
+ xf = wb.xf_list[xf_index]
534
+
535
+ # 테두리 인덱스 확인
536
+ borders = [
537
+ xf.border.top_line_style,
538
+ xf.border.bottom_line_style,
539
+ xf.border.left_line_style,
540
+ xf.border.right_line_style
541
+ ]
542
+
543
+ for border_style in borders:
544
+ if border_style and border_style > 0:
545
+ return True
546
+ return False
547
+ except Exception:
548
+ return False
549
+
550
+
551
def _detect_bordered_regions_xls(sheet, wb, layout: LayoutRange) -> List[LayoutRange]:
    """
    Detect bordered regions in an XLS sheet.

    Cells with a border are grouped by 4-neighbour connectivity (breadth-first
    search), and each group is reported as its bounding rectangle.

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: range to scan (1-based)

    Returns:
        List of bounding rectangles (1-based), ordered by the top-most then
        left-most cell of each group.
    """
    # Coordinates stay 1-based; the border lookup itself takes 0-based indices.
    bordered: Set[Tuple[int, int]] = {
        (r, c)
        for r in range(layout.min_row, layout.max_row + 1)
        for c in range(layout.min_col, layout.max_col + 1)
        if _has_border_xls(sheet, wb, r - 1, c - 1)
    }
    if not bordered:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    for seed in sorted(bordered):
        if seed in seen:
            continue

        seen.add(seed)
        pending = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while pending:
            r, c = pending.popleft()
            # Grow the bounding box as the group is explored.
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in bordered and nxt not in seen:
                    seen.add(nxt)
                    pending.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
609
+
610
+
611
def _detect_value_regions_xls(sheet, layout: LayoutRange, exclude_regions: List[LayoutRange]) -> List[LayoutRange]:
    """
    Detect regions of value-bearing cells in an XLS sheet.

    Cells inside any of ``exclude_regions`` are ignored, and a cell whose read
    raises is treated as empty. Qualifying cells are grouped by 4-neighbour
    connectivity and each group is reported as its bounding rectangle.

    Args:
        sheet: xlrd Sheet object
        layout: range to scan (1-based)
        exclude_regions: regions to skip

    Returns:
        List of bounding rectangles (1-based), ordered by the top-most then
        left-most cell of each group.
    """
    def excluded(r: int, c: int) -> bool:
        # True when (r, c) lies inside any region that must be skipped.
        return any(
            reg.min_row <= r <= reg.max_row and reg.min_col <= c <= reg.max_col
            for reg in exclude_regions
        )

    def filled(r: int, c: int) -> bool:
        # r / c are 1-based here; the sheet is read with 0-based indices.
        try:
            value = sheet.cell_value(r - 1, c - 1)
            return value is not None and bool(str(value).strip())
        except Exception:
            return False

    occupied: Set[Tuple[int, int]] = set()
    for r in range(layout.min_row, layout.max_row + 1):
        for c in range(layout.min_col, layout.max_col + 1):
            if not excluded(r, c) and filled(r, c):
                occupied.add((r, c))

    if not occupied:
        return []

    seen: Set[Tuple[int, int]] = set()
    found: List[LayoutRange] = []

    for seed in sorted(occupied):
        if seed in seen:
            continue

        seen.add(seed)
        pending = deque([seed])
        top = bottom = seed[0]
        left = right = seed[1]

        while pending:
            r, c = pending.popleft()
            top, bottom = min(top, r), max(bottom, r)
            left, right = min(left, c), max(right, c)

            for nxt in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if nxt in occupied and nxt not in seen:
                    seen.add(nxt)
                    pending.append(nxt)

        found.append(LayoutRange(min_row=top, max_row=bottom, min_col=left, max_col=right))

    return found
682
+
683
+
684
def object_detect_xls(sheet, wb, layout: Optional[LayoutRange] = None) -> List[LayoutRange]:
    """
    Detect individual objects (tables / data blocks) in an XLS sheet.

    Algorithm:
        1. Detect bordered regions as individual objects
        2. Detect value regions that lie outside the bordered regions
        3. Merge objects that are directly adjacent
        4. Return them sorted top -> bottom, then left -> right

    Args:
        sheet: xlrd Sheet object
        wb: xlrd Workbook object
        layout: range to scan (auto-detected when None)

    Returns:
        List of object regions in 1-based coordinates; empty when nothing is
        found or an error occurs (the error is logged as a warning).
    """
    try:
        layout = layout_detect_range_xls(sheet) if layout is None else layout
        if layout is None:
            return []

        # Step 1: bordered regions.
        bordered_regions = _detect_bordered_regions_xls(sheet, wb, layout)
        logger.debug(f"XLS: Detected {len(bordered_regions)} bordered regions")

        # Step 2: value regions, skipping whatever the borders already cover.
        value_regions = _detect_value_regions_xls(sheet, layout, bordered_regions)
        logger.debug(f"XLS: Detected {len(value_regions)} value regions (excluding bordered)")

        # Step 3: pool both kinds of region.
        candidates = [*bordered_regions, *value_regions]
        if not candidates:
            return []

        # Step 4: merge neighbours.
        merged_regions = _merge_adjacent_regions(candidates)
        logger.debug(f"XLS: After merging: {len(merged_regions)} regions")

        # Step 5: order top -> bottom, then left -> right.
        ordered = sorted(merged_regions, key=lambda region: (region.min_row, region.min_col))

        for i, obj in enumerate(ordered):
            logger.debug(
                f"  XLS Object {i+1}: rows {obj.min_row}-{obj.max_row}, "
                f"cols {obj.min_col}-{obj.max_col} ({obj.cell_count()} cells)"
            )

        return ordered

    except Exception as e:
        logger.warning(f"Error detecting objects in XLS: {e}")
        return []