xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162):
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,361 @@
1
+ """
2
+ Excel XLSX 테이블 변환 모듈
3
+
4
+ XLSX 시트를 Markdown 또는 HTML 테이블로 변환합니다.
5
+ 병합셀이 있으면 HTML, 없으면 Markdown을 사용합니다.
6
+ layout_detect_range를 통해 실제 데이터 영역만 추출합니다.
7
+ object_detect를 통해 개별 객체(테이블)별로 청킹할 수 있습니다.
8
+ """
9
+
10
+ import logging
11
+ from typing import Optional, List
12
+ from xgen_doc2chunk.core.processor.excel_helper.excel_layout_detector import layout_detect_range_xlsx, object_detect_xlsx, LayoutRange
13
+
14
# Module-level logger, registered under the fixed name "document-processor"
# (not under this module's import path).
logger = logging.getLogger("document-processor")
15
+
16
+
17
def has_merged_cells_xlsx(ws, layout: Optional[LayoutRange] = None) -> bool:
    """
    Report whether an XLSX worksheet contains merged cells.

    When ``layout`` is given, only merged ranges that overlap that rectangle
    are taken into account.

    Args:
        ws: Worksheet object exposing ``merged_cells.ranges``; every range
            must provide ``min_row``, ``max_row``, ``min_col`` and ``max_col``.
        layout: Rectangle to restrict the check to (None checks the whole sheet).

    Returns:
        True if at least one relevant merged range exists. False otherwise,
        including when the merged-cell information cannot be read.
    """
    try:
        merged_ranges = ws.merged_cells.ranges
        if not merged_ranges:
            return False

        # Without a layout, the mere existence of a merged range is enough.
        if layout is None:
            return True

        # Two rectangles overlap when they intersect on both axes.
        return any(
            merged.min_row <= layout.max_row
            and merged.max_row >= layout.min_row
            and merged.min_col <= layout.max_col
            and merged.max_col >= layout.min_col
            for merged in merged_ranges
        )
    except Exception as e:
        # Best effort: callers treat False as "no merged cells", but the
        # failure is logged instead of being discarded silently.
        logger.debug(f"Could not inspect merged cells: {e}")
        return False
49
+
50
+
51
def convert_xlsx_sheet_to_table(ws, layout: Optional[LayoutRange] = None) -> str:
    """
    Render an XLSX worksheet region as a table string.

    The region is rendered as HTML when it overlaps a merged range and as
    Markdown otherwise. When no layout is supplied it is detected with
    ``layout_detect_range_xlsx``.

    Args:
        ws: openpyxl Worksheet object.
        layout: Region to convert (None triggers automatic detection).

    Returns:
        The rendered table, or an empty string when no layout is detected.
    """
    if layout is None:
        layout = layout_detect_range_xlsx(ws)
    if layout is None:
        logger.debug("No data found in worksheet")
        return ""

    if not has_merged_cells_xlsx(ws, layout):
        logger.debug("No merged cells in XLSX, using Markdown format")
        return convert_xlsx_sheet_to_markdown(ws, layout)

    logger.debug("Merged cells detected in XLSX, using HTML format")
    return convert_xlsx_sheet_to_html(ws, layout)
77
+
78
+
79
def convert_xlsx_sheet_to_markdown(ws, layout: Optional[LayoutRange] = None) -> str:
    """
    Render an XLSX worksheet region as a Markdown table.

    Only the cells inside ``layout`` are emitted. Rows in which every cell is
    empty are dropped, and a ``---`` separator row follows the first emitted
    row.

    Args:
        ws: openpyxl Worksheet object.
        layout: Region to convert (None triggers automatic detection).

    Returns:
        The Markdown table, or an empty string when there is nothing to
        render or the conversion fails.
    """
    try:
        if layout is None:
            layout = layout_detect_range_xlsx(ws)
        if layout is None:
            return ""

        top, bottom = layout.min_row, layout.max_row
        left, right = layout.min_col, layout.max_col

        # A merged range anchored outside the layout keeps its value in a cell
        # that is never rendered; show that value in the first cell of the
        # range that does fall inside the layout.
        overrides = {}  # (row, col) -> value
        for merged in ws.merged_cells.ranges:
            overlaps = (merged.min_row <= bottom and merged.max_row >= top and
                        merged.min_col <= right and merged.max_col >= left)
            if not overlaps:
                continue

            anchored_inside = (top <= merged.min_row <= bottom and
                               left <= merged.min_col <= right)
            if anchored_inside:
                continue

            anchor_value = ws.cell(row=merged.min_row, column=merged.min_col).value
            if anchor_value is None:
                continue

            first_visible = (max(merged.min_row, top), max(merged.min_col, left))
            overrides[first_visible] = anchor_value

        lines = []
        for row_idx in range(top, bottom + 1):
            rendered = []
            for col_idx in range(left, right + 1):
                cell = ws.cell(row=row_idx, column=col_idx)
                position = (row_idx, col_idx)

                raw = overrides[position] if position in overrides else cell.value
                text = "" if raw is None else str(raw).strip()

                # Pipes would end the Markdown cell and newlines would end the row.
                rendered.append(text.replace("|", "\\|").replace("\n", " "))

            # Skip rows that carry no content at all.
            if not any(rendered):
                continue

            lines.append("| " + " | ".join(rendered) + " |")

            # The separator goes right after the first emitted row.
            if len(lines) == 1:
                lines.append("| " + " | ".join(["---"] * len(rendered)) + " |")

        return "\n".join(lines)

    except Exception as e:
        logger.warning(f"Error converting sheet to Markdown: {e}")
        return ""
164
+
165
+
166
def convert_xlsx_sheet_to_html(ws, layout: Optional[LayoutRange] = None) -> str:
    """
    Render an XLSX worksheet region as an HTML table.

    Merged ranges are expressed with ``rowspan`` / ``colspan``. Every merged
    range is first clipped to the layout rectangle so that:

    * a range extending past the layout never yields a span larger than the
      rendered table, and
    * a range anchored outside the layout is represented by its first visible
      cell, which carries the anchor's value and the span of the visible part.

    Empty rows are kept because they are part of the table structure. The
    first layout row is rendered with ``<th>``, all other rows with ``<td>``.

    Args:
        ws: openpyxl Worksheet object.
        layout: Region to convert (None triggers automatic detection).

    Returns:
        The HTML table, or an empty string when the region holds no non-blank
        value or the conversion fails.
    """
    try:
        if layout is None:
            layout = layout_detect_range_xlsx(ws)
            if layout is None:
                return ""

        merged_cells_info = {}      # (row, col) of visible anchor -> (rowspan, colspan)
        skip_cells = set()          # cells covered by a span, not rendered
        merged_value_override = {}  # (row, col) -> value taken from an off-layout anchor

        for merged_range in ws.merged_cells.ranges:
            mr_min_row, mr_min_col = merged_range.min_row, merged_range.min_col
            mr_max_row, mr_max_col = merged_range.max_row, merged_range.max_col

            # Intersection of the merged range with the layout rectangle.
            top = max(mr_min_row, layout.min_row)
            left = max(mr_min_col, layout.min_col)
            bottom = min(mr_max_row, layout.max_row)
            right = min(mr_max_col, layout.max_col)
            if top > bottom or left > right:
                continue  # the range does not touch the layout

            # The first visible cell stands in for the whole visible part.
            anchor = (top, left)
            merged_cells_info[anchor] = (bottom - top + 1, right - left + 1)

            # If the real anchor lies outside the layout, borrow its value.
            if anchor != (mr_min_row, mr_min_col):
                merged_value = ws.cell(row=mr_min_row, column=mr_min_col).value
                if merged_value is not None:
                    merged_value_override[anchor] = merged_value

            # Every other visible cell of the range is covered by the span.
            for r in range(top, bottom + 1):
                for c in range(left, right + 1):
                    if (r, c) != anchor:
                        skip_cells.add((r, c))

        html_parts = ["<table border='1'>"]
        has_data = False

        for row_idx in range(layout.min_row, layout.max_row + 1):
            row_parts = ["<tr>"]

            for col_idx in range(layout.min_col, layout.max_col + 1):
                if (row_idx, col_idx) in skip_cells:
                    continue

                cell = ws.cell(row=row_idx, column=col_idx)

                # An override takes precedence over the cell's own value.
                cell_value = ""
                if (row_idx, col_idx) in merged_value_override:
                    cell_value = str(merged_value_override[(row_idx, col_idx)]).strip()
                elif cell.value is not None:
                    cell_value = str(cell.value).strip()
                if cell_value:
                    has_data = True

                cell_value = _escape_html(cell_value)

                # The first layout row is treated as the header row.
                tag = "th" if row_idx == layout.min_row else "td"

                attrs = []
                if (row_idx, col_idx) in merged_cells_info:
                    rowspan, colspan = merged_cells_info[(row_idx, col_idx)]
                    if rowspan > 1:
                        attrs.append(f"rowspan='{rowspan}'")
                    if colspan > 1:
                        attrs.append(f"colspan='{colspan}'")

                attr_str = " " + " ".join(attrs) if attrs else ""
                row_parts.append(f"<{tag}{attr_str}>{cell_value}</{tag}>")

            row_parts.append("</tr>")

            # Rows are always emitted, even when empty.
            html_parts.append("".join(row_parts))

        html_parts.append("</table>")

        if has_data:
            return "\n".join(html_parts)
        return ""

    except Exception as e:
        logger.warning(f"Error converting sheet to HTML: {e}")
        return ""
294
+
295
+
296
+ def _escape_html(text: str) -> str:
297
+ """
298
+ HTML 특수 문자를 이스케이프합니다.
299
+
300
+ Args:
301
+ text: 원본 텍스트
302
+
303
+ Returns:
304
+ 이스케이프된 텍스트
305
+ """
306
+ if not text:
307
+ return ""
308
+
309
+ text = text.replace("&", "&amp;")
310
+ text = text.replace("<", "&lt;")
311
+ text = text.replace(">", "&gt;")
312
+ text = text.replace("\n", "<br>")
313
+
314
+ return text
315
+
316
+
317
def _is_markdown_separator_row(line: str) -> bool:
    """Return True if every cell of the line is a Markdown header separator (e.g. ``---`` or ``:---:``)."""
    cells = [cell.strip() for cell in line.strip().strip('|').split('|')]
    return all('---' in cell and set(cell) <= {'-', ':'} for cell in cells)


def convert_xlsx_objects_to_tables(ws, layout: Optional[LayoutRange] = None) -> List[str]:
    """
    Detect the individual objects (tables) of an XLSX worksheet and render
    each of them as a table string.

    Objects are obtained from ``object_detect_xlsx`` and rendered one by one
    with ``convert_xlsx_sheet_to_table``. Results that are blank, or that
    consist only of table punctuation and a header separator row, are
    discarded.

    Args:
        ws: openpyxl Worksheet object.
        layout: Region to search (None is passed through to
            ``object_detect_xlsx``).

    Returns:
        One table string per object that holds data, in the order the objects
        were returned by ``object_detect_xlsx``.
    """
    objects = object_detect_xlsx(ws, layout)

    if not objects:
        return []

    tables = []
    for obj_layout in objects:
        table_str = convert_xlsx_sheet_to_table(ws, obj_layout)
        if not table_str or not table_str.strip():
            continue

        # A table has data when some row other than the separator row holds a
        # non-blank cell. Only rows made up entirely of dash cells count as
        # separators, so a data row that merely contains "---" is kept.
        has_data = False
        for line in table_str.strip().split('\n'):
            line = line.strip()
            if not line or _is_markdown_separator_row(line):
                continue
            if any(part.strip() for part in line.split('|')):
                has_data = True
                break

        if has_data:
            tables.append(table_str)

    logger.debug(f"Converted {len(tables)} objects to tables (XLSX)")
    return tables
@@ -0,0 +1,266 @@
1
+ """
2
+ XLSX 텍스트박스 추출 모듈
3
+
4
+ XLSX 파일의 DrawingML에서 텍스트박스 내용을 추출합니다.
5
+ 텍스트박스는 xl/drawings/drawing*.xml에 <xdr:sp> 요소로 저장됩니다.
6
+ """
7
+
8
+ import os
9
+ import zipfile
10
+ import logging
11
+ import xml.etree.ElementTree as ET
12
+ from typing import Dict, List, Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # DrawingML 네임스페이스
17
+ NAMESPACES = {
18
+ 'xdr': 'http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing',
19
+ 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
20
+ 'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
21
+ 'pkg': 'http://schemas.openxmlformats.org/package/2006/relationships',
22
+ 'ss': 'http://schemas.openxmlformats.org/spreadsheetml/2006/main',
23
+ }
24
+
25
+ # 네임스페이스 URI 상수
26
+ NS_XDR = '{http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing}'
27
+ NS_A = '{http://schemas.openxmlformats.org/drawingml/2006/main}'
28
+ NS_R = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}'
29
+ NS_PKG = '{http://schemas.openxmlformats.org/package/2006/relationships}'
30
+ NS_SS = '{http://schemas.openxmlformats.org/spreadsheetml/2006/main}'
31
+
32
+
33
def extract_textboxes_from_xlsx(file_path: str) -> Dict[str, List[str]]:
    """
    Collect the text box contents of an XLSX file, grouped by sheet.

    Every archive member named ``xl/drawings/drawing*.xml`` is parsed with
    ``_parse_drawing_textboxes``. The owning sheet is looked up by the
    drawing's file name; when no sheet is known for a drawing, the key
    ``"Sheet (<drawing file name>)"`` is used instead.

    Args:
        file_path: Path to the XLSX file.

    Returns:
        A dictionary of the form {sheet name: [text box contents]}. Errors are
        logged and whatever was collected up to that point is returned.
    """
    collected: Dict[str, List[str]] = {}

    try:
        with zipfile.ZipFile(file_path, 'r') as archive:
            drawing_owner = _get_sheet_drawing_mapping(archive)
            logger.debug(f"Sheet-Drawing mapping: {drawing_owner}")

            for member in archive.namelist():
                is_drawing = member.startswith('xl/drawings/drawing') and member.endswith('.xml')
                if not is_drawing:
                    continue

                try:
                    found = _parse_drawing_textboxes(archive.read(member))
                    if not found:
                        continue

                    drawing_file = os.path.basename(member)
                    owner = drawing_owner.get(drawing_file, f"Sheet ({drawing_file})")
                    collected.setdefault(owner, []).extend(found)

                    logger.info(f"Extracted {len(found)} textboxes from {member} -> {owner}")

                except Exception as e:
                    # A broken drawing must not stop the remaining ones.
                    logger.warning(f"Error parsing textboxes from {member}: {e}")

            grand_total = sum(len(items) for items in collected.values())
            if grand_total > 0:
                logger.info(f"Total extracted {grand_total} textboxes from XLSX")

    except Exception as e:
        logger.warning(f"Error extracting textboxes from XLSX: {e}")

    return collected
83
+
84
+
85
def _get_sheet_drawing_mapping(zf: zipfile.ZipFile) -> Dict[str, str]:
    """
    Build a mapping from drawing file names to sheet names.

    The mapping is derived from the relationship parts inside the archive:
    ``xl/workbook.xml`` (sheet name per relationship id),
    ``xl/_rels/workbook.xml.rels`` (sheet part per relationship id) and each
    ``xl/worksheets/_rels/sheet*.xml.rels`` (drawings referenced by a sheet).
    Files are matched by base name. A sheet part without a known name is
    represented by its file name.

    Args:
        zf: Open ZipFile of the workbook.

    Returns:
        {drawing file name: sheet name}. Errors are logged at debug level and
        the mapping built so far is returned.
    """
    drawing_to_sheet: Dict[str, str] = {}

    try:
        # 1. workbook.xml: relationship id -> sheet name
        name_by_rid: Dict[str, str] = {}
        if 'xl/workbook.xml' in zf.namelist():
            wb_root = ET.fromstring(zf.read('xl/workbook.xml'))
            for sheet_elem in wb_root.findall(f'.//{NS_SS}sheet'):
                title = sheet_elem.get('name', '')
                rid = sheet_elem.get(f'{NS_R}id', '')
                if title and rid:
                    name_by_rid[rid] = title

        # 2. workbook.xml.rels: relationship id -> sheet part path
        part_by_rid: Dict[str, str] = {}
        if 'xl/_rels/workbook.xml.rels' in zf.namelist():
            wb_rels_root = ET.fromstring(zf.read('xl/_rels/workbook.xml.rels'))
            for rel in wb_rels_root.findall(f'.//{NS_PKG}Relationship'):
                rid = rel.get('Id', '')
                target = rel.get('Target', '')
                if 'worksheets/sheet' in target:
                    part_by_rid[rid] = target

        # 3. sheet part file name (e.g. sheet1.xml) -> sheet name
        name_by_part: Dict[str, str] = {
            os.path.basename(part_by_rid[rid]): title
            for rid, title in name_by_rid.items()
            if rid in part_by_rid
        }

        # 4. each sheet's .rels part: which drawings does the sheet reference?
        for member in zf.namelist():
            is_sheet_rels = (member.startswith('xl/worksheets/_rels/sheet')
                             and member.endswith('.xml.rels'))
            if not is_sheet_rels:
                continue

            try:
                sheet_rels_root = ET.fromstring(zf.read(member))

                # sheet1.xml.rels -> sheet1.xml
                sheet_part = os.path.basename(member).replace('.rels', '')
                owner = name_by_part.get(sheet_part, sheet_part)

                for rel in sheet_rels_root.findall(f'.//{NS_PKG}Relationship'):
                    target = rel.get('Target', '')
                    if 'drawings/drawing' not in target:
                        continue
                    # ../drawings/drawing1.xml -> drawing1.xml
                    drawing_file = os.path.basename(target)
                    drawing_to_sheet[drawing_file] = owner
                    logger.debug(f"Mapped {drawing_file} -> {owner}")

            except Exception as e:
                logger.debug(f"Error parsing sheet rels {member}: {e}")

    except Exception as e:
        logger.debug(f"Error building sheet-drawing mapping: {e}")

    return drawing_to_sheet
157
+
158
+
159
def _parse_drawing_textboxes(drawing_xml: bytes) -> List[str]:
    """
    Extract the text of every shape found in a DrawingML part.

    All ``<xdr:sp>`` elements of the document are passed to
    ``_extract_textbox_content``; shapes that yield no text are ignored.

    Args:
        drawing_xml: Raw bytes of the drawing XML.

    Returns:
        The text box contents, in document order. Errors are logged and the
        items collected so far are returned.
    """
    collected: List[str] = []

    try:
        try:
            root = ET.fromstring(drawing_xml)
        except ET.ParseError:
            # Retry on text decoded with a BOM-stripping codec.
            root = ET.fromstring(drawing_xml.decode('utf-8-sig', errors='ignore'))

        shapes = root.findall(f'.//{NS_XDR}sp')
        logger.debug(f"Found {len(shapes)} shape elements in drawing")

        for shape in shapes:
            content = _extract_textbox_content(shape)
            if not content:
                continue
            collected.append(content)
            logger.debug(f"Extracted textbox: {content[:50]}...")

    except Exception as e:
        logger.warning(f"Error parsing drawing textboxes: {e}")

    return collected
195
+
196
+
197
def _extract_textbox_content(sp_elem) -> Optional[str]:
    """
    Extract the text of a single shape element.

    Expected structure:
        <xdr:sp>
            <xdr:nvSpPr>...</xdr:nvSpPr>
            <xdr:spPr>...</xdr:spPr>
            <xdr:txBody>          <-- direct child of the shape
                <a:p>
                    <a:r><a:t>text</a:t></a:r>
                    <a:br/>
                    <a:fld><a:t>field text</a:t></a:fld>
                </a:p>
            </xdr:txBody>
        </xdr:sp>

    Each paragraph is walked in document order: the text of every ``<a:t>``
    (inside a run or a field) is collected and every ``<a:br/>`` contributes
    a newline. Paragraphs without any ``<a:t>`` text are skipped, and the
    remaining paragraphs are joined with newlines.

    Args:
        sp_elem: Shape XML element.

    Returns:
        The shape's text with surrounding whitespace removed, or None when the
        shape has no text body, no text, or extraction fails.
    """
    try:
        # txBody is looked up as a direct child, not as a descendant.
        tx_body = sp_elem.find(f'{NS_XDR}txBody')
        if tx_body is None:
            return None

        text_tag = f'{NS_A}t'
        break_tag = f'{NS_A}br'

        paragraphs: List[str] = []
        for p_elem in tx_body.findall(f'.//{NS_A}p'):
            pieces: List[str] = []
            has_text = False

            # Document order keeps runs, fields and line breaks interleaved
            # exactly as they appear in the paragraph.
            for node in p_elem.iter():
                if node.tag == text_tag:
                    if node.text:
                        pieces.append(node.text)
                        has_text = True
                elif node.tag == break_tag:
                    pieces.append('\n')

            # A paragraph made only of line breaks carries no content.
            if has_text:
                paragraphs.append(''.join(pieces))

        full_text = '\n'.join(paragraphs).strip()
        return full_text or None

    except Exception as e:
        logger.debug(f"Error extracting textbox content: {e}")
        return None
@@ -0,0 +1,7 @@
1
+ # xgen_doc2chunk/core/processor/html_helper/__init__.py
2
+ """HTML helper module for HTML file processing."""
3
+
4
+ from xgen_doc2chunk.core.processor.html_helper.html_file_converter import HTMLFileConverter
5
+
6
+ __all__ = ['HTMLFileConverter']
7
+