xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Excel XLSX 테이블 변환 모듈
|
|
3
|
+
|
|
4
|
+
XLSX 시트를 Markdown 또는 HTML 테이블로 변환합니다.
|
|
5
|
+
병합셀이 있으면 HTML, 없으면 Markdown을 사용합니다.
|
|
6
|
+
layout_detect_range를 통해 실제 데이터 영역만 추출합니다.
|
|
7
|
+
object_detect를 통해 개별 객체(테이블)별로 청킹할 수 있습니다.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Optional, List
|
|
12
|
+
from xgen_doc2chunk.core.processor.excel_helper.excel_layout_detector import layout_detect_range_xlsx, object_detect_xlsx, LayoutRange
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("document-processor")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def has_merged_cells_xlsx(ws, layout: Optional[LayoutRange] = None) -> bool:
    """
    Report whether the worksheet contains merged cells.

    When ``layout`` is given, only merged ranges that overlap that rectangle
    count; otherwise any merged range on the sheet is enough.

    Args:
        ws: Worksheet object exposing ``merged_cells.ranges``.
        layout: Rectangle to restrict the check to (``None`` means the whole
            sheet). Its ``min_row``/``max_row``/``min_col``/``max_col``
            attributes are read.

    Returns:
        True if a relevant merged range exists; False otherwise, including
        when any error occurs while inspecting the worksheet.
    """
    try:
        merged = ws.merged_cells.ranges
        if len(merged) == 0:
            return False

        # Without a layout, the mere presence of a merged range is enough.
        if layout is None:
            return True

        # Two rectangles overlap unless one lies entirely beyond the other
        # on the row axis or on the column axis.
        return any(
            not (
                rng.min_row > layout.max_row
                or rng.max_row < layout.min_row
                or rng.min_col > layout.max_col
                or rng.max_col < layout.min_col
            )
            for rng in merged
        )
    except Exception:
        return False
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def convert_xlsx_sheet_to_table(ws, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert a worksheet region to a table string.

    The region is rendered as HTML when it overlaps a merged range and as
    Markdown otherwise. If ``layout`` is ``None`` the region is detected
    with ``layout_detect_range_xlsx``.

    Args:
        ws: Worksheet object to convert.
        layout: Region to convert (``None`` triggers auto-detection).

    Returns:
        The rendered table, or an empty string when no region is detected.
    """
    region = layout if layout is not None else layout_detect_range_xlsx(ws)
    if region is None:
        logger.debug("No data found in worksheet")
        return ""

    if not has_merged_cells_xlsx(ws, region):
        logger.debug("No merged cells in XLSX, using Markdown format")
        return convert_xlsx_sheet_to_markdown(ws, region)

    logger.debug("Merged cells detected in XLSX, using HTML format")
    return convert_xlsx_sheet_to_html(ws, region)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def convert_xlsx_sheet_to_markdown(ws, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert a worksheet region to a Markdown table.

    Rows whose cells are all empty after stripping are omitted. The first
    emitted row is followed by a ``---`` separator line, so it acts as the
    header. Pipes in cell text are escaped, and every line break in a cell
    (``\\r\\n``, ``\\r`` or ``\\n``) is replaced by a space so that one
    worksheet row always stays on one Markdown line.

    Args:
        ws: Worksheet object exposing ``cell(row=, column=)`` and
            ``merged_cells.ranges``.
        layout: Region to convert (``None`` triggers auto-detection via
            ``layout_detect_range_xlsx``).

    Returns:
        The Markdown table, or an empty string if there is no data or an
        error occurs (the error is logged as a warning).
    """
    try:
        if layout is None:
            layout = layout_detect_range_xlsx(ws)
            if layout is None:
                return ""

        # A merged range can overlap the layout while its anchor (top-left)
        # cell lies outside it. In that case show the anchor's value in the
        # first cell of the range that falls inside the layout.
        merged_value_override = {}  # (row, col) -> value
        for merged_range in ws.merged_cells.ranges:
            mr_min_row, mr_min_col = merged_range.min_row, merged_range.min_col
            mr_max_row, mr_max_col = merged_range.max_row, merged_range.max_col

            overlaps_layout = (
                mr_min_row <= layout.max_row
                and mr_max_row >= layout.min_row
                and mr_min_col <= layout.max_col
                and mr_max_col >= layout.min_col
            )
            if not overlaps_layout:
                continue

            start_in_layout = (
                layout.min_row <= mr_min_row <= layout.max_row
                and layout.min_col <= mr_min_col <= layout.max_col
            )
            if start_in_layout:
                continue

            merged_value = ws.cell(row=mr_min_row, column=mr_min_col).value
            if merged_value is not None:
                first_row_in_layout = max(mr_min_row, layout.min_row)
                first_col_in_layout = max(mr_min_col, layout.min_col)
                merged_value_override[(first_row_in_layout, first_col_in_layout)] = merged_value

        md_parts = []
        for row_idx in range(layout.min_row, layout.max_row + 1):
            cells = []
            row_has_content = False

            for col_idx in range(layout.min_col, layout.max_col + 1):
                cell = ws.cell(row=row_idx, column=col_idx)

                # Override values are never None, so None means "no override".
                raw_value = merged_value_override.get((row_idx, col_idx))
                if raw_value is None:
                    raw_value = cell.value

                cell_value = "" if raw_value is None else str(raw_value).strip()
                if cell_value:
                    row_has_content = True

                # Pipes delimit Markdown cells and must be escaped.
                cell_value = cell_value.replace("|", "\\|")
                # Flatten every kind of line break; "\r\n" goes first so it
                # becomes a single space rather than two.
                for line_break in ("\r\n", "\r", "\n"):
                    cell_value = cell_value.replace(line_break, " ")
                cells.append(cell_value)

            if not row_has_content:
                continue

            md_parts.append("| " + " | ".join(cells) + " |")

            # Insert the header separator right after the first emitted row.
            if len(md_parts) == 1:
                md_parts.append("| " + " | ".join(["---"] * len(cells)) + " |")

        return "\n".join(md_parts)

    except Exception as e:
        logger.warning(f"Error converting sheet to Markdown: {e}")
        return ""
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def convert_xlsx_sheet_to_html(ws, layout: Optional[LayoutRange] = None) -> str:
    """
    Convert a worksheet region to an HTML table with rowspan/colspan support.

    Every merged range that overlaps the layout is clipped to the layout.
    The first cell of the clipped area carries the span attributes and the
    remaining cells of that area are not emitted, so each row keeps a
    consistent width even when a merged range extends past the layout or
    starts outside it. If the range's real anchor (top-left) cell lies
    outside the layout, its value is shown in that first visible cell.

    Empty rows are kept because they are part of the table structure. The
    first layout row is rendered with ``<th>`` cells, all others with
    ``<td>``.

    Args:
        ws: Worksheet object exposing ``cell(row=, column=)`` and
            ``merged_cells.ranges``.
        layout: Region to convert (``None`` triggers auto-detection via
            ``layout_detect_range_xlsx``).

    Returns:
        The HTML table, or an empty string if the region holds no non-empty
        value or an error occurs (the error is logged as a warning).
    """
    try:
        if layout is None:
            layout = layout_detect_range_xlsx(ws)
            if layout is None:
                return ""

        merged_cells_info = {}      # first visible cell (row, col) -> (rowspan, colspan)
        skip_cells = set()          # cells covered by a span, not emitted
        merged_value_override = {}  # (row, col) -> value taken from an anchor outside the layout

        for merged_range in ws.merged_cells.ranges:
            mr_min_row, mr_min_col = merged_range.min_row, merged_range.min_col
            mr_max_row, mr_max_col = merged_range.max_row, merged_range.max_col

            # Ignore merged ranges that do not touch the layout at all.
            if (mr_min_row > layout.max_row or mr_max_row < layout.min_row or
                    mr_min_col > layout.max_col or mr_max_col < layout.min_col):
                continue

            # Clip the merged range to the layout so spans never exceed the
            # emitted table.
            top = max(mr_min_row, layout.min_row)
            left = max(mr_min_col, layout.min_col)
            bottom = min(mr_max_row, layout.max_row)
            right = min(mr_max_col, layout.max_col)
            first_visible = (top, left)

            merged_cells_info[first_visible] = (bottom - top + 1, right - left + 1)

            # When the real anchor is outside the layout, the first visible
            # cell holds no value of its own; borrow the anchor's value.
            if first_visible != (mr_min_row, mr_min_col):
                merged_value = ws.cell(row=mr_min_row, column=mr_min_col).value
                if merged_value is not None:
                    merged_value_override[first_visible] = merged_value

            for r in range(top, bottom + 1):
                for c in range(left, right + 1):
                    if (r, c) != first_visible:
                        skip_cells.add((r, c))

        html_parts = ["<table border='1'>"]
        has_data = False

        for row_idx in range(layout.min_row, layout.max_row + 1):
            row_parts = ["<tr>"]

            for col_idx in range(layout.min_col, layout.max_col + 1):
                if (row_idx, col_idx) in skip_cells:
                    continue

                cell = ws.cell(row=row_idx, column=col_idx)

                # Override values are never None, so None means "no override".
                raw_value = merged_value_override.get((row_idx, col_idx))
                if raw_value is None:
                    raw_value = cell.value

                cell_value = "" if raw_value is None else str(raw_value).strip()
                if cell_value:
                    has_data = True

                cell_value = _escape_html(cell_value)

                # The first layout row is treated as the header row.
                tag = "th" if row_idx == layout.min_row else "td"

                attrs = []
                rowspan, colspan = merged_cells_info.get((row_idx, col_idx), (1, 1))
                if rowspan > 1:
                    attrs.append(f"rowspan='{rowspan}'")
                if colspan > 1:
                    attrs.append(f"colspan='{colspan}'")

                attr_str = " " + " ".join(attrs) if attrs else ""
                row_parts.append(f"<{tag}{attr_str}>{cell_value}</{tag}>")

            row_parts.append("</tr>")

            # Keep every row, including empty ones: they are part of the
            # table structure that the spans refer to.
            html_parts.append("".join(row_parts))

        html_parts.append("</table>")

        return "\n".join(html_parts) if has_data else ""

    except Exception as e:
        logger.warning(f"Error converting sheet to HTML: {e}")
        return ""
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def _escape_html(text: str) -> str:
    """
    Escape HTML special characters in cell text.

    ``&``, ``<`` and ``>`` are replaced by their character references and
    newlines become ``<br>`` so multi-line cell text keeps its line breaks.

    Args:
        text: Raw text.

    Returns:
        The escaped text; an empty string for empty or ``None`` input.
    """
    if not text:
        return ""

    # "&" must be handled first, otherwise the ampersands introduced by the
    # "<" and ">" replacements would be escaped a second time.
    escaped = text.replace("&", "&amp;")
    escaped = escaped.replace("<", "&lt;")
    escaped = escaped.replace(">", "&gt;")

    # Line breaks are converted after escaping so the "<br>" tags survive.
    return escaped.replace("\n", "<br>")
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def convert_xlsx_objects_to_tables(ws, layout: Optional[LayoutRange] = None) -> List[str]:
    """
    Detect individual objects (tables) in a worksheet and convert each one.

    Regions are obtained from ``object_detect_xlsx`` and each is rendered
    with ``convert_xlsx_sheet_to_table``. Renderings that contain no actual
    value (only whitespace, pipes or a Markdown separator line) are dropped.

    Args:
        ws: Worksheet object to scan.
        layout: Region to search in (passed through to ``object_detect_xlsx``).

    Returns:
        Table strings, one per kept object, in the order the objects were
        returned by ``object_detect_xlsx``.
    """
    objects = object_detect_xlsx(ws, layout)

    if not objects:
        return []

    # A Markdown separator line consists solely of these characters. Data
    # rows that merely contain "---" inside a value must still count as data.
    separator_chars = set("|-: ")

    tables = []
    for obj_layout in objects:
        table_str = convert_xlsx_sheet_to_table(ws, obj_layout)
        if not table_str or not table_str.strip():
            continue

        has_data = False
        for raw_line in table_str.strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            if '---' in line and set(line) <= separator_chars:
                continue
            # "| col1 | col2 |" -> is there any non-blank piece between pipes?
            if any(part.strip() for part in line.split('|')):
                has_data = True
                break

        if has_data:
            tables.append(table_str)

    logger.debug(f"Converted {len(tables)} objects to tables (XLSX)")
    return tables
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
"""
|
|
2
|
+
XLSX 텍스트박스 추출 모듈
|
|
3
|
+
|
|
4
|
+
XLSX 파일의 DrawingML에서 텍스트박스 내용을 추출합니다.
|
|
5
|
+
텍스트박스는 xl/drawings/drawing*.xml에 <xdr:sp> 요소로 저장됩니다.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
import zipfile
|
|
10
|
+
import logging
|
|
11
|
+
import xml.etree.ElementTree as ET
|
|
12
|
+
from typing import Dict, List, Optional
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# XML namespace prefixes used in XLSX parts, mapped to their URIs
NAMESPACES = dict(
    xdr='http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing',
    a='http://schemas.openxmlformats.org/drawingml/2006/main',
    r='http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    pkg='http://schemas.openxmlformats.org/package/2006/relationships',
    ss='http://schemas.openxmlformats.org/spreadsheetml/2006/main',
)

# The same URIs in "{uri}" form, ready to prefix ElementTree tag/attribute names
NS_XDR = f"{{{NAMESPACES['xdr']}}}"
NS_A = f"{{{NAMESPACES['a']}}}"
NS_R = f"{{{NAMESPACES['r']}}}"
NS_PKG = f"{{{NAMESPACES['pkg']}}}"
NS_SS = f"{{{NAMESPACES['ss']}}}"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def extract_textboxes_from_xlsx(file_path: str) -> Dict[str, List[str]]:
    """
    Extract text box contents from an XLSX file, grouped by sheet.

    Every ``xl/drawings/drawing*.xml`` member of the archive is parsed with
    ``_parse_drawing_textboxes``. The owning sheet name is looked up in the
    mapping built by ``_get_sheet_drawing_mapping``; drawings missing from
    that mapping are filed under ``"Sheet (<drawing file>)"``.

    Args:
        file_path: Path to the XLSX file.

    Returns:
        Dictionary of ``{sheet name: [text box contents]}``. Errors are
        logged as warnings and whatever was collected so far is returned.
    """
    textboxes_by_sheet: Dict[str, List[str]] = {}

    try:
        with zipfile.ZipFile(file_path, 'r') as archive:
            # Build the drawing-file -> sheet-name lookup first.
            sheet_drawing_map = _get_sheet_drawing_mapping(archive)
            logger.debug(f"Sheet-Drawing mapping: {sheet_drawing_map}")

            for member in archive.namelist():
                is_drawing = member.startswith('xl/drawings/drawing') and member.endswith('.xml')
                if not is_drawing:
                    continue

                # A broken drawing part must not abort the other drawings.
                try:
                    found = _parse_drawing_textboxes(archive.read(member))
                    if not found:
                        continue

                    drawing_file = os.path.basename(member)
                    owner = sheet_drawing_map.get(drawing_file, f"Sheet ({drawing_file})")
                    textboxes_by_sheet.setdefault(owner, []).extend(found)

                    logger.info(f"Extracted {len(found)} textboxes from {member} -> {owner}")

                except Exception as e:
                    logger.warning(f"Error parsing textboxes from {member}: {e}")

            total_textboxes = sum(map(len, textboxes_by_sheet.values()))
            if total_textboxes > 0:
                logger.info(f"Total extracted {total_textboxes} textboxes from XLSX")

    except Exception as e:
        logger.warning(f"Error extracting textboxes from XLSX: {e}")

    return textboxes_by_sheet
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _get_sheet_drawing_mapping(zf: zipfile.ZipFile) -> Dict[str, str]:
    """
    Build a mapping from drawing file names to sheet names.

    The mapping is resolved from the relationship parts inside the archive:
    ``xl/workbook.xml`` (relationship id -> sheet name),
    ``xl/_rels/workbook.xml.rels`` (relationship id -> sheet file) and each
    ``xl/worksheets/_rels/sheet*.xml.rels`` (sheet file -> drawing file).
    If a sheet file has no known name, its file name is used instead.

    Args:
        zf: Open ZipFile of the XLSX archive.

    Returns:
        ``{drawing file name: sheet name}``. Errors are logged at debug
        level and the mapping built so far is returned.
    """
    drawing_to_sheet: Dict[str, str] = {}

    try:
        members = zf.namelist()

        # 1. workbook.xml: relationship id -> sheet name
        name_by_rid: Dict[str, str] = {}
        if 'xl/workbook.xml' in members:
            wb_root = ET.fromstring(zf.read('xl/workbook.xml'))
            for sheet_elem in wb_root.findall(f'.//{NS_SS}sheet'):
                title = sheet_elem.get('name', '')
                rid = sheet_elem.get(f'{NS_R}id', '')
                if title and rid:
                    name_by_rid[rid] = title

        # 2. workbook.xml.rels: relationship id -> worksheet file path
        file_by_rid: Dict[str, str] = {}
        if 'xl/_rels/workbook.xml.rels' in members:
            wb_rels_root = ET.fromstring(zf.read('xl/_rels/workbook.xml.rels'))
            for rel_elem in wb_rels_root.findall(f'.//{NS_PKG}Relationship'):
                rid = rel_elem.get('Id', '')
                target = rel_elem.get('Target', '')
                if 'worksheets/sheet' in target:
                    file_by_rid[rid] = target

        # 3. worksheet file base name (e.g. sheet1.xml) -> sheet name
        sheet_file_to_name: Dict[str, str] = {
            os.path.basename(file_by_rid[rid]): title
            for rid, title in name_by_rid.items()
            if rid in file_by_rid
        }

        # 4. Each sheet*.xml.rels: find the drawings that the sheet references
        for member in members:
            is_sheet_rels = (member.startswith('xl/worksheets/_rels/sheet')
                             and member.endswith('.xml.rels'))
            if not is_sheet_rels:
                continue

            try:
                sheet_rels_root = ET.fromstring(zf.read(member))

                # sheet1.xml.rels -> sheet1.xml
                sheet_file = os.path.basename(member).replace('.rels', '')
                sheet_name = sheet_file_to_name.get(sheet_file, sheet_file)

                for rel_elem in sheet_rels_root.findall(f'.//{NS_PKG}Relationship'):
                    target = rel_elem.get('Target', '')
                    if 'drawings/drawing' not in target:
                        continue
                    # ../drawings/drawing1.xml -> drawing1.xml
                    drawing_file = os.path.basename(target)
                    drawing_to_sheet[drawing_file] = sheet_name
                    logger.debug(f"Mapped {drawing_file} -> {sheet_name}")

            except Exception as e:
                logger.debug(f"Error parsing sheet rels {member}: {e}")

    except Exception as e:
        logger.debug(f"Error building sheet-drawing mapping: {e}")

    return drawing_to_sheet
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _parse_drawing_textboxes(drawing_xml: bytes) -> List[str]:
    """
    Extract the text of every shape in a drawing XML part.

    Args:
        drawing_xml: Raw bytes of a drawing XML part.

    Returns:
        The non-empty text contents, one entry per ``<xdr:sp>`` shape that
        yields text. Errors are logged as warnings and whatever was
        collected so far is returned.
    """
    collected: List[str] = []

    try:
        try:
            root = ET.fromstring(drawing_xml)
        except ET.ParseError:
            # Retry on decoded text with a leading BOM removed.
            root = ET.fromstring(drawing_xml.decode('utf-8-sig', errors='ignore'))

        # Search the whole document for shape elements (<xdr:sp>).
        shapes = root.findall(f'.//{NS_XDR}sp')
        logger.debug(f"Found {len(shapes)} shape elements in drawing")

        for shape in shapes:
            content = _extract_textbox_content(shape)
            if not content:
                continue
            collected.append(content)
            logger.debug(f"Extracted textbox: {content[:50]}...")

    except Exception as e:
        logger.warning(f"Error parsing drawing textboxes: {e}")

    return collected
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _extract_textbox_content(sp_elem) -> Optional[str]:
    """
    Extract the text of a single shape element.

    Expected structure of a text box in an XLSX drawing:
        <xdr:sp>
            <xdr:nvSpPr>...</xdr:nvSpPr>
            <xdr:spPr>...</xdr:spPr>
            <xdr:txBody>        <-- direct child of the shape (not ".//")
                <a:p>
                    <a:r>
                        <a:t>text</a:t>
                    </a:r>
                </a:p>
            </xdr:txBody>
        </xdr:sp>

    The texts of a paragraph are concatenated without a separator, and
    paragraphs are joined with newlines.

    Args:
        sp_elem: Shape XML element.

    Returns:
        The stripped text, or ``None`` if the shape has no ``txBody``, no
        text, or an error occurs (logged at debug level).
    """
    try:
        # txBody is looked up as a direct child in the xdr namespace.
        body = sp_elem.find(f'{NS_XDR}txBody')
        if body is None:
            return None

        paragraph_texts: List[str] = []

        for para in body.findall(f'.//{NS_A}p'):
            # Preferred source: the <a:t> that is a direct child of each run.
            run_text_nodes = [run.find(f'{NS_A}t') for run in para.findall(f'.//{NS_A}r')]
            pieces = [node.text for node in run_text_nodes
                      if node is not None and node.text]

            # Fallback: paragraphs whose runs gave no text may still hold
            # <a:t> elements elsewhere.
            if not pieces:
                pieces = [node.text for node in para.findall(f'.//{NS_A}t') if node.text]

            if pieces:
                paragraph_texts.append(''.join(pieces))

        if not paragraph_texts:
            return None

        # Paragraphs are separated by newlines.
        joined = '\n'.join(paragraph_texts).strip()
        return joined if joined else None

    except Exception as e:
        logger.debug(f"Error extracting textbox content: {e}")
        return None
|