xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,332 @@
1
+ """
2
+ PPT 목록(Bullet/Numbering) 처리 모듈
3
+
4
+ 포함 함수:
5
+ - extract_text_with_bullets(): TextFrame에서 목록 기호 포함 텍스트 추출
6
+ - extract_bullet_info(): Paragraph에서 목록 정보 추출
7
+ - convert_special_font_char(): 특수 폰트 문자 변환
8
+
9
+ 지원하는 목록 스타일:
10
+ - Bullet: •, ○, ■, □, ✓, ➢ 등 모든 Unicode 문자
11
+ - Numbering: 1., I., i., A., a., (1), 1) 등
12
+ """
13
+ import logging
14
+ from typing import Any, Dict
15
+
16
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import WINGDINGS_MAPPING, WINGDINGS_CHAR_MAPPING, SYMBOL_MAPPING
17
+
18
+ logger = logging.getLogger("document-processor")
19
+
20
+
21
def extract_text_with_bullets(text_frame) -> str:
    """
    Extract the text of a TextFrame, prefixing list paragraphs with their marker.

    Each non-empty paragraph becomes one output line, indented by two spaces
    per indent level. Bulleted paragraphs are prefixed with their bullet
    character and numbered paragraphs with a formatted running number
    (1., I., i., A., a., (1), 1) ...). Empty paragraphs are kept as empty lines.

    Args:
        text_frame: The text_frame object of a shape.

    Returns:
        The paragraphs joined with newlines, including list markers. If
        anything fails while walking the paragraphs, the plain stripped
        ``text_frame.text`` is returned instead.
    """
    if not text_frame:
        return ""

    result_lines = []
    numbering_state = {}  # running counter per indent level

    try:
        for paragraph in text_frame.paragraphs:
            para_text = paragraph.text.strip()

            if not para_text:
                # Blank paragraphs are preserved and do not affect numbering.
                result_lines.append("")
                continue

            # Indent level of the paragraph; objects without one count as 0.
            level = getattr(paragraph, 'level', 0)
            indent = "  " * level  # two spaces per level

            bullet_info = extract_bullet_info(paragraph)

            if bullet_info['type'] == 'numbered':
                current_num = _get_or_increment_number(numbering_state, level, bullet_info)
                formatted_num = _format_number(current_num, bullet_info['format'])
                result_lines.append(f"{indent}{formatted_num} {para_text}")

            elif bullet_info['type'] == 'bulleted':
                result_lines.append(f"{indent}{bullet_info['char']} {para_text}")

            else:
                # Plain text ends any running numbered list.
                numbering_state.clear()
                # For level 0 the indent is empty, so one expression covers both cases.
                result_lines.append(f"{indent}{para_text}")

    except Exception as e:
        logger.warning(f"Error extracting text with bullets: {e}")
        # Fallback: return the raw text without list markers.
        return text_frame.text.strip() if text_frame.text else ""

    return "\n".join(result_lines)
87
+
88
+
89
def extract_bullet_info(paragraph) -> Dict[str, Any]:
    """
    Read the list (bullet / numbering) settings of a paragraph.

    The paragraph's ``pPr`` element is inspected for ``buNone``, ``buFont``,
    ``buChar`` and ``buAutoNum`` children, in that order of precedence. When a
    bullet font is declared, the bullet character is passed through
    convert_special_font_char() before being returned.

    Args:
        paragraph: Paragraph object exposing ``_element.pPr``.

    Returns:
        A dict with the keys
            'type':     'none' | 'bulleted' | 'numbered'
            'char':     bullet character (set when type is 'bulleted')
            'format':   numbering format name (set when type is 'numbered')
            'start_at': first number of the list (defaults to 1)
        Any error while reading the XML is logged at debug level and the
        dict is returned in whatever state it had reached.
    """
    info = {'type': 'none', 'char': None, 'format': None, 'start_at': 1}

    try:
        props = paragraph._element.pPr
        if props is None:
            return info

        nsmap = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}

        # An explicit buNone switches the list marker off.
        if props.find('.//a:buNone', namespaces=nsmap) is not None:
            return info

        # Bullet font, if declared; its lower-cased typeface drives glyph conversion.
        font_node = props.find('.//a:buFont', namespaces=nsmap)
        typeface = font_node.get('typeface', '').lower() if font_node is not None else None

        # Character bullet.
        char_node = props.find('.//a:buChar', namespaces=nsmap)
        if char_node is not None:
            info['type'] = 'bulleted'
            glyph = char_node.get('char', '•')
            info['char'] = convert_special_font_char(glyph, typeface) if typeface else glyph
            return info

        # Automatic numbering.
        autonum_node = props.find('.//a:buAutoNum', namespaces=nsmap)
        if autonum_node is not None:
            info['type'] = 'numbered'
            info['format'] = autonum_node.get('type', 'arabicPeriod')
            info['start_at'] = int(autonum_node.get('startAt', '1'))
            return info

        # A bullet font without a bullet character: fall back to the default bullet.
        if font_node is not None:
            info['type'] = 'bulleted'
            info['char'] = '•'
            return info

    except Exception as e:
        logger.debug(f"Error extracting bullet info: {e}")

    return info
166
+
167
+
168
def convert_special_font_char(char: str, font_typeface: str) -> str:
    """
    Convert a glyph of a symbol font (Wingdings, Symbol, ...) to plain Unicode.

    Font names are matched case-insensitively. Microsoft Office frequently
    stores symbol-font glyphs in the Private Use Area (U+F020-U+F0FF, i.e.
    0xF000 + the font's byte code); such characters are folded back to their
    byte code before the mapping tables are consulted.

    Args:
        char: The raw bullet character.
        font_typeface: Name of the bullet font.

    Returns:
        The mapped Unicode character. Unmapped Wingdings glyphs, any Webdings
        glyph, empty input and internal errors yield '•'; unmapped Symbol
        glyphs and characters of ordinary fonts are returned unchanged.
    """
    if not char:
        return '•'

    try:
        font_name = font_typeface.lower()
        char_code = ord(char[0])

        # Fold Private Use Area code points back onto the font's own byte code.
        if 0xF020 <= char_code <= 0xF0FF:
            char_code -= 0xF000
            lookup_char = chr(char_code)
        else:
            lookup_char = char

        if 'wingdings' in font_name:
            # Character-keyed table first (most specific), raw form before folded form.
            for candidate in (char, lookup_char):
                if candidate in WINGDINGS_CHAR_MAPPING:
                    return WINGDINGS_CHAR_MAPPING[candidate]

            # Then the table keyed by character code.
            if char_code in WINGDINGS_MAPPING:
                return WINGDINGS_MAPPING[char_code]

            # Log unmapped glyphs so the tables can be extended.
            logger.debug(f"Unmapped Wingdings char: '{char}' (code: {char_code}, hex: 0x{char_code:02X})")
            return '•'

        if 'symbol' in font_name:
            if char_code in SYMBOL_MAPPING:
                return SYMBOL_MAPPING[char_code]
            return char

        if 'webdings' in font_name:
            # No Webdings table yet; use the default bullet.
            return '•'

        # Ordinary fonts: the character is already meaningful Unicode.
        return char

    except Exception as e:
        logger.debug(f"Error converting special font char: {e}")
        return '•'
216
+
217
+
218
+ def _get_or_increment_number(numbering_state: Dict, level: int, bullet_info: Dict) -> int:
219
+ """
220
+ 레벨별 번호를 추적하고 증가시킵니다.
221
+
222
+ Args:
223
+ numbering_state: 레벨별 번호 상태 딕셔너리
224
+ level: 현재 들여쓰기 레벨
225
+ bullet_info: 목록 정보
226
+
227
+ Returns:
228
+ 현재 번호
229
+ """
230
+ # 새로운 번호 시퀀스 시작
231
+ if level not in numbering_state:
232
+ numbering_state[level] = bullet_info['start_at']
233
+ else:
234
+ numbering_state[level] += 1
235
+
236
+ # 하위 레벨 초기화
237
+ for l in list(numbering_state.keys()):
238
+ if l > level:
239
+ del numbering_state[l]
240
+
241
+ return numbering_state[level]
242
+
243
+
244
+ def _format_number(num: int, format_type: str) -> str:
245
+ """
246
+ 번호를 지정된 포맷으로 변환합니다.
247
+
248
+ 지원 포맷:
249
+ - arabicPeriod: 1.
250
+ - arabicParenR: 1)
251
+ - arabicParenBoth: (1)
252
+ - romanUcPeriod: I.
253
+ - romanLcPeriod: i.
254
+ - alphaUcPeriod: A.
255
+ - alphaLcPeriod: a.
256
+ - alphaUcParenR: A)
257
+ - alphaLcParenR: a)
258
+ 등등...
259
+
260
+ Args:
261
+ num: 번호
262
+ format_type: 포맷 타입 문자열
263
+
264
+ Returns:
265
+ 포맷팅된 번호 문자열
266
+ """
267
+ # 번호 변환
268
+ if 'roman' in format_type.lower():
269
+ num_str = _to_roman(num)
270
+ if 'Lc' in format_type: # 소문자
271
+ num_str = num_str.lower()
272
+ elif 'alpha' in format_type.lower():
273
+ num_str = _to_alpha(num)
274
+ if 'Lc' in format_type: # 소문자
275
+ num_str = num_str.lower()
276
+ else:
277
+ num_str = str(num)
278
+
279
+ # 구분자 추가
280
+ if 'Period' in format_type:
281
+ return f"{num_str}."
282
+ elif 'ParenBoth' in format_type:
283
+ return f"({num_str})"
284
+ elif 'ParenR' in format_type:
285
+ return f"{num_str})"
286
+ elif 'ParenL' in format_type:
287
+ return f"({num_str}"
288
+ elif 'Plain' in format_type:
289
+ return num_str
290
+ else:
291
+ # 기본값
292
+ return f"{num_str}."
293
+
294
+
295
+ def _to_roman(num: int) -> str:
296
+ """
297
+ 숫자를 로마 숫자로 변환합니다.
298
+
299
+ Args:
300
+ num: 1-3999 범위의 정수
301
+
302
+ Returns:
303
+ 로마 숫자 문자열 (예: 1→I, 4→IV, 9→IX)
304
+ """
305
+ val_map = [
306
+ (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
307
+ (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
308
+ (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I')
309
+ ]
310
+ result = []
311
+ for value, letter in val_map:
312
+ count, num = divmod(num, value)
313
+ result.append(letter * count)
314
+ return ''.join(result)
315
+
316
+
317
+ def _to_alpha(num: int) -> str:
318
+ """
319
+ 숫자를 알파벳으로 변환합니다.
320
+
321
+ Args:
322
+ num: 양의 정수
323
+
324
+ Returns:
325
+ 알파벳 문자열 (예: 1→A, 2→B, 26→Z, 27→AA)
326
+ """
327
+ result = []
328
+ while num > 0:
329
+ num -= 1
330
+ result.append(chr(65 + (num % 26)))
331
+ num //= 26
332
+ return ''.join(reversed(result))
@@ -0,0 +1,182 @@
1
+ """
2
+ PPT Chart Extractor
3
+
4
+ Extracts chart data from PowerPoint files (PPTX).
5
+ Uses python-pptx Presentation and Chart objects.
6
+
7
+ Provides:
8
+ - extract(): Single chart extraction from python-pptx Chart object
9
+ - extract_all_from_file(): Extract all charts from PPTX file in slide order
10
+ """
11
+ import io
12
+ import logging
13
+ from typing import Any, BinaryIO, Dict, List, Optional, Union
14
+
15
+ from pptx import Presentation
16
+
17
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
18
+
19
+ logger = logging.getLogger("document-processor")
20
+
21
+
22
class PPTChartExtractor(BaseChartExtractor):
    """
    Chart extractor for PowerPoint files.

    Supports:
    - Direct python-pptx Chart object extraction
    - Full file extraction via extract_all_from_file()
    """

    # ========================================================================
    # Main Interface
    # ========================================================================

    def extract(self, chart_element: Any) -> ChartData:
        """
        Extract chart data from python-pptx Chart object.

        Args:
            chart_element: python-pptx Chart object

        Returns:
            ChartData with extracted information; an empty ChartData when
            chart_element is falsy.
        """
        if not chart_element:
            return ChartData()

        title = self._extract_title(chart_element)
        chart_type = self._extract_chart_type(chart_element)
        categories = self._extract_categories(chart_element)
        series = self._extract_series(chart_element)

        return ChartData(
            chart_type=chart_type,
            title=title,
            categories=categories,
            series=series
        )

    def extract_all_from_file(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> List[ChartData]:
        """
        Extract all charts from a PowerPoint file in slide order.

        Charts inside group shapes are included at any nesting depth. A chart
        that cannot be read is logged and skipped without discarding the
        charts already collected.

        Args:
            file_source: File path, bytes, or file-like object

        Returns:
            List of ChartData for all charts in the file. If the file itself
            cannot be opened, the error is logged and the list collected so
            far (possibly empty) is returned.
        """
        charts: List[ChartData] = []

        try:
            # Prepare file-like object
            if isinstance(file_source, str):
                with open(file_source, 'rb') as f:
                    file_obj = io.BytesIO(f.read())
            elif isinstance(file_source, (bytes, bytearray)):
                file_obj = io.BytesIO(file_source)
            else:
                file_source.seek(0)
                file_obj = file_source

            prs = Presentation(file_obj)

            # Iterate slides in order
            for slide in prs.slides:
                for shape in self._iter_chart_shapes(slide.shapes):
                    try:
                        charts.append(self.extract(shape.chart))
                    except Exception as e:
                        # One broken chart must not abort the whole file.
                        logger.warning(f"Skipping unreadable chart in PowerPoint: {e}")

            logger.info(f"Extracted {len(charts)} charts from PowerPoint file")

        except Exception as e:
            logger.error(f"Error extracting charts from PowerPoint: {e}")

        return charts

    # ========================================================================
    # Private Methods
    # ========================================================================

    def _iter_chart_shapes(self, shapes):
        """Yield every shape holding a chart, descending into nested group shapes."""
        for shape in shapes:
            if getattr(shape, 'has_chart', False):
                yield shape
            # Group shapes expose their children through a `shapes` collection.
            if hasattr(shape, 'shapes'):
                yield from self._iter_chart_shapes(shape.shapes)

    def _extract_title(self, chart) -> Optional[str]:
        """Return the stripped chart title, or None when absent or unreadable."""
        try:
            if chart.has_title and chart.chart_title:
                if chart.chart_title.has_text_frame:
                    title_text = chart.chart_title.text_frame.text
                    if title_text:
                        return title_text.strip()
        except Exception as e:
            logger.debug(f"Could not read chart title: {e}")
        return None

    def _extract_chart_type(self, chart) -> str:
        """Return a readable chart type name, or "Chart" when it cannot be determined."""
        try:
            if hasattr(chart, 'chart_type'):
                type_str = str(chart.chart_type)
                # Keep the last dotted component up to the first space, then prettify.
                type_name = type_str.split('.')[-1].split(' ')[0]
                return type_name.replace('_', ' ').title()
        except Exception as e:
            logger.debug(f"Could not read chart type: {e}")
        return "Chart"

    def _extract_categories(self, chart) -> List[str]:
        """Return the category labels of the first plot that has any."""
        categories = []
        try:
            if hasattr(chart, 'plots') and chart.plots:
                for plot in chart.plots:
                    if hasattr(plot, 'categories') and plot.categories:
                        categories = [str(c) for c in plot.categories]
                        break
        except Exception as e:
            logger.debug(f"Could not read chart categories: {e}")
        return categories

    def _extract_series(self, chart) -> List[Dict[str, Any]]:
        """Return one {'name', 'values'} dict per series; values stay empty if unreadable."""
        series_data = []
        try:
            for idx, series in enumerate(chart.series):
                series_info = {
                    'name': self._get_series_name(series, idx),
                    'values': []
                }

                try:
                    if hasattr(series, 'values') and series.values:
                        series_info['values'] = list(series.values)
                except Exception as e:
                    logger.debug(f"Could not read values of series {idx + 1}: {e}")

                series_data.append(series_info)
        except Exception as e:
            logger.debug(f"Could not read chart series: {e}")

        return series_data

    def _get_series_name(self, series, idx: int) -> str:
        """Return the series name, or "Series N" (1-based) when it has none."""
        try:
            if hasattr(series, 'name') and series.name:
                return str(series.name)
        except Exception as e:
            logger.debug(f"Could not read name of series {idx + 1}: {e}")
        return f"Series {idx + 1}"
180
+
181
+
182
+ __all__ = ['PPTChartExtractor']
@@ -0,0 +1,119 @@
1
+ """
2
+ PPT 상수 및 타입 정의 모듈
3
+
4
+ 포함 내용:
5
+ - Wingdings/Symbol 폰트 매핑 테이블
6
+ - ElementType Enum
7
+ - SlideElement dataclass
8
+ """
9
+ from dataclasses import dataclass
10
+ from enum import Enum
11
+ from typing import Tuple
12
+
13
+
14
+ # === Wingdings/Symbol 폰트 매핑 테이블 ===
15
+ # PPT에서 특수 폰트(Wingdings, Symbol 등)로 표시되는 목록 기호를
16
+ # 올바른 Unicode 문자로 변환하기 위한 매핑 테이블
17
+
18
# Maps a Wingdings character code to a plain Unicode stand-in.
# NOTE(review): only the check/cross codes (0xFB-0xFE) and 0xA7 were corrected
# here; the remaining entries are kept as they were and have not been checked
# against the Wingdings code chart.
WINGDINGS_MAPPING = {
    # Basic shapes
    0x6C: '●',  # 'l' -> filled circle
    0x6D: '○',  # 'm' -> empty circle
    0x6E: '■',  # 'n' -> filled square
    0x6F: '□',  # 'o' -> empty square
    0x70: '◆',  # 'p' -> filled diamond
    0x71: '◇',  # 'q' -> empty diamond
    0x75: '◆',  # 'u' -> diamond
    0x76: '❖',  # 'v' -> diamond variant

    # Check marks / X marks
    0xFB: '✗',  # 'û' -> ballot X
    0xFC: '✓',  # 'ü' -> check mark
    0xFD: '☒',  # 'ý' -> ballot box with X
    0xFE: '☑',  # 'þ' -> ballot box with check

    # Arrows
    0xD8: '➢',  # 'Ø' -> 3D arrowhead (most commonly used)
    0xE0: '➢',  # right arrow
    0xE1: '⬅',  # left arrow
    0xE2: '⬆',  # up arrow
    0xE3: '⬇',  # down arrow
    0xE4: '⬌',  # bidirectional arrow
    0xE8: '➢',  # arrow ('è')
    0xE9: '➣',  # arrow variant
    0xEA: '➤',  # triangular arrow
    0xF0: '➢',  # arrow
    0xD0: '➢',  # arrow

    # Pointing fingers
    0x46: '☞',  # 'F' -> right-pointing finger
    0x47: '☜',  # 'G' -> left-pointing finger

    # Stars / special symbols
    0xAB: '★',  # filled star
    0xAC: '☆',  # empty star
    0xA7: '■',  # '§' -> square, consistent with WINGDINGS_CHAR_MAPPING

    # Circled digits
    0x31: '①',  # '1'
    0x32: '②',  # '2'
    0x33: '③',  # '3'
    0x34: '④',  # '4'
    0x35: '⑤',  # '5'
    0x36: '⑥',  # '6'
    0x37: '⑦',  # '7'
    0x38: '⑧',  # '8'
    0x39: '⑨',  # '9'
    0x30: '⓪',  # '0'
}
69
+
70
+ # 특정 문자에서 Unicode로 직접 매핑 (문자 기반)
71
# Maps a bullet character, as it appears in the file when a Wingdings bullet
# font is declared, directly to a plain Unicode stand-in (character-keyed).
WINGDINGS_CHAR_MAPPING = {
    '§': '■',  # section sign -> filled square
    'Ø': '➢',  # 3D arrowhead (most commonly used)
    'ü': '✓',  # check mark
    'u': '◆',  # diamond
    'n': '■',  # filled square
    'l': '●',  # filled circle
    'o': '□',  # empty square
    'q': '◇',  # empty diamond
    'v': '❖',  # diamond variant
    'F': '☞',  # right-pointing finger
    'ð': '➢',  # arrow
    'Ð': '➢',  # arrow
    'à': '➢',  # arrow
    'è': '➢',  # arrow (0xE8)
    'ê': '➤',  # triangular arrow
}
88
+
89
# Maps a character code used with a Symbol bullet font to a Unicode stand-in.
# NOTE(review): 0xD7 and 0xF7 look like Latin-1 code points rather than
# positions in the Symbol font's own layout - confirm against the font chart.
SYMBOL_MAPPING = {
    0xB7: '•',  # Bullet
    0xD7: '×',  # Multiplication
    0xF7: '÷',  # Division
    0xA5: '∞',  # Infinity
    0xB1: '±',  # Plus-minus
}
96
+
97
+
98
+ # === 슬라이드 요소 타입 정의 ===
99
+
100
class ElementType(Enum):
    """Kind of element found on a slide."""
    TEXT = "text"
    IMAGE = "image"
    TABLE = "table"
    CHART = "chart"
106
+
107
+
108
@dataclass
class SlideElement:
    """One element of a slide together with its placement."""
    element_type: ElementType
    content: str
    position: Tuple[int, int, int, int]  # (left, top, width, height) in EMU
    shape_id: int

    @property
    def sort_key(self) -> Tuple[int, int]:
        """Ordering key (top, left): top to bottom, then left to right."""
        top = self.position[1]
        left = self.position[0]
        return (top, left)
@@ -0,0 +1,55 @@
1
+ # xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py
2
+ """
3
+ PPTFileConverter - PPT/PPTX file format converter
4
+
5
+ Converts binary PPT/PPTX data to python-pptx Presentation object.
6
+ """
7
+ from io import BytesIO
8
+ from typing import Any, Optional, BinaryIO
9
+
10
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
11
+
12
+
13
class PPTFileConverter(BaseFileConverter):
    """
    File converter that turns PPT/PPTX binary data into a python-pptx
    Presentation object.
    """

    # Leading bytes of a ZIP archive (a PPTX file is a ZIP container).
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Open the presentation from a stream or from raw bytes.

        Args:
            file_data: Raw binary PPT/PPTX data; used only when no stream is given.
            file_stream: Optional file stream; takes precedence over file_data.
            **kwargs: Additional options (not used here).

        Returns:
            pptx.Presentation object
        """
        from pptx import Presentation

        if file_stream is None:
            source = BytesIO(file_data)
        else:
            source = file_stream
        # Read from the beginning regardless of the stream's current position.
        source.seek(0)
        return Presentation(source)

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "PPT/PPTX Presentation"

    def validate(self, file_data: bytes) -> bool:
        """Return True when the data is at least 4 bytes long and starts with the ZIP magic number."""
        long_enough = bool(file_data) and len(file_data) >= 4
        return long_enough and file_data[:4] == self.ZIP_MAGIC
55
+