xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,436 @@
1
+ """
2
+ DOCX Chart Extractor
3
+
4
+ Extracts all chart data from DOCX files.
5
+ Parses OOXML DrawingML Chart format (ISO/IEC 29500).
6
+
7
+ Structure:
8
+ - Charts are stored in word/charts/chart*.xml
9
+ - Referenced via relationships in document.xml
10
+ """
11
+ import io
12
+ import logging
13
+ import xml.etree.ElementTree as ET
14
+ import zipfile
15
+ from typing import Any, Dict, List, Optional, Union, BinaryIO
16
+
17
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
18
+
19
logger = logging.getLogger("document-processor")

# OOXML namespaces used for prefixed ElementTree queries
# ('c' = DrawingML chart, 'a' = DrawingML main).
OOXML_NS = {
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}

# Maps the chart-type element name found under <c:plotArea> to a display name.
# Iteration order matters: the first mapped tag found in a plot area decides
# the reported chart type, so new entries are appended at the end.
CHART_TYPE_MAP = {
    'barChart': 'Bar Chart',
    'bar3DChart': '3D Bar Chart',
    'lineChart': 'Line Chart',
    'line3DChart': '3D Line Chart',
    'pieChart': 'Pie Chart',
    'pie3DChart': '3D Pie Chart',
    'doughnutChart': 'Doughnut Chart',
    'areaChart': 'Area Chart',
    'area3DChart': '3D Area Chart',
    'scatterChart': 'Scatter Chart',
    'bubbleChart': 'Bubble Chart',
    'radarChart': 'Radar Chart',
    'surfaceChart': 'Surface Chart',
    'surface3DChart': '3D Surface Chart',
    'stockChart': 'Stock Chart',
    # Previously unmapped: such charts were reported as a generic "Chart"
    # and their series were never read.
    'ofPieChart': 'Pie of Pie/Bar of Pie Chart',
}
45
+
46
+
47
class DOCXChartExtractor(BaseChartExtractor):
    """
    Chart extractor for DOCX files.

    Extracts all charts from DOCX by parsing word/charts/*.xml files.

    All element lookups use the prefixed form (``c:...`` / ``a:...``) with
    ``OOXML_NS``. ElementTree expands the prefix to ``{uri}tag`` itself, so a
    second lookup spelled with the full namespace URI cannot find anything
    the prefixed lookup missed.
    """

    # Name pattern of chart parts inside the DOCX archive.
    _CHART_PART_PREFIX = 'word/charts/chart'
    _CHART_PART_SUFFIX = '.xml'

    # ========================================================================
    # Main Interface
    # ========================================================================

    def extract(self, chart_element: Any) -> ChartData:
        """
        Extract chart data from various input types.

        Args:
            chart_element: One of:
                - bytes: Raw chart XML
                - object with 'blob' attribute: Chart part object

        Returns:
            ChartData with extracted information. An empty ChartData is
            returned for falsy input and for unsupported input types.
        """
        if not chart_element:
            return ChartData()

        # Raw chart XML.
        if isinstance(chart_element, bytes):
            return self._parse_ooxml_chart(chart_element)

        # Chart part object exposing its XML through ``blob``.
        if hasattr(chart_element, 'blob'):
            return self._parse_ooxml_chart(chart_element.blob)

        return ChartData()

    def extract_all_from_file(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> List[ChartData]:
        """
        Extract all charts from a DOCX file.

        Args:
            file_source: File path, bytes, or file-like object

        Returns:
            One ChartData per chart part, ordered by the numeric suffix of
            the part name (chart1, chart2, ..., chart10). Charts that are
            empty or fail to parse still get an (empty) entry so positions
            stay aligned with the chart parts. If the archive itself cannot
            be read, a warning is logged and the charts collected so far
            (possibly none) are returned.
        """
        charts: List[ChartData] = []

        try:
            if isinstance(file_source, bytes):
                archive_source = io.BytesIO(file_source)
            else:
                if not isinstance(file_source, str):
                    # File-like object: rewind before handing it to zipfile.
                    file_source.seek(0)
                archive_source = file_source

            with zipfile.ZipFile(archive_source, 'r') as zf:
                chart_files = sorted(
                    (
                        name for name in zf.namelist()
                        if name.startswith(self._CHART_PART_PREFIX)
                        and name.endswith(self._CHART_PART_SUFFIX)
                    ),
                    key=self._chart_part_sort_key,
                )

                for chart_file in chart_files:
                    try:
                        charts.append(self._parse_ooxml_chart(zf.read(chart_file)))
                    except Exception as e:
                        logger.debug(f"Error parsing chart {chart_file}: {e}")
                        charts.append(ChartData())  # Placeholder for failed chart

            logger.debug(f"Extracted {len(charts)} charts from DOCX file")

        except Exception as e:
            logger.warning(f"Error extracting charts from DOCX: {e}")

        return charts

    @classmethod
    def _chart_part_sort_key(cls, name: str) -> tuple:
        """Sort key that orders chart parts by numeric suffix, then by name.

        A plain string sort puts 'chart10.xml' before 'chart2.xml'; parts
        whose suffix is not a number are placed after the numbered ones.
        """
        stem = name[len(cls._CHART_PART_PREFIX):-len(cls._CHART_PART_SUFFIX)]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    def process_all_from_file(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> List[str]:
        """
        Extract and format all charts from a DOCX file.

        Args:
            file_source: File path, bytes, or file-like object

        Returns:
            List of formatted chart strings; charts whose formatted text is
            empty are omitted.
        """
        formatted_charts = (
            self._format_chart_data(chart_data)
            for chart_data in self.extract_all_from_file(file_source)
        )
        return [text for text in formatted_charts if text]

    # ========================================================================
    # OOXML Chart Parsing
    # ========================================================================

    def _parse_ooxml_chart(self, chart_xml: bytes) -> ChartData:
        """Parse OOXML chart XML into ChartData.

        Returns an empty ChartData when the XML cannot be parsed, contains no
        chart element, or any step raises (the error is logged at debug level).
        """
        try:
            root = self._parse_xml(chart_xml)
            if root is None:
                return ChartData()

            chart_elem = self._find_chart_element(root)
            if chart_elem is None:
                return ChartData()

            title = self._extract_title(chart_elem)
            chart_type, categories, series = self._extract_plot_data(chart_elem)

            return ChartData(
                chart_type=chart_type,
                title=title,
                # Empty lists are passed as None.
                categories=categories if categories else None,
                series=series if series else None
            )

        except Exception as e:
            logger.debug(f"Error parsing OOXML chart: {e}")
            return ChartData()

    def _parse_xml(self, chart_xml: bytes) -> Optional[ET.Element]:
        """Parse XML with BOM and encoding handling.

        The raw bytes are tried first; on a parse error the data is decoded
        as UTF-8 (BOM stripped, undecodable bytes dropped) and parsed again.
        Returns None if the second attempt fails as well.
        """
        try:
            return ET.fromstring(chart_xml)
        except ET.ParseError:
            pass

        try:
            return ET.fromstring(chart_xml.decode('utf-8-sig', errors='ignore'))
        except Exception:
            # Best-effort fallback; unlike a bare ``except`` this lets
            # KeyboardInterrupt/SystemExit propagate.
            return None

    def _find_chart_element(self, root: ET.Element) -> Optional[ET.Element]:
        """Find the chart element in the XML tree.

        Returns the first ``c:chart`` descendant, or ``root`` itself when the
        root is a chart element, or None.
        """
        chart_elem = root.find('.//c:chart', OOXML_NS)
        if chart_elem is not None:
            return chart_elem

        if root.tag.endswith('}chart') or root.tag == 'chart':
            return root

        return None

    def _extract_title(self, chart_elem: ET.Element) -> Optional[str]:
        """Extract chart title.

        Chart titles in DOCX may be split across multiple <a:t> text elements
        (text runs), so all of them are collected and concatenated.

        Returns:
            The stripped title text, or None when no title text is found.
        """
        # NOTE(review): these are descendant searches, so when the chart has
        # no title of its own a title nested deeper (e.g. under an axis)
        # would be picked up - confirm whether that is intended.

        # Primary path: the rich-text body of the first title.
        rich = chart_elem.find('.//c:title//c:tx//c:rich', OOXML_NS)
        if rich is not None:
            title = self._join_text_runs(rich.findall('.//a:t', OOXML_NS))
            if title is not None:
                return title

        # Fallback: any text run anywhere under a title element.
        return self._join_text_runs(chart_elem.findall('.//c:title//a:t', OOXML_NS))

    @staticmethod
    def _join_text_runs(text_elements: List[ET.Element]) -> Optional[str]:
        """Concatenate the non-empty texts of the elements; None if there are none."""
        parts = [elem.text for elem in text_elements if elem.text]
        if not parts:
            return None
        return ''.join(parts).strip()

    def _extract_plot_data(self, chart_elem: ET.Element) -> tuple:
        """Extract chart type, categories, and series.

        Returns:
            ``(type_name, categories, series)``. The first chart-type tag of
            CHART_TYPE_MAP (in map order) found under the plot area decides
            the result; ``("Chart", [], [])`` is returned when there is no
            plot area or no mapped chart type.
        """
        plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)
        if plot_area is None:
            return "Chart", [], []

        for chart_tag, type_name in CHART_TYPE_MAP.items():
            elem = plot_area.find(f'.//c:{chart_tag}', OOXML_NS)
            if elem is not None:
                categories, series = self._extract_series_data(elem)
                return type_name, categories, series

        return "Chart", [], []

    def _extract_series_data(self, chart_type_elem: ET.Element) -> tuple:
        """Extract categories and series data.

        Returns:
            ``(categories, series)`` where ``series`` is a list of
            ``{'name': ..., 'values': [...]}`` dicts. Series without values
            are skipped. Categories come from the first series that provides
            any, so a leading series without category labels no longer
            leaves the chart without categories.
        """
        categories: List[str] = []
        series = []

        for idx, ser_elem in enumerate(chart_type_elem.findall('.//c:ser', OOXML_NS)):
            if not categories:
                categories = self._extract_categories(ser_elem)

            values = self._extract_values(ser_elem)
            if values:
                series.append({
                    'name': self._extract_series_name(ser_elem, idx),
                    'values': values,
                })

        return categories, series

    def _extract_series_name(self, ser_elem: ET.Element, idx: int) -> str:
        """Extract series name.

        Falls back to ``"Series <idx + 1>"`` when no name text is found.
        """
        name_paths = (
            './/c:tx//c:v',
            './/c:tx//c:strRef//c:strCache//c:pt//c:v',
        )
        for path in name_paths:
            node = ser_elem.find(path, OOXML_NS)
            if node is not None and node.text:
                return node.text.strip()

        return f"Series {idx + 1}"

    def _extract_categories(self, ser_elem: ET.Element) -> List[str]:
        """Extract category labels.

        Reads the series' ``c:cat`` element, preferring its string cache and
        falling back to its numeric cache. Labels are always returned as
        strings; an empty list means no labels were found.
        """
        cat_elem = ser_elem.find('.//c:cat', OOXML_NS)
        if cat_elem is None:
            return []

        for cache_tag in ('strCache', 'numCache'):
            cache = cat_elem.find(f'.//c:{cache_tag}', OOXML_NS)
            if cache is None:
                continue
            labels = self._extract_point_values(cache, as_string=True)
            if labels:
                return labels

        return []

    def _extract_values(self, ser_elem: ET.Element) -> List[Any]:
        """Extract series values.

        Reads the numeric cache under ``c:val``; if that yields nothing, the
        one under ``c:yVal`` (scatter/bubble charts) is tried.
        """
        for container_tag in ('val', 'yVal'):
            container = ser_elem.find(f'.//c:{container_tag}', OOXML_NS)
            if container is None:
                continue
            num_cache = container.find('.//c:numCache', OOXML_NS)
            if num_cache is None:
                continue
            values = self._extract_point_values(num_cache, as_string=False)
            if values:
                return values

        return []

    @staticmethod
    def _point_index(pt: ET.Element) -> int:
        """Return the point's ``idx`` attribute as an int (0 if missing or malformed)."""
        try:
            return int(pt.get('idx', 0))
        except (TypeError, ValueError):
            # A malformed index must not discard the whole chart.
            return 0

    def _extract_point_values(self, cache_elem: ET.Element, as_string: bool = False) -> List[Any]:
        """Extract values from cache element.

        Args:
            cache_elem: Cache element containing ``c:pt`` points.
            as_string: When True, values are returned as stripped text.
                Otherwise each value is converted to float, and text that is
                not a valid float is kept as a string.

        Returns:
            Values ordered by the points' ``idx`` attribute; points without
            value text are skipped.
        """
        values: List[Any] = []

        for pt in sorted(cache_elem.findall('.//c:pt', OOXML_NS), key=self._point_index):
            v_elem = pt.find('c:v', OOXML_NS)
            if v_elem is None or not v_elem.text:
                continue

            text = v_elem.text.strip()
            if as_string:
                values.append(text)
                continue

            try:
                values.append(float(text))
            except ValueError:
                values.append(text)

        return values

    # ========================================================================
    # Formatting
    # ========================================================================

    def _format_chart_data(self, chart_data: ChartData) -> str:
        """Format ChartData using ChartProcessor.

        Charts with data are rendered in full; charts without data use the
        processor's fallback rendering of chart type and title.
        """
        if chart_data.has_data():
            return self._chart_processor.format_chart_data(
                chart_type=chart_data.chart_type,
                title=chart_data.title,
                categories=chart_data.categories,
                series=chart_data.series
            )
        return self._chart_processor.format_chart_fallback(
            chart_type=chart_data.chart_type,
            title=chart_data.title
        )
434
+
435
+
436
# Public API of this module.
__all__ = ["DOCXChartExtractor"]
@@ -0,0 +1,75 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_constants.py
2
+ """
3
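+ DOCX constants and type definitions
4
+
5
+ Defines the constants, Enum, and dataclass needed for DOCX document processing.
6
+ - ElementType: document element type (text, image, table, etc.)
7
+ - DocxElement: dataclass for a document element
8
+ - NAMESPACES: OOXML namespaces
9
+ - CHART_TYPE_MAP: chart type mapping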
10
+ """
11
+ from dataclasses import dataclass
12
+ from enum import Enum
13
+
14
+
15
+ # === Document element type definitions ===
16
+
17
class ElementType(Enum):
    """Kinds of element that can appear in a document."""

    # Textual content
    TEXT = "text"

    # Embedded visual / structured objects
    IMAGE = "image"
    TABLE = "table"
    CHART = "chart"
    DIAGRAM = "diagram"

    # Structural marker
    PAGE_BREAK = "page_break"
25
+
26
+
27
@dataclass
class DocxElement:
    """Data class representing a single element within a document."""

    # Which kind of element this is.
    element_type: ElementType
    # The element's content as a string.
    content: str
    # Order of the element within the document.
    element_index: int
33
+
34
+
35
+ # === OOXML namespaces ===
36
+
37
# Prefix -> namespace URI, for prefixed XML element lookups.
NAMESPACES = {
    # WordprocessingML
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    # DrawingML
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    # Relationships
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    # Charts and diagrams
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    'dgm': 'http://schemas.openxmlformats.org/drawingml/2006/diagram',
    # Markup compatibility and Word 2010 shapes
    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
}

# OOXML chart type mapping: chart element name -> display name.
# The display names are runtime strings (Korean) and are kept as-is.
CHART_TYPE_MAP = {
    # Bar
    'barChart': '막대 차트',
    'bar3DChart': '3D 막대 차트',
    # Line
    'lineChart': '선 차트',
    'line3DChart': '3D 선 차트',
    # Pie / doughnut
    'pieChart': '파이 차트',
    'pie3DChart': '3D 파이 차트',
    'doughnutChart': '도넛 차트',
    # Area
    'areaChart': '영역 차트',
    'area3DChart': '3D 영역 차트',
    # Scatter / radar / bubble / stock
    'scatterChart': '분산형 차트',
    'radarChart': '방사형 차트',
    'bubbleChart': '거품형 차트',
    'stockChart': '주식형 차트',
    # Surface
    'surfaceChart': '표면 차트',
    'surface3DChart': '3D 표면 차트',
    # Pie-of-pie / bar-of-pie
    'ofPieChart': '분리형 파이 차트',
}
68
+
69
+
70
# Public API of this module.
__all__ = ['ElementType', 'DocxElement', 'NAMESPACES', 'CHART_TYPE_MAP']
@@ -0,0 +1,76 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py
2
+ """
3
+ DOCXFileConverter - DOCX file format converter
4
+
5
+ Converts binary DOCX data to python-docx Document object.
6
+ """
7
+ from io import BytesIO
8
+ from typing import Any, Optional, BinaryIO
9
+ import zipfile
10
+
11
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
12
+
13
+
14
class DOCXFileConverter(BaseFileConverter):
    """
    File converter that opens DOCX data with python-docx.

    Turns raw DOCX bytes (or an already-open stream) into a Document object.
    """

    # Signature that starts a ZIP archive; a DOCX file is a ZIP container.
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Open DOCX content as a python-docx Document.

        Args:
            file_data: Raw binary DOCX data; used only when no stream is given
            file_stream: Optional stream to read from instead of ``file_data``;
                it is rewound to the start before reading
            **kwargs: Additional options (not used here)

        Returns:
            docx.Document object

        Raises:
            Any error raised by ``docx.Document`` while opening the data
            propagates to the caller.
        """
        # Imported lazily so python-docx is only required when converting.
        from docx import Document

        if file_stream is None:
            source = BytesIO(file_data)
        else:
            source = file_stream
        source.seek(0)
        return Document(source)

    def get_format_name(self) -> str:
        """Return the human-readable name of the handled format."""
        return "DOCX Document"

    def validate(self, file_data: bytes) -> bool:
        """
        Check whether the data looks like a DOCX package.

        The data must start with the ZIP signature, be readable as a ZIP
        archive, and contain a ``[Content_Types].xml`` entry.

        Args:
            file_data: Raw binary file data

        Returns:
            True if all checks pass, False otherwise
        """
        has_zip_header = (
            bool(file_data)
            and len(file_data) >= 4
            and file_data[:4] == self.ZIP_MAGIC
        )
        if not has_zip_header:
            return False

        try:
            with zipfile.ZipFile(BytesIO(file_data), 'r') as archive:
                members = archive.namelist()
        except zipfile.BadZipFile:
            return False

        return '[Content_Types].xml' in members
76
+