xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,498 @@
1
+ """
2
+ Excel Chart Extractor
3
+
4
+ Extracts chart data from Excel files (XLSX/XLS).
5
+ Parses OOXML DrawingML Chart format (ISO/IEC 29500).
6
+
7
+ Handles:
8
+ - XLSX: Chart XML in xl/charts/*.xml
9
+ - Chart info dictionaries (pre-parsed)
10
+ """
11
+ import io
12
+ import logging
13
+ import xml.etree.ElementTree as ET
14
+ import zipfile
15
+ from typing import Any, Dict, List, Optional, Union, BinaryIO
16
+
17
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
18
+
19
+ logger = logging.getLogger("document-processor")
20
+
21
+ # OOXML namespaces
22
# Prefix -> namespace URI table handed to ElementTree find()/findall() so that
# paths can be written with the short 'c:' (chart) and 'a:' (main) prefixes.
OOXML_NS = dict(
    c='http://schemas.openxmlformats.org/drawingml/2006/chart',
    a='http://schemas.openxmlformats.org/drawingml/2006/main',
)
26
+
27
+ # Chart type mapping (OOXML tag -> display name)
28
# OOXML chart-type tag (local name, no namespace) -> human-readable label.
# Entry order is significant: _extract_plot_data walks this mapping in order
# and reports the first tag it finds inside the plot area.
CHART_TYPE_MAP = dict((
    ('barChart', 'Bar Chart'),
    ('bar3DChart', '3D Bar Chart'),
    ('lineChart', 'Line Chart'),
    ('line3DChart', '3D Line Chart'),
    ('pieChart', 'Pie Chart'),
    ('pie3DChart', '3D Pie Chart'),
    ('doughnutChart', 'Doughnut Chart'),
    ('areaChart', 'Area Chart'),
    ('area3DChart', '3D Area Chart'),
    ('scatterChart', 'Scatter Chart'),
    ('bubbleChart', 'Bubble Chart'),
    ('radarChart', 'Radar Chart'),
    ('surfaceChart', 'Surface Chart'),
    ('surface3DChart', '3D Surface Chart'),
    ('stockChart', 'Stock Chart'),
    ('ofPieChart', 'Pie of Pie Chart'),
))
46
+
47
+
48
+ class ExcelChartExtractor(BaseChartExtractor):
49
+ """
50
+ Chart extractor for Excel files (XLSX/XLS).
51
+
52
+ Supports:
53
+ - Direct chart XML bytes parsing
54
+ - Pre-parsed chart info dictionaries
55
+ - Full file extraction (via extract_all_from_file)
56
+ """
57
+
58
+ # ========================================================================
59
+ # Main Interface
60
+ # ========================================================================
61
+
62
def extract(self, chart_element: Any) -> ChartData:
    """
    Build a ChartData from one of the supported chart inputs.

    Args:
        chart_element: One of:
            - bytes: raw chart XML, parsed via _parse_ooxml_chart
            - dict: pre-parsed chart info, converted via _from_dict
            - any object exposing a 'blob' attribute, whose blob is
              parsed as chart XML

    Returns:
        The extracted ChartData; an empty ChartData() when the input is
        falsy or of an unsupported type.
    """
    # Falsy input (None, b"", {}) has nothing to parse.
    if not chart_element:
        return ChartData()

    # The checks run in the same order as before: bytes, dict, then blob.
    if isinstance(chart_element, bytes):
        xml_payload = chart_element
    elif isinstance(chart_element, dict):
        return self._from_dict(chart_element)
    elif hasattr(chart_element, 'blob'):
        xml_payload = chart_element.blob
    else:
        return ChartData()

    return self._parse_ooxml_chart(xml_payload)
91
+
92
def extract_all_from_file(
    self,
    file_source: Union[str, bytes, BinaryIO]
) -> List[ChartData]:
    """
    Extract all charts from an Excel (XLSX) archive.

    Chart parts are the archive members named 'xl/charts/chart*.xml'.
    They are visited in numeric order of their trailing number
    (chart2.xml before chart10.xml); a plain string sort would put
    chart10.xml ahead of chart2.xml.

    Args:
        file_source: File path (str or os.PathLike), raw bytes /
            bytearray, or a seekable binary file-like object. A
            file-like object is rewound with seek(0) before reading and
            is not closed by this method's own code.

    Returns:
        List of ChartData for every chart part whose has_data() is true.
        Any failure opening or reading the archive is logged as a
        warning and yields the charts collected so far (possibly []).
    """
    # Local imports: the module's top-level import block does not provide these.
    import os
    import re

    def chart_order(member_name: str):
        # Sort by the number before '.xml'; the name itself breaks ties and
        # orders members that carry no number.
        match = re.search(r'(\d+)\.xml$', member_name)
        return (int(match.group(1)) if match else 0, member_name)

    charts = []

    try:
        # Normalise the input into something zipfile.ZipFile can open.
        if isinstance(file_source, (str, os.PathLike)):
            archive_source = file_source
        elif isinstance(file_source, (bytes, bytearray)):
            archive_source = io.BytesIO(file_source)
        else:
            file_source.seek(0)
            archive_source = file_source

        with zipfile.ZipFile(archive_source, 'r') as zf:
            chart_names = [
                name for name in zf.namelist()
                if name.startswith('xl/charts/chart') and name.endswith('.xml')
            ]
            for name in sorted(chart_names, key=chart_order):
                # One broken chart part must not prevent the others
                # from being extracted.
                try:
                    chart_data = self._parse_ooxml_chart(zf.read(name))
                    if chart_data.has_data():
                        charts.append(chart_data)
                except Exception as e:
                    logger.debug(f"Error parsing chart {name}: {e}")

        logger.debug(f"Extracted {len(charts)} charts from Excel file")

    except Exception as e:
        # Best effort: a non-ZIP or unreadable source yields no charts.
        logger.warning(f"Error extracting charts from Excel: {e}")

    return charts
137
+
138
def process_all_from_file(
    self,
    file_source: Union[str, bytes, BinaryIO]
) -> List[str]:
    """
    Extract every chart from an Excel file and render each one as text.

    Args:
        file_source: File path, bytes, or file-like object; passed
            unchanged to extract_all_from_file.

    Returns:
        The non-empty strings produced by _format_chart_data, in the
        order the charts were extracted.
    """
    extracted = self.extract_all_from_file(file_source)
    rendered = (self._format_chart_data(chart) for chart in extracted)
    # Drop charts whose formatted output is empty/falsy.
    return [text for text in rendered if text]
159
+
160
+ # ========================================================================
161
+ # OOXML Chart Parsing
162
+ # ========================================================================
163
+
164
def _parse_ooxml_chart(self, chart_xml: bytes) -> ChartData:
    """
    Turn raw DrawingML chart XML into a ChartData.

    Args:
        chart_xml: Raw chart XML bytes

    Returns:
        ChartData carrying chart type, title, categories and series.
        An empty ChartData() is returned when the XML cannot be parsed,
        contains no chart element, or any extraction step raises.
    """
    try:
        root = self._parse_xml(chart_xml)
        chart_elem = None if root is None else self._find_chart_element(root)
        if chart_elem is None:
            return ChartData()

        title = self._extract_title(chart_elem)
        chart_type, categories, series = self._extract_plot_data(chart_elem)

        # Empty lists are passed on as None rather than [].
        return ChartData(
            chart_type=chart_type,
            title=title,
            categories=categories or None,
            series=series or None
        )

    except Exception as e:
        # Best effort: a malformed chart is logged and reported as empty.
        logger.debug(f"Error parsing OOXML chart: {e}")
        return ChartData()
201
+
202
+ def _parse_xml(self, chart_xml: bytes) -> Optional[ET.Element]:
203
+ """Parse XML with BOM and encoding handling."""
204
+ try:
205
+ return ET.fromstring(chart_xml)
206
+ except ET.ParseError:
207
+ try:
208
+ # Try removing BOM or invalid characters
209
+ chart_str = chart_xml.decode('utf-8-sig', errors='ignore')
210
+ return ET.fromstring(chart_str)
211
+ except:
212
+ return None
213
+
214
def _find_chart_element(self, root: ET.Element) -> Optional[ET.Element]:
    """
    Locate the chart element inside a parsed chart document.

    Returns the first descendant chart element in the DrawingML chart
    namespace, else the root itself when its tag is 'chart' (with or
    without a namespace), else None.
    """
    qualified_path = './/{http://schemas.openxmlformats.org/drawingml/2006/chart}chart'

    # Prefixed lookup first, then the fully qualified form.
    match = root.find('.//c:chart', OOXML_NS)
    if match is None:
        match = root.find(qualified_path)
    if match is not None:
        return match

    # No descendant matched: the document root may be the chart itself.
    root_is_chart = root.tag.endswith('}chart') or root.tag == 'chart'
    return root if root_is_chart else None
231
+
232
def _extract_title(self, chart_elem: ET.Element) -> Optional[str]:
    """
    Extract the chart's own title text.

    Only the c:title that is a direct child of the chart element is
    considered. A descendant search ('.//c:title' or './/c:tx') would
    also match titles nested deeper in the tree (for example on axes
    inside the plot area) and report them as the chart title when the
    chart has none of its own.

    Args:
        chart_elem: The chart element returned by _find_chart_element

    Returns:
        The stripped title text, or None when the chart has no title
        element or the title carries no text.
    """
    title_elem = chart_elem.find('c:title', OOXML_NS)
    if title_elem is None:
        return None

    # Rich-text title: c:title/c:tx/c:rich/a:p/.../a:t. A title can be
    # split over several runs, so join every run of a paragraph instead
    # of returning only the first one.
    paragraphs = []
    for para in title_elem.findall('.//c:tx//c:rich//a:p', OOXML_NS):
        para_text = ''.join(
            run.text for run in para.findall('.//a:t', OOXML_NS) if run.text
        )
        if para_text.strip():
            paragraphs.append(para_text.strip())
    if paragraphs:
        return ' '.join(paragraphs)

    # Title taken from a referenced cell: use the cached string value.
    cached = title_elem.find(
        './/c:tx//c:strRef//c:strCache//c:pt//c:v', OOXML_NS
    )
    if cached is not None and cached.text and cached.text.strip():
        return cached.text.strip()

    return None
250
+
251
def _extract_plot_data(self, chart_elem: ET.Element) -> tuple:
    """
    Determine the chart type and pull its categories and series.

    The plot area is searched for each tag of CHART_TYPE_MAP in mapping
    order; the first tag present decides the reported type and is the
    only element whose series are read.

    Returns:
        Tuple of (chart_type, categories, series). Falls back to
        ("Chart", [], []) when there is no plot area or no known
        chart-type element inside it.
    """
    ns_c = OOXML_NS['c']

    plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)
    if plot_area is None:
        plot_area = chart_elem.find(f'.//{{{ns_c}}}plotArea')

    if plot_area is not None:
        for chart_tag, type_name in CHART_TYPE_MAP.items():
            type_elem = plot_area.find(f'.//c:{chart_tag}', OOXML_NS)
            if type_elem is None:
                type_elem = plot_area.find(f'.//{{{ns_c}}}{chart_tag}')
            if type_elem is None:
                continue
            categories, series = self._extract_series_data(type_elem)
            return type_name, categories, series

    return "Chart", [], []
276
+
277
def _extract_series_data(self, chart_type_elem: ET.Element) -> tuple:
    """
    Collect category labels and per-series values below a chart-type element.

    Args:
        chart_type_elem: Chart type XML element (barChart, lineChart, etc.)

    Returns:
        Tuple of (categories, series). Categories come from the first
        series element only. Each series entry is a dict with 'name'
        and 'values'; series without any values are left out.
    """
    ns_c = OOXML_NS['c']

    ser_nodes = chart_type_elem.findall('.//c:ser', OOXML_NS)
    if not ser_nodes:
        ser_nodes = chart_type_elem.findall(f'.//{{{ns_c}}}ser')

    categories = []
    series = []
    for position, ser_node in enumerate(ser_nodes):
        name = self._extract_series_name(ser_node, position)

        # Only the first series is consulted for category labels.
        if position == 0:
            categories = self._extract_categories(ser_node)

        values = self._extract_values(ser_node)
        if values:
            series.append({'name': name, 'values': values})

    return categories, series
315
+
316
def _extract_series_name(self, ser_elem: ET.Element, idx: int) -> str:
    """
    Return the display name of a series.

    Args:
        ser_elem: The series (c:ser) element
        idx: Zero-based position of the series, used for the fallback name

    Returns:
        The stripped text of the first matching value element, or
        "Series <idx + 1>" when no non-empty name is found.
    """
    ns_c = OOXML_NS['c']

    # (prefixed path, fully qualified equivalent), tried in this order.
    lookups = (
        (
            './/c:tx//c:v',
            f'.//{{{ns_c}}}tx//{{{ns_c}}}v',
        ),
        (
            './/c:tx//c:strRef//c:strCache//c:pt//c:v',
            f'.//{{{ns_c}}}tx//{{{ns_c}}}strRef//{{{ns_c}}}strCache//{{{ns_c}}}pt//{{{ns_c}}}v',
        ),
    )

    for prefixed_path, qualified_path in lookups:
        node = ser_elem.find(prefixed_path, OOXML_NS)
        if node is None:
            node = ser_elem.find(qualified_path)
        if node is not None and node.text:
            return node.text.strip()

    return f"Series {idx + 1}"
335
+
336
def _extract_categories(self, ser_elem: ET.Element) -> List[str]:
    """
    Read the category labels attached to a series.

    The string cache below the category element is preferred; the
    numeric cache is used only when the string cache is missing or
    yields no labels. All labels are returned as strings.

    Returns:
        List of labels; empty when the series has no category element
        or neither cache holds a value.
    """
    ns_c = OOXML_NS['c']

    cat_elem = ser_elem.find('.//c:cat', OOXML_NS)
    if cat_elem is None:
        cat_elem = ser_elem.find(f'.//{{{ns_c}}}cat')
    if cat_elem is None:
        return []

    categories = []
    # Text labels first, numeric labels as the fallback.
    for cache_tag in ('strCache', 'numCache'):
        cache = cat_elem.find(f'.//c:{cache_tag}', OOXML_NS)
        if cache is None:
            cache = cat_elem.find(f'.//{{{ns_c}}}{cache_tag}')
        if cache is not None:
            categories = self._extract_point_values(cache, as_string=True)
        if categories:
            break

    return categories
367
+
368
def _extract_values(self, ser_elem: ET.Element) -> List[Any]:
    """
    Read the cached values of a series.

    The numeric cache under the 'val' element is tried first; when that
    gives nothing, the numeric cache under 'yVal' (the element the
    original code comments associate with scatter/bubble charts) is used.

    Returns:
        List of values as produced by _extract_point_values with
        as_string=False; empty when no cache holds a value.
    """
    ns_c = OOXML_NS['c']
    values = []

    for holder_tag in ('val', 'yVal'):
        holder = ser_elem.find(f'.//c:{holder_tag}', OOXML_NS)
        if holder is None:
            holder = ser_elem.find(f'.//{{{ns_c}}}{holder_tag}')

        if holder is not None:
            num_cache = holder.find('.//c:numCache', OOXML_NS)
            if num_cache is None:
                num_cache = holder.find(f'.//{{{ns_c}}}numCache')
            if num_cache is not None:
                values = self._extract_point_values(num_cache, as_string=False)

        # Stop as soon as one holder produced values.
        if values:
            break

    return values
401
+
402
def _extract_point_values(
    self,
    cache_elem: ET.Element,
    as_string: bool = False
) -> List[Any]:
    """
    Read the point values stored in a cache element (strCache or numCache).

    Points are ordered by their integer 'idx' attribute (0 when the
    attribute is absent). Points without a value element or with empty
    text are skipped, so the result can be shorter than the point count.

    Args:
        cache_elem: Cache XML element
        as_string: If True, every value is returned as stripped text.
            Otherwise values are converted with float(), keeping the
            text for anything float() rejects.

    Returns:
        List of extracted values
    """
    ns_c = OOXML_NS['c']

    points = cache_elem.findall('.//c:pt', OOXML_NS)
    if not points:
        points = cache_elem.findall(f'.//{{{ns_c}}}pt')

    def point_index(point):
        return int(point.get('idx', 0))

    # findall returned a fresh list, so sorting it in place is safe.
    points.sort(key=point_index)

    values = []
    for point in points:
        v_elem = point.find('c:v', OOXML_NS)
        if v_elem is None:
            v_elem = point.find(f'{{{ns_c}}}v')
        if v_elem is None or not v_elem.text:
            continue

        text = v_elem.text.strip()
        if as_string:
            values.append(text)
            continue

        try:
            number = float(text)
        except ValueError:
            # Not numeric: keep the raw text.
            values.append(text)
        else:
            values.append(number)

    return values
442
+
443
+ # ========================================================================
444
+ # Dictionary Input Handling
445
+ # ========================================================================
446
+
447
def _from_dict(self, chart_info: Dict[str, Any]) -> ChartData:
    """
    Convert a pre-parsed chart dictionary to ChartData.

    Keys read: 'chart_type', 'title', 'categories', 'series'. A key
    that is missing and a key that is present with a None / empty value
    are treated alike. In particular 'series': None no longer raises
    TypeError, which it did while the [] default only covered a
    missing key.

    Args:
        chart_info: Dictionary with chart_type, title, categories, series

    Returns:
        ChartData instance. chart_type falls back to 'Chart'; categories
        are stringified; series entries that are not dicts are ignored;
        empty categories / series are passed as None.
    """
    chart_type = chart_info.get('chart_type') or 'Chart'
    title = chart_info.get('title')
    categories = chart_info.get('categories') or []
    series_list = chart_info.get('series') or []

    # Keep only dict entries. The fallback name uses the entry's position
    # in the input list, so skipped entries still consume a number.
    series = [
        {
            'name': entry.get('name') or f"Series {idx + 1}",
            'values': entry.get('values', [])
        }
        for idx, entry in enumerate(series_list)
        if isinstance(entry, dict)
    ]

    return ChartData(
        chart_type=chart_type,
        title=title,
        categories=[str(c) for c in categories] if categories else None,
        series=series if series else None
    )
477
+
478
+ # ========================================================================
479
+ # Formatting
480
+ # ========================================================================
481
+
482
+ def _format_chart_data(self, chart_data: ChartData) -> str:
483
+ """Format ChartData using ChartProcessor."""
484
+ if chart_data.has_data():
485
+ return self._chart_processor.format_chart_data(
486
+ chart_type=chart_data.chart_type,
487
+ title=chart_data.title,
488
+ categories=chart_data.categories,
489
+ series=chart_data.series
490
+ )
491
+ else:
492
+ return self._chart_processor.format_chart_fallback(
493
+ chart_type=chart_data.chart_type,
494
+ title=chart_data.title
495
+ )
496
+
497
+
498
+ __all__ = ['ExcelChartExtractor']
@@ -0,0 +1,157 @@
1
+ # xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py
2
+ """
3
+ ExcelFileConverter - Excel file format converter
4
+
5
+ Converts binary Excel data to Workbook object.
6
+ Supports both XLSX and XLS formats.
7
+ """
8
+ from io import BytesIO
9
+ from typing import Any, Optional, BinaryIO, Union
10
+
11
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
12
+
13
+
14
class XLSXFileConverter(BaseFileConverter):
    """
    File converter that loads XLSX data with openpyxl.

    Produces an openpyxl Workbook from raw bytes or from an already
    open binary stream.
    """

    # First four bytes expected by validate(); XLSX files are ZIP archives.
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        data_only: bool = True,
        **kwargs
    ) -> Any:
        """
        Load an XLSX workbook.

        Args:
            file_data: Raw binary XLSX data; used only when no stream is given
            file_stream: Optional binary stream; takes precedence over
                file_data and is rewound with seek(0) before loading
            data_only: Forwarded to openpyxl's load_workbook; if True,
                calculated values are returned instead of formulas
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            openpyxl.Workbook object
        """
        # Imported lazily, inside the call, as in the original code.
        from openpyxl import load_workbook

        if file_stream is None:
            workbook_source = BytesIO(file_data)
        else:
            workbook_source = file_stream
        workbook_source.seek(0)
        return load_workbook(workbook_source, data_only=data_only)

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "XLSX Workbook"

    def validate(self, file_data: bytes) -> bool:
        """Return True when the data begins with the ZIP magic number."""
        if file_data and len(file_data) >= 4:
            return file_data[:4] == self.ZIP_MAGIC
        return False
58
+
59
+
60
class XLSFileConverter(BaseFileConverter):
    """
    File converter that loads legacy XLS data with xlrd.

    Produces an xlrd workbook object from raw bytes.
    """

    # First eight bytes expected by validate(); XLS files are OLE files.
    OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Load an XLS workbook from its raw bytes.

        Args:
            file_data: Raw binary XLS data
            file_stream: Accepted for interface compatibility; not used
            **kwargs: Accepted for interface compatibility; not used

        Returns:
            xlrd.Book object
        """
        # Imported lazily, inside the call, as in the original code.
        import xlrd

        workbook = xlrd.open_workbook(file_contents=file_data)
        return workbook

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "XLS Workbook"

    def validate(self, file_data: bytes) -> bool:
        """Return True when the data begins with the OLE magic number."""
        if file_data and len(file_data) >= 8:
            return file_data[:8] == self.OLE_MAGIC
        return False
99
+
100
+
101
class ExcelFileConverter(BaseFileConverter):
    """
    Unified Excel file converter.

    Picks the XLSX or XLS converter from the extension hint or, failing
    that, from the file's magic number, and delegates the conversion.
    """

    def __init__(self):
        """Create both format-specific converters."""
        self._xlsx_converter = XLSXFileConverter()
        self._xls_converter = XLSFileConverter()
        # Converter chosen by the most recent convert() call, if any.
        self._used_converter: Optional[BaseFileConverter] = None

    def _detect_converter(self, file_data: bytes) -> BaseFileConverter:
        """Choose a converter from the magic number, defaulting to XLSX."""
        if self._xlsx_converter.validate(file_data):
            return self._xlsx_converter
        if self._xls_converter.validate(file_data):
            return self._xls_converter
        return self._xlsx_converter

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        extension: Optional[str] = None,
        **kwargs
    ) -> Any:
        """
        Convert binary Excel data to a Workbook object.

        Args:
            file_data: Raw binary Excel data
            file_stream: Optional file stream, forwarded to the chosen converter
            extension: File extension hint ('xlsx' or 'xls', case-insensitive,
                leading dot allowed). Any other value is ignored and the
                format is detected from the data instead.
            **kwargs: Additional options forwarded to the chosen converter

        Returns:
            Workbook object (openpyxl or xlrd)
        """
        converter: Optional[BaseFileConverter] = None

        if extension:
            ext = extension.lower().lstrip('.')
            if ext == 'xlsx':
                converter = self._xlsx_converter
            elif ext == 'xls':
                converter = self._xls_converter

        # Previously an unrecognised extension (e.g. 'xlsm') selected
        # nothing, leaving _used_converter as None or as the converter
        # from an earlier call. Fall back to magic-number detection so
        # every call makes its own valid choice.
        if converter is None:
            converter = self._detect_converter(file_data)

        self._used_converter = converter
        return converter.convert(file_data, file_stream, **kwargs)

    def get_format_name(self) -> str:
        """Return the format name of the last used converter, if any."""
        if self._used_converter:
            return self._used_converter.get_format_name()
        return "Excel Workbook"
157
+