xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,498 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Excel Chart Extractor
|
|
3
|
+
|
|
4
|
+
Extracts chart data from Excel files (XLSX/XLS).
|
|
5
|
+
Parses OOXML DrawingML Chart format (ISO/IEC 29500).
|
|
6
|
+
|
|
7
|
+
Handles:
|
|
8
|
+
- XLSX: Chart XML in xl/charts/*.xml
|
|
9
|
+
- Chart info dictionaries (pre-parsed)
|
|
10
|
+
"""
|
|
11
|
+
import io
|
|
12
|
+
import logging
|
|
13
|
+
import xml.etree.ElementTree as ET
|
|
14
|
+
import zipfile
|
|
15
|
+
from typing import Any, Dict, List, Optional, Union, BinaryIO
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("document-processor")
|
|
20
|
+
|
|
21
|
+
# OOXML namespaces
# Prefix -> namespace URI map. It is passed as the ``namespaces`` argument of
# ElementTree find()/findall() calls in this module so that path expressions
# can use the short 'c:' (chart) and 'a:' (DrawingML main) prefixes.
OOXML_NS = {
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}

# Chart type mapping (OOXML tag -> display name)
# NOTE: _extract_plot_data iterates this dict in insertion order and stops at
# the first tag it finds inside the plot area, so the order of the entries
# below is also the lookup priority.
CHART_TYPE_MAP = {
    'barChart': 'Bar Chart',
    'bar3DChart': '3D Bar Chart',
    'lineChart': 'Line Chart',
    'line3DChart': '3D Line Chart',
    'pieChart': 'Pie Chart',
    'pie3DChart': '3D Pie Chart',
    'doughnutChart': 'Doughnut Chart',
    'areaChart': 'Area Chart',
    'area3DChart': '3D Area Chart',
    'scatterChart': 'Scatter Chart',
    'bubbleChart': 'Bubble Chart',
    'radarChart': 'Radar Chart',
    'surfaceChart': 'Surface Chart',
    'surface3DChart': '3D Surface Chart',
    'stockChart': 'Stock Chart',
    'ofPieChart': 'Pie of Pie Chart',
}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class ExcelChartExtractor(BaseChartExtractor):
|
|
49
|
+
"""
|
|
50
|
+
Chart extractor for Excel files (XLSX/XLS).
|
|
51
|
+
|
|
52
|
+
Supports:
|
|
53
|
+
- Direct chart XML bytes parsing
|
|
54
|
+
- Pre-parsed chart info dictionaries
|
|
55
|
+
- Full file extraction (via extract_all_from_file)
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
# ========================================================================
|
|
59
|
+
# Main Interface
|
|
60
|
+
# ========================================================================
|
|
61
|
+
|
|
62
|
+
def extract(self, chart_element: Any) -> ChartData:
    """
    Build a ChartData from whichever chart representation was supplied.

    Args:
        chart_element: One of:
            - bytes: raw chart XML
            - dict: pre-parsed chart info dictionary
            - any object exposing the chart XML through a 'blob' attribute

    Returns:
        The extracted ChartData. An empty ChartData is returned when the
        input is falsy or is none of the supported kinds.
    """
    if chart_element:
        # Dispatch on the input kind; the checks are tried in this order.
        if isinstance(chart_element, bytes):
            return self._parse_ooxml_chart(chart_element)
        if isinstance(chart_element, dict):
            return self._from_dict(chart_element)
        if hasattr(chart_element, 'blob'):
            return self._parse_ooxml_chart(chart_element.blob)

    # Falsy or unsupported input.
    return ChartData()
|
|
91
|
+
|
|
92
|
+
def extract_all_from_file(
    self,
    file_source: Union[str, bytes, BinaryIO]
) -> List[ChartData]:
    """
    Extract all charts from an Excel (XLSX) file.

    The workbook is opened as a ZIP archive and every part named
    'xl/charts/chart*.xml' is parsed. Parts are visited in numeric order
    (chart2.xml before chart10.xml). Errors are logged, never raised: a
    part that fails to parse is skipped, and an archive that cannot be
    opened yields an empty list.

    Args:
        file_source: File path (str or os.PathLike), raw file content
            (bytes or bytearray), or a seekable binary file-like object.

    Returns:
        List of ChartData for every chart part whose has_data() is true.
    """
    prefix = 'xl/charts/chart'
    suffix = '.xml'

    def chart_order(name: str):
        # A plain sorted() is lexicographic and would place chart10.xml
        # before chart2.xml; order by the numeric part instead. Names
        # without a numeric part keep lexicographic order after the rest.
        stem = name[len(prefix):-len(suffix)]
        try:
            return (0, int(stem), name)
        except ValueError:
            return (1, 0, name)

    charts = []

    try:
        # Normalise the input into something zipfile.ZipFile accepts.
        if isinstance(file_source, (bytes, bytearray)):
            archive_source = io.BytesIO(file_source)
        elif isinstance(file_source, str) or hasattr(file_source, '__fspath__'):
            archive_source = file_source
        else:
            file_source.seek(0)
            archive_source = file_source

        # The context manager closes the archive even if reading fails.
        with zipfile.ZipFile(archive_source, 'r') as zf:
            chart_names = sorted(
                (
                    name for name in zf.namelist()
                    if name.startswith(prefix) and name.endswith(suffix)
                ),
                key=chart_order,
            )

            for name in chart_names:
                try:
                    chart_data = self._parse_ooxml_chart(zf.read(name))
                    if chart_data.has_data():
                        charts.append(chart_data)
                except Exception as e:
                    # Best effort: one broken part must not lose the others.
                    logger.debug(f"Error parsing chart {name}: {e}")

        logger.debug(f"Extracted {len(charts)} charts from Excel file")

    except Exception as e:
        logger.warning(f"Error extracting charts from Excel: {e}")

    return charts
|
|
137
|
+
|
|
138
|
+
def process_all_from_file(
    self,
    file_source: Union[str, bytes, BinaryIO]
) -> List[str]:
    """
    Extract every chart from an Excel file and render each one as text.

    Args:
        file_source: File path, bytes, or file-like object; passed
            unchanged to extract_all_from_file.

    Returns:
        The formatted chart strings, in extraction order. Charts whose
        formatted output is empty/falsy are left out.
    """
    extracted = self.extract_all_from_file(file_source)
    rendered = (self._format_chart_data(chart) for chart in extracted)
    return [text for text in rendered if text]
|
|
159
|
+
|
|
160
|
+
# ========================================================================
|
|
161
|
+
# OOXML Chart Parsing
|
|
162
|
+
# ========================================================================
|
|
163
|
+
|
|
164
|
+
def _parse_ooxml_chart(self, chart_xml: bytes) -> ChartData:
    """
    Turn raw OOXML chart XML (DrawingML Chart format) into a ChartData.

    Args:
        chart_xml: Raw chart XML bytes

    Returns:
        ChartData holding the chart type, title, categories and series.
        An empty ChartData is returned when the XML cannot be parsed, no
        chart element is found, or any error occurs while extracting.
    """
    try:
        root = self._parse_xml(chart_xml)
        chart_elem = None if root is None else self._find_chart_element(root)
        if chart_elem is None:
            return ChartData()

        title = self._extract_title(chart_elem)
        chart_type, categories, series = self._extract_plot_data(chart_elem)

        # Empty collections are passed on as None.
        return ChartData(
            chart_type=chart_type,
            title=title,
            categories=categories or None,
            series=series or None
        )

    except Exception as e:
        logger.debug(f"Error parsing OOXML chart: {e}")
        return ChartData()
|
|
201
|
+
|
|
202
|
+
def _parse_xml(self, chart_xml: bytes) -> Optional[ET.Element]:
|
|
203
|
+
"""Parse XML with BOM and encoding handling."""
|
|
204
|
+
try:
|
|
205
|
+
return ET.fromstring(chart_xml)
|
|
206
|
+
except ET.ParseError:
|
|
207
|
+
try:
|
|
208
|
+
# Try removing BOM or invalid characters
|
|
209
|
+
chart_str = chart_xml.decode('utf-8-sig', errors='ignore')
|
|
210
|
+
return ET.fromstring(chart_str)
|
|
211
|
+
except:
|
|
212
|
+
return None
|
|
213
|
+
|
|
214
|
+
def _find_chart_element(self, root: ET.Element) -> Optional[ET.Element]:
    """
    Locate the chart element within a parsed chart document.

    Descendants of root are searched first (by namespace prefix, then by
    fully qualified tag); if none matches, root itself is accepted when
    its tag is 'chart' with or without a namespace.

    Returns:
        The chart element, or None when there is none.
    """
    found = root.find('.//c:chart', OOXML_NS)

    if found is None:
        found = root.find('.//{http://schemas.openxmlformats.org/drawingml/2006/chart}chart')

    # './/' only matches descendants, so root has to be checked separately.
    if found is None and (root.tag == 'chart' or root.tag.endswith('}chart')):
        found = root

    return found
|
|
231
|
+
|
|
232
|
+
def _extract_title(self, chart_elem: ET.Element) -> Optional[str]:
    """
    Extract the chart's own title.

    Only the <c:title> that is a direct child of the chart element is
    used, so axis titles nested inside the plot area are not mistaken
    for the chart title. Rich-text titles are assembled from all of
    their text runs (a formatted title is often split into several
    <a:t> runs); paragraphs are joined with a single space.

    Args:
        chart_elem: The chart element returned by _find_chart_element

    Returns:
        The title text, or None when the chart has no non-empty title.
    """
    ns_c = OOXML_NS['c']
    ns_a = OOXML_NS['a']

    title_elem = chart_elem.find(f'{{{ns_c}}}title')
    if title_elem is None:
        return None

    # Rich text: c:title/c:tx/c:rich/a:p/.../a:t
    rich = title_elem.find(f'{{{ns_c}}}tx/{{{ns_c}}}rich')
    if rich is not None:
        paragraphs = []
        for para in rich.iter(f'{{{ns_a}}}p'):
            text = ''.join(t.text or '' for t in para.iter(f'{{{ns_a}}}t')).strip()
            if text:
                paragraphs.append(text)
        if paragraphs:
            return ' '.join(paragraphs)

    # Cell reference: c:title/c:tx/c:strRef/c:strCache/c:pt/c:v
    cached_path = (
        f'{{{ns_c}}}tx/{{{ns_c}}}strRef/{{{ns_c}}}strCache/'
        f'{{{ns_c}}}pt/{{{ns_c}}}v'
    )
    for v_elem in title_elem.iterfind(cached_path):
        text = (v_elem.text or '').strip()
        if text:
            return text

    return None
|
|
250
|
+
|
|
251
|
+
def _extract_plot_data(self, chart_elem: ET.Element) -> tuple:
    """
    Read the chart type and its data out of the chart's plot area.

    The tags of CHART_TYPE_MAP are tried in the map's order; the first one
    present in the plot area decides the chart type and is the only
    element whose series are read.

    Returns:
        Tuple of (chart_type, categories, series). ("Chart", [], []) is
        returned when there is no plot area or no known chart type in it.
    """
    ns_c = OOXML_NS["c"]

    plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)
    if plot_area is None:
        plot_area = chart_elem.find(f'.//{{{ns_c}}}plotArea')

    if plot_area is not None:
        for chart_tag, type_name in CHART_TYPE_MAP.items():
            type_elem = plot_area.find(f'.//c:{chart_tag}', OOXML_NS)
            if type_elem is None:
                type_elem = plot_area.find(f'.//{{{ns_c}}}{chart_tag}')
            if type_elem is None:
                continue

            # First known chart type wins.
            categories, series = self._extract_series_data(type_elem)
            return type_name, categories, series

    return "Chart", [], []
|
|
276
|
+
|
|
277
|
+
def _extract_series_data(self, chart_type_elem: ET.Element) -> tuple:
    """
    Extract categories and series data from chart type element.

    Categories are taken from the first series that actually provides
    any, so a leading series without category data no longer leaves the
    whole chart without categories. Series that yield no values are
    omitted from the result.

    Args:
        chart_type_elem: Chart type XML element (barChart, lineChart, etc.)

    Returns:
        Tuple of (categories, series), where series is a list of
        {'name': ..., 'values': [...]} dictionaries.
    """
    ns_c = OOXML_NS['c']
    categories = []
    series = []

    # The qualified name is what the 'c:' prefix expands to, so a single
    # lookup is enough.
    series_elements = chart_type_elem.findall(f'.//{{{ns_c}}}ser')

    for idx, ser_elem in enumerate(series_elements):
        if not categories:
            categories = self._extract_categories(ser_elem)

        values = self._extract_values(ser_elem)
        if values:
            series.append({
                # idx is the position among all series, including skipped
                # ones, so default names stay stable.
                'name': self._extract_series_name(ser_elem, idx),
                'values': values
            })

    return categories, series
|
|
315
|
+
|
|
316
|
+
def _extract_series_name(self, ser_elem: ET.Element, idx: int) -> str:
    """
    Determine the display name of a series.

    Args:
        ser_elem: Series XML element
        idx: Zero-based position of the series, used for the default name

    Returns:
        The stripped text of the first matching value element that has
        text, otherwise "Series <idx + 1>".
    """
    ns_c = OOXML_NS['c']
    q = f'{{{ns_c}}}'

    # (prefixed path, fully qualified path) pairs, tried in order:
    # a value anywhere under c:tx, then the explicit strRef cache path.
    lookups = (
        (
            './/c:tx//c:v',
            f'.//{q}tx//{q}v',
        ),
        (
            './/c:tx//c:strRef//c:strCache//c:pt//c:v',
            f'.//{q}tx//{q}strRef//{q}strCache//{q}pt//{q}v',
        ),
    )

    for prefixed, qualified in lookups:
        node = ser_elem.find(prefixed, OOXML_NS)
        if node is None:
            node = ser_elem.find(qualified)
        if node is not None and node.text:
            return node.text.strip()

    return f"Series {idx + 1}"
|
|
335
|
+
|
|
336
|
+
def _extract_categories(self, ser_elem: ET.Element) -> List[str]:
    """
    Extract category labels from series element.

    <c:cat> is read first. When it is absent or empty, <c:xVal> is used,
    mirroring the <c:yVal> handling in _extract_values for scatter/bubble
    series. Within either element the string cache is preferred over the
    numeric cache.

    Args:
        ser_elem: Series XML element

    Returns:
        Category labels as strings; an empty list when none are found.
    """
    ns_c = OOXML_NS['c']

    for axis_tag in ('cat', 'xVal'):
        axis_elem = ser_elem.find(f'.//{{{ns_c}}}{axis_tag}')
        if axis_elem is None:
            continue

        # Text labels first, numeric labels as fallback.
        for cache_tag in ('strCache', 'numCache'):
            cache_elem = axis_elem.find(f'.//{{{ns_c}}}{cache_tag}')
            if cache_elem is None:
                continue
            categories = self._extract_point_values(cache_elem, as_string=True)
            if categories:
                return categories

    return []
|
|
367
|
+
|
|
368
|
+
def _extract_values(self, ser_elem: ET.Element) -> List[Any]:
    """
    Read the numeric values of a series.

    The numeric cache under 'val' is used; when that gives nothing, the
    one under 'yVal' (scatter/bubble charts) is tried.

    Returns:
        The extracted values, or an empty list when none are found.
    """
    ns_c = OOXML_NS['c']
    values = []

    for tag in ('val', 'yVal'):
        holder = ser_elem.find(f'.//c:{tag}', OOXML_NS)
        if holder is None:
            holder = ser_elem.find(f'.//{{{ns_c}}}{tag}')
        if holder is None:
            continue

        num_cache = holder.find('.//c:numCache', OOXML_NS)
        if num_cache is None:
            num_cache = holder.find(f'.//{{{ns_c}}}numCache')

        if num_cache is not None:
            values = self._extract_point_values(num_cache, as_string=False)

        # Stop as soon as one source produced values.
        if values:
            break

    return values
|
|
401
|
+
|
|
402
|
+
def _extract_point_values(
    self,
    cache_elem: ET.Element,
    as_string: bool = False
) -> List[Any]:
    """
    Extract values from cache element (strCache or numCache).

    Points are ordered by their 'idx' attribute. A point whose 'idx' is
    not an integer is ordered by its document position instead of
    raising, so one malformed attribute no longer discards the chart.
    Points without <c:v> text are skipped, so the result can be shorter
    than the number of points in the cache.

    Args:
        cache_elem: Cache XML element
        as_string: If True, return all values as strings

    Returns:
        List of extracted values. With as_string=False each value is a
        float when the text converts, otherwise the original text.
    """
    ns_c = OOXML_NS['c']
    values = []

    def sort_key(item):
        position, pt = item
        try:
            return int(pt.get('idx', 0))
        except (TypeError, ValueError):
            return position

    pts = cache_elem.findall(f'.//{{{ns_c}}}pt')

    for _, pt in sorted(enumerate(pts), key=sort_key):
        v_elem = pt.find(f'{{{ns_c}}}v')
        if v_elem is None or not v_elem.text:
            continue

        text = v_elem.text.strip()
        if as_string:
            values.append(text)
            continue

        try:
            values.append(float(text))
        except ValueError:
            # Keep non-numeric cache entries as text.
            values.append(text)

    return values
|
|
442
|
+
|
|
443
|
+
# ========================================================================
|
|
444
|
+
# Dictionary Input Handling
|
|
445
|
+
# ========================================================================
|
|
446
|
+
|
|
447
|
+
def _from_dict(self, chart_info: Dict[str, Any]) -> ChartData:
    """
    Convert pre-parsed chart dictionary to ChartData.

    Missing keys and keys explicitly set to None are treated alike for
    'chart_type', 'categories' and 'series'.

    Args:
        chart_info: Dictionary with chart_type, title, categories, series

    Returns:
        ChartData instance. Categories are converted to strings; series
        entries that are not dictionaries are ignored; empty categories
        or series are passed on as None.
    """
    chart_type = chart_info.get('chart_type') or 'Chart'
    title = chart_info.get('title')
    categories = chart_info.get('categories') or []
    # 'or []' so that {'series': None} does not crash enumerate() below.
    series_list = chart_info.get('series') or []

    # Normalize series data
    series = [
        {
            'name': s.get('name') or f"Series {idx + 1}",
            'values': s.get('values', [])
        }
        for idx, s in enumerate(series_list)
        if isinstance(s, dict)
    ]

    return ChartData(
        chart_type=chart_type,
        title=title,
        categories=[str(c) for c in categories] if categories else None,
        series=series if series else None
    )
|
|
477
|
+
|
|
478
|
+
# ========================================================================
|
|
479
|
+
# Formatting
|
|
480
|
+
# ========================================================================
|
|
481
|
+
|
|
482
|
+
def _format_chart_data(self, chart_data: ChartData) -> str:
    """
    Render a ChartData as text through self._chart_processor.

    Charts without data are rendered with the processor's fallback
    formatter, which receives only the chart type and title.
    """
    if not chart_data.has_data():
        return self._chart_processor.format_chart_fallback(
            chart_type=chart_data.chart_type,
            title=chart_data.title
        )

    return self._chart_processor.format_chart_data(
        chart_type=chart_data.chart_type,
        title=chart_data.title,
        categories=chart_data.categories,
        series=chart_data.series
    )
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
__all__ = ['ExcelChartExtractor']
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
ExcelFileConverter - Excel file format converter
|
|
4
|
+
|
|
5
|
+
Converts binary Excel data to Workbook object.
|
|
6
|
+
Supports both XLSX and XLS formats.
|
|
7
|
+
"""
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from typing import Any, Optional, BinaryIO, Union
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class XLSXFileConverter(BaseFileConverter):
    """
    File converter for XLSX workbooks, backed by openpyxl.

    Turns binary XLSX data into an openpyxl Workbook object.
    """

    # ZIP magic number (XLSX is a ZIP file)
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        data_only: bool = True,
        **kwargs
    ) -> Any:
        """
        Load an XLSX workbook.

        Args:
            file_data: Raw binary XLSX data; used only when no stream is given
            file_stream: Optional file stream; takes precedence over file_data
            data_only: Forwarded to openpyxl's load_workbook
            **kwargs: Accepted but not used by this converter

        Returns:
            The object returned by openpyxl.load_workbook
        """
        # Imported lazily so openpyxl is only required when XLSX is used.
        from openpyxl import load_workbook

        if file_stream is None:
            stream = BytesIO(file_data)
        else:
            stream = file_stream

        # Rewind: a caller-supplied stream may already have been read.
        stream.seek(0)
        return load_workbook(stream, data_only=data_only)

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "XLSX Workbook"

    def validate(self, file_data: bytes) -> bool:
        """Return True when file_data starts with the ZIP magic number."""
        if file_data and len(file_data) >= 4:
            return file_data[:4] == self.ZIP_MAGIC
        return False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class XLSFileConverter(BaseFileConverter):
    """
    File converter for legacy XLS workbooks, backed by xlrd.

    Turns binary XLS data into an xlrd workbook object.
    """

    # OLE magic number (XLS is an OLE file)
    OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Load an XLS workbook from its raw bytes.

        Args:
            file_data: Raw binary XLS data
            file_stream: Accepted for interface compatibility; not used
            **kwargs: Accepted but not used by this converter

        Returns:
            The object returned by xlrd.open_workbook
        """
        # Imported lazily so xlrd is only required when XLS is used.
        import xlrd

        workbook = xlrd.open_workbook(file_contents=file_data)
        return workbook

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "XLS Workbook"

    def validate(self, file_data: bytes) -> bool:
        """Return True when file_data starts with the OLE magic number."""
        if file_data and len(file_data) >= 8:
            return file_data[:8] == self.OLE_MAGIC
        return False
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class ExcelFileConverter(BaseFileConverter):
    """
    Unified Excel file converter.

    Picks the XLSX or XLS converter from the extension hint when the hint
    is recognized, and from the data's magic number otherwise.
    """

    def __init__(self):
        """Initialize with both converters."""
        self._xlsx_converter = XLSXFileConverter()
        self._xls_converter = XLSFileConverter()
        # Converter chosen by the most recent convert() call, if any.
        self._used_converter: Optional[BaseFileConverter] = None

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        extension: Optional[str] = None,
        **kwargs
    ) -> Any:
        """
        Convert binary Excel data to Workbook object.

        Args:
            file_data: Raw binary Excel data
            file_stream: Optional file stream
            extension: File extension hint ('xlsx' or 'xls', case and a
                leading dot are ignored). Any other value is treated as
                if no hint had been given.
            **kwargs: Additional options, forwarded to the chosen converter

        Returns:
            Workbook object (openpyxl or xlrd)
        """
        converter: Optional[BaseFileConverter] = None

        if extension:
            ext = extension.lower().lstrip('.')
            if ext == 'xlsx':
                converter = self._xlsx_converter
            elif ext == 'xls':
                converter = self._xls_converter

        if converter is None:
            # No hint, or an unrecognized one (e.g. 'xlsm'): detect from the
            # magic number. Previously an unrecognized hint selected nothing,
            # so the call failed on None or reused the converter chosen by
            # an earlier call.
            if self._xlsx_converter.validate(file_data):
                converter = self._xlsx_converter
            elif self._xls_converter.validate(file_data):
                converter = self._xls_converter
            else:
                # Default to XLSX
                converter = self._xlsx_converter

        self._used_converter = converter
        return converter.convert(file_data, file_stream, **kwargs)

    def get_format_name(self) -> str:
        """Return format name based on detected type."""
        if self._used_converter:
            return self._used_converter.get_format_name()
        return "Excel Workbook"
|
|
157
|
+
|