xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,436 @@
|
|
|
1
|
+
"""
|
|
2
|
+
DOCX Chart Extractor
|
|
3
|
+
|
|
4
|
+
Extracts all chart data from DOCX files.
|
|
5
|
+
Parses OOXML DrawingML Chart format (ISO/IEC 29500).
|
|
6
|
+
|
|
7
|
+
Structure:
|
|
8
|
+
- Charts are stored in word/charts/chart*.xml
|
|
9
|
+
- Referenced via relationships in document.xml
|
|
10
|
+
"""
|
|
11
|
+
import io
|
|
12
|
+
import logging
|
|
13
|
+
import xml.etree.ElementTree as ET
|
|
14
|
+
import zipfile
|
|
15
|
+
from typing import Any, Dict, List, Optional, Union, BinaryIO
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("document-processor")
|
|
20
|
+
|
|
21
|
+
# OOXML namespaces
|
|
22
|
+
# Prefix -> namespace URI table handed to ElementTree's find()/findall()
# so that XPath expressions can use the short 'c:' and 'a:' prefixes.
OOXML_NS = {
    # DrawingML chart elements (c:chart, c:plotArea, c:ser, ...)
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    # DrawingML main elements (a:t text runs used inside titles)
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}
|
|
26
|
+
|
|
27
|
+
# Chart type mapping
|
|
28
|
+
# Maps the local name of a DrawingML chart-type element (a child of
# c:plotArea) to the human-readable label reported as the chart type.
# Insertion order matters: the extractor probes the plot area for these
# tags in this order and uses the first one it finds, so new entries are
# appended at the end to keep the existing priority unchanged.
CHART_TYPE_MAP = {
    'barChart': 'Bar Chart',
    'bar3DChart': '3D Bar Chart',
    'lineChart': 'Line Chart',
    'line3DChart': '3D Line Chart',
    'pieChart': 'Pie Chart',
    'pie3DChart': '3D Pie Chart',
    'doughnutChart': 'Doughnut Chart',
    'areaChart': 'Area Chart',
    'area3DChart': '3D Area Chart',
    'scatterChart': 'Scatter Chart',
    'bubbleChart': 'Bubble Chart',
    'radarChart': 'Radar Chart',
    'surfaceChart': 'Surface Chart',
    'surface3DChart': '3D Surface Chart',
    'stockChart': 'Stock Chart',
    # Previously missing although the sibling map in docx_constants.py
    # lists it; without this entry such charts were reported as a generic
    # "Chart" with no series data.
    'ofPieChart': 'Pie/Bar of Pie Chart',
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class DOCXChartExtractor(BaseChartExtractor):
    """
    Chart extractor for DOCX files.

    Reads every archive member whose name starts with ``word/charts/chart``
    and ends with ``.xml`` and converts each one into a ``ChartData`` object
    (chart type, title, categories and series).
    """

    # Archive-name prefix/suffix that identify chart parts inside the package.
    _CHART_PART_PREFIX = 'word/charts/chart'
    _CHART_PART_SUFFIX = '.xml'

    # ========================================================================
    # Main Interface
    # ========================================================================

    def extract(self, chart_element: Any) -> ChartData:
        """
        Extract chart data from various input types.

        Args:
            chart_element: One of:
                - bytes: Raw chart XML
                - object with 'blob' attribute: Chart part object

        Returns:
            ChartData with extracted information. An empty ``ChartData()``
            is returned for falsy input and for unsupported input types.
        """
        if not chart_element:
            return ChartData()

        # Raw chart XML.
        if isinstance(chart_element, bytes):
            return self._parse_ooxml_chart(chart_element)

        # Chart part object exposing its XML through a ``blob`` attribute.
        if hasattr(chart_element, 'blob'):
            return self._parse_ooxml_chart(chart_element.blob)

        return ChartData()

    def extract_all_from_file(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> List[ChartData]:
        """
        Extract all charts from a DOCX file.

        Args:
            file_source: File path (``str`` or path-like), raw bytes
                (``bytes``/``bytearray``), or a seekable binary file-like
                object. A file-like object is rewound with ``seek(0)``.

        Returns:
            One ChartData per chart part, ordered by chart part number
            (chart1, chart2, ..., chart10). Charts that are empty or fail to
            parse still occupy a slot (an empty ``ChartData()``) so that
            positions stay aligned for callers that match charts by index.
            If the archive itself cannot be read, a warning is logged and
            whatever was collected so far (possibly nothing) is returned.
        """
        charts: List[ChartData] = []

        try:
            if isinstance(file_source, (bytes, bytearray)):
                archive_source = io.BytesIO(file_source)
            elif isinstance(file_source, str) or hasattr(file_source, '__fspath__'):
                archive_source = file_source
            else:
                file_source.seek(0)
                archive_source = file_source

            # The context manager guarantees the archive is closed even when
            # reading the member list raises.
            with zipfile.ZipFile(archive_source, 'r') as zf:
                chart_files = sorted(
                    (
                        name for name in zf.namelist()
                        if name.startswith(self._CHART_PART_PREFIX)
                        and name.endswith(self._CHART_PART_SUFFIX)
                    ),
                    key=self._chart_sort_key,
                )

                for chart_file in chart_files:
                    try:
                        # Empty results are appended too: every chart part
                        # must keep its position in the returned list.
                        charts.append(self._parse_ooxml_chart(zf.read(chart_file)))
                    except Exception as e:
                        logger.debug(f"Error parsing chart {chart_file}: {e}")
                        charts.append(ChartData())  # Placeholder for failed chart

            logger.debug(f"Extracted {len(charts)} charts from DOCX file")

        except Exception as e:
            logger.warning(f"Error extracting charts from DOCX: {e}")

        return charts

    def process_all_from_file(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> List[str]:
        """
        Extract and format all charts from a DOCX file.

        Args:
            file_source: File path, bytes, or file-like object

        Returns:
            List of formatted chart strings. Charts whose formatted form is
            empty/falsy are left out, so this list can be shorter than the
            one returned by ``extract_all_from_file``.
        """
        formatted_charts = (
            self._format_chart_data(chart_data)
            for chart_data in self.extract_all_from_file(file_source)
        )
        return [formatted for formatted in formatted_charts if formatted]

    # ========================================================================
    # OOXML Chart Parsing
    # ========================================================================

    @classmethod
    def _chart_sort_key(cls, name: str) -> tuple:
        """Sort key ordering chart parts by their numeric suffix.

        A plain string sort puts ``chart10.xml`` before ``chart2.xml``.
        Names with a purely numeric suffix sort first, by number; any other
        matching name (non-numeric suffix) sorts after them, by name.
        """
        stem = name[len(cls._CHART_PART_PREFIX):-len(cls._CHART_PART_SUFFIX)]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    @staticmethod
    def _point_index(pt: ET.Element) -> int:
        """Return the ``idx`` attribute of a c:pt element as an int.

        A missing or non-numeric ``idx`` is treated as 0 so that one
        malformed point does not abort parsing of the whole chart.
        """
        try:
            return int(pt.get('idx', 0))
        except (TypeError, ValueError):
            return 0

    def _parse_ooxml_chart(self, chart_xml: bytes) -> ChartData:
        """Parse OOXML chart XML into a ChartData.

        Never raises: any failure is logged at debug level and an empty
        ``ChartData()`` is returned instead.
        """
        try:
            root = self._parse_xml(chart_xml)
            if root is None:
                return ChartData()

            chart_elem = self._find_chart_element(root)
            if chart_elem is None:
                return ChartData()

            title = self._extract_title(chart_elem)
            chart_type, categories, series = self._extract_plot_data(chart_elem)

            # Empty lists are passed on as None, as before.
            return ChartData(
                chart_type=chart_type,
                title=title,
                categories=categories if categories else None,
                series=series if series else None
            )

        except Exception as e:
            logger.debug(f"Error parsing OOXML chart: {e}")
            return ChartData()

    def _parse_xml(self, chart_xml: bytes) -> Optional[ET.Element]:
        """Parse XML with BOM and encoding handling.

        First parses the bytes as-is. On a parse error the data is decoded
        as UTF-8 (dropping a BOM and undecodable bytes) and parsed again.

        Returns:
            The root element, or None when both attempts fail.
        """
        try:
            return ET.fromstring(chart_xml)
        except ET.ParseError:
            pass

        try:
            chart_str = chart_xml.decode('utf-8-sig', errors='ignore')
            return ET.fromstring(chart_str)
        except Exception as e:
            # Best-effort boundary: report failure as None, but (unlike a
            # bare ``except:``) let KeyboardInterrupt/SystemExit propagate.
            logger.debug(f"Error parsing chart XML after decoding fallback: {e}")
            return None

    def _find_chart_element(self, root: ET.Element) -> Optional[ET.Element]:
        """Find the chart element in the XML tree.

        Returns the first descendant ``c:chart``; if there is none and the
        root itself is a ``chart`` element (namespaced or not), the root.
        """
        # ElementTree expands the 'c:' prefix to the full namespace URI, so
        # no separate lookup with the '{uri}tag' spelling is needed.
        chart_elem = root.find('.//c:chart', OOXML_NS)
        if chart_elem is not None:
            return chart_elem

        if root.tag.endswith('}chart') or root.tag == 'chart':
            return root

        return None

    def _extract_title(self, chart_elem: ET.Element) -> Optional[str]:
        """Extract chart title.

        Chart titles in DOCX may be split across multiple <a:t> text elements
        (text runs), so all of them are concatenated.

        Returns:
            The stripped title text, or None when the chart has no title
            element or the title contains no text.
        """
        # Only the c:title that is a direct child of the chart element is
        # considered. A descendant search would also match axis titles
        # inside c:plotArea and report one of them as the chart title when
        # the chart itself has none.
        title_elem = chart_elem.find('c:title', OOXML_NS)
        if title_elem is None:
            return None

        runs = [t.text for t in title_elem.findall('.//a:t', OOXML_NS) if t.text]
        title = ''.join(runs).strip()
        return title or None

    def _extract_plot_data(self, chart_elem: ET.Element) -> tuple:
        """Extract chart type, categories, and series.

        Returns:
            ``(type_name, categories, series)``. When there is no plot area
            or no known chart-type element, ``("Chart", [], [])``.
        """
        plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)
        if plot_area is None:
            return "Chart", [], []

        # NOTE: only the first chart type found (in CHART_TYPE_MAP order) is
        # read; a combined chart (e.g. bar + line) yields the series of that
        # one plot only.
        for chart_tag, type_name in CHART_TYPE_MAP.items():
            elem = plot_area.find(f'.//c:{chart_tag}', OOXML_NS)
            if elem is not None:
                categories, series = self._extract_series_data(elem)
                return type_name, categories, series

        return "Chart", [], []

    def _extract_series_data(self, chart_type_elem: ET.Element) -> tuple:
        """Extract categories and series data.

        Returns:
            ``(categories, series)`` where ``series`` is a list of
            ``{'name': str, 'values': list}`` dicts. Series without any
            cached values are omitted.
        """
        categories: List[str] = []
        series: List[Dict[str, Any]] = []

        for idx, ser_elem in enumerate(chart_type_elem.findall('.//c:ser', OOXML_NS)):
            # Use the first series that actually carries category labels,
            # not merely the first series.
            if not categories:
                categories = self._extract_categories(ser_elem)

            values = self._extract_values(ser_elem)
            if values:
                series.append({
                    'name': self._extract_series_name(ser_elem, idx),
                    'values': values,
                })

        return categories, series

    def _extract_series_name(self, ser_elem: ET.Element, idx: int) -> str:
        """Extract series name.

        Returns the first non-blank ``c:v`` text found under the series' own
        ``c:tx`` child (covers both a literal value and a cached string
        reference), or ``"Series <idx + 1>"`` when there is none.
        """
        # Direct child only: a descendant search could pick up a c:tx that
        # belongs to a data label rather than to the series itself.
        tx_elem = ser_elem.find('c:tx', OOXML_NS)
        if tx_elem is not None:
            for v_elem in tx_elem.findall('.//c:v', OOXML_NS):
                name = (v_elem.text or '').strip()
                if name:
                    return name

        return f"Series {idx + 1}"

    def _extract_categories(self, ser_elem: ET.Element) -> List[str]:
        """Extract category labels.

        Reads the cached points under ``c:cat``: the string cache first,
        then the numeric cache. All labels are returned as strings.
        """
        cat_elem = ser_elem.find('.//c:cat', OOXML_NS)
        if cat_elem is None:
            return []

        for cache_tag in ('c:strCache', 'c:numCache'):
            cache = cat_elem.find(f'.//{cache_tag}', OOXML_NS)
            if cache is None:
                continue
            categories = self._extract_point_values(cache, as_string=True)
            if categories:
                return categories

        return []

    def _extract_values(self, ser_elem: ET.Element) -> List[Any]:
        """Extract series values.

        Reads the numeric cache under ``c:val``; if that yields nothing,
        under ``c:yVal`` (used by scatter/bubble charts).
        """
        for container_tag in ('c:val', 'c:yVal'):
            container = ser_elem.find(f'.//{container_tag}', OOXML_NS)
            if container is None:
                continue
            num_cache = container.find('.//c:numCache', OOXML_NS)
            if num_cache is None:
                continue
            values = self._extract_point_values(num_cache, as_string=False)
            if values:
                return values

        return []

    def _extract_point_values(self, cache_elem: ET.Element, as_string: bool = False) -> List[Any]:
        """Extract values from cache element.

        Args:
            cache_elem: A cache element containing ``c:pt`` points.
            as_string: When True keep every value as text; otherwise convert
                to ``float`` and keep the text only if conversion fails.

        Returns:
            Values ordered by the points' ``idx`` attribute. Points with no
            ``c:v`` text are skipped, so gaps are not padded.
        """
        values: List[Any] = []

        points = sorted(cache_elem.findall('.//c:pt', OOXML_NS), key=self._point_index)
        for pt in points:
            v_elem = pt.find('c:v', OOXML_NS)
            if v_elem is None or not v_elem.text:
                continue

            text = v_elem.text.strip()
            if as_string:
                values.append(text)
                continue

            try:
                values.append(float(text))
            except ValueError:
                values.append(text)

        return values

    # ========================================================================
    # Formatting
    # ========================================================================

    def _format_chart_data(self, chart_data: ChartData) -> str:
        """Format ChartData using ChartProcessor.

        Uses ``format_chart_data`` when the chart has data and
        ``format_chart_fallback`` (type and title only) otherwise.
        ``self._chart_processor`` is not set in this class; presumably the
        base class provides it.
        """
        if chart_data.has_data():
            return self._chart_processor.format_chart_data(
                chart_type=chart_data.chart_type,
                title=chart_data.title,
                categories=chart_data.categories,
                series=chart_data.series
            )
        else:
            return self._chart_processor.format_chart_fallback(
                chart_type=chart_data.chart_type,
                title=chart_data.title
            )
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
__all__ = ['DOCXChartExtractor']
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/docx_helper/docx_constants.py
|
|
2
|
+
"""
|
|
3
|
+
DOCX 상수 및 타입 정의
|
|
4
|
+
|
|
5
|
+
DOCX 문서 처리에 필요한 상수, Enum, 데이터클래스를 정의합니다.
|
|
6
|
+
- ElementType: 문서 요소 타입 (텍스트, 이미지, 테이블 등)
|
|
7
|
+
- DocxElement: 문서 요소 데이터 클래스
|
|
8
|
+
- NAMESPACES: OOXML 네임스페이스
|
|
9
|
+
- CHART_TYPE_MAP: 차트 타입 매핑
|
|
10
|
+
"""
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from enum import Enum
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# === 문서 요소 타입 정의 ===
|
|
16
|
+
|
|
17
|
+
class ElementType(Enum):
    """Type of a content element found in a document."""

    # Each member's value is its lowercase string identifier.
    TEXT = "text"
    IMAGE = "image"
    TABLE = "table"
    CHART = "chart"
    DIAGRAM = "diagram"
    PAGE_BREAK = "page_break"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class DocxElement:
    """Data class representing one element inside a document."""

    # Which kind of element this is.
    element_type: ElementType
    # The element's content as a string.
    content: str
    # Order of the element within the document.
    element_index: int
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# === OOXML 네임스페이스 ===
|
|
36
|
+
|
|
37
|
+
# OOXML namespace table: short prefix -> namespace URI.
NAMESPACES = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    'dgm': 'http://schemas.openxmlformats.org/drawingml/2006/diagram',
    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
}
|
|
48
|
+
|
|
49
|
+
# OOXML 차트 타입 맵핑
|
|
50
|
+
# Maps an OOXML chart-type element name to its Korean display label.
# The labels are runtime strings and are intentionally left in Korean.
CHART_TYPE_MAP = {
    'barChart': '막대 차트',
    'bar3DChart': '3D 막대 차트',
    'lineChart': '선 차트',
    'line3DChart': '3D 선 차트',
    'pieChart': '파이 차트',
    'pie3DChart': '3D 파이 차트',
    'doughnutChart': '도넛 차트',
    'areaChart': '영역 차트',
    'area3DChart': '3D 영역 차트',
    'scatterChart': '분산형 차트',
    'radarChart': '방사형 차트',
    'bubbleChart': '거품형 차트',
    'stockChart': '주식형 차트',
    'surfaceChart': '표면 차트',
    'surface3DChart': '3D 표면 차트',
    'ofPieChart': '분리형 파이 차트',
}
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
__all__ = [
|
|
71
|
+
'ElementType',
|
|
72
|
+
'DocxElement',
|
|
73
|
+
'NAMESPACES',
|
|
74
|
+
'CHART_TYPE_MAP',
|
|
75
|
+
]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
DOCXFileConverter - DOCX file format converter
|
|
4
|
+
|
|
5
|
+
Converts binary DOCX data to python-docx Document object.
|
|
6
|
+
"""
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Any, Optional, BinaryIO
|
|
9
|
+
import zipfile
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DOCXFileConverter(BaseFileConverter):
    """
    File converter for the DOCX format, backed by python-docx.

    Turns raw DOCX bytes (or an already-open stream) into a Document object.
    """

    # First four bytes expected at the start of the data (a DOCX is a ZIP file).
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Build a python-docx Document from DOCX content.

        Args:
            file_data: Raw binary DOCX data; only used when no stream is given
            file_stream: Optional file stream; when provided it takes
                precedence over ``file_data`` and is rewound to the start
            **kwargs: Additional options (not used by this method)

        Returns:
            docx.Document object

        Raises:
            Exception: If DOCX cannot be opened
        """
        # Imported here rather than at module level, as in the original.
        from docx import Document

        if file_stream is None:
            stream = BytesIO(file_data)
        else:
            stream = file_stream

        stream.seek(0)
        return Document(stream)

    def get_format_name(self) -> str:
        """Return the human-readable name of the handled format."""
        return "DOCX Document"

    def validate(self, file_data: bytes) -> bool:
        """
        Check whether the data looks like a DOCX file.

        The data must be at least four bytes long, start with the ZIP magic
        number, open as a ZIP archive, and contain a member named
        ``[Content_Types].xml``.

        Args:
            file_data: Raw binary file data

        Returns:
            True if file appears to be a DOCX
        """
        if not file_data or len(file_data) < 4:
            return False

        if file_data[:4] != self.ZIP_MAGIC:
            return False

        try:
            with zipfile.ZipFile(BytesIO(file_data), 'r') as archive:
                member_names = archive.namelist()
        except zipfile.BadZipFile:
            return False

        return '[Content_Types].xml' in member_names
|
|
76
|
+
|