xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HWPX Chart Extractor
|
|
3
|
+
|
|
4
|
+
Extracts chart data from HWPX files.
|
|
5
|
+
HWPX uses OOXML-based chart format similar to Office documents.
|
|
6
|
+
|
|
7
|
+
Provides:
|
|
8
|
+
- extract(): Single chart XML extraction
|
|
9
|
+
- extract_all_from_file(): Extract all charts from HWPX file
|
|
10
|
+
"""
|
|
11
|
+
import io
|
|
12
|
+
import logging
|
|
13
|
+
import xml.etree.ElementTree as ET
|
|
14
|
+
import zipfile
|
|
15
|
+
import zlib
|
|
16
|
+
from typing import Any, BinaryIO, Dict, List, Optional, Union
|
|
17
|
+
|
|
18
|
+
import olefile
|
|
19
|
+
|
|
20
|
+
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
|
|
21
|
+
|
|
22
|
+
# Module logger; the name is shared with the rest of the document processor.
logger = logging.getLogger("document-processor")

# Prefix -> namespace URI mapping passed to ElementTree find()/findall()
# so that chart XML can be queried with the short "c:" / "a:" prefixes.
OOXML_NS = {
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}

# OOXML chart element name -> human-readable chart type label.
# Insertion order matters: lookups walk this mapping and stop at the
# first element found in the plot area.
CHART_TYPE_MAP = dict([
    ('barChart', 'Bar Chart'),
    ('bar3DChart', '3D Bar Chart'),
    ('lineChart', 'Line Chart'),
    ('line3DChart', '3D Line Chart'),
    ('pieChart', 'Pie Chart'),
    ('pie3DChart', '3D Pie Chart'),
    ('doughnutChart', 'Doughnut Chart'),
    ('areaChart', 'Area Chart'),
    ('area3DChart', '3D Area Chart'),
    ('scatterChart', 'Scatter Chart'),
    ('bubbleChart', 'Bubble Chart'),
    ('radarChart', 'Radar Chart'),
])

# Eight-byte signature that marks the start of an OLE compound file.
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

# BinData entries with these (lower-cased) extensions are images and are
# skipped when looking for embedded OLE charts.
SKIP_IMAGE_EXTENSIONS = set((
    '.jpg',
    '.jpeg',
    '.png',
    '.bmp',
    '.gif',
    '.tif',
    '.tiff',
    '.wmf',
    '.emf',
))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HWPXChartExtractor(BaseChartExtractor):
    """
    Chart extractor for HWPX files.

    An HWPX file is opened as a ZIP archive. Charts are looked for in two
    places inside it:
    - OOXML chart XML parts under Chart/, Charts/, or Contents/Charts/
    - OLE compound objects stored under BinData/
    """

    # Archive path prefixes that may contain OOXML chart parts.
    _CHART_DIR_PREFIXES = ('Chart/', 'Contents/Charts/', 'Charts/')

    # The OLE signature is accepted only if it starts within this many
    # leading bytes of a BinData payload (start offsets 0..15).
    _OLE_MAGIC_SCAN_LIMIT = 16

    # ========================================================================
    # Main Interface
    # ========================================================================

    def extract(self, chart_element: Any) -> ChartData:
        """
        Extract chart data from HWPX chart XML or OLE data.

        Args:
            chart_element: Chart XML (bytes or str) or OLE bytes taken from
                the HWPX archive. Any other type, or an empty value, yields
                an empty ChartData.

        Returns:
            ChartData with extracted information. Bytes input is first
            parsed as OOXML chart XML; if that produces no data it is
            parsed as an OLE compound file instead.
        """
        if not chart_element:
            return ChartData()

        if isinstance(chart_element, bytes):
            # OOXML is tried first; OLE is the fallback.
            result = self._parse_chart_xml(chart_element)
            if result.has_data():
                return result
            return self._extract_from_ole(chart_element)

        if isinstance(chart_element, str):
            return self._parse_chart_xml(chart_element.encode('utf-8'))

        return ChartData()

    def extract_all_from_file(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> List[ChartData]:
        """
        Extract all charts from an HWPX file.

        Args:
            file_source: File path, bytes, or file-like object

        Returns:
            List of ChartData for all charts in the file: OOXML chart parts
            first, then OLE charts from BinData. Charts whose title and
            series match an already collected chart are skipped. Any error
            while opening or reading the archive is logged and whatever
            was collected so far is returned.
        """
        charts: List[ChartData] = []
        processed_hashes: set = set()

        try:
            with zipfile.ZipFile(self._open_source(file_source), 'r') as zf:
                namelist = zf.namelist()

                # 1. Extract OOXML charts
                charts.extend(self._extract_ooxml_charts(zf, namelist, processed_hashes))

                # 2. Extract OLE charts from BinData
                charts.extend(self._extract_ole_charts(zf, namelist, processed_hashes))

            logger.info(f"Extracted {len(charts)} charts from HWPX file")

        except Exception as e:
            logger.error(f"Error extracting charts from HWPX: {e}")

        return charts

    def extract_all_with_refs(
        self,
        file_source: Union[str, bytes, BinaryIO]
    ) -> Dict[str, ChartData]:
        """
        Extract all charts from an HWPX file with their chartIDRefs.

        This method returns a dictionary mapping chartIDRef (e.g., "Chart/chart1.xml")
        to ChartData, allowing for inline chart processing in document order.
        Only OOXML chart parts are mapped; OLE charts in BinData are not.

        Args:
            file_source: File path, bytes, or file-like object

        Returns:
            Dictionary mapping chartIDRef -> ChartData
        """
        chart_map: Dict[str, ChartData] = {}
        processed_hashes: set = set()

        try:
            with zipfile.ZipFile(self._open_source(file_source), 'r') as zf:
                # NOTE(review): duplicates (same title and series) are dropped
                # here too, so a second chart part with identical content has
                # no entry in the map. Confirm callers tolerate a missing ref.
                for chart_file, chart_data in self._iter_ooxml_charts(
                    zf, zf.namelist(), processed_hashes
                ):
                    # Map by chartIDRef (e.g., "Chart/chart1.xml")
                    chart_map[chart_file] = chart_data
                    logger.debug(f"Mapped chart: {chart_file}")

            logger.info(f"Extracted {len(chart_map)} charts with refs from HWPX file")

        except Exception as e:
            logger.error(f"Error extracting charts from HWPX: {e}")

        return chart_map

    # ========================================================================
    # OOXML Parsing
    # ========================================================================

    def _parse_chart_xml(self, chart_xml: bytes) -> ChartData:
        """
        Parse OOXML chart XML.

        Args:
            chart_xml: Serialized chart XML.

        Returns:
            ChartData built from the first ``c:chart`` element (or from the
            root when the root itself is a ``chart`` element). An empty
            ChartData is returned when no chart element exists or when
            parsing fails for any reason; failures are logged at debug level.
        """
        try:
            # NOTE: xml.etree.ElementTree is documented as not hardened
            # against maliciously constructed XML. If HWPX files come from
            # untrusted sources, a hardened parser should be considered.
            root = ET.fromstring(chart_xml)

            # find() only searches below the root, so a document whose root
            # is the chart element itself needs the explicit tag check.
            chart_elem = root.find('.//c:chart', OOXML_NS)
            if chart_elem is None:
                if root.tag.endswith('}chart') or root.tag == 'chart':
                    chart_elem = root
                else:
                    return ChartData()

            title = self._extract_title(chart_elem)
            chart_type, categories, series = self._extract_plot_data(chart_elem)

            return ChartData(
                chart_type=chart_type,
                title=title,
                categories=categories,
                series=series
            )

        except Exception as e:
            logger.debug(f"Error parsing HWPX chart: {e}")
            return ChartData()

    def _extract_title(self, chart_elem) -> Optional[str]:
        """
        Extract chart title.

        Only the ``c:title`` that is a direct child of the chart element is
        used, so axis titles nested in the plot area are never mistaken for
        the chart title. All text runs of a paragraph are concatenated and
        paragraphs are joined with a single space.

        Returns:
            The title text, or None when the chart has no rich-text title.
        """
        title_elem = chart_elem.find('c:title', OOXML_NS)
        if title_elem is None:
            return None

        paragraphs = []
        for para in title_elem.iterfind('.//c:rich/a:p', OOXML_NS):
            text = ''.join(
                run.text or '' for run in para.iterfind('.//a:t', OOXML_NS)
            ).strip()
            if text:
                paragraphs.append(text)

        return ' '.join(paragraphs) or None

    def _extract_plot_data(self, chart_elem) -> tuple:
        """
        Extract chart type, categories, and series.

        The plot area is probed for each element name in CHART_TYPE_MAP, in
        mapping order; the first one present decides the chart type and is
        the only one whose series are read.

        Returns:
            Tuple ``(chart_type, categories, series)``. ``chart_type`` is
            "Chart" when no known chart element is found.
        """
        plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)
        if plot_area is None:
            return "Chart", [], []

        chart_type = "Chart"
        categories = []
        series = []

        for chart_tag, type_name in CHART_TYPE_MAP.items():
            elem = plot_area.find(f'.//c:{chart_tag}', OOXML_NS)
            if elem is not None:
                chart_type = type_name
                categories, series = self._extract_series_data(elem)
                break

        return chart_type, categories, series

    def _extract_series_data(self, chart_type_elem) -> tuple:
        """
        Extract series and categories from chart type element.

        Categories come from the first series that has a ``c:cat`` (or, for
        scatter/bubble style series, ``c:xVal``) element. Values come from
        ``c:val`` or, failing that, ``c:yVal``. Series without any cached
        value are omitted.

        Returns:
            Tuple ``(categories, series)`` where ``series`` is a list of
            ``{'name': str, 'values': list}`` dictionaries.
        """
        categories = []
        series = []
        categories_extracted = False

        for idx, ser_elem in enumerate(chart_type_elem.findall('.//c:ser', OOXML_NS)):
            # Default name is positional; a cached series name overrides it
            # unless it is blank after stripping.
            name = f"Series {idx + 1}"
            tx_elem = ser_elem.find('.//c:tx//c:v', OOXML_NS)
            if tx_elem is not None and tx_elem.text:
                name = tx_elem.text.strip() or name

            if not categories_extracted:
                cat_elem = self._find_first(ser_elem, ('.//c:cat', './/c:xVal'))
                if cat_elem is not None:
                    # Prefer the string cache; fall back to the textual form
                    # of a numeric cache (e.g. years used as categories).
                    categories = (
                        self._extract_string_cache(cat_elem)
                        or self._cached_texts(cat_elem, 'c:numCache')
                    )
                    categories_extracted = True

            val_elem = self._find_first(ser_elem, ('.//c:val', './/c:yVal'))
            values = self._extract_num_cache(val_elem) if val_elem is not None else []

            if values:
                series.append({'name': name, 'values': values})

        return categories, series

    def _extract_string_cache(self, cat_elem) -> List[str]:
        """Extract string cache values, ordered by point index."""
        return self._cached_texts(cat_elem, 'c:strCache')

    def _extract_num_cache(self, val_elem) -> List[Any]:
        """
        Extract numeric cache values, ordered by point index.

        Each value is converted to float; text that cannot be converted is
        kept as the original string.
        """
        num_cache = val_elem.find('.//c:numCache', OOXML_NS)
        if num_cache is None:
            return []

        values = []
        for pt in self._sorted_points(num_cache):
            v = pt.find('c:v', OOXML_NS)
            if v is None or not v.text:
                continue
            try:
                values.append(float(v.text))
            except ValueError:
                values.append(v.text)
        return values

    def _cached_texts(self, ref_elem, cache_tag: str) -> List[str]:
        """Return the stripped, non-empty point texts of the first *cache_tag* below *ref_elem*."""
        cache = ref_elem.find(f'.//{cache_tag}', OOXML_NS)
        if cache is None:
            return []

        texts = []
        for pt in self._sorted_points(cache):
            v = pt.find('c:v', OOXML_NS)
            if v is not None and v.text:
                texts.append(v.text.strip())
        return texts

    @staticmethod
    def _sorted_points(cache_elem) -> list:
        """Return the ``c:pt`` elements of a cache sorted by their ``idx`` attribute."""
        def point_index(pt) -> int:
            # A malformed idx must not discard the whole chart; treat it as 0.
            try:
                return int(pt.get('idx', 0))
            except (TypeError, ValueError):
                return 0

        return sorted(cache_elem.findall('.//c:pt', OOXML_NS), key=point_index)

    @staticmethod
    def _find_first(parent, paths):
        """Return the first element matched by any of *paths* (tried in order), or None."""
        for path in paths:
            found = parent.find(path, OOXML_NS)
            if found is not None:
                return found
        return None

    # ========================================================================
    # File-Level Extraction Helpers
    # ========================================================================

    @staticmethod
    def _open_source(file_source: Union[str, bytes, BinaryIO]) -> Union[str, BinaryIO]:
        """
        Turn *file_source* into something ``zipfile.ZipFile`` can open.

        A path string is returned unchanged, bytes are wrapped in a BytesIO,
        and any other object is treated as a seekable binary stream that is
        rewound to its start.
        """
        if isinstance(file_source, str):
            return file_source
        if isinstance(file_source, bytes):
            return io.BytesIO(file_source)
        file_source.seek(0)
        return file_source

    @staticmethod
    def _register_chart(chart_data: ChartData, processed_hashes: set) -> bool:
        """
        Decide whether *chart_data* should be kept.

        Returns True, and records the chart in *processed_hashes*, when the
        chart has data and no chart with the same title and series has been
        seen before.
        """
        if not chart_data.has_data():
            return False

        chart_hash = f"{chart_data.title}|{chart_data.series}"
        if chart_hash in processed_hashes:
            return False

        processed_hashes.add(chart_hash)
        return True

    def _iter_ooxml_charts(
        self,
        zf: zipfile.ZipFile,
        namelist: List[str],
        processed_hashes: set
    ):
        """
        Yield ``(archive_name, ChartData)`` for each unique OOXML chart part.

        Chart parts are ``.xml`` entries under one of the known chart
        directories, visited in sorted name order. Entries that cannot be
        read are logged at debug level and skipped.
        """
        chart_files = sorted(
            name for name in namelist
            if name.endswith('.xml') and name.startswith(self._CHART_DIR_PREFIXES)
        )

        for chart_file in chart_files:
            try:
                chart_data = self._parse_chart_xml(zf.read(chart_file))
                keep = self._register_chart(chart_data, processed_hashes)
            except Exception as e:
                logger.debug(f"Error reading chart file {chart_file}: {e}")
                continue

            if keep:
                yield chart_file, chart_data

    def _extract_ooxml_charts(
        self,
        zf: zipfile.ZipFile,
        namelist: List[str],
        processed_hashes: set
    ) -> List[ChartData]:
        """Extract OOXML charts from ZIP archive."""
        charts = []
        for chart_file, chart_data in self._iter_ooxml_charts(zf, namelist, processed_hashes):
            charts.append(chart_data)
            logger.debug(f"Extracted chart from: {chart_file}")
        return charts

    def _extract_ole_charts(
        self,
        zf: zipfile.ZipFile,
        namelist: List[str],
        processed_hashes: set
    ) -> List[ChartData]:
        """
        Extract OLE charts from BinData directory.

        Every non-directory entry under ``BinData/`` whose extension is not
        a known image type is read, optionally inflated, and parsed as an
        OLE compound file. Unreadable entries are logged at debug level and
        skipped.
        """
        # Function-scope import, done once per call rather than once per entry.
        import os

        charts = []

        for bindata_file in namelist:
            if not bindata_file.startswith('BinData/') or bindata_file.endswith('/'):
                continue

            ext = os.path.splitext(bindata_file)[1].lower()
            if ext in SKIP_IMAGE_EXTENSIONS:
                continue

            try:
                data = self._maybe_inflate(zf.read(bindata_file))
                chart_data = self._extract_from_ole(data)

                if self._register_chart(chart_data, processed_hashes):
                    charts.append(chart_data)
                    logger.debug(f"Extracted OLE chart from: {bindata_file}")

            except Exception as e:
                logger.debug(f"Error reading bindata file {bindata_file}: {e}")

        return charts

    @staticmethod
    def _maybe_inflate(data: bytes) -> bytes:
        """
        Return *data* decompressed if it is a deflate stream, else unchanged.

        A raw deflate stream (wbits=-15) is tried first, then a
        zlib-wrapped stream (wbits=15, the zlib default). Only ``zlib.error``
        is treated as "not compressed"; other exceptions propagate.
        """
        for wbits in (-15, 15):
            try:
                return zlib.decompress(data, wbits)
            except zlib.error:
                continue
        return data

    def _extract_from_ole(self, ole_data: bytes) -> ChartData:
        """
        Extract chart from OLE compound file.

        The OLE signature must start within the first 16 bytes of
        *ole_data*; everything before it is discarded. The
        ``OOXMLChartContents`` stream is preferred; otherwise the
        ``Contents`` stream is used if it parses as chart XML with data.

        Returns:
            The parsed ChartData, or an empty ChartData when the payload is
            too short, carries no OLE signature, has no usable stream, or
            cannot be read (logged at debug level).
        """
        if len(ole_data) < 12:
            return ChartData()

        # Lowest start offset in 0..15 at which the signature begins.
        scan_end = self._OLE_MAGIC_SCAN_LIMIT - 1 + len(OLE_MAGIC)
        offset = ole_data.find(OLE_MAGIC, 0, scan_end)
        if offset < 0:
            return ChartData()

        try:
            ole = olefile.OleFileIO(io.BytesIO(ole_data[offset:]))

            try:
                # Try OOXML format first
                if ole.exists('OOXMLChartContents'):
                    ooxml_data = ole.openstream('OOXMLChartContents').read()
                    return self._parse_chart_xml(ooxml_data)

                # Try Contents stream
                if ole.exists('Contents'):
                    contents_data = ole.openstream('Contents').read()
                    result = self._parse_chart_xml(contents_data)
                    if result.has_data():
                        return result

                return ChartData()

            finally:
                ole.close()

        except Exception as e:
            logger.debug(f"Error extracting chart from OLE: {e}")
            return ChartData()


__all__ = ['HWPXChartExtractor']
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# hwpx_helper/hwpx_constants.py
"""
HWPX handler constants and namespace definitions.

Defines the constants and XML namespaces needed to process HWPX
(ZIP/XML-based Hangul) documents.
"""

# HWPX XML namespaces (prefix -> namespace URI)
HWPX_NAMESPACES = {
    'hp': 'http://www.hancom.co.kr/hwpml/2011/paragraph',
    'hc': 'http://www.hancom.co.kr/hwpml/2011/core',
    'hh': 'http://www.hancom.co.kr/hwpml/2011/head',
}

# OPF namespace (used when parsing content.hpf)
OPF_NAMESPACES = {'opf': 'http://www.idpf.org/2007/opf/'}

# Supported image extensions
SUPPORTED_IMAGE_EXTENSIONS = [
    '.jpg',
    '.jpeg',
    '.png',
    '.bmp',
    '.gif',
]

# Image extensions to skip (during chart extraction)
SKIP_IMAGE_EXTENSIONS = [
    '.jpg',
    '.jpeg',
    '.png',
    '.bmp',
    '.gif',
    '.tif',
    '.tiff',
    '.wmf',
    '.emf',
]

# Candidate paths for the HWPX metadata (header) file
HEADER_FILE_PATHS = [
    'Contents/header.xml',
    'header.xml',
]

# Path of the HWPX content file
HPF_PATH = "Contents/content.hpf"
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
HWPXFileConverter - HWPX file format converter
|
|
4
|
+
|
|
5
|
+
Converts binary HWPX data to ZipFile object.
|
|
6
|
+
"""
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Any, Optional, BinaryIO
|
|
9
|
+
import zipfile
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HWPXFileConverter(BaseFileConverter):
    """
    File converter for HWPX documents.

    Turns the raw bytes (or an already open stream) of an HWPX file, which
    is a ZIP archive, into a ``zipfile.ZipFile`` opened for reading.
    """

    # Signature expected in the first four bytes of the data.
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> zipfile.ZipFile:
        """
        Open HWPX data as a ZipFile.

        Args:
            file_data: Raw binary HWPX data; used only when no stream is given
            file_stream: Optional file stream; takes precedence over
                ``file_data`` and is rewound to its start before use
            **kwargs: Additional options (not used here)

        Returns:
            zipfile.ZipFile opened in read mode
        """
        if file_stream is None:
            source = BytesIO(file_data)
        else:
            source = file_stream

        source.seek(0)
        return zipfile.ZipFile(source, 'r')

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "HWPX Document (ZIP/XML)"

    def validate(self, file_data: bytes) -> bool:
        """
        Check whether the data looks like a usable ZIP archive.

        The data must be at least four bytes long, start with ``ZIP_MAGIC``,
        open as a ZIP archive, and contain at least one entry. No
        HWPX-specific structure is checked.
        """
        too_short = not file_data or len(file_data) < 4
        if too_short or file_data[:4] != self.ZIP_MAGIC:
            return False

        try:
            with zipfile.ZipFile(BytesIO(file_data), 'r') as archive:
                entry_count = len(archive.namelist())
                return entry_count > 0
        except zipfile.BadZipFile:
            return False

    def close(self, converted_object: Any) -> None:
        """Close the converted object if it is not None and has a ``close`` attribute."""
        if converted_object is None:
            return
        if not hasattr(converted_object, 'close'):
            return
        converted_object.close()
|
|
70
|
+
|