xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,464 @@
1
+ """
2
+ HWPX Chart Extractor
3
+
4
+ Extracts chart data from HWPX files.
5
+ HWPX uses OOXML-based chart format similar to Office documents.
6
+
7
+ Provides:
8
+ - extract(): Single chart XML extraction
9
+ - extract_all_from_file(): Extract all charts from HWPX file
10
+ """
11
+ import io
12
+ import logging
13
+ import xml.etree.ElementTree as ET
14
+ import zipfile
15
+ import zlib
16
+ from typing import Any, BinaryIO, Dict, List, Optional, Union
17
+
18
+ import olefile
19
+
20
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
21
+
22
logger = logging.getLogger("document-processor")

# Namespace prefixes used by the ElementTree path queries in this module:
# 'c' is the DrawingML chart namespace, 'a' the DrawingML main namespace.
OOXML_NS = {
    'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
}

# Maps the chart-type element name found under c:plotArea to a display name.
# The lookup iterates this mapping in insertion order and stops at the first
# element present, so new entries are appended at the end to keep the
# precedence of the existing ones unchanged.
CHART_TYPE_MAP = {
    'barChart': 'Bar Chart',
    'bar3DChart': '3D Bar Chart',
    'lineChart': 'Line Chart',
    'line3DChart': '3D Line Chart',
    'pieChart': 'Pie Chart',
    'pie3DChart': '3D Pie Chart',
    'doughnutChart': 'Doughnut Chart',
    'areaChart': 'Area Chart',
    'area3DChart': '3D Area Chart',
    'scatterChart': 'Scatter Chart',
    'bubbleChart': 'Bubble Chart',
    'radarChart': 'Radar Chart',
    # Remaining chart-type elements defined by the DrawingML chart schema.
    # Without these entries such charts were reported as having no data.
    'stockChart': 'Stock Chart',
    'surfaceChart': 'Surface Chart',
    'surface3DChart': '3D Surface Chart',
    'ofPieChart': 'Pie/Bar of Pie Chart',
}

# Signature bytes at the start of an OLE compound file
OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

# BinData members with these extensions are images and are never probed
# for an embedded chart.
SKIP_IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff', '.wmf', '.emf'}
51
+
52
+
53
+ class HWPXChartExtractor(BaseChartExtractor):
54
+ """
55
+ Chart extractor for HWPX files.
56
+
57
+ HWPX is the Open Document format for Hangul.
58
+ Charts are stored as:
59
+ - OOXML XML in Chart/, Charts/, or Contents/Charts/ directory
60
+ - OLE objects in BinData/ directory
61
+ """
62
+
63
+ # ========================================================================
64
+ # Main Interface
65
+ # ========================================================================
66
+
67
def extract(self, chart_element: Any) -> ChartData:
    """
    Build a ChartData from a single chart payload.

    Args:
        chart_element: Chart payload. ``bytes`` are parsed as OOXML chart
            XML first and, when that yields no data, as an OLE compound
            file. ``str`` is UTF-8 encoded and parsed as OOXML chart XML
            only.

    Returns:
        The parsed ChartData, or an empty ChartData for falsy input and
        for any other input type.
    """
    if not chart_element:
        return ChartData()

    if isinstance(chart_element, str):
        return self._parse_chart_xml(chart_element.encode('utf-8'))

    if not isinstance(chart_element, bytes):
        return ChartData()

    # XML is attempted first; the OLE reader is only the fallback.
    parsed = self._parse_chart_xml(chart_element)
    if parsed.has_data():
        return parsed
    return self._extract_from_ole(chart_element)
91
+
92
def extract_all_from_file(
    self,
    file_source: Union[str, bytes, BinaryIO]
) -> List[ChartData]:
    """
    Collect every chart found in an HWPX archive.

    OOXML chart members are gathered first, then OLE objects under
    BinData/. Both passes share one set of already-seen chart signatures.

    Args:
        file_source: File path, raw archive bytes, or a seekable binary
            file-like object (it is rewound to position 0 before reading).

    Returns:
        List of ChartData. Any exception is logged and whatever had been
        collected up to that point is returned.
    """
    found = []
    seen_signatures = set()

    try:
        # Normalise the three accepted input kinds to a binary stream.
        if isinstance(file_source, bytes):
            archive_stream = io.BytesIO(file_source)
        elif isinstance(file_source, str):
            with open(file_source, 'rb') as handle:
                archive_stream = io.BytesIO(handle.read())
        else:
            file_source.seek(0)
            archive_stream = file_source

        with zipfile.ZipFile(archive_stream, 'r') as archive:
            members = archive.namelist()

            # 1. OOXML chart XML members
            found.extend(self._extract_ooxml_charts(archive, members, seen_signatures))

            # 2. OLE objects stored under BinData/
            found.extend(self._extract_ole_charts(archive, members, seen_signatures))

        logger.info(f"Extracted {len(found)} charts from HWPX file")

    except Exception as e:
        logger.error(f"Error extracting charts from HWPX: {e}")

    return found
134
+
135
def extract_all_with_refs(
    self,
    file_source: Union[str, bytes, BinaryIO]
) -> Dict[str, ChartData]:
    """
    Collect the OOXML charts of an HWPX archive keyed by archive member name.

    The key is the member path (e.g. "Chart/chart1.xml"), which is what the
    original documentation calls the chartIDRef, so charts can be placed
    inline in document order.

    Args:
        file_source: File path, raw archive bytes, or a seekable binary
            file-like object (it is rewound to position 0 before reading).

    Returns:
        Dictionary mapping member path -> ChartData. Any exception is
        logged and the entries gathered so far are returned.
    """
    refs_to_charts: Dict[str, ChartData] = {}
    seen_signatures = set()
    chart_dirs = ('Chart/', 'Contents/Charts/', 'Charts/')

    try:
        # Normalise the three accepted input kinds to a binary stream.
        if isinstance(file_source, bytes):
            archive_stream = io.BytesIO(file_source)
        elif isinstance(file_source, str):
            with open(file_source, 'rb') as handle:
                archive_stream = io.BytesIO(handle.read())
        else:
            file_source.seek(0)
            archive_stream = file_source

        with zipfile.ZipFile(archive_stream, 'r') as archive:
            members = archive.namelist()
            candidates = sorted(
                name for name in members
                if name.endswith('.xml') and name.startswith(chart_dirs)
            )

            for member in candidates:
                try:
                    with archive.open(member) as fh:
                        payload = fh.read()

                    parsed = self._parse_chart_xml(payload)
                    if not parsed.has_data():
                        continue

                    # NOTE(review): a chart whose title and series equal an
                    # earlier one is skipped, so its member path gets no
                    # entry in the map - confirm callers expect that.
                    signature = f"{parsed.title}|{parsed.series}"
                    if signature in seen_signatures:
                        continue
                    seen_signatures.add(signature)

                    refs_to_charts[member] = parsed
                    logger.debug(f"Mapped chart: {member}")

                except Exception as e:
                    logger.debug(f"Error reading chart file {member}: {e}")

        logger.info(f"Extracted {len(refs_to_charts)} charts with refs from HWPX file")

    except Exception as e:
        logger.error(f"Error extracting charts from HWPX: {e}")

    return refs_to_charts
203
+
204
def _parse_chart_xml(self, chart_xml: bytes) -> ChartData:
    """
    Parse OOXML chart XML into a ChartData.

    Any exception raised while parsing is logged at debug level and an
    empty ChartData is returned instead.
    """
    try:
        root = ET.fromstring(chart_xml)

        # Locate the c:chart element: by prefix, then by fully qualified
        # tag, and finally accept the root itself when it is the chart.
        chart_node = root.find('.//c:chart', OOXML_NS)
        if chart_node is None:
            chart_node = root.find('.//{http://schemas.openxmlformats.org/drawingml/2006/chart}chart')
        if chart_node is None:
            root_is_chart = root.tag == 'chart' or root.tag.endswith('}chart')
            if not root_is_chart:
                return ChartData()
            chart_node = root

        chart_title = self._extract_title(chart_node)
        kind, category_labels, series_list = self._extract_plot_data(chart_node)

        return ChartData(
            chart_type=kind,
            title=chart_title,
            categories=category_labels,
            series=series_list
        )

    except Exception as e:
        logger.debug(f"Error parsing HWPX chart: {e}")
        return ChartData()
235
+
236
def _extract_title(self, chart_elem) -> Optional[str]:
    """
    Extract the chart's own title text.

    Only the c:title that is a direct child of ``chart_elem`` is used.
    Axis titles are nested inside c:plotArea, and a descendant search
    would report one of them as the chart title when the chart itself has
    none.

    The text of every a:t run is concatenated, so titles split across
    several formatting runs are returned in full. Multiple paragraphs are
    joined with a single space.

    Args:
        chart_elem: The c:chart element.

    Returns:
        The stripped title text, or None when there is no title text.
    """
    title_elem = chart_elem.find('c:title', OOXML_NS)
    if title_elem is None:
        return None

    paragraphs = []
    for para in title_elem.iterfind('.//c:tx//c:rich//a:p', OOXML_NS):
        text = ''.join(run.text or '' for run in para.iterfind('.//a:t', OOXML_NS)).strip()
        if text:
            paragraphs.append(text)

    return ' '.join(paragraphs) or None
242
+
243
def _extract_plot_data(self, chart_elem) -> tuple:
    """
    Determine the chart type and pull its categories and series.

    CHART_TYPE_MAP is walked in order and the first chart-type element
    present under c:plotArea is the only one read.

    Returns:
        Tuple ``(chart_type, categories, series)``. It is
        ``("Chart", [], [])`` when there is no plot area or no known
        chart-type element.
    """
    plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)

    if plot_area is not None:
        for tag, label in CHART_TYPE_MAP.items():
            type_node = plot_area.find(f'.//c:{tag}', OOXML_NS)
            if type_node is None:
                continue
            category_labels, series_list = self._extract_series_data(type_node)
            return label, category_labels, series_list

    return "Chart", [], []
261
+
262
def _extract_series_data(self, chart_type_elem) -> tuple:
    """
    Extract categories and series from a chart-type element.

    Each c:ser contributes one ``{'name', 'values'}`` dict. A series whose
    value list is empty is skipped. Categories are taken from the first
    series that carries a category element.

    Two cases are handled beyond plain c:cat / c:val:
    - Scatter and bubble series store their data in c:xVal / c:yVal, so
      those are used when c:cat / c:val are absent.
    - Data elements are looked up as direct children of c:ser first. A
      descendant search can hit the c:val nested in c:errBars, which has
      no number cache and would make the series look empty.

    Args:
        chart_type_elem: A chart-type element such as c:barChart.

    Returns:
        Tuple ``(categories, series)``.
    """
    def find_data(ser_elem, *tags):
        # Prefer direct children; fall back to any descendant so unusual
        # nesting still resolves as it did before.
        for pattern in ('c:{}', './/c:{}'):
            for tag in tags:
                node = ser_elem.find(pattern.format(tag), OOXML_NS)
                if node is not None:
                    return node
        return None

    categories = []
    series = []
    categories_extracted = False

    for idx, ser_elem in enumerate(chart_type_elem.findall('.//c:ser', OOXML_NS)):
        # Series name, defaulting to a positional label
        name = f"Series {idx + 1}"
        tx_elem = ser_elem.find('.//c:tx//c:v', OOXML_NS)
        if tx_elem is not None and tx_elem.text:
            name = tx_elem.text.strip()

        # Categories come from the first series that has them
        if not categories_extracted:
            cat_elem = find_data(ser_elem, 'cat', 'xVal')
            if cat_elem is not None:
                categories = self._extract_string_cache(cat_elem)
                categories_extracted = True

        # Values
        val_elem = find_data(ser_elem, 'val', 'yVal')
        values = self._extract_num_cache(val_elem) if val_elem is not None else []

        if values:
            series.append({'name': name, 'values': values})

    return categories, series
294
+
295
def _extract_string_cache(self, cat_elem) -> List[str]:
    """
    Extract cached category labels as strings, ordered by point index.

    c:strCache is preferred. When it is absent, c:numCache is read
    instead so numeric categories (for example years) are returned as
    their text rather than being dropped.

    Points without a c:v text are skipped. A missing or non-integer
    ``idx`` attribute sorts as 0 instead of raising.

    Args:
        cat_elem: Category element holding the cache (e.g. c:cat).

    Returns:
        List of stripped label strings; empty when no cache is present.
    """
    def point_index(pt) -> int:
        try:
            return int(pt.get('idx', 0))
        except (TypeError, ValueError):
            return 0

    cache = cat_elem.find('.//c:strCache', OOXML_NS)
    if cache is None:
        cache = cat_elem.find('.//c:numCache', OOXML_NS)
    if cache is None:
        return []

    values = []
    for pt in sorted(cache.findall('.//c:pt', OOXML_NS), key=point_index):
        v = pt.find('c:v', OOXML_NS)
        if v is not None and v.text:
            values.append(v.text.strip())
    return values
306
+
307
def _extract_num_cache(self, val_elem) -> List[Any]:
    """
    Extract cached numeric values, ordered by point index.

    Each c:v text is converted with ``float``; text that does not convert
    is kept as the original string. Points without a c:v text are skipped.

    Returns:
        List of values; empty when there is no c:numCache.
    """
    num_cache = val_elem.find('.//c:numCache', OOXML_NS)
    if num_cache is None:
        return []

    def to_number(raw):
        try:
            return float(raw)
        except ValueError:
            return raw

    ordered_points = sorted(
        num_cache.findall('.//c:pt', OOXML_NS),
        key=lambda point: int(point.get('idx', 0)),
    )

    results = []
    for point in ordered_points:
        value_node = point.find('c:v', OOXML_NS)
        raw = None if value_node is None else value_node.text
        if raw:
            results.append(to_number(raw))
    return results
321
+
322
+ # ========================================================================
323
+ # File-Level Extraction Helpers
324
+ # ========================================================================
325
+
326
def _extract_ooxml_charts(
    self,
    zf: zipfile.ZipFile,
    namelist: List[str],
    processed_hashes: set
) -> List[ChartData]:
    """
    Collect charts stored as OOXML XML members of the archive.

    Members ending in ``.xml`` under Chart/, Contents/Charts/ or Charts/
    are read in sorted order.

    Args:
        zf: Open ZIP archive.
        namelist: Member names to consider.
        processed_hashes: "title|series" signatures already emitted. A
            chart whose signature is present is skipped, and new
            signatures are added in place.

    Returns:
        List of ChartData that contain data. A member that fails to read
        is logged at debug level and skipped.
    """
    chart_dirs = ('Chart/', 'Contents/Charts/', 'Charts/')
    candidates = sorted(
        name for name in namelist
        if name.endswith('.xml') and name.startswith(chart_dirs)
    )

    collected = []
    for member in candidates:
        try:
            with zf.open(member) as fh:
                payload = fh.read()

            parsed = self._parse_chart_xml(payload)
            if not parsed.has_data():
                continue

            signature = f"{parsed.title}|{parsed.series}"
            if signature in processed_hashes:
                continue
            processed_hashes.add(signature)

            collected.append(parsed)
            logger.debug(f"Extracted chart from: {member}")

        except Exception as e:
            logger.debug(f"Error reading chart file {member}: {e}")

    return collected
363
+
364
def _extract_ole_charts(
    self,
    zf: zipfile.ZipFile,
    namelist: List[str],
    processed_hashes: set
) -> List[ChartData]:
    """
    Collect charts embedded as OLE objects under BinData/.

    Image members (see SKIP_IMAGE_EXTENSIONS) are skipped. Each remaining
    member is read, decompressed when it turns out to be deflate data, and
    handed to the OLE reader.

    Args:
        zf: Open ZIP archive.
        namelist: Member names to consider.
        processed_hashes: "title|series" signatures already emitted. A
            chart whose signature is present is skipped, and new
            signatures are added in place.

    Returns:
        List of ChartData that contain data. A member that fails to read
        is logged at debug level and skipped.
    """
    # Imported once per call rather than once per loop iteration; 'os' is
    # not imported at module level in this file.
    import os

    charts = []

    for bindata_file in namelist:
        if not bindata_file.startswith('BinData/') or bindata_file.endswith('/'):
            continue

        ext = os.path.splitext(bindata_file)[1].lower()
        if ext in SKIP_IMAGE_EXTENSIONS:
            continue

        try:
            with zf.open(bindata_file) as f:
                data = f.read()

            # The payload may be raw deflate, zlib-wrapped, or not
            # compressed at all. Try both decoders in that order and keep
            # the bytes unchanged when neither accepts them. Only
            # zlib.error is treated as "not this format", so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            for wbits in (-15, zlib.MAX_WBITS):
                try:
                    data = zlib.decompress(data, wbits)
                    break
                except zlib.error:
                    continue

            chart_data = self._extract_from_ole(data)
            if not chart_data.has_data():
                continue

            # Duplicate check
            chart_hash = f"{chart_data.title}|{chart_data.series}"
            if chart_hash in processed_hashes:
                continue
            processed_hashes.add(chart_hash)

            charts.append(chart_data)
            logger.debug(f"Extracted OLE chart from: {bindata_file}")

        except Exception as e:
            logger.debug(f"Error reading bindata file {bindata_file}: {e}")

    return charts
414
+
415
def _extract_from_ole(self, ole_data: bytes) -> ChartData:
    """
    Extract a chart from an OLE compound file.

    The OLE signature is searched at offsets 0-15, because the payload
    may carry a short prefix before the compound file. The
    'OOXMLChartContents' stream is parsed first. When it is missing or
    yields no data, the 'Contents' stream is tried as OOXML as well.

    Args:
        ole_data: Raw bytes that may contain an OLE compound file.

    Returns:
        The first parse result that has data. Otherwise the (data-less)
        'OOXMLChartContents' result when that stream existed, else an
        empty ChartData. Inputs shorter than 12 bytes, inputs without the
        signature, and any exception also yield an empty ChartData.
    """
    if len(ole_data) < 12:
        return ChartData()

    # A match must start at offset 0..15, so the search window ends at
    # 15 + signature length.
    offset = ole_data.find(OLE_MAGIC, 0, 15 + len(OLE_MAGIC))
    if offset < 0:
        return ChartData()

    try:
        ole = olefile.OleFileIO(io.BytesIO(ole_data[offset:]))

        try:
            primary = None
            if ole.exists('OOXMLChartContents'):
                primary = self._parse_chart_xml(ole.openstream('OOXMLChartContents').read())
                if primary.has_data():
                    return primary

            # Reached when the preferred stream is absent or empty.
            if ole.exists('Contents'):
                result = self._parse_chart_xml(ole.openstream('Contents').read())
                if result.has_data():
                    return result

            # Keep returning the preferred stream's result when it
            # existed, as before.
            return primary if primary is not None else ChartData()

        finally:
            ole.close()

    except Exception as e:
        logger.debug(f"Error extracting chart from OLE: {e}")
        return ChartData()
462
+
463
+
464
+ __all__ = ['HWPXChartExtractor']
@@ -0,0 +1,30 @@
1
# hwpx_helper/hwpx_constants.py
"""
Constants and namespace definitions for the HWPX handler.

Defines the constants and XML namespaces needed to process HWPX
(ZIP/XML-based Hangul word processor) documents.
"""

# HWPX XML namespaces
HWPX_NAMESPACES = dict(
    hp='http://www.hancom.co.kr/hwpml/2011/paragraph',
    hc='http://www.hancom.co.kr/hwpml/2011/core',
    hh='http://www.hancom.co.kr/hwpml/2011/head',
)

# OPF namespace (used when parsing content.hpf)
OPF_NAMESPACES = dict(opf='http://www.idpf.org/2007/opf/')

# Supported image extensions
SUPPORTED_IMAGE_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']

# Image extensions to skip during chart extraction: the supported ones
# plus further raster/vector image formats. Built as a new list, so it
# does not alias SUPPORTED_IMAGE_EXTENSIONS.
SKIP_IMAGE_EXTENSIONS = SUPPORTED_IMAGE_EXTENSIONS + ['.tif', '.tiff', '.wmf', '.emf']

# Candidate paths of the HWPX metadata (header) file
HEADER_FILE_PATHS = ['Contents/header.xml', 'header.xml']

# Path of the HWPX content (package) file
HPF_PATH = "Contents/content.hpf"
@@ -0,0 +1,70 @@
1
+ # xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py
2
+ """
3
+ HWPXFileConverter - HWPX file format converter
4
+
5
+ Converts binary HWPX data to ZipFile object.
6
+ """
7
+ from io import BytesIO
8
+ from typing import Any, Optional, BinaryIO
9
+ import zipfile
10
+
11
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
12
+
13
+
14
class HWPXFileConverter(BaseFileConverter):
    """
    File converter for HWPX documents.

    An HWPX file is a ZIP container; this converter turns its binary form
    into an open ``zipfile.ZipFile``.
    """

    # Signature bytes checked at the start of the data by validate()
    ZIP_MAGIC = b'PK\x03\x04'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> zipfile.ZipFile:
        """
        Open HWPX data as a ZipFile.

        Args:
            file_data: Raw binary HWPX data; used only when no stream is
                given.
            file_stream: Optional binary stream to read from instead. It
                is rewound to position 0.
            **kwargs: Additional options (unused here).

        Returns:
            zipfile.ZipFile opened for reading.
        """
        if file_stream is None:
            source = BytesIO(file_data)
        else:
            source = file_stream
        source.seek(0)
        return zipfile.ZipFile(source, 'r')

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "HWPX Document (ZIP/XML)"

    def validate(self, file_data: bytes) -> bool:
        """
        Check that the data is a readable, non-empty ZIP archive.

        Returns:
            True when the data starts with ZIP_MAGIC, opens as a ZIP and
            lists at least one member; False otherwise.
        """
        if not file_data:
            return False
        if len(file_data) < 4 or file_data[:4] != self.ZIP_MAGIC:
            return False

        # The signature alone is not enough; make sure it really opens.
        try:
            with zipfile.ZipFile(BytesIO(file_data), 'r') as archive:
                return bool(archive.namelist())
        except zipfile.BadZipFile:
            return False

    def close(self, converted_object: Any) -> None:
        """Close the converted object if it exposes a ``close`` method."""
        if converted_object is None:
            return
        if hasattr(converted_object, 'close'):
            converted_object.close()
70
+