xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,373 @@
1
+ """
2
+ HWP Chart Extractor
3
+
4
+ Extracts chart data from HWP files.
5
+ Supports both OOXML charts (Hancom Hangul 2018+) and legacy HWP charts.
6
+
7
+ Provides:
8
+ - extract(): Single chart extraction from OLE bytes
9
+ - extract_all_from_file(): Extract all charts from HWP file
10
+ """
11
+ import io
12
+ import logging
13
+ import os
14
+ import struct
15
+ import xml.etree.ElementTree as ET
16
+ import zlib
17
+ from typing import Any, BinaryIO, Dict, List, Optional, Union
18
+
19
+ import olefile
20
+
21
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
22
+
23
# Logger registered under the "document-processor" name.
logger = logging.getLogger("document-processor")

# Prefix -> URI table handed to ElementTree for the DrawingML queries
# ("c" = chart markup, "a" = main DrawingML markup).
OOXML_NS = dict(
    c='http://schemas.openxmlformats.org/drawingml/2006/chart',
    a='http://schemas.openxmlformats.org/drawingml/2006/main',
)

# OOXML chart element name -> human-readable chart type label.
# Order is significant: the extractor walks this mapping and stops at the
# first element it finds inside the plot area.
CHART_TYPE_MAP = dict([
    ('barChart', 'Bar Chart'),
    ('bar3DChart', '3D Bar Chart'),
    ('lineChart', 'Line Chart'),
    ('line3DChart', '3D Line Chart'),
    ('pieChart', 'Pie Chart'),
    ('pie3DChart', '3D Pie Chart'),
    ('doughnutChart', 'Doughnut Chart'),
    ('areaChart', 'Area Chart'),
    ('area3DChart', '3D Area Chart'),
    ('scatterChart', 'Scatter Chart'),
    ('bubbleChart', 'Bubble Chart'),
    ('radarChart', 'Radar Chart'),
])

# 8-byte signature that opens every OLE compound file.
OLE_MAGIC = bytes.fromhex('d0cf11e0a1b11ae1')

# BinData streams with these extensions are plain images, never charts.
SKIP_IMAGE_EXTENSIONS = set((
    '.jpg', '.jpeg', '.png', '.bmp', '.gif',
    '.tif', '.tiff',
    '.wmf', '.emf',
))
52
+
53
+
54
+ class HWPChartExtractor(BaseChartExtractor):
55
+ """
56
+ Chart extractor for HWP files.
57
+
58
+ HWP stores charts as OLE objects in BinData streams.
59
+ Supports:
60
+ - OOXML chart format (Hancom Hangul 2018+) via 'OOXMLChartContents' stream
61
+ - Legacy HWP chart format via 'Contents' stream
62
+ """
63
+
64
+ # ========================================================================
65
+ # Main Interface
66
+ # ========================================================================
67
+
68
def extract(self, chart_element: Any) -> ChartData:
    """
    Turn one raw BinData payload into a ChartData.

    Args:
        chart_element: Raw bytes that contain an OLE compound file,
            optionally preceded by a few header bytes.

    Returns:
        Whatever ``_extract_from_ole`` produces for the located OLE
        payload, or an empty ``ChartData()`` when the input is falsy, is
        not a ``bytes`` object, or holds no OLE signature near its start.
    """
    if chart_element and isinstance(chart_element, bytes):
        payload = self._prepare_ole_data(chart_element)
        if payload:
            return self._extract_from_ole(payload)

    # Every rejected input funnels into the same empty result.
    return ChartData()
86
+
87
def extract_all_from_file(
    self,
    file_source: Union[str, os.PathLike, bytes, bytearray, BinaryIO]
) -> List[ChartData]:
    """
    Extract all charts from an HWP file.

    Args:
        file_source: A filesystem path (``str`` or ``os.PathLike`` such as
            ``pathlib.Path``), the file content as a bytes-like object, or
            a seekable binary file-like object.

    Returns:
        List of ChartData for every non-image BinData stream that yielded
        chart data. The list is empty when the source is not an OLE file,
        and holds whatever was collected so far when an error interrupts
        processing (the error is logged, not raised).
    """
    charts: List[ChartData] = []

    try:
        # Normalise the source to a seekable binary stream. Path objects
        # used to fall through to the file-like branch and fail on
        # ``.seek``; they are now opened like string paths.
        if isinstance(file_source, (str, os.PathLike)):
            with open(file_source, 'rb') as f:
                file_obj = io.BytesIO(f.read())
        elif isinstance(file_source, (bytes, bytearray, memoryview)):
            file_obj = io.BytesIO(file_source)
        else:
            file_obj = file_source

        # Check the OLE signature, then rewind for olefile.
        file_obj.seek(0)
        header = file_obj.read(8)
        file_obj.seek(0)

        if header != OLE_MAGIC:
            logger.debug("Not a valid HWP OLE file")
            return charts

        ole = olefile.OleFileIO(file_obj)

        try:
            for stream_path in ole.listdir():
                # Only entries below the "BinData" storage can hold charts.
                if len(stream_path) < 2 or stream_path[0] != "BinData":
                    continue

                # Streams with an image extension are pictures, not charts.
                ext = os.path.splitext(stream_path[-1])[1].lower()
                if ext in SKIP_IMAGE_EXTENSIONS:
                    continue

                chart_data = self._process_chart_stream(ole, stream_path)
                if chart_data.has_data():
                    charts.append(chart_data)
                    logger.debug(f"Extracted chart from: {'/'.join(stream_path)}")

        finally:
            ole.close()

        logger.info(f"Extracted {len(charts)} charts from HWP file")

    except Exception as e:
        # Best-effort API: report the failure and return what we have.
        logger.error(f"Error extracting charts from HWP: {e}")

    return charts
153
+
154
def _process_chart_stream(self, ole, stream_path: List[str]) -> ChartData:
    """
    Process a single BinData stream for chart data.

    Args:
        ole: Open OLE container; only ``openstream`` is used.
        stream_path: Path components of the stream inside the container.

    Returns:
        The result of ``extract`` on the (possibly decompressed) stream
        bytes, or an empty ``ChartData()`` if reading or extraction fails.
    """
    try:
        ole_data = ole.openstream(stream_path).read()

        # Try raw deflate (no header) first, then a zlib-wrapped stream.
        # Only zlib.error is treated as "not this format"; if neither
        # attempt succeeds the bytes are used as stored.
        for wbits in (-zlib.MAX_WBITS, zlib.MAX_WBITS):
            try:
                ole_data = zlib.decompress(ole_data, wbits)
                break
            except zlib.error:
                continue

        return self.extract(ole_data)

    except Exception as e:
        logger.debug(f"Error processing chart stream: {e}")
        return ChartData()
174
+
175
+ def _prepare_ole_data(self, raw_data: bytes) -> Optional[bytes]:
176
+ """Prepare OLE data by finding and extracting OLE compound file."""
177
+ if len(raw_data) < 12:
178
+ return None
179
+
180
+ OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
181
+
182
+ # Find OLE magic
183
+ offset = 0
184
+ if raw_data[:8] == OLE_MAGIC:
185
+ offset = 0
186
+ elif raw_data[4:12] == OLE_MAGIC:
187
+ offset = 4 # HWP often has 4-byte header
188
+ else:
189
+ for i in range(16):
190
+ if raw_data[i:i+8] == OLE_MAGIC:
191
+ offset = i
192
+ break
193
+ else:
194
+ return None
195
+
196
+ return raw_data[offset:]
197
+
198
def _extract_from_ole(self, ole_data: bytes) -> ChartData:
    """
    Read chart data out of an embedded OLE compound file.

    The 'OOXMLChartContents' stream (Hancom Hangul 2018+) is preferred;
    the legacy 'Contents' stream is used only when the former is absent.

    Args:
        ole_data: Bytes of the OLE compound file.

    Returns:
        The parsed ChartData, or an empty ``ChartData()`` when neither
        stream exists or any error occurs (errors are logged at debug level).
    """
    try:
        container = olefile.OleFileIO(io.BytesIO(ole_data))

        try:
            # (stream name, parser) pairs in order of preference.
            candidates = (
                ('OOXMLChartContents', self._parse_ooxml_chart),
                ('Contents', self._parse_legacy_chart),
            )
            for stream_name, parse in candidates:
                if container.exists(stream_name):
                    payload = container.openstream(stream_name).read()
                    return parse(payload)

            return ChartData()

        finally:
            container.close()

    except Exception as e:
        logger.debug(f"Error extracting chart from OLE: {e}")
        return ChartData()
225
+
226
def _parse_ooxml_chart(self, ooxml_data: bytes) -> ChartData:
    """
    Parse an OOXML chart document into a ChartData.

    Args:
        ooxml_data: XML bytes read from the 'OOXMLChartContents' stream.

    Returns:
        ChartData carrying chart type, title, categories and series, or
        an empty ``ChartData()`` when the XML has no chart element or
        parsing fails (failures are logged at debug level).
    """
    try:
        document = ET.fromstring(ooxml_data)

        # Look the chart element up by prefix first, then by its fully
        # qualified tag name.
        lookups = (
            ('.//c:chart', OOXML_NS),
            ('.//{http://schemas.openxmlformats.org/drawingml/2006/chart}chart', None),
        )
        chart_node = None
        for path, namespaces in lookups:
            chart_node = document.find(path, namespaces)
            if chart_node is not None:
                break

        if chart_node is None:
            return ChartData()

        heading = self._extract_ooxml_title(chart_node)
        kind, labels, data_series = self._extract_ooxml_plot_data(chart_node)

        return ChartData(
            chart_type=kind,
            title=heading,
            categories=labels,
            series=data_series
        )

    except Exception as e:
        logger.debug(f"Error parsing OOXML chart: {e}")
        return ChartData()
254
+
255
def _extract_ooxml_title(self, chart_elem) -> Optional[str]:
    """
    Extract the chart title from an OOXML ``c:chart`` element.

    Only the ``c:title`` that is a direct child of ``chart_elem`` is
    considered, so axis titles nested in the plot area are never taken
    for the chart title. All text runs of the title are combined: runs
    within a paragraph are concatenated and paragraphs are joined with a
    single space. When the title has no rich text, the cached string of a
    ``c:strRef`` title is used instead.

    Args:
        chart_elem: The ``c:chart`` element.

    Returns:
        The title text, or ``None`` when the chart has no title text.
    """
    title_elem = chart_elem.find('c:title', OOXML_NS)
    if title_elem is None:
        return None

    # Rich-text title: one entry per paragraph, runs concatenated.
    parts = []
    for para in title_elem.iterfind('c:tx/c:rich/a:p', OOXML_NS):
        text = ''.join(t.text for t in para.iterfind('.//a:t', OOXML_NS) if t.text)
        text = text.strip()
        if text:
            parts.append(text)

    # Title linked to a cell: fall back to the cached string value(s).
    if not parts:
        for v in title_elem.iterfind('c:tx/c:strRef/c:strCache/c:pt/c:v', OOXML_NS):
            if v.text and v.text.strip():
                parts.append(v.text.strip())

    return ' '.join(parts) or None
261
+
262
def _extract_ooxml_plot_data(self, chart_elem) -> tuple:
    """
    Determine the chart type and pull categories/series from the plot area.

    Chart types are probed in ``CHART_TYPE_MAP`` order and only the first
    one present in the plot area is read.

    Args:
        chart_elem: Chart element to search for ``c:plotArea``.

    Returns:
        ``(chart_type, categories, series)``; ``("Chart", [], [])`` when
        there is no plot area or no known chart type inside it.
    """
    plot_area = chart_elem.find('.//c:plotArea', OOXML_NS)

    if plot_area is not None:
        for tag, label in CHART_TYPE_MAP.items():
            node = plot_area.find(f'.//c:{tag}', OOXML_NS)
            if node is None:
                continue
            labels, data_series = self._extract_ooxml_series(node)
            return label, labels, data_series

    # Generic fallback when nothing recognisable was found.
    return "Chart", [], []
280
+
281
def _extract_ooxml_series(self, chart_type_elem) -> tuple:
    """
    Extract series data from an OOXML chart type element.

    Category-based charts keep their data in ``c:cat`` / ``c:val``;
    scatter and bubble charts use ``c:xVal`` / ``c:yVal`` instead, so those
    are used as a fallback (bubble sizes are not read).

    Args:
        chart_type_elem: A chart type element such as ``c:barChart``.

    Returns:
        ``(categories, series)`` where ``categories`` comes from the first
        series that has a category (or x-value) element and ``series`` is a
        list of ``{'name': str, 'values': list}`` dicts. Series without
        any values are omitted.
    """
    categories = []
    series = []
    categories_extracted = False

    for idx, ser_elem in enumerate(chart_type_elem.findall('.//c:ser', OOXML_NS)):
        # Series name, defaulting to a 1-based placeholder.
        name = f"Series {idx + 1}"
        tx_elem = ser_elem.find('.//c:tx//c:v', OOXML_NS)
        if tx_elem is not None and tx_elem.text:
            name = tx_elem.text.strip()

        # Categories are shared, so they are read once from the first
        # series that provides them.
        if not categories_extracted:
            cat_elem = ser_elem.find('.//c:cat', OOXML_NS)
            if cat_elem is None:
                cat_elem = ser_elem.find('.//c:xVal', OOXML_NS)
            if cat_elem is not None:
                categories = self._extract_ooxml_string_cache(cat_elem)
                categories_extracted = True

        # Values: c:val for category charts, c:yVal for scatter/bubble.
        val_elem = ser_elem.find('.//c:val', OOXML_NS)
        if val_elem is None:
            val_elem = ser_elem.find('.//c:yVal', OOXML_NS)
        values = self._extract_ooxml_num_cache(val_elem) if val_elem is not None else []

        if values:
            series.append({'name': name, 'values': values})

    return categories, series
314
+
315
def _extract_ooxml_string_cache(self, cat_elem) -> List[str]:
    """
    Extract category labels from a category data source element.

    The string cache (``c:strCache``) is preferred, exactly as before.
    When it is absent the labels are taken from, in order, string
    literals (``c:strLit``), the numeric cache (``c:numCache``, e.g. years
    used as categories) or numeric literals (``c:numLit``). Numeric labels
    are returned as their stored text, not converted to numbers.

    Args:
        cat_elem: Element such as ``c:cat`` holding the data source.

    Returns:
        Non-empty label strings ordered by their ``idx`` attribute; an
        empty list when no supported source is present.
    """
    cache = None
    for tag in ('strCache', 'strLit', 'numCache', 'numLit'):
        cache = cat_elem.find(f'.//c:{tag}', OOXML_NS)
        if cache is not None:
            break
    if cache is None:
        return []

    values = []
    pts = cache.findall('.//c:pt', OOXML_NS)
    for pt in sorted(pts, key=lambda x: int(x.get('idx', 0))):
        v = pt.find('c:v', OOXML_NS)
        if v is not None and v.text:
            values.append(v.text.strip())
    return values
326
+
327
def _extract_ooxml_num_cache(self, val_elem) -> List[Any]:
    """
    Read the cached numeric points of a value element.

    Args:
        val_elem: Element that may contain a ``c:numCache``.

    Returns:
        Point values ordered by their ``idx`` attribute. Each value is a
        ``float`` when its text parses as one, otherwise the raw text.
        Points without text are skipped; no cache gives an empty list.
    """
    cache = val_elem.find('.//c:numCache', OOXML_NS)
    if cache is None:
        return []

    def point_index(point):
        # A missing idx attribute sorts as position 0.
        return int(point.get('idx', 0))

    numbers = []
    for point in sorted(cache.findall('.//c:pt', OOXML_NS), key=point_index):
        cell = point.find('c:v', OOXML_NS)
        raw = None if cell is None else cell.text
        if not raw:
            continue
        try:
            parsed = float(raw)
        except ValueError:
            # Keep non-numeric text as it is.
            parsed = raw
        numbers.append(parsed)
    return numbers
341
+
342
def _parse_legacy_chart(self, contents_data: bytes) -> ChartData:
    """
    Parse the legacy HWP chart format (best effort).

    The legacy 'Contents' stream is record based and is not decoded here.
    The bytes are only read as UTF-16LE text and the first non-blank line
    (at most 50 characters) is used as a tentative title. Categories and
    series are always empty.

    Args:
        contents_data: Bytes of the legacy 'Contents' stream.

    Returns:
        ChartData with ``chart_type="Chart"`` and the tentative title
        (``None`` if no text was found or decoding failed), or an empty
        ``ChartData()`` on unexpected errors.
    """
    try:
        title = None

        try:
            text = contents_data.decode('utf-16le', errors='ignore')
            # Stop at the first line that has content; it might be the title.
            for line in text.split('\n'):
                line = line.strip()
                if line:
                    title = line[:50]
                    break
        except Exception as e:
            # Title detection is optional; keep going without one, but
            # no longer swallow KeyboardInterrupt/SystemExit.
            logger.debug(f"Could not scan legacy chart text: {e}")

        return ChartData(
            chart_type="Chart",
            title=title,
            categories=[],
            series=[]
        )

    except Exception as e:
        logger.debug(f"Error parsing legacy chart: {e}")
        return ChartData()
371
+
372
+
373
+ __all__ = ['HWPChartExtractor']  # names exported by a star-import of this module
@@ -0,0 +1,78 @@
1
# xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py
"""
Shared constants for the HWP / HWPX processors.

Defines record tag IDs of the HWP 5.0 OLE format, chart type codes and
related values.
"""

# ==========================================================================
# HWP 5.0 Tag Constants
# ==========================================================================

# Base value; every tag ID below is expressed as an offset from it.
HWPTAG_BEGIN = 0x10

# DocInfo
HWPTAG_BIN_DATA = HWPTAG_BEGIN + 2  # 18 - Binary data info in DocInfo

# Section / paragraph
HWPTAG_PARA_HEADER = HWPTAG_BEGIN + 50  # 66 - Paragraph header
HWPTAG_PARA_TEXT = HWPTAG_BEGIN + 51  # 67 - Paragraph text

# Control / shape
HWPTAG_CTRL_HEADER = HWPTAG_BEGIN + 55  # 71 - Control header
HWPTAG_LIST_HEADER = HWPTAG_BEGIN + 56  # 72 - List header (table cells)
HWPTAG_SHAPE_COMPONENT = HWPTAG_BEGIN + 60  # 76 - Shape component (container)
HWPTAG_TABLE = HWPTAG_BEGIN + 61  # 77 - Table properties
HWPTAG_SHAPE_COMPONENT_OLE = HWPTAG_BEGIN + 63  # 79 - OLE object (charts are OLE)
HWPTAG_SHAPE_COMPONENT_PICTURE = HWPTAG_BEGIN + 69  # 85 - Picture shape

# Chart
HWPTAG_CHART_DATA = HWPTAG_BEGIN + 118  # 134 - Chart data


# ==========================================================================
# Chart Type Constants
# ==========================================================================

# Chart type codes defined in the HWP Chart specification. The code is the
# position of the label in the tuple (0..26). Labels are Korean display
# names; the trailing comments are literal English glosses.
CHART_TYPES = dict(enumerate((
    '3D 막대', '2D 막대', '3D 선', '2D 선',                # bar, bar, line, line
    '3D 영역', '2D 영역', '3D 계단', '2D 계단',            # area, area, step, step
    '3D 조합', '2D 조합', '3D 가로 막대', '2D 가로 막대',  # combination x2, horizontal bar x2
    '3D 클러스터 막대', '3D 파이', '2D 파이', '2D 도넛',   # clustered bar, pie, pie, doughnut
    '2D XY', '2D 원추', '2D 방사', '2D 풍선',              # XY, cone, radial, balloon
    '2D Hi-Lo', '2D 간트', '3D 간트', '3D 평면',           # Hi-Lo, Gantt, Gantt, plane
    '2D 등고선', '3D 산포', '3D XYZ',                      # contour, scatter, XYZ
)))


# ==========================================================================
# Control Character Codes
# ==========================================================================

# Control character codes used inside PARA_TEXT records.
CTRL_CHAR_DRAWING_TABLE_OBJECT = 0x0B  # Extended control for GSO (images, tables, etc.)


# ==========================================================================
# Export List
# ==========================================================================

__all__ = [
    # Tag IDs
    'HWPTAG_BEGIN',
    'HWPTAG_BIN_DATA',
    'HWPTAG_PARA_HEADER',
    'HWPTAG_PARA_TEXT',
    'HWPTAG_CTRL_HEADER',
    'HWPTAG_LIST_HEADER',
    'HWPTAG_SHAPE_COMPONENT',
    'HWPTAG_SHAPE_COMPONENT_PICTURE',
    'HWPTAG_TABLE',
    'HWPTAG_SHAPE_COMPONENT_OLE',
    'HWPTAG_CHART_DATA',
    # Chart types
    'CHART_TYPES',
    # Control chars
    'CTRL_CHAR_DRAWING_TABLE_OBJECT',
]
@@ -0,0 +1,106 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py
2
+ """
3
+ HWP compression/encoding utilities
4
+
5
+ Provides stream decompression and related utilities for HWP 5.0 OLE files.
6
+ - is_compressed: read the FileHeader to check whether streams are compressed
7
+ - decompress_stream: zlib Deflate decompression
8
+ """
9
+ import zlib
10
+ import struct
11
+ import logging
12
+ from typing import Tuple
13
+
14
+ import olefile
15
+
16
+ logger = logging.getLogger("document-processor")
17
+
18
+
19
def is_compressed(ole: olefile.OleFileIO) -> bool:
    """
    Report whether the streams of an HWP file are compressed.

    The flag field at bytes 36-40 of the "FileHeader" stream is read as a
    little-endian unsigned 32-bit integer and its compression bit (0x01)
    is tested.

    Args:
        ole: Open OLE file object.

    Returns:
        The state of the compression bit. ``True`` is returned as the
        default (most HWP files are compressed) when the stream is
        missing, shorter than 40 bytes, or cannot be read.
    """
    try:
        if not ole.exists("FileHeader"):
            return True

        raw = ole.openstream("FileHeader").read()
        if len(raw) < 40:
            return True

        (flag_bits,) = struct.unpack_from('<I', raw, 36)
        return (flag_bits & 0x01) != 0
    except Exception as e:
        logger.debug(f"Failed to read FileHeader: {e}")

    # Default: compressed (most HWP files).
    return True
42
+
43
+
44
def decompress_stream(data: bytes, is_compressed_flag: bool = True) -> bytes:
    """
    Decompress stream data when the compression flag asks for it.

    HWP uses the zlib Deflate algorithm; raw deflate (wbits -15, no
    header) is attempted first, then a standard zlib stream.

    Args:
        data: Binary stream data.
        is_compressed_flag: Whether the stream is expected to be compressed.

    Returns:
        The decompressed data, or the input unchanged when the flag is
        false or neither format can be decoded.
    """
    if is_compressed_flag:
        # Headerless deflate first, zlib-wrapped second.
        for window_bits in (-zlib.MAX_WBITS, zlib.MAX_WBITS):
            try:
                return zlib.decompress(data, window_bits)
            except zlib.error:
                continue

    return data
73
+
74
+
75
def decompress_section(data: bytes) -> Tuple[bytes, bool]:
    """
    Decompress the data of a BodyText section.

    Raw deflate (wbits -15) is attempted first, then a standard zlib
    stream.

    Args:
        data: Binary section data.

    Returns:
        ``(decompressed data, True)`` on success, or ``(data, False)``
        with the input unchanged when neither format can be decoded.
    """
    for window_bits in (-zlib.MAX_WBITS, zlib.MAX_WBITS):
        try:
            inflated = zlib.decompress(data, window_bits)
        except zlib.error:
            continue
        return inflated, True

    return data, False
100
+
101
+
102
+ __all__ = [  # names exported by a star-import of this module
103
+ 'is_compressed',
104
+ 'decompress_stream',
105
+ 'decompress_section',
106
+ ]