xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,401 @@
1
+ # xgen_doc2chunk/core/processor/hwp_handler.py
2
+ """
3
+ HWP Handler - HWP 5.0 OLE Format File Processor
4
+
5
+ Class-based handler for HWP files inheriting from BaseHandler.
6
+ """
7
+ import io
8
+ import os
9
+ import zlib
10
+ import logging
11
+ import traceback
12
+ import zipfile
13
+ from typing import List, Dict, Any, Optional, Set, TYPE_CHECKING
14
+
15
+ import olefile
16
+
17
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
18
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
19
+ from xgen_doc2chunk.core.processor.hwp_helper import (
20
+ HWPTAG_PARA_HEADER,
21
+ HWPTAG_PARA_TEXT,
22
+ HWPTAG_CTRL_HEADER,
23
+ HWPTAG_SHAPE_COMPONENT_PICTURE,
24
+ HWPTAG_TABLE,
25
+ HwpRecord,
26
+ decompress_section,
27
+ parse_doc_info,
28
+ parse_table,
29
+ extract_text_from_stream_raw,
30
+ find_zlib_streams,
31
+ recover_images_from_raw,
32
+ check_file_signature,
33
+ )
34
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_chart_extractor import HWPChartExtractor
35
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_metadata import HWPMetadataExtractor
36
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_image_processor import HWPImageProcessor
37
+
38
+ if TYPE_CHECKING:
39
+ from xgen_doc2chunk.core.document_processor import CurrentFile
40
+ from xgen_doc2chunk.core.functions.chart_extractor import ChartData
41
+
42
+ logger = logging.getLogger("document-processor")
43
+
44
+
45
+ class HWPHandler(BaseHandler):
46
+ """HWP 5.0 OLE Format File Processing Handler Class"""
47
+
48
def _create_file_converter(self):
    """Return a new HWPFileConverter instance.

    The converter module is imported inside the method rather than at
    module import time.
    """
    from xgen_doc2chunk.core.processor.hwp_helper import hwp_file_converter

    converter = hwp_file_converter.HWPFileConverter()
    return converter
52
+
53
def _create_preprocessor(self):
    """Return a new HWPPreprocessor instance.

    The preprocessor module is imported inside the method rather than at
    module import time.
    """
    from xgen_doc2chunk.core.processor.hwp_helper import hwp_preprocessor

    preprocessor = hwp_preprocessor.HWPPreprocessor()
    return preprocessor
57
+
58
def _create_chart_extractor(self) -> BaseChartExtractor:
    """Return an HWPChartExtractor built around this handler's chart processor."""
    extractor = HWPChartExtractor(self._chart_processor)
    return extractor
61
+
62
def _create_metadata_extractor(self):
    """Return a new HWPMetadataExtractor instance."""
    extractor = HWPMetadataExtractor()
    return extractor
65
+
66
def _create_format_image_processor(self):
    """Return an HWPImageProcessor configured like the base image processor.

    The output directory, tag prefix/suffix and storage backend are copied
    from ``self._image_processor``.
    """
    base_processor = self._image_processor
    options = {
        "directory_path": base_processor.config.directory_path,
        "tag_prefix": base_processor.config.tag_prefix,
        "tag_suffix": base_processor.config.tag_suffix,
        "storage_backend": base_processor.storage_backend,
    }
    return HWPImageProcessor(**options)
74
+
75
def extract_text(
    self,
    current_file: "CurrentFile",
    extract_metadata: bool = True,
    **kwargs
) -> str:
    """
    Extract text from HWP file.

    Files that the file converter does not validate as OLE are delegated to
    ``_handle_non_ole_file``. For OLE files the output is assembled in this
    order: metadata (optional), body text sections, images that were not
    emitted inline, then pre-extracted charts.

    Args:
        current_file: CurrentFile dict containing file info and binary data
        extract_metadata: Whether to extract metadata
        **kwargs: Additional options (not used by this method)

    Returns:
        Extracted text, or an ``"Error processing HWP file: ..."`` string
        when any step raises.
    """
    file_data = current_file.get("file_data", b"")

    # Check if it's an OLE file using file_converter.validate()
    if not self.file_converter.validate(file_data):
        return self._handle_non_ole_file(current_file, extract_metadata)

    text_content: List[str] = []
    processed_images: Set[str] = set()

    try:
        file_stream = self.get_file_stream(current_file)

        # Charts are pulled out up front and appended after the body text.
        chart_data_list = self.chart_extractor.extract_all_from_file(file_stream)

        # Convert binary to OLE object using file_converter
        ole = self.file_converter.convert(file_data, file_stream)

        try:
            # Preprocessing runs inside the try/finally so the OLE handle
            # is still closed when preprocessing itself raises.
            preprocessed = self.preprocess(ole)
            ole = preprocessed.clean_content  # TRUE SOURCE

            if extract_metadata:
                metadata_text = self._extract_metadata(ole)
                if metadata_text:
                    text_content.append(metadata_text)
                    text_content.append("")

            bin_data_map = self._parse_docinfo(ole)
            section_texts = self._extract_body_text(ole, bin_data_map, processed_images)
            text_content.extend(section_texts)

            # processed_images is passed so the image processor can tell
            # which images were already handled during body traversal.
            image_processor = self.format_image_processor
            if hasattr(image_processor, 'process_images_from_bindata'):
                image_text = image_processor.process_images_from_bindata(
                    ole, processed_images=processed_images
                )
            else:
                image_text = ""
            if image_text:
                text_content.append("\n\n=== Extracted Images (Not Inline) ===\n")
                text_content.append(image_text)

            # Add pre-extracted charts
            for chart_data in chart_data_list:
                chart_text = self._format_chart_data(chart_data)
                if chart_text:
                    text_content.append(chart_text)
        finally:
            # Close OLE object using file_converter
            self.file_converter.close(ole)

    except Exception as e:
        # Top-level boundary: report the failure as text instead of raising,
        # but keep the traceback in the log for diagnosis.
        self.logger.error(f"Error processing HWP file: {e}", exc_info=True)
        return f"Error processing HWP file: {str(e)}"

    return "\n".join(text_content)
151
+
152
def _format_chart_data(self, chart_data: "ChartData") -> str:
    """Render one chart through the chart processor.

    Returns an empty string when ``chart_data`` is not a ChartData instance.
    Charts reporting data are rendered with type, title, categories and
    series; charts without data use the fallback renderer, which receives
    only the type and title.
    """
    from xgen_doc2chunk.core.functions.chart_extractor import ChartData

    if not isinstance(chart_data, ChartData):
        return ""

    if not chart_data.has_data():
        return self.chart_processor.format_chart_fallback(
            chart_type=chart_data.chart_type,
            title=chart_data.title
        )

    return self.chart_processor.format_chart_data(
        chart_type=chart_data.chart_type,
        title=chart_data.title,
        categories=chart_data.categories,
        series=chart_data.series
    )
171
+
172
def _handle_non_ole_file(self, current_file: "CurrentFile", extract_metadata: bool) -> str:
    """Dispatch a file that failed OLE validation.

    A ZIP signature is handed to HWPXHandler, an HWP 3.0 signature yields a
    "not supported" marker, and anything else goes through forensic recovery.
    """
    raw_bytes = current_file.get("file_data", b"")

    if raw_bytes[:4] != b'PK\x03\x04':
        # Not a ZIP container: either legacy HWP 3.0 or a damaged file.
        if b'HWP Document File' in raw_bytes[:32]:
            return "[HWP 3.0 Format - Not Supported]"
        return self._process_corrupted_hwp(current_file)

    # ZIP local-file-header signature: treat the file as HWPX.
    file_path = current_file.get("file_path", "unknown")
    self.logger.info(f"File {file_path} is a Zip file. Processing as HWPX.")
    from xgen_doc2chunk.core.processor.hwpx_handler import HWPXHandler
    delegate = HWPXHandler(config=self.config, image_processor=self.format_image_processor)
    return delegate.extract_text(current_file, extract_metadata=extract_metadata)
189
+
190
def _extract_metadata(self, ole: olefile.OleFileIO) -> str:
    """Return the formatted metadata text for the given OLE file.

    Thin wrapper around ``extract_and_format_metadata``.
    """
    formatted = self.extract_and_format_metadata(ole)
    return formatted
193
+
194
def _parse_docinfo(self, ole: olefile.OleFileIO) -> Dict:
    """Parse the DocInfo stream into a binary-data lookup map.

    Returns:
        Dict with two keys: ``'by_storage_id'`` and ``'by_index'``, holding
        the two values returned by ``parse_doc_info`` in that order.
    """
    parsed = parse_doc_info(ole)
    by_storage_id, by_index = parsed
    return dict(by_storage_id=by_storage_id, by_index=by_index)
198
+
199
def _extract_body_text(self, ole: olefile.OleFileIO, bin_data_map: Dict, processed_images: Set[str]) -> List[str]:
    """Extract text from BodyText sections.

    Streams named ``BodyText/Section<N>`` are processed in ascending ``N``.
    Each section is decompressed and parsed as a record tree; when the
    parsed text is empty, the raw-text extractor is used on the
    decompressed bytes instead.

    Args:
        ole: Open OLE container.
        bin_data_map: Map produced by ``_parse_docinfo``.
        processed_images: Set passed through to record traversal.

    Returns:
        One text entry per section that decompressed successfully.
    """
    numbered_sections = []
    for entry in ole.listdir():
        # Guard the index access: an entry with a single path component
        # named "BodyText" would otherwise raise IndexError.
        if len(entry) < 2 or entry[0] != "BodyText":
            continue
        if not entry[1].startswith("Section"):
            continue
        try:
            section_number = int(entry[1].replace("Section", ""))
        except ValueError:
            # One oddly named stream must not abort the whole document.
            self.logger.warning(f"Skipping BodyText stream with non-numeric name: {entry[1]}")
            continue
        numbered_sections.append((section_number, entry))

    # Sort on the number only; list.sort is stable, so ties keep listdir order.
    numbered_sections.sort(key=lambda pair: pair[0])

    text_content = []
    for _, section in numbered_sections:
        data = ole.openstream(section).read()

        decompressed_data, success = decompress_section(data)
        if not success:
            continue

        section_text = self._parse_section(decompressed_data, ole, bin_data_map, processed_images)

        if not section_text or not section_text.strip():
            section_text = extract_text_from_stream_raw(decompressed_data)

        text_content.append(section_text)

    return text_content
225
+
226
def _parse_section(self, data: bytes, ole=None, bin_data_map=None, processed_images=None) -> str:
    """Build the record tree for one section and render it to text.

    Any exception raised while building or traversing the tree is logged
    and turned into an empty string.
    """
    try:
        tree_root = HwpRecord.build_tree(data)
        rendered = self._traverse_tree(tree_root, ole, bin_data_map, processed_images)
    except Exception as exc:
        self.logger.error(f"Error parsing HWP section: {exc}")
        return ""
    return rendered
234
+
235
def _traverse_tree(self, record: 'HwpRecord', ole=None, bin_data_map=None, processed_images=None) -> str:
    """Render a record and its descendants to text.

    Dispatch by tag:
      * PARA_HEADER is handled entirely by ``_process_paragraph``.
      * CTRL_HEADER and SHAPE_COMPONENT_PICTURE return their handler's
        result when it is truthy; otherwise traversal continues into the
        children.
      * PARA_TEXT contributes its own text with '\\x0b' characters removed.
      * Every other record contributes only the text of its children.

    Returns:
        The concatenated text (possibly empty).
    """
    tag_id = record.tag_id

    # _process_paragraph appends the paragraph's trailing newline itself,
    # so nothing further is added for PARA_HEADER in this method.
    if tag_id == HWPTAG_PARA_HEADER:
        return self._process_paragraph(record, ole, bin_data_map, processed_images)

    if tag_id == HWPTAG_CTRL_HEADER:
        result = self._process_control(record, ole, bin_data_map, processed_images)
        if result:
            return result

    if tag_id == HWPTAG_SHAPE_COMPONENT_PICTURE:
        result = self._process_picture(record, ole, bin_data_map, processed_images)
        if result:
            return result

    parts = []

    if tag_id == HWPTAG_PARA_TEXT:
        text = record.get_text().replace('\x0b', '')
        if text:
            parts.append(text)

    for child in record.children:
        child_text = self._traverse_tree(child, ole, bin_data_map, processed_images)
        if child_text:
            parts.append(child_text)

    return "".join(parts)
266
+
267
def _process_paragraph(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> str:
    """Process PARA_HEADER record.

    The paragraph text is split on '\\x0b'. The i-th child control
    (CTRL_HEADER or TABLE) is rendered after the i-th text segment, and
    controls left over once the segments run out are appended at the end.
    Each control is rendered exactly once.

    Returns:
        The paragraph text with rendered controls interleaved, terminated
        by a newline.
    """
    text_rec = next((c for c in record.children if c.tag_id == HWPTAG_PARA_TEXT), None)
    text_content = text_rec.get_text() if text_rec else ""

    control_tags = (HWPTAG_CTRL_HEADER, HWPTAG_TABLE)
    controls = [c for c in record.children if c.tag_id in control_tags]

    # Text without any '\x0b' splits into one segment, so the "no marker"
    # case (text, then all controls) needs no separate branch.
    segments = text_content.split('\x0b')

    parts = []
    for i, segment in enumerate(segments):
        parts.append(segment)
        if i < len(controls):
            parts.append(self._traverse_tree(controls[i], ole, bin_data_map, processed_images))

    # The loop above already rendered controls[0 .. len(segments) - 1], so
    # the overflow starts at len(segments); starting one earlier would emit
    # that control twice.
    for control in controls[len(segments):]:
        parts.append(self._traverse_tree(control, ole, bin_data_map, processed_images))

    parts.append("\n")
    return "".join(parts)
292
+
293
def _process_control(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]:
    """Render a CTRL_HEADER record according to its control ID.

    The control ID is the first four payload bytes read in reverse order.
    ``b'tbl '`` goes to ``parse_table`` and ``b'gso '`` to ``_process_gso``.

    Returns:
        The handler's result, or None for payloads shorter than four bytes
        and for any other control ID.
    """
    payload = record.payload
    if len(payload) < 4:
        return None

    # Bytes 3, 2, 1, 0: the four-byte ID in reverse order.
    ctrl_id = payload[3::-1]

    if ctrl_id == b'gso ':
        return self._process_gso(record, ole, bin_data_map, processed_images)

    if ctrl_id == b'tbl ':
        return parse_table(record, self._traverse_tree, ole, bin_data_map, processed_images)

    return None
307
+
308
def _process_gso(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]:
    """Render every picture found under a GSO (Graphic Shape Object) record.

    All SHAPE_COMPONENT_PICTURE records in the subtree (including ``record``
    itself) are collected in pre-order and then passed to
    ``_process_picture``.

    Returns:
        The concatenated non-empty picture results, or None when there are
        none.
    """
    # Explicit-stack pre-order walk; children are pushed in reverse so they
    # are popped in their original order.
    picture_records = []
    pending = [record]
    while pending:
        node = pending.pop()
        if node.tag_id == HWPTAG_SHAPE_COMPONENT_PICTURE:
            picture_records.append(node)
        pending.extend(reversed(list(node.children)))

    rendered = []
    for picture in picture_records:
        image_text = self._process_picture(picture, ole, bin_data_map, processed_images)
        if image_text:
            rendered.append(image_text)

    if not rendered:
        return None
    return "".join(rendered)
329
+
330
def _process_picture(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]:
    """Resolve a SHAPE_COMPONENT_PICTURE record to its stream and save the image.

    Lookup order:
      1. The entry at the 1-based index extracted from the record payload,
         when that index is within the ``'by_index'`` list.
      2. The sole entry, when the list holds exactly one item.
    An entry is used only if its storage ID is positive and a matching
    stream is found.

    Returns:
        The result of ``extract_and_save_image`` for the first usable entry,
        or None when nothing could be resolved.
    """
    if not (bin_data_map and ole):
        return None

    entries = bin_data_map.get('by_index', [])
    if not entries:
        return None

    image_processor = self.format_image_processor
    bindata_index = image_processor.extract_bindata_index(record.payload, len(entries))

    # Candidate entries in priority order.
    candidates = []
    if bindata_index and 0 < bindata_index <= len(entries):
        candidates.append(entries[bindata_index - 1])
    if len(entries) == 1:
        candidates.append(entries[0])

    for storage_id, ext in candidates:
        if not storage_id > 0:
            continue
        target_stream = image_processor.find_bindata_stream(ole, storage_id, ext)
        if target_stream:
            return image_processor.extract_and_save_image(ole, target_stream, processed_images)

    return None
359
+
360
def _process_corrupted_hwp(self, current_file: "CurrentFile") -> str:
    """Best-effort text and image recovery from a file that is not valid OLE.

    Steps:
      1. Return a "not supported" marker for an HWP 3.0 signature.
      2. Parse every zlib stream found in the raw bytes as a section,
         falling back to raw-text extraction when parsing yields nothing.
      3. If no text was recovered, run raw-text extraction on the whole file
         and keep the result only when it is longer than 100 characters.
      4. Append any images recovered from the raw bytes.

    Returns:
        The recovered text, ``"[Forensic Recovery: No text found]"`` when
        nothing was recovered, or a ``"Forensic recovery failed: ..."``
        string when an exception is raised.
    """
    file_path = current_file.get("file_path", "unknown")
    raw_data = current_file.get("file_data", b"")

    self.logger.info(f"Starting forensic recovery for: {file_path}")
    recovered = []

    try:
        if check_file_signature(raw_data) == "HWP3.0":
            return "[HWP 3.0 Format - Not Supported]"

        for _offset, decompressed in find_zlib_streams(raw_data, min_size=50):
            candidate = self._parse_section(decompressed)
            if not (candidate and candidate.strip()):
                candidate = extract_text_from_stream_raw(decompressed)
            if candidate and candidate.strip():
                recovered.append(candidate)

        if not recovered:
            # Last resort: scan the undecompressed bytes.
            plain_text = extract_text_from_stream_raw(raw_data)
            if plain_text and len(plain_text) > 100:
                recovered.append(plain_text)

        image_text = recover_images_from_raw(raw_data, image_processor=self.format_image_processor)
        if image_text:
            recovered.append(f"\n\n=== Recovered Images ===\n{image_text}")

    except Exception as e:
        self.logger.error(f"Forensic recovery failed: {e}")
        return f"Forensic recovery failed: {str(e)}"

    if recovered:
        return "\n".join(recovered)
    return "[Forensic Recovery: No text found]"
401
+
@@ -0,0 +1,120 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/__init__.py
2
+ """
3
+ HWP/HWPX common helper module
4
+
5
+ Provides the utility modules needed to process HWP 5.0 OLE files.
6
+
7
+ File layout:
8
+ - hwp_constants.py: constant definitions (tag IDs, chart types, etc.)
9
+ - hwp_record.py: HWP record parsing class
10
+ - hwp_decoder.py: compression/encoding utilities
11
+ - hwp_metadata.py: metadata extraction
12
+ - hwp_image_processor.py: image processing
13
+ - hwp_chart_extractor.py: chart processing
14
+ - hwp_docinfo.py: DocInfo parsing
15
+ - hwp_table.py: table parsing
16
+ - hwp_recovery.py: corrupted file recovery
17
+ """
18
+
19
+ # Constants
20
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import (
21
+ HWPTAG_BEGIN,
22
+ HWPTAG_BIN_DATA,
23
+ HWPTAG_PARA_HEADER,
24
+ HWPTAG_PARA_TEXT,
25
+ HWPTAG_CTRL_HEADER,
26
+ HWPTAG_LIST_HEADER,
27
+ HWPTAG_SHAPE_COMPONENT,
28
+ HWPTAG_SHAPE_COMPONENT_PICTURE,
29
+ HWPTAG_TABLE,
30
+ HWPTAG_SHAPE_COMPONENT_OLE,
31
+ HWPTAG_CHART_DATA,
32
+ CHART_TYPES,
33
+ CTRL_CHAR_DRAWING_TABLE_OBJECT,
34
+ )
35
+
36
+ # Record Parser
37
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_record import HwpRecord
38
+
39
+ # Decoder
40
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_decoder import (
41
+ is_compressed,
42
+ decompress_stream,
43
+ decompress_section,
44
+ )
45
+
46
+ # Metadata
47
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_metadata import (
48
+ HWPMetadataExtractor,
49
+ parse_hwp_summary_information,
50
+ )
51
+
52
+ # Image Processor (replaces hwp_image.py utility functions)
53
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_image_processor import HWPImageProcessor
54
+
55
+ # Chart Extractor
56
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_chart_extractor import HWPChartExtractor
57
+
58
+ # DocInfo
59
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_docinfo import (
60
+ parse_doc_info,
61
+ scan_bindata_folder,
62
+ )
63
+
64
+ # Table
65
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_table import (
66
+ parse_table,
67
+ build_table_grid,
68
+ render_table_html,
69
+ )
70
+
71
+ # Recovery
72
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_recovery import (
73
+ extract_text_from_stream_raw,
74
+ find_zlib_streams,
75
+ recover_images_from_raw,
76
+ check_file_signature,
77
+ )
78
+
79
+
80
+ __all__ = [
81
+ # Constants
82
+ 'HWPTAG_BEGIN',
83
+ 'HWPTAG_BIN_DATA',
84
+ 'HWPTAG_PARA_HEADER',
85
+ 'HWPTAG_PARA_TEXT',
86
+ 'HWPTAG_CTRL_HEADER',
87
+ 'HWPTAG_LIST_HEADER',
88
+ 'HWPTAG_SHAPE_COMPONENT',
89
+ 'HWPTAG_SHAPE_COMPONENT_PICTURE',
90
+ 'HWPTAG_TABLE',
91
+ 'HWPTAG_SHAPE_COMPONENT_OLE',
92
+ 'HWPTAG_CHART_DATA',
93
+ 'CHART_TYPES',
94
+ 'CTRL_CHAR_DRAWING_TABLE_OBJECT',
95
+ # Record
96
+ 'HwpRecord',
97
+ # Decoder
98
+ 'is_compressed',
99
+ 'decompress_stream',
100
+ 'decompress_section',
101
+ # Metadata
102
+ 'HWPMetadataExtractor',
103
+ 'parse_hwp_summary_information',
104
+ # Image Processor
105
+ 'HWPImageProcessor',
106
+ # Chart Extractor
107
+ 'HWPChartExtractor',
108
+ # DocInfo
109
+ 'parse_doc_info',
110
+ 'scan_bindata_folder',
111
+ # Table
112
+ 'parse_table',
113
+ 'build_table_grid',
114
+ 'render_table_html',
115
+ # Recovery
116
+ 'extract_text_from_stream_raw',
117
+ 'find_zlib_streams',
118
+ 'recover_images_from_raw',
119
+ 'check_file_signature',
120
+ ]