xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwp_handler.py
|
|
2
|
+
"""
|
|
3
|
+
HWP Handler - HWP 5.0 OLE Format File Processor
|
|
4
|
+
|
|
5
|
+
Class-based handler for HWP files inheriting from BaseHandler.
|
|
6
|
+
"""
|
|
7
|
+
import io
|
|
8
|
+
import os
|
|
9
|
+
import zlib
|
|
10
|
+
import logging
|
|
11
|
+
import traceback
|
|
12
|
+
import zipfile
|
|
13
|
+
from typing import List, Dict, Any, Optional, Set, TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
import olefile
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.processor.base_handler import BaseHandler
|
|
18
|
+
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
|
|
19
|
+
from xgen_doc2chunk.core.processor.hwp_helper import (
|
|
20
|
+
HWPTAG_PARA_HEADER,
|
|
21
|
+
HWPTAG_PARA_TEXT,
|
|
22
|
+
HWPTAG_CTRL_HEADER,
|
|
23
|
+
HWPTAG_SHAPE_COMPONENT_PICTURE,
|
|
24
|
+
HWPTAG_TABLE,
|
|
25
|
+
HwpRecord,
|
|
26
|
+
decompress_section,
|
|
27
|
+
parse_doc_info,
|
|
28
|
+
parse_table,
|
|
29
|
+
extract_text_from_stream_raw,
|
|
30
|
+
find_zlib_streams,
|
|
31
|
+
recover_images_from_raw,
|
|
32
|
+
check_file_signature,
|
|
33
|
+
)
|
|
34
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_chart_extractor import HWPChartExtractor
|
|
35
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_metadata import HWPMetadataExtractor
|
|
36
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_image_processor import HWPImageProcessor
|
|
37
|
+
|
|
38
|
+
if TYPE_CHECKING:
|
|
39
|
+
from xgen_doc2chunk.core.document_processor import CurrentFile
|
|
40
|
+
from xgen_doc2chunk.core.functions.chart_extractor import ChartData
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger("document-processor")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class HWPHandler(BaseHandler):
|
|
46
|
+
"""HWP 5.0 OLE Format File Processing Handler Class"""
|
|
47
|
+
|
|
48
|
+
def _create_file_converter(self):
    """Return a new HWPFileConverter instance.

    The converter class is imported inside the method rather than at
    module level.
    """
    from xgen_doc2chunk.core.processor.hwp_helper.hwp_file_converter import (
        HWPFileConverter as converter_cls,
    )

    return converter_cls()
|
|
52
|
+
|
|
53
|
+
def _create_preprocessor(self):
    """Return a new HWPPreprocessor instance.

    The preprocessor class is imported inside the method rather than at
    module level.
    """
    from xgen_doc2chunk.core.processor.hwp_helper.hwp_preprocessor import (
        HWPPreprocessor as preprocessor_cls,
    )

    return preprocessor_cls()
|
|
57
|
+
|
|
58
|
+
def _create_chart_extractor(self) -> BaseChartExtractor:
    """Return an HWPChartExtractor built around this handler's chart processor."""
    processor = self._chart_processor
    return HWPChartExtractor(processor)
|
|
61
|
+
|
|
62
|
+
def _create_metadata_extractor(self):
    """Return a new HWPMetadataExtractor instance."""
    extractor = HWPMetadataExtractor()
    return extractor
|
|
65
|
+
|
|
66
|
+
def _create_format_image_processor(self):
    """Return an HWPImageProcessor configured like the generic image processor.

    Directory path, tag prefix and tag suffix are copied from
    ``self._image_processor.config``; the storage backend is shared with
    ``self._image_processor``.
    """
    base = self._image_processor
    options = {
        "directory_path": base.config.directory_path,
        "tag_prefix": base.config.tag_prefix,
        "tag_suffix": base.config.tag_suffix,
        "storage_backend": base.storage_backend,
    }
    return HWPImageProcessor(**options)
|
|
74
|
+
|
|
75
|
+
def extract_text(
    self,
    current_file: "CurrentFile",
    extract_metadata: bool = True,
    **kwargs
) -> str:
    """
    Extract text from HWP file.

    Data rejected by ``file_converter.validate()`` is delegated to
    ``_handle_non_ole_file()``. Otherwise the result is assembled in this
    order: metadata (optional), BodyText sections, images that were not
    already emitted inline, and finally the pre-extracted charts.

    Args:
        current_file: CurrentFile dict containing file info and binary data
        extract_metadata: Whether to extract metadata
        **kwargs: Additional options (not read by this method)

    Returns:
        Extracted text joined with newlines. If any step raises, the error
        is logged and the string ``"Error processing HWP file: <message>"``
        is returned instead of propagating the exception.
    """
    file_data = current_file.get("file_data", b"")

    # Check if it's an OLE file using file_converter.validate()
    if not self.file_converter.validate(file_data):
        return self._handle_non_ole_file(current_file, extract_metadata)

    text_content = []
    processed_images: Set[str] = set()

    try:
        # Step 1: Open OLE file using file_converter
        file_stream = self.get_file_stream(current_file)

        # Pre-extract all charts using ChartExtractor
        chart_data_list = self.chart_extractor.extract_all_from_file(file_stream)

        # Convert binary to OLE object using file_converter
        ole = self.file_converter.convert(file_data, file_stream)

        # Everything after a successful convert() runs under try/finally so
        # the OLE object is closed even when preprocessing itself fails.
        try:
            # Step 2: Preprocess - may transform ole in the future
            preprocessed = self.preprocess(ole)
            ole = preprocessed.clean_content  # TRUE SOURCE

            if extract_metadata:
                metadata_text = self._extract_metadata(ole)
                if metadata_text:
                    text_content.append(metadata_text)
                    text_content.append("")

            bin_data_map = self._parse_docinfo(ole)
            section_texts = self._extract_body_text(ole, bin_data_map, processed_images)
            text_content.extend(section_texts)

            # Use format_image_processor directly
            image_processor = self.format_image_processor
            if hasattr(image_processor, 'process_images_from_bindata'):
                image_text = image_processor.process_images_from_bindata(ole, processed_images=processed_images)
            else:
                image_text = ""
            if image_text:
                text_content.append("\n\n=== Extracted Images (Not Inline) ===\n")
                text_content.append(image_text)

            # Add pre-extracted charts
            for chart_data in chart_data_list:
                chart_text = self._format_chart_data(chart_data)
                if chart_text:
                    text_content.append(chart_text)
        finally:
            # Close OLE object using file_converter
            self.file_converter.close(ole)

    except Exception as e:
        self.logger.error(f"Error processing HWP file: {e}")
        return f"Error processing HWP file: {str(e)}"

    return "\n".join(text_content)
|
|
151
|
+
|
|
152
|
+
def _format_chart_data(self, chart_data: "ChartData") -> str:
    """Render one pre-extracted chart as text through the chart processor.

    Returns an empty string for anything that is not a ChartData instance.
    Charts reporting ``has_data()`` are rendered with
    ``format_chart_data``; all others use ``format_chart_fallback`` with
    only the chart type and title.
    """
    from xgen_doc2chunk.core.functions.chart_extractor import ChartData

    if not isinstance(chart_data, ChartData):
        return ""

    if not chart_data.has_data():
        return self.chart_processor.format_chart_fallback(
            chart_type=chart_data.chart_type,
            title=chart_data.title,
        )

    return self.chart_processor.format_chart_data(
        chart_type=chart_data.chart_type,
        title=chart_data.title,
        categories=chart_data.categories,
        series=chart_data.series,
    )
|
|
171
|
+
|
|
172
|
+
def _handle_non_ole_file(self, current_file: "CurrentFile", extract_metadata: bool) -> str:
|
|
173
|
+
"""Handle non-OLE file."""
|
|
174
|
+
file_path = current_file.get("file_path", "unknown")
|
|
175
|
+
file_data = current_file.get("file_data", b"")
|
|
176
|
+
|
|
177
|
+
# Check if it's a ZIP file (HWPX)
|
|
178
|
+
if file_data[:4] == b'PK\x03\x04':
|
|
179
|
+
self.logger.info(f"File {file_path} is a Zip file. Processing as HWPX.")
|
|
180
|
+
from xgen_doc2chunk.core.processor.hwpx_handler import HWPXHandler
|
|
181
|
+
hwpx_handler = HWPXHandler(config=self.config, image_processor=self.format_image_processor)
|
|
182
|
+
return hwpx_handler.extract_text(current_file, extract_metadata=extract_metadata)
|
|
183
|
+
|
|
184
|
+
# Check HWP 3.0 format
|
|
185
|
+
if b'HWP Document File' in file_data[:32]:
|
|
186
|
+
return "[HWP 3.0 Format - Not Supported]"
|
|
187
|
+
|
|
188
|
+
return self._process_corrupted_hwp(current_file)
|
|
189
|
+
|
|
190
|
+
def _extract_metadata(self, ole: olefile.OleFileIO) -> str:
    """Return the formatted metadata text for an open OLE file.

    Thin wrapper that forwards to ``extract_and_format_metadata()``.
    """
    formatted = self.extract_and_format_metadata(ole)
    return formatted
|
|
193
|
+
|
|
194
|
+
def _parse_docinfo(self, ole: olefile.OleFileIO) -> Dict:
    """Parse the DocInfo stream into a two-key lookup dict.

    Returns:
        ``{'by_storage_id': ..., 'by_index': ...}`` holding the two values
        returned by ``parse_doc_info(ole)``, in that order.
    """
    by_storage_id, by_index = parse_doc_info(ole)
    lookup = {}
    lookup['by_storage_id'] = by_storage_id
    lookup['by_index'] = by_index
    return lookup
|
|
198
|
+
|
|
199
|
+
def _extract_body_text(self, ole: olefile.OleFileIO, bin_data_map: Dict, processed_images: Set[str]) -> List[str]:
    """Extract text from BodyText sections.

    Entries of the form ``BodyText/Section<N>`` are processed in ascending
    numeric order of ``N``. Entries that do not have at least two path
    components, or whose section suffix is not an integer, are skipped
    instead of aborting extraction of the whole file.

    Args:
        ole: Open OLE container.
        bin_data_map: Lookup dict passed through to ``_parse_section``.
        processed_images: Set passed through to ``_parse_section``.

    Returns:
        One text string per section that decompressed successfully.
    """

    def section_number(entry):
        """Return the numeric section index of *entry*, or None if it is not a section."""
        if len(entry) < 2 or entry[0] != "BodyText":
            return None
        if not entry[1].startswith("Section"):
            return None
        try:
            return int(entry[1].replace("Section", ""))
        except ValueError:
            self.logger.warning(f"Skipping BodyText entry with non-numeric section name: {entry[1]}")
            return None

    numbered_sections = []
    for entry in ole.listdir():
        number = section_number(entry)
        if number is not None:
            numbered_sections.append((number, entry))
    # Sort on the number only; list.sort is stable, so ties keep listdir order.
    numbered_sections.sort(key=lambda pair: pair[0])

    text_content = []
    for _, section in numbered_sections:
        stream = ole.openstream(section)
        data = stream.read()

        decompressed_data, success = decompress_section(data)
        if not success:
            continue

        section_text = self._parse_section(decompressed_data, ole, bin_data_map, processed_images)

        # Fall back to raw text scanning when record parsing yields nothing.
        if not section_text or not section_text.strip():
            section_text = extract_text_from_stream_raw(decompressed_data)

        text_content.append(section_text)

    return text_content
|
|
225
|
+
|
|
226
|
+
def _parse_section(self, data: bytes, ole=None, bin_data_map=None, processed_images=None) -> str:
    """Build the record tree for one section and render it to text.

    Any exception raised while building or traversing the tree is logged
    and converted into an empty string, so callers can fall back to other
    extraction strategies.
    """
    try:
        tree = HwpRecord.build_tree(data)
        rendered = self._traverse_tree(tree, ole, bin_data_map, processed_images)
    except Exception as exc:
        self.logger.error(f"Error parsing HWP section: {exc}")
        rendered = ""
    return rendered
|
|
234
|
+
|
|
235
|
+
def _traverse_tree(self, record: 'HwpRecord', ole=None, bin_data_map=None, processed_images=None) -> str:
    """Traverse record tree and return the concatenated text.

    Dispatch by ``record.tag_id``:
      * PARA_HEADER is delegated entirely to ``_process_paragraph``.
      * CTRL_HEADER and SHAPE_COMPONENT_PICTURE are delegated to their
        handlers; when a handler returns a falsy result the record falls
        through to the generic child traversal below.
      * PARA_TEXT contributes its own text with ``\\x0b`` characters removed.

    Every other record contributes only the text of its children.
    """
    if record.tag_id == HWPTAG_PARA_HEADER:
        # Paragraphs return here, so no paragraph-specific handling is
        # needed after this point.
        return self._process_paragraph(record, ole, bin_data_map, processed_images)

    if record.tag_id == HWPTAG_CTRL_HEADER:
        result = self._process_control(record, ole, bin_data_map, processed_images)
        if result:
            return result

    if record.tag_id == HWPTAG_SHAPE_COMPONENT_PICTURE:
        result = self._process_picture(record, ole, bin_data_map, processed_images)
        if result:
            return result

    parts = []

    if record.tag_id == HWPTAG_PARA_TEXT:
        text = record.get_text().replace('\x0b', '')
        if text:
            parts.append(text)

    for child in record.children:
        child_text = self._traverse_tree(child, ole, bin_data_map, processed_images)
        if child_text:
            parts.append(child_text)

    return "".join(parts)
|
|
266
|
+
|
|
267
|
+
def _process_paragraph(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> str:
    """Process PARA_HEADER record.

    The paragraph text is taken from the first PARA_TEXT child. Each
    ``\\x0b`` character in that text is treated as the position of one
    control child (CTRL_HEADER or TABLE), consumed in child order. Controls
    that have no matching ``\\x0b`` are rendered once each after the last
    text segment. The result always ends with a newline.
    """
    text_rec = next((c for c in record.children if c.tag_id == HWPTAG_PARA_TEXT), None)
    text_content = text_rec.get_text() if text_rec else ""

    control_tags = (HWPTAG_CTRL_HEADER, HWPTAG_TABLE)
    controls = [c for c in record.children if c.tag_id in control_tags]

    # split() yields one more segment than there are markers; with no
    # marker at all it yields the whole text as a single segment.
    segments = text_content.split('\x0b')
    marker_count = len(segments) - 1

    parts = []
    for i, segment in enumerate(segments):
        parts.append(segment)
        # Only a segment that is followed by a marker consumes a control.
        # The last segment must not, or controls[marker_count] would be
        # rendered here and again in the loop below.
        if i < marker_count and i < len(controls):
            parts.append(self._traverse_tree(controls[i], ole, bin_data_map, processed_images))

    # Controls without a marker in the text are appended in order.
    for control in controls[marker_count:]:
        parts.append(self._traverse_tree(control, ole, bin_data_map, processed_images))

    parts.append("\n")
    return "".join(parts)
|
|
292
|
+
|
|
293
|
+
def _process_control(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]:
|
|
294
|
+
"""Process CTRL_HEADER record."""
|
|
295
|
+
if len(record.payload) < 4:
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
ctrl_id = record.payload[:4][::-1]
|
|
299
|
+
|
|
300
|
+
if ctrl_id == b'tbl ':
|
|
301
|
+
return parse_table(record, self._traverse_tree, ole, bin_data_map, processed_images)
|
|
302
|
+
|
|
303
|
+
if ctrl_id == b'gso ':
|
|
304
|
+
return self._process_gso(record, ole, bin_data_map, processed_images)
|
|
305
|
+
|
|
306
|
+
return None
|
|
307
|
+
|
|
308
|
+
def _process_gso(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]:
    """Process GSO (Graphic Shape Object) record.

    Collects every SHAPE_COMPONENT_PICTURE record in the subtree rooted at
    ``record`` (the root included) in pre-order, renders each one with
    ``_process_picture`` and concatenates the non-empty results.

    Returns:
        The concatenated image text, or None when nothing was rendered.
    """
    # Iterative pre-order walk; children are pushed reversed so they are
    # popped in their original order.
    pictures = []
    pending = [record]
    while pending:
        node = pending.pop()
        if node.tag_id == HWPTAG_SHAPE_COMPONENT_PICTURE:
            pictures.append(node)
        pending.extend(reversed(list(node.children)))

    rendered = []
    for picture in pictures:
        fragment = self._process_picture(picture, ole, bin_data_map, processed_images)
        if fragment:
            rendered.append(fragment)

    return "".join(rendered) if rendered else None
|
|
329
|
+
|
|
330
|
+
def _process_picture(self, record: 'HwpRecord', ole, bin_data_map, processed_images) -> Optional[str]:
|
|
331
|
+
"""Process SHAPE_COMPONENT_PICTURE record."""
|
|
332
|
+
if not bin_data_map or not ole:
|
|
333
|
+
return None
|
|
334
|
+
|
|
335
|
+
bin_data_list = bin_data_map.get('by_index', [])
|
|
336
|
+
if not bin_data_list:
|
|
337
|
+
return None
|
|
338
|
+
|
|
339
|
+
image_processor = self.format_image_processor
|
|
340
|
+
|
|
341
|
+
# Use image processor methods directly
|
|
342
|
+
bindata_index = image_processor.extract_bindata_index(record.payload, len(bin_data_list))
|
|
343
|
+
|
|
344
|
+
if bindata_index and 0 < bindata_index <= len(bin_data_list):
|
|
345
|
+
storage_id, ext = bin_data_list[bindata_index - 1]
|
|
346
|
+
if storage_id > 0:
|
|
347
|
+
target_stream = image_processor.find_bindata_stream(ole, storage_id, ext)
|
|
348
|
+
if target_stream:
|
|
349
|
+
return image_processor.extract_and_save_image(ole, target_stream, processed_images)
|
|
350
|
+
|
|
351
|
+
if len(bin_data_list) == 1:
|
|
352
|
+
storage_id, ext = bin_data_list[0]
|
|
353
|
+
if storage_id > 0:
|
|
354
|
+
target_stream = image_processor.find_bindata_stream(ole, storage_id, ext)
|
|
355
|
+
if target_stream:
|
|
356
|
+
return image_processor.extract_and_save_image(ole, target_stream, processed_images)
|
|
357
|
+
|
|
358
|
+
return None
|
|
359
|
+
|
|
360
|
+
def _process_corrupted_hwp(self, current_file: "CurrentFile") -> str:
    """Attempt forensic recovery of corrupted HWP file.

    Steps, in order:
      1. Return a fixed "not supported" string when the signature check
         reports ``"HWP3.0"``.
      2. Parse every zlib stream found in the raw bytes as a section,
         falling back to raw text extraction per stream.
      3. If no stream produced text, scan the whole file as raw text and
         keep the result only when it is longer than 100 characters.
      4. Append any images recovered from the raw bytes.

    Returns:
        The recovered text joined with newlines, a fixed placeholder when
        nothing was recovered, or ``"Forensic recovery failed: <message>"``
        when any step raises.
    """
    source = current_file.get("file_path", "unknown")
    raw_data = current_file.get("file_data", b"")

    self.logger.info(f"Starting forensic recovery for: {source}")
    recovered = []

    try:
        if check_file_signature(raw_data) == "HWP3.0":
            return "[HWP 3.0 Format - Not Supported]"

        for _, chunk in find_zlib_streams(raw_data, min_size=50):
            chunk_text = self._parse_section(chunk)
            if not (chunk_text and chunk_text.strip()):
                chunk_text = extract_text_from_stream_raw(chunk)
            if chunk_text and chunk_text.strip():
                recovered.append(chunk_text)

        if not recovered:
            whole_file_text = extract_text_from_stream_raw(raw_data)
            if whole_file_text and len(whole_file_text) > 100:
                recovered.append(whole_file_text)

        image_text = recover_images_from_raw(raw_data, image_processor=self.format_image_processor)
        if image_text:
            recovered.append(f"\n\n=== Recovered Images ===\n{image_text}")

    except Exception as exc:
        self.logger.error(f"Forensic recovery failed: {exc}")
        return f"Forensic recovery failed: {str(exc)}"

    if recovered:
        return "\n".join(recovered)
    return "[Forensic Recovery: No text found]"
|
|
401
|
+
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwp_helper/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
HWP/HWPX 공통 헬퍼 모듈
|
|
4
|
+
|
|
5
|
+
HWP 5.0 OLE 파일 처리에 필요한 유틸리티 모듈을 제공합니다.
|
|
6
|
+
|
|
7
|
+
파일 구조:
|
|
8
|
+
- hwp_constants.py: 상수 정의 (태그 ID, 차트 타입 등)
|
|
9
|
+
- hwp_record.py: HWP 레코드 파싱 클래스
|
|
10
|
+
- hwp_decoder.py: 압축/인코딩 유틸리티
|
|
11
|
+
- hwp_metadata.py: 메타데이터 추출
|
|
12
|
+
- hwp_image_processor.py: image processing (HWPImageProcessor)
|
|
13
|
+
- hwp_chart_extractor.py: chart extraction (HWPChartExtractor)
|
|
14
|
+
- hwp_docinfo.py: DocInfo 파싱
|
|
15
|
+
- hwp_table.py: 테이블 파싱
|
|
16
|
+
- hwp_recovery.py: 손상 파일 복구
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
# Constants
|
|
20
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import (
|
|
21
|
+
HWPTAG_BEGIN,
|
|
22
|
+
HWPTAG_BIN_DATA,
|
|
23
|
+
HWPTAG_PARA_HEADER,
|
|
24
|
+
HWPTAG_PARA_TEXT,
|
|
25
|
+
HWPTAG_CTRL_HEADER,
|
|
26
|
+
HWPTAG_LIST_HEADER,
|
|
27
|
+
HWPTAG_SHAPE_COMPONENT,
|
|
28
|
+
HWPTAG_SHAPE_COMPONENT_PICTURE,
|
|
29
|
+
HWPTAG_TABLE,
|
|
30
|
+
HWPTAG_SHAPE_COMPONENT_OLE,
|
|
31
|
+
HWPTAG_CHART_DATA,
|
|
32
|
+
CHART_TYPES,
|
|
33
|
+
CTRL_CHAR_DRAWING_TABLE_OBJECT,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
# Record Parser
|
|
37
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_record import HwpRecord
|
|
38
|
+
|
|
39
|
+
# Decoder
|
|
40
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_decoder import (
|
|
41
|
+
is_compressed,
|
|
42
|
+
decompress_stream,
|
|
43
|
+
decompress_section,
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
# Metadata
|
|
47
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_metadata import (
|
|
48
|
+
HWPMetadataExtractor,
|
|
49
|
+
parse_hwp_summary_information,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Image Processor (replaces hwp_image.py utility functions)
|
|
53
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_image_processor import HWPImageProcessor
|
|
54
|
+
|
|
55
|
+
# Chart Extractor
|
|
56
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_chart_extractor import HWPChartExtractor
|
|
57
|
+
|
|
58
|
+
# DocInfo
|
|
59
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_docinfo import (
|
|
60
|
+
parse_doc_info,
|
|
61
|
+
scan_bindata_folder,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# Table
|
|
65
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_table import (
|
|
66
|
+
parse_table,
|
|
67
|
+
build_table_grid,
|
|
68
|
+
render_table_html,
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
# Recovery
|
|
72
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_recovery import (
|
|
73
|
+
extract_text_from_stream_raw,
|
|
74
|
+
find_zlib_streams,
|
|
75
|
+
recover_images_from_raw,
|
|
76
|
+
check_file_signature,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
__all__ = [
|
|
81
|
+
# Constants
|
|
82
|
+
'HWPTAG_BEGIN',
|
|
83
|
+
'HWPTAG_BIN_DATA',
|
|
84
|
+
'HWPTAG_PARA_HEADER',
|
|
85
|
+
'HWPTAG_PARA_TEXT',
|
|
86
|
+
'HWPTAG_CTRL_HEADER',
|
|
87
|
+
'HWPTAG_LIST_HEADER',
|
|
88
|
+
'HWPTAG_SHAPE_COMPONENT',
|
|
89
|
+
'HWPTAG_SHAPE_COMPONENT_PICTURE',
|
|
90
|
+
'HWPTAG_TABLE',
|
|
91
|
+
'HWPTAG_SHAPE_COMPONENT_OLE',
|
|
92
|
+
'HWPTAG_CHART_DATA',
|
|
93
|
+
'CHART_TYPES',
|
|
94
|
+
'CTRL_CHAR_DRAWING_TABLE_OBJECT',
|
|
95
|
+
# Record
|
|
96
|
+
'HwpRecord',
|
|
97
|
+
# Decoder
|
|
98
|
+
'is_compressed',
|
|
99
|
+
'decompress_stream',
|
|
100
|
+
'decompress_section',
|
|
101
|
+
# Metadata
|
|
102
|
+
'HWPMetadataExtractor',
|
|
103
|
+
'parse_hwp_summary_information',
|
|
104
|
+
# Image Processor
|
|
105
|
+
'HWPImageProcessor',
|
|
106
|
+
# Chart Extractor
|
|
107
|
+
'HWPChartExtractor',
|
|
108
|
+
# DocInfo
|
|
109
|
+
'parse_doc_info',
|
|
110
|
+
'scan_bindata_folder',
|
|
111
|
+
# Table
|
|
112
|
+
'parse_table',
|
|
113
|
+
'build_table_grid',
|
|
114
|
+
'render_table_html',
|
|
115
|
+
# Recovery
|
|
116
|
+
'extract_text_from_stream_raw',
|
|
117
|
+
'find_zlib_streams',
|
|
118
|
+
'recover_images_from_raw',
|
|
119
|
+
'check_file_signature',
|
|
120
|
+
]
|