xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py
|
|
2
|
+
"""
|
|
3
|
+
HWP DocInfo 파싱 유틸리티
|
|
4
|
+
|
|
5
|
+
HWP 5.0 OLE 파일의 DocInfo 스트림을 파싱하여 BinData 매핑 정보를 추출합니다.
|
|
6
|
+
- parse_doc_info: DocInfo 스트림에서 BinData 레코드 매핑 추출
|
|
7
|
+
- scan_bindata_folder: BinData 폴더 직접 스캔 (폴백)
|
|
8
|
+
"""
|
|
9
|
+
import re
|
|
10
|
+
import struct
|
|
11
|
+
import logging
|
|
12
|
+
import traceback
|
|
13
|
+
from typing import Dict, List, Tuple
|
|
14
|
+
|
|
15
|
+
import olefile
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import HWPTAG_BIN_DATA
|
|
18
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_record import HwpRecord
|
|
19
|
+
from xgen_doc2chunk.core.processor.hwp_helper.hwp_decoder import (
|
|
20
|
+
is_compressed,
|
|
21
|
+
decompress_stream,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger("document-processor")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_doc_info(ole: olefile.OleFileIO) -> Tuple[Dict[int, Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Parse the DocInfo stream and map its BinData records.

    Reads the "DocInfo" stream from the OLE container, passes it through
    ``decompress_stream`` and walks the top-level records of the tree built
    by ``HwpRecord.build_tree``. Records tagged ``HWPTAG_BIN_DATA`` carry a
    storage_id and an extension; they are collected in record order.

    If no BinData entry was collected, or if parsing raises, the result of
    ``scan_bindata_folder`` is used instead.

    Args:
        ole: OLE file object

    Returns:
        Tuple of:
        - bin_data_by_storage_id: storage_id -> (storage_id, extension) mapping
        - bin_data_list: ordered list of (storage_id, extension), intended for
          1-based index lookup; LINK records and records too short to hold a
          storage_id are kept as ``(0, "")`` placeholders
    """
    bin_data_by_storage_id = {}
    bin_data_list = []

    try:
        if not ole.exists("DocInfo"):
            logger.warning("DocInfo stream not found in OLE file")
            return bin_data_by_storage_id, bin_data_list

        compressed = is_compressed(ole)
        logger.info(f"HWP file compressed: {compressed}")

        stream = ole.openstream("DocInfo")
        data = stream.read()
        original_size = len(data)

        data = decompress_stream(data, compressed)
        logger.info(f"DocInfo stream: original={original_size}, decompressed={len(data)}")

        root = HwpRecord.build_tree(data)
        logger.info(f"DocInfo tree built with {len(root.children)} top-level records")

        # Debug aid: log how often each tag ID occurs among the top-level records
        tag_counts = {}
        for child in root.children:
            tag_counts[child.tag_id] = tag_counts.get(child.tag_id, 0) + 1
        logger.info(f"DocInfo tag distribution: {tag_counts}")

        for child in root.children:
            if child.tag_id == HWPTAG_BIN_DATA:
                payload = child.payload
                logger.debug(f"Found BIN_DATA record, payload size: {len(payload)}, hex: {payload[:20].hex() if len(payload) >= 20 else payload.hex()}")

                # Too short to even hold the 2-byte flags word: skipped without
                # adding a placeholder to bin_data_list.
                if len(payload) < 2:
                    continue

                # The low 4 bits of the little-endian flags word select the storage type.
                flags = struct.unpack('<H', payload[0:2])[0]
                storage_type = flags & 0x0F
                logger.debug(f"BIN_DATA flags: {flags:#06x}, storage_type: {storage_type}")

                if storage_type in [1, 2]:  # EMBEDDING or STORAGE
                    if len(payload) < 4:
                        # Keep a placeholder so later records keep their list position.
                        bin_data_list.append((0, ""))
                        continue
                    storage_id = struct.unpack('<H', payload[2:4])[0]

                    ext = ""
                    if len(payload) >= 6:
                        # ext_len counts UTF-16 code units, hence ext_len * 2 bytes.
                        ext_len = struct.unpack('<H', payload[4:6])[0]
                        if ext_len > 0 and len(payload) >= 6 + ext_len * 2:
                            ext = payload[6:6+ext_len*2].decode('utf-16le', errors='ignore')

                    bin_data_by_storage_id[storage_id] = (storage_id, ext)
                    bin_data_list.append((storage_id, ext))
                    logger.debug(f"DocInfo BIN_DATA #{len(bin_data_list)}: storage_id={storage_id}, ext='{ext}'")

                elif storage_type == 0:  # LINK
                    # Placeholder only: no storage_id is read for LINK records.
                    bin_data_list.append((0, ""))
                    logger.debug(f"DocInfo BIN_DATA #{len(bin_data_list)}: LINK type (external)")

                else:
                    # Unknown storage type: read the same fields on a best-effort
                    # basis, accepting only extension lengths below 20.
                    storage_id = 0
                    ext = ""
                    if len(payload) >= 4:
                        storage_id = struct.unpack('<H', payload[2:4])[0]
                    if len(payload) >= 6:
                        ext_len = struct.unpack('<H', payload[4:6])[0]
                        if ext_len > 0 and ext_len < 20 and len(payload) >= 6 + ext_len * 2:
                            ext = payload[6:6+ext_len*2].decode('utf-16le', errors='ignore')
                    if storage_id > 0:
                        bin_data_by_storage_id[storage_id] = (storage_id, ext)
                    bin_data_list.append((storage_id, ext))
                    logger.debug(f"DocInfo BIN_DATA #{len(bin_data_list)}: unknown type {storage_type}, storage_id={storage_id}")

        logger.info(f"DocInfo parsed: {len(bin_data_list)} BIN_DATA records, {len(bin_data_by_storage_id)} with storage_id")

        # Fallback: no BIN_DATA in DocInfo, so scan the BinData folder directly
        if len(bin_data_list) == 0:
            logger.info("No BIN_DATA in DocInfo, scanning BinData folder directly...")
            bin_data_by_storage_id, bin_data_list = scan_bindata_folder(ole)

    except Exception as e:
        logger.warning(f"Failed to parse DocInfo: {e}")
        logger.debug(traceback.format_exc())
        # Parsing failed part-way: replace any partial result with a folder scan.
        try:
            bin_data_by_storage_id, bin_data_list = scan_bindata_folder(ole)
        except Exception:
            pass

    return bin_data_by_storage_id, bin_data_list
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def scan_bindata_folder(ole: olefile.OleFileIO) -> Tuple[Dict[int, Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Fallback: find embedded files by listing the BinData folder directly.

    Used when DocInfo parsing failed or produced no BIN_DATA records.
    Every entry whose first path component is "BinData" and whose second
    component looks like ``BINxxxx.ext`` (four hex digits) is collected.
    The ``BIN`` prefix is matched case-insensitively, in line with
    ``HWPImageProcessor.find_bindata_stream``, which also accepts
    ``Bin``/``bin`` prefixed stream names.

    Any exception raised while scanning is logged as a warning and whatever
    was collected up to that point is returned.

    Args:
        ole: OLE file object

    Returns:
        Tuple of:
        - bin_data_by_storage_id: storage_id -> (storage_id, extension) mapping
        - bin_data_list: list of (storage_id, extension), sorted by storage_id
    """
    bin_data_by_storage_id = {}
    bin_data_list = []

    # Compiled once, outside the loop. IGNORECASE only widens the accepted
    # prefix spellings; the hex digits were already matched in both cases.
    name_pattern = re.compile(r'BIN([0-9A-Fa-f]{4})\.(\w+)', re.IGNORECASE)

    try:
        for entry in ole.listdir():
            if len(entry) < 2 or entry[0] != "BinData":
                continue

            filename = entry[1]
            match = name_pattern.match(filename)
            if not match:
                continue

            # The four hex digits in the stream name are the storage_id.
            storage_id = int(match.group(1), 16)
            ext = match.group(2)
            bin_data_by_storage_id[storage_id] = (storage_id, ext)
            bin_data_list.append((storage_id, ext))
            logger.debug(f"Found BinData stream: {filename} -> storage_id={storage_id}, ext={ext}")

        if bin_data_list:
            bin_data_list.sort(key=lambda x: x[0])
            logger.info(f"BinData folder scan: found {len(bin_data_list)} files")
    except Exception as e:
        logger.warning(f"Failed to scan BinData folder: {e}")

    return bin_data_by_storage_id, bin_data_list
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
__all__ = [
|
|
172
|
+
'parse_doc_info',
|
|
173
|
+
'scan_bindata_folder',
|
|
174
|
+
]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
HWPFileConverter - HWP file format converter
|
|
4
|
+
|
|
5
|
+
Converts binary HWP data to OLE file object.
|
|
6
|
+
"""
|
|
7
|
+
from io import BytesIO
|
|
8
|
+
from typing import Any, Optional, BinaryIO
|
|
9
|
+
|
|
10
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HWPFileConverter(BaseFileConverter):
    """
    File converter that opens HWP binaries with olefile.

    Wraps binary HWP (OLE format) data, or a caller-supplied stream, in an
    ``olefile.OleFileIO`` object.
    """

    # Signature expected in the first 8 bytes of an OLE file
    OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Open HWP data as an OleFileIO object.

        Args:
            file_data: Raw binary HWP data; only used when no stream is given
            file_stream: Optional file stream; takes precedence over file_data
            **kwargs: Additional options (not used here)

        Returns:
            olefile.OleFileIO object
        """
        import olefile

        if file_stream is None:
            source = BytesIO(file_data)
        else:
            source = file_stream

        # Rewind so the OLE header is read from the start of the stream.
        source.seek(0)
        return olefile.OleFileIO(source)

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "HWP Document (OLE)"

    def validate(self, file_data: bytes) -> bool:
        """Return True when the data starts with the 8-byte OLE signature."""
        return (
            bool(file_data)
            and len(file_data) >= 8
            and file_data[:8] == self.OLE_MAGIC
        )

    def close(self, converted_object: Any) -> None:
        """Close the converted object if it exposes a ``close`` attribute."""
        if converted_object is None or not hasattr(converted_object, 'close'):
            return
        converted_object.close()
|
|
60
|
+
|
|
@@ -0,0 +1,413 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
HWP Image Processor
|
|
4
|
+
|
|
5
|
+
Provides HWP-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles BinData stream images and embedded images in HWP 5.0 OLE format.
|
|
7
|
+
|
|
8
|
+
This class consolidates all HWP image extraction logic including:
|
|
9
|
+
- zlib decompression for compressed images
|
|
10
|
+
- BinData stream finding and extraction
|
|
11
|
+
- OLE storage image processing
|
|
12
|
+
"""
|
|
13
|
+
import io
|
|
14
|
+
import os
|
|
15
|
+
import zlib
|
|
16
|
+
import struct
|
|
17
|
+
import logging
|
|
18
|
+
from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
|
|
19
|
+
|
|
20
|
+
from PIL import Image
|
|
21
|
+
|
|
22
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
23
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
import olefile
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.hwp")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class HWPImageProcessor(ImageProcessor):
|
|
32
|
+
"""
|
|
33
|
+
HWP-specific image processor.
|
|
34
|
+
|
|
35
|
+
Inherits from ImageProcessor and provides HWP-specific processing.
|
|
36
|
+
|
|
37
|
+
Handles:
|
|
38
|
+
- BinData stream images
|
|
39
|
+
- Compressed images (zlib)
|
|
40
|
+
- Embedded OLE images
|
|
41
|
+
|
|
42
|
+
Example:
|
|
43
|
+
processor = HWPImageProcessor()
|
|
44
|
+
|
|
45
|
+
# Process BinData image
|
|
46
|
+
tag = processor.process_image(image_data, bindata_id="BIN0001")
|
|
47
|
+
|
|
48
|
+
# Process from OLE stream
|
|
49
|
+
tag = processor.process_bindata_stream(ole, stream_path)
|
|
50
|
+
"""
|
|
51
|
+
|
|
52
|
+
def __init__(
    self,
    directory_path: str = "temp/images",
    tag_prefix: str = "[Image:",
    tag_suffix: str = "]",
    storage_backend: Optional[BaseStorageBackend] = None,
):
    """
    Initialize HWPImageProcessor.

    All four arguments are forwarded unchanged, by keyword, to the parent
    class initializer; nothing else is set up here.

    Args:
        directory_path: Image save directory
        tag_prefix: Tag prefix for image references
        tag_suffix: Tag suffix for image references
        storage_backend: Storage backend for saving images
    """
    super().__init__(
        directory_path=directory_path,
        tag_prefix=tag_prefix,
        tag_suffix=tag_suffix,
        storage_backend=storage_backend,
    )
|
|
74
|
+
|
|
75
|
+
def process_image(
    self,
    image_data: bytes,
    bindata_id: Optional[str] = None,
    image_index: Optional[int] = None,
    **kwargs
) -> Optional[str]:
    """
    Save HWP image data under an HWP-specific name.

    The name passed to ``save_image`` is ``hwp_<bindata_id>`` when a BinData
    ID is given, otherwise ``hwp_image_<image_index>`` when an index is
    given, otherwise ``None``.

    Args:
        image_data: Raw image binary data
        bindata_id: BinData ID (e.g., "BIN0001"); wins over image_index
        image_index: Image index (for naming)
        **kwargs: Additional options (not used here)

    Returns:
        Whatever ``save_image`` returns: an image tag string, or None on failure
    """
    if bindata_id is None and image_index is None:
        chosen_name = None
    elif bindata_id is None:
        chosen_name = f"hwp_image_{image_index}"
    else:
        chosen_name = f"hwp_{bindata_id}"

    return self.save_image(image_data, custom_name=chosen_name)
|
|
101
|
+
|
|
102
|
+
def process_bindata_stream(
    self,
    ole: "olefile.OleFileIO",
    stream_path: str,
    is_compressed: bool = True,
) -> Optional[str]:
    """
    Read one BinData OLE stream and hand its image bytes to process_image.

    When ``is_compressed`` is true the stream is first inflated as raw
    deflate (negative window size), then as a zlib-wrapped stream; if both
    attempts raise ``zlib.error`` the bytes are used unchanged.

    Args:
        ole: OleFileIO object
        stream_path: Path to BinData stream; the part after the last '/'
            becomes the BinData ID
        is_compressed: Whether data is zlib compressed

    Returns:
        Image tag string, or None on failure (the error is logged as a warning)
    """
    try:
        stream_data = ole.openstream(stream_path).read()

        # Start from the raw bytes and overwrite them with the first
        # decompression attempt that succeeds.
        image_data = stream_data
        if is_compressed:
            for window_bits in (-15, zlib.MAX_WBITS):
                try:
                    image_data = zlib.decompress(stream_data, window_bits)
                except zlib.error:
                    continue
                break

        # Extract bindata ID from path
        if '/' in stream_path:
            bindata_id = stream_path.split('/')[-1]
        else:
            bindata_id = stream_path

        return self.process_image(image_data, bindata_id=bindata_id)

    except Exception as e:
        self._logger.warning(f"Failed to process BinData stream {stream_path}: {e}")
        return None
|
|
145
|
+
|
|
146
|
+
def process_embedded_image(
    self,
    image_data: bytes,
    image_name: Optional[str] = None,
    bindata_id: Optional[str] = None,
    **kwargs
) -> Optional[str]:
    """
    Save an embedded HWP image.

    The explicit ``image_name`` is used as the save name when given; only
    when it is missing does the BinData ID produce ``hwp_embed_<bindata_id>``.

    Args:
        image_data: Image binary data
        image_name: Original image filename
        bindata_id: BinData ID
        **kwargs: Additional options (not used here)

    Returns:
        Whatever ``save_image`` returns: an image tag string, or None on failure
    """
    if image_name is not None or bindata_id is None:
        chosen_name = image_name
    else:
        chosen_name = f"hwp_embed_{bindata_id}"

    return self.save_image(image_data, custom_name=chosen_name)
|
|
170
|
+
|
|
171
|
+
def decompress_and_process(
    self,
    compressed_data: bytes,
    bindata_id: Optional[str] = None,
) -> Optional[str]:
    """
    Run the data through try_decompress_image, then through process_image.

    Args:
        compressed_data: zlib compressed image data
        bindata_id: BinData ID

    Returns:
        Image tag string, or None on failure
    """
    return self.process_image(
        self.try_decompress_image(compressed_data),
        bindata_id=bindata_id,
    )
|
|
188
|
+
|
|
189
|
+
@staticmethod
def try_decompress_image(data: bytes) -> bytes:
    """
    Return HWP image bytes, inflated when they turn out to be compressed.

    Three attempts are made in order, and the first that succeeds decides
    the result:
    1. data starting with byte 0x78 is decompressed as a zlib stream;
    2. data that PIL can open and verify is returned unchanged;
    3. data is decompressed as raw deflate (no header).
    If none succeeds the input is returned unchanged.

    Args:
        data: Original image data (possibly compressed)

    Returns:
        Decompressed image data (or original if not compressed)
    """
    # 1. zlib stream, attempted only when the zlib header byte is present
    if data.startswith(b'\x78'):
        try:
            inflated = zlib.decompress(data)
        except Exception:
            pass
        else:
            return inflated

    # 2. already a valid image: leave the bytes alone
    try:
        with Image.open(io.BytesIO(data)) as picture:
            picture.verify()
    except Exception:
        pass
    else:
        return data

    # 3. raw deflate (negative window size means "no header")
    try:
        inflated = zlib.decompress(data, -15)
    except Exception:
        pass
    else:
        return inflated

    return data
|
|
225
|
+
|
|
226
|
+
@staticmethod
def find_bindata_stream(ole: "olefile.OleFileIO", storage_id: int, ext: str) -> Optional[List[str]]:
    """
    Find BinData stream in OLE container by storage_id and extension.

    Only entries whose first path component is "BinData" and that have at
    least one more component are considered. Matching runs from most to
    least specific:

    1. the stream name equals ``BIN<storage_id as 4 hex digits>.<ext>``,
       compared case-insensitively;
    2. fallback: the stream name merely contains ``bin<4 hex digits>``,
       whatever its extension.

    Running the precise match first means a stream with the requested
    extension is preferred over another stream that only shares the ID.

    Args:
        ole: OLE file object
        storage_id: BinData storage ID
        ext: File extension (without the leading dot)

    Returns:
        Stream path (the matching ``listdir`` entry) if found, None otherwise
    """
    id_stem = f"bin{storage_id:04x}"
    wanted_name = f"{id_stem}.{ext}".lower()

    bindata_entries = [
        entry for entry in ole.listdir()
        if len(entry) > 1 and entry[0] == "BinData"
    ]

    # Pass 1: full name (ID and extension), ignoring case.
    for entry in bindata_entries:
        if entry[1].lower() == wanted_name:
            return entry

    # Pass 2: loose fallback on the ID alone, for streams whose stored
    # extension differs from the one recorded for this storage_id.
    for entry in bindata_entries:
        if id_stem in entry[1].lower():
            logger.debug(f"Found stream by pattern match: {entry}")
            return entry

    return None
|
|
278
|
+
|
|
279
|
+
@staticmethod
def extract_bindata_index(payload: bytes, bin_data_list_len: int) -> Optional[int]:
    """
    Extract BinData index from SHAPE_COMPONENT_PICTURE record payload.

    The payload is probed at a series of byte offsets for a little-endian
    16-bit value; the first value ``v`` with ``0 < v <= bin_data_list_len``
    is returned. Offsets are tried in this order so that different HWP
    versions are covered: 79, then 8, then a fixed list of common offsets,
    then every even offset below 100.

    Args:
        payload: SHAPE_COMPONENT_PICTURE record payload
        bin_data_list_len: Length of bin_data_list (for validation)

    Returns:
        BinData index (1-based) or None
    """
    if bin_data_list_len == 0:
        return None

    payload_size = len(payload)

    def read_u16(position):
        # Little-endian unsigned 16-bit value starting at `position`.
        return struct.unpack('<H', payload[position:position + 2])[0]

    def is_plausible(value):
        # A usable index is 1-based and within the BinData list.
        return 0 < value <= bin_data_list_len

    # Strategy 1: offset 79 (HWP 5.0.3.x+ spec)
    if payload_size >= 81:
        found = read_u16(79)
        if is_plausible(found):
            logger.debug(f"Found BinData index at offset 79: {found}")
            return found

    # Strategy 2: offset 8 (older versions)
    if payload_size >= 10:
        found = read_u16(8)
        if is_plausible(found):
            logger.debug(f"Found BinData index at offset 8: {found}")
            return found

    # Strategy 3: scan a list of common offsets
    common_offsets = (4, 6, 10, 12, 14, 16, 18, 20, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80)
    for position in common_offsets:
        if payload_size < position + 2:
            continue
        found = read_u16(position)
        if is_plausible(found):
            logger.debug(f"Found potential BinData index at offset {position}: {found}")
            return found

    # Strategy 4: scan even offsets below 100 for the first plausible 2-byte value
    for position in range(0, min(payload_size - 1, 100), 2):
        found = read_u16(position)
        if is_plausible(found):
            logger.debug(f"Found BinData index by scanning at offset {position}: {found}")
            return found

    return None
|
|
332
|
+
|
|
333
|
+
def extract_and_save_image(
    self,
    ole: "olefile.OleFileIO",
    target_stream: List[str],
    processed_images: Optional[Set[str]] = None,
) -> Optional[str]:
    """
    Read one OLE stream, save it as an image and return its tag.

    The stream bytes go through try_decompress_image and process_image,
    with the last path component used as the BinData ID. On success the
    '/'-joined stream path is added to ``processed_images`` (when a set was
    supplied) and the tag is returned wrapped in newlines.

    Args:
        ole: OLE file object
        target_stream: Stream path
        processed_images: Set of processed image paths

    Returns:
        Image tag string or None (no tag produced, or an error was logged)
    """
    try:
        raw_bytes = ole.openstream(target_stream).read()
        image_bytes = self.try_decompress_image(raw_bytes)

        stream_name = target_stream[-1] if target_stream else None
        tag = self.process_image(image_bytes, bindata_id=stream_name)
        if not tag:
            return None

        if processed_images is not None:
            processed_images.add("/".join(target_stream))
        logger.info(f"Successfully extracted inline image: {tag}")
        return f"\n{tag}\n"
    except Exception as e:
        logger.warning(f"Failed to process inline HWP image {target_stream}: {e}")

    return None
|
|
367
|
+
|
|
368
|
+
def process_images_from_bindata(
    self,
    ole: "olefile.OleFileIO",
    processed_images: Optional[Set[str]] = None,
) -> str:
    """
    Extract images from BinData storage and save locally.

    Every entry under "BinData" whose name ends in .jpg, .jpeg, .png, .bmp
    or .gif (case-insensitive) is read, passed through
    try_decompress_image and saved via process_image. Streams whose
    '/'-joined path is already in ``processed_images`` are skipped.

    Failures are isolated per stream: a stream that cannot be read or saved
    is logged and skipped, and the remaining streams are still processed.

    Args:
        ole: OLE file object
        processed_images: Set of already processed image paths (to skip)

    Returns:
        Image tag strings joined by blank lines ("" when nothing was saved)
    """
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')
    results = []

    try:
        bindata_streams = [
            entry for entry in ole.listdir()
            if entry[0] == "BinData"
        ]
    except Exception as e:
        logger.warning(f"Error processing HWP images: {e}")
        return "\n\n".join(results)

    for stream_path in bindata_streams:
        # One try per stream so a single bad stream cannot hide the others.
        try:
            if processed_images and "/".join(stream_path) in processed_images:
                continue

            stream_name = stream_path[-1]
            if os.path.splitext(stream_name)[1].lower() not in image_extensions:
                continue

            image_data = ole.openstream(stream_path).read()
            image_data = self.try_decompress_image(image_data)

            image_tag = self.process_image(image_data, bindata_id=stream_name)
            if image_tag:
                results.append(image_tag)
        except Exception as e:
            logger.warning(f"Error processing HWP image stream {stream_path}: {e}")

    return "\n\n".join(results)
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
__all__ = ["HWPImageProcessor"]
|