xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,174 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py
2
+ """
3
+ HWP DocInfo 파싱 유틸리티
4
+
5
+ HWP 5.0 OLE 파일의 DocInfo 스트림을 파싱하여 BinData 매핑 정보를 추출합니다.
6
+ - parse_doc_info: DocInfo 스트림에서 BinData 레코드 매핑 추출
7
+ - scan_bindata_folder: BinData 폴더 직접 스캔 (폴백)
8
+ """
9
+ import re
10
+ import struct
11
+ import logging
12
+ import traceback
13
+ from typing import Dict, List, Tuple
14
+
15
+ import olefile
16
+
17
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import HWPTAG_BIN_DATA
18
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_record import HwpRecord
19
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_decoder import (
20
+ is_compressed,
21
+ decompress_stream,
22
+ )
23
+
24
+ logger = logging.getLogger("document-processor")
25
+
26
+
27
def _decode_bin_data_ext(payload: bytes, max_chars: int = 0) -> str:
    """Decode the UTF-16LE extension string stored at offset 4 of a BIN_DATA payload.

    Args:
        payload: Raw BIN_DATA record payload.
        max_chars: When positive, a declared length of ``max_chars`` or more
            is treated as implausible and rejected.

    Returns:
        The decoded extension, or "" when the field is absent, empty,
        rejected by ``max_chars``, or truncated.
    """
    if len(payload) < 6:
        return ""
    ext_len = struct.unpack_from('<H', payload, 4)[0]
    if ext_len == 0 or (max_chars > 0 and ext_len >= max_chars):
        return ""
    end = 6 + ext_len * 2  # the length field counts 2-byte UTF-16 code units
    if len(payload) < end:
        return ""
    return payload[6:end].decode('utf-16le', errors='ignore')


def parse_doc_info(ole: olefile.OleFileIO) -> Tuple[Dict[int, Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Parse the DocInfo stream and map its BinData records.

    Each BIN_DATA record in the DocInfo stream starts with a 16-bit flags
    word whose low 4 bits select the storage type. Records of type 1 or 2
    (embedding / storage) carry a 16-bit storage_id followed by a
    length-prefixed UTF-16LE extension. Type 0 (link) records are kept as
    ``(0, "")`` placeholders so list positions stay aligned with the
    record order.

    If no BIN_DATA record is found, or parsing raises, the BinData folder
    is scanned directly via ``scan_bindata_folder`` as a fallback.

    Args:
        ole: OLE file object.

    Returns:
        Tuple of:
        - bin_data_by_storage_id: storage_id -> (storage_id, extension) mapping
        - bin_data_list: (storage_id, extension) entries in record order
          (intended for 1-based index lookup)
    """
    bin_data_by_storage_id = {}
    bin_data_list = []

    try:
        if not ole.exists("DocInfo"):
            logger.warning("DocInfo stream not found in OLE file")
            return bin_data_by_storage_id, bin_data_list

        compressed = is_compressed(ole)
        logger.info(f"HWP file compressed: {compressed}")

        data = ole.openstream("DocInfo").read()
        original_size = len(data)

        data = decompress_stream(data, compressed)
        logger.info(f"DocInfo stream: original={original_size}, decompressed={len(data)}")

        root = HwpRecord.build_tree(data)
        logger.info(f"DocInfo tree built with {len(root.children)} top-level records")

        # Diagnostics: how many records of each tag id the stream contains.
        tag_counts = {}
        for child in root.children:
            tag_counts[child.tag_id] = tag_counts.get(child.tag_id, 0) + 1
        logger.info(f"DocInfo tag distribution: {tag_counts}")

        for child in root.children:
            if child.tag_id != HWPTAG_BIN_DATA:
                continue

            payload = child.payload
            # Slicing already clamps to the payload length, so no size check is needed.
            logger.debug(f"Found BIN_DATA record, payload size: {len(payload)}, hex: {payload[:20].hex()}")

            if len(payload) < 2:
                continue

            flags = struct.unpack_from('<H', payload, 0)[0]
            storage_type = flags & 0x0F
            logger.debug(f"BIN_DATA flags: {flags:#06x}, storage_type: {storage_type}")

            if storage_type in (1, 2):  # EMBEDDING or STORAGE
                if len(payload) < 4:
                    # Truncated record: keep a placeholder so indices stay aligned.
                    bin_data_list.append((0, ""))
                    continue
                storage_id = struct.unpack_from('<H', payload, 2)[0]
                ext = _decode_bin_data_ext(payload)

                bin_data_by_storage_id[storage_id] = (storage_id, ext)
                bin_data_list.append((storage_id, ext))
                logger.debug(f"DocInfo BIN_DATA #{len(bin_data_list)}: storage_id={storage_id}, ext='{ext}'")

            elif storage_type == 0:  # LINK
                bin_data_list.append((0, ""))
                logger.debug(f"DocInfo BIN_DATA #{len(bin_data_list)}: LINK type (external)")

            else:
                # Unknown storage type: read the same layout defensively and
                # only trust an extension shorter than 20 characters.
                storage_id = struct.unpack_from('<H', payload, 2)[0] if len(payload) >= 4 else 0
                ext = _decode_bin_data_ext(payload, max_chars=20)
                if storage_id > 0:
                    bin_data_by_storage_id[storage_id] = (storage_id, ext)
                bin_data_list.append((storage_id, ext))
                logger.debug(f"DocInfo BIN_DATA #{len(bin_data_list)}: unknown type {storage_type}, storage_id={storage_id}")

        logger.info(f"DocInfo parsed: {len(bin_data_list)} BIN_DATA records, {len(bin_data_by_storage_id)} with storage_id")

        # Fallback: no BIN_DATA in DocInfo, so scan the BinData folder directly.
        if not bin_data_list:
            logger.info("No BIN_DATA in DocInfo, scanning BinData folder directly...")
            bin_data_by_storage_id, bin_data_list = scan_bindata_folder(ole)

    except Exception as e:
        logger.warning(f"Failed to parse DocInfo: {e}")
        logger.debug(traceback.format_exc())
        try:
            bin_data_by_storage_id, bin_data_list = scan_bindata_folder(ole)
        except Exception as scan_error:
            # Still best-effort, but leave a trace instead of failing silently.
            logger.warning(f"BinData folder fallback scan failed: {scan_error}")

    return bin_data_by_storage_id, bin_data_list
131
+
132
+
133
def scan_bindata_folder(ole: "olefile.OleFileIO") -> Tuple[Dict[int, Tuple[int, str]], List[Tuple[int, str]]]:
    """
    Fallback: scan the BinData folder directly to find embedded files.

    Used when DocInfo parsing failed or produced no BIN_DATA records.
    Entry names are matched case-insensitively against ``BINxxxx.ext``
    (``xxxx`` = four hex digits), so ``Bin0001.jpg`` is recognised as well
    as ``BIN0001.jpg``. An identical (storage_id, extension) pair seen more
    than once is recorded only once, so repeated entries cannot shift
    positions in the returned list.

    Args:
        ole: OLE file object (only ``listdir()`` is used).

    Returns:
        Tuple of:
        - bin_data_by_storage_id: storage_id -> (storage_id, extension) mapping
        - bin_data_list: (storage_id, extension) entries sorted by storage_id
    """
    bin_data_by_storage_id = {}
    bin_data_list = []
    seen = set()

    try:
        for entry in ole.listdir():
            if len(entry) < 2 or entry[0] != "BinData":
                continue

            filename = entry[1]
            match = re.match(r'BIN([0-9A-Fa-f]{4})\.(\w+)', filename, re.IGNORECASE)
            if not match:
                continue

            storage_id = int(match.group(1), 16)
            ext = match.group(2)
            record = (storage_id, ext)
            if record in seen:
                continue
            seen.add(record)

            bin_data_by_storage_id[storage_id] = record
            bin_data_list.append(record)
            logger.debug(f"Found BinData stream: {filename} -> storage_id={storage_id}, ext={ext}")

        if bin_data_list:
            bin_data_list.sort(key=lambda x: x[0])
            logger.info(f"BinData folder scan: found {len(bin_data_list)} files")
    except Exception as e:
        logger.warning(f"Failed to scan BinData folder: {e}")

    return bin_data_by_storage_id, bin_data_list
169
+
170
+
171
+ __all__ = [
172
+ 'parse_doc_info',
173
+ 'scan_bindata_folder',
174
+ ]
@@ -0,0 +1,60 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py
2
+ """
3
+ HWPFileConverter - HWP file format converter
4
+
5
+ Converts binary HWP data to OLE file object.
6
+ """
7
+ from io import BytesIO
8
+ from typing import Any, Optional, BinaryIO
9
+
10
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
11
+
12
+
13
class HWPFileConverter(BaseFileConverter):
    """
    File converter for HWP documents stored in the OLE container format.

    Wraps raw HWP bytes (or an already open binary stream) in an
    ``olefile.OleFileIO`` object.
    """

    # OLE magic number: the eight bytes validate() expects at the start of the data.
    OLE_MAGIC = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Open HWP data as an OleFileIO object.

        Args:
            file_data: Raw binary HWP data; used only when no stream is given.
            file_stream: Optional binary stream; takes precedence over file_data.
            **kwargs: Additional options (unused here).

        Returns:
            olefile.OleFileIO object reading from the rewound stream.
        """
        import olefile

        if file_stream is None:
            source = BytesIO(file_data)
        else:
            source = file_stream
        # Rewind in case the caller already consumed part of the stream.
        source.seek(0)
        return olefile.OleFileIO(source)

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "HWP Document (OLE)"

    def validate(self, file_data: bytes) -> bool:
        """Return True when the data starts with the 8-byte OLE magic number."""
        long_enough = bool(file_data) and len(file_data) >= 8
        return long_enough and file_data[:8] == self.OLE_MAGIC

    def close(self, converted_object: Any) -> None:
        """Close the converted object if it exists and exposes ``close``."""
        if converted_object is None or not hasattr(converted_object, 'close'):
            return
        converted_object.close()
60
+
@@ -0,0 +1,413 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py
2
+ """
3
+ HWP Image Processor
4
+
5
+ Provides HWP-specific image processing that inherits from ImageProcessor.
6
+ Handles BinData stream images and embedded images in HWP 5.0 OLE format.
7
+
8
+ This class consolidates all HWP image extraction logic including:
9
+ - zlib decompression for compressed images
10
+ - BinData stream finding and extraction
11
+ - OLE storage image processing
12
+ """
13
+ import io
14
+ import os
15
+ import zlib
16
+ import struct
17
+ import logging
18
+ from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
19
+
20
+ from PIL import Image
21
+
22
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
23
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
24
+
25
+ if TYPE_CHECKING:
26
+ import olefile
27
+
28
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.hwp")
29
+
30
+
31
class HWPImageProcessor(ImageProcessor):
    """
    HWP-specific image processor.

    Inherits from ImageProcessor and adds the HWP-specific steps needed
    before an image can be saved through ``save_image``.

    Handles:
    - BinData stream images
    - Compressed images (zlib / raw deflate)
    - Embedded OLE images

    Example:
        processor = HWPImageProcessor()

        # Process BinData image
        tag = processor.process_image(image_data, bindata_id="BIN0001")

        # Process from OLE stream
        tag = processor.process_bindata_stream(ole, stream_path)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize HWPImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def process_image(
        self,
        image_data: bytes,
        bindata_id: Optional[str] = None,
        image_index: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save HWP image data.

        The saved name is ``hwp_<bindata_id>`` when a BinData ID is given,
        otherwise ``hwp_image_<image_index>``; with neither, no custom name
        is passed to ``save_image``.

        Args:
            image_data: Raw image binary data
            bindata_id: BinData ID (e.g., "BIN0001")
            image_index: Image index (for naming)
            **kwargs: Additional options (unused here)

        Returns:
            Whatever ``save_image`` returns (image tag string, or None on failure)
        """
        custom_name = None
        if bindata_id is not None:
            custom_name = f"hwp_{bindata_id}"
        elif image_index is not None:
            custom_name = f"hwp_image_{image_index}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_bindata_stream(
        self,
        ole: "olefile.OleFileIO",
        stream_path: str,
        is_compressed: bool = True,
    ) -> Optional[str]:
        """
        Process image from HWP BinData OLE stream.

        When ``is_compressed`` is set, raw deflate is tried first, then a
        zlib-wrapped stream; if both fail the bytes are used unchanged.

        Args:
            ole: OleFileIO object
            stream_path: Path to BinData stream
            is_compressed: Whether data is zlib compressed

        Returns:
            Image tag string, or None on failure
        """
        try:
            stream_data = ole.openstream(stream_path).read()

            image_data = stream_data
            if is_compressed:
                for wbits in (-15, zlib.MAX_WBITS):
                    try:
                        image_data = zlib.decompress(stream_data, wbits)
                        break
                    except zlib.error:
                        # Wrong framing; try the next one or keep the raw bytes.
                        continue

            # The last path component doubles as the BinData ID.
            bindata_id = stream_path.split('/')[-1] if '/' in stream_path else stream_path

            return self.process_image(image_data, bindata_id=bindata_id)

        except Exception as e:
            # Prefer an instance logger if one exists; otherwise use the
            # module logger, as every other method of this class does.
            log = getattr(self, "_logger", logger)
            log.warning(f"Failed to process BinData stream {stream_path}: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        bindata_id: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded HWP image.

        Args:
            image_data: Image binary data
            image_name: Original image filename; used as the saved name when given
            bindata_id: BinData ID; used as ``hwp_embed_<bindata_id>`` when
                no image_name is given
            **kwargs: Additional options (unused here)

        Returns:
            Whatever ``save_image`` returns (image tag string, or None on failure)
        """
        custom_name = image_name
        if custom_name is None and bindata_id is not None:
            custom_name = f"hwp_embed_{bindata_id}"

        return self.save_image(image_data, custom_name=custom_name)

    def decompress_and_process(
        self,
        compressed_data: bytes,
        bindata_id: Optional[str] = None,
    ) -> Optional[str]:
        """
        Decompress and process zlib-compressed image data.

        Args:
            compressed_data: Possibly compressed image data
            bindata_id: BinData ID

        Returns:
            Image tag string, or None on failure
        """
        image_data = self.try_decompress_image(compressed_data)
        return self.process_image(image_data, bindata_id=bindata_id)

    @staticmethod
    def try_decompress_image(data: bytes) -> bytes:
        """
        Attempt to decompress HWP image data.

        HWP files may contain zlib-compressed images, so several
        strategies are tried in order.

        Args:
            data: Original image data (possibly compressed)

        Returns:
            Decompressed image data (or the original bytes if nothing applies)
        """
        # 1. zlib-wrapped data starts with 0x78.
        if data.startswith(b'\x78'):
            try:
                return zlib.decompress(data)
            except Exception:
                pass

        # 2. Already a valid image? Then return it untouched.
        try:
            with Image.open(io.BytesIO(data)) as img:
                img.verify()
                return data
        except Exception:
            pass

        # 3. Raw deflate (no header).
        try:
            return zlib.decompress(data, -15)
        except Exception:
            pass

        return data

    @staticmethod
    def find_bindata_stream(ole: "olefile.OleFileIO", storage_id: int, ext: str) -> Optional[List[str]]:
        """
        Find BinData stream in OLE container by storage_id and extension.

        Matching goes from strictest to loosest so that a stream with the
        requested extension wins over another stream sharing the same id:
        1. exact path among the known spelling variants,
        2. case-insensitive file name (id + extension),
        3. id only, ignoring the extension.

        Args:
            ole: OLE file object
            storage_id: BinData storage ID
            ext: File extension (without the dot)

        Returns:
            Stream path as a list of components if found, None otherwise
        """
        ole_dirs = ole.listdir()
        bindata_entries = [
            entry for entry in ole_dirs
            if len(entry) > 1 and entry[0] == "BinData"
        ]

        candidates = [
            f"BinData/BIN{storage_id:04X}.{ext}",
            f"BinData/BIN{storage_id:04x}.{ext}",
            f"BinData/Bin{storage_id:04X}.{ext}",
            f"BinData/Bin{storage_id:04x}.{ext}",
            f"BinData/BIN{storage_id:04X}.{ext.lower()}",
            f"BinData/BIN{storage_id:04x}.{ext.lower()}",
        ]

        # 1. Exact path matching
        for candidate in candidates:
            candidate_parts = candidate.split('/')
            if candidate_parts in ole_dirs:
                return candidate_parts

        # 2. Case-insensitive file-name matching (all candidates share one
        #    lower-cased form, so a single comparison covers them).
        expected_name = f"bin{storage_id:04x}.{ext}".lower()
        for entry in bindata_entries:
            if entry[1].lower() == expected_name:
                return entry

        # 3. Last resort: match on the id alone, whatever the extension.
        id_token = f"bin{storage_id:04x}"
        for entry in bindata_entries:
            if id_token in entry[1].lower():
                logger.debug(f"Found stream by pattern match: {entry}")
                return entry

        return None

    @staticmethod
    def extract_bindata_index(payload: bytes, bin_data_list_len: int) -> Optional[int]:
        """
        Extract BinData index from SHAPE_COMPONENT_PICTURE record payload.

        Tries several offsets for compatibility with different HWP
        versions. A candidate is accepted when the little-endian 16-bit
        value read there lies in ``1..bin_data_list_len``.

        Args:
            payload: SHAPE_COMPONENT_PICTURE record payload
            bin_data_list_len: Length of bin_data_list (for validation)

        Returns:
            BinData index (1-based) or None
        """
        if bin_data_list_len == 0:
            return None

        def read_index(offset: int) -> Optional[int]:
            # Valid index at `offset`, or None when out of bounds / out of range.
            if len(payload) < offset + 2:
                return None
            value = struct.unpack_from('<H', payload, offset)[0]
            return value if 0 < value <= bin_data_list_len else None

        # Strategy 1: offset 79 (HWP 5.0.3.x+ spec)
        bindata_index = read_index(79)
        if bindata_index is not None:
            logger.debug(f"Found BinData index at offset 79: {bindata_index}")
            return bindata_index

        # Strategy 2: offset 8 (older versions)
        bindata_index = read_index(8)
        if bindata_index is not None:
            logger.debug(f"Found BinData index at offset 8: {bindata_index}")
            return bindata_index

        # Strategy 3: scan commonly seen offsets
        for offset in [4, 6, 10, 12, 14, 16, 18, 20, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80]:
            bindata_index = read_index(offset)
            if bindata_index is not None:
                logger.debug(f"Found potential BinData index at offset {offset}: {bindata_index}")
                return bindata_index

        # Strategy 4: first in-range non-zero 2-byte value within the first 100 bytes
        for i in range(0, min(len(payload) - 1, 100), 2):
            bindata_index = read_index(i)
            if bindata_index is not None:
                logger.debug(f"Found BinData index by scanning at offset {i}: {bindata_index}")
                return bindata_index

        return None

    def extract_and_save_image(
        self,
        ole: "olefile.OleFileIO",
        target_stream: List[str],
        processed_images: Optional[Set[str]] = None,
    ) -> Optional[str]:
        """
        Extract image from OLE stream and save locally.

        Args:
            ole: OLE file object
            target_stream: Stream path as a list of components
            processed_images: Set that receives the "/"-joined stream path
                once the image has been saved

        Returns:
            Image tag wrapped in newlines, or None on failure
        """
        try:
            stream = ole.openstream(target_stream)
            image_data = stream.read()
            image_data = self.try_decompress_image(image_data)

            bindata_id = target_stream[-1] if target_stream else None
            image_tag = self.process_image(image_data, bindata_id=bindata_id)

            if image_tag:
                if processed_images is not None:
                    processed_images.add("/".join(target_stream))
                logger.info(f"Successfully extracted inline image: {image_tag}")
                return f"\n{image_tag}\n"
        except Exception as e:
            logger.warning(f"Failed to process inline HWP image {target_stream}: {e}")

        return None

    def process_images_from_bindata(
        self,
        ole: "olefile.OleFileIO",
        processed_images: Optional[Set[str]] = None,
    ) -> str:
        """
        Extract images from BinData storage and save locally.

        Only streams with a .jpg/.jpeg/.png/.bmp/.gif extension are
        handled. A failure on one stream is logged and skipped so the
        remaining images are still extracted.

        Args:
            ole: OLE file object
            processed_images: Set of already processed image paths (to skip)

        Returns:
            Image tag strings joined by blank lines ("" when none)
        """
        results = []

        try:
            bindata_streams = [
                entry for entry in ole.listdir()
                if entry and entry[0] == "BinData"
            ]
        except Exception as e:
            logger.warning(f"Error processing HWP images: {e}")
            return ""

        for stream_path in bindata_streams:
            if processed_images and "/".join(stream_path) in processed_images:
                continue

            stream_name = stream_path[-1]
            ext = os.path.splitext(stream_name)[1].lower()
            if ext not in ('.jpg', '.jpeg', '.png', '.bmp', '.gif'):
                continue

            try:
                image_data = ole.openstream(stream_path).read()
                image_data = self.try_decompress_image(image_data)
                image_tag = self.process_image(image_data, bindata_id=stream_name)
            except Exception as e:
                # One bad stream must not abort the remaining images.
                logger.warning(f"Failed to process HWP image stream {stream_path}: {e}")
                continue

            if image_tag:
                results.append(image_tag)

        return "\n\n".join(results)
411
+
412
+
413
+ __all__ = ["HWPImageProcessor"]