xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,236 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py
2
+ """
3
+ HWP Metadata Extraction Module
4
+
5
+ Provides HWPMetadataExtractor class for extracting metadata from HWP 5.0 OLE files.
6
+ Implements BaseMetadataExtractor interface.
7
+
8
+ Extraction methods:
9
+ 1. olefile's get_metadata() - OLE standard metadata
10
+ 2. HwpSummaryInformation stream direct parsing - HWP-specific metadata
11
+
12
+ Note: HWP is a Korean-native document format, so Korean metadata labels
13
+ are preserved in output for proper display.
14
+ """
15
+ import struct
16
+ import logging
17
+ from datetime import datetime
18
+ from typing import Dict, Any, Optional
19
+
20
+ import olefile
21
+
22
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
23
+ BaseMetadataExtractor,
24
+ DocumentMetadata,
25
+ )
26
+
27
+ logger = logging.getLogger("document-processor")
28
+
29
+
30
class HWPMetadataExtractor(BaseMetadataExtractor):
    """
    Metadata extractor for HWP 5.0 documents stored in an OLE container.

    Two sources are consulted, in this order:
        1. The OLE metadata object returned by ``source.get_metadata()``.
        2. The ``\\x05HwpSummaryInformation`` stream, parsed by
           ``parse_hwp_summary_information``. Non-empty values from this
           stream overwrite values collected in step 1.

    Fields collected:
        title, subject, author, keywords, comments,
        last_saved_by, create_time, last_saved_time

    Usage:
        extractor = HWPMetadataExtractor()
        metadata = extractor.extract(ole_file)
        text = extractor.format(metadata)
    """

    def extract(self, source: olefile.OleFileIO) -> DocumentMetadata:
        """
        Build a DocumentMetadata from an opened HWP OLE file.

        Failures in either extraction step are logged and swallowed, so a
        DocumentMetadata is always returned (fields that could not be read
        are passed as None).

        Args:
            source: olefile OleFileIO object

        Returns:
            DocumentMetadata instance containing the extracted metadata.
        """
        field_names = (
            'title',
            'subject',
            'author',
            'keywords',
            'comments',
            'last_saved_by',
            'create_time',
            'last_saved_time',
        )
        collected: Dict[str, Any] = {}

        # Step 1: OLE metadata as exposed by olefile.
        try:
            ole_meta = source.get_metadata()
            if ole_meta:
                for name in field_names:
                    field_value = getattr(ole_meta, name)
                    if field_value:
                        collected[name] = field_value

                self.logger.debug(f"Extracted OLE metadata: {list(collected.keys())}")
        except Exception as e:
            self.logger.warning(f"Failed to extract OLE metadata: {e}")

        # Step 2: HWP-specific summary stream; its non-empty values win.
        try:
            summary_stream_name = '\x05HwpSummaryInformation'
            if source.exists(summary_stream_name):
                self.logger.debug("Found HwpSummaryInformation stream, attempting to parse...")
                raw_summary = source.openstream(summary_stream_name).read()
                for key, value in parse_hwp_summary_information(raw_summary).items():
                    if value:
                        collected[key] = value
        except Exception as e:
            self.logger.debug(f"Failed to parse HwpSummaryInformation: {e}")

        return DocumentMetadata(
            **{name: collected.get(name) for name in field_names}
        )
113
+
114
+
115
def _decode_ansi_property(raw: bytes) -> str:
    """Decode an ANSI (0x1E) property payload to text.

    CP949 is tried first, then UTF-8, both strictly, so that the UTF-8
    fallback is actually reachable. If neither decodes cleanly the
    lenient CP949 decoding is used as a last resort.
    """
    for encoding in ('cp949', 'utf-8'):
        try:
            return raw.decode(encoding).rstrip('\x00')
        except UnicodeDecodeError:
            continue
    return raw.decode('cp949', errors='ignore').rstrip('\x00')


def _filetime_to_datetime(filetime: int) -> Optional[datetime]:
    """Convert a FILETIME (100 ns ticks since 1601-01-01) to a naive UTC datetime.

    Returns None when the timestamp falls outside the accepted window
    (after 1970-01-01 and before Unix time 2,000,000,000).
    """
    # Local import: the module itself only imports ``datetime``.
    from datetime import timedelta

    epoch_diff_us = 11644473600 * 1_000_000  # 1601-01-01 -> 1970-01-01
    unix_us = filetime // 10 - epoch_diff_us
    if not 0 < unix_us < 2_000_000_000 * 1_000_000:
        return None
    # Pure arithmetic from the epoch: independent of the machine time zone.
    return datetime(1970, 1, 1) + timedelta(microseconds=unix_us)


def parse_hwp_summary_information(data: bytes) -> Dict[str, Any]:
    """
    Parse the HwpSummaryInformation stream (OLE Property Set format).

    Layout read by this function:
        - 28-byte property-set header (skipped)
        - 16-byte FMTID followed by a 4-byte section offset
        - section: 4-byte size, 4-byte property count, then
          (property id, offset) pairs; offsets are relative to the
          section start
        - each property: 4-byte type followed by its value

    Supported value types:
        - 0x1E ANSI string (length-prefixed bytes)
        - 0x1F Unicode string (length in UTF-16 code units)
        - 0x40 FILETIME, returned as a naive datetime in UTC

    At most 50 property entries are read. Parsing is best-effort:
    truncated or malformed input yields whatever could be parsed so far
    (possibly an empty dict) and never raises.

    Args:
        data: HwpSummaryInformation stream binary data

    Returns:
        Dictionary with any of the keys title, subject, author, keywords,
        comments, last_saved_by, create_time, last_saved_time.
    """
    property_names = {
        0x02: 'title',
        0x03: 'subject',
        0x04: 'author',
        0x05: 'keywords',
        0x06: 'comments',
        0x08: 'last_saved_by',
        0x0C: 'create_time',
        0x0D: 'last_saved_time',
    }
    metadata: Dict[str, Any] = {}
    size = len(data)

    try:
        # Header (28 bytes) + FMTID (16 bytes) + section offset (4 bytes).
        if size < 48:
            return metadata

        section_offset = struct.unpack_from('<I', data, 44)[0]
        if section_offset + 8 > size:
            return metadata

        # The section size at section_offset is not needed; read the count.
        num_properties = struct.unpack_from('<I', data, section_offset + 4)[0]

        # Read the (property id, offset) table; cap guards against bogus counts.
        properties = []
        pos = section_offset + 8
        for _ in range(min(num_properties, 50)):
            if pos + 8 > size:
                break
            properties.append(struct.unpack_from('<II', data, pos))
            pos += 8

        for prop_id, prop_offset in properties:
            name = property_names.get(prop_id)
            if name is None:
                continue

            abs_offset = section_offset + prop_offset
            if abs_offset + 4 > size:
                continue

            prop_type = struct.unpack_from('<I', data, abs_offset)[0]
            value_offset = abs_offset + 4
            value = None

            if prop_type in (0x1E, 0x1F):
                if value_offset + 4 <= size:
                    str_len = struct.unpack_from('<I', data, value_offset)[0]
                    # Unicode lengths count 2-byte code units.
                    byte_len = str_len * 2 if prop_type == 0x1F else str_len
                    text_start = value_offset + 4
                    if str_len > 0 and text_start + byte_len <= size:
                        raw = data[text_start:text_start + byte_len]
                        if prop_type == 0x1F:
                            value = raw.decode('utf-16le', errors='ignore').rstrip('\x00')
                        else:
                            value = _decode_ansi_property(raw)

            elif prop_type == 0x40:
                if value_offset + 8 <= size:
                    filetime = struct.unpack_from('<Q', data, value_offset)[0]
                    if filetime > 0:
                        value = _filetime_to_datetime(filetime)

            if value:
                metadata[name] = value

    except Exception as e:
        # Best-effort parser: keep what was parsed and report at debug level.
        logger.debug(f"Error parsing HWP summary information: {e}")

    return metadata
231
+
232
+
233
+ __all__ = [
234
+ 'HWPMetadataExtractor',
235
+ 'parse_hwp_summary_information',
236
+ ]
@@ -0,0 +1,82 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py
2
+ """
3
+ HWP Preprocessor - Process HWP OLE document after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. HWPFileConverter.convert() → olefile.OleFileIO
7
+ 2. HWPPreprocessor.preprocess() → PreprocessedData (THIS STEP)
8
+ 3. HWPMetadataExtractor.extract() → DocumentMetadata
9
+ 4. Content extraction (body text, tables, images)
10
+
11
+ Current Implementation:
12
+ - Pass-through (HWP uses olefile object directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.hwp.preprocessor")
23
+
24
+
25
class HWPPreprocessor(BasePreprocessor):
    """
    HWP OLE Document Preprocessor.

    Currently a pass-through implementation: the converted OLE object is
    handed on unchanged, and only a few summary facts about its stream
    listing are recorded in the returned metadata.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess the converted HWP OLE document.

        When ``converted_data`` exposes ``listdir()``, the returned metadata
        contains ``stream_count``, ``has_body_text`` and ``has_doc_info``.
        A failure while inspecting the streams is logged at debug level and
        does not prevent the pass-through result from being returned.

        Args:
            converted_data: olefile.OleFileIO object from HWPFileConverter
            **kwargs: Additional options (unused here)

        Returns:
            PreprocessedData whose raw_content and clean_content are both
            ``converted_data``.
        """
        metadata: Dict[str, Any] = {}

        if hasattr(converted_data, 'listdir'):
            try:
                streams = converted_data.listdir()
                metadata['stream_count'] = len(streams)
                # Join each entry's path components once and reuse for both checks.
                stream_paths = ['/'.join(entry) for entry in streams]
                metadata['has_body_text'] = any('BodyText' in path for path in stream_paths)
                metadata['has_doc_info'] = any('DocInfo' in path for path in stream_paths)
            except Exception as exc:
                # Inspection is optional; record the failure instead of hiding it.
                logger.debug("HWP preprocessor: failed to inspect OLE streams: %s", exc)

        logger.debug("HWP preprocessor: pass-through, metadata=%s", metadata)

        # clean_content is the TRUE SOURCE - contains the OLE object
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,  # TRUE SOURCE - olefile.OleFileIO
            encoding="utf-8",
            extracted_resources={},
            metadata=metadata,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "HWP Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data exposes both ``listdir`` and ``openstream``."""
        return hasattr(data, 'listdir') and hasattr(data, 'openstream')
80
+
81
+
82
+ __all__ = ['HWPPreprocessor']
@@ -0,0 +1,149 @@
1
+ """
2
+ HWP Record 파싱 클래스
3
+ """
4
+ import struct
5
+ import logging
6
+ from itertools import islice
7
+ from typing import Optional
8
+
9
+ from xgen_doc2chunk.core.processor.hwp_helper.hwp_constants import HWPTAG_PARA_TEXT
10
+
11
+ logger = logging.getLogger("document-processor")
12
+
13
+
14
class HwpRecord:
    """
    One record of an HWP record stream, linked into a parent/child tree.

    Attributes:
        tag_id: 10-bit tag identifier taken from the record header.
        payload: Raw record body.
        parent: Owning record, or None for the synthetic root.
        children: Child records in stream order.
    """

    def __init__(self, tag_id: int, payload: bytes, parent: Optional['HwpRecord'] = None):
        self.tag_id = tag_id
        self.payload = payload
        self.parent = parent
        self.children = []

    def get_next_siblings(self, count=None):
        """
        Return the records that follow this one under the same parent.

        Args:
            count: Maximum number of siblings to yield; None means all.

        Returns:
            An iterator over the following siblings, or an empty list when
            the record has no parent or is not found among its children.
        """
        if not self.parent:
            return []
        try:
            start_idx = self.parent.children.index(self) + 1
        except ValueError:
            return []
        end_idx = None if count is None else start_idx + count
        return islice(self.parent.children, start_idx, end_idx)

    def get_text(self) -> str:
        """
        Extract text from a HWPTAG_PARA_TEXT payload (UTF-16LE code units).

        Control codes below 32 are handled as follows:
            - 10 (line break) and 13 (paragraph break) become a newline.
            - 9 (tab) becomes a tab; it is an inline control occupying
              8 code units, all of which are skipped.
            - 30 and 31 (non-breaking / fixed-width space) become a space.
            - 11 (extended control: tables, drawing objects, ...) emits a
              ``\\x0b`` marker and skips its 8 code units.
            - Other inline/extended controls (1-8, 12, 14-23) are skipped
              as 8 code units; remaining codes are skipped as 1 code unit.

        Surrogate pairs are combined into a single character; an unpaired
        surrogate is replaced with U+FFFD so the result is always encodable.

        Returns:
            The decoded text, or "" when this is not a paragraph-text record.
        """
        if self.tag_id != HWPTAG_PARA_TEXT:
            return ""

        payload = self.payload
        size = len(payload)
        parts = []
        cursor = 0

        # A trailing odd byte cannot form a code unit and is ignored.
        while cursor + 1 < size:
            code = struct.unpack_from('<H', payload, cursor)[0]

            if code >= 32:
                if 0xD800 <= code <= 0xDBFF and cursor + 3 < size:
                    low = struct.unpack_from('<H', payload, cursor + 2)[0]
                    if 0xDC00 <= low <= 0xDFFF:
                        combined = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                        parts.append(chr(combined))
                        cursor += 4
                        continue
                if 0xD800 <= code <= 0xDFFF:
                    parts.append('\ufffd')  # unpaired surrogate
                else:
                    parts.append(chr(code))
                cursor += 2
            elif code == 13 or code == 10:
                parts.append('\n')
                cursor += 2
            elif code == 9:
                # Tab is an inline control: the code unit is followed by
                # 7 more units of tab data that must not be read as text.
                parts.append('\t')
                cursor += 16
            elif code == 30 or code == 31:
                parts.append(' ')
                cursor += 2
            elif 1 <= code <= 23:
                # Remaining inline/extended controls all span 8 code units.
                if code == 11:
                    parts.append('\x0b')
                cursor += 16
            else:
                # Single-unit character controls with no text equivalent.
                cursor += 2

        return ''.join(parts)

    @staticmethod
    def build_tree(data: bytes) -> 'HwpRecord':
        """
        Parse a record stream into a tree rooted at a synthetic record.

        Each record starts with a 32-bit little-endian header holding the
        tag id (bits 0-9), the nesting level (bits 10-19) and the payload
        length (bits 20-31). A length of 0xFFF means the real length
        follows as a separate 32-bit value.

        A level-0 record is attached to the root. Any other record is
        attached to the most recent record at the nearest lower level, so
        a gap in levels does not detach it from its closest ancestor.
        Parsing stops at the first truncated record.

        Args:
            data: Raw (already decompressed) record stream.

        Returns:
            The synthetic root record (tag_id 0, empty payload).
        """
        root = HwpRecord(0, b'')
        pos = 0
        size = len(data)

        # Most recent record seen at each level; key 0 always exists.
        stack = {0: root}

        while pos < size:
            try:
                if pos + 4 > size:
                    break
                header = struct.unpack_from('<I', data, pos)[0]
                pos += 4

                tag_id = header & 0x3FF
                level = (header >> 10) & 0x3FF
                rec_len = (header >> 20) & 0xFFF

                if rec_len == 0xFFF:
                    if pos + 4 > size:
                        break
                    rec_len = struct.unpack_from('<I', data, pos)[0]
                    pos += 4

                if pos + rec_len > size:
                    # Truncated record, stop parsing
                    break

                payload = data[pos:pos + rec_len]
                pos += rec_len

                parent = root
                if level > 0:
                    for lower in range(level - 1, -1, -1):
                        if lower in stack:
                            parent = stack[lower]
                            break

                record = HwpRecord(tag_id, payload, parent)
                parent.children.append(record)

                stack[level] = record
                # Deeper levels belong to the previous subtree; forget them.
                for stale in [k for k in stack if k > level]:
                    del stack[stale]
            except Exception as e:
                logger.debug(f"Error parsing HWP record at pos {pos}: {e}")
                break

        return root
@@ -0,0 +1,217 @@
1
+ # xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py
2
+ """
3
+ HWP 손상 파일 복구 유틸리티
4
+
5
+ 손상되었거나 비-OLE HWP 파일에서 텍스트와 이미지를 복구합니다.
6
+ - extract_text_from_stream_raw: 바이너리에서 UTF-16LE 문자열 추출
7
+ - recover_images_from_raw: 이미지 시그니처 스캔 후 추출
8
+ - find_zlib_streams: zlib 압축 스트림 찾기 및 해제
9
+ """
10
+ import zlib
11
+ import struct
12
+ import logging
13
+ from typing import List, Tuple, Optional
14
+
15
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
16
+
17
+ logger = logging.getLogger("document-processor")
18
+
19
+
20
def extract_text_from_stream_raw(data: bytes) -> str:
    """
    Fallback: pull readable UTF-16LE text out of binary data without
    parsing records.

    The data is read as consecutive 16-bit little-endian code units (a
    trailing odd byte is ignored). Only these code units are kept:
        - Hangul syllables (0xAC00-0xD7A3)
        - printable ASCII (0x0020-0x007E)
        - Hangul compatibility jamo (0x3130-0x318F)
        - Hangul jamo (0x1100-0x11FF)
        - CJK symbols and punctuation (0x3000-0x303F)
        - tab, line feed, carriage return

    Line feeds and carriage returns end the current run and are kept as a
    single newline between text runs (consecutive breaks collapse, and
    leading/trailing breaks are dropped). Any other code unit ends the
    current run without inserting a separator. Runs consisting only of
    whitespace are discarded.

    Args:
        data: Binary data

    Returns:
        The extracted text.
    """
    pieces: List[str] = []
    run: List[str] = []

    def flush_run() -> None:
        # Keep the pending run only when it has visible content.
        if run:
            segment = "".join(run)
            if segment.strip():
                pieces.append(segment)
            run.clear()

    even_length = len(data) & ~1
    for (val,) in struct.iter_unpack('<H', memoryview(data)[:even_length]):
        if val == 10 or val == 13:
            flush_run()
            # One newline between runs; never leading, never doubled.
            if pieces and pieces[-1] != "\n":
                pieces.append("\n")
        elif val == 9:
            run.append("\t")
        elif (
            0xAC00 <= val <= 0xD7A3
            or 0x0020 <= val <= 0x007E
            or 0x3130 <= val <= 0x318F
            or 0x1100 <= val <= 0x11FF
            or 0x3000 <= val <= 0x303F
        ):
            run.append(chr(val))
        else:
            flush_run()

    flush_run()

    if pieces and pieces[-1] == "\n":
        pieces.pop()
    return "".join(pieces)
69
+
70
+
71
def find_zlib_streams(raw_data: bytes, min_size: int = 50) -> List[Tuple[int, bytes]]:
    """
    Locate zlib streams inside binary data and decompress them.

    Every position starting with a zlib header (0x78 followed by 0x01,
    0x9c or 0xda) is tried. A candidate that raises ``zlib.error`` is
    discarded and scanning resumes one byte later. A stream that
    decompresses to its end is skipped as a whole, so its compressed
    bytes are not rescanned. A stream that is cut off by the end of the
    data still contributes the output produced so far.

    Input is fed to the decompressor in fixed-size slices of a
    memoryview, so no copy of the remaining data is made per candidate.

    Args:
        raw_data: Binary data
        min_size: A result is kept only when its decompressed length is
            strictly greater than this value

    Returns:
        List of (start offset, decompressed data) tuples in offset order.
    """
    header_second_bytes = (0x01, 0x9c, 0xda)
    chunk_size = 64 * 1024

    view = memoryview(raw_data)
    total = len(raw_data)
    decompressed_chunks = []
    start = 0

    while start < total:
        start = raw_data.find(b'\x78', start)
        if start == -1 or start + 1 >= total:
            break
        if raw_data[start + 1] not in header_second_bytes:
            start += 1
            continue

        dobj = zlib.decompressobj()
        pieces = []
        fed = start
        try:
            while fed < total and not dobj.eof:
                chunk = view[fed:fed + chunk_size]
                pieces.append(dobj.decompress(chunk))
                fed += len(chunk)
        except zlib.error:
            # False-positive header or corrupt stream: try the next byte.
            start += 1
            continue

        decompressed = b''.join(pieces)
        if len(decompressed) > min_size:
            decompressed_chunks.append((start, decompressed))

        if dobj.eof:
            # Bytes fed past the end of the stream are reported as unused.
            start = fed - len(dobj.unused_data)
        else:
            # Truncated stream: keep scanning inside it for later streams.
            start += 1

    return decompressed_chunks
122
+
123
+
124
def recover_images_from_raw(
    raw_data: bytes,
    image_processor: ImageProcessor
) -> str:
    """
    Scan raw binary data for JPEG and PNG images and save each one found.

    For each format, a candidate runs from a start signature to the first
    end marker found after it (end marker included). Candidates whose size
    is strictly between 100 bytes and 10 MiB are passed to
    ``image_processor.save_image``; truthy return values are collected.
    All JPEG candidates are processed before any PNG candidate. Scanning
    for a format stops as soon as a start signature has no end marker
    after it.

    Args:
        raw_data: Binary data
        image_processor: Image processor instance used to save the images

    Returns:
        The collected image tags joined with blank lines.
    """
    size_limit = 10 * 1024 * 1024
    signature_pairs = (
        (b'\xff\xd8\xff', b'\xff\xd9'),                       # JPEG
        (b'\x89PNG\r\n\x1a\n', b'IEND\xae\x42\x60\x82'),      # PNG
    )

    tags = []
    for head_sig, tail_sig in signature_pairs:
        cursor = raw_data.find(head_sig, 0)
        while cursor != -1:
            stop = raw_data.find(tail_sig, cursor)
            if stop == -1:
                break
            stop += len(tail_sig)

            if 100 < stop - cursor < size_limit:
                tag = image_processor.save_image(raw_data[cursor:stop])
                if tag:
                    tags.append(tag)

            # Resume the search right after the candidate just examined.
            cursor = raw_data.find(head_sig, stop)

    return "\n\n".join(tags)
191
+
192
+
193
def check_file_signature(raw_data: bytes) -> Optional[str]:
    """
    Identify the container format from the leading bytes of a file.

    Checks are made in this order:
        1. OLE compound-file magic at the start  -> "OLE"
        2. ``PK\\x03\\x04`` at the start           -> "ZIP/HWPX"
        3. ``HWP Document File`` anywhere in the first 100 bytes -> "HWP3.0"

    Args:
        raw_data: Binary data

    Returns:
        The format label, or None when no signature matches.
    """
    leading_magics = (
        (b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1', "OLE"),
        (b'PK\x03\x04', "ZIP/HWPX"),
    )
    for magic, label in leading_magics:
        if raw_data.startswith(magic):
            return label

    if raw_data[:100].find(b'HWP Document File') != -1:
        return "HWP3.0"
    return None
210
+
211
+
212
+ __all__ = [
213
+ 'extract_text_from_stream_raw',
214
+ 'find_zlib_streams',
215
+ 'recover_images_from_raw',
216
+ 'check_file_signature',
217
+ ]