xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,579 @@
1
+ # xgen_doc2chunk/core/processor/doc_handler.py
2
+ """
3
+ DOC Handler - Legacy Microsoft Word Document Processor
4
+
5
+ Class-based handler for DOC files inheriting from BaseHandler.
6
+ Automatically detects file format (RTF, OLE, HTML, DOCX) and processes accordingly.
7
+ RTF processing is delegated to RTFHandler.
8
+ """
9
+ import io
10
+ import logging
11
+ import os
12
+ import re
13
+ import struct
14
+ import base64
15
+ from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
16
+ from enum import Enum
17
+ import zipfile
18
+
19
+ import olefile
20
+ from bs4 import BeautifulSoup
21
+
22
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
23
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
24
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
25
+ from xgen_doc2chunk.core.processor.doc_helpers.doc_image_processor import DOCImageProcessor
26
+
27
+ if TYPE_CHECKING:
28
+ from xgen_doc2chunk.core.document_processor import CurrentFile
29
+
30
+ logger = logging.getLogger("document-processor")
31
+
32
+
33
+ class DocFormat(Enum):
34
+ """Actual format types for DOC files."""
35
+ RTF = "rtf"
36
+ OLE = "ole"
37
+ HTML = "html"
38
+ DOCX = "docx"
39
+ UNKNOWN = "unknown"
40
+
41
+
42
+ MAGIC_NUMBERS = {
43
+ 'RTF': b'{\\rtf',
44
+ 'OLE': b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1',
45
+ 'ZIP': b'PK\x03\x04',
46
+ }
47
+
48
+
49
+ class DOCHandler(BaseHandler):
50
+ """DOC file processing handler class."""
51
+
52
+ def _create_file_converter(self):
53
+ """Create DOC-specific file converter."""
54
+ from xgen_doc2chunk.core.processor.doc_helpers.doc_file_converter import DOCFileConverter
55
+ return DOCFileConverter()
56
+
57
+ def _create_preprocessor(self):
58
+ """Create DOC-specific preprocessor."""
59
+ from xgen_doc2chunk.core.processor.doc_helpers.doc_preprocessor import DOCPreprocessor
60
+ return DOCPreprocessor()
61
+
62
+ def _create_chart_extractor(self) -> BaseChartExtractor:
63
+ """DOC files chart extraction not yet implemented. Return NullChartExtractor."""
64
+ return NullChartExtractor(self._chart_processor)
65
+
66
+ def _create_metadata_extractor(self):
67
+ """DOC metadata extraction not yet implemented. Return None to use NullMetadataExtractor."""
68
+ return None
69
+
70
+ def _create_format_image_processor(self) -> ImageProcessor:
71
+ """Create DOC-specific image processor."""
72
+ return DOCImageProcessor()
73
+
74
+ def extract_text(
75
+ self,
76
+ current_file: "CurrentFile",
77
+ extract_metadata: bool = True,
78
+ **kwargs
79
+ ) -> str:
80
+ """Extract text from DOC file."""
81
+ file_path = current_file.get("file_path", "unknown")
82
+ file_data = current_file.get("file_data", b"")
83
+
84
+ self.logger.info(f"DOC processing: {file_path}")
85
+
86
+ if not file_data:
87
+ self.logger.error(f"Empty file data: {file_path}")
88
+ return f"[DOC file is empty: {file_path}]"
89
+
90
+ try:
91
+ # Step 1: Use file_converter to detect format and convert
92
+ converted_obj, doc_format = self.file_converter.convert(file_data)
93
+
94
+ # Step 2: Preprocess - may transform converted_obj in the future
95
+ preprocessed = self.preprocess(converted_obj)
96
+ converted_obj = preprocessed.clean_content # TRUE SOURCE
97
+
98
+ if doc_format == DocFormat.RTF:
99
+ # Delegate to RTFHandler for RTF processing
100
+ return self._delegate_to_rtf_handler(converted_obj, current_file, extract_metadata)
101
+ elif doc_format == DocFormat.OLE:
102
+ return self._extract_from_ole_obj(converted_obj, current_file, extract_metadata)
103
+ elif doc_format == DocFormat.HTML:
104
+ return self._extract_from_html_obj(converted_obj, current_file, extract_metadata)
105
+ elif doc_format == DocFormat.DOCX:
106
+ return self._extract_from_docx_obj(converted_obj, current_file, extract_metadata)
107
+ else:
108
+ self.logger.warning(f"Unknown DOC format, trying OLE fallback: {file_path}")
109
+ return self._extract_from_ole(current_file, extract_metadata)
110
+ except Exception as e:
111
+ self.logger.error(f"Error in DOC processing: {e}")
112
+ return f"[DOC file processing failed: {str(e)}]"
113
+
114
+ def _delegate_to_rtf_handler(self, rtf_doc, current_file: "CurrentFile", extract_metadata: bool) -> str:
115
+ """
116
+ Delegate RTF processing to RTFHandler.
117
+
118
+ DOC ?�일???�제로는 RTF ?�식??경우, RTFHandler???�임?�니??
119
+ RTFHandler.extract_text()??raw bytes�?받으므�?current_file??그�?�??�달?�니??
120
+
121
+ Args:
122
+ rtf_doc: Pre-converted RTFDocument object (unused, for consistency)
123
+ current_file: CurrentFile dict containing original file_data
124
+ extract_metadata: Whether to extract metadata
125
+
126
+ Returns:
127
+ Extracted text
128
+ """
129
+ from xgen_doc2chunk.core.processor.rtf_handler import RTFHandler
130
+
131
+ rtf_handler = RTFHandler(
132
+ config=self.config,
133
+ image_processor=self._image_processor,
134
+ page_tag_processor=self._page_tag_processor,
135
+ chart_processor=self._chart_processor
136
+ )
137
+
138
+ # RTFHandler.extract_text()??current_file?�서 file_data�?직접 ?�어 처리
139
+ return rtf_handler.extract_text(current_file, extract_metadata=extract_metadata)
140
+
141
+ def _extract_from_ole_obj(self, ole, current_file: "CurrentFile", extract_metadata: bool) -> str:
142
+ """OLE Compound Document processing using pre-converted OLE object."""
143
+ file_path = current_file.get("file_path", "unknown")
144
+
145
+ self.logger.info(f"Processing OLE: {file_path}")
146
+
147
+ result_parts = []
148
+ processed_images: Set[str] = set()
149
+
150
+ try:
151
+ # Metadata extraction
152
+ if extract_metadata:
153
+ metadata = self._extract_ole_metadata(ole)
154
+ metadata_str = self.extract_and_format_metadata(metadata)
155
+ if metadata_str:
156
+ result_parts.append(metadata_str + "\n\n")
157
+
158
+ page_tag = self.create_page_tag(1)
159
+ result_parts.append(f"{page_tag}\n")
160
+
161
+ # Extract text from WordDocument stream
162
+ text = self._extract_ole_text(ole)
163
+ if text:
164
+ result_parts.append(text)
165
+
166
+ # Extract images
167
+ images = self._extract_ole_images(ole, processed_images)
168
+ for img_tag in images:
169
+ result_parts.append(img_tag)
170
+
171
+ except Exception as e:
172
+ self.logger.error(f"OLE processing error: {e}")
173
+ return f"[DOC file processing failed: {str(e)}]"
174
+ finally:
175
+ # Close the OLE object
176
+ self.file_converter.close(ole)
177
+
178
+ return "\n".join(result_parts)
179
+
180
+ def _extract_from_ole(self, current_file: "CurrentFile", extract_metadata: bool) -> str:
181
+ """OLE Compound Document processing - extract text directly from WordDocument stream."""
182
+ file_path = current_file.get("file_path", "unknown")
183
+ file_data = current_file.get("file_data", b"")
184
+
185
+ self.logger.info(f"Processing OLE: {file_path}")
186
+
187
+ result_parts = []
188
+ processed_images: Set[str] = set()
189
+
190
+ try:
191
+ file_stream = io.BytesIO(file_data)
192
+ with olefile.OleFileIO(file_stream) as ole:
193
+ # Metadata extraction
194
+ if extract_metadata:
195
+ metadata = self._extract_ole_metadata(ole)
196
+ metadata_str = self.extract_and_format_metadata(metadata)
197
+ if metadata_str:
198
+ result_parts.append(metadata_str + "\n\n")
199
+
200
+ page_tag = self.create_page_tag(1)
201
+ result_parts.append(f"{page_tag}\n")
202
+
203
+ # Extract text from WordDocument stream
204
+ text = self._extract_ole_text(ole)
205
+ if text:
206
+ result_parts.append(text)
207
+
208
+ # Extract images
209
+ images = self._extract_ole_images(ole, processed_images)
210
+ for img_tag in images:
211
+ result_parts.append(img_tag)
212
+
213
+ except Exception as e:
214
+ self.logger.error(f"OLE processing error: {e}")
215
+ return f"[DOC file processing failed: {str(e)}]"
216
+
217
+ return "\n".join(result_parts)
218
+
219
+ def _extract_ole_metadata(self, ole: olefile.OleFileIO) -> Dict[str, Any]:
220
+ """Extract OLE metadata."""
221
+ metadata = {}
222
+ try:
223
+ ole_meta = ole.get_metadata()
224
+ if ole_meta:
225
+ if ole_meta.title:
226
+ metadata['title'] = self._decode_ole_string(ole_meta.title)
227
+ if ole_meta.subject:
228
+ metadata['subject'] = self._decode_ole_string(ole_meta.subject)
229
+ if ole_meta.author:
230
+ metadata['author'] = self._decode_ole_string(ole_meta.author)
231
+ if ole_meta.keywords:
232
+ metadata['keywords'] = self._decode_ole_string(ole_meta.keywords)
233
+ if ole_meta.comments:
234
+ metadata['comments'] = self._decode_ole_string(ole_meta.comments)
235
+ if ole_meta.last_saved_by:
236
+ metadata['last_saved_by'] = self._decode_ole_string(ole_meta.last_saved_by)
237
+ if ole_meta.create_time:
238
+ metadata['create_time'] = ole_meta.create_time
239
+ if ole_meta.last_saved_time:
240
+ metadata['last_saved_time'] = ole_meta.last_saved_time
241
+ except Exception as e:
242
+ self.logger.warning(f"Error extracting OLE metadata: {e}")
243
+ return metadata
244
+
245
+ def _decode_ole_string(self, value) -> str:
246
+ """Decode OLE string."""
247
+ if value is None:
248
+ return ""
249
+ if isinstance(value, str):
250
+ return value.strip()
251
+ if isinstance(value, bytes):
252
+ for encoding in ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1']:
253
+ try:
254
+ return value.decode(encoding).strip()
255
+ except (UnicodeDecodeError, UnicodeError):
256
+ continue
257
+ return value.decode('utf-8', errors='replace').strip()
258
+ return str(value).strip()
259
+
260
+ def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str]) -> List[str]:
261
+ """OLE?�서 ?��?지 추출"""
262
+ images = []
263
+ try:
264
+ for entry in ole.listdir():
265
+ if any(x.lower() in ['pictures', 'data', 'object', 'oleobject'] for x in entry):
266
+ try:
267
+ stream = ole.openstream(entry)
268
+ data = stream.read()
269
+
270
+ if data[:8] == b'\x89PNG\r\n\x1a\n' or data[:2] == b'\xff\xd8' or \
271
+ data[:6] in (b'GIF87a', b'GIF89a') or data[:2] == b'BM':
272
+ image_tag = self.format_image_processor.save_image(data)
273
+ if image_tag:
274
+ images.append(f"\n{image_tag}\n")
275
+ except:
276
+ continue
277
+ except Exception as e:
278
+ self.logger.warning(f"Error extracting OLE images: {e}")
279
+ return images
280
+
281
+ def _extract_from_html_obj(self, soup, current_file: "CurrentFile", extract_metadata: bool) -> str:
282
+ """HTML DOC processing using pre-converted BeautifulSoup object."""
283
+ file_path = current_file.get("file_path", "unknown")
284
+
285
+ self.logger.info(f"Processing HTML DOC: {file_path}")
286
+
287
+ result_parts = []
288
+
289
+ if extract_metadata:
290
+ metadata = self._extract_html_metadata(soup)
291
+ metadata_str = self.extract_and_format_metadata(metadata)
292
+ if metadata_str:
293
+ result_parts.append(metadata_str + "\n\n")
294
+
295
+ page_tag = self.create_page_tag(1)
296
+ result_parts.append(f"{page_tag}\n")
297
+
298
+ # Copy soup to avoid modifying the original
299
+ soup_copy = BeautifulSoup(str(soup), 'html.parser')
300
+
301
+ for tag in soup_copy(['script', 'style', 'meta', 'link', 'head']):
302
+ tag.decompose()
303
+
304
+ text = soup_copy.get_text(separator='\n', strip=True)
305
+ text = re.sub(r'\n{3,}', '\n\n', text)
306
+
307
+ if text:
308
+ result_parts.append(text)
309
+
310
+ for table in soup_copy.find_all('table'):
311
+ table_html = str(table)
312
+ table_html = re.sub(r'\s+style="[^"]*"', '', table_html)
313
+ table_html = re.sub(r'\s+class="[^"]*"', '', table_html)
314
+ result_parts.append("\n" + table_html + "\n")
315
+
316
+ for img in soup_copy.find_all('img'):
317
+ src = img.get('src', '')
318
+ if src and src.startswith('data:image'):
319
+ try:
320
+ match = re.match(r'data:image/(\w+);base64,(.+)', src)
321
+ if match:
322
+ image_data = base64.b64decode(match.group(2))
323
+ image_tag = self.format_image_processor.save_image(image_data)
324
+ if image_tag:
325
+ result_parts.append(f"\n{image_tag}\n")
326
+ except:
327
+ pass
328
+
329
+ return "\n".join(result_parts)
330
+
331
+ def _extract_from_docx_obj(self, doc, current_file: "CurrentFile", extract_metadata: bool) -> str:
332
+ """Extract from misnamed DOCX using pre-converted Document object."""
333
+ file_path = current_file.get("file_path", "unknown")
334
+
335
+ self.logger.info(f"Processing misnamed DOCX: {file_path}")
336
+
337
+ try:
338
+ result_parts = []
339
+
340
+ if extract_metadata:
341
+ # Basic metadata from docx Document
342
+ if hasattr(doc, 'core_properties'):
343
+ metadata = {
344
+ 'title': doc.core_properties.title or '',
345
+ 'author': doc.core_properties.author or '',
346
+ 'subject': doc.core_properties.subject or '',
347
+ 'keywords': doc.core_properties.keywords or '',
348
+ }
349
+ metadata = {k: v for k, v in metadata.items() if v}
350
+ metadata_str = self.extract_and_format_metadata(metadata)
351
+ if metadata_str:
352
+ result_parts.append(metadata_str + "\n\n")
353
+
354
+ page_tag = self.create_page_tag(1)
355
+ result_parts.append(f"{page_tag}\n")
356
+
357
+ for para in doc.paragraphs:
358
+ if para.text.strip():
359
+ result_parts.append(para.text)
360
+
361
+ for table in doc.tables:
362
+ for row in table.rows:
363
+ row_texts = []
364
+ for cell in row.cells:
365
+ row_texts.append(cell.text.strip())
366
+ if any(t for t in row_texts):
367
+ result_parts.append(" | ".join(row_texts))
368
+
369
+ return "\n".join(result_parts)
370
+
371
+ except Exception as e:
372
+ self.logger.error(f"Error processing misnamed DOCX: {e}")
373
+ return f"[DOCX processing failed: {str(e)}]"
374
+
375
+ def _extract_from_html(self, current_file: "CurrentFile", extract_metadata: bool) -> str:
376
+ """HTML DOC processing."""
377
+ file_path = current_file.get("file_path", "unknown")
378
+ file_data = current_file.get("file_data", b"")
379
+
380
+ self.logger.info(f"Processing HTML DOC: {file_path}")
381
+
382
+ content = None
383
+ for encoding in ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'cp1252', 'latin-1']:
384
+ try:
385
+ content = file_data.decode(encoding)
386
+ break
387
+ except (UnicodeDecodeError, UnicodeError):
388
+ continue
389
+
390
+ if content is None:
391
+ content = file_data.decode('utf-8', errors='replace')
392
+
393
+ result_parts = []
394
+ soup = BeautifulSoup(content, 'html.parser')
395
+
396
+ if extract_metadata:
397
+ metadata = self._extract_html_metadata(soup)
398
+ metadata_str = self.extract_and_format_metadata(metadata)
399
+ if metadata_str:
400
+ result_parts.append(metadata_str + "\n\n")
401
+
402
+ page_tag = self.create_page_tag(1)
403
+ result_parts.append(f"{page_tag}\n")
404
+
405
+ for tag in soup(['script', 'style', 'meta', 'link', 'head']):
406
+ tag.decompose()
407
+
408
+ text = soup.get_text(separator='\n', strip=True)
409
+ text = re.sub(r'\n{3,}', '\n\n', text)
410
+
411
+ if text:
412
+ result_parts.append(text)
413
+
414
+ for table in soup.find_all('table'):
415
+ table_html = str(table)
416
+ table_html = re.sub(r'\s+style="[^"]*"', '', table_html)
417
+ table_html = re.sub(r'\s+class="[^"]*"', '', table_html)
418
+ result_parts.append("\n" + table_html + "\n")
419
+
420
+ for img in soup.find_all('img'):
421
+ src = img.get('src', '')
422
+ if src and src.startswith('data:image'):
423
+ try:
424
+ match = re.match(r'data:image/(\w+);base64,(.+)', src)
425
+ if match:
426
+ image_data = base64.b64decode(match.group(2))
427
+ image_tag = self.format_image_processor.save_image(image_data)
428
+ if image_tag:
429
+ result_parts.append(f"\n{image_tag}\n")
430
+ except:
431
+ pass
432
+
433
+ return "\n".join(result_parts)
434
+
435
+ def _extract_html_metadata(self, soup: BeautifulSoup) -> Dict[str, Any]:
436
+ """HTML metadata extraction."""
437
+ metadata = {}
438
+ title_tag = soup.find('title')
439
+ if title_tag and title_tag.string:
440
+ metadata['title'] = title_tag.string.strip()
441
+
442
+ meta_mappings = {
443
+ 'author': 'author', 'description': 'comments', 'keywords': 'keywords',
444
+ 'subject': 'subject', 'creator': 'author', 'producer': 'last_saved_by',
445
+ }
446
+
447
+ for meta in soup.find_all('meta'):
448
+ name = meta.get('name', '').lower()
449
+ content = meta.get('content', '')
450
+ if name in meta_mappings and content:
451
+ metadata[meta_mappings[name]] = content.strip()
452
+
453
+ return metadata
454
+
455
+ def _extract_from_docx_misnamed(self, current_file: "CurrentFile", extract_metadata: bool) -> str:
456
+ """Process misnamed DOCX file."""
457
+ file_path = current_file.get("file_path", "unknown")
458
+
459
+ self.logger.info(f"Processing misnamed DOCX: {file_path}")
460
+
461
+ try:
462
+ from xgen_doc2chunk.core.processor.docx_handler import DOCXHandler
463
+
464
+ # Pass current_file directly - DOCXHandler now accepts CurrentFile
465
+ docx_handler = DOCXHandler(config=self.config, image_processor=self.format_image_processor)
466
+ return docx_handler.extract_text(current_file, extract_metadata=extract_metadata)
467
+ except Exception as e:
468
+ self.logger.error(f"Error processing misnamed DOCX: {e}")
469
+ return f"[DOC file processing failed: {str(e)}]"
470
+
471
+ def _extract_ole_text(self, ole: olefile.OleFileIO) -> str:
472
+ """Extract text from OLE WordDocument stream."""
473
+ try:
474
+ # Check WordDocument stream
475
+ if not ole.exists('WordDocument'):
476
+ self.logger.warning("WordDocument stream not found")
477
+ return ""
478
+
479
+ # Read Word Document stream
480
+ word_stream = ole.openstream('WordDocument')
481
+ word_data = word_stream.read()
482
+
483
+ if len(word_data) < 12:
484
+ return ""
485
+
486
+ # FIB (File Information Block) parsing
487
+ # Check magic number (0xA5EC or 0xA5DC)
488
+ magic = struct.unpack('<H', word_data[0:2])[0]
489
+ if magic not in (0xA5EC, 0xA5DC):
490
+ self.logger.warning(f"Invalid Word magic number: {hex(magic)}")
491
+ return ""
492
+
493
+ # Text extraction attempt
494
+ text_parts = []
495
+
496
+ # 1. Table ?�트림에???�스??조각 찾기 ?�도
497
+ table_stream_name = None
498
+ if ole.exists('1Table'):
499
+ table_stream_name = '1Table'
500
+ elif ole.exists('0Table'):
501
+ table_stream_name = '0Table'
502
+
503
+ # 2. Simple method: Direct Unicode/ASCII text extraction
504
+ # Word 97-2003 contains some Unicode text internally
505
+ extracted_text = self._extract_text_from_word_stream(word_data)
506
+ if extracted_text:
507
+ text_parts.append(extracted_text)
508
+
509
+ return '\n'.join(text_parts)
510
+
511
+ except Exception as e:
512
+ self.logger.warning(f"Error extracting OLE text: {e}")
513
+ return ""
514
+
515
+ def _extract_text_from_word_stream(self, data: bytes) -> str:
516
+ """Extract text from Word stream (heuristic method)."""
517
+ text_parts = []
518
+
519
+ # Method 1: UTF-16LE Unicode text extraction
520
+ try:
521
+ # Find consecutive Unicode characters
522
+ i = 0
523
+ while i < len(data) - 1:
524
+ # ?�니코드 ?�스???�작??찾기 (printable 문자)
525
+ if 0x20 <= data[i] <= 0x7E and data[i+1] == 0x00:
526
+ # ?�니코드 문자???�집
527
+ unicode_bytes = []
528
+ j = i
529
+ while j < len(data) - 1:
530
+ char = data[j]
531
+ next_byte = data[j+1]
532
+
533
+ # ASCII range Unicode character or newline
534
+ if next_byte == 0x00 and (0x20 <= char <= 0x7E or char in (0x0D, 0x0A, 0x09)):
535
+ unicode_bytes.extend([char, next_byte])
536
+ j += 2
537
+ elif 0xAC <= next_byte <= 0xD7: # Korean Unicode range (AC00-D7AF)
538
+ unicode_bytes.extend([char, next_byte])
539
+ j += 2
540
+ elif next_byte in range(0x30, 0x4E): # CJK range partial
541
+ unicode_bytes.extend([char, next_byte])
542
+ j += 2
543
+ else:
544
+ break
545
+
546
+ if len(unicode_bytes) >= 8: # Minimum 4 characters
547
+ try:
548
+ text = bytes(unicode_bytes).decode('utf-16-le', errors='ignore')
549
+ text = text.strip()
550
+ if len(text) >= 4 and not text.startswith('\\'):
551
+ text = text.replace('\r\n', '\n').replace('\r', '\n')
552
+ text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', text)
553
+ if text:
554
+ text_parts.append(text)
555
+ except:
556
+ pass
557
+ i = j
558
+ else:
559
+ i += 1
560
+ except Exception as e:
561
+ self.logger.debug(f"Unicode extraction error: {e}")
562
+
563
+ # Process result
564
+ if text_parts:
565
+ # Remove duplicates and merge
566
+ seen = set()
567
+ unique_parts = []
568
+ for part in text_parts:
569
+ if part not in seen and len(part) > 3:
570
+ seen.add(part)
571
+ unique_parts.append(part)
572
+
573
+ result = '\n'.join(unique_parts)
574
+ # Handle excessive line breaks
575
+ result = re.sub(r'\n{3,}', '\n\n', result)
576
+ return result.strip()
577
+
578
+ return ""
579
+
@@ -0,0 +1,25 @@
1
+ # xgen_doc2chunk/core/processor/doc_helpers/__init__.py
2
+ """
3
+ DOC Helper Module
4
+
5
+ Provides utilities needed for DOC document processing.
6
+
7
+ RTF-related modules have been moved to rtf_helper.
8
+ If RTF processing is needed, use rtf_helper:
9
+ from xgen_doc2chunk.core.processor import rtf_helper
10
+ from xgen_doc2chunk.core.processor.rtf_helper import RTFParser
11
+
12
+ Module Structure:
13
+ - doc_file_converter: DOC file converter
14
+ - doc_image_processor: DOC image processor
15
+ """
16
+
17
+ # DOC-specific components
18
+ from xgen_doc2chunk.core.processor.doc_helpers.doc_file_converter import DOCFileConverter
19
+ from xgen_doc2chunk.core.processor.doc_helpers.doc_image_processor import DOCImageProcessor
20
+
21
+ __all__ = [
22
+ 'DOCFileConverter',
23
+ 'DOCImageProcessor',
24
+ ]
25
+