xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,160 @@
1
+ # xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py
2
+ """
3
+ DOCFileConverter - DOC file format converter
4
+
5
+ Converts binary DOC data to appropriate format based on detection.
6
+ Supports RTF, OLE, HTML, and misnamed DOCX files.
7
+ """
8
+ from io import BytesIO
9
+ from typing import Any, Optional, BinaryIO, Tuple
10
+ from enum import Enum
11
+ import zipfile
12
+
13
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
14
+
15
+
16
class DocFormat(Enum):
    """Formats that the content of a file named ``.doc`` may turn out to be.

    Each member's value is the lowercase short name of the format.
    """

    # Rich Text Format payload.
    RTF = "rtf"
    # OLE compound file (binary DOC).
    OLE = "ole"
    # HTML markup saved under a .doc name.
    HTML = "html"
    # DOCX package saved under a .doc name.
    DOCX = "docx"
    # None of the known signatures matched.
    UNKNOWN = "unknown"
23
+
24
+
25
class DOCFileConverter(BaseFileConverter):
    """
    DOC file converter with format auto-detection.

    Inspects the leading bytes of the payload to decide whether it is
    really RTF, an OLE compound file, HTML, or a DOCX package, and builds
    the matching object for that format.
    """

    # Magic numbers for format detection
    MAGIC_RTF = b'{\\rtf'
    MAGIC_OLE = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
    MAGIC_ZIP = b'PK\x03\x04'
    MAGIC_UTF8_BOM = b'\xef\xbb\xbf'

    # How many leading bytes are searched for an HTML marker.
    HTML_SNIFF_BYTES = 100

    def __init__(self):
        """Initialize DOCFileConverter with no format detected yet."""
        self._detected_format: DocFormat = DocFormat.UNKNOWN

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Tuple[Any, DocFormat]:
        """
        Convert binary DOC data to the object matching its real format.

        Args:
            file_data: Raw binary DOC data
            file_stream: Optional file stream (not used by this converter)
            **kwargs: Additional options (not used by this converter)

        Returns:
            Tuple of (converted object, detected format)
            - RTF: (bytes, DocFormat.RTF) - the raw bytes, unchanged
            - OLE: (olefile.OleFileIO, DocFormat.OLE)
            - HTML: (BeautifulSoup, DocFormat.HTML)
            - DOCX: (docx.Document, DocFormat.DOCX)
            When no signature matches, the data is opened as OLE and
            DocFormat.OLE is returned.

        Raises:
            Exception: Whatever the underlying parser raises if
                conversion fails
        """
        self._detected_format = self._detect_format(file_data)
        detected = self._detected_format

        if detected == DocFormat.RTF:
            # RTF is handed back untouched; parsing happens downstream.
            return file_data, detected
        if detected == DocFormat.OLE:
            return self._convert_ole(file_data), detected
        if detected == DocFormat.HTML:
            return self._convert_html(file_data), detected
        if detected == DocFormat.DOCX:
            return self._convert_docx(file_data), detected

        # Unknown signature: try OLE as a last resort. The returned format
        # is OLE, while ``detected_format`` keeps reporting UNKNOWN.
        return self._convert_ole(file_data), DocFormat.OLE

    def _detect_format(self, file_data: bytes) -> DocFormat:
        """
        Detect the actual file format from binary data.

        Exact magic-number prefixes are checked first (RTF, OLE, ZIP/DOCX,
        BOM-prefixed RTF). Only then is the looser HTML marker search run,
        over the first ``HTML_SNIFF_BYTES`` bytes of the data.

        Args:
            file_data: Raw binary data

        Returns:
            The detected DocFormat, or DocFormat.UNKNOWN if nothing matched
            (including empty input)
        """
        if not file_data:
            return DocFormat.UNKNOWN

        header = file_data[:32]

        # Check RTF
        if header.startswith(self.MAGIC_RTF):
            return DocFormat.RTF

        # Check OLE
        if header.startswith(self.MAGIC_OLE):
            return DocFormat.OLE

        # Check ZIP (possible DOCX): only a package that carries
        # [Content_Types].xml is accepted.
        if header.startswith(self.MAGIC_ZIP):
            try:
                with zipfile.ZipFile(BytesIO(file_data), 'r') as zf:
                    if '[Content_Types].xml' in zf.namelist():
                        return DocFormat.DOCX
            except zipfile.BadZipFile:
                # Not a readable archive; fall through to the other checks.
                pass

        # Check for BOM + RTF
        if header.startswith(self.MAGIC_UTF8_BOM):
            bom_len = len(self.MAGIC_UTF8_BOM)
            text_header = header[bom_len:].decode('utf-8', errors='ignore').lower()
            if text_header.startswith('{\\rtf'):
                return DocFormat.RTF

        # Check HTML. The search window is taken from file_data itself, not
        # from the 32-byte header, so an <html> tag that follows a doctype,
        # XML prolog or comment is still found. A BOM and leading
        # whitespace are skipped before the doctype test.
        sniff = file_data[:self.HTML_SNIFF_BYTES]
        if sniff.startswith(self.MAGIC_UTF8_BOM):
            sniff = sniff[len(self.MAGIC_UTF8_BOM):]
        sniff = sniff.lstrip().lower()
        if sniff.startswith(b'<!doctype') or b'<html' in sniff:
            return DocFormat.HTML

        return DocFormat.UNKNOWN

    def _convert_ole(self, file_data: bytes) -> Any:
        """Open the data as an OLE compound file via olefile."""
        import olefile
        return olefile.OleFileIO(BytesIO(file_data))

    def _convert_html(self, file_data: bytes) -> Any:
        """Decode the data and parse it as HTML with BeautifulSoup."""
        from bs4 import BeautifulSoup
        # Decode with fallback: strict UTF-8 first, then CP949 with
        # undecodable bytes replaced.
        try:
            text = file_data.decode('utf-8')
        except UnicodeDecodeError:
            text = file_data.decode('cp949', errors='replace')
        return BeautifulSoup(text, 'html.parser')

    def _convert_docx(self, file_data: bytes) -> Any:
        """Open misnamed DOCX data with python-docx."""
        from docx import Document
        return Document(BytesIO(file_data))

    def get_format_name(self) -> str:
        """Return a human-readable name for the last detected format."""
        format_names = {
            DocFormat.RTF: "RTF Document",
            DocFormat.OLE: "OLE Document (DOC)",
            DocFormat.HTML: "HTML Document",
            DocFormat.DOCX: "DOCX Document (misnamed)",
            DocFormat.UNKNOWN: "Unknown DOC Format",
        }
        return format_names.get(self._detected_format, "Unknown")

    @property
    def detected_format(self) -> DocFormat:
        """Return the format detected by the most recent convert() call."""
        return self._detected_format

    def close(self, converted_object: Any) -> None:
        """Call ``close()`` on the converted object if it has such a method."""
        if converted_object is not None and hasattr(converted_object, 'close'):
            converted_object.close()
160
+
@@ -0,0 +1,179 @@
1
+ # xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py
2
+ """
3
+ DOC Image Processor
4
+
5
+ Provides DOC-specific image processing that inherits from ImageProcessor.
6
+ Handles images from RTF, OLE compound documents, and HTML-formatted DOC files.
7
+ """
8
+ import logging
9
+ from typing import Any, Dict, Optional, Set
10
+
11
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
12
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
13
+
14
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.doc")
15
+
16
+
17
class DOCImageProcessor(ImageProcessor):
    """
    DOC-specific image processor.

    Inherits from ImageProcessor and adds source-aware naming plus
    de-duplication of RTF images by their BLIP unique ID.

    Handles:
    - RTF embedded images (pict, shppict, blipuid)
    - OLE compound document images (Pictures stream, embedded objects)
    - HTML-format DOC images (base64 encoded)
    - WMF/EMF metafiles

    Example:
        processor = DOCImageProcessor()

        # Process RTF picture
        tag = processor.process_image(image_data, source="rtf", blipuid="abc123")

        # Process OLE embedded image
        tag = processor.process_ole_image(ole_data, stream_name="Pictures/image1.png")

        # Process HTML base64 image
        tag = processor.process_html_image(base64_data, src_attr="data:image/png;base64,...")
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize DOCImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )
        # BLIP unique IDs of RTF images that have already been saved.
        self._processed_blipuids: Set[str] = set()

    def process_image(
        self,
        image_data: bytes,
        source: Optional[str] = None,
        blipuid: Optional[str] = None,
        stream_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save DOC image data.

        Args:
            image_data: Raw image binary data
            source: Image source type ("rtf", "ole", "html")
            blipuid: RTF BLIP unique ID (for deduplication)
            stream_name: OLE stream name
            **kwargs: Additional options (not forwarded to save_image)

        Returns:
            Whatever ``save_image`` returns for a saved image, or None when
            the image is a duplicate RTF image that was already saved
        """
        custom_name = None
        pending_blipuid = None

        if source == "rtf" and blipuid:
            if blipuid in self._processed_blipuids:
                logger.debug("Skipping duplicate RTF image: %s", blipuid)
                return None
            pending_blipuid = blipuid
            # The first 16 characters of the blipuid form the file name.
            custom_name = f"rtf_{blipuid[:16]}"
        elif source == "ole" and stream_name:
            import os
            # Last path component of the stream, up to its first dot.
            custom_name = f"ole_{os.path.basename(stream_name).split('.')[0]}"
        # Any other source (including "html") leaves custom_name as None,
        # so save_image chooses the name itself.

        tag = self.save_image(image_data, custom_name=custom_name)

        # Mark the blipuid as processed only once a tag came back, so a
        # failed save does not suppress later copies of the same image.
        # NOTE(review): assumes save_image signals failure with None.
        if pending_blipuid is not None and tag is not None:
            self._processed_blipuids.add(pending_blipuid)

        return tag

    def process_ole_image(
        self,
        image_data: bytes,
        stream_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process OLE compound document embedded image.

        Args:
            image_data: Raw image binary data from OLE stream
            stream_name: Name of the OLE stream
            **kwargs: Additional options passed on to process_image

        Returns:
            Result of process_image with source="ole"
        """
        return self.process_image(
            image_data,
            source="ole",
            stream_name=stream_name,
            **kwargs
        )

    def process_rtf_image(
        self,
        image_data: bytes,
        blipuid: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process RTF embedded image.

        Args:
            image_data: Raw image binary data from RTF
            blipuid: BLIP unique ID for deduplication
            **kwargs: Additional options passed on to process_image

        Returns:
            Result of process_image with source="rtf"
        """
        return self.process_image(
            image_data,
            source="rtf",
            blipuid=blipuid,
            **kwargs
        )

    def process_html_image(
        self,
        image_data: bytes,
        src_attr: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process HTML-format DOC base64 image.

        Args:
            image_data: Decoded image binary data
            src_attr: Original src attribute value (accepted but not used)
            **kwargs: Additional options passed on to process_image

        Returns:
            Result of process_image with source="html"
        """
        return self.process_image(
            image_data,
            source="html",
            **kwargs
        )

    def reset_tracking(self) -> None:
        """Forget all recorded RTF blipuids, e.g. before a new document."""
        self._processed_blipuids.clear()
@@ -0,0 +1,83 @@
1
+ # xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py
2
+ """
3
+ DOC Preprocessor - Process DOC content after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. DOCFileConverter.convert() -> (converted_obj, DocFormat)
7
+ 2. DOCPreprocessor.preprocess() -> PreprocessedData (THIS STEP)
8
+ 3. Content extraction (depends on format: RTF, OLE, HTML, DOCX)
9
+
10
+ Current Implementation:
11
+ - Pass-through (DOC delegates to format-specific handlers)
12
+ """
13
+ import logging
14
+ from typing import Any, Dict
15
+
16
+ from xgen_doc2chunk.core.functions.preprocessor import (
17
+ BasePreprocessor,
18
+ PreprocessedData,
19
+ )
20
+
21
+ logger = logging.getLogger("xgen_doc2chunk.doc.preprocessor")
22
+
23
+
24
class DOCPreprocessor(BasePreprocessor):
    """
    Preprocessor stage for DOC documents.

    Performs no content transformation: it unpacks the converter result
    and wraps it in PreprocessedData, leaving the real work to the
    format-specific handlers (RTF, OLE, HTML, DOCX).
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted DOC content in a PreprocessedData.

        Args:
            converted_data: Tuple of (converted_obj, DocFormat) from
                DOCFileConverter; any other value is passed through as
                the converted object with no format recorded
            **kwargs: Additional options

        Returns:
            PreprocessedData whose clean_content is the converted object
        """
        is_converter_pair = (
            isinstance(converted_data, tuple) and len(converted_data) >= 2
        )

        info: Dict[str, Any] = {}
        payload = converted_data
        fmt = None

        if is_converter_pair:
            payload = converted_data[0]
            fmt = converted_data[1]
            # Enum-like formats are recorded by value, anything else by str().
            info['detected_format'] = (
                fmt.value if hasattr(fmt, 'value') else str(fmt)
            )

        logger.debug("DOC preprocessor: pass-through, metadata=%s", info)

        # clean_content carries the format-specific object itself
        # (OLE, BeautifulSoup, etc.); raw_content keeps the input as given.
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=payload,
            encoding="utf-8",
            extracted_resources={"doc_format": fmt},
            metadata=info,
        )

    def get_format_name(self) -> str:
        """Return the name of this preprocessor."""
        return "DOC Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True for a converter result tuple or any non-None data."""
        is_converter_pair = isinstance(data, tuple) and len(data) >= 2
        return is_converter_pair or data is not None
81
+
82
+
83
+ __all__ = ['DOCPreprocessor']