xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
DOCFileConverter - DOC file format converter
|
|
4
|
+
|
|
5
|
+
Converts binary DOC data to appropriate format based on detection.
|
|
6
|
+
Supports RTF, OLE, HTML, and misnamed DOCX files.
|
|
7
|
+
"""
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from typing import Any, Optional, BinaryIO, Tuple
|
|
10
|
+
from enum import Enum
|
|
11
|
+
import zipfile
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DocFormat(Enum):
    """
    Actual content format detected inside a file handled as DOC.

    The member values are short lowercase identifiers for each format.
    """

    # Rich Text Format content
    RTF = "rtf"
    # OLE container (binary DOC)
    OLE = "ole"
    # HTML markup
    HTML = "html"
    # DOCX package (misnamed as DOC)
    DOCX = "docx"
    # No known format was recognised
    UNKNOWN = "unknown"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DOCFileConverter(BaseFileConverter):
    """
    DOC file converter with format auto-detection.

    Sniffs the leading bytes of the data to detect the actual format
    (RTF, OLE, HTML, DOCX) and converts accordingly.
    """

    # Magic numbers for format detection
    MAGIC_RTF = b'{\\rtf'
    MAGIC_OLE = b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1'
    MAGIC_ZIP = b'PK\x03\x04'
    # UTF-8 byte order mark that may precede textual content
    MAGIC_UTF8_BOM = b'\xef\xbb\xbf'

    # Number of leading bytes inspected for the binary signatures
    HEADER_SIZE = 32
    # Number of leading bytes searched for an HTML marker
    HTML_SNIFF_SIZE = 100

    def __init__(self):
        """Initialize DOCFileConverter with no format detected yet."""
        self._detected_format: DocFormat = DocFormat.UNKNOWN

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Tuple[Any, DocFormat]:
        """
        Convert binary DOC data to appropriate format.

        Args:
            file_data: Raw binary DOC data
            file_stream: Optional file stream (accepted but not used here)
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Tuple of (converted object, detected format)
            - RTF: (bytes, DocFormat.RTF) - the raw binary, unchanged
            - OLE: (olefile.OleFileIO, DocFormat.OLE)
            - HTML: (BeautifulSoup, DocFormat.HTML)
            - DOCX: (docx.Document, DocFormat.DOCX)
            When the format is not recognised, OLE conversion is attempted
            and DocFormat.OLE is returned as the format.

        Raises:
            Exception: Whatever the underlying parser raises if conversion fails
        """
        self._detected_format = self._detect_format(file_data)
        detected = self._detected_format

        if detected == DocFormat.RTF:
            # RTF is handed back as raw bytes; no parsing happens here
            return file_data, detected
        if detected == DocFormat.OLE:
            return self._convert_ole(file_data), detected
        if detected == DocFormat.HTML:
            return self._convert_html(file_data), detected
        if detected == DocFormat.DOCX:
            return self._convert_docx(file_data), detected

        # Unknown signature: try OLE as fallback. The stored detected format
        # stays UNKNOWN while the returned format is OLE.
        return self._convert_ole(file_data), DocFormat.OLE

    def _detect_format(self, file_data: bytes) -> DocFormat:
        """
        Detect actual file format from binary data.

        Args:
            file_data: Raw binary data to inspect

        Returns:
            The detected DocFormat, or DocFormat.UNKNOWN when nothing matches
            (including for empty input).
        """
        if not file_data:
            return DocFormat.UNKNOWN

        header = file_data[:self.HEADER_SIZE]

        # Check RTF
        if header.startswith(self.MAGIC_RTF):
            return DocFormat.RTF

        # Check OLE
        if header.startswith(self.MAGIC_OLE):
            return DocFormat.OLE

        # Check ZIP (possible DOCX)
        if header.startswith(self.MAGIC_ZIP):
            try:
                with zipfile.ZipFile(BytesIO(file_data), 'r') as zf:
                    if '[Content_Types].xml' in zf.namelist():
                        return DocFormat.DOCX
            except zipfile.BadZipFile:
                # Not a readable archive: fall through to the other checks
                pass

        # Check for BOM + RTF. Done before the HTML sniff so that the wider
        # HTML window below cannot claim a BOM-prefixed RTF file.
        if header.startswith(self.MAGIC_UTF8_BOM):
            bom_len = len(self.MAGIC_UTF8_BOM)
            text_header = header[bom_len:].decode('utf-8', errors='ignore').lower()
            if text_header.startswith('{\\rtf'):
                return DocFormat.RTF

        # Check HTML. The window is taken from the file data itself rather
        # than from the 32-byte header, so a marker that sits after a BOM,
        # whitespace or a short prolog is still found.
        html_window = file_data[:self.HTML_SNIFF_SIZE].lower()
        html_start = html_window.lstrip(self.MAGIC_UTF8_BOM + b' \t\r\n')
        if (html_start.startswith((b'<!doctype', b'<html')) or
                b'<html' in html_window):
            return DocFormat.HTML

        return DocFormat.UNKNOWN

    def _convert_ole(self, file_data: bytes) -> Any:
        """Open the data as an OLE file and return the olefile.OleFileIO object."""
        import olefile
        return olefile.OleFileIO(BytesIO(file_data))

    def _convert_html(self, file_data: bytes) -> Any:
        """Decode the data as text and parse it into a BeautifulSoup object."""
        from bs4 import BeautifulSoup
        # Decode with fallback: strict UTF-8 first, then CP949 with replacement
        try:
            text = file_data.decode('utf-8')
        except UnicodeDecodeError:
            text = file_data.decode('cp949', errors='replace')
        return BeautifulSoup(text, 'html.parser')

    def _convert_docx(self, file_data: bytes) -> Any:
        """Open misnamed DOCX data with python-docx and return the Document."""
        from docx import Document
        return Document(BytesIO(file_data))

    def get_format_name(self) -> str:
        """Return a human-readable name for the most recently detected format."""
        format_names = {
            DocFormat.RTF: "RTF Document",
            DocFormat.OLE: "OLE Document (DOC)",
            DocFormat.HTML: "HTML Document",
            DocFormat.DOCX: "DOCX Document (misnamed)",
            DocFormat.UNKNOWN: "Unknown DOC Format",
        }
        return format_names.get(self._detected_format, "Unknown")

    @property
    def detected_format(self) -> DocFormat:
        """Return the format detected by the most recent convert() call."""
        return self._detected_format

    def close(self, converted_object: Any) -> None:
        """
        Close the converted object if it exposes a callable ``close``.

        Args:
            converted_object: Object previously returned by convert(), or None
        """
        # hasattr() alone is not enough: BeautifulSoup resolves unknown
        # attributes as child-tag lookups, so ``soup.close`` exists but is
        # normally None, and calling it would raise TypeError.
        close_method = getattr(converted_object, 'close', None)
        if callable(close_method):
            close_method()
|
|
160
|
+
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
DOC Image Processor
|
|
4
|
+
|
|
5
|
+
Provides DOC-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles images from RTF, OLE compound documents, and HTML-formatted DOC files.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Dict, Optional, Set
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
12
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.doc")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DOCImageProcessor(ImageProcessor):
    """
    Image processor for DOC files, built on top of ImageProcessor.

    Chooses a file name according to where the image came from and then
    delegates the actual saving to ``save_image``:
    - "rtf": named from the blipuid, and each blipuid is saved only once
    - "ole": named from the OLE stream name
    - "html" or anything else: no custom name is supplied

    Example:
        processor = DOCImageProcessor()

        # Process RTF picture
        tag = processor.process_image(image_data, source="rtf", blipuid="abc123")

        # Process OLE embedded image
        tag = processor.process_ole_image(ole_data, stream_name="Pictures/image1.png")

        # Process HTML base64 image
        tag = processor.process_html_image(base64_data, src_attr="data:image/png;base64,...")
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Set up the base processor and the RTF de-duplication state.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )
        # blipuids already handled since construction or the last reset
        self._processed_blipuids: Set[str] = set()

    def process_image(
        self,
        image_data: bytes,
        source: Optional[str] = None,
        blipuid: Optional[str] = None,
        stream_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save one DOC image under a source-dependent name.

        Args:
            image_data: Raw image binary data
            source: Image source type ("rtf", "ole", "html")
            blipuid: RTF BLIP unique ID (for deduplication)
            stream_name: OLE stream name
            **kwargs: Additional options (not forwarded to save_image)

        Returns:
            Whatever save_image returns, or None when an RTF image with
            the same blipuid was already processed
        """
        if source == "rtf" and blipuid:
            # The blipuid doubles as the de-duplication key
            if blipuid in self._processed_blipuids:
                logger.debug(f"Skipping duplicate RTF image: {blipuid}")
                return None
            self._processed_blipuids.add(blipuid)
            rtf_name = f"rtf_{blipuid[:16]}"
            return self.save_image(image_data, custom_name=rtf_name)

        if source == "ole" and stream_name:
            # Name after the last path component, up to its first dot
            import os
            stem = os.path.basename(stream_name).partition('.')[0]
            return self.save_image(image_data, custom_name=f"ole_{stem}")

        # "html" and every remaining case: no custom name is supplied
        # (presumably save_image then picks a hash-based name - verify)
        return self.save_image(image_data, custom_name=None)

    def process_ole_image(
        self,
        image_data: bytes,
        stream_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save an image taken from an OLE stream.

        Args:
            image_data: Raw image binary data from OLE stream
            stream_name: Name of the OLE stream
            **kwargs: Additional options passed on to process_image

        Returns:
            Result of process_image called with source="ole"
        """
        return self.process_image(
            image_data, source="ole", stream_name=stream_name, **kwargs
        )

    def process_rtf_image(
        self,
        image_data: bytes,
        blipuid: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save an image embedded in RTF content.

        Args:
            image_data: Raw image binary data from RTF
            blipuid: BLIP unique ID for deduplication
            **kwargs: Additional options passed on to process_image

        Returns:
            Result of process_image called with source="rtf"
        """
        return self.process_image(
            image_data, source="rtf", blipuid=blipuid, **kwargs
        )

    def process_html_image(
        self,
        image_data: bytes,
        src_attr: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save an image decoded from an HTML-format DOC file.

        Args:
            image_data: Decoded image binary data
            src_attr: Original src attribute value (accepted but not used)
            **kwargs: Additional options passed on to process_image

        Returns:
            Result of process_image called with source="html"
        """
        return self.process_image(image_data, source="html", **kwargs)

    def reset_tracking(self) -> None:
        """Forget all recorded blipuids so a new document starts clean."""
        self._processed_blipuids.clear()
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
DOC Preprocessor - Process DOC content after conversion.
|
|
4
|
+
|
|
5
|
+
Processing Pipeline Position:
|
|
6
|
+
1. DOCFileConverter.convert() -> (converted_obj, DocFormat)
|
|
7
|
+
2. DOCPreprocessor.preprocess() -> PreprocessedData (THIS STEP)
|
|
8
|
+
3. Content extraction (depends on format: RTF, OLE, HTML, DOCX)
|
|
9
|
+
|
|
10
|
+
Current Implementation:
|
|
11
|
+
- Pass-through (DOC delegates to format-specific handlers)
|
|
12
|
+
"""
|
|
13
|
+
import logging
|
|
14
|
+
from typing import Any, Dict
|
|
15
|
+
|
|
16
|
+
from xgen_doc2chunk.core.functions.preprocessor import (
|
|
17
|
+
BasePreprocessor,
|
|
18
|
+
PreprocessedData,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger("xgen_doc2chunk.doc.preprocessor")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DOCPreprocessor(BasePreprocessor):
    """
    Preprocessor for converted DOC content.

    Performs no transformation of the content itself: it unpacks the
    (converted object, format) pair and wraps it in a PreprocessedData.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted DOC content in a PreprocessedData.

        Args:
            converted_data: Tuple of (converted_obj, DocFormat) from
                DOCFileConverter; any other value is passed through as-is
            **kwargs: Additional options (not used here)

        Returns:
            PreprocessedData whose clean_content is the converted object
        """
        came_from_converter = (
            isinstance(converted_data, tuple) and len(converted_data) >= 2
        )

        if came_from_converter:
            # Only the first two items of the tuple are consulted
            payload, doc_format = converted_data[0], converted_data[1]
            if hasattr(doc_format, 'value'):
                format_label = doc_format.value
            else:
                format_label = str(doc_format)
            info: Dict[str, Any] = {'detected_format': format_label}
        else:
            # Not a converter tuple: no format information is available
            payload = converted_data
            doc_format = None
            info = {}

        logger.debug("DOC preprocessor: pass-through, metadata=%s", info)

        # clean_content carries the converted object itself, while
        # raw_content keeps the input exactly as it was received
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=payload,
            encoding="utf-8",
            extracted_resources={"doc_format": doc_format},
            metadata=info,
        )

    def get_format_name(self) -> str:
        """Return the display name of this preprocessor."""
        return "DOC Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True for any value other than None."""
        # A (converted_obj, DocFormat) tuple is never None, so it is accepted
        # by the same test as every other non-None value
        return data is not None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
__all__ = ['DOCPreprocessor']
|