xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Metadata Extractor
|
|
4
|
+
|
|
5
|
+
Extracts metadata from RTF content.
|
|
6
|
+
Implements BaseMetadataExtractor interface.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
import re
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Any, Dict, Optional, Union
|
|
13
|
+
|
|
14
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
15
|
+
BaseMetadataExtractor,
|
|
16
|
+
DocumentMetadata,
|
|
17
|
+
)
|
|
18
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
|
|
19
|
+
decode_hex_escapes,
|
|
20
|
+
)
|
|
21
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
|
|
22
|
+
clean_rtf_text,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger("xgen_doc2chunk.rtf.metadata")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class RTFSourceInfo:
    """
    Input bundle for RTFMetadataExtractor.extract().

    Attributes:
        content: RTF document text that is scanned for metadata.
        encoding: Codec name forwarded to the hex-escape decoder and the
            text cleaner when field values are decoded ("cp949" by default).
    """
    content: str
    encoding: str = "cp949"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class RTFMetadataExtractor(BaseMetadataExtractor):
    """
    RTF Metadata Extractor.

    Reads the text fields of the document's ``\\info`` group and the
    ``\\creatim`` / ``\\revtim`` timestamps and returns them as a
    DocumentMetadata instance.

    Supported fields:
    - title, subject, author, keywords, comments
    - last_saved_by, create_time, last_saved_time

    Usage:
        extractor = RTFMetadataExtractor()
        source = RTFSourceInfo(content=rtf_content, encoding="cp949")
        metadata = extractor.extract(source)
        text = extractor.format(metadata)
    """

    # DocumentMetadata field name -> RTF control word inside the \info group.
    _INFO_CONTROL_WORDS = {
        'title': 'title',
        'subject': 'subject',
        'author': 'author',
        'keywords': 'keywords',
        'comments': 'doccomm',
        'last_saved_by': 'operator',
    }

    def extract(self, source: Union[RTFSourceInfo, Dict[str, Any]]) -> DocumentMetadata:
        """
        Extract metadata from RTF content.

        Args:
            source: RTFSourceInfo object (content string and encoding)
                    OR Dict[str, Any] (pre-parsed metadata)

        Returns:
            DocumentMetadata instance; fields that are absent or empty in the
            document are left as None.
        """
        if isinstance(source, dict):
            return self._from_dict(source)

        content = source.content
        encoding = source.encoding

        # Every text field starts out as None and is filled in only when a
        # non-empty value survives decoding and cleaning.
        fields: Dict[str, Optional[str]] = dict.fromkeys(self._INFO_CONTROL_WORDS)

        info_content = self._find_info_content(content)
        if info_content is not None:
            for key, control_word in self._INFO_CONTROL_WORDS.items():
                raw_value = self._find_info_field(info_content, control_word)
                if raw_value is None:
                    continue
                value = decode_hex_escapes(raw_value, encoding)
                value = clean_rtf_text(value, encoding)
                if value:
                    fields[key] = value

        # Timestamps are searched in the whole document, not only in \info.
        create_time = self._extract_date(
            content,
            r'\\creatim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?'
        )
        last_saved_time = self._extract_date(
            content,
            r'\\revtim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?'
        )

        self.logger.debug("Extracted RTF metadata fields")

        return DocumentMetadata(
            title=fields['title'],
            subject=fields['subject'],
            author=fields['author'],
            keywords=fields['keywords'],
            comments=fields['comments'],
            last_saved_by=fields['last_saved_by'],
            create_time=create_time,
            last_saved_time=last_saved_time,
        )

    @staticmethod
    def _find_info_content(content: str) -> Optional[str]:
        """
        Return the body of the ``\\info`` group, or None if there is none.

        Two layouts are accepted:
        - ``{\\info{\\title ...}{\\author ...}}`` (control word opens the group);
          the text after ``\\info`` up to the group's closing brace is returned.
        - ``\\info {...}`` (control word followed by a braced group); the text
          inside that brace pair is returned.

        Braces are matched by nesting depth, so every sub-group is included.
        None is also returned when the group is never closed.
        """
        match = re.search(r'\\info(?![a-zA-Z])', content)
        if match is None:
            return None

        length = len(content)
        pos = match.end()
        if not (match.start() > 0 and content[match.start() - 1] == '{'):
            # "\info {...}" layout: the body is the braced group that follows.
            while pos < length and content[pos].isspace():
                pos += 1
            if pos >= length or content[pos] != '{':
                return None
            pos += 1

        body_start = pos
        depth = 1
        while pos < length:
            ch = content[pos]
            if ch == '\\':
                # Skip the escaped character so "\{" and "\}" are not
                # mistaken for group delimiters.
                pos += 2
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return content[body_start:pos]
            pos += 1
        return None

    @staticmethod
    def _find_info_field(info_content: str, control_word: str) -> Optional[str]:
        """
        Return the raw (still RTF-escaped) value of one info field.

        Matches both ``\\title{value}`` and ``{\\title value}``; in the second
        form the value runs up to the next brace. Returns None when the
        control word does not occur.
        """
        pattern = (
            r'\\' + control_word
            + r'(?![a-zA-Z])\s*(?:\{([^}]*)\}|([^{}]*))'
        )
        match = re.search(pattern, info_content)
        if match is None:
            return None
        braced, bare = match.group(1), match.group(2)
        return braced if braced is not None else bare

    def _extract_date(self, content: str, pattern: str) -> Optional[datetime]:
        """
        Extract datetime from RTF date pattern.

        Args:
            content: Text to search.
            pattern: Regex whose groups 1-5 are year, month, day, hour and
                minute (hour and minute may be unmatched and default to 0).

        Returns:
            The parsed datetime, or None when the pattern does not match or
            the captured numbers do not form a valid date.
        """
        match = re.search(pattern, content)
        if match is None:
            return None
        try:
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            hour = int(match.group(4)) if match.group(4) else 0
            minute = int(match.group(5)) if match.group(5) else 0
            return datetime(year, month, day, hour, minute)
        except (ValueError, TypeError, OverflowError):
            # Malformed or out-of-range numbers: treat the date as missing.
            # OverflowError covers values too large for datetime's C ints.
            return None

    def _from_dict(self, metadata: Dict[str, Any]) -> DocumentMetadata:
        """
        Convert pre-parsed metadata dict to DocumentMetadata.

        Args:
            metadata: Pre-parsed metadata dict; missing keys become None.

        Returns:
            DocumentMetadata instance
        """
        return DocumentMetadata(
            title=metadata.get('title'),
            subject=metadata.get('subject'),
            author=metadata.get('author'),
            keywords=metadata.get('keywords'),
            comments=metadata.get('comments'),
            last_saved_by=metadata.get('last_saved_by'),
            create_time=metadata.get('create_time'),
            last_saved_time=metadata.get('last_saved_time'),
        )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
__all__ = [
|
|
177
|
+
'RTFMetadataExtractor',
|
|
178
|
+
'RTFSourceInfo',
|
|
179
|
+
]
|
|
@@ -0,0 +1,426 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Preprocessor
|
|
4
|
+
|
|
5
|
+
Preprocesses RTF binary data before conversion:
|
|
6
|
+
- \\binN tag processing (skip N bytes of raw binary data)
|
|
7
|
+
- \\pict group image extraction
|
|
8
|
+
- Image saving and tag generation
|
|
9
|
+
- Encoding detection
|
|
10
|
+
|
|
11
|
+
Implements BasePreprocessor interface.
|
|
12
|
+
"""
|
|
13
|
+
import hashlib
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
18
|
+
|
|
19
|
+
from xgen_doc2chunk.core.functions.preprocessor import (
|
|
20
|
+
BasePreprocessor,
|
|
21
|
+
PreprocessedData,
|
|
22
|
+
)
|
|
23
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
24
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
25
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
|
|
26
|
+
detect_encoding,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger("xgen_doc2chunk.rtf.preprocessor")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Leading bytes ("magic numbers") -> image format name.
# _detect_image_format() walks this mapping in insertion order and returns
# the first format whose signature prefixes the data.
IMAGE_SIGNATURES = {
    b'\xff\xd8\xff': 'jpeg',
    b'\x89PNG\r\n\x1a\n': 'png',
    b'GIF87a': 'gif',
    b'GIF89a': 'gif',
    b'BM': 'bmp',
    b'\xd7\xcd\xc6\x9a': 'wmf',
    b'\x01\x00\x09\x00': 'wmf',
    b'\x01\x00\x00\x00': 'emf',
}

# RTF picture-type control word (looked up as a substring of the \pict
# attribute run) -> image format name.
RTF_IMAGE_TYPES = {
    'jpegblip': 'jpeg',
    'pngblip': 'png',
    'wmetafile': 'wmf',
    'emfblip': 'emf',
    'dibitmap': 'bmp',
    'wbitmap': 'bmp',
}

# Formats that are handed to the image processor for saving; images in any
# other format are dropped from the cleaned content without a tag.
SUPPORTED_IMAGE_FORMATS = {'jpeg', 'png', 'gif', 'bmp'}
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@dataclass
class RTFBinaryRegion:
    """One span of the raw RTF bytes that holds embedded picture data."""
    # Half-open byte range [start_pos, end_pos) that is cut out of the content.
    start_pos: int
    end_pos: int
    # How the region was found: "bin" (raw \binN payload) or "pict" (hex data).
    bin_type: str
    # Number of payload bytes (N of \binN, or the length of the decoded hex).
    data_size: int
    # Format name such as 'jpeg' or 'png'; empty when it could not be detected.
    image_format: str = ""
    # The picture bytes themselves; empty when nothing was captured.
    image_data: bytes = b""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class RTFPreprocessor(BasePreprocessor):
    """
    RTF-specific preprocessor.

    Handles RTF binary preprocessing:
    - Removes \\bin tag binary data
    - Extracts embedded images
    - Detects encoding
    - Returns clean content ready for parsing

    Usage:
        preprocessor = RTFPreprocessor(image_processor=img_proc)
        result = preprocessor.preprocess(rtf_bytes)

        # result.clean_content - bytes ready for parsing
        # result.encoding - detected encoding
        # result.extracted_resources["image_tags"] - list of image tags
    """

    RTF_MAGIC = b'{\\rtf'

    def __init__(
        self,
        image_processor: Optional[ImageProcessor] = None,
        processed_images: Optional[Set[str]] = None,
    ):
        """
        Initialize RTFPreprocessor.

        Args:
            image_processor: Image processor for saving images
            processed_images: Set of already processed image hashes; a fresh
                set is created when None is given.
        """
        self._image_processor = image_processor
        self._processed_images = processed_images if processed_images is not None else set()

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess RTF data.

        For RTF, the converter returns raw bytes (pass-through),
        so converted_data is the original RTF binary data.

        Args:
            converted_data: RTF binary data (bytes) from RTFFileConverter,
                or a file-like object whose read() yields that data
            **kwargs: Additional options; 'image_processor' and
                'processed_images' override the values given to __init__

        Returns:
            PreprocessedData with clean content, encoding, and image tags.
            Unsupported input types and empty input yield empty content with
            the "cp949" default encoding.
        """
        if isinstance(converted_data, bytes):
            file_data = converted_data
        elif hasattr(converted_data, 'read'):
            # Handle file-like objects
            file_data = converted_data.read()
        else:
            file_data = b""

        if not file_data:
            return PreprocessedData(
                raw_content=b"",
                clean_content=b"",
                encoding="cp949",
            )

        # Per-call overrides fall back to the constructor arguments.
        image_processor = kwargs.get('image_processor', self._image_processor)
        processed_images = kwargs.get('processed_images', self._processed_images)

        detected_encoding = detect_encoding(file_data, "cp949")

        # Extract images and strip their data from the content.
        clean_content, image_tags = self._process_binary_content(
            file_data,
            image_processor,
            processed_images
        )

        # Keep only non-blank tags that do not point at a "/uploads/." path.
        valid_tags = [
            tag for tag in image_tags
            if tag and tag.strip() and '/uploads/.' not in tag
        ]

        return PreprocessedData(
            raw_content=file_data,
            clean_content=clean_content,
            encoding=detected_encoding,
            extracted_resources={
                "image_tags": valid_tags,
            }
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "RTF Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True only for bytes that start with the RTF magic prefix."""
        return isinstance(data, bytes) and data.startswith(self.RTF_MAGIC)

    def _process_binary_content(
        self,
        content: bytes,
        image_processor: Optional[ImageProcessor],
        processed_images: Set[str]
    ) -> Tuple[bytes, List[str]]:
        """
        Process RTF binary content.

        Args:
            content: RTF binary content
            image_processor: Image processor instance (may be None, in which
                case no image is saved and every region is simply removed)
            processed_images: Set of processed image hashes; updated in place

        Returns:
            Tuple of (clean_content, list of image tags)
        """
        # Region start position -> replacement text ("" removes the region).
        image_tags: Dict[int, str] = {}

        # Find \bin tag regions
        bin_regions = self._find_bin_regions(content)

        # Find \pict regions (excluding bin regions)
        pict_regions = self._find_pict_regions(content, bin_regions)

        all_regions = bin_regions + pict_regions
        all_regions.sort(key=lambda r: r.start_pos)

        for region in all_regions:
            if not region.image_data:
                continue

            # The same picture (by MD5 of its bytes) is emitted only once.
            image_hash = hashlib.md5(region.image_data).hexdigest()
            if image_hash in processed_images:
                image_tags[region.start_pos] = ""
                continue

            processed_images.add(image_hash)

            tag = None
            if region.image_format in SUPPORTED_IMAGE_FORMATS and image_processor:
                tag = image_processor.save_image(region.image_data)

            if tag:
                image_tags[region.start_pos] = f"\n{tag}\n"
                logger.info(
                    f"Saved RTF image: {tag} "
                    f"(format={region.image_format}, size={region.data_size})"
                )
            else:
                image_tags[region.start_pos] = ""

        # Remove binary data from content
        clean_content = self._remove_binary_data(content, all_regions, image_tags)

        # Collect all image tags as list
        tag_list = [tag for tag in image_tags.values() if tag and tag.strip()]

        return clean_content, tag_list

    @staticmethod
    def _nesting_depth_before(segment: bytes) -> int:
        """
        Return how many groups are still open at the end of *segment*.

        *segment* must start with the '{' of the group of interest. Returns 0
        when that group is closed again inside the segment, i.e. when the
        position after the segment is not inside it.
        """
        depth = 0
        for value in segment:
            if value == 0x7B:  # '{'
                depth += 1
            elif value == 0x7D:  # '}'
                depth -= 1
                if depth <= 0:
                    return 0
        return depth

    def _find_bin_regions(self, content: bytes) -> List[RTFBinaryRegion]:
        """
        Find \\binN tags and identify binary regions.

        Each region normally spans the tag plus its N payload bytes. When the
        tag sits inside a ``\\shppict`` group that starts within the previous
        500 bytes, the region is widened to that whole group.

        Returned regions never overlap: a ``\\binN`` look-alike inside an
        already captured payload or group is ignored.
        """
        regions = []
        content_len = len(content)
        # End of the last accepted region; matches before it are payload bytes.
        consumed_until = 0

        for match in re.finditer(rb'\\bin(\d+)', content):
            bin_tag_start = match.start()
            if bin_tag_start < consumed_until:
                continue

            try:
                bin_size = int(match.group(1))
            except ValueError:
                # int() refuses absurdly long digit runs on recent Pythons.
                continue

            data_start = match.end()
            # A single space delimits the control word from the payload.
            if content[data_start:data_start + 1] == b' ':
                data_start += 1

            data_end = data_start + bin_size
            if data_end > content_len:
                # Truncated payload: leave the content untouched.
                continue

            binary_data = content[data_start:data_end]
            image_format = self._detect_image_format(binary_data)

            group_start = bin_tag_start
            group_end = data_end

            # Find parent \shppict group
            search_start = max(0, bin_tag_start - 500)
            shppict_pos = content.rfind(b'\\shppict', search_start, bin_tag_start)
            if shppict_pos != -1:
                brace_pos = content.rfind(b'{', 0, shppict_pos)
                if brace_pos != -1:
                    # \bin usually sits in a nested {\pict ...} group, so the
                    # real nesting depth has to be counted, not assumed.
                    depth = self._nesting_depth_before(content[brace_pos:bin_tag_start])
                    if depth > 0:
                        j = data_end
                        while j < content_len and depth > 0:
                            byte = content[j:j + 1]
                            if byte == b'{':
                                depth += 1
                            elif byte == b'}':
                                depth -= 1
                            j += 1
                        group_start = brace_pos
                        group_end = j

            consumed_until = group_end
            regions.append(RTFBinaryRegion(
                start_pos=group_start,
                end_pos=group_end,
                bin_type="bin",
                data_size=bin_size,
                image_format=image_format,
                image_data=binary_data
            ))

        return regions

    def _find_pict_regions(
        self,
        content: bytes,
        exclude_regions: List[RTFBinaryRegion]
    ) -> List[RTFBinaryRegion]:
        """
        Find hex-encoded \\pict regions.

        Args:
            content: RTF binary content
            exclude_regions: Regions already claimed (e.g. by \\bin handling);
                \\pict tags inside them, or with a "bin" region starting
                within the next 200 positions, are skipped

        Returns:
            One region per \\pict whose hex data decodes to at least 16 bytes
            and whose format is known. Any unexpected error is logged and the
            regions found so far are returned.
        """
        regions = []

        bin_tag_positions = {r.start_pos for r in exclude_regions if r.bin_type == "bin"}
        excluded_ranges = [(r.start_pos, r.end_pos) for r in exclude_regions]

        def is_excluded(pos: int) -> bool:
            return any(start <= pos < end for start, end in excluded_ranges)

        def has_bin_nearby(pict_pos: int) -> bool:
            return any(pict_pos < bp < pict_pos + 200 for bp in bin_tag_positions)

        try:
            # cp1252 with errors='replace' maps every byte to exactly one
            # character, so string indexes equal byte offsets in `content`.
            text_content = content.decode('cp1252', errors='replace')
            pict_pattern = r'\\pict\s*((?:\\[a-zA-Z]+\d*\s*)*)'

            for match in re.finditer(pict_pattern, text_content):
                start_pos = match.start()

                if is_excluded(start_pos) or has_bin_nearby(start_pos):
                    continue

                # Format from the picture-type control word, if one is present.
                attrs = match.group(1)
                image_format = ""
                for rtf_type, fmt in RTF_IMAGE_TYPES.items():
                    if rtf_type in attrs:
                        image_format = fmt
                        break

                # Collect hex digits up to the closing brace of the group.
                # NOTE(review): a nested group such as "{\*\blipuid ...}"
                # stops the scan, so pictures written that way are not
                # captured here - confirm whether that is intended.
                hex_data = []
                i = match.end()

                while i < len(text_content):
                    ch = text_content[i]
                    if ch in '0123456789abcdefABCDEF':
                        hex_data.append(ch)
                    elif ch in ' \t\r\n':
                        pass
                    elif ch == '}':
                        break
                    elif ch == '\\':
                        if text_content[i:i+4] == '\\bin':
                            # Raw binary follows; this is not a hex picture.
                            hex_data = []
                            break
                        # Skip any other control word up to its delimiter.
                        while i < len(text_content) and text_content[i] not in ' \t\r\n}':
                            i += 1
                        continue
                    else:
                        break
                    i += 1

                hex_str = ''.join(hex_data)

                # Fewer than 32 hex digits (16 bytes) cannot be a real image.
                if len(hex_str) >= 32:
                    try:
                        image_data = bytes.fromhex(hex_str)
                        if not image_format:
                            image_format = self._detect_image_format(image_data)

                        if image_format:
                            regions.append(RTFBinaryRegion(
                                start_pos=start_pos,
                                end_pos=i,
                                bin_type="pict",
                                data_size=len(image_data),
                                image_format=image_format,
                                image_data=image_data
                            ))
                    except ValueError:
                        # Odd number of hex digits: skip this picture.
                        continue
        except Exception as e:
            logger.warning(f"Error finding pict regions: {e}")

        return regions

    def _detect_image_format(self, data: bytes) -> str:
        """
        Detect image format from binary data.

        Returns the format name of the first matching entry in
        IMAGE_SIGNATURES, 'jpeg' for any data starting with FF D8, or ""
        when the data is shorter than 4 bytes or matches nothing.
        """
        if not data or len(data) < 4:
            return ""

        for signature, format_name in IMAGE_SIGNATURES.items():
            if data.startswith(signature):
                return format_name

        # JPEG streams whose third byte is not FF still start with FF D8.
        if data[0:2] == b'\xff\xd8':
            return 'jpeg'

        return ""

    def _remove_binary_data(
        self,
        content: bytes,
        regions: List[RTFBinaryRegion],
        image_tags: Dict[int, str]
    ) -> bytes:
        """
        Remove binary data regions from content.

        Args:
            content: RTF binary content
            regions: Regions to cut out; they are expected not to overlap
            image_tags: Region start position -> replacement text; a region
                without an entry, or with an empty one, is removed entirely

        Returns:
            Content with every region replaced by its ASCII-encoded tag
            (non-ASCII characters in a tag become '?').
        """
        if not regions:
            return content

        result = bytearray(content)

        # Work from the back so earlier offsets stay valid after each splice.
        for region in sorted(regions, key=lambda r: r.start_pos, reverse=True):
            tag = image_tags.get(region.start_pos)
            replacement = tag.encode('ascii', errors='replace') if tag else b''
            result[region.start_pos:region.end_pos] = replacement

        return bytes(result)
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
__all__ = ['RTFPreprocessor', 'RTFBinaryRegion']
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Region Finder
|
|
4
|
+
|
|
5
|
+
Functions for finding excluded regions (header, footer, footnote, etc.) in RTF.
|
|
6
|
+
"""
|
|
7
|
+
import re
|
|
8
|
+
from typing import List, Tuple
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _enclosing_group_start(content: str, pos: int) -> int:
    """
    Return the index of the '{' opening the group that begins at *pos*.

    *pos* is either that brace itself or the position of a control word that
    directly follows it (optionally after whitespace and a ``\\*`` marker).
    Returns -1 when no such unescaped brace is found.
    """
    if content[pos] == '{':
        return pos

    j = pos - 1
    while j >= 0 and content[j] in ' \t\r\n':
        j -= 1
    # "{\*\annotation ..." : step over the ignorable-destination marker.
    if j >= 1 and content[j - 1:j + 1] == '\\*':
        j -= 2
        while j >= 0 and content[j] in ' \t\r\n':
            j -= 1
    if j >= 0 and content[j] == '{' and (j == 0 or content[j - 1] != '\\'):
        return j
    return -1


def find_excluded_regions(content: str) -> List[Tuple[int, int]]:
    """
    Find regions to exclude from content extraction.

    Finds header, footer, footnote, and other special regions
    that should not be part of main content. A region covers the whole RTF
    group of the destination, from its opening brace to the matching closing
    brace; escaped braces (``\\{``, ``\\}``) are not counted as delimiters.

    Args:
        content: RTF content string

    Returns:
        List of (start, end) position tuples, sorted by start, with
        overlapping or touching regions merged. ``end`` is exclusive.
    """
    # Control words (or "{" + control word) that open an excluded destination.
    start_patterns = (
        r'\\header[lrf]?\b',   # Headers
        r'\\footer[lrf]?\b',   # Footers
        r'\\footnote\b',       # Footnotes
        r'\\annotation\b',     # Annotations
        r'\{\\headerf',        # First page header
        r'\{\\footerf',        # First page footer
    )

    regions = []
    length = len(content)

    for start_pattern in start_patterns:
        for match in re.finditer(start_pattern, content):
            start_pos = match.start()

            # The control word lives inside its group, so counting has to
            # begin at the group's own opening brace; otherwise that group's
            # closing brace would drive the depth negative.
            group_start = _enclosing_group_start(content, start_pos)
            if group_start != -1:
                start_pos = group_start

            depth = 0
            found_start = False
            i = start_pos

            while i < length:
                ch = content[i]
                if ch == '\\':
                    # Control symbol or start of a control word: skipping the
                    # next character keeps "\{" and "\}" out of the count.
                    i += 2
                    continue
                if ch == '{':
                    found_start = True
                    depth += 1
                elif ch == '}':
                    if not found_start:
                        # The surrounding group ended before any group was
                        # opened: there is nothing to exclude for this match.
                        break
                    depth -= 1
                    if depth == 0:
                        regions.append((start_pos, i + 1))
                        break
                i += 1

    if not regions:
        return regions

    # Merge overlapping regions
    regions.sort(key=lambda x: x[0])
    merged = [regions[0]]
    for start, end in regions[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def is_in_excluded_region(position: int, regions: List[Tuple[int, int]]) -> bool:
    """
    Tell whether *position* falls inside any of the given regions.

    Args:
        position: Position to check
        regions: List of (start, end) tuples; start is inclusive and end
            is exclusive

    Returns:
        True if position is in an excluded region
    """
    return any(lower <= position < upper for lower, upper in regions)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
__all__ = [
|
|
89
|
+
'find_excluded_regions',
|
|
90
|
+
'is_in_excluded_region',
|
|
91
|
+
]
|