xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,179 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py
2
+ """
3
+ RTF Metadata Extractor
4
+
5
+ Extracts metadata from RTF content.
6
+ Implements BaseMetadataExtractor interface.
7
+ """
8
+ import logging
9
+ import re
10
+ from dataclasses import dataclass
11
+ from datetime import datetime
12
+ from typing import Any, Dict, Optional, Union
13
+
14
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
15
+ BaseMetadataExtractor,
16
+ DocumentMetadata,
17
+ )
18
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
19
+ decode_hex_escapes,
20
+ )
21
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
22
+ clean_rtf_text,
23
+ )
24
+
25
+ logger = logging.getLogger("xgen_doc2chunk.rtf.metadata")
26
+
27
+
28
@dataclass
class RTFSourceInfo:
    """
    Input bundle consumed by RTFMetadataExtractor.extract().

    Attributes:
        content: RTF document text that is searched for metadata.
        encoding: Encoding name forwarded to the RTF text decoding helpers
            (defaults to "cp949").
    """
    content: str
    encoding: str = "cp949"
37
+
38
+
39
class RTFMetadataExtractor(BaseMetadataExtractor):
    """
    RTF Metadata Extractor.

    Reads the document information (``\\info``) group of an RTF document and
    maps its entries onto a DocumentMetadata instance.

    Supported fields:
    - title, subject, author, keywords, comments
    - last_saved_by, create_time, last_saved_time

    Two layouts of the info group are recognised:
    - the layout used by the RTF specification, ``{\\info{\\title Text}...}``
    - the ``\\info{\\title{Text}...}`` layout targeted by the original
      regular expressions of this class

    Usage:
        extractor = RTFMetadataExtractor()
        source = RTFSourceInfo(content=rtf_content, encoding="cp949")
        metadata = extractor.extract(source)
        text = extractor.format(metadata)
    """

    # DocumentMetadata field name -> RTF control word inside the \info group.
    _INFO_KEYWORDS = {
        'title': 'title',
        'subject': 'subject',
        'author': 'author',
        'keywords': 'keywords',
        'comments': 'doccomm',
        'last_saved_by': 'operator',
    }

    # Optional whitespace followed by an opening brace (``\keyword{value}``).
    _BRACED_VALUE_RE = re.compile(r'\s*\{')

    def extract(self, source: Union[RTFSourceInfo, Dict[str, Any]]) -> DocumentMetadata:
        """
        Extract metadata from RTF content.

        Args:
            source: RTFSourceInfo object (content string and encoding)
                OR Dict[str, Any] (pre-parsed metadata)

        Returns:
            DocumentMetadata instance. Fields that are absent from the
            document, or that are empty after cleaning, are None.
        """
        if isinstance(source, dict):
            return self._from_dict(source)

        content = source.content
        encoding = source.encoding

        fields: Dict[str, Optional[str]] = dict.fromkeys(self._INFO_KEYWORDS)

        info_content = self._find_info_content(content)
        if info_content is not None:
            for key, keyword in self._INFO_KEYWORDS.items():
                raw_value = self._find_info_value(info_content, keyword)
                if raw_value is None:
                    continue
                value = decode_hex_escapes(raw_value, encoding)
                value = clean_rtf_text(value, encoding)
                if value:
                    fields[key] = value

        # Dates are searched in the whole document, not only the info group.
        create_time = self._extract_date(
            content,
            r'\\creatim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?'
        )
        last_saved_time = self._extract_date(
            content,
            r'\\revtim\\yr(\d+)\\mo(\d+)\\dy(\d+)(?:\\hr(\d+))?(?:\\min(\d+))?'
        )

        self.logger.debug("Extracted RTF metadata fields")

        return DocumentMetadata(
            title=fields['title'],
            subject=fields['subject'],
            author=fields['author'],
            keywords=fields['keywords'],
            comments=fields['comments'],
            last_saved_by=fields['last_saved_by'],
            create_time=create_time,
            last_saved_time=last_saved_time,
        )

    @staticmethod
    def _find_group_end(text: str, start: int) -> int:
        """
        Return the index of the brace closing the group that contains ``start``.

        Scanning begins at ``start`` with a nesting depth of one. A backslash
        together with the character after it is skipped, so the escaped
        braces ``\\{`` and ``\\}`` are not counted.

        Returns:
            Index of the closing brace, or -1 if the group is never closed.
        """
        depth = 1
        pos = start
        length = len(text)
        while pos < length:
            ch = text[pos]
            if ch == '\\':
                pos += 2
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return pos
            pos += 1
        return -1

    @classmethod
    def _find_info_content(cls, content: str) -> Optional[str]:
        """
        Return the text of the ``\\info`` group, or None if there is none.

        The previous implementation captured the group with a regular
        expression whose ``[^}]*`` part stopped at the first closing brace,
        so the captured text never held a complete field. Braces are now
        matched explicitly.
        """
        match = re.search(r'\\info(?![a-zA-Z])', content)
        if match is None:
            return None

        before = match.start() - 1
        while before >= 0 and content[before] in ' \t\r\n':
            before -= 1

        if before >= 0 and content[before] == '{':
            # ``{\info ...}``: the body runs to the brace closing that group.
            start = match.end()
        else:
            # ``\info{...}``: the body is the braced group after the keyword.
            braced = cls._BRACED_VALUE_RE.match(content, match.end())
            if braced is None:
                return None
            start = braced.end()

        end = cls._find_group_end(content, start)
        if end == -1:
            # Unterminated group: use whatever follows the keyword.
            return content[start:]
        return content[start:end]

    @classmethod
    def _find_info_value(cls, info_content: str, keyword: str) -> Optional[str]:
        """
        Return the raw (still RTF-encoded) value of one info entry.

        Args:
            info_content: Text returned by _find_info_content().
            keyword: Control word without the backslash, e.g. ``"title"``.

        Returns:
            The text between the keyword and the end of its group, or None
            when the keyword is missing or its group is not closed.
        """
        match = re.search(r'\\' + re.escape(keyword) + r'(?![a-zA-Z])', info_content)
        if match is None:
            return None

        braced = cls._BRACED_VALUE_RE.match(info_content, match.end())
        if braced is not None:
            # ``\keyword{value}``
            start = braced.end()
        else:
            # ``{\keyword value}``: one space after the keyword is a delimiter.
            start = match.end()
            if info_content[start:start + 1] == ' ':
                start += 1

        end = cls._find_group_end(info_content, start)
        if end == -1:
            return None
        return info_content[start:end]

    def _extract_date(self, content: str, pattern: str) -> Optional[datetime]:
        """
        Extract a datetime from an RTF date pattern.

        Args:
            content: Text to search.
            pattern: Regular expression with five groups: year, month, day
                and the optional hour and minute.

        Returns:
            The parsed datetime, or None when the pattern does not match or
            the captured numbers do not form a valid date.
        """
        match = re.search(pattern, content)
        if match is None:
            return None
        try:
            year = int(match.group(1))
            month = int(match.group(2))
            day = int(match.group(3))
            hour = int(match.group(4)) if match.group(4) else 0
            minute = int(match.group(5)) if match.group(5) else 0
            return datetime(year, month, day, hour, minute)
        except (ValueError, TypeError):
            # Out-of-range values (e.g. month 0) are treated as "no date".
            return None

    def _from_dict(self, metadata: Dict[str, Any]) -> DocumentMetadata:
        """
        Convert pre-parsed metadata dict to DocumentMetadata.

        Args:
            metadata: Pre-parsed metadata dict; missing keys become None.

        Returns:
            DocumentMetadata instance
        """
        return DocumentMetadata(
            title=metadata.get('title'),
            subject=metadata.get('subject'),
            author=metadata.get('author'),
            keywords=metadata.get('keywords'),
            comments=metadata.get('comments'),
            last_saved_by=metadata.get('last_saved_by'),
            create_time=metadata.get('create_time'),
            last_saved_time=metadata.get('last_saved_time'),
        )
174
+
175
+
176
+ __all__ = [
177
+ 'RTFMetadataExtractor',
178
+ 'RTFSourceInfo',
179
+ ]
@@ -0,0 +1,426 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py
2
+ """
3
+ RTF Preprocessor
4
+
5
+ Preprocesses RTF binary data before conversion:
6
+ - \\binN tag processing (skip N bytes of raw binary data)
7
+ - \\pict group image extraction
8
+ - Image saving and tag generation
9
+ - Encoding detection
10
+
11
+ Implements BasePreprocessor interface.
12
+ """
13
+ import hashlib
14
+ import logging
15
+ import re
16
+ from dataclasses import dataclass, field
17
+ from typing import Any, Dict, List, Optional, Set, Tuple
18
+
19
+ from xgen_doc2chunk.core.functions.preprocessor import (
20
+ BasePreprocessor,
21
+ PreprocessedData,
22
+ )
23
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
24
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
25
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
26
+ detect_encoding,
27
+ )
28
+
29
+ logger = logging.getLogger("xgen_doc2chunk.rtf.preprocessor")
30
+
31
+
32
# Leading byte sequences mapped to an image format name.
# RTFPreprocessor._detect_image_format() walks this dict in insertion order
# and returns the format of the first prefix that matches.
IMAGE_SIGNATURES = {
    b'\xff\xd8\xff': 'jpeg',
    b'\x89PNG\r\n\x1a\n': 'png',
    b'GIF87a': 'gif',
    b'GIF89a': 'gif',
    b'BM': 'bmp',
    b'\xd7\xcd\xc6\x9a': 'wmf',
    b'\x01\x00\x09\x00': 'wmf',
    b'\x01\x00\x00\x00': 'emf',
}

# RTF picture-type control words (as they appear after \pict) mapped to the
# same format names used in IMAGE_SIGNATURES.
RTF_IMAGE_TYPES = {
    'jpegblip': 'jpeg',
    'pngblip': 'png',
    'wmetafile': 'wmf',
    'emfblip': 'emf',
    'dibitmap': 'bmp',
    'wbitmap': 'bmp',
}

# Formats that are handed to the image processor for saving; images in any
# other format are removed from the content without producing a tag.
SUPPORTED_IMAGE_FORMATS = {'jpeg', 'png', 'gif', 'bmp'}
56
+
57
+
58
@dataclass
class RTFBinaryRegion:
    """Span of the RTF byte stream that holds embedded binary or image data."""
    # Offset of the first byte of the span.
    start_pos: int
    # Offset one past the last byte of the span (used as a slice end).
    end_pos: int
    # How the span was found: "bin" or "pict".
    bin_type: str
    # Size of the payload in bytes.
    data_size: int
    # Detected image format name; empty when the format is unknown.
    image_format: str = ""
    # Decoded payload bytes; empty when nothing was extracted.
    image_data: bytes = b""
67
+
68
+
69
class RTFPreprocessor(BasePreprocessor):
    """
    RTF-specific preprocessor.

    Handles RTF binary preprocessing:
    - Removes \\bin tag binary data
    - Extracts embedded images
    - Detects encoding
    - Returns clean content ready for parsing

    Usage:
        preprocessor = RTFPreprocessor(image_processor=img_proc)
        result = preprocessor.preprocess(rtf_bytes)

        # result.clean_content - bytes ready for parsing
        # result.encoding - detected encoding
        # result.extracted_resources["image_tags"] - list of image tags
    """

    RTF_MAGIC = b'{\\rtf'

    # How many bytes before a \binN tag are searched for a parent \shppict.
    _SHPPICT_LOOKBACK = 500
    # A \pict is skipped when a \bin region starts within this many
    # characters after it.
    _BIN_PROXIMITY = 200
    # Minimum number of hex digits for a \pict payload to be considered.
    _MIN_HEX_LENGTH = 32

    def __init__(
        self,
        image_processor: Optional[ImageProcessor] = None,
        processed_images: Optional[Set[str]] = None,
    ):
        """
        Initialize RTFPreprocessor.

        Args:
            image_processor: Image processor for saving images
            processed_images: Set of already processed image hashes; the set
                is updated in place while preprocessing
        """
        self._image_processor = image_processor
        self._processed_images = processed_images if processed_images is not None else set()

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess RTF data.

        For RTF, the converter returns raw bytes (pass-through),
        so converted_data is the original RTF binary data.

        Args:
            converted_data: RTF binary data from RTFFileConverter. Accepts
                bytes, bytearray, memoryview, or an object with ``read()``
                that returns one of those.
            **kwargs: Additional options. ``image_processor`` and
                ``processed_images`` override the values given to
                ``__init__``; ``processed_images=None`` means "use the
                instance's set".

        Returns:
            PreprocessedData with clean content, encoding, and image tags.
            Unsupported or empty input yields empty content with the
            default "cp949" encoding.
        """
        if isinstance(converted_data, (bytes, bytearray, memoryview)):
            file_data = bytes(converted_data)
        elif hasattr(converted_data, 'read'):
            # Handle file-like objects
            file_data = converted_data.read()
            if isinstance(file_data, (bytearray, memoryview)):
                file_data = bytes(file_data)
        else:
            file_data = None

        # Anything that is not non-empty bytes (including text streams)
        # cannot be scanned for binary regions.
        if not isinstance(file_data, bytes) or not file_data:
            return PreprocessedData(
                raw_content=b"",
                clean_content=b"",
                encoding="cp949",
            )

        # Get options from kwargs
        image_processor = kwargs.get('image_processor', self._image_processor)
        processed_images = kwargs.get('processed_images')
        if processed_images is None:
            processed_images = self._processed_images

        # Detect encoding
        detected_encoding = detect_encoding(file_data, "cp949")

        # Process binary data (extract images, clean content)
        clean_content, image_tags = self._process_binary_content(
            file_data,
            image_processor,
            processed_images
        )

        # Filter valid image tags
        valid_tags = [
            tag for tag in image_tags
            if tag and tag.strip() and '/uploads/.' not in tag
        ]

        return PreprocessedData(
            raw_content=file_data,
            clean_content=clean_content,
            encoding=detected_encoding,
            extracted_resources={
                "image_tags": valid_tags,
            }
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "RTF Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when ``data`` is bytes starting with the RTF magic."""
        if isinstance(data, bytes):
            if len(data) < 5:
                return False
            return data[:5] == self.RTF_MAGIC
        return False

    def _process_binary_content(
        self,
        content: bytes,
        image_processor: Optional[ImageProcessor],
        processed_images: Set[str]
    ) -> Tuple[bytes, List[str]]:
        """
        Process RTF binary content.

        Args:
            content: RTF binary content
            image_processor: Image processor instance
            processed_images: Set of processed image hashes (updated in place)

        Returns:
            Tuple of (clean_content, list of image tags)
        """
        # Region start offset -> replacement text ("" means "just remove").
        image_tags: Dict[int, str] = {}

        # Find \bin tag regions
        bin_regions = self._find_bin_regions(content)

        # Find \pict regions (excluding bin regions)
        pict_regions = self._find_pict_regions(content, bin_regions)

        all_regions = sorted(bin_regions + pict_regions, key=lambda r: r.start_pos)

        for region in all_regions:
            if not region.image_data:
                continue

            # The hash is only used to spot repeated images.
            image_hash = hashlib.md5(region.image_data).hexdigest()
            if image_hash in processed_images:
                image_tags[region.start_pos] = ""
                continue

            processed_images.add(image_hash)

            tag = None
            if region.image_format in SUPPORTED_IMAGE_FORMATS and image_processor:
                tag = image_processor.save_image(region.image_data)

            if tag:
                image_tags[region.start_pos] = f"\n{tag}\n"
                logger.info(
                    "Saved RTF image: %s (format=%s, size=%s)",
                    tag, region.image_format, region.data_size,
                )
            else:
                image_tags[region.start_pos] = ""

        # Remove binary data from content
        clean_content = self._remove_binary_data(content, all_regions, image_tags)

        # Collect all image tags as list
        tag_list = [tag for tag in image_tags.values() if tag and tag.strip()]

        return clean_content, tag_list

    def _find_bin_regions(self, content: bytes) -> List[RTFBinaryRegion]:
        """
        Find \\binN tags and identify binary regions.

        Each region covers the tag and its N payload bytes. When a
        ``\\shppict`` is found shortly before the tag, the region is widened
        to the group that starts at the brace preceding it.

        The returned regions never overlap: a ``\\binN`` match that lies
        inside a region already recorded (for example bytes of an earlier
        payload that happen to spell ``\\bin1``) is ignored, and the search
        for ``\\shppict`` never reaches back into an earlier region.
        """
        regions: List[RTFBinaryRegion] = []
        covered_until = 0  # end offset of the last recorded region

        for match in re.finditer(rb'\\bin(\d+)', content):
            bin_tag_start = match.start()
            if bin_tag_start < covered_until:
                continue

            try:
                bin_size = int(match.group(1))
            except ValueError:
                continue

            data_start = match.end()
            # A single space after the tag is a delimiter, not payload.
            if content[data_start:data_start + 1] == b' ':
                data_start += 1

            data_end = data_start + bin_size
            if data_end > len(content):
                # Declared size runs past the end of the data: not usable.
                continue

            binary_data = content[data_start:data_end]
            image_format = self._detect_image_format(binary_data)

            group_start = bin_tag_start
            group_end = data_end

            # Find parent \shppict group
            search_start = max(covered_until, bin_tag_start - self._SHPPICT_LOOKBACK)
            shppict_pos = content.rfind(b'\\shppict', search_start, bin_tag_start)
            if shppict_pos != -1:
                brace_pos = shppict_pos
                while brace_pos > covered_until and content[brace_pos:brace_pos + 1] != b'{':
                    brace_pos -= 1
                group_start = brace_pos

                # Walk forward from the payload end to the brace that
                # closes the group (a nesting depth of one is assumed).
                depth = 1
                j = data_end
                while j < len(content) and depth > 0:
                    byte = content[j:j + 1]
                    if byte == b'{':
                        depth += 1
                    elif byte == b'}':
                        depth -= 1
                    j += 1
                group_end = j

            regions.append(RTFBinaryRegion(
                start_pos=group_start,
                end_pos=group_end,
                bin_type="bin",
                data_size=bin_size,
                image_format=image_format,
                image_data=binary_data
            ))
            covered_until = group_end

        return regions

    @staticmethod
    def _read_pict_hex(text: str, pos: int) -> Tuple[str, int]:
        """
        Collect the hex digits of a \\pict payload starting at ``pos``.

        Whitespace is ignored and control words are skipped. Reading stops
        at a closing brace, at any other unexpected character, or at a
        ``\\bin`` tag (in which case the collected digits are discarded).

        Returns:
            Tuple of (hex digit string, index where reading stopped).
        """
        digits: List[str] = []
        length = len(text)
        i = pos

        while i < length:
            ch = text[i]
            if ch in '0123456789abcdefABCDEF':
                digits.append(ch)
            elif ch in ' \t\r\n':
                pass
            elif ch == '\\':
                if text[i:i + 4] == '\\bin':
                    # Raw binary follows; it is handled as a bin region.
                    return '', i
                while i < length and text[i] not in ' \t\r\n}':
                    i += 1
                continue
            else:
                # '}' ends the group; anything else is not hex data.
                break
            i += 1

        return ''.join(digits), i

    def _find_pict_regions(
        self,
        content: bytes,
        exclude_regions: List[RTFBinaryRegion]
    ) -> List[RTFBinaryRegion]:
        """
        Find hex-encoded \\pict regions.

        Args:
            content: RTF binary content
            exclude_regions: Regions already claimed (the \\bin regions);
                a \\pict that starts inside one of them, or shortly before
                one, is skipped.

        Returns:
            Regions running from the ``\\pict`` keyword to the end of its hex
            data. Errors are logged and end the search; regions found up
            to that point are still returned.
        """
        regions: List[RTFBinaryRegion] = []

        bin_tag_positions = {r.start_pos for r in exclude_regions if r.bin_type == "bin"}
        excluded_ranges = [(r.start_pos, r.end_pos) for r in exclude_regions]

        def is_excluded(pos: int) -> bool:
            return any(start <= pos < end for start, end in excluded_ranges)

        def has_bin_nearby(pict_pos: int) -> bool:
            return any(
                pict_pos < bp < pict_pos + self._BIN_PROXIMITY
                for bp in bin_tag_positions
            )

        try:
            # cp1252 with errors='replace' yields one character per byte,
            # so character indices equal byte offsets in ``content``.
            text_content = content.decode('cp1252', errors='replace')
            pict_pattern = r'\\pict\s*((?:\\[a-zA-Z]+\d*\s*)*)'

            for match in re.finditer(pict_pattern, text_content):
                start_pos = match.start()

                if is_excluded(start_pos) or has_bin_nearby(start_pos):
                    continue

                attrs = match.group(1)
                image_format = ""
                for rtf_type, fmt in RTF_IMAGE_TYPES.items():
                    if rtf_type in attrs:
                        image_format = fmt
                        break

                # Extract hex data
                hex_str, end_pos = self._read_pict_hex(text_content, match.end())
                if len(hex_str) < self._MIN_HEX_LENGTH:
                    continue

                try:
                    image_data = bytes.fromhex(hex_str)
                except ValueError:
                    # e.g. an odd number of hex digits
                    continue

                if not image_format:
                    image_format = self._detect_image_format(image_data)

                if image_format:
                    regions.append(RTFBinaryRegion(
                        start_pos=start_pos,
                        end_pos=end_pos,
                        bin_type="pict",
                        data_size=len(image_data),
                        image_format=image_format,
                        image_data=image_data
                    ))
        except Exception as e:
            # Best effort: image extraction must not abort preprocessing.
            logger.warning("Error finding pict regions: %s", e)

        return regions

    def _detect_image_format(self, data: bytes) -> str:
        """
        Detect image format from binary data.

        Returns:
            A format name from IMAGE_SIGNATURES, or "" when the data is
            shorter than four bytes or no signature matches.
        """
        if not data or len(data) < 4:
            return ""

        for signature, format_name in IMAGE_SIGNATURES.items():
            if data.startswith(signature):
                return format_name

        # Looser JPEG check: only the two-byte start-of-image marker.
        if data[0:2] == b'\xff\xd8':
            return 'jpeg'

        return ""

    def _remove_binary_data(
        self,
        content: bytes,
        regions: List[RTFBinaryRegion],
        image_tags: Dict[int, str]
    ) -> bytes:
        """
        Remove binary data regions from content.

        Args:
            content: Original RTF bytes
            regions: Regions to cut out (offsets refer to ``content``)
            image_tags: Region start offset -> text inserted in place of the
                region; missing or empty entries insert nothing

        Returns:
            ``content`` with every region replaced. The output is assembled
            in one forward pass over the original offsets. A region that
            starts inside an earlier one is skipped, because splicing it
            with stale offsets would remove unrelated bytes.
        """
        if not regions:
            return content

        pieces: List[bytes] = []
        cursor = 0

        for region in sorted(regions, key=lambda r: r.start_pos):
            if region.start_pos < cursor:
                continue

            pieces.append(content[cursor:region.start_pos])

            tag = image_tags.get(region.start_pos)
            if tag:
                pieces.append(tag.encode('ascii', errors='replace'))

            cursor = max(region.start_pos, region.end_pos)

        pieces.append(content[cursor:])
        return b''.join(pieces)
424
+
425
+
426
+ __all__ = ['RTFPreprocessor', 'RTFBinaryRegion']
@@ -0,0 +1,91 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py
2
+ """
3
+ RTF Region Finder
4
+
5
+ Functions for finding excluded regions (header, footer, footnote, etc.) in RTF.
6
+ """
7
+ import re
8
+ from typing import List, Tuple
9
+
10
+
11
def find_excluded_regions(content: str) -> List[Tuple[int, int]]:
    """
    Find regions to exclude from content extraction.

    Locates the groups that belong to the ``\\header``, ``\\headerl``,
    ``\\headerr``, ``\\headerf``, ``\\footer`` (same variants),
    ``\\footnote`` and ``\\annotation`` control words.

    For each keyword the region is determined as follows:
    - ``{\\keyword ...}`` (keyword first in its group, optionally after
      ``\\*``): the whole group, from its opening to its closing brace.
    - ``\\keyword{...}`` (the first brace after the keyword opens a group):
      from the keyword to the end of that group.
    - otherwise no region is produced for that keyword.

    A backslash and the character after it are treated as one unit, so the
    escaped braces ``\\{`` and ``\\}`` do not affect nesting. Groups that
    are never closed produce no region.

    Args:
        content: RTF content string

    Returns:
        List of (start, end) position tuples with ``end`` exclusive, sorted
        by start and with overlapping regions merged
    """
    keyword_re = re.compile(
        r'\\(?:header[lrf]?|footer[lrf]?|footnote|annotation)\b'
    )
    whitespace = ' \t\r\n'
    length = len(content)

    def is_escaped(pos: int) -> bool:
        """True when content[pos] follows an odd number of backslashes."""
        count = 0
        k = pos - 1
        while k >= 0 and content[k] == '\\':
            count += 1
            k -= 1
        return count % 2 == 1

    def group_open_before(keyword_pos: int) -> int:
        """Index of the brace that the keyword directly follows, or -1."""
        k = keyword_pos - 1
        while k >= 0 and content[k] in whitespace:
            k -= 1
        # Step over an optional "\*" in front of the keyword.
        if k >= 1 and content[k] == '*' and content[k - 1] == '\\':
            k -= 2
            while k >= 0 and content[k] in whitespace:
                k -= 1
        if k >= 0 and content[k] == '{' and not is_escaped(k):
            return k
        return -1

    def next_brace(pos: int) -> int:
        """Index of the first unescaped brace at or after pos, or -1."""
        while pos < length:
            ch = content[pos]
            if ch == '\\':
                pos += 2
                continue
            if ch in '{}':
                return pos
            pos += 1
        return -1

    def group_end(open_pos: int) -> int:
        """Index just past the brace closing the group at open_pos, or -1."""
        depth = 0
        pos = open_pos
        while pos < length:
            ch = content[pos]
            if ch == '\\':
                pos += 2
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return pos + 1
            pos += 1
        return -1

    regions: List[Tuple[int, int]] = []

    for match in keyword_re.finditer(content):
        keyword_pos = match.start()
        if is_escaped(keyword_pos):
            # "\\header" is a literal backslash followed by plain text.
            continue

        open_pos = group_open_before(keyword_pos)
        if open_pos != -1:
            start = open_pos
        else:
            open_pos = next_brace(match.end())
            if open_pos == -1 or content[open_pos] != '{':
                continue
            start = keyword_pos

        end = group_end(open_pos)
        if end != -1:
            regions.append((start, end))

    if not regions:
        return regions

    # Merge overlapping regions
    regions.sort(key=lambda region: region[0])
    merged = [regions[0]]
    for start, end in regions[1:]:
        last_start, last_end = merged[-1]
        if start <= last_end:
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged
69
+
70
+
71
def is_in_excluded_region(position: int, regions: List[Tuple[int, int]]) -> bool:
    """
    Tell whether a position falls inside any excluded region.

    Args:
        position: Position to check
        regions: List of (start, end) tuples; start is inclusive and end
            is exclusive

    Returns:
        True if position is in an excluded region
    """
    return any(lower <= position < upper for lower, upper in regions)
86
+
87
+
88
+ __all__ = [
89
+ 'find_excluded_regions',
90
+ 'is_in_excluded_region',
91
+ ]