xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,389 @@
# xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py
"""
RTF Text Cleaner

Functions for removing RTF control codes and cleaning text.
"""
import re
from typing import List

from xgen_doc2chunk.core.processor.rtf_helper.rtf_constants import (
    SHAPE_PROPERTY_NAMES,
    SKIP_DESTINATIONS,
    IMAGE_DESTINATIONS,
)
from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
    decode_bytes,
)


def clean_rtf_text(text: str, encoding: str = "cp949") -> str:
    """
    Remove RTF control codes and extract pure text.

    Uses token-based parsing to prevent content loss.

    Args:
        text: RTF text
        encoding: Encoding for decoding

    Returns:
        Cleaned text
    """
    if not text:
        return ""

    # Protect image tags (replace with temporary markers)
    image_tags = []
    def save_image_tag(m):
        image_tags.append(m.group())
        return f'\x00IMG{len(image_tags)-1}\x00'

    text = re.sub(r'\[image:[^\]]+\]', save_image_tag, text)

    # Remove shape properties
    text = re.sub(r'\{\\sp\{\\sn\s*\w+\}\{\\sv\s*[^}]*\}\}', '', text)
    text = re.sub(r'shapeType\d+[a-zA-Z0-9]+(?:posrelh\d+posrelv\d+)?', '', text)
    text = re.sub(r'\\shp(?:inst|txt|left|right|top|bottom|bx\w+|by\w+|wr\d+|fblwtxt\d+|z\d+|lid\d+)\b\d*', '', text)

    result = []
    i = 0
    n = len(text)

    while i < n:
        ch = text[i]

        # Restore image tag markers
        if ch == '\x00' and i + 3 < n and text[i+1:i+4] == 'IMG':
            end_idx = text.find('\x00', i + 4)
            if end_idx != -1:
                try:
                    tag_idx = int(text[i+4:end_idx])
                    result.append(image_tags[tag_idx])
                    i = end_idx + 1
                    continue
                except (ValueError, IndexError):
                    pass

        if ch == '\\':
            if i + 1 < n:
                next_ch = text[i + 1]

                # Special escapes
                if next_ch == '\\':
                    result.append('\\')
                    i += 2
                    continue
                elif next_ch == '{':
                    result.append('{')
                    i += 2
                    continue
                elif next_ch == '}':
                    result.append('}')
                    i += 2
                    continue
                elif next_ch == '~':
                    result.append('\u00A0')  # non-breaking space
                    i += 2
                    continue
                elif next_ch == '-':
                    result.append('\u00AD')  # soft hyphen
                    i += 2
                    continue
                elif next_ch == '_':
                    result.append('\u2011')  # non-breaking hyphen
                    i += 2
                    continue
                elif next_ch == "'":
                    # Hex escape \'XX
                    if i + 3 < n:
                        try:
                            hex_val = text[i+2:i+4]
                            byte_val = int(hex_val, 16)
                            try:
                                result.append(bytes([byte_val]).decode(encoding))
                            except:
                                try:
                                    result.append(bytes([byte_val]).decode('cp1252'))
                                except:
                                    pass
                            i += 4
                            continue
                        except (ValueError, IndexError):
                            pass
                    i += 1
                    continue
                elif next_ch == '*':
                    # \* destination marker, skip
                    i += 2
                    continue
                elif next_ch.isalpha():
                    # Control word: \word[N][delimiter]
                    j = i + 1
                    while j < n and text[j].isalpha():
                        j += 1

                    control_word = text[i+1:j]

                    # Skip numeric parameter
                    while j < n and (text[j].isdigit() or text[j] == '-'):
                        j += 1

                    # Handle delimiter (space is part of control word)
                    if j < n and text[j] == ' ':
                        j += 1

                    # Special control words
                    if control_word in ('par', 'line'):
                        result.append('\n')
                    elif control_word == 'tab':
                        result.append('\t')
                    elif control_word == 'u':
                        # Unicode: \uN?
                        um = re.match(r'\\u(-?\d+)\??', text[i:])
                        if um:
                            try:
                                code = int(um.group(1))
                                if code < 0:
                                    code += 65536
                                result.append(chr(code))
                            except:
                                pass
                            j = i + um.end()

                    i = j
                    continue

            i += 1
        elif ch == '{' or ch == '}':
            i += 1
        elif ch == '\r' or ch == '\n':
            i += 1
        else:
            result.append(ch)
            i += 1

    text_result = ''.join(result)

    # Remove shape property names
    shape_name_pattern = r'\b(' + '|'.join(SHAPE_PROPERTY_NAMES) + r')\b'
    text_result = re.sub(shape_name_pattern, '', text_result)

    # Remove garbage numbers
    text_result = re.sub(r'\s*-\d+\s*', ' ', text_result)

    # Remove hex data outside image tags
    text_result = _remove_hex_outside_image_tags(text_result)

    # Normalize whitespace
    text_result = re.sub(r'\s+', ' ', text_result)

    return text_result.strip()


def _remove_hex_outside_image_tags(text: str) -> str:
    """Remove long hex strings outside image tags."""
    protected_ranges = []
    for m in re.finditer(r'\[image:[^\]]+\]', text):
        protected_ranges.append((m.start(), m.end()))

    if not protected_ranges:
        return re.sub(r'(?<![a-zA-Z])[0-9a-fA-F]{32,}(?![a-zA-Z])', '', text)

    result = []
    last_end = 0
    for start, end in protected_ranges:
        before = text[last_end:start]
        before = re.sub(r'(?<![a-zA-Z])[0-9a-fA-F]{32,}(?![a-zA-Z])', '', before)
        result.append(before)
        result.append(text[start:end])
        last_end = end

    after = text[last_end:]
    after = re.sub(r'(?<![a-zA-Z])[0-9a-fA-F]{32,}(?![a-zA-Z])', '', after)
    result.append(after)
    return ''.join(result)


def remove_destination_groups(content: str) -> str:
    """
    Remove RTF destination groups {\\*\\destination...}.

    Removes themedata, colorschememapping, latentstyles, datastore, etc.
    to prevent metadata from being extracted as text.

    Args:
        content: RTF content

    Returns:
        Content with destination groups removed
    """
    result = []
    i = 0
    n = len(content)

    while i < n:
        if content[i:i+3] == '{\\*':
            j = i + 3
            while j < n and content[j] in ' \t\r\n':
                j += 1

            if j < n and content[j] == '\\':
                k = j + 1
                while k < n and content[k].isalpha():
                    k += 1
                ctrl_word = content[j+1:k]

                if ctrl_word in SKIP_DESTINATIONS:
                    depth = 1
                    i += 1
                    while i < n and depth > 0:
                        if content[i] == '{':
                            depth += 1
                        elif content[i] == '}':
                            depth -= 1
                        i += 1
                    continue

                if ctrl_word in IMAGE_DESTINATIONS:
                    depth = 1
                    group_start = i
                    i += 1
                    while i < n and depth > 0:
                        if content[i] == '{':
                            depth += 1
                        elif content[i] == '}':
                            depth -= 1
                        i += 1

                    group_content = content[group_start:i]
                    image_tag_match = re.search(r'\[image:[^\]]+\]', group_content)
                    if image_tag_match:
                        tag = image_tag_match.group()
                        if '/uploads/.' not in tag and 'uploads/.' not in tag:
                            result.append(tag)
                    continue

        result.append(content[i])
        i += 1

    return ''.join(result)


def remove_shape_groups(content: str) -> str:
    """
    Remove shape groups but preserve text in shptxt.

    RTF Shape structure:
    {\\shp{\\*\\shpinst...{\\sp{\\sn xxx}{\\sv yyy}}...{\\shptxt actual_text}}}

    Args:
        content: RTF content

    Returns:
        Content with shape groups cleaned
    """
    result = []
    i = 0

    while i < len(content):
        if content[i:i+5] == '{\\shp' or content[i:i+10] == '{\\*\\shpinst':
            depth = 1
            i += 1
            shptxt_content = []
            in_shptxt = False
            shptxt_depth = 0

            while i < len(content) and depth > 0:
                if content[i] == '{':
                    if content[i:i+8] == '{\\shptxt':
                        in_shptxt = True
                        shptxt_depth = depth + 1
                        i += 8
                        continue
                    depth += 1
                elif content[i] == '}':
                    if in_shptxt and depth == shptxt_depth:
                        in_shptxt = False
                    depth -= 1
                elif in_shptxt:
                    shptxt_content.append(content[i])
                i += 1

            if shptxt_content:
                result.append(''.join(shptxt_content))
        else:
            result.append(content[i])
            i += 1

    return ''.join(result)


def remove_shape_property_groups(content: str) -> str:
    """
    Remove shape property groups {\\sp{\\sn xxx}{\\sv yyy}}.

    Args:
        content: RTF content

    Returns:
        Content with shape properties removed
    """
    content = re.sub(r'\{\\sp\{\\sn\s*[^}]*\}\{\\sv\s*[^}]*\}\}', '', content)
    content = re.sub(r'\{\\sp\s*[^}]*\}', '', content)
    content = re.sub(r'\{\\sn\s*[^}]*\}', '', content)
    content = re.sub(r'\{\\sv\s*[^}]*\}', '', content)
    return content


def remove_shprslt_blocks(content: str) -> str:
    """
    Remove \\shprslt{...} blocks.

    Word saves Shape (drawing/table) in \\shp block and duplicates
    the same content in \\shprslt block for backward compatibility.

    Args:
        content: RTF content

    Returns:
        Content with \\shprslt blocks removed
    """
    result = []
    i = 0
    pattern = '\\shprslt'

    while i < len(content):
        idx = content.find(pattern, i)
        if idx == -1:
            result.append(content[i:])
            break

        result.append(content[i:idx])

        brace_start = content.find('{', idx)
        if brace_start == -1:
            i = idx + len(pattern)
            continue

        depth = 1
        j = brace_start + 1
        while j < len(content) and depth > 0:
            if content[j] == '{':
                depth += 1
            elif content[j] == '}':
                depth -= 1
            j += 1

        i = j

    return ''.join(result)


__all__ = [
    'clean_rtf_text',
    'remove_destination_groups',
    'remove_shape_groups',
    'remove_shape_property_groups',
    'remove_shprslt_blocks',
]
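For orientation, the cleaners above are standalone functions, so one plausible way to chain them on raw RTF content can be sketched directly. The ordering below is an assumption, not part of the package: the actual call order lives in rtf_handler.py / rtf_content_extractor.py, which are not reproduced in this section.

# Illustrative sketch only; the real call order is defined in rtf_handler.py (assumption).
from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
    clean_rtf_text,
    remove_destination_groups,
    remove_shape_groups,
    remove_shape_property_groups,
    remove_shprslt_blocks,
)

def cleanup_rtf(rtf: str, encoding: str = "cp949") -> str:
    rtf = remove_shprslt_blocks(rtf)          # drop duplicated \shprslt renderings
    rtf = remove_destination_groups(rtf)      # drop {\*\themedata ...} style metadata groups
    rtf = remove_shape_groups(rtf)            # keep only the \shptxt text from shapes
    rtf = remove_shape_property_groups(rtf)   # drop leftover {\sp{\sn ...}{\sv ...}} pairs
    return clean_rtf_text(rtf, encoding=encoding)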
@@ -0,0 +1,95 @@
# xgen_doc2chunk/core/processor/text_handler.py
"""
Text Handler - Text File Processor

Class-based handler for text files inheriting from BaseHandler.
"""
import logging
from typing import List, Optional, TYPE_CHECKING

from xgen_doc2chunk.core.processor.base_handler import BaseHandler
from xgen_doc2chunk.core.functions.utils import clean_text, clean_code_text
from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
from xgen_doc2chunk.core.processor.text_helper.text_image_processor import TextImageProcessor
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor

if TYPE_CHECKING:
    from xgen_doc2chunk.core.document_processor import CurrentFile

logger = logging.getLogger("document-processor")


DEFAULT_ENCODINGS = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1', 'ascii']


class TextHandler(BaseHandler):
    """Text File Processing Handler Class"""

    def _create_file_converter(self):
        """Create text-specific file converter."""
        from xgen_doc2chunk.core.processor.text_helper.text_file_converter import TextFileConverter
        return TextFileConverter()

    def _create_preprocessor(self):
        """Create text-specific preprocessor."""
        from xgen_doc2chunk.core.processor.text_helper.text_preprocessor import TextPreprocessor
        return TextPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Text files do not contain charts. Return NullChartExtractor."""
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Text files do not have embedded metadata. Return None (uses NullMetadataExtractor)."""
        return None

    def _create_format_image_processor(self) -> ImageProcessor:
        """Create text-specific image processor."""
        return TextImageProcessor()

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        file_type: Optional[str] = None,
        encodings: Optional[List[str]] = None,
        is_code: bool = False,
        **kwargs
    ) -> str:
        """
        Extract text from text file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata (ignored for text files)
            file_type: File type (extension)
            encodings: List of encodings to try
            is_code: Whether this is a code file
            **kwargs: Additional options

        Returns:
            Extracted text
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")
        enc = encodings or DEFAULT_ENCODINGS

        # Step 1: No file_converter for text files (direct decode)
        # Step 2: Preprocess - clean_content is the TRUE SOURCE
        preprocessed = self.preprocess(file_data)
        file_data = preprocessed.clean_content  # TRUE SOURCE

        for e in enc:
            try:
                text = file_data.decode(e)
                self.logger.info(f"Successfully decoded {file_path} with {e} encoding")
                return clean_code_text(text) if is_code else clean_text(text)
            except UnicodeDecodeError:
                self.logger.debug(f"Failed to decode {file_path} with {e}, trying next...")
                continue
            except Exception as ex:
                self.logger.error(f"Error decoding file {file_path} with {e}: {ex}")
                continue

        raise Exception(f"Could not decode file {file_path} with any supported encoding")
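For orientation, extract_text only reads the file_path and file_data keys of CurrentFile, so a minimal usage sketch looks like the following. It assumes TextHandler can be constructed without arguments; the BaseHandler constructor is not shown in this diff.

# Hypothetical usage sketch; construction details are an assumption.
from xgen_doc2chunk.core.processor.text_handler import TextHandler

handler = TextHandler()  # assumption: a no-argument construction path exists

current_file = {
    "file_path": "notes.txt",
    "file_data": "안녕하세요 hello".encode("cp949"),  # cp949 bytes; the utf-8 attempts fail first
}

# Tries utf-8, utf-8-sig, cp949, euc-kr, latin-1, ascii in order, then cleans the text.
text = handler.extract_text(current_file, is_code=False)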
@@ -0,0 +1,17 @@
# xgen_doc2chunk/core/processor/text_helper/__init__.py
"""
Text Helper module

Provides utilities needed for text file processing.

Module contents:
- text_image_processor: Image processor for text files
"""

from xgen_doc2chunk.core.processor.text_helper.text_image_processor import (
    TextImageProcessor,
)

__all__ = [
    "TextImageProcessor",
]
@@ -0,0 +1,28 @@
# xgen_doc2chunk/core/processor/text_helper/text_file_converter.py
"""
TextFileConverter - Text file format converter

Converts binary text data to string with encoding detection.
"""
from typing import Optional, BinaryIO

from xgen_doc2chunk.core.functions.file_converter import TextFileConverter as BaseTextFileConverter


class TextFileConverter(BaseTextFileConverter):
    """
    Text file converter.

    Converts binary text data to decoded string.
    Inherits from base TextFileConverter.
    """

    def __init__(self):
        """Initialize with common text encodings."""
        super().__init__(encodings=['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1', 'ascii'])

    def get_format_name(self) -> str:
        """Return format name."""
        enc = self._detected_encoding or 'unknown'
        return f"Text File ({enc})"
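Since the subclass only forwards an encoding priority list to the base converter, an alternative priority is just another subclass. A hypothetical variant, assuming only the base-class contract visible above (an encodings keyword argument and a _detected_encoding attribute):

from xgen_doc2chunk.core.functions.file_converter import TextFileConverter as BaseTextFileConverter

class JapaneseTextFileConverter(BaseTextFileConverter):
    """Hypothetical converter that prefers Japanese encodings (illustrative only)."""

    def __init__(self):
        super().__init__(encodings=['utf-8', 'shift_jis', 'euc_jp', 'latin-1'])

    def get_format_name(self) -> str:
        return f"Text File ({self._detected_encoding or 'unknown'})"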
@@ -0,0 +1,75 @@
# xgen_doc2chunk/core/processor/text_helper/text_image_processor.py
"""
Text Image Processor

Provides text-specific image processing that inherits from ImageProcessor.
Text files do not contain embedded images, so this is a minimal implementation.
"""
import logging
from typing import Any, Optional

from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend

logger = logging.getLogger("xgen_doc2chunk.image_processor.text")


class TextImageProcessor(ImageProcessor):
    """
    Text-specific image processor.

    Inherits from ImageProcessor and provides text-specific processing.
    Text files do not contain embedded images, so this processor
    provides a consistent interface without additional functionality.

    This class exists to maintain interface consistency across all handlers.

    Example:
        processor = TextImageProcessor()

        # No images in text files, but interface is consistent
        tag = processor.process_image(image_data)  # Falls back to base implementation
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize TextImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def process_image(
        self,
        image_data: bytes,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save image data.

        Text files do not contain embedded images, so this method
        delegates to the base implementation.

        Args:
            image_data: Raw image binary data
            **kwargs: Additional options

        Returns:
            Image tag string or None if processing failed
        """
        return super().process_image(image_data, **kwargs)
@@ -0,0 +1,82 @@
# xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py
"""
Text Preprocessor - Process text content after conversion.

Processing Pipeline Position:
1. TextFileConverter.convert() → str
2. TextPreprocessor.preprocess() → PreprocessedData (THIS STEP)
3. TextMetadataExtractor.extract() → DocumentMetadata (if any)
4. Content extraction

Current Implementation:
- Pass-through (Text uses decoded string content directly)
"""
import logging
from typing import Any, Dict

from xgen_doc2chunk.core.functions.preprocessor import (
    BasePreprocessor,
    PreprocessedData,
)

logger = logging.getLogger("xgen_doc2chunk.text.preprocessor")


class TextPreprocessor(BasePreprocessor):
    """
    Text Content Preprocessor.

    Currently a pass-through implementation as text processing
    is straightforward.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess the converted text content.

        Args:
            converted_data: Text string from TextFileConverter
            **kwargs: Additional options

        Returns:
            PreprocessedData with the content
        """
        metadata: Dict[str, Any] = {}

        content = ""
        encoding = kwargs.get("encoding", "utf-8")

        if isinstance(converted_data, str):
            content = converted_data
            metadata['char_count'] = len(content)
            metadata['line_count'] = len(content.split('\n'))
        elif isinstance(converted_data, bytes):
            content = converted_data.decode(encoding, errors='replace')
            metadata['char_count'] = len(content)
            metadata['line_count'] = len(content.split('\n'))

        logger.debug("Text preprocessor: pass-through, metadata=%s", metadata)

        # clean_content is the TRUE SOURCE - contains the processed text/bytes
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,  # TRUE SOURCE - bytes or str
            encoding=encoding,
            extracted_resources={},
            metadata=metadata,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "Text Preprocessor"

    def validate(self, data: Any) -> bool:
        """Validate if data is text content."""
        return isinstance(data, (str, bytes))


__all__ = ['TextPreprocessor']
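As a rough usage sketch of the pass-through behaviour and the metadata counters: field names follow the PreprocessedData construction above, and attribute access mirrors text_handler.py, which reads preprocessed.clean_content.

# Minimal sketch, assuming PreprocessedData exposes its constructor fields as attributes.
from xgen_doc2chunk.core.processor.text_helper.text_preprocessor import TextPreprocessor

pre = TextPreprocessor()

result = pre.preprocess("line one\nline two")
print(result.metadata)       # {'char_count': 17, 'line_count': 2}
print(result.clean_content)  # identical to the input string (pass-through)

# Bytes input is decoded only for the counters; clean_content stays as the original bytes.
result = pre.preprocess(b"raw bytes", encoding="utf-8")
print(result.metadata['line_count'])  # 1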