xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,389 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py
2
+ """
3
+ RTF Text Cleaner
4
+
5
+ Functions for removing RTF control codes and cleaning text.
6
+ """
7
+ import re
8
+ from typing import List
9
+
10
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_constants import (
11
+ SHAPE_PROPERTY_NAMES,
12
+ SKIP_DESTINATIONS,
13
+ IMAGE_DESTINATIONS,
14
+ )
15
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
16
+ decode_bytes,
17
+ )
18
+
19
+
20
def clean_rtf_text(text: str, encoding: str = "cp949") -> str:
    """
    Remove RTF control codes and extract pure text.

    The input is scanned token by token: control symbols, control words,
    hex escapes and Unicode escapes are interpreted, group braces and raw
    line breaks are dropped, and everything else is kept as text.

    Consecutive ``\\'XX`` hex escapes are collected and decoded together so
    that multi-byte characters (e.g. two-byte cp949 sequences) decode
    correctly instead of being decoded one byte at a time.

    Args:
        text: RTF text
        encoding: Encoding used to decode ``\\'XX`` hex escapes

    Returns:
        Cleaned text with whitespace collapsed to single spaces
    """
    if not text:
        return ""

    # Protect image tags (replace with temporary markers) so the cleanup
    # regexes below cannot damage them.
    image_tags = []

    def save_image_tag(m):
        image_tags.append(m.group())
        return f'\x00IMG{len(image_tags)-1}\x00'

    text = re.sub(r'\[image:[^\]]+\]', save_image_tag, text)

    # Remove shape properties
    text = re.sub(r'\{\\sp\{\\sn\s*\w+\}\{\\sv\s*[^}]*\}\}', '', text)
    text = re.sub(r'shapeType\d+[a-zA-Z0-9]+(?:posrelh\d+posrelv\d+)?', '', text)
    text = re.sub(r'\\shp(?:inst|txt|left|right|top|bottom|bx\w+|by\w+|wr\d+|fblwtxt\d+|z\d+|lid\d+)\b\d*', '', text)

    result = []
    i = 0
    n = len(text)

    while i < n:
        ch = text[i]

        # Restore image tag markers
        if ch == '\x00' and i + 3 < n and text[i+1:i+4] == 'IMG':
            end_idx = text.find('\x00', i + 4)
            if end_idx != -1:
                try:
                    tag_idx = int(text[i+4:end_idx])
                    result.append(image_tags[tag_idx])
                    i = end_idx + 1
                    continue
                except (ValueError, IndexError):
                    # Not one of our markers: fall through and treat the
                    # character as ordinary text.
                    pass

        if ch == '\\':
            if i + 1 >= n:
                # Lone backslash at the very end of the input: drop it.
                i += 1
                continue

            next_ch = text[i + 1]

            # Control symbols that stand for a single literal character.
            literal = _SIMPLE_ESCAPES.get(next_ch)
            if literal is not None:
                result.append(literal)
                i += 2
                continue

            if next_ch == "'":
                # Hex escape \'XX
                first = _parse_hex_escape(text, i)
                if first is None:
                    # Malformed escape: drop only the backslash and let the
                    # following characters be processed as text.
                    i += 1
                    continue

                raw = bytearray([first])
                i += 4
                # Gather the rest of the run. Raw CR/LF between escapes are
                # ignored (the main loop drops them anyway), so a line break
                # does not split a multi-byte character.
                while True:
                    k = i
                    while k < n and text[k] in '\r\n':
                        k += 1
                    following = _parse_hex_escape(text, k)
                    if following is None:
                        break
                    raw.append(following)
                    i = k + 4

                result.append(_decode_hex_run(bytes(raw), encoding))
                continue

            if next_ch == '*':
                # \* destination marker, skip
                i += 2
                continue

            if next_ch.isalpha():
                # Control word: \word[N][delimiter]
                j = i + 1
                while j < n and text[j].isalpha():
                    j += 1

                control_word = text[i+1:j]

                # Skip numeric parameter
                while j < n and (text[j].isdigit() or text[j] == '-'):
                    j += 1

                # Handle delimiter (a single space is part of the control word)
                if j < n and text[j] == ' ':
                    j += 1

                # Special control words
                if control_word in ('par', 'line'):
                    result.append('\n')
                elif control_word == 'tab':
                    result.append('\t')
                elif control_word == 'u':
                    # Unicode: \uN? -- matched in place (no slicing) so a
                    # document with many escapes is not copied repeatedly.
                    um = _UNICODE_ESCAPE_RE.match(text, i)
                    if um:
                        try:
                            code = int(um.group(1))
                            if code < 0:
                                # RTF stores code points above 32767 as
                                # negative 16-bit values.
                                code += 65536
                            result.append(chr(code))
                        except (ValueError, OverflowError):
                            # Code point outside the range chr() accepts.
                            pass
                        j = um.end()

                i = j
                continue

            # Any other control symbol: drop the backslash only.
            i += 1
        elif ch in '{}\r\n':
            # Group delimiters and raw line breaks carry no text.
            i += 1
        else:
            result.append(ch)
            i += 1

    text_result = ''.join(result)

    # Remove shape property names
    shape_name_pattern = r'\b(' + '|'.join(SHAPE_PROPERTY_NAMES) + r')\b'
    text_result = re.sub(shape_name_pattern, '', text_result)

    # Remove garbage numbers
    # NOTE(review): this also strips "-<digits>" from genuine text such as
    # dates ("2020-01-15") -- confirm that this heuristic is intended.
    text_result = re.sub(r'\s*-\d+\s*', ' ', text_result)

    # Remove hex data outside image tags
    text_result = _remove_hex_outside_image_tags(text_result)

    # Normalize whitespace
    text_result = re.sub(r'\s+', ' ', text_result)

    return text_result.strip()


# Control symbols that map directly to one output character.
_SIMPLE_ESCAPES = {
    '\\': '\\',
    '{': '{',
    '}': '}',
    '~': '\u00A0',  # non-breaking space
    '-': '\u00AD',  # soft hyphen
    '_': '\u2011',  # non-breaking hyphen
}

# \uN with an optional literal '?' fallback character.
_UNICODE_ESCAPE_RE = re.compile(r'\\u(-?\d+)\??')


def _parse_hex_escape(text, pos):
    """Return the byte value of a ``\\'XX`` escape starting at pos, or None."""
    if pos + 3 >= len(text) or text[pos] != '\\' or text[pos + 1] != "'":
        return None
    try:
        value = int(text[pos + 2:pos + 4], 16)
    except ValueError:
        return None
    return value if 0 <= value <= 255 else None


def _decode_single_byte(value, encoding):
    """Decode one byte with encoding, then cp1252; return '' if both fail."""
    single = bytes([value])
    for codec in (encoding, 'cp1252'):
        try:
            return single.decode(codec)
        except (UnicodeDecodeError, LookupError, TypeError):
            continue
    return ''


def _decode_hex_run(raw, encoding):
    """Decode bytes gathered from consecutive hex escapes.

    The whole run is decoded with encoding so multi-byte characters survive.
    Bytes the codec rejects are decoded individually (encoding, then cp1252,
    else dropped), which is what happens to every byte when the codec itself
    is unusable.
    """
    pieces = []
    pending = raw
    while pending:
        try:
            pieces.append(pending.decode(encoding))
            break
        except UnicodeDecodeError as err:
            # Keep the valid prefix, handle the offending bytes one by one,
            # then resume decoding after them.
            cut = max(err.end, err.start + 1)
            pieces.append(pending[:err.start].decode(encoding, errors='ignore'))
            pieces.extend(
                _decode_single_byte(value, encoding)
                for value in pending[err.start:cut]
            )
            pending = pending[cut:]
        except (LookupError, TypeError):
            # Unknown or invalid codec name: per-byte fallback for the rest.
            pieces.extend(_decode_single_byte(value, encoding) for value in pending)
            break
    return ''.join(pieces)
182
+
183
+
184
def _remove_hex_outside_image_tags(text: str) -> str:
    """Strip hex runs of 32+ characters everywhere except inside image tags.

    The text is split on ``[image:...]`` tags. Because the split pattern has
    a capturing group, the tags themselves land on the odd indices and are
    left untouched; only the even-indexed pieces between them are cleaned.
    """
    hex_run = re.compile(r'(?<![a-zA-Z])[0-9a-fA-F]{32,}(?![a-zA-Z])')
    segments = re.split(r'(\[image:[^\]]+\])', text)

    for position in range(0, len(segments), 2):
        segments[position] = hex_run.sub('', segments[position])

    return ''.join(segments)
206
+
207
+
208
def remove_destination_groups(content: str) -> str:
    """
    Strip ``{\\*\\destination ...}`` groups from RTF content.

    A group whose destination word is listed in SKIP_DESTINATIONS is removed
    entirely. A group whose destination word is listed in IMAGE_DESTINATIONS
    is replaced by the first ``[image:...]`` tag found inside it (or by
    nothing when there is no tag, or when the tag contains ``uploads/.``).
    All other content is copied through unchanged.

    Args:
        content: RTF content

    Returns:
        Content with destination groups removed
    """
    total = len(content)

    def group_end(start):
        """Return the index just past the brace closing the group at start."""
        level = 1
        cursor = start + 1
        while cursor < total and level > 0:
            symbol = content[cursor]
            if symbol == '{':
                level += 1
            elif symbol == '}':
                level -= 1
            cursor += 1
        return cursor

    pieces = []
    pos = 0

    while pos < total:
        if content.startswith('{\\*', pos):
            # Skip whitespace between "{\*" and the destination word.
            word_at = pos + 3
            while word_at < total and content[word_at] in ' \t\r\n':
                word_at += 1

            if word_at < total and content[word_at] == '\\':
                word_end = word_at + 1
                while word_end < total and content[word_end].isalpha():
                    word_end += 1
                destination = content[word_at + 1:word_end]

                if destination in SKIP_DESTINATIONS:
                    pos = group_end(pos)
                    continue

                if destination in IMAGE_DESTINATIONS:
                    closing = group_end(pos)
                    found = re.search(r'\[image:[^\]]+\]', content[pos:closing])
                    if found:
                        tag = found.group()
                        if not ('/uploads/.' in tag or 'uploads/.' in tag):
                            pieces.append(tag)
                    pos = closing
                    continue

        # Not a recognised destination group: keep the character as is.
        pieces.append(content[pos])
        pos += 1

    return ''.join(pieces)
271
+
272
+
273
def remove_shape_groups(content: str) -> str:
    """
    Remove shape groups but preserve text in shptxt.

    RTF Shape structure:
    {\\shp{\\*\\shpinst...{\\sp{\\sn xxx}{\\sv yyy}}...{\\shptxt actual_text}}}

    A group starting with ``{\\shp`` or ``{\\*\\shpinst`` is dropped up to
    its matching closing brace. Characters inside a ``{\\shptxt ...}``
    sub-group are kept, minus any braces; control words inside it are left
    in place for later cleaning.

    Brace depth is tracked for every opening brace, including the one that
    opens ``\\shptxt``, so the shape ends exactly at its own closing brace.

    Args:
        content: RTF content

    Returns:
        Content with shape groups cleaned
    """
    result = []
    i = 0
    n = len(content)

    while i < n:
        # startswith() avoids slice-length mistakes: "{\*\shpinst" is 11
        # characters long.
        if content.startswith('{\\shp', i) or content.startswith('{\\*\\shpinst', i):
            depth = 1
            i += 1
            shptxt_content = []
            in_shptxt = False
            shptxt_depth = 0

            while i < n and depth > 0:
                ch = content[i]
                if ch == '{':
                    depth += 1
                    if not in_shptxt and content.startswith('{\\shptxt', i):
                        in_shptxt = True
                        # Depth at which the shptxt group itself lives; its
                        # closing brace is seen while depth equals this.
                        shptxt_depth = depth
                        i += 8  # skip past "{\shptxt"
                        continue
                elif ch == '}':
                    if in_shptxt and depth == shptxt_depth:
                        in_shptxt = False
                    depth -= 1
                elif in_shptxt:
                    shptxt_content.append(ch)
                i += 1

            if shptxt_content:
                result.append(''.join(shptxt_content))
        else:
            result.append(content[i])
            i += 1

    return ''.join(result)
320
+
321
+
322
def remove_shape_property_groups(content: str) -> str:
    """
    Delete shape property groups such as {\\sp{\\sn xxx}{\\sv yyy}}.

    The patterns are applied in order: the complete name/value pair first,
    then any leftover ``{\\sp ...}``, ``{\\sn ...}`` and ``{\\sv ...}``
    fragments.

    Args:
        content: RTF content

    Returns:
        Content with shape properties removed
    """
    stripped = content
    for shape_pattern in (
        r'\{\\sp\{\\sn\s*[^}]*\}\{\\sv\s*[^}]*\}\}',
        r'\{\\sp\s*[^}]*\}',
        r'\{\\sn\s*[^}]*\}',
        r'\{\\sv\s*[^}]*\}',
    ):
        stripped = re.sub(shape_pattern, '', stripped)
    return stripped
337
+
338
+
339
def remove_shprslt_blocks(content: str) -> str:
    """
    Remove \\shprslt{...} blocks.

    Word saves Shape (drawing/table) in \\shp block and duplicates
    the same content in \\shprslt block for backward compatibility.

    For each ``\\shprslt`` occurrence, everything from the control word
    through the closing brace of the first group that follows it is
    removed. When the enclosing group closes before any nested group opens
    (or no brace follows at all), only the control word itself is removed,
    so unrelated groups later in the document are never swallowed.

    Args:
        content: RTF content

    Returns:
        Content with \\shprslt blocks removed
    """
    marker = '\\shprslt'
    total = len(content)
    kept = []
    pos = 0

    while pos < total:
        hit = content.find(marker, pos)
        if hit == -1:
            kept.append(content[pos:])
            break

        kept.append(content[pos:hit])

        open_at = content.find('{', hit)
        close_at = content.find('}', hit)
        # A '}' before the next '{' means the group holding \shprslt ends
        # without a nested block; the next '{' belongs to something else.
        if open_at == -1 or (close_at != -1 and close_at < open_at):
            pos = hit + len(marker)
            continue

        # Walk to the brace that closes the nested block.
        depth = 1
        cursor = open_at + 1
        while cursor < total and depth > 0:
            symbol = content[cursor]
            if symbol == '{':
                depth += 1
            elif symbol == '}':
                depth -= 1
            cursor += 1

        pos = cursor

    return ''.join(kept)
381
+
382
+
383
+ __all__ = [
384
+ 'clean_rtf_text',
385
+ 'remove_destination_groups',
386
+ 'remove_shape_groups',
387
+ 'remove_shape_property_groups',
388
+ 'remove_shprslt_blocks',
389
+ ]
@@ -0,0 +1,95 @@
1
+ # xgen_doc2chunk/core/processor/text_handler.py
2
+ """
3
+ Text Handler - Text File Processor
4
+
5
+ Class-based handler for text files inheriting from BaseHandler.
6
+ """
7
+ import logging
8
+ from typing import List, Optional, TYPE_CHECKING
9
+
10
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
11
+ from xgen_doc2chunk.core.functions.utils import clean_text, clean_code_text
12
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, NullChartExtractor
13
+ from xgen_doc2chunk.core.processor.text_helper.text_image_processor import TextImageProcessor
14
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
15
+
16
+ if TYPE_CHECKING:
17
+ from xgen_doc2chunk.core.document_processor import CurrentFile
18
+
19
+ logger = logging.getLogger("document-processor")
20
+
21
+
22
# Encodings tried in order when decoding a text file.
# 'utf-8-sig' must come before 'utf-8': plain 'utf-8' also accepts
# BOM-prefixed data but leaves U+FEFF at the start of the text, so listing it
# first made the BOM-stripping codec unreachable. 'latin-1' accepts every byte
# sequence and therefore acts as the last effective fallback.
DEFAULT_ENCODINGS = ['utf-8-sig', 'utf-8', 'cp949', 'euc-kr', 'latin-1', 'ascii']
23
+
24
+
25
class TextHandler(BaseHandler):
    """Text File Processing Handler Class"""

    def _create_file_converter(self):
        """Create text-specific file converter."""
        from xgen_doc2chunk.core.processor.text_helper.text_file_converter import TextFileConverter
        return TextFileConverter()

    def _create_preprocessor(self):
        """Create text-specific preprocessor."""
        from xgen_doc2chunk.core.processor.text_helper.text_preprocessor import TextPreprocessor
        return TextPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Text files do not contain charts. Return NullChartExtractor."""
        return NullChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Text files do not have embedded metadata. Return None (uses NullMetadataExtractor)."""
        return None

    def _create_format_image_processor(self) -> ImageProcessor:
        """Create text-specific image processor."""
        return TextImageProcessor()

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        file_type: Optional[str] = None,
        encodings: Optional[List[str]] = None,
        is_code: bool = False,
        **kwargs
    ) -> str:
        """
        Extract text from text file.

        The file data is passed through the preprocessor and its
        ``clean_content`` is used as the source. Bytes are decoded with the
        first encoding that succeeds; content that is already a ``str`` is
        cleaned directly without decoding.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata (ignored for text files)
            file_type: File type (extension)
            encodings: List of encodings to try (defaults to DEFAULT_ENCODINGS)
            is_code: Whether this is a code file (selects clean_code_text)
            **kwargs: Additional options

        Returns:
            Extracted text

        Raises:
            Exception: If no encoding could decode the content
        """
        file_path = current_file.get("file_path", "unknown")
        file_data = current_file.get("file_data", b"")
        enc = encodings or DEFAULT_ENCODINGS

        # Step 1: No file_converter for text files (direct decode)
        # Step 2: Preprocess - clean_content is the TRUE SOURCE
        preprocessed = self.preprocess(file_data)
        file_data = preprocessed.clean_content  # TRUE SOURCE

        # The text preprocessor passes str input through unchanged; such
        # content has no .decode(), so clean it directly instead of failing
        # once per encoding.
        if isinstance(file_data, str):
            return clean_code_text(file_data) if is_code else clean_text(file_data)

        for e in enc:
            try:
                text = file_data.decode(e)
                self.logger.info("Successfully decoded %s with %s encoding", file_path, e)
                return clean_code_text(text) if is_code else clean_text(text)
            except UnicodeDecodeError:
                self.logger.debug("Failed to decode %s with %s, trying next...", file_path, e)
                continue
            except Exception as ex:
                # Best effort: log and keep trying the remaining encodings.
                self.logger.error("Error decoding file %s with %s: %s", file_path, e, ex)
                continue

        raise Exception(f"Could not decode file {file_path} with any supported encoding")
95
+
@@ -0,0 +1,17 @@
1
+ # xgen_doc2chunk/core/processor/text_helper/__init__.py
2
+ """
3
+ Text Helper module
4
+
5
+ Provides the utilities needed for processing text files.
6
+
7
+ Module contents:
8
+ - text_image_processor: image processor for text files
9
+ """
10
+
11
+ from xgen_doc2chunk.core.processor.text_helper.text_image_processor import (
12
+ TextImageProcessor,
13
+ )
14
+
15
+ __all__ = [
16
+ "TextImageProcessor",
17
+ ]
@@ -0,0 +1,28 @@
1
+ # xgen_doc2chunk/core/processor/text_helper/text_file_converter.py
2
+ """
3
+ TextFileConverter - Text file format converter
4
+
5
+ Converts binary text data to string with encoding detection.
6
+ """
7
+ from typing import Optional, BinaryIO
8
+
9
+ from xgen_doc2chunk.core.functions.file_converter import TextFileConverter as BaseTextFileConverter
10
+
11
+
12
class TextFileConverter(BaseTextFileConverter):
    """
    Converter for plain text files.

    Thin subclass of the base TextFileConverter that supplies a fixed list
    of candidate encodings and a format name.
    """

    def __init__(self):
        """Set up the base converter with the common text encodings."""
        candidate_encodings = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1', 'ascii']
        super().__init__(encodings=candidate_encodings)

    def get_format_name(self) -> str:
        """Return the format name, including the detected encoding if any."""
        detected = self._detected_encoding
        if not detected:
            detected = 'unknown'
        return f"Text File ({detected})"
28
+
@@ -0,0 +1,75 @@
1
+ # xgen_doc2chunk/core/processor/text_helper/text_image_processor.py
2
+ """
3
+ Text Image Processor
4
+
5
+ Provides text-specific image processing that inherits from ImageProcessor.
6
+ Text files do not contain embedded images, so this is a minimal implementation.
7
+ """
8
+ import logging
9
+ from typing import Any, Optional
10
+
11
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
12
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
13
+
14
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.text")
15
+
16
+
17
class TextImageProcessor(ImageProcessor):
    """
    Image processor used by the text handler.

    Text files do not contain embedded images, so this subclass adds no
    behavior of its own: construction and image processing are both
    forwarded to ImageProcessor. It exists so that every handler exposes
    the same image-processor interface.

    Example:
        processor = TextImageProcessor()

        # Delegates to the base implementation
        tag = processor.process_image(image_data)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize TextImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        base_options = {
            "directory_path": directory_path,
            "tag_prefix": tag_prefix,
            "tag_suffix": tag_suffix,
            "storage_backend": storage_backend,
        }
        super().__init__(**base_options)

    def process_image(
        self,
        image_data: bytes,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save image data via the base implementation.

        Args:
            image_data: Raw image binary data
            **kwargs: Additional options forwarded unchanged

        Returns:
            Image tag string or None if processing failed
        """
        image_tag = super().process_image(image_data, **kwargs)
        return image_tag
@@ -0,0 +1,82 @@
1
+ # xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py
2
+ """
3
+ Text Preprocessor - Process text content after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. TextFileConverter.convert() → str
7
+ 2. TextPreprocessor.preprocess() → PreprocessedData (THIS STEP)
8
+ 3. TextMetadataExtractor.extract() → DocumentMetadata (if any)
9
+ 4. Content extraction
10
+
11
+ Current Implementation:
12
+ - Pass-through (Text uses decoded string content directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.text.preprocessor")
23
+
24
+
25
class TextPreprocessor(BasePreprocessor):
    """
    Preprocessor for text content.

    Pass-through: the input is returned unchanged as both raw and clean
    content; only character and line counts are computed as metadata.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted text content in a PreprocessedData object.

        Args:
            converted_data: Text content as str or bytes
            **kwargs: Additional options; "encoding" (default "utf-8") is
                used to decode bytes for the metadata counts

        Returns:
            PreprocessedData whose raw_content and clean_content are both
            the unmodified input
        """
        encoding = kwargs.get("encoding", "utf-8")

        # Obtain a str view of the input purely for counting purposes.
        if isinstance(converted_data, bytes):
            countable = converted_data.decode(encoding, errors='replace')
        elif isinstance(converted_data, str):
            countable = converted_data
        else:
            countable = None

        metadata: Dict[str, Any] = {}
        if countable is not None:
            metadata['char_count'] = len(countable)
            metadata['line_count'] = countable.count('\n') + 1

        logger.debug("Text preprocessor: pass-through, metadata=%s", metadata)

        # clean_content is the TRUE SOURCE - the input itself (bytes or str)
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding=encoding,
            extracted_resources={},
            metadata=metadata,
        )

    def get_format_name(self) -> str:
        """Return format name."""
        return "Text Preprocessor"

    def validate(self, data: Any) -> bool:
        """Report whether data is str or bytes."""
        is_text_like = isinstance(data, (str, bytes))
        return is_text_like
80
+
81
+
82
+ __all__ = ['TextPreprocessor']