xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,128 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/__init__.py
2
+ """
3
+ RTF Helper Module
4
+
5
+ Provides RTF parsing and extraction utilities with proper interface separation.
6
+
7
+ Architecture:
8
+ - RTFPreprocessor: Binary preprocessing (image extraction, \\bin handling)
9
+ - RTFFileConverter: Pass through (RTF uses raw binary)
10
+ - RTFMetadataExtractor: Metadata extraction
11
+ - Table extraction: extract_tables_with_positions()
12
+ - Content extraction: extract_inline_content(), extract_text_only()
13
+
14
+ Usage:
15
+ from xgen_doc2chunk.core.processor.rtf_helper import (
16
+ RTFFileConverter,
17
+ RTFConvertedData,
18
+ RTFPreprocessor,
19
+ RTFMetadataExtractor,
20
+ RTFSourceInfo,
21
+ extract_tables_with_positions,
22
+ extract_inline_content,
23
+ extract_text_only,
24
+ )
25
+ """
26
+
27
+ # Converter
28
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_file_converter import (
29
+ RTFFileConverter,
30
+ RTFConvertedData,
31
+ )
32
+
33
+ # Preprocessor
34
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_preprocessor import (
35
+ RTFPreprocessor,
36
+ )
37
+
38
+ # Metadata
39
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_metadata_extractor import (
40
+ RTFMetadataExtractor,
41
+ RTFSourceInfo,
42
+ )
43
+
44
+ # Table extraction
45
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_table_extractor import (
46
+ RTFCellInfo,
47
+ RTFTable,
48
+ extract_tables_with_positions,
49
+ )
50
+
51
+ # Content extraction
52
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_content_extractor import (
53
+ extract_inline_content,
54
+ extract_text_only,
55
+ )
56
+
57
+ # Decoder utilities
58
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
59
+ detect_encoding,
60
+ decode_content,
61
+ decode_bytes,
62
+ decode_hex_escapes,
63
+ )
64
+
65
+ # Text cleaning utilities
66
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
67
+ clean_rtf_text,
68
+ remove_destination_groups,
69
+ remove_shape_groups,
70
+ remove_shape_property_groups,
71
+ remove_shprslt_blocks,
72
+ )
73
+
74
+ # Region finder utilities
75
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_region_finder import (
76
+ find_excluded_regions,
77
+ is_in_excluded_region,
78
+ )
79
+
80
+ # Constants
81
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_constants import (
82
+ SHAPE_PROPERTY_NAMES,
83
+ SKIP_DESTINATIONS,
84
+ EXCLUDE_DESTINATION_KEYWORDS,
85
+ IMAGE_DESTINATIONS,
86
+ CODEPAGE_ENCODING_MAP,
87
+ DEFAULT_ENCODINGS,
88
+ )
89
+
90
+
91
+ __all__ = [
92
+ # Converter
93
+ 'RTFFileConverter',
94
+ 'RTFConvertedData',
95
+ # Preprocessor
96
+ 'RTFPreprocessor',
97
+ # Metadata
98
+ 'RTFMetadataExtractor',
99
+ 'RTFSourceInfo',
100
+ # Table
101
+ 'RTFCellInfo',
102
+ 'RTFTable',
103
+ 'extract_tables_with_positions',
104
+ # Content
105
+ 'extract_inline_content',
106
+ 'extract_text_only',
107
+ # Decoder
108
+ 'detect_encoding',
109
+ 'decode_content',
110
+ 'decode_bytes',
111
+ 'decode_hex_escapes',
112
+ # Text cleaner
113
+ 'clean_rtf_text',
114
+ 'remove_destination_groups',
115
+ 'remove_shape_groups',
116
+ 'remove_shape_property_groups',
117
+ 'remove_shprslt_blocks',
118
+ # Region finder
119
+ 'find_excluded_regions',
120
+ 'is_in_excluded_region',
121
+ # Constants
122
+ 'SHAPE_PROPERTY_NAMES',
123
+ 'SKIP_DESTINATIONS',
124
+ 'EXCLUDE_DESTINATION_KEYWORDS',
125
+ 'IMAGE_DESTINATIONS',
126
+ 'CODEPAGE_ENCODING_MAP',
127
+ 'DEFAULT_ENCODINGS',
128
+ ]
@@ -0,0 +1,94 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py
2
+ """
3
+ RTF Constants
4
+
5
+ Constants used for RTF parsing.
6
+ """
7
+
8
+ # Shape property names (to be removed)
9
+ SHAPE_PROPERTY_NAMES = [
10
+ 'shapeType', 'fFlipH', 'fFlipV', 'rotation',
11
+ 'posh', 'posrelh', 'posv', 'posrelv',
12
+ 'fLayoutInCell', 'fAllowOverlap', 'fBehindDocument',
13
+ 'fPseudoInline', 'fLockAnchor', 'fLockPosition',
14
+ 'fLockAspectRatio', 'fLockRotation', 'fLockAgainstSelect',
15
+ 'fLockCropping', 'fLockVerticies', 'fLockText',
16
+ 'fLockAdjustHandles', 'fLockAgainstGrouping',
17
+ 'geoLeft', 'geoTop', 'geoRight', 'geoBottom',
18
+ 'shapePath', 'pWrapPolygonVertices', 'dxWrapDistLeft',
19
+ 'dyWrapDistTop', 'dxWrapDistRight', 'dyWrapDistBottom',
20
+ 'fLine', 'fFilled', 'fillType', 'fillColor',
21
+ 'fillOpacity', 'fillBackColor', 'fillBackOpacity',
22
+ 'lineColor', 'lineOpacity', 'lineWidth', 'lineStyle',
23
+ 'lineDashing', 'lineStartArrowhead', 'lineStartArrowWidth',
24
+ 'lineStartArrowLength', 'lineEndArrowhead', 'lineEndArrowWidth',
25
+ 'lineEndArrowLength', 'shadowType', 'shadowColor',
26
+ 'shadowOpacity', 'shadowOffsetX', 'shadowOffsetY',
27
+ ]
28
+
29
+ # RTF destination keywords (to be excluded)
30
+ EXCLUDE_DESTINATION_KEYWORDS = [
31
+ 'fonttbl', 'colortbl', 'stylesheet', 'listtable',
32
+ 'listoverridetable', 'revtbl', 'rsidtbl', 'generator',
33
+ 'info', 'xmlnstbl', 'mmathPr', 'themedata', 'colorschememapping',
34
+ 'datastore', 'latentstyles', 'pgptbl', 'protusertbl',
35
+ ]
36
+
37
+ # RTF skip destinations
38
+ SKIP_DESTINATIONS = {
39
+ 'fonttbl', 'colortbl', 'stylesheet', 'listtable',
40
+ 'listoverridetable', 'revtbl', 'rsidtbl', 'generator',
41
+ 'xmlnstbl', 'mmathPr', 'themedata', 'colorschememapping',
42
+ 'datastore', 'latentstyles', 'pgptbl', 'protusertbl',
43
+ 'bookmarkstart', 'bookmarkend', 'bkmkstart', 'bkmkend',
44
+ 'fldinst', 'fldrslt', # field instructions and results
45
+ }
46
+
47
+ # Image-related destinations
48
+ IMAGE_DESTINATIONS = {
49
+ 'pict', 'shppict', 'nonshppict', 'blipuid',
50
+ }
51
+
52
+ # Codepage to encoding mapping
53
# Maps a numeric Windows/DOS code page to the name of the Python codec used
# to decode text stored in that code page.
CODEPAGE_ENCODING_MAP = {
    437: 'cp437',
    850: 'cp850',
    852: 'cp852',
    855: 'cp855',
    857: 'cp857',
    860: 'cp860',
    861: 'cp861',
    863: 'cp863',
    865: 'cp865',
    866: 'cp866',
    869: 'cp869',
    874: 'cp874',
    932: 'cp932',    # Japanese
    # Code page 936 is GBK, a superset of GB2312; the strict 'gb2312' codec
    # rejects GBK-only characters (e.g. traditional forms).
    936: 'gbk',      # Simplified Chinese
    949: 'cp949',    # Korean
    # Code page 950 is Microsoft's Big5 variant, which Python exposes as 'cp950'.
    950: 'cp950',    # Traditional Chinese
    1250: 'cp1250',  # Central European
    1251: 'cp1251',  # Cyrillic
    1252: 'cp1252',  # Western European
    1253: 'cp1253',  # Greek
    1254: 'cp1254',  # Turkish
    1255: 'cp1255',  # Hebrew
    1256: 'cp1256',  # Arabic
    1257: 'cp1257',  # Baltic
    1258: 'cp1258',  # Vietnamese
    10000: 'mac_roman',
    65001: 'utf-8',
}
82
+
83
+ # Default encodings to try
84
+ DEFAULT_ENCODINGS = ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1']
85
+
86
+
87
+ __all__ = [
88
+ 'SHAPE_PROPERTY_NAMES',
89
+ 'EXCLUDE_DESTINATION_KEYWORDS',
90
+ 'SKIP_DESTINATIONS',
91
+ 'IMAGE_DESTINATIONS',
92
+ 'CODEPAGE_ENCODING_MAP',
93
+ 'DEFAULT_ENCODINGS',
94
+ ]
@@ -0,0 +1,211 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py
2
+ """
3
+ RTF Content Extractor
4
+
5
+ Extracts inline content (text + tables) from RTF documents.
6
+ """
7
+ import logging
8
+ import re
9
+ from typing import List, Tuple
10
+
11
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
12
+ decode_hex_escapes,
13
+ )
14
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
15
+ clean_rtf_text,
16
+ remove_destination_groups,
17
+ remove_shape_groups,
18
+ remove_shape_property_groups,
19
+ )
20
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_region_finder import (
21
+ find_excluded_regions,
22
+ )
23
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_table_extractor import (
24
+ RTFTable,
25
+ )
26
+
27
+ logger = logging.getLogger("xgen_doc2chunk.rtf.content")
28
+
29
+
30
def extract_inline_content(
    content: str,
    table_regions: List[Tuple[int, int, RTFTable]],
    encoding: str = "cp949"
) -> str:
    """
    Extract inline content from RTF with tables in original positions.

    Everything before the first ``\\pard`` is skipped as header. The text
    between tables is cleaned (destination groups removed, hex escapes
    decoded, RTF markup cleaned) with excluded regions cut out. Each table
    is emitted as HTML when ``table.is_real_table()`` is true, otherwise as
    the result of ``table.to_text_list()``. The resulting parts are joined
    with blank lines.

    Args:
        content: RTF string content
        table_regions: Table region list [(start, end, table), ...] with
            offsets into ``content``
        encoding: Encoding to use when decoding hex escapes and cleaning

    Returns:
        Content string with tables inline
    """
    # Find header end (before first \pard)
    pard_match = re.search(r'\\pard\b', content)
    header_end = pard_match.start() if pard_match else 0

    # Excluded regions are (start, end) offsets into `content`. They are
    # sorted so the left-to-right sweep in clean_segment sees them in order;
    # `or []` keeps a falsy result behaving as "nothing excluded".
    excluded_regions = sorted(find_excluded_regions(content) or [])

    def clean_text(raw: str) -> str:
        """Run the full cleaning pipeline on one piece of raw RTF."""
        stripped = remove_destination_groups(raw)
        decoded = decode_hex_escapes(stripped, encoding)
        return clean_rtf_text(decoded, encoding)

    def clean_segment(segment: str, start_pos: int) -> str:
        """Clean a segment while skipping the excluded regions inside it.

        ``start_pos`` is the offset of ``segment`` within ``content``; it is
        used to translate the absolute excluded offsets into segment-relative
        ones.
        """
        if not excluded_regions:
            return clean_text(segment)

        kept_parts = []
        seg_pos = 0
        seg_len = len(segment)

        for excl_start, excl_end in excluded_regions:
            rel_start = excl_start - start_pos
            rel_end = excl_end - start_pos

            # Region lies entirely outside this segment
            if rel_end <= 0 or rel_start >= seg_len:
                continue

            rel_start = max(0, rel_start)
            rel_end = min(seg_len, rel_end)

            if rel_start > seg_pos:
                clean = clean_text(segment[seg_pos:rel_start])
                if clean.strip():
                    kept_parts.append(clean)

            # Never move the cursor backwards: a region nested inside (or
            # overlapping) an earlier one must not re-expose text that has
            # already been excluded.
            seg_pos = max(seg_pos, rel_end)

        if seg_pos < seg_len:
            clean = clean_text(segment[seg_pos:])
            if clean.strip():
                kept_parts.append(clean)

        return ' '.join(kept_parts)

    result_parts = []

    # No tables - just extract text
    if not table_regions:
        clean = clean_segment(content[header_end:], header_end)
        if clean.strip():
            result_parts.append(clean)
        return '\n\n'.join(result_parts)

    # Drop tables that end inside the header and clamp the rest to start
    # no earlier than the header end
    adjusted_regions = [
        (max(start_pos, header_end), end_pos, table)
        for start_pos, end_pos, table in table_regions
        if end_pos > header_end
    ]

    # Build content parts
    last_end = header_end

    for start_pos, end_pos, table in adjusted_regions:
        # Text before table
        if start_pos > last_end:
            clean = clean_segment(content[last_end:start_pos], last_end)
            if clean.strip():
                result_parts.append(clean)

        # Table
        if table.is_real_table():
            result_parts.append(table.to_html())
        else:
            text_list = table.to_text_list()
            if text_list:
                result_parts.append(text_list)

        last_end = end_pos

    # Text after last table
    if last_end < len(content):
        clean = clean_segment(content[last_end:], last_end)
        if clean.strip():
            result_parts.append(clean)

    return '\n\n'.join(result_parts)
140
+
141
+
142
def extract_text_only(content: str, encoding: str = "cp949") -> str:
    """
    Return the text of an RTF document with table rows left out.

    Legacy compatibility function.

    Args:
        content: RTF string content
        encoding: Encoding used for hex-escape decoding and text cleaning

    Returns:
        Extracted text, stripped, with runs of three or more newlines
        collapsed to two
    """
    # Drop everything before the first \pard (the header)
    body_start = re.search(r'\\pard\b', content)
    if body_start is not None:
        content = content[body_start.start():]

    # Strip destination groups, shape groups and shape property groups,
    # in that order
    for strip_pass in (
        remove_destination_groups,
        remove_shape_groups,
        remove_shape_property_groups,
    ):
        content = strip_pass(content)

    # Locate table rows (\trowd ... \row); rows that start less than 100
    # characters after the previous one ends are merged into one region
    merged = []
    for row in re.finditer(r'\\trowd.*?\\row', content, re.DOTALL):
        begin, finish = row.span()
        if merged and begin - merged[-1][1] < 100:
            merged[-1] = (merged[-1][0], finish)
        else:
            merged.append((begin, finish))

    def to_text(raw: str) -> str:
        """Decode hex escapes and clean the RTF markup of one slice."""
        return clean_rtf_text(decode_hex_escapes(raw, encoding), encoding)

    # Collect the text lying between (and after) the table regions
    pieces = []
    cursor = 0
    for begin, finish in merged:
        if cursor < begin:
            cleaned = to_text(content[cursor:begin])
            if cleaned:
                pieces.append(cleaned)
        cursor = finish

    if cursor < len(content):
        cleaned = to_text(content[cursor:])
        if cleaned:
            pieces.append(cleaned)

    joined = '\n'.join(pieces)
    return re.sub(r'\n{3,}', '\n\n', joined).strip()
206
+
207
+
208
+ __all__ = [
209
+ 'extract_inline_content',
210
+ 'extract_text_only',
211
+ ]
@@ -0,0 +1,141 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py
2
+ """
3
+ RTF Decoding Utilities
4
+
5
+ Encoding detection and decoding functions for RTF content.
6
+ """
7
+ import logging
8
+ import re
9
+ from typing import List
10
+
11
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_constants import (
12
+ CODEPAGE_ENCODING_MAP,
13
+ DEFAULT_ENCODINGS,
14
+ )
15
+
16
+ logger = logging.getLogger("xgen_doc2chunk.rtf.decoder")
17
+
18
+
19
def detect_encoding(content: bytes, default_encoding: str = "cp949") -> str:
    """
    Work out the text encoding declared by an RTF document.

    Only the first 1000 bytes are inspected for an \\ansicpgXXXX control
    word. A code page missing from CODEPAGE_ENCODING_MAP resolves to
    'cp1252'.

    Args:
        content: RTF binary data
        default_encoding: Returned when no code page is declared or when
            detection raises

    Returns:
        Detected encoding string
    """
    try:
        # Non-ASCII bytes are dropped; the control word itself is ASCII
        header = content[:1000].decode('ascii', errors='ignore')
        found = re.search(r'\\ansicpg(\d+)', header)
        if found is None:
            return default_encoding

        codepage = int(found.group(1))
        encoding = CODEPAGE_ENCODING_MAP.get(codepage, 'cp1252')
        logger.debug(f"RTF encoding detected: {encoding} (codepage {codepage})")
        return encoding
    except Exception as e:
        logger.debug(f"Encoding detection failed: {e}")
        return default_encoding
45
+
46
+
47
def decode_content(content: bytes, encoding: str = "cp949") -> str:
    """
    Turn RTF binary data into a string.

    The preferred encoding is attempted first, followed by every entry of
    DEFAULT_ENCODINGS that differs from it; the first decode that succeeds
    wins. If all of them fail, the data is decoded as cp1252 with
    undecodable bytes replaced.

    Args:
        content: RTF binary data
        encoding: Preferred encoding to try first

    Returns:
        Decoded string
    """
    candidates = [encoding]
    candidates.extend(name for name in DEFAULT_ENCODINGS if name != encoding)

    for candidate in candidates:
        try:
            decoded = content.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown codec - move on to the next candidate
            continue
        return decoded

    return content.decode('cp1252', errors='replace')
69
+
70
+
71
def decode_bytes(byte_list: List[int], encoding: str = "cp949") -> str:
    """
    Decode byte list to string.

    Tries ``encoding`` first, then 'cp949', and finally falls back to
    'latin-1', which accepts every byte value.

    Args:
        byte_list: List of byte values (each must be in range 0-255)
        encoding: Encoding to use

    Returns:
        Decoded string

    Raises:
        ValueError: If ``byte_list`` contains a value outside 0-255.
    """
    # Build the bytes object once instead of once per attempt
    raw = bytes(byte_list)

    for candidate in (encoding, 'cp949'):
        try:
            return raw.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # Only decoding problems are expected here; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed.
            continue

    return raw.decode('latin-1', errors='replace')
89
+
90
+
91
def decode_hex_escapes(text: str, encoding: str = "cp949") -> str:
    """
    Decode RTF hex escape sequences (\\'XX).

    Consecutive escapes are collected into a single byte run and decoded
    together so multi-byte characters are reassembled. Only escapes
    followed by exactly two hexadecimal digits are decoded; anything else
    (for example \\'-1 or an apostrophe followed by a space) is left in the
    text untouched.

    Args:
        text: RTF text with hex escapes
        encoding: Encoding for decoding

    Returns:
        Decoded text
    """
    if "\\'" not in text:
        return text

    def _decode_run(match):
        # "\'c7\'d1" -> "c7d1" -> [0xC7, 0xD1]
        hex_digits = match.group(0).replace("\\'", "")
        return decode_bytes(list(bytes.fromhex(hex_digits)), encoding)

    # The strict [0-9a-fA-F]{2} check matters: int(x, 16) would also accept
    # signs and surrounding whitespace, producing bogus or out-of-range bytes.
    return re.sub(r"(?:\\'[0-9a-fA-F]{2})+", _decode_run, text)
134
+
135
+
136
+ __all__ = [
137
+ 'detect_encoding',
138
+ 'decode_content',
139
+ 'decode_bytes',
140
+ 'decode_hex_escapes',
141
+ ]
@@ -0,0 +1,87 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py
2
+ """
3
+ RTF File Converter
4
+
5
+ RTF uses raw binary directly, so converter just passes through.
6
+ All actual processing is done by Preprocessor in Handler.
7
+ """
8
+ import logging
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, BinaryIO, List, Optional
11
+
12
+ from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
13
+
14
+ logger = logging.getLogger("xgen_doc2chunk.rtf.converter")
15
+
16
+
17
@dataclass
class RTFConvertedData:
    """
    Container for the result of converting an RTF document.

    ``has_images`` is forced to True after construction whenever
    ``image_tags`` is non-empty.
    """
    # RTF content string (after preprocessing)
    content: str
    # Detected encoding
    encoding: str = "cp949"
    # Image tags from preprocessing
    image_tags: List[str] = field(default_factory=list)
    # Original binary data size
    original_size: int = 0
    # Whether images were extracted
    has_images: bool = False

    def __post_init__(self):
        """Flag the data as containing images when any image tag exists."""
        if not self.image_tags:
            return
        self.has_images = True
39
+
40
+
41
class RTFFileConverter(BaseFileConverter):
    """
    Pass-through converter for RTF files.

    The raw binary is handed back unchanged by ``convert``; this class
    performs no parsing, decoding or image handling itself.
    """

    def __init__(self):
        """Attach the module logger to the instance."""
        self.logger = logger

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> bytes:
        """
        Return the RTF binary exactly as received.

        Args:
            file_data: Raw binary RTF data
            file_stream: Optional file stream (ignored)
            **kwargs: Ignored

        Returns:
            The same ``file_data`` object, untouched
        """
        return file_data

    def get_format_name(self) -> str:
        """Name of the format handled by this converter."""
        return "RTF Document"

    def close(self, converted_object: Any) -> None:
        """No resources are held, so there is nothing to release."""
        return None
82
+
83
+
84
+ __all__ = [
85
+ 'RTFFileConverter',
86
+ 'RTFConvertedData',
87
+ ]