xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Helper Module
|
|
4
|
+
|
|
5
|
+
Provides RTF parsing and extraction utilities with proper interface separation.
|
|
6
|
+
|
|
7
|
+
Architecture:
|
|
8
|
+
- RTFPreprocessor: Binary preprocessing (image extraction, \\bin handling)
|
|
9
|
+
- RTFFileConverter: Pass through (RTF uses raw binary)
|
|
10
|
+
- RTFMetadataExtractor: Metadata extraction
|
|
11
|
+
- Table extraction: extract_tables_with_positions()
|
|
12
|
+
- Content extraction: extract_inline_content(), extract_text_only()
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
from xgen_doc2chunk.core.processor.rtf_helper import (
|
|
16
|
+
RTFFileConverter,
|
|
17
|
+
RTFConvertedData,
|
|
18
|
+
RTFPreprocessor,
|
|
19
|
+
RTFMetadataExtractor,
|
|
20
|
+
RTFSourceInfo,
|
|
21
|
+
extract_tables_with_positions,
|
|
22
|
+
extract_inline_content,
|
|
23
|
+
extract_text_only,
|
|
24
|
+
)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
# Converter
|
|
28
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_file_converter import (
|
|
29
|
+
RTFFileConverter,
|
|
30
|
+
RTFConvertedData,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Preprocessor
|
|
34
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_preprocessor import (
|
|
35
|
+
RTFPreprocessor,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Metadata
|
|
39
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_metadata_extractor import (
|
|
40
|
+
RTFMetadataExtractor,
|
|
41
|
+
RTFSourceInfo,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Table extraction
|
|
45
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_table_extractor import (
|
|
46
|
+
RTFCellInfo,
|
|
47
|
+
RTFTable,
|
|
48
|
+
extract_tables_with_positions,
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Content extraction
|
|
52
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_content_extractor import (
|
|
53
|
+
extract_inline_content,
|
|
54
|
+
extract_text_only,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Decoder utilities
|
|
58
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
|
|
59
|
+
detect_encoding,
|
|
60
|
+
decode_content,
|
|
61
|
+
decode_bytes,
|
|
62
|
+
decode_hex_escapes,
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
# Text cleaning utilities
|
|
66
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
|
|
67
|
+
clean_rtf_text,
|
|
68
|
+
remove_destination_groups,
|
|
69
|
+
remove_shape_groups,
|
|
70
|
+
remove_shape_property_groups,
|
|
71
|
+
remove_shprslt_blocks,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Region finder utilities
|
|
75
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_region_finder import (
|
|
76
|
+
find_excluded_regions,
|
|
77
|
+
is_in_excluded_region,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# Constants
|
|
81
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_constants import (
|
|
82
|
+
SHAPE_PROPERTY_NAMES,
|
|
83
|
+
SKIP_DESTINATIONS,
|
|
84
|
+
EXCLUDE_DESTINATION_KEYWORDS,
|
|
85
|
+
IMAGE_DESTINATIONS,
|
|
86
|
+
CODEPAGE_ENCODING_MAP,
|
|
87
|
+
DEFAULT_ENCODINGS,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
__all__ = [
|
|
92
|
+
# Converter
|
|
93
|
+
'RTFFileConverter',
|
|
94
|
+
'RTFConvertedData',
|
|
95
|
+
# Preprocessor
|
|
96
|
+
'RTFPreprocessor',
|
|
97
|
+
# Metadata
|
|
98
|
+
'RTFMetadataExtractor',
|
|
99
|
+
'RTFSourceInfo',
|
|
100
|
+
# Table
|
|
101
|
+
'RTFCellInfo',
|
|
102
|
+
'RTFTable',
|
|
103
|
+
'extract_tables_with_positions',
|
|
104
|
+
# Content
|
|
105
|
+
'extract_inline_content',
|
|
106
|
+
'extract_text_only',
|
|
107
|
+
# Decoder
|
|
108
|
+
'detect_encoding',
|
|
109
|
+
'decode_content',
|
|
110
|
+
'decode_bytes',
|
|
111
|
+
'decode_hex_escapes',
|
|
112
|
+
# Text cleaner
|
|
113
|
+
'clean_rtf_text',
|
|
114
|
+
'remove_destination_groups',
|
|
115
|
+
'remove_shape_groups',
|
|
116
|
+
'remove_shape_property_groups',
|
|
117
|
+
'remove_shprslt_blocks',
|
|
118
|
+
# Region finder
|
|
119
|
+
'find_excluded_regions',
|
|
120
|
+
'is_in_excluded_region',
|
|
121
|
+
# Constants
|
|
122
|
+
'SHAPE_PROPERTY_NAMES',
|
|
123
|
+
'SKIP_DESTINATIONS',
|
|
124
|
+
'EXCLUDE_DESTINATION_KEYWORDS',
|
|
125
|
+
'IMAGE_DESTINATIONS',
|
|
126
|
+
'CODEPAGE_ENCODING_MAP',
|
|
127
|
+
'DEFAULT_ENCODINGS',
|
|
128
|
+
]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Constants
|
|
4
|
+
|
|
5
|
+
Constants used for RTF parsing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Shape property names (to be removed)
|
|
9
|
+
SHAPE_PROPERTY_NAMES = [
|
|
10
|
+
'shapeType', 'fFlipH', 'fFlipV', 'rotation',
|
|
11
|
+
'posh', 'posrelh', 'posv', 'posrelv',
|
|
12
|
+
'fLayoutInCell', 'fAllowOverlap', 'fBehindDocument',
|
|
13
|
+
'fPseudoInline', 'fLockAnchor', 'fLockPosition',
|
|
14
|
+
'fLockAspectRatio', 'fLockRotation', 'fLockAgainstSelect',
|
|
15
|
+
'fLockCropping', 'fLockVerticies', 'fLockText',
|
|
16
|
+
'fLockAdjustHandles', 'fLockAgainstGrouping',
|
|
17
|
+
'geoLeft', 'geoTop', 'geoRight', 'geoBottom',
|
|
18
|
+
'shapePath', 'pWrapPolygonVertices', 'dxWrapDistLeft',
|
|
19
|
+
'dyWrapDistTop', 'dxWrapDistRight', 'dyWrapDistBottom',
|
|
20
|
+
'fLine', 'fFilled', 'fillType', 'fillColor',
|
|
21
|
+
'fillOpacity', 'fillBackColor', 'fillBackOpacity',
|
|
22
|
+
'lineColor', 'lineOpacity', 'lineWidth', 'lineStyle',
|
|
23
|
+
'lineDashing', 'lineStartArrowhead', 'lineStartArrowWidth',
|
|
24
|
+
'lineStartArrowLength', 'lineEndArrowhead', 'lineEndArrowWidth',
|
|
25
|
+
'lineEndArrowLength', 'shadowType', 'shadowColor',
|
|
26
|
+
'shadowOpacity', 'shadowOffsetX', 'shadowOffsetY',
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
# RTF destination keywords (excluded from extraction)
|
|
30
|
+
EXCLUDE_DESTINATION_KEYWORDS = [
|
|
31
|
+
'fonttbl', 'colortbl', 'stylesheet', 'listtable',
|
|
32
|
+
'listoverridetable', 'revtbl', 'rsidtbl', 'generator',
|
|
33
|
+
'info', 'xmlnstbl', 'mmathPr', 'themedata', 'colorschememapping',
|
|
34
|
+
'datastore', 'latentstyles', 'pgptbl', 'protusertbl',
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
# RTF skip destinations
|
|
38
|
+
SKIP_DESTINATIONS = {
|
|
39
|
+
'fonttbl', 'colortbl', 'stylesheet', 'listtable',
|
|
40
|
+
'listoverridetable', 'revtbl', 'rsidtbl', 'generator',
|
|
41
|
+
'xmlnstbl', 'mmathPr', 'themedata', 'colorschememapping',
|
|
42
|
+
'datastore', 'latentstyles', 'pgptbl', 'protusertbl',
|
|
43
|
+
'bookmarkstart', 'bookmarkend', 'bkmkstart', 'bkmkend',
|
|
44
|
+
'fldinst', 'fldrslt', # field instructions and results
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
# Image-related destinations
|
|
48
|
+
IMAGE_DESTINATIONS = {
|
|
49
|
+
'pict', 'shppict', 'nonshppict', 'blipuid',
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
# Codepage to encoding mapping
|
|
53
|
+
# Maps a Windows/OEM code page number to the name of the Python codec used
# to decode it. Numbers that are missing here are handled by the caller.
CODEPAGE_ENCODING_MAP = {
    437: 'cp437',
    850: 'cp850',
    852: 'cp852',
    855: 'cp855',
    857: 'cp857',
    860: 'cp860',
    861: 'cp861',
    863: 'cp863',
    865: 'cp865',
    866: 'cp866',
    869: 'cp869',
    874: 'cp874',
    932: 'cp932',  # Japanese
    # Code page 936 is GBK, a superset of GB2312; the stricter 'gb2312'
    # codec rejects GBK-only characters.
    936: 'gbk',  # Simplified Chinese
    949: 'cp949',  # Korean
    # Code page 950 is Microsoft's Big5 variant, which Python exposes as
    # 'cp950' (distinct from the plain 'big5' codec).
    950: 'cp950',  # Traditional Chinese
    1250: 'cp1250',  # Central European
    1251: 'cp1251',  # Cyrillic
    1252: 'cp1252',  # Western European
    1253: 'cp1253',  # Greek
    1254: 'cp1254',  # Turkish
    1255: 'cp1255',  # Hebrew
    1256: 'cp1256',  # Arabic
    1257: 'cp1257',  # Baltic
    1258: 'cp1258',  # Vietnamese
    10000: 'mac_roman',
    65001: 'utf-8',
}
|
|
82
|
+
|
|
83
|
+
# Default encodings to try
|
|
84
|
+
DEFAULT_ENCODINGS = ['utf-8', 'cp949', 'euc-kr', 'cp1252', 'latin-1']
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
__all__ = [
|
|
88
|
+
'SHAPE_PROPERTY_NAMES',
|
|
89
|
+
'EXCLUDE_DESTINATION_KEYWORDS',
|
|
90
|
+
'SKIP_DESTINATIONS',
|
|
91
|
+
'IMAGE_DESTINATIONS',
|
|
92
|
+
'CODEPAGE_ENCODING_MAP',
|
|
93
|
+
'DEFAULT_ENCODINGS',
|
|
94
|
+
]
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Content Extractor
|
|
4
|
+
|
|
5
|
+
Extracts inline content (text + tables) from RTF documents.
|
|
6
|
+
"""
|
|
7
|
+
import logging
|
|
8
|
+
import re
|
|
9
|
+
from typing import List, Tuple
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
|
|
12
|
+
decode_hex_escapes,
|
|
13
|
+
)
|
|
14
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
|
|
15
|
+
clean_rtf_text,
|
|
16
|
+
remove_destination_groups,
|
|
17
|
+
remove_shape_groups,
|
|
18
|
+
remove_shape_property_groups,
|
|
19
|
+
)
|
|
20
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_region_finder import (
|
|
21
|
+
find_excluded_regions,
|
|
22
|
+
)
|
|
23
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_table_extractor import (
|
|
24
|
+
RTFTable,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("xgen_doc2chunk.rtf.content")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def extract_inline_content(
    content: str,
    table_regions: List[Tuple[int, int, RTFTable]],
    encoding: str = "cp949"
) -> str:
    """
    Build the document text with each table emitted at its original position.

    The text before the first ``\\pard`` control word is treated as header
    and skipped. The remaining content is walked in order: text between
    tables is cleaned, and each table is rendered through the ``RTFTable``
    object supplied in ``table_regions``.

    Args:
        content: RTF string content
        table_regions: Table region list [(start, end, table), ...] with
            offsets into ``content``
        encoding: Encoding used for hex-escape decoding and text cleaning

    Returns:
        Cleaned text and rendered tables joined by blank lines
    """
    # Header ends where the first \pard starts (0 when there is none).
    first_pard = re.search(r'\\pard\b', content)
    header_end = first_pard.start() if first_pard else 0

    # (start, end) offsets in `content` that must not contribute text.
    # NOTE(review): the walk below presumably expects these to be sorted and
    # non-overlapping -- confirm against find_excluded_regions.
    excluded_regions = find_excluded_regions(content)

    def to_plain_text(raw: str) -> str:
        """Strip destination groups, decode hex escapes, clean the text."""
        without_destinations = remove_destination_groups(raw)
        decoded = decode_hex_escapes(without_destinations, encoding)
        return clean_rtf_text(decoded, encoding)

    def clean_segment(segment: str, start_pos: int) -> str:
        """Clean a segment while respecting excluded regions."""
        if not excluded_regions:
            return to_plain_text(segment)

        seg_len = len(segment)
        kept = []
        cursor = 0

        def keep(raw: str) -> None:
            text = to_plain_text(raw)
            if text.strip():
                kept.append(text)

        for excl_start, excl_end in excluded_regions:
            # Translate absolute offsets into offsets within this segment.
            lo = excl_start - start_pos
            hi = excl_end - start_pos

            # Region lies entirely outside the segment.
            if hi <= 0 or lo >= seg_len:
                continue

            lo = max(0, lo)
            hi = min(seg_len, hi)

            if lo > cursor:
                keep(segment[cursor:lo])

            cursor = hi

        if cursor < seg_len:
            keep(segment[cursor:])

        return ' '.join(kept)

    pieces = []

    def add_text(begin: int, end: int) -> None:
        """Clean content[begin:end] and keep it when it is not blank."""
        text = clean_segment(content[begin:end], begin)
        if text.strip():
            pieces.append(text)

    # No tables - just extract text
    if not table_regions:
        add_text(header_end, len(content))
        return '\n\n'.join(pieces)

    # Drop tables that end inside the header; clamp the rest to start no
    # earlier than the header end.
    visible_tables = [
        (max(start_pos, header_end), end_pos, table)
        for start_pos, end_pos, table in table_regions
        if end_pos > header_end
    ]

    cursor = header_end
    for start_pos, end_pos, table in visible_tables:
        # Text before the table
        if start_pos > cursor:
            add_text(cursor, start_pos)

        # The table itself: HTML for real tables, a text list otherwise.
        if table.is_real_table():
            pieces.append(table.to_html())
        else:
            text_list = table.to_text_list()
            if text_list:
                pieces.append(text_list)

        cursor = end_pos

    # Text after the last table
    if cursor < len(content):
        add_text(cursor, len(content))

    return '\n\n'.join(pieces)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def extract_text_only(content: str, encoding: str = "cp949") -> str:
    """
    Return the text of an RTF document with table rows left out.

    Legacy compatibility function.

    Args:
        content: RTF string content
        encoding: Encoding used for hex-escape decoding and text cleaning

    Returns:
        Extracted text, stripped, with runs of three or more newlines
        collapsed to two
    """
    # Drop everything before the first \pard (fonttbl, colortbl, ...).
    body_start = re.search(r'\\pard\b', content)
    if body_start is not None:
        content = content[body_start.start():]

    # Destination groups first, then shape groups, then shape properties.
    content = remove_destination_groups(content)
    content = remove_shape_groups(content)
    content = remove_shape_property_groups(content)

    # Locate table rows (\trowd ... \row) and merge rows whose gap to the
    # previous region is under 100 characters into one region.
    merged = []
    for row in re.finditer(r'\\trowd.*?\\row', content, re.DOTALL):
        begin, finish = row.start(), row.end()
        if merged and begin - merged[-1][1] < 100:
            merged[-1] = (merged[-1][0], finish)
        else:
            merged.append((begin, finish))

    def to_text(raw: str) -> str:
        return clean_rtf_text(decode_hex_escapes(raw, encoding), encoding)

    # Collect the cleaned text that lies between the table regions.
    pieces = []
    cursor = 0
    for begin, finish in merged:
        if begin > cursor:
            cleaned = to_text(content[cursor:begin])
            if cleaned:
                pieces.append(cleaned)
        cursor = finish

    if cursor < len(content):
        cleaned = to_text(content[cursor:])
        if cleaned:
            pieces.append(cleaned)

    joined = '\n'.join(pieces)
    return re.sub(r'\n{3,}', '\n\n', joined).strip()
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
__all__ = [
|
|
209
|
+
'extract_inline_content',
|
|
210
|
+
'extract_text_only',
|
|
211
|
+
]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py
|
|
2
|
+
"""
|
|
3
|
+
RTF Decoding Utilities
|
|
4
|
+
|
|
5
|
+
Encoding detection and decoding functions for RTF content.
|
|
6
|
+
"""
|
|
7
|
+
import logging
|
|
8
|
+
import re
|
|
9
|
+
from typing import List
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.processor.rtf_helper.rtf_constants import (
|
|
12
|
+
CODEPAGE_ENCODING_MAP,
|
|
13
|
+
DEFAULT_ENCODINGS,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger("xgen_doc2chunk.rtf.decoder")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def detect_encoding(content: bytes, default_encoding: str = "cp949") -> str:
    """
    Pick the text encoding announced in an RTF header.

    Scans the first 1000 bytes for a \\ansicpgNNNN control word and maps the
    code page number through CODEPAGE_ENCODING_MAP. A code page that is not
    in the map yields 'cp1252'.

    Args:
        content: RTF binary data
        default_encoding: Returned when no code page is announced or when
            detection raises

    Returns:
        Encoding name
    """
    try:
        # Non-ASCII bytes are dropped; only the control word matters here.
        header = content[:1000].decode('ascii', errors='ignore')
        found = re.search(r'\\ansicpg(\d+)', header)
        if found is None:
            return default_encoding

        codepage = int(found.group(1))
        encoding = CODEPAGE_ENCODING_MAP.get(codepage, 'cp1252')
        logger.debug(f"RTF encoding detected: {encoding} (codepage {codepage})")
        return encoding
    except Exception as e:
        # Best effort: any failure falls back to the default below.
        logger.debug(f"Encoding detection failed: {e}")

    return default_encoding
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def decode_content(content: bytes, encoding: str = "cp949") -> str:
    """
    Turn RTF binary data into a string.

    The preferred encoding is tried first, followed by every entry of
    DEFAULT_ENCODINGS that differs from it. The first strict decode that
    succeeds is returned.

    Args:
        content: RTF binary data
        encoding: Preferred encoding to try first

    Returns:
        Decoded string. If every candidate fails, the data is decoded as
        cp1252 with undecodable bytes replaced.
    """
    candidates = [encoding]
    candidates.extend(name for name in DEFAULT_ENCODINGS if name != encoding)

    for candidate in candidates:
        try:
            decoded = content.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown codec: move on to the next candidate.
            continue
        else:
            return decoded

    return content.decode('cp1252', errors='replace')
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def decode_bytes(byte_list: List[int], encoding: str = "cp949") -> str:
    """
    Decode a list of byte values to a string.

    The requested encoding is tried first, then cp949. If both fail the
    bytes are decoded as latin-1, which maps every byte to a character.

    Args:
        byte_list: List of byte values (each in range 0-255)
        encoding: Encoding to try first

    Returns:
        Decoded string

    Raises:
        ValueError: If a value in ``byte_list`` is outside 0-255.
    """
    raw = bytes(byte_list)

    for candidate in (encoding, 'cp949'):
        try:
            return raw.decode(candidate)
        except (UnicodeDecodeError, LookupError):
            # Only decode/lookup failures trigger the fallback; anything
            # else (e.g. KeyboardInterrupt) must propagate.
            continue

    return raw.decode('latin-1', errors='replace')
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def decode_hex_escapes(text: str, encoding: str = "cp949") -> str:
    """
    Decode RTF hex escape sequences (\\'XX).

    Consecutive escapes are collected into one byte run and decoded
    together, so multi-byte characters split over several escapes are
    decoded as a unit. A ``\\'`` that is not followed by exactly two
    hexadecimal digits is left in the output unchanged.

    Args:
        text: RTF text with hex escapes
        encoding: Encoding for decoding the collected bytes

    Returns:
        Decoded text
    """
    if "\\'" not in text:
        return text

    # int(x, 16) also accepts signs and surrounding whitespace ("+1", " 1",
    # "-a"), so the two characters are checked explicitly. A negative value
    # would otherwise make bytes() raise ValueError later on.
    hex_digits = "0123456789abcdefABCDEF"

    out = []
    pending = []
    i = 0
    n = len(text)

    while i < n:
        if (
            i + 3 < n
            and text[i] == "\\"
            and text[i + 1] == "'"
            and text[i + 2] in hex_digits
            and text[i + 3] in hex_digits
        ):
            pending.append(int(text[i + 2:i + 4], 16))
            i += 4
            continue

        # An ordinary character ends the current byte run.
        if pending:
            out.append(decode_bytes(pending, encoding))
            pending = []

        out.append(text[i])
        i += 1

    # Flush a byte run that reaches the end of the text.
    if pending:
        out.append(decode_bytes(pending, encoding))

    return ''.join(out)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
__all__ = [
|
|
137
|
+
'detect_encoding',
|
|
138
|
+
'decode_content',
|
|
139
|
+
'decode_bytes',
|
|
140
|
+
'decode_hex_escapes',
|
|
141
|
+
]
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py
|
|
2
|
+
"""
|
|
3
|
+
RTF File Converter
|
|
4
|
+
|
|
5
|
+
RTF uses raw binary directly, so converter just passes through.
|
|
6
|
+
All actual processing is done by Preprocessor in Handler.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any, BinaryIO, List, Optional
|
|
11
|
+
|
|
12
|
+
from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger("xgen_doc2chunk.rtf.converter")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class RTFConvertedData:
    """
    Container for the result of converting an RTF document.

    Attributes:
        content: RTF content string (after preprocessing)
        encoding: Detected encoding
        image_tags: List of image tags from preprocessing
        original_size: Original binary data size
        has_images: Whether images were extracted; forced to True whenever
            image_tags is non-empty
    """
    content: str
    encoding: str = "cp949"
    image_tags: List[str] = field(default_factory=list)
    original_size: int = 0
    has_images: bool = False

    def __post_init__(self):
        """Turn has_images on when image tags are present."""
        if not self.image_tags:
            # No tags: keep whatever has_images the caller passed.
            return
        self.has_images = True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class RTFFileConverter(BaseFileConverter):
    """
    Pass-through converter for RTF files.

    RTF is processed from its raw bytes, so this class performs no
    conversion; convert() hands the input back unchanged.
    """

    def __init__(self):
        """Attach the module logger to the instance."""
        # NOTE(review): BaseFileConverter.__init__ is not invoked here --
        # confirm the base class needs no initialisation.
        self.logger = logger

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> bytes:
        """
        Return the RTF bytes untouched.

        Args:
            file_data: Raw binary RTF data
            file_stream: Optional file stream (ignored)
            **kwargs: Ignored

        Returns:
            The same object that was passed as ``file_data``
        """
        return file_data

    def get_format_name(self) -> str:
        """Return the human-readable format name."""
        return "RTF Document"

    def close(self, converted_object: Any) -> None:
        """Do nothing; convert() opens no resources."""
        return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
__all__ = [
|
|
85
|
+
'RTFFileConverter',
|
|
86
|
+
'RTFConvertedData',
|
|
87
|
+
]
|