xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
PPT Image Processor
|
|
4
|
+
|
|
5
|
+
Provides PPT/PPTX-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles slide images, shape images, and embedded pictures.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Dict, Optional, Set, TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
12
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from pptx import Presentation
|
|
16
|
+
from pptx.slide import Slide
|
|
17
|
+
from pptx.shapes.base import BaseShape
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.ppt")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PPTImageProcessor(ImageProcessor):
    """
    PPT/PPTX-specific image processor.

    Thin layer over ImageProcessor: every method derives a file name from
    the slide number / shape id (when given) and delegates the actual
    persistence to ``save_image`` of the base class.

    Handles:
    - Picture shapes
    - Embedded images
    - Group shape images (nested groups are walked recursively)

    Example:
        processor = PPTImageProcessor()

        # Process slide image
        tag = processor.process_image(image_data, slide_num=1)

        # Process from shape
        tag = processor.process_picture_shape(shape)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize PPTImageProcessor.

        All arguments are forwarded unchanged to ``ImageProcessor.__init__``.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def process_image(
        self,
        image_data: bytes,
        slide_num: Optional[int] = None,
        shape_id: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save PPT image data.

        The custom file name is built from whichever identifiers are given:
        ``ppt_slide{N}_shape{M}``, ``ppt_slide{N}``, ``ppt_shape{M}``, or
        None when neither is supplied.

        Args:
            image_data: Raw image binary data
            slide_num: Source slide number (for naming)
            shape_id: Shape ID (for naming)
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            Whatever ``save_image`` returns (documented upstream as an
            image tag string, or None on failure).
        """
        custom_name = None
        if slide_num is not None:
            if shape_id is not None:
                custom_name = f"ppt_slide{slide_num}_shape{shape_id}"
            else:
                custom_name = f"ppt_slide{slide_num}"
        elif shape_id is not None:
            custom_name = f"ppt_shape{shape_id}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_picture_shape(
        self,
        shape: "BaseShape",
        slide_num: Optional[int] = None,
    ) -> Optional[str]:
        """
        Process python-pptx picture shape.

        Args:
            shape: Picture shape object (anything exposing ``image.blob``)
            slide_num: Source slide number

        Returns:
            Result of ``process_image``, or None when the shape has no
            image, the image has no data, or any error occurs (the error is
            logged as a warning).
        """
        try:
            image = getattr(shape, "image", None)
            if image is None:
                return None

            image_data = image.blob
            if not image_data:
                return None

            return self.process_image(
                image_data,
                slide_num=slide_num,
                shape_id=getattr(shape, "shape_id", None),
            )

        except Exception as e:
            self._logger.warning("Failed to process picture shape: %s", e)
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        slide_num: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded PPT image.

        Uses ``image_name`` as the file name when given; otherwise falls
        back to ``ppt_embed_slide{N}`` if a slide number is known.

        Args:
            image_data: Image binary data
            image_name: Original image filename
            slide_num: Source slide number
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            Whatever ``save_image`` returns (documented upstream as an
            image tag string, or None on failure).
        """
        custom_name = image_name
        if custom_name is None and slide_num is not None:
            custom_name = f"ppt_embed_slide{slide_num}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_group_shape_images(
        self,
        group_shape: "BaseShape",
        slide_num: Optional[int] = None,
    ) -> list:
        """
        Process all images in a group shape.

        Children are handled independently: a child that raises while being
        inspected is logged and skipped, and the remaining children are
        still processed.

        Args:
            group_shape: Group shape containing other shapes
            slide_num: Source slide number

        Returns:
            List of image tags (empty when the shape has no ``shapes``)
        """
        tags = []

        try:
            for shape in getattr(group_shape, "shapes", ()):
                try:
                    # hasattr() only swallows AttributeError; a property that
                    # raises anything else (e.g. ValueError for a picture
                    # without embedded data) must not abort the whole group.
                    if hasattr(shape, "image"):
                        tag = self.process_picture_shape(shape, slide_num)
                        if tag:
                            tags.append(tag)
                    elif hasattr(shape, "shapes"):
                        # Nested group
                        tags.extend(
                            self.process_group_shape_images(shape, slide_num)
                        )
                except Exception as e:
                    self._logger.warning(
                        "Failed to process group shape: %s", e
                    )

        except Exception as e:
            # Failure while obtaining / iterating the children themselves.
            self._logger.warning("Failed to process group shape: %s", e)

        return tags
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
__all__ = ["PPTImageProcessor"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py
|
|
2
|
+
"""
|
|
3
|
+
PPT Metadata Extraction Module
|
|
4
|
+
|
|
5
|
+
Provides PPTMetadataExtractor class for extracting metadata from PowerPoint documents.
|
|
6
|
+
Implements BaseMetadataExtractor interface.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Optional
|
|
10
|
+
|
|
11
|
+
from pptx import Presentation
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
14
|
+
BaseMetadataExtractor,
|
|
15
|
+
DocumentMetadata,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("document-processor")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PPTMetadataExtractor(BaseMetadataExtractor):
    """
    Metadata extractor for PPT/PPTX documents.

    Reads ``core_properties`` of a python-pptx presentation object and maps
    them onto a DocumentMetadata instance.

    Populated fields:
    - title, subject, author, keywords, comments
    - last_saved_by, create_time, last_saved_time

    Usage:
        extractor = PPTMetadataExtractor()
        metadata = extractor.extract(presentation)
    """

    def extract(self, source: Presentation) -> DocumentMetadata:
        """
        Build a DocumentMetadata from the presentation's core properties.

        Args:
            source: python-pptx presentation object

        Returns:
            DocumentMetadata filled from ``source.core_properties``; an
            empty DocumentMetadata if anything fails (a warning is logged).
        """
        try:
            core = source.core_properties
            collected = {
                # Text properties: empty values are normalised to None.
                "title": self._get_value(core.title),
                "subject": self._get_value(core.subject),
                "author": self._get_value(core.author),
                "keywords": self._get_value(core.keywords),
                "comments": self._get_value(core.comments),
                "last_saved_by": self._get_value(core.last_modified_by),
                # Timestamps are passed through untouched.
                "create_time": core.created,
                "last_saved_time": core.modified,
            }
            return DocumentMetadata(**collected)
        except Exception as e:
            self.logger.warning(f"Failed to extract PPT metadata: {e}")
            return DocumentMetadata()

    def _get_value(self, value: Optional[str]) -> Optional[str]:
        """Map any falsy value (None, empty string) to None; keep the rest."""
        if not value:
            return None
        return value
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
__all__ = [
|
|
70
|
+
'PPTMetadataExtractor',
|
|
71
|
+
]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
PPT Preprocessor - Process PPT/PPTX presentation after conversion.
|
|
4
|
+
|
|
5
|
+
Processing Pipeline Position:
|
|
6
|
+
1. PPTFileConverter.convert() → pptx.Presentation
|
|
7
|
+
2. PPTPreprocessor.preprocess() → PreprocessedData (THIS STEP)
|
|
8
|
+
3. PPTMetadataExtractor.extract() → DocumentMetadata
|
|
9
|
+
4. Content extraction (slides, shapes, images, charts)
|
|
10
|
+
|
|
11
|
+
Current Implementation:
|
|
12
|
+
- Pass-through (PPT uses python-pptx Presentation object directly)
|
|
13
|
+
"""
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Any, Dict
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.functions.preprocessor import (
|
|
18
|
+
BasePreprocessor,
|
|
19
|
+
PreprocessedData,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger("xgen_doc2chunk.ppt.preprocessor")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PPTPreprocessor(BasePreprocessor):
    """
    Preprocessor for PPT/PPTX presentations.

    Pass-through: the converted object is returned untouched as both raw
    and clean content; only a few size/count facts are recorded as metadata.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted presentation in a PreprocessedData.

        Args:
            converted_data: Object produced by the converter step
                (expected to be a pptx presentation)
            **kwargs: Additional options (unused here)

        Returns:
            PreprocessedData whose raw_content and clean_content are both
            ``converted_data``, with ``slide_count`` / ``slide_width`` /
            ``slide_height`` in metadata when the object exposes them.
        """
        info: Dict[str, Any] = {}

        if hasattr(converted_data, 'slides'):
            info['slide_count'] = len(converted_data.slides)

        # slide_height is read whenever slide_width is present.
        if hasattr(converted_data, 'slide_width'):
            info.update(
                slide_width=converted_data.slide_width,
                slide_height=converted_data.slide_height,
            )

        logger.debug("PPT preprocessor: pass-through, metadata=%s", info)

        # clean_content carries the presentation object itself.
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding="utf-8",
            extracted_resources={},
            metadata=info,
        )

    def get_format_name(self) -> str:
        """Return the human-readable name of this preprocessor."""
        return "PPT Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data exposes both ``slides`` and ``slide_layouts``."""
        required = ('slides', 'slide_layouts')
        return all(hasattr(data, attr) for attr in required)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
__all__ = ['PPTPreprocessor']
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PPT Shape 처리 모듈
|
|
3
|
+
|
|
4
|
+
포함 함수:
|
|
5
|
+
- get_shape_position(): Shape의 위치 정보 반환
|
|
6
|
+
- is_picture_shape(): Shape이 이미지인지 확인
|
|
7
|
+
- process_image_shape(): 이미지 Shape 처리 및 로컬 저장
|
|
8
|
+
- process_group_shape(): 그룹 Shape 처리
|
|
9
|
+
"""
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
14
|
+
|
|
15
|
+
from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import ElementType, SlideElement
|
|
16
|
+
from xgen_doc2chunk.core.processor.ppt_helper.ppt_bullet import extract_text_with_bullets
|
|
17
|
+
from xgen_doc2chunk.core.processor.ppt_helper.ppt_table import is_simple_table, extract_simple_table_as_text, convert_table_to_html
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("document-processor")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def get_shape_position(shape) -> Tuple[int, int, int, int]:
    """
    Return the position and size of a shape.

    Missing or falsy attributes (None, 0) are reported as 0. Any error
    raised while reading the attributes yields an all-zero tuple.

    Args:
        shape: python-pptx shape object (any object with optional
            ``left`` / ``top`` / ``width`` / ``height`` attributes)

    Returns:
        (left, top, width, height) tuple, in the units the shape reports
        (presumably EMU for python-pptx shapes)
    """
    try:
        left, top, width, height = (
            getattr(shape, attr, 0) or 0
            for attr in ("left", "top", "width", "height")
        )
    except Exception:
        return (0, 0, 0, 0)
    return (left, top, width, height)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def is_picture_shape(shape) -> bool:
    """
    Tell whether a shape is a picture.

    Two checks are tried in order:
    1. ``shape.shape_type`` equals ``MSO_SHAPE_TYPE.PICTURE``.
    2. ``shape.image`` can be read without raising.

    This function never raises: any error during either check counts as
    "not a picture" for that check.

    Args:
        shape: python-pptx shape object

    Returns:
        True if the shape is a picture, False otherwise
    """
    # Check 1: shape_type. The import stays local so that a missing or
    # broken pptx installation only disables this check.
    try:
        from pptx.enum.shapes import MSO_SHAPE_TYPE
        if hasattr(shape, 'shape_type') and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            return True
    except Exception:
        pass

    # Check 2: image attribute. The read happens inside the try because
    # hasattr() only suppresses AttributeError; a property raising e.g.
    # ValueError would otherwise escape this function.
    try:
        getattr(shape, 'image')
    except Exception:
        return False
    return True
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def process_image_shape(
    shape,
    processed_images: set,
    image_processor: ImageProcessor
) -> Optional[str]:
    """
    Save the image carried by a picture shape and return its tag.

    Args:
        shape: python-pptx shape object (picture)
        processed_images: Set forwarded to ``save_image`` as
            ``processed_images`` (presumably hashes of images already
            handled — confirm against ImageProcessor)
        image_processor: ImageProcessor instance used to save the bytes

    Returns:
        The tag returned by ``save_image`` wrapped in newlines, or None
        when the shape has no image, the image is empty, saving yields
        nothing, or an error occurs (logged as a warning).
    """
    try:
        blob = shape.image.blob if hasattr(shape, 'image') else None
        tag = (
            image_processor.save_image(blob, processed_images=processed_images)
            if blob
            else None
        )
        return f"\n{tag}\n" if tag else None
    except Exception as e:
        logger.warning("Error processing image shape: %s", e)
        return None
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _group_child_to_elements(
    shape,
    processed_images: set,
    image_processor: ImageProcessor
) -> List[SlideElement]:
    """Convert one child of a group shape into zero or more SlideElements."""
    position = get_shape_position(shape)
    shape_id = shape.shape_id if hasattr(shape, 'shape_id') else id(shape)

    def _single(element_type, content) -> List[SlideElement]:
        # Empty content produces no element at all.
        if not content:
            return []
        return [SlideElement(
            element_type=element_type,
            content=content,
            position=position,
            shape_id=shape_id
        )]

    if getattr(shape, "has_table", False):
        table = shape.table
        # Tables judged "simple" by is_simple_table are emitted as plain text.
        if is_simple_table(table):
            return _single(ElementType.TEXT, extract_simple_table_as_text(table))
        # Every other table is emitted as HTML.
        return _single(ElementType.TABLE, convert_table_to_html(table))

    if is_picture_shape(shape):
        return _single(
            ElementType.IMAGE,
            process_image_shape(shape, processed_images, image_processor)
        )

    # Text handling via the text frame (keeps bullet/list information).
    if hasattr(shape, "text_frame") and shape.text_frame:
        return _single(ElementType.TEXT, extract_text_with_bullets(shape.text_frame))

    # Fallback for shapes that only expose a plain text attribute.
    if hasattr(shape, "text") and shape.text.strip():
        return _single(ElementType.TEXT, shape.text.strip())

    # Nested group: checked last so every shape the earlier branches already
    # handled keeps its behaviour; only previously-dropped groups are added.
    if hasattr(shape, "shapes"):
        return process_group_shape(shape, processed_images, image_processor)

    return []


def process_group_shape(
    group_shape,
    processed_images: set,
    image_processor: ImageProcessor
) -> List[SlideElement]:
    """
    Process the elements contained in a group shape.

    Each child is converted independently (table, picture, text, or nested
    group). A child that raises is logged and skipped so the remaining
    children are still processed; nested groups are walked recursively.

    Args:
        group_shape: python-pptx group shape object
        processed_images: Set forwarded to the image processor
            (presumably hashes of images already handled)
        image_processor: ImageProcessor instance

    Returns:
        List of SlideElement, in the iteration order of ``group_shape.shapes``
    """
    elements = []

    try:
        for shape in group_shape.shapes:
            try:
                elements.extend(
                    _group_child_to_elements(shape, processed_images, image_processor)
                )
            except Exception as e:
                logger.warning("Error processing group shape: %s", e)

    except Exception as e:
        # Failure while obtaining / iterating the children themselves.
        logger.warning("Error processing group shape: %s", e)

    return elements
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PPT 슬라이드 처리 모듈
|
|
3
|
+
|
|
4
|
+
포함 함수:
|
|
5
|
+
- extract_slide_notes(): 슬라이드 노트 추출
|
|
6
|
+
- merge_slide_elements(): 슬라이드 요소들을 병합하여 최종 텍스트 생성
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import ElementType, SlideElement
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger("document-processor")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def extract_slide_notes(slide) -> Optional[str]:
    """
    Extract the speaker notes of a slide.

    Args:
        slide: python-pptx slide object

    Returns:
        Stripped notes text, or None when the slide has no notes, the
        notes are blank, or an error occurs (logged at debug level).
    """
    try:
        # In python-pptx, reading ``notes_slide`` creates a notes slide when
        # none exists, which would mutate the presentation. Consult
        # ``has_notes_slide`` first; objects without that attribute keep the
        # previous behaviour.
        if not getattr(slide, "has_notes_slide", True):
            return None

        notes_slide = getattr(slide, "notes_slide", None)
        if not notes_slide:
            return None

        notes_frame = notes_slide.notes_text_frame
        if not notes_frame:
            return None

        return notes_frame.text.strip() or None
    except Exception as e:
        # Best-effort: notes are optional, so never propagate.
        logger.debug("Failed to extract slide notes: %s", e)
        return None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def merge_slide_elements(elements: List[SlideElement]) -> str:
    """
    Merge slide elements into the final text.

    Formatting per element type:
    - TABLE: newline added before and after
    - IMAGE: emitted as-is
    - CHART: newline added before and after
    - TEXT: newline added after
    Elements of any other type are skipped.

    Args:
        elements: List of SlideElement, emitted in the order given
            (the caller is expected to have sorted them by position)

    Returns:
        The concatenated text, or "" for an empty input
    """
    if not elements:
        return ""

    newline_wrapped = (ElementType.TABLE, ElementType.CHART)
    pieces = []

    for item in elements:
        kind = item.element_type
        if kind in newline_wrapped:
            pieces.append("\n" + item.content + "\n")
        elif kind == ElementType.IMAGE:
            pieces.append(item.content)
        elif kind == ElementType.TEXT:
            pieces.append(item.content + "\n")

    return "".join(pieces)
|