xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,196 @@
1
+ # xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py
2
+ """
3
+ PPT Image Processor
4
+
5
+ Provides PPT/PPTX-specific image processing that inherits from ImageProcessor.
6
+ Handles slide images, shape images, and embedded pictures.
7
+ """
8
+ import logging
9
+ from typing import Any, Dict, Optional, Set, TYPE_CHECKING
10
+
11
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
12
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
13
+
14
+ if TYPE_CHECKING:
15
+ from pptx import Presentation
16
+ from pptx.slide import Slide
17
+ from pptx.shapes.base import BaseShape
18
+
19
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.ppt")
20
+
21
+
22
class PPTImageProcessor(ImageProcessor):
    """
    PPT/PPTX-specific image processor.

    Specializes ImageProcessor by deriving PPT-flavoured file names
    (slide number / shape id) and delegating persistence to ``save_image``.

    Handles:
    - Picture shapes
    - Embedded images
    - Group shape images (including nested groups)

    Example:
        processor = PPTImageProcessor()

        # Process slide image
        tag = processor.process_image(image_data, slide_num=1)

        # Process from shape
        tag = processor.process_picture_shape(shape)
    """

    # NOTE(review): the methods below log through ``self._logger``, which is
    # not defined in this class - presumably provided by ImageProcessor; confirm.

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize PPTImageProcessor.

        All arguments are forwarded unchanged to ``ImageProcessor.__init__``.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def process_image(
        self,
        image_data: bytes,
        slide_num: Optional[int] = None,
        shape_id: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Save PPT image data under a name derived from slide/shape identifiers.

        Name selection:
        - slide and shape known -> ``ppt_slide{slide_num}_shape{shape_id}``
        - only slide known      -> ``ppt_slide{slide_num}``
        - only shape known      -> ``ppt_shape{shape_id}``
        - neither known         -> ``None`` (naming is left to ``save_image``)

        Args:
            image_data: Raw image binary data
            slide_num: Source slide number (for naming)
            shape_id: Shape ID (for naming)
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            The result of ``save_image`` (image tag string, or None)
        """
        custom_name = None
        if slide_num is not None:
            if shape_id is not None:
                custom_name = f"ppt_slide{slide_num}_shape{shape_id}"
            else:
                custom_name = f"ppt_slide{slide_num}"
        elif shape_id is not None:
            custom_name = f"ppt_shape{shape_id}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_picture_shape(
        self,
        shape: "BaseShape",
        slide_num: Optional[int] = None,
    ) -> Optional[str]:
        """
        Process a python-pptx picture shape.

        Any exception raised while reading the shape or saving the image is
        logged as a warning and converted into a ``None`` result.

        Args:
            shape: Picture shape object
            slide_num: Source slide number

        Returns:
            Image tag string, or None when the shape has no image data or
            processing failed
        """
        try:
            # hasattr() only suppresses AttributeError; other errors raised by
            # the ``image`` property are handled by the except clause below.
            if not hasattr(shape, 'image'):
                return None

            image_data = shape.image.blob
            if not image_data:
                return None

            return self.process_image(
                image_data,
                slide_num=slide_num,
                shape_id=getattr(shape, 'shape_id', None),
            )

        except Exception as e:
            self._logger.warning(f"Failed to process picture shape: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        slide_num: Optional[int] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process an embedded PPT image.

        Uses ``image_name`` as the save name when given; otherwise falls back
        to ``ppt_embed_slide{slide_num}`` if the slide number is known.

        Args:
            image_data: Image binary data
            image_name: Original image filename
            slide_num: Source slide number
            **kwargs: Accepted for interface compatibility; not used here

        Returns:
            The result of ``save_image`` (image tag string, or None)
        """
        custom_name = image_name
        if custom_name is None and slide_num is not None:
            custom_name = f"ppt_embed_slide{slide_num}"

        return self.save_image(image_data, custom_name=custom_name)

    def process_group_shape_images(
        self,
        group_shape: "BaseShape",
        slide_num: Optional[int] = None,
    ) -> list:
        """
        Process all images in a group shape, recursing into nested groups.

        Failures are isolated per child shape: a child that raises is logged
        and skipped, and the remaining children are still processed.

        Args:
            group_shape: Group shape containing other shapes
            slide_num: Source slide number

        Returns:
            List of image tags (empty when nothing could be extracted)
        """
        tags = []

        try:
            if not hasattr(group_shape, 'shapes'):
                return tags

            for shape in group_shape.shapes:
                # Per-child guard: hasattr() lets non-AttributeError exceptions
                # escape (python-pptx is documented to raise ValueError from
                # ``Picture.image`` for linked pictures), and that must not
                # discard the rest of the group.
                try:
                    if hasattr(shape, 'image'):
                        tag = self.process_picture_shape(shape, slide_num)
                        if tag:
                            tags.append(tag)
                    elif hasattr(shape, 'shapes'):
                        # Nested group
                        tags.extend(
                            self.process_group_shape_images(shape, slide_num)
                        )
                except Exception as e:
                    self._logger.warning(
                        f"Failed to process shape in group: {e}"
                    )

        except Exception as e:
            # Failure while accessing or iterating the group itself.
            self._logger.warning(f"Failed to process group shape: {e}")

        return tags
194
+
195
+
196
+ __all__ = ["PPTImageProcessor"]
@@ -0,0 +1,71 @@
1
+ # xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py
2
+ """
3
+ PPT Metadata Extraction Module
4
+
5
+ Provides PPTMetadataExtractor class for extracting metadata from PowerPoint documents.
6
+ Implements BaseMetadataExtractor interface.
7
+ """
8
+ import logging
9
+ from typing import Any, Optional
10
+
11
+ from pptx import Presentation
12
+
13
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
14
+ BaseMetadataExtractor,
15
+ DocumentMetadata,
16
+ )
17
+
18
+ logger = logging.getLogger("document-processor")
19
+
20
+
21
class PPTMetadataExtractor(BaseMetadataExtractor):
    """
    Metadata extractor for PPT/PPTX documents.

    Reads the ``core_properties`` of a python-pptx Presentation and maps
    them onto a DocumentMetadata instance.

    Fields populated:
    - title, subject, author, keywords, comments
    - last_saved_by, create_time, last_saved_time

    Usage:
        extractor = PPTMetadataExtractor()
        metadata = extractor.extract(presentation)
        text = extractor.format(metadata)
    """

    def extract(self, source: Presentation) -> DocumentMetadata:
        """
        Build DocumentMetadata from a presentation's core properties.

        Args:
            source: python-pptx Presentation object

        Returns:
            DocumentMetadata with the extracted values; an empty
            DocumentMetadata if anything raises during extraction
            (the failure is logged as a warning).
        """
        try:
            core = source.core_properties
            normalize = self._get_value

            # Text fields are normalized (empty -> None); timestamps pass through.
            field_values = {
                "title": normalize(core.title),
                "subject": normalize(core.subject),
                "author": normalize(core.author),
                "keywords": normalize(core.keywords),
                "comments": normalize(core.comments),
                "last_saved_by": normalize(core.last_modified_by),
                "create_time": core.created,
                "last_saved_time": core.modified,
            }
            return DocumentMetadata(**field_values)
        except Exception as e:
            self.logger.warning(f"Failed to extract PPT metadata: {e}")
            return DocumentMetadata()

    def _get_value(self, value: Optional[str]) -> Optional[str]:
        """Collapse any falsy value (e.g. empty string) to None."""
        return value or None
67
+
68
+
69
+ __all__ = [
70
+ 'PPTMetadataExtractor',
71
+ ]
@@ -0,0 +1,77 @@
1
+ # xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py
2
+ """
3
+ PPT Preprocessor - Process PPT/PPTX presentation after conversion.
4
+
5
+ Processing Pipeline Position:
6
+ 1. PPTFileConverter.convert() -> pptx.Presentation
7
+ 2. PPTPreprocessor.preprocess() -> PreprocessedData (THIS STEP)
8
+ 3. PPTMetadataExtractor.extract() -> DocumentMetadata
9
+ 4. Content extraction (slides, shapes, images, charts)
10
+
11
+ Current Implementation:
12
+ - Pass-through (PPT uses python-pptx Presentation object directly)
13
+ """
14
+ import logging
15
+ from typing import Any, Dict
16
+
17
+ from xgen_doc2chunk.core.functions.preprocessor import (
18
+ BasePreprocessor,
19
+ PreprocessedData,
20
+ )
21
+
22
+ logger = logging.getLogger("xgen_doc2chunk.ppt.preprocessor")
23
+
24
+
25
class PPTPreprocessor(BasePreprocessor):
    """
    Preprocessor for PPT/PPTX presentations.

    Pass-through: the converted object is returned untouched as both the
    raw and the clean content; only a few descriptive metadata values are
    collected along the way.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Wrap the converted presentation in a PreprocessedData container.

        Args:
            converted_data: pptx.Presentation object from PPTFileConverter
            **kwargs: Additional options (unused)

        Returns:
            PreprocessedData whose ``clean_content`` is the presentation
            itself, with ``slide_count`` / ``slide_width`` / ``slide_height``
            recorded in ``metadata`` when the object exposes them.
        """
        info: Dict[str, Any] = {}

        if hasattr(converted_data, 'slides'):
            info['slide_count'] = len(converted_data.slides)

        # Height is read whenever width is present, without a separate check.
        if hasattr(converted_data, 'slide_width'):
            info.update(
                slide_width=converted_data.slide_width,
                slide_height=converted_data.slide_height,
            )

        logger.debug("PPT preprocessor: pass-through, metadata=%s", info)

        # clean_content carries the Presentation object used downstream.
        result = PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding="utf-8",
            extracted_resources={},
            metadata=info,
        )
        return result

    def get_format_name(self) -> str:
        """Return the display name of this preprocessor."""
        return "PPT Preprocessor"

    def validate(self, data: Any) -> bool:
        """Return True when data exposes both ``slides`` and ``slide_layouts``."""
        required = ('slides', 'slide_layouts')
        return all(hasattr(data, attr) for attr in required)
75
+
76
+
77
+ __all__ = ['PPTPreprocessor']
@@ -0,0 +1,189 @@
1
+ """
2
+ PPT Shape 처리 모듈
3
+
4
+ 포함 함수:
5
+ - get_shape_position(): Shape의 위치 정보 반환
6
+ - is_picture_shape(): Shape이 이미지인지 확인
7
+ - process_image_shape(): 이미지 Shape 처리 및 로컬 저장
8
+ - process_group_shape(): 그룹 Shape 처리
9
+ """
10
+ import logging
11
+ from typing import Any, Dict, List, Optional, Tuple
12
+
13
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
14
+
15
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import ElementType, SlideElement
16
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_bullet import extract_text_with_bullets
17
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_table import is_simple_table, extract_simple_table_as_text, convert_table_to_html
18
+
19
+ logger = logging.getLogger("document-processor")
20
+
21
+
22
def get_shape_position(shape) -> Tuple[int, int, int, int]:
    """
    Return the position and size of a shape.

    A field that is missing or falsy (``None``, ``0``) is reported as ``0``.
    If reading any field raises, the whole result is ``(0, 0, 0, 0)``.

    Args:
        shape: python-pptx Shape object

    Returns:
        (left, top, width, height) tuple (EMU units)
    """
    fields = ('left', 'top', 'width', 'height')
    try:
        return tuple(getattr(shape, name, None) or 0 for name in fields)
    except Exception:
        return (0, 0, 0, 0)
40
+
41
+
42
def is_picture_shape(shape) -> bool:
    """
    Check whether a shape is a picture.

    Two probes are tried in order:
    1. ``shape.shape_type == MSO_SHAPE_TYPE.PICTURE``
    2. the shape exposes a readable ``image`` attribute

    Args:
        shape: python-pptx Shape object

    Returns:
        True if either probe succeeds, False otherwise. This function never
        raises: any error while probing counts as "not a picture".
    """
    # Method 1: check shape_type. Best-effort - a failing import or a
    # shape_type property that raises simply falls through to method 2.
    try:
        from pptx.enum.shapes import MSO_SHAPE_TYPE
        if hasattr(shape, 'shape_type') and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
            return True
    except Exception:
        pass

    # Method 2: probe the image attribute directly. Do NOT guard this with
    # hasattr(): hasattr() evaluates the property itself and only suppresses
    # AttributeError, so any other exception raised by ``image`` (python-pptx
    # is documented to raise ValueError for linked pictures) would escape.
    try:
        _ = shape.image
    except Exception:
        return False
    return True
69
+
70
+
71
def process_image_shape(
    shape,
    processed_images: set,
    image_processor: ImageProcessor
) -> Optional[str]:
    """
    Save the image carried by a picture shape and return its tag.

    Args:
        shape: python-pptx Shape object (picture)
        processed_images: Set tracking already-processed images; passed
            through to ``image_processor.save_image``
        image_processor: ImageProcessor instance used to store the image

    Returns:
        The image tag wrapped in newlines (``"\\n<tag>\\n"``), or None when
        the shape has no image data, saving yields no tag, or an error
        occurs (errors are logged as warnings).
    """
    try:
        blob = shape.image.blob if hasattr(shape, 'image') else None
        if not blob:
            return None

        tag = image_processor.save_image(blob, processed_images=processed_images)
        return f"\n{tag}\n" if tag else None

    except Exception as e:
        logger.warning("Error processing image shape: %s", e)
        return None
108
+
109
+
110
def _convert_group_child(
    shape,
    processed_images: set,
    image_processor: ImageProcessor
) -> List[SlideElement]:
    """Convert one child of a group shape into zero or more SlideElements."""
    position = get_shape_position(shape)
    # Fall back to the Python object id when the shape exposes no shape_id.
    shape_id = shape.shape_id if hasattr(shape, 'shape_id') else id(shape)

    def make(element_type, content) -> SlideElement:
        return SlideElement(
            element_type=element_type,
            content=content,
            position=position,
            shape_id=shape_id
        )

    # Tables. getattr() keeps shapes that lack ``has_table`` from raising.
    if getattr(shape, 'has_table', False):
        table = shape.table
        if is_simple_table(table):
            # Simple tables are emitted as plain text.
            simple_text = extract_simple_table_as_text(table)
            return [make(ElementType.TEXT, simple_text)] if simple_text else []
        # Any other table is emitted as HTML.
        table_html = convert_table_to_html(table)
        return [make(ElementType.TABLE, table_html)] if table_html else []

    # Pictures.
    if is_picture_shape(shape):
        image_tag = process_image_shape(shape, processed_images, image_processor)
        return [make(ElementType.IMAGE, image_tag)] if image_tag else []

    # Text, including bullet/list information.
    if hasattr(shape, "text_frame") and shape.text_frame:
        text_content = extract_text_with_bullets(shape.text_frame)
        return [make(ElementType.TEXT, text_content)] if text_content else []

    # Fallback for shapes that only expose a plain ``text`` attribute.
    if hasattr(shape, "text") and shape.text.strip():
        return [make(ElementType.TEXT, shape.text.strip())]

    # Nested group: checked last so it only applies to shapes that matched
    # none of the branches above (those previously produced no output).
    if hasattr(shape, "shapes"):
        return process_group_shape(shape, processed_images, image_processor)

    return []


def process_group_shape(
    group_shape,
    processed_images: set,
    image_processor: ImageProcessor
) -> List[SlideElement]:
    """
    Process the elements contained in a group shape.

    Each child is converted independently: a child that raises is logged
    and skipped so the remaining children are still processed. Nested
    group shapes are processed recursively and their elements are appended
    in place.

    Args:
        group_shape: python-pptx Group Shape object
        processed_images: Set tracking already-processed images
        image_processor: ImageProcessor instance

    Returns:
        List of SlideElement objects, in child order
    """
    elements: List[SlideElement] = []

    try:
        for shape in group_shape.shapes:
            try:
                elements.extend(
                    _convert_group_child(shape, processed_images, image_processor)
                )
            except Exception as e:
                logger.warning("Error processing shape in group: %s", e)

    except Exception as e:
        # Failure while accessing or iterating the group itself.
        logger.warning(f"Error processing group shape: {e}")

    return elements
@@ -0,0 +1,69 @@
1
+ """
2
+ PPT 슬라이드 처리 모듈
3
+
4
+ 포함 함수:
5
+ - extract_slide_notes(): 슬라이드 노트 추출
6
+ - merge_slide_elements(): 슬라이드 요소들을 병합하여 최종 텍스트 생성
7
+ """
8
+ import logging
9
+ from typing import List, Optional
10
+
11
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import ElementType, SlideElement
12
+
13
+ logger = logging.getLogger("document-processor")
14
+
15
+
16
def extract_slide_notes(slide) -> Optional[str]:
    """
    Extract the speaker-notes text of a slide.

    Args:
        slide: python-pptx Slide object

    Returns:
        The stripped notes text, or None when the slide has no notes, the
        notes are empty/whitespace, or reading them fails.
    """
    try:
        # python-pptx documents that reading ``notes_slide`` creates a notes
        # slide when none exists; ``has_notes_slide`` is the side-effect-free
        # check. Objects without that attribute keep the previous behaviour.
        if not getattr(slide, "has_notes_slide", True):
            return None

        notes_slide = getattr(slide, "notes_slide", None)
        if notes_slide:
            notes_frame = notes_slide.notes_text_frame
            if notes_frame:
                notes_text = notes_frame.text.strip()
                if notes_text:
                    return notes_text
    except Exception as e:
        # Best-effort: notes are optional, so record the failure and move on.
        logging.getLogger("document-processor").debug(
            "Failed to extract slide notes: %s", e
        )
    return None
36
+
37
+
38
def merge_slide_elements(elements: List[SlideElement]) -> str:
    """
    Merge slide elements into the final text of a slide.

    Formatting applied per element type:
    - TABLE: newline added before and after
    - IMAGE: emitted unchanged
    - CHART: newline added before and after
    - TEXT: newline added after

    Elements of any other type contribute nothing.

    Args:
        elements: List of SlideElement objects; they are concatenated in the
            order given

    Returns:
        The merged text ("" when elements is empty)
    """
    if not elements:
        return ""

    def _rendered():
        # Yield the formatted fragment for each recognised element.
        for item in elements:
            kind = item.element_type
            if kind == ElementType.TABLE:
                yield "\n" + item.content + "\n"
            elif kind == ElementType.IMAGE:
                yield item.content
            elif kind == ElementType.CHART:
                yield "\n" + item.content + "\n"
            elif kind == ElementType.TEXT:
                yield item.content + "\n"

    return "".join(_rendered())