xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,278 @@
1
+ """
2
+ PDF Handler Types and Configuration
3
+
4
+ Defines all data classes and configuration values used by the PDF engine.
5
+ """
6
+
7
+ from dataclasses import dataclass, field
8
+ from enum import Enum, auto
9
+ from typing import Optional, List, Dict, Tuple, Any
10
+
11
+
12
+ # ============================================================================
13
+ # Enums
14
+ # ============================================================================
15
+
16
class LineThickness(Enum):
    """Stroke-width category assigned to a detected line."""

    # Values are written out explicitly; they equal what auto() would assign.
    THIN = 1    # Inner table rules (0.3-0.5pt)
    NORMAL = 2  # Ordinary borders (0.5-1.5pt)
    THICK = 3   # Emphasis / header separator lines (1.5pt+)
21
+
22
+
23
class TableDetectionStrategy(Enum):
    """Identifies which detection approach produced a table candidate."""

    # Values are written out explicitly; they equal what auto() would assign.
    PYMUPDF_NATIVE = 1        # Built-in table detection from PyMuPDF
    PDFPLUMBER_LINES = 2      # Line-based detection via pdfplumber
    HYBRID_ANALYSIS = 3       # Hybrid approach driven by line analysis
    BORDERLESS_HEURISTIC = 4  # Heuristic for tables without borders
29
+
30
+
31
class ElementType(Enum):
    """Kind of content a page element carries.

    Each member's value is the lowercase string form of its name.
    """

    TEXT = "text"
    TABLE = "table"
    IMAGE = "image"
    ANNOTATION = "annotation"
37
+
38
+
39
+ # ============================================================================
40
+ # Configuration Constants
41
+ # ============================================================================
42
+
43
class PDFConfig:
    """Tunable constants shared by the PDF engine.

    All values are plain class attributes; lengths are in points (pt)
    unless a comment says the value is a ratio or a count.
    """

    # --- Line thickness classification (pt) ---
    THIN_LINE_MAX = 0.5
    NORMAL_LINE_MAX = 1.5

    # --- Table detection ---
    MIN_TABLE_ROWS = 2
    MIN_TABLE_COLS = 2
    TABLE_MERGE_TOLERANCE = 5.0  # Tolerance used when merging tables (pt)

    # --- Double-line merging ---
    DOUBLE_LINE_TOLERANCE = 3.0  # Distance used to detect double lines (pt)

    # --- Cell analysis ---
    CELL_PADDING = 2.0
    MIN_CELL_WIDTH = 10.0
    MIN_CELL_HEIGHT = 8.0

    # --- Text extraction ---
    TEXT_BLOCK_TOLERANCE = 3.0

    # --- Confidence ---
    CONFIDENCE_THRESHOLD = 0.5

    # --- Page border detection ---
    BORDER_MARGIN = 30.0          # Largest allowed distance from the page edge
    BORDER_LENGTH_RATIO = 0.8     # Smallest border length, as a ratio of page size
    PAGE_BORDER_MARGIN = 0.1      # Page border margin, as a ratio of page size
    PAGE_SPANNING_RATIO = 0.85    # Ratio at which a line counts as spanning the page

    # --- Graphic region detection ---
    GRAPHIC_CURVE_RATIO_THRESHOLD = 0.3   # Threshold on the curve ratio
    GRAPHIC_MIN_CURVE_COUNT = 10          # Lowest curve count considered
    GRAPHIC_FILL_RATIO_THRESHOLD = 0.2    # Threshold on the fill ratio
    GRAPHIC_COLOR_VARIETY_THRESHOLD = 3   # Threshold on color variety

    # --- Table quality validation ---
    TABLE_MIN_FILLED_CELL_RATIO = 0.15  # Lowest acceptable filled-cell ratio
    TABLE_MAX_EMPTY_ROW_RATIO = 0.7     # Highest acceptable empty-row ratio
    TABLE_MIN_MEANINGFUL_CELLS = 2      # Lowest acceptable meaningful-cell count
    TABLE_MIN_VALID_ROWS = 2            # Lowest acceptable valid-row count
    TABLE_MIN_TEXT_DENSITY = 0.005      # Lowest acceptable text density

    # --- Cell text length ---
    TABLE_MAX_CELL_TEXT_LENGTH = 300   # Longest text allowed in one cell
    TABLE_EXTREME_CELL_LENGTH = 800    # Length at which a cell is extremely long
    TABLE_MAX_LONG_CELLS_RATIO = 0.4   # Highest acceptable ratio of long cells

    # --- Annotation detection ---
    ANNOTATION_Y_MARGIN = 30.0  # Search range below a table for annotations (pt)
    ANNOTATION_PATTERNS = ['주)', '주 )', '※', '*', '†', '‡', '¹', '²', '³']
96
+
97
+
98
+ # ============================================================================
99
+ # Data Classes - Basic Types
100
+ # ============================================================================
101
+
102
@dataclass
class LineInfo:
    """A straight line segment described by its two endpoints.

    Attributes:
        x0, y0: Coordinates of the first endpoint.
        x1, y1: Coordinates of the second endpoint.
        thickness: Stroke width; defaults to 1.0.
        thickness_class: Thickness category; defaults to NORMAL.
        is_horizontal: Orientation flag; defaults to False.
        is_vertical: Orientation flag; defaults to False.
    """
    x0: float
    y0: float
    x1: float
    y1: float
    thickness: float = 1.0
    thickness_class: LineThickness = LineThickness.NORMAL
    is_horizontal: bool = False
    is_vertical: bool = False

    @property
    def length(self) -> float:
        """Return the Euclidean distance between the two endpoints."""
        # Imported locally because the module does not import math at the top.
        import math
        # hypot replaces the hand-written sqrt(dx**2 + dy**2) and does not
        # overflow on the intermediate squares.
        return math.hypot(self.x1 - self.x0, self.y1 - self.y0)

    @property
    def midpoint(self) -> Tuple[float, float]:
        """Return the (x, y) point halfway between the two endpoints."""
        return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)
124
+
125
+
126
@dataclass
class GridInfo:
    """Grid described by its horizontal and vertical line positions.

    Attributes:
        h_lines: Y coordinates of the horizontal lines.
        v_lines: X coordinates of the vertical lines.
        cells: Cells belonging to the grid.
        bbox: Bounding box as a 4-tuple; defaults to all zeros.
        is_complete: Whether the border is complete.
        reconstructed: Whether the border was reconstructed.
    """
    h_lines: List[float] = field(default_factory=list)
    v_lines: List[float] = field(default_factory=list)
    cells: List['CellInfo'] = field(default_factory=list)
    bbox: Tuple[float, float, float, float] = (0, 0, 0, 0)
    is_complete: bool = False
    reconstructed: bool = False

    @property
    def row_count(self) -> int:
        """Number of bands between consecutive horizontal lines (never negative)."""
        # With no lines at all the result is 0 rather than -1.
        return max(len(self.h_lines), 1) - 1

    @property
    def col_count(self) -> int:
        """Number of bands between consecutive vertical lines (never negative)."""
        return max(len(self.v_lines), 1) - 1
145
+
146
+
147
@dataclass
class CellInfo:
    """A single table cell.

    Attributes:
        row: Row index of the cell.
        col: Column index of the cell.
        bbox: Bounding box as a 4-tuple of floats.
        text: Cell text; empty by default.
        rowspan: Number of rows spanned; defaults to 1.
        colspan: Number of columns spanned; defaults to 1.
        is_header: Header flag; defaults to False.
        alignment: Alignment label; defaults to "left".
    """
    row: int
    col: int
    bbox: Tuple[float, float, float, float]
    text: str = ""
    rowspan: int = 1
    colspan: int = 1
    is_header: bool = False
    alignment: str = "left"
158
+
159
+
160
@dataclass
class AnnotationInfo:
    """An annotation attached to a region.

    Attributes:
        type: Annotation type label.
        bbox: Bounding box as a 4-tuple of floats.
        content: Annotation text; empty by default.
        color: Optional 3-tuple of floats; None when not set.
    """
    type: str
    bbox: Tuple[float, float, float, float]
    content: str = ""
    color: Optional[Tuple[float, float, float]] = None
167
+
168
+
169
+ # ============================================================================
170
+ # Data Classes - Vector Text OCR
171
+ # ============================================================================
172
+
173
@dataclass
class VectorTextRegion:
    """Region holding vector (outlined / path) text.

    Attributes:
        bbox: Bounding box as a 4-tuple of floats.
        drawing_count: Number of drawings contained in the region.
        curve_count: Number of curves ("c" items).
        fill_count: Number of filled paths.
        ocr_text: OCR result; empty by default.
        confidence: Confidence score; defaults to 0.0.
        is_vector_text: Whether the region is vector text; defaults to False.
    """
    bbox: Tuple[float, float, float, float]
    drawing_count: int
    curve_count: int
    fill_count: int
    ocr_text: str = ""
    confidence: float = 0.0
    is_vector_text: bool = False
185
+
186
+
187
+ # ============================================================================
188
+ # Data Classes - Graphic Region
189
+ # ============================================================================
190
+
191
@dataclass
class GraphicRegionInfo:
    """Region holding graphics such as charts, diagrams or icons.

    Attributes:
        bbox: Bounding box as a 4-tuple of floats.
        curve_count: Number of curves.
        line_count: Number of straight lines.
        rect_count: Number of rectangles.
        fill_count: Number of filled shapes.
        color_count: Number of colors used.
        is_graphic: Whether the region is a graphic region.
        confidence: Confidence score.
        reason: Explanation of how the determination was made.
    """
    bbox: Tuple[float, float, float, float]
    curve_count: int = 0
    line_count: int = 0
    rect_count: int = 0
    fill_count: int = 0
    color_count: int = 0
    is_graphic: bool = False
    confidence: float = 0.0
    reason: str = ""
205
+
206
+
207
+ # ============================================================================
208
+ # Data Classes - Table Detection
209
+ # ============================================================================
210
+
211
@dataclass
class TableCandidate:
    """A possible table found by one detection strategy.

    Attributes:
        strategy: Strategy that produced the candidate.
        confidence: Confidence score for the candidate.
        bbox: Bounding box as a 4-tuple of floats.
        grid: Optional grid information.
        cells: Cells of the candidate.
        data: Row-major cell values; a value may be None.
        raw_table: The original table object, if any.
    """
    strategy: TableDetectionStrategy
    confidence: float
    bbox: Tuple[float, float, float, float]
    grid: Optional[GridInfo] = None
    cells: List['CellInfo'] = field(default_factory=list)
    data: List[List[Optional[str]]] = field(default_factory=list)
    raw_table: Any = None

    @property
    def row_count(self) -> int:
        """Number of rows in ``data``."""
        return len(self.data)

    @property
    def col_count(self) -> int:
        """Length of the longest row in ``data``, or 0 when ``data`` is empty."""
        row_lengths = (len(row) for row in self.data)
        return max(row_lengths, default=0)
231
+
232
+
233
@dataclass
class PageElement:
    """One element placed on a page.

    Attributes:
        element_type: Kind of element.
        content: Element content as a string.
        bbox: Bounding box as a 4-tuple of floats.
        page_num: Page number the element belongs to.
        table_data: Optional row-major table values.
        cells_info: Optional list of per-cell dictionaries.
        annotations: Optional list of annotations.
        detection_strategy: Optional table detection strategy.
        confidence: Confidence score; defaults to 1.0.
    """
    element_type: ElementType
    content: str
    bbox: Tuple[float, float, float, float]
    page_num: int
    table_data: Optional[List[List]] = None
    cells_info: Optional[List[Dict]] = None
    annotations: Optional[List[AnnotationInfo]] = None
    detection_strategy: Optional[TableDetectionStrategy] = None
    confidence: float = 1.0
245
+
246
+
247
@dataclass
class PageBorderInfo:
    """Result of page border detection.

    Attributes:
        has_border: Whether a border is present; defaults to False.
        border_bbox: Optional bounding box of the border.
        border_lines: Per-side flags keyed by 'top', 'bottom', 'left' and
            'right'; each instance gets its own dict with every flag False.
    """
    has_border: bool = False
    border_bbox: Optional[Tuple[float, float, float, float]] = None
    border_lines: Dict[str, bool] = field(
        default_factory=lambda: dict.fromkeys(('top', 'bottom', 'left', 'right'), False)
    )
255
+
256
+
257
+ # ============================================================================
258
+ # Export
259
+ # ============================================================================
260
+
261
# Public names of this module, in declaration order.
__all__ = [
    # --- enumerations ---
    "LineThickness",
    "TableDetectionStrategy",
    "ElementType",
    # --- configuration ---
    "PDFConfig",
    # --- data classes ---
    "LineInfo",
    "GridInfo",
    "CellInfo",
    "AnnotationInfo",
    "VectorTextRegion",
    "GraphicRegionInfo",
    "TableCandidate",
    "PageElement",
    "PageBorderInfo",
]
@@ -0,0 +1,288 @@
1
+ # xgen_doc2chunk/core/processor/ppt_handler.py
2
+ """
3
+ PPT Handler - PPT/PPTX Document Processor
4
+
5
+ Class-based handler for PPT/PPTX files inheriting from BaseHandler.
6
+ """
7
+ import logging
8
+ from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
9
+
10
+ from xgen_doc2chunk.core.processor.base_handler import BaseHandler
11
+ from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
12
+ from xgen_doc2chunk.core.processor.ppt_helper import (
13
+ ElementType,
14
+ SlideElement,
15
+ extract_text_with_bullets,
16
+ is_simple_table,
17
+ extract_simple_table_as_text,
18
+ convert_table_to_html,
19
+ extract_table_as_text,
20
+ get_shape_position,
21
+ is_picture_shape,
22
+ process_image_shape,
23
+ process_group_shape,
24
+ extract_slide_notes,
25
+ merge_slide_elements,
26
+ )
27
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_chart_extractor import PPTChartExtractor
28
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_metadata import PPTMetadataExtractor
29
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_image_processor import PPTImageProcessor
30
+
31
+ if TYPE_CHECKING:
32
+ from xgen_doc2chunk.core.document_processor import CurrentFile
33
+ from xgen_doc2chunk.core.functions.chart_extractor import ChartData
34
+
35
+ logger = logging.getLogger("document-processor")
36
+
37
+
38
class PPTHandler(BaseHandler):
    """Handler that turns a PPT/PPTX file into tagged text.

    The main path (`_extract_ppt_enhanced`) walks every slide and shape and
    emits text, tables, images and charts in sorted order. If that path
    raises, `_extract_ppt_simple` is used as a plain-text fallback.
    """

    def _create_file_converter(self):
        """Create PPT-specific file converter."""
        from xgen_doc2chunk.core.processor.ppt_helper.ppt_file_converter import PPTFileConverter
        return PPTFileConverter()

    def _create_preprocessor(self):
        """Create PPT-specific preprocessor."""
        from xgen_doc2chunk.core.processor.ppt_helper.ppt_preprocessor import PPTPreprocessor
        return PPTPreprocessor()

    def _create_chart_extractor(self) -> BaseChartExtractor:
        """Create PPT-specific chart extractor."""
        return PPTChartExtractor(self._chart_processor)

    def _create_metadata_extractor(self):
        """Create PPT-specific metadata extractor."""
        return PPTMetadataExtractor()

    def _create_format_image_processor(self):
        """Create PPT-specific image processor.

        The directory path, tag prefix/suffix and storage backend are copied
        from the handler's existing image processor.
        """
        return PPTImageProcessor(
            directory_path=self._image_processor.config.directory_path,
            tag_prefix=self._image_processor.config.tag_prefix,
            tag_suffix=self._image_processor.config.tag_suffix,
            storage_backend=self._image_processor.storage_backend,
        )

    def extract_text(
        self,
        current_file: "CurrentFile",
        extract_metadata: bool = True,
        **kwargs
    ) -> str:
        """
        Extract text from PPT/PPTX file.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options (accepted but not used here)

        Returns:
            Extracted text
        """
        file_path = current_file.get("file_path", "unknown")
        self.logger.info(f"PPT processing: {file_path}")
        return self._extract_ppt_enhanced(current_file, extract_metadata)

    def _extract_ppt_enhanced(self, current_file: "CurrentFile", extract_metadata: bool = True) -> str:
        """Extract slide content with tables, images and pre-extracted charts.

        Args:
            current_file: CurrentFile dict containing file info and binary data
            extract_metadata: Whether to prepend formatted metadata

        Returns:
            The concatenated text of all slides. If anything in this path
            raises, the result of `_extract_ppt_simple` is returned instead.
        """
        file_path = current_file.get("file_path", "unknown")
        self.logger.info(f"Enhanced PPT processing: {file_path}")

        try:
            # Step 1: Convert to Presentation using file_converter
            file_data = current_file.get("file_data", b"")
            file_stream = self.get_file_stream(current_file)
            prs = self.file_converter.convert(file_data, file_stream)

            # Step 2: Preprocess - may transform prs in the future
            preprocessed = self.preprocess(prs)
            prs = preprocessed.clean_content  # TRUE SOURCE

            result_parts = []
            processed_images: Set[str] = set()
            total_tables = 0
            total_images = 0
            total_charts = 0

            # Pre-extract all charts using ChartExtractor; the stream is
            # rewound first because it was already handed to the converter.
            file_stream.seek(0)
            chart_data_list = self.chart_extractor.extract_all_from_file(file_stream)
            chart_idx = 0  # Index of the next unused pre-extracted chart

            def get_next_chart() -> str:
                """Return the next pre-extracted chart as text, or "" if none remain."""
                nonlocal chart_idx
                if chart_idx < len(chart_data_list):
                    chart_data = chart_data_list[chart_idx]
                    chart_idx += 1
                    return self._format_chart_data(chart_data)
                return ""

            if extract_metadata:
                metadata_text = self.extract_and_format_metadata(prs)
                if metadata_text:
                    result_parts.append(metadata_text)
                    result_parts.append("")

            for slide_idx, slide in enumerate(prs.slides):
                slide_tag = self.create_slide_tag(slide_idx + 1)
                result_parts.append(f"\n{slide_tag}\n")

                elements: List[SlideElement] = []

                for shape in slide.shapes:
                    try:
                        position = get_shape_position(shape)
                        # Fall back to the object identity when the shape has no id.
                        shape_id = shape.shape_id if hasattr(shape, 'shape_id') else id(shape)

                        if shape.has_table:
                            if is_simple_table(shape.table):
                                # Simple tables are emitted as plain text.
                                simple_text = extract_simple_table_as_text(shape.table)
                                if simple_text:
                                    elements.append(SlideElement(
                                        element_type=ElementType.TEXT,
                                        content=simple_text,
                                        position=position,
                                        shape_id=shape_id
                                    ))
                            else:
                                table_html = convert_table_to_html(shape.table)
                                if table_html:
                                    total_tables += 1
                                    elements.append(SlideElement(
                                        element_type=ElementType.TABLE,
                                        content=table_html,
                                        position=position,
                                        shape_id=shape_id
                                    ))

                        elif is_picture_shape(shape):
                            image_tag = process_image_shape(shape, processed_images, self.format_image_processor)
                            if image_tag:
                                total_images += 1
                                elements.append(SlideElement(
                                    element_type=ElementType.IMAGE,
                                    content=image_tag,
                                    position=position,
                                    shape_id=shape_id
                                ))

                        elif shape.has_chart:
                            # Use pre-extracted chart via callback
                            chart_text = get_next_chart()
                            if chart_text:
                                total_charts += 1
                                elements.append(SlideElement(
                                    element_type=ElementType.CHART,
                                    content=chart_text,
                                    position=position,
                                    shape_id=shape_id
                                ))

                        elif hasattr(shape, "text_frame") and shape.text_frame:
                            text_content = extract_text_with_bullets(shape.text_frame)
                            if text_content:
                                elements.append(SlideElement(
                                    element_type=ElementType.TEXT,
                                    content=text_content,
                                    position=position,
                                    shape_id=shape_id
                                ))

                        elif hasattr(shape, "text") and shape.text.strip():
                            elements.append(SlideElement(
                                element_type=ElementType.TEXT,
                                content=shape.text.strip(),
                                position=position,
                                shape_id=shape_id
                            ))

                        elif hasattr(shape, "shapes"):
                            group_elements = process_group_shape(shape, processed_images, self.format_image_processor)
                            elements.extend(group_elements)

                    except Exception as shape_e:
                        # One bad shape must not abort the whole slide.
                        self.logger.warning(f"Error processing shape in slide {slide_idx + 1}: {shape_e}")
                        continue

                elements.sort(key=lambda e: e.sort_key)
                slide_content = merge_slide_elements(elements)

                if slide_content.strip():
                    result_parts.append(slide_content)
                else:
                    result_parts.append("[Empty Slide]\n")

                notes_text = extract_slide_notes(slide)
                if notes_text:
                    result_parts.append(f"\n[Slide Notes]\n{notes_text}\n")

            result = "".join(result_parts)
            self.logger.info(f"Enhanced PPT: {len(prs.slides)} slides, {total_tables} tables, "
                             f"{total_images} images, {total_charts} charts")

            return result

        except Exception as e:
            self.logger.error(f"Error in enhanced PPT processing: {e}")
            import traceback
            self.logger.debug(traceback.format_exc())
            return self._extract_ppt_simple(current_file)

    def _format_chart_data(self, chart_data: "ChartData") -> str:
        """Format ChartData using ChartProcessor.

        Returns "" when `chart_data` is not a ChartData instance. Otherwise
        returns the full formatting when the chart has data, or the fallback
        formatting (type and title only) when it does not.
        """
        from xgen_doc2chunk.core.functions.chart_extractor import ChartData

        if not isinstance(chart_data, ChartData):
            return ""

        if chart_data.has_data():
            return self.chart_processor.format_chart_data(
                chart_type=chart_data.chart_type,
                title=chart_data.title,
                categories=chart_data.categories,
                series=chart_data.series
            )
        else:
            return self.chart_processor.format_chart_fallback(
                chart_type=chart_data.chart_type,
                title=chart_data.title
            )

    def _extract_ppt_simple(self, current_file: "CurrentFile") -> str:
        """Simple text extraction (fallback).

        Collects the stripped text of every shape, or the text form of its
        table, slide by slide. A shape that raises is skipped and logged at
        debug level. If the whole extraction fails, a bracketed failure
        message is returned instead of raising.
        """
        try:
            file_data = current_file.get("file_data", b"")
            file_stream = self.get_file_stream(current_file)
            prs = self.file_converter.convert(file_data, file_stream)
            result_parts = []

            for slide_idx, slide in enumerate(prs.slides):
                slide_tag = self.create_slide_tag(slide_idx + 1)
                result_parts.append(f"\n{slide_tag}\n")

                slide_texts = []
                for shape in slide.shapes:
                    try:
                        if hasattr(shape, "text") and shape.text.strip():
                            slide_texts.append(shape.text.strip())
                        elif hasattr(shape, "table"):
                            table_text = extract_table_as_text(shape.table)
                            if table_text:
                                slide_texts.append(table_text)
                    except Exception as shape_e:
                        # Best effort: skip the shape, but do not swallow
                        # KeyboardInterrupt/SystemExit as a bare except would.
                        self.logger.debug(f"Skipping shape in slide {slide_idx + 1}: {shape_e}")
                        continue

                if slide_texts:
                    result_parts.append("\n".join(slide_texts) + "\n")
                else:
                    result_parts.append("[Empty Slide]\n")

            return "".join(result_parts)

        except Exception as e:
            self.logger.error(f"Error in simple PPT extraction: {e}")
            return f"[PPT file processing failed: {str(e)}]"
288
+
@@ -0,0 +1,96 @@
1
+ """
2
+ PPT Helper 모듈
3
+
4
+ PPT/PPTX 문서 처리를 위한 헬퍼 함수 모음.
5
+
6
+ 모듈 구성:
7
+ - ppt_constants: 상수, 매핑 테이블, 타입 정의
8
+ - ppt_metadata: 메타데이터 추출/포맷팅
9
+ - ppt_bullet: 목록(Bullet/Numbering) 처리
10
+ - ppt_table: 테이블 처리 (HTML 변환, 병합)
11
+ - ppt_chart_extractor: 차트 데이터 추출 (ChartExtractor)
12
+ - ppt_shape: Shape 처리 (위치, 이미지, 그룹)
13
+ - ppt_slide: 슬라이드 처리 (노트, 요소 병합)
14
+ """
15
+
16
+ # === Constants ===
17
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import (
18
+ WINGDINGS_MAPPING,
19
+ WINGDINGS_CHAR_MAPPING,
20
+ SYMBOL_MAPPING,
21
+ ElementType,
22
+ SlideElement,
23
+ )
24
+
25
+ # === Metadata ===
26
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_metadata import (
27
+ PPTMetadataExtractor,
28
+ )
29
+
30
+ # === Bullet/Numbering ===
31
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_bullet import (
32
+ extract_text_with_bullets,
33
+ extract_bullet_info,
34
+ convert_special_font_char,
35
+ )
36
+
37
+ # === Table ===
38
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_table import (
39
+ is_simple_table,
40
+ extract_simple_table_as_text,
41
+ convert_table_to_html,
42
+ extract_table_as_text,
43
+ debug_table_structure,
44
+ )
45
+
46
+ # === Chart Extractor ===
47
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_chart_extractor import (
48
+ PPTChartExtractor,
49
+ )
50
+
51
+ # === Shape ===
52
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_shape import (
53
+ get_shape_position,
54
+ is_picture_shape,
55
+ process_image_shape,
56
+ process_group_shape,
57
+ )
58
+
59
+ # === Slide ===
60
+ from xgen_doc2chunk.core.processor.ppt_helper.ppt_slide import (
61
+ extract_slide_notes,
62
+ merge_slide_elements,
63
+ )
64
+
65
+
66
# Public names of the package. Every entry must be a name imported above,
# otherwise `from ...ppt_helper import *` raises AttributeError.
__all__ = [
    # Constants
    "WINGDINGS_MAPPING",
    "WINGDINGS_CHAR_MAPPING",
    "SYMBOL_MAPPING",
    "ElementType",
    "SlideElement",
    # Metadata
    "PPTMetadataExtractor",
    # Bullet
    "extract_text_with_bullets",
    "extract_bullet_info",
    "convert_special_font_char",
    # Table
    "is_simple_table",
    "extract_simple_table_as_text",
    "convert_table_to_html",
    "extract_table_as_text",
    "debug_table_structure",
    # Chart Extractor
    "PPTChartExtractor",
    # Shape
    "get_shape_position",
    "is_picture_shape",
    "process_image_shape",
    "process_group_shape",
    # Slide
    "extract_slide_notes",
    "merge_slide_elements",
]