xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,598 @@
1
+ """
2
+ Complexity Analyzer for PDF Handler
3
+
4
+ Analyzes page and region complexity to determine processing strategy.
5
+
6
+ Processing Strategy Based on Complexity Score:
7
+ 1. Determine processing strategy based on complexity score
8
+ 2. Complex regions use block image + OCR
9
+ 3. Simple regions use standard text extraction
10
+
11
+ Complexity Criteria:
12
+ - Drawing density (curves, lines, fill count)
13
+ - Image density
14
+ - Text quality (broken text ratio)
15
+ - Layout complexity (multi-column)
16
+ """
17
+
18
+ import logging
19
+ from dataclasses import dataclass, field
20
+ from typing import List, Dict, Optional, Tuple, Set
21
+ from enum import Enum, auto
22
+
23
+ import fitz
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ # ============================================================================
29
+ # Types and Enums
30
+ # ============================================================================
31
+
32
class ComplexityLevel(Enum):
    """Coarse complexity bucket assigned to a page or a page region.

    Members are numbered in increasing order of complexity.
    """

    # Simple text - standard extraction is enough.
    SIMPLE = 1
    # Moderate complexity - extraction plus quality validation.
    MODERATE = 2
    # Complex - rendering the block as an image is recommended.
    COMPLEX = 3
    # Extremely complex - rendering the full page as an image is recommended.
    EXTREME = 4
38
+
39
+
40
class ProcessingStrategy(Enum):
    """How the content of a page or region should be turned into text."""

    # Standard text extraction.
    TEXT_EXTRACTION = 1
    # Text extraction combined with partial OCR.
    HYBRID = 2
    # Render the block as an image and OCR it.
    BLOCK_IMAGE_OCR = 3
    # OCR the full page.
    FULL_PAGE_OCR = 4
46
+
47
+
48
@dataclass
class RegionComplexity:
    """Complexity verdict for one rectangular region of a page.

    Only ``bbox``, ``complexity_level`` and ``complexity_score`` are
    required; every other field has a default.
    """

    # Region rectangle as (x0, y0, x1, y1).
    bbox: Tuple[float, float, float, float]
    # Bucketed complexity of the region.
    complexity_level: ComplexityLevel
    # Numeric complexity, 0.0 ~ 1.0.
    complexity_score: float

    # --- Detail scores ---------------------------------------------------
    drawing_density: float = 0.0
    image_density: float = 0.0
    # 1.0 = perfect, 0.0 = completely broken.
    text_quality: float = 1.0
    layout_complexity: float = 0.0

    # --- Recommended strategy --------------------------------------------
    recommended_strategy: ProcessingStrategy = ProcessingStrategy.TEXT_EXTRACTION

    # --- Detailed information --------------------------------------------
    # Free-form explanations; a fresh list is created per instance.
    reasons: List[str] = field(default_factory=list)
66
+
67
+
68
@dataclass
class PageComplexity:
    """Complexity verdict for a whole page, including its per-region breakdown."""

    # Page index as supplied by the caller.
    page_num: int
    # Page size as (width, height).
    page_size: Tuple[float, float]

    # --- Overall complexity ----------------------------------------------
    overall_complexity: ComplexityLevel
    overall_score: float

    # --- Region-wise complexity ------------------------------------------
    regions: List[RegionComplexity] = field(default_factory=list)

    # Bounding boxes of the regions judged complex.
    complex_regions: List[Tuple[float, float, float, float]] = field(default_factory=list)

    # --- Statistics ------------------------------------------------------
    total_drawings: int = 0
    total_images: int = 0
    total_text_blocks: int = 0
    column_count: int = 1

    # --- Recommended strategy --------------------------------------------
    recommended_strategy: ProcessingStrategy = ProcessingStrategy.TEXT_EXTRACTION
92
+
93
+
94
+ # ============================================================================
95
+ # Configuration
96
+ # ============================================================================
97
+
98
@dataclass
class ComplexityConfig:
    """Complexity analysis configuration.

    Every setting is an annotated dataclass field, so a caller can override
    individual thresholds through the constructor, e.g.
    ``ComplexityConfig(COMPLEXITY_EXTREME=0.8)``. The defaults are also
    readable as class attributes (``ComplexityConfig.REGION_GRID_SIZE``).
    """

    # Drawing density threshold (per 1000pt² area)
    DRAWING_DENSITY_MODERATE: float = 0.5
    DRAWING_DENSITY_COMPLEX: float = 2.0
    DRAWING_DENSITY_EXTREME: float = 5.0

    # Image density threshold
    IMAGE_DENSITY_MODERATE: float = 0.1
    IMAGE_DENSITY_COMPLEX: float = 0.3
    IMAGE_DENSITY_EXTREME: float = 0.5

    # Text quality threshold
    TEXT_QUALITY_POOR: float = 0.7
    TEXT_QUALITY_BAD: float = 0.5

    # Layout complexity (multi-column)
    # Raised threshold - multi-column does not automatically mean EXTREME
    COLUMN_COUNT_MODERATE: int = 3  # 3+ columns = MODERATE
    COLUMN_COUNT_COMPLEX: int = 5   # 5+ columns = COMPLEX (newspaper-level)
    COLUMN_COUNT_EXTREME: int = 7   # 7+ columns = EXTREME (very complex newspaper)

    # Overall complexity threshold
    # Raised EXTREME threshold - avoid going to FULL_PAGE_OCR too easily
    COMPLEXITY_MODERATE: float = 0.35
    COMPLEXITY_COMPLEX: float = 0.65
    COMPLEXITY_EXTREME: float = 0.90  # Raised from 0.8 to 0.90

    # Region division settings
    REGION_GRID_SIZE: int = 200  # pt - analysis grid size
    MIN_REGION_SIZE: int = 100   # pt - minimum region size

    # Table quality analysis
    ANALYZE_TABLE_QUALITY: bool = True     # Enable table quality analysis
    TABLE_QUALITY_THRESHOLD: float = 0.65  # Attempt table extraction if above this
134
+
135
+
136
+ # ============================================================================
137
+ # Complexity Analyzer
138
+ # ============================================================================
139
+
140
class ComplexityAnalyzer:
    """
    Page complexity analyzer.

    Scores one page on four axes (drawing density, image density, text
    quality, layout) and maps the result to a ProcessingStrategy. The raw
    drawings, text dictionary and image list are fetched from the page at
    most once per analyzer instance.
    """

    def __init__(self, page, page_num: int, config: Optional[ComplexityConfig] = None):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
            config: Analysis configuration; a default ComplexityConfig is
                used when omitted
        """
        self.page = page
        self.page_num = page_num
        self.config = config or ComplexityConfig()

        self.page_width = page.rect.width
        self.page_height = page.rect.height
        self.page_area = self.page_width * self.page_height

        # Lazily filled caches (see the _get_* helpers).
        self._drawings = None
        self._text_dict = None
        self._images = None

    def analyze(self) -> PageComplexity:
        """
        Analyzes page complexity.

        Returns:
            PageComplexity object holding the overall score and level, the
            per-region breakdown, basic statistics and the recommended
            processing strategy.
        """
        # Collect base data
        drawings = self._get_drawings()
        text_dict = self._get_text_dict()
        images = self._get_images()

        # Only blocks with type 0 are treated as text blocks.
        text_blocks = [b for b in text_dict.get("blocks", []) if b.get("type") == 0]

        # Per-axis analysis
        column_count = self._analyze_columns(text_blocks)
        drawing_complexity = self._analyze_drawing_complexity(drawings)
        image_complexity = self._analyze_image_complexity(images)
        text_quality = self._analyze_text_quality(text_blocks)
        layout_complexity = self._analyze_layout_complexity(column_count, text_blocks)

        # Overall score and level
        overall_score = self._calculate_overall_score(
            drawing_complexity, image_complexity, text_quality, layout_complexity
        )
        overall_complexity = self._determine_complexity_level(overall_score)

        # Region-wise analysis and the subset judged complex
        regions = self._analyze_regions(drawings, text_blocks, images)
        complex_regions = [
            r.bbox for r in regions
            if r.complexity_level in (ComplexityLevel.COMPLEX, ComplexityLevel.EXTREME)
        ]

        recommended_strategy = self._determine_strategy(
            overall_complexity, overall_score, text_quality, complex_regions
        )

        result = PageComplexity(
            page_num=self.page_num,
            page_size=(self.page_width, self.page_height),
            overall_complexity=overall_complexity,
            overall_score=overall_score,
            regions=regions,
            complex_regions=complex_regions,
            total_drawings=len(drawings),
            total_images=len(images),
            total_text_blocks=len(text_blocks),
            column_count=column_count,
            recommended_strategy=recommended_strategy
        )

        logger.debug(f"[ComplexityAnalyzer] Page {self.page_num + 1}: "
                     f"complexity={overall_complexity.name}, score={overall_score:.2f}, "
                     f"strategy={recommended_strategy.name}, "
                     f"complex_regions={len(complex_regions)}")

        return result

    def _get_drawings(self) -> List[Dict]:
        """Return ``page.get_drawings()``, fetched once and cached."""
        if self._drawings is None:
            self._drawings = self.page.get_drawings()
        return self._drawings

    def _get_text_dict(self) -> Dict:
        """Return ``page.get_text("dict", sort=True)``, fetched once and cached."""
        if self._text_dict is None:
            self._text_dict = self.page.get_text("dict", sort=True)
        return self._text_dict

    def _get_images(self) -> List:
        """Return ``page.get_images()``, fetched once and cached."""
        if self._images is None:
            self._images = self.page.get_images()
        return self._images

    def _analyze_columns(self, text_blocks: List[Dict]) -> int:
        """Estimate the column count from the left edges of the text blocks.

        Left edges (``bbox[0]``) are sorted; a new column starts whenever the
        gap to the previous edge is 50pt or more.

        Args:
            text_blocks: Text block dicts; a block without ``bbox`` counts
                as starting at x = 0.

        Returns:
            Number of columns, at least 1.
        """
        if not text_blocks:
            return 1

        same_column_gap = 50  # Within 50pt means same column

        left_edges = sorted(
            block.get("bbox", (0, 0, 0, 0))[0] for block in text_blocks
        )

        column_count = 1
        previous_x = left_edges[0]
        for x in left_edges[1:]:
            if x - previous_x >= same_column_gap:
                column_count += 1
            previous_x = x

        return column_count

    @staticmethod
    def _score_from_density(density: float, moderate: float, complex_: float, extreme: float) -> float:
        """Map a density onto the 1.0 / 0.7 / 0.4 ladder shared by drawings and images.

        Below ``moderate`` the score grows linearly from 0.0 up to 0.4.
        """
        if density >= extreme:
            return 1.0
        if density >= complex_:
            return 0.7
        if density >= moderate:
            return 0.4
        return density / moderate * 0.4

    def _analyze_drawing_complexity(self, drawings: List[Dict]) -> float:
        """Analyze drawing complexity (0.0 ~ 1.0).

        The base score comes from the number of drawing items per 1000pt² of
        page area; up to 0.2 is added for the share of curve items and up to
        0.1 for the share of filled drawings.
        """
        if not drawings:
            return 0.0

        total_items = 0
        curve_count = 0
        fill_count = 0

        for d in drawings:
            items = d.get("items", [])
            total_items += len(items)
            curve_count += sum(1 for item in items if item[0] == 'c')  # 'c' = curve
            if d.get("fill"):
                fill_count += 1

        # Density per 1000pt²; a zero-area page contributes no density.
        density = total_items / (self.page_area / 1000) if self.page_area > 0 else 0

        # Curve ratio (chart/graph indicator)
        curve_ratio = curve_count / max(1, total_items)

        # Fill ratio (color complexity)
        fill_ratio = fill_count / max(1, len(drawings))

        score = self._score_from_density(
            density,
            self.config.DRAWING_DENSITY_MODERATE,
            self.config.DRAWING_DENSITY_COMPLEX,
            self.config.DRAWING_DENSITY_EXTREME,
        )
        score += curve_ratio * 0.2
        score += fill_ratio * 0.1

        return min(1.0, score)

    def _analyze_image_complexity(self, images: List) -> float:
        """Analyze image complexity (0.0 ~ 1.0).

        The score is derived from the number of images per 100x100pt of page
        area. A page with no images, or with zero area, scores 0.0 (the
        zero-area guard matches ``_analyze_drawing_complexity`` and avoids a
        division by zero).
        """
        if not images or self.page_area <= 0:
            return 0.0

        density = len(images) / (self.page_area / 10000)  # Per 100x100pt

        return self._score_from_density(
            density,
            self.config.IMAGE_DENSITY_MODERATE,
            self.config.IMAGE_DENSITY_COMPLEX,
            self.config.IMAGE_DENSITY_EXTREME,
        )

    @staticmethod
    def _is_broken_char(char: str) -> bool:
        """Return True for characters treated as evidence of broken extraction.

        Covers the Private Use Area (U+E000-U+F8FF) and the Control Pictures
        block (U+2400-U+243F). Enclosed Alphanumerics such as circled digits
        (U+2460 onwards) are ordinary text and are not flagged.
        """
        code = ord(char)
        return 0xE000 <= code <= 0xF8FF or 0x2400 <= code <= 0x243F

    def _analyze_text_quality(self, text_blocks: List[Dict]) -> float:
        """Analyze text quality (0.0 = poor, 1.0 = good).

        Returns one minus the share of broken characters across all spans of
        the given blocks. No blocks, or no characters at all, yields 1.0.
        """
        if not text_blocks:
            return 1.0

        total_chars = 0
        bad_chars = 0

        for block in text_blocks:
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text = span.get("text", "")
                    total_chars += len(text)
                    bad_chars += sum(1 for char in text if self._is_broken_char(char))

        if total_chars == 0:
            return 1.0

        return 1.0 - (bad_chars / total_chars)

    def _analyze_layout_complexity(self, column_count: int, text_blocks: List[Dict]) -> float:
        """Analyze layout complexity (0.0 ~ 1.0).

        Does not automatically assign high score for multi-column layouts.
        TEXT_EXTRACTION may be more efficient when tables can be processed.
        """
        score = 0.0

        # Column count based - relaxed threshold.
        # getattr keeps config objects without COLUMN_COUNT_EXTREME working.
        if column_count >= getattr(self.config, 'COLUMN_COUNT_EXTREME', 7):
            # 7+ columns = very complex newspaper layout
            score = 0.95
            logger.info(f"[ComplexityAnalyzer] Page {self.page_num + 1}: "
                        f"Detected very complex layout ({column_count} columns) → HIGH")
        elif column_count >= self.config.COLUMN_COUNT_COMPLEX:
            # 5-6 columns = newspaper-level layout, but may be table-processable
            score = 0.75
            logger.info(f"[ComplexityAnalyzer] Page {self.page_num + 1}: "
                        f"Detected multi-column layout ({column_count} columns) → COMPLEX")
        elif column_count >= self.config.COLUMN_COUNT_MODERATE:
            # 3-4 columns = moderate complexity
            score = 0.5
        elif column_count >= 2:
            # 2 columns = low complexity
            score = 0.3

        # Text block distribution: many blocks sharing a 10pt-high Y band is
        # additional evidence of a multi-column layout.
        if text_blocks:
            y_bands = {int(b.get("bbox", (0, 0, 0, 0))[1] / 10) for b in text_blocks}

            if len(y_bands) < len(text_blocks) * 0.5 and len(text_blocks) > 5:
                score = max(score, 0.6)

        return min(1.0, score)

    def _calculate_overall_score(
        self,
        drawing: float,
        image: float,
        text_quality: float,
        layout: float
    ) -> float:
        """Calculate overall complexity score.

        Does not determine EXTREME based on layout complexity alone.
        TEXT_EXTRACTION is more efficient when tables can be processed.

        Args:
            drawing: Drawing complexity, 0.0 ~ 1.0
            image: Image complexity, 0.0 ~ 1.0
            text_quality: Text quality, 1.0 = good
            layout: Layout complexity, 0.0 ~ 1.0

        Returns:
            Score in 0.0 ~ 1.0.
        """
        # Extremely complex layout (7+ columns) gets a high score, but it must
        # stay strictly below the EXTREME threshold: the level test is ">=",
        # so returning exactly the threshold would make layout alone EXTREME.
        if layout >= 0.95:
            return max(0.0, min(0.9, self.config.COMPLEXITY_EXTREME - 0.01))

        # Standard weighted calculation
        # Layout weight reduced (0.35 → 0.25)
        w_drawing = 0.30
        w_image = 0.20
        w_text = 0.25
        w_layout = 0.25

        # Text quality is inverse (lower = more complex)
        text_complexity = 1.0 - text_quality

        score = (
            drawing * w_drawing +
            image * w_image +
            text_complexity * w_text +
            layout * w_layout
        )

        return min(1.0, score)

    def _determine_complexity_level(self, score: float) -> ComplexityLevel:
        """Map a score to a ComplexityLevel using the configured thresholds."""
        if score >= self.config.COMPLEXITY_EXTREME:
            return ComplexityLevel.EXTREME
        if score >= self.config.COMPLEXITY_COMPLEX:
            return ComplexityLevel.COMPLEX
        if score >= self.config.COMPLEXITY_MODERATE:
            return ComplexityLevel.MODERATE
        return ComplexityLevel.SIMPLE

    def _analyze_regions(
        self,
        drawings: List[Dict],
        text_blocks: List[Dict],
        images: List
    ) -> List[RegionComplexity]:
        """Analyze complexity by region.

        The page is tiled with square cells of ``REGION_GRID_SIZE`` (cells on
        the right/bottom edge are clipped to the page). Each cell is scored
        from the drawings overlapping it and the quality of the text blocks
        overlapping it.

        Args:
            drawings: Drawing dicts; entries without a truthy ``rect`` are ignored
            text_blocks: Text block dicts
            images: Accepted for signature compatibility; not used here

        Returns:
            One RegionComplexity per grid cell, row by row.
        """
        regions = []
        grid_size = self.config.REGION_GRID_SIZE

        # Convert the rects once instead of once per grid cell.
        drawing_rects = [tuple(d["rect"]) for d in drawings if d.get("rect")]

        for y in range(0, int(self.page_height), grid_size):
            for x in range(0, int(self.page_width), grid_size):
                x0, y0 = x, y
                x1 = min(x + grid_size, self.page_width)
                y1 = min(y + grid_size, self.page_height)
                bbox = (x0, y0, x1, y1)

                # Number of drawings in region
                drawing_count = sum(
                    1 for rect in drawing_rects if self._bbox_overlaps(bbox, rect)
                )

                # Text blocks in region
                region_texts = [
                    b for b in text_blocks
                    if self._bbox_overlaps(bbox, b.get("bbox", (0, 0, 0, 0)))
                ]

                # Drawings per 1000pt² of the cell
                area = (x1 - x0) * (y1 - y0)
                drawing_density = drawing_count / (area / 1000) if area > 0 else 0

                text_quality = self._analyze_text_quality(region_texts)

                region_score = min(1.0, drawing_density / 3.0 + (1.0 - text_quality) * 0.5)

                # Determine level
                if region_score >= 0.7:
                    level = ComplexityLevel.COMPLEX
                elif region_score >= 0.4:
                    level = ComplexityLevel.MODERATE
                else:
                    level = ComplexityLevel.SIMPLE

                # Determine strategy
                if level == ComplexityLevel.COMPLEX:
                    strategy = ProcessingStrategy.BLOCK_IMAGE_OCR
                elif text_quality < 0.7:
                    strategy = ProcessingStrategy.HYBRID
                else:
                    strategy = ProcessingStrategy.TEXT_EXTRACTION

                regions.append(RegionComplexity(
                    bbox=bbox,
                    complexity_level=level,
                    complexity_score=region_score,
                    drawing_density=drawing_density,
                    text_quality=text_quality,
                    recommended_strategy=strategy
                ))

        return regions

    def _determine_strategy(
        self,
        complexity: ComplexityLevel,
        score: float,
        text_quality: float,
        complex_regions: List[Tuple]
    ) -> ProcessingStrategy:
        """Determine processing strategy.

        Recommends TEXT_EXTRACTION even for multi-column layouts if table processing is possible.
        Text extraction is more efficient than image conversion when table quality is good.

        Args:
            complexity: Overall complexity level of the page
            score: Overall score (not consulted by the current rules)
            text_quality: Page text quality, 1.0 = good
            complex_regions: Bounding boxes of the regions judged complex

        Returns:
            The recommended ProcessingStrategy.
        """
        # 1. Full page image conversion if text quality is very low
        if text_quality < 0.4:
            logger.info(f"[ComplexityAnalyzer] Page {self.page_num + 1}: "
                        f"Very low text quality ({text_quality:.2f}) → FULL_PAGE_OCR")
            return ProcessingStrategy.FULL_PAGE_OCR

        # 2. Full page image conversion if extremely complex and low text quality
        if complexity == ComplexityLevel.EXTREME and text_quality < 0.6:
            return ProcessingStrategy.FULL_PAGE_OCR

        # 3. Full page image conversion if complex regions are 50%+ and text quality is low
        #    (skipped for a zero-area page to avoid dividing by zero)
        if complex_regions and self.page_area > 0:
            complex_area = sum(
                (r[2] - r[0]) * (r[3] - r[1]) for r in complex_regions
            )
            if complex_area / self.page_area > 0.5 and text_quality < 0.7:
                return ProcessingStrategy.FULL_PAGE_OCR

        # 4./5. HYBRID for anything above SIMPLE. EXTREME is included so an
        # extremely complex page never gets a lighter strategy than a COMPLEX one.
        if complexity in (
            ComplexityLevel.MODERATE,
            ComplexityLevel.COMPLEX,
            ComplexityLevel.EXTREME,
        ):
            return ProcessingStrategy.HYBRID  # HYBRID instead of FULL_PAGE_OCR

        # 6. Text extraction for simple
        return ProcessingStrategy.TEXT_EXTRACTION

    def _bbox_overlaps(self, bbox1: Tuple, bbox2: Tuple) -> bool:
        """Check if two (x0, y0, x1, y1) bboxes overlap; touching edges do not count."""
        return not (
            bbox1[2] <= bbox2[0] or  # bbox1 is left of bbox2
            bbox1[0] >= bbox2[2] or  # bbox1 is right of bbox2
            bbox1[3] <= bbox2[1] or  # bbox1 is above bbox2
            bbox1[1] >= bbox2[3]     # bbox1 is below bbox2
        )
585
+
586
+
587
+ # ============================================================================
588
+ # Export
589
+ # ============================================================================
590
+
591
# Public API of this module: the enums, the result/config dataclasses and
# the analyzer itself.
__all__ = [
    "ComplexityLevel",
    "ProcessingStrategy",
    "RegionComplexity",
    "PageComplexity",
    "ComplexityConfig",
    "ComplexityAnalyzer",
]
@@ -0,0 +1,46 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py
2
+ """
3
+ PDF Element Merger Module
4
+
5
+ Provides functions for merging and sorting page elements.
6
+ """
7
+ import logging
8
+ from typing import List
9
+
10
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
11
+ ElementType,
12
+ PageElement,
13
+ )
14
+
15
+ logger = logging.getLogger("document-processor")
16
+
17
+
18
def merge_page_elements(elements: List[PageElement]) -> str:
    """
    Join the text of page elements in top-to-bottom, left-to-right order.

    Elements are ordered by the top edge of their bbox, then by the left
    edge. Each element's content is stripped; elements that are empty after
    stripping are dropped. Table content is wrapped in an extra newline on
    each side before everything is joined with newlines.

    Args:
        elements: List of PageElement

    Returns:
        Merged text content, or an empty string when there is nothing to merge
    """
    if not elements:
        return ""

    def reading_order(element):
        # bbox[1] is the top edge, bbox[0] the left edge.
        return element.bbox[1], element.bbox[0]

    pieces = []
    for element in sorted(elements, key=reading_order):
        stripped = element.content.strip()
        if stripped:
            is_table = element.element_type == ElementType.TABLE
            pieces.append(f"\n{stripped}\n" if is_table else stripped)

    return "\n".join(pieces)
46
+