xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,667 @@
1
+ """
2
+ Block Image Engine for PDF Handler
3
+
4
+ Splits complex regions into semantic block units, renders them as images, and saves locally.
5
+
6
+ =============================================================================
7
+ Core Concepts:
8
+ =============================================================================
9
+ Previous: Upload entire page as single image
10
+ Improved: Split page into **semantic/logical block units** and save each as PNG
11
+
12
+ Benefits:
13
+ 1. LLM can interpret each block **individually**
14
+ 2. Resolution issues resolved (high resolution maintained per block)
15
+ 3. Reading order preserved
16
+ 4. Context separation (ads/articles/tables distinguished)
17
+
18
+ =============================================================================
19
+ Processing Strategies:
20
+ =============================================================================
21
+ 1. SEMANTIC_BLOCKS: Semantic block-based splitting (recommended)
22
+ - Block detection via LayoutBlockDetector
23
+ - Convert each block to individual image
24
+ - Generate [Image:path] tags in reading order
25
+
26
+ 2. GRID_BLOCKS: Grid-based splitting (fallback)
27
+ - Split page into NxM grid
28
+ - Convert each grid cell to individual image
29
+
30
+ 3. FULL_PAGE: Full page imaging (last resort)
31
+ - Maintain existing approach
32
+
33
+ Rendering Settings:
34
+ - Default DPI: 300 (high resolution)
35
+ - Max image size: 4096px
36
+ - Image format: PNG (lossless)
37
+ """
38
+
39
+ import logging
40
+ import io
41
+ import hashlib
42
+ from dataclasses import dataclass, field
43
+ from typing import List, Dict, Optional, Tuple
44
+ from enum import Enum, auto
45
+
46
+ import fitz
47
+ from PIL import Image
48
+
49
+ # Image processing module
50
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
51
+
52
+ logger = logging.getLogger(__name__)
53
+
54
+
55
+ # ============================================================================
56
+ # Block Strategy Enum
57
+ # ============================================================================
58
+
59
class BlockStrategy(Enum):
    """Strategy used to turn a PDF page into one or more block images.

    The members carry the integers 1, 2 and 3, which is exactly what
    ``auto()`` assigns for this declaration order.
    """

    # Split the page into semantic block units.
    SEMANTIC_BLOCKS = 1
    # Split the page into a regular grid.
    GRID_BLOCKS = 2
    # Render the full page as one image.
    FULL_PAGE = 3
64
+
65
+
66
+ # ============================================================================
67
+ # Configuration
68
+ # ============================================================================
69
+
70
@dataclass
class BlockImageConfig:
    """Tunable settings for the block image engine.

    Attributes:
        DEFAULT_DPI: Rendering resolution used unless the image would exceed
            MAX_IMAGE_SIZE.
        MAX_IMAGE_SIZE: Upper bound, in pixels, for the longer image side.
        IMAGE_FORMAT: Name of the output image format.
        REGION_PADDING: Padding added around a region before rendering (pt).
        MIN_REGION_WIDTH: Regions narrower than this are ignored.
        MIN_REGION_HEIGHT: Regions shorter than this are ignored.
        PREFERRED_STRATEGY: One of "semantic", "grid" or "full_page".
        GRID_ROWS: Default row count for the GRID_BLOCKS strategy.
        GRID_COLS: Default column count for the GRID_BLOCKS strategy.
        MERGE_SMALL_BLOCKS: Block merging switch.
        MIN_BLOCK_AREA: Minimum block area (pt²).
        SKIP_EMPTY_BLOCKS: Whether empty blocks are filtered out.
        EMPTY_THRESHOLD: A block counts as empty once its white pixel ratio
            reaches this value.
    """

    # --- rendering ---
    DEFAULT_DPI: int = 300
    MAX_IMAGE_SIZE: int = 4096
    IMAGE_FORMAT: str = "PNG"

    # --- region geometry ---
    REGION_PADDING: int = 5
    MIN_REGION_WIDTH: int = 80
    MIN_REGION_HEIGHT: int = 60

    # --- splitting strategy ---
    PREFERRED_STRATEGY: str = "semantic"
    GRID_ROWS: int = 2
    GRID_COLS: int = 2

    # --- block merging ---
    MERGE_SMALL_BLOCKS: bool = True
    MIN_BLOCK_AREA: float = 15000.0

    # --- empty block filtering ---
    SKIP_EMPTY_BLOCKS: bool = True
    EMPTY_THRESHOLD: float = 0.95
101
+
102
+
103
@dataclass
class BlockImageResult:
    """Outcome of rendering and saving a single block image.

    Attributes:
        bbox: Region the result refers to, as (x0, y0, x1, y1).
        image_size: Image size as a pair of ints.
        dpi: DPI recorded for the image.
        image_path: Path of the saved image, when one exists.
        image_tag: Inline tag in ``[Image:{path}]`` format, when one exists.
        success: Whether processing succeeded.
        error: Error description for a failed result.
        block_type: Block type (article, image, table, etc.).
        reading_order: Position of the block in reading order.
        column_index: Index of the column the block belongs to.
    """

    # Required geometry / image info
    bbox: Tuple[float, float, float, float]
    image_size: Tuple[int, int]
    dpi: int

    # Where the image went
    image_path: Optional[str] = None
    image_tag: Optional[str] = None

    # Status
    success: bool = False
    error: Optional[str] = None

    # Block info (advanced)
    block_type: Optional[str] = None
    reading_order: int = 0
    column_index: int = 0
126
+
127
+
128
@dataclass
class MultiBlockResult:
    """Aggregate outcome of processing one page as several blocks.

    Attributes:
        page_num: Page the result belongs to.
        strategy_used: Strategy that produced the result.
        block_results: Individual block results, in reading order.
        success: Overall success status.
        combined_output: Combined text output (includes all [Image:...] tags).
        total_blocks: Number of blocks considered.
        successful_blocks: Number of blocks that succeeded.
        failed_blocks: Number of blocks that did not succeed.
    """

    # Identification
    page_num: int
    strategy_used: BlockStrategy

    # Per-block results; every instance gets its own list
    block_results: List[BlockImageResult] = field(default_factory=list)

    # Overall status and combined text
    success: bool = False
    combined_output: str = ""

    # Statistics
    total_blocks: int = 0
    successful_blocks: int = 0
    failed_blocks: int = 0
147
+
148
+
149
+ # ============================================================================
150
+ # Block Image Engine
151
+ # ============================================================================
152
+
153
+ class BlockImageEngine:
154
+ """
155
+ Block Image Engine
156
+
157
+ Renders complex regions as images and saves locally.
158
+ Results are returned in [Image:{path}] format.
159
+ """
160
+
161
def __init__(
    self,
    page,
    page_num: int,
    image_processor: ImageProcessor,
    config: Optional[BlockImageConfig] = None
):
    """
    Bind the engine to one page and its collaborators.

    Args:
        page: PyMuPDF page object
        page_num: Page number (0-indexed)
        image_processor: ImageProcessor instance used for saving images
        config: Engine configuration; a default BlockImageConfig is created
            when none (or a falsy value) is given
    """
    self.page = page
    self.page_num = page_num
    self.config = config if config else BlockImageConfig()

    # Cache the page dimensions taken from the page rectangle.
    self.page_width = page.rect.width
    self.page_height = page.rect.height

    self._image_processor = image_processor

    # Hashes of images already processed (duplicate prevention).
    self._processed_hashes: set = set()
186
+
187
def process_region(
    self,
    bbox: Tuple[float, float, float, float],
    region_type: str = "complex_region"
) -> BlockImageResult:
    """
    Render one region of the page as an image and save it locally.

    Steps: reject regions below the configured minimum size, render the
    region, skip images whose bytes were already saved by this engine,
    save through the image processor, then derive the path from the
    returned tag.

    Args:
        bbox: Region to process (x0, y0, x1, y1)
        region_type: Region type (used only in the debug log message)

    Returns:
        BlockImageResult. On success ``image_path`` and ``image_tag`` are
        set and ``success`` is True; otherwise ``success`` is False and
        ``error`` holds a short reason. Exceptions are caught and reported
        through ``error`` rather than raised.
    """
    def failed(reason: str, image_size=(0, 0), dpi: int = 0) -> BlockImageResult:
        # Single place that builds every failure result for this bbox.
        return BlockImageResult(
            bbox=bbox,
            image_size=image_size,
            dpi=dpi,
            success=False,
            error=reason
        )

    try:
        # Minimum size validation
        width = bbox[2] - bbox[0]
        height = bbox[3] - bbox[1]
        if width < self.config.MIN_REGION_WIDTH or height < self.config.MIN_REGION_HEIGHT:
            return failed("Region too small")

        # 1. Render region image
        image_bytes, actual_dpi, image_size = self._render_region(bbox)
        if image_bytes is None:
            return failed("Failed to render region", dpi=self.config.DEFAULT_DPI)

        # 2. Duplicate check on the rendered bytes
        image_hash = hashlib.md5(image_bytes).hexdigest()
        if image_hash in self._processed_hashes:
            return failed("Duplicate image", image_size, actual_dpi)

        # 3. Save locally (using ImageProcessor)
        image_tag = self._image_processor.save_image(image_bytes)
        if not image_tag:
            return failed("Failed to save image", image_size, actual_dpi)

        # Remember the hash only once the image really exists, so that a
        # failed save does not make a later retry look like a duplicate.
        self._processed_hashes.add(image_hash)

        # Extract the path from the tag. Only the wrapping "[Image:" and "]"
        # are removed, so a "]" inside the path itself survives.
        prefix, suffix = "[Image:", "]"
        if image_tag.startswith(prefix) and image_tag.endswith(suffix):
            image_path = image_tag[len(prefix):-len(suffix)]
        else:
            # Tag is not in the expected shape: keep the previous
            # best-effort stripping.
            image_path = image_tag.replace(prefix, "").replace(suffix, "")

        logger.debug(f"[BlockImageEngine] Saved {region_type} at page {self.page_num + 1}: {image_path}")

        return BlockImageResult(
            bbox=bbox,
            image_size=image_size,
            dpi=actual_dpi,
            image_path=image_path,
            image_tag=image_tag,
            success=True
        )

    except Exception as e:
        logger.error(f"[BlockImageEngine] Error processing region {bbox}: {e}")
        return failed(str(e), dpi=self.config.DEFAULT_DPI)
275
+
276
def process_full_page(self, region_type: str = "full_page") -> BlockImageResult:
    """
    Render the whole page as a single image and save it locally.

    Delegates to process_region with a bbox that spans the page from
    (0, 0) to (page_width, page_height).

    Args:
        region_type: Region type (for logging)

    Returns:
        BlockImageResult object
    """
    whole_page = (0, 0, self.page_width, self.page_height)
    return self.process_region(whole_page, region_type)
288
+
289
def process_regions(
    self,
    bboxes: List[Tuple[float, float, float, float]],
    region_type: str = "complex_region"
) -> List[BlockImageResult]:
    """
    Process several regions one after another.

    Args:
        bboxes: List of regions to process
        region_type: Region type passed on to process_region for each bbox

    Returns:
        List of BlockImageResult objects, one per input bbox and in the
        same order
    """
    return [self.process_region(region, region_type) for region in bboxes]
309
+
310
def _render_region(
    self,
    bbox: Tuple[float, float, float, float]
) -> Tuple[Optional[bytes], int, Tuple[int, int]]:
    """
    Render a region of the page to PNG bytes.

    The bbox is grown by the configured padding and clamped to the page.
    Rendering starts from DEFAULT_DPI; when the longer side would exceed
    MAX_IMAGE_SIZE pixels the DPI is reduced (and truncated to an int) so
    the image fits. The output is always PNG; config.IMAGE_FORMAT is not
    consulted here.

    Args:
        bbox: Region to render (x0, y0, x1, y1)

    Returns:
        (image bytes, actual DPI, (width, height)) on success, or
        (None, 0, (0, 0)) when the region is empty after clamping or
        rendering fails.
    """
    try:
        # Apply padding, clamped to the page
        padding = self.config.REGION_PADDING
        x0 = max(0, bbox[0] - padding)
        y0 = max(0, bbox[1] - padding)
        x1 = min(self.page_width, bbox[2] + padding)
        y1 = min(self.page_height, bbox[3] + padding)

        region_width = x1 - x0
        region_height = y1 - y0

        # A bbox outside the page (or an inverted one) leaves nothing to
        # render; fail explicitly instead of handing the renderer an
        # empty or inverted clip.
        if region_width <= 0 or region_height <= 0:
            logger.debug(f"[BlockImageEngine] Empty region after clamping: {bbox}")
            return None, 0, (0, 0)

        # Create clip rect
        clip_rect = fitz.Rect(x0, y0, x1, y1)

        # Calculate DPI (considering max image size); 72 pt per inch
        dpi = self.config.DEFAULT_DPI
        max_dim = max(region_width, region_height)
        expected_size = max_dim * dpi / 72

        if expected_size > self.config.MAX_IMAGE_SIZE:
            # Adjust DPI so the longer side fits into MAX_IMAGE_SIZE
            dpi = int(self.config.MAX_IMAGE_SIZE * 72 / max_dim)

        # Create matrix (zoom = DPI / 72)
        zoom = dpi / 72
        matrix = fitz.Matrix(zoom, zoom)

        # Render
        pix = self.page.get_pixmap(matrix=matrix, clip=clip_rect)

        # Convert to PNG bytes
        image_bytes = pix.tobytes("png")
        image_size = (pix.width, pix.height)

        return image_bytes, dpi, image_size

    except Exception as e:
        logger.error(f"[BlockImageEngine] Render error: {e}")
        return None, 0, (0, 0)
363
+
364
def render_to_bytes(
    self,
    bbox: Tuple[float, float, float, float]
) -> Optional[bytes]:
    """
    Render a region to image bytes without saving anything.

    Args:
        bbox: Region to render

    Returns:
        The bytes element returned by _render_region (None when that
        method reports a failure)
    """
    png_bytes, _dpi, _size = self._render_region(bbox)
    return png_bytes
379
+
380
+ # ========================================================================
381
+ # Advanced Block Processing
382
+ # ========================================================================
383
+
384
def process_page_as_semantic_blocks(self) -> MultiBlockResult:
    """
    Advanced processing: split the page into semantic block units.

    Unlike traditional FULL_PAGE_OCR:
    1. Detect semantic blocks with LayoutBlockDetector
    2. Render each sufficiently large block as an individual image
    3. Generate [Image:path] tags in reading order

    Falls back to full-page imaging when no blocks are detected, when no
    block yields an image, or when any exception is raised.

    Returns:
        MultiBlockResult object. For a semantic result ``total_blocks`` is
        the number of detected blocks and ``failed_blocks`` counts every
        detected block that did not yield an image, including blocks
        skipped for being smaller than MIN_BLOCK_AREA.
    """
    try:
        # 1. Layout block detection (imported lazily, inside the try, so an
        #    import failure also ends in the full-page fallback)
        from xgen_doc2chunk.core.processor.pdf_helpers.pdf_layout_block_detector import (
            LayoutBlockDetector,
        )

        detector = LayoutBlockDetector(self.page, self.page_num)
        layout_result = detector.detect()

        if not layout_result.blocks:
            logger.warning("[BlockImageEngine] No blocks detected, falling back to full page")
            return self._fallback_to_full_page()

        logger.info(f"[BlockImageEngine] Page {self.page_num + 1}: "
                    f"Detected {len(layout_result.blocks)} semantic blocks in {layout_result.column_count} columns")

        # 2. Process each block as individual image
        block_results: List[BlockImageResult] = []

        for block in layout_result.blocks:
            # Filter out blocks that are too small (by area)
            # NOTE: Process if block region is valid even without elements
            if block.area < self.config.MIN_BLOCK_AREA:
                logger.debug(f"[BlockImageEngine] Skipping small block: area={block.area:.0f}")
                continue

            # Resolve the type name once; it is used for logging and metadata
            type_name = block.block_type.name if block.block_type else "unknown"

            result = self.process_region(block.bbox, region_type=type_name)

            # Add block metadata
            result.block_type = type_name
            result.reading_order = block.reading_order
            result.column_index = block.column_index

            if result.success:
                block_results.append(result)

        if not block_results:
            logger.warning("[BlockImageEngine] No valid blocks, falling back to full page")
            return self._fallback_to_full_page()

        # 3. Sort by reading order
        block_results.sort(key=lambda r: r.reading_order)

        # 4. Generate combined output
        combined_output = self._generate_combined_output(block_results)

        return MultiBlockResult(
            page_num=self.page_num,
            strategy_used=BlockStrategy.SEMANTIC_BLOCKS,
            block_results=block_results,
            success=True,
            combined_output=combined_output,
            total_blocks=len(layout_result.blocks),
            successful_blocks=len(block_results),
            failed_blocks=len(layout_result.blocks) - len(block_results)
        )

    except Exception as e:
        logger.error(f"[BlockImageEngine] Semantic block processing failed: {e}")
        return self._fallback_to_full_page()
460
+
461
def process_page_as_grid_blocks(
    self,
    rows: Optional[int] = None,
    cols: Optional[int] = None
) -> MultiBlockResult:
    """
    Process the page by cutting it into a rows x cols grid.

    Used as fallback when semantic analysis fails. Cells are visited left
    to right, top to bottom. A cell is skipped when SKIP_EMPTY_BLOCKS is
    set and the cell is detected as empty. Any exception leads to the
    full-page fallback.

    Args:
        rows: Number of rows (default: config.GRID_ROWS)
        cols: Number of columns (default: config.GRID_COLS)

    Returns:
        MultiBlockResult object
    """
    # A falsy value (None or 0) selects the configured default.
    if not rows:
        rows = self.config.GRID_ROWS
    if not cols:
        cols = self.config.GRID_COLS

    try:
        cell_width = self.page_width / cols
        cell_height = self.page_height / rows

        saved_cells: List[BlockImageResult] = []

        for row in range(rows):
            top = row * cell_height
            bottom = (row + 1) * cell_height

            for col in range(cols):
                cell_bbox = (col * cell_width, top, (col + 1) * cell_width, bottom)

                # Check if region is empty
                if self.config.SKIP_EMPTY_BLOCKS and self._is_empty_region(cell_bbox):
                    continue

                cell_result = self.process_region(cell_bbox, region_type="grid_cell")
                # The reading order of a cell equals the number of cells
                # saved before it.
                cell_result.reading_order = len(saved_cells)
                cell_result.column_index = col

                if cell_result.success:
                    saved_cells.append(cell_result)

        cell_count = rows * cols
        saved_count = len(saved_cells)

        return MultiBlockResult(
            page_num=self.page_num,
            strategy_used=BlockStrategy.GRID_BLOCKS,
            block_results=saved_cells,
            success=saved_count > 0,
            combined_output=self._generate_combined_output(saved_cells),
            total_blocks=cell_count,
            successful_blocks=saved_count,
            failed_blocks=cell_count - saved_count
        )

    except Exception as e:
        logger.error(f"[BlockImageEngine] Grid processing failed: {e}")
        return self._fallback_to_full_page()
526
+
527
def process_page_smart(self) -> MultiBlockResult:
    """
    Smart processing: try the strategies in order of preference.

    1. Try semantic block splitting and use its result when it produced
       at least one image
    2. Otherwise try grid splitting, sized from the detected column count
       and the page aspect ratio; use it when at least two cells succeed
    3. Otherwise fall back to full page imaging

    Note that the semantic step falls back to a full-page image on its
    own, so the result accepted in step 1 may already carry the FULL_PAGE
    strategy; the log message reports the strategy that was actually used.

    Returns:
        MultiBlockResult object
    """
    # 1. Try semantic block splitting
    result = self.process_page_as_semantic_blocks()

    # Same acceptance rule as before, written as one condition: two or
    # more successful blocks, or one successful block with a result entry.
    has_blocks = result.successful_blocks >= 2 or (
        result.successful_blocks == 1 and result.block_results
    )
    if result.success and has_blocks:
        if result.strategy_used is BlockStrategy.SEMANTIC_BLOCKS:
            logger.info(f"[BlockImageEngine] Smart: Using semantic blocks "
                        f"({result.successful_blocks} blocks)")
        else:
            # The semantic step degraded internally; do not claim
            # "semantic blocks" in the log.
            logger.info(f"[BlockImageEngine] Smart: Semantic step fell back to "
                        f"{result.strategy_used.name} ({result.successful_blocks} blocks)")
        return result

    # 2. If semantic analysis results are poor, use grid splitting
    logger.info("[BlockImageEngine] Smart: Semantic blocks insufficient, trying grid")

    # Determine grid based on column count
    try:
        from xgen_doc2chunk.core.processor.pdf_helpers.pdf_layout_block_detector import (
            LayoutBlockDetector,
        )
        detector = LayoutBlockDetector(self.page, self.page_num)
        layout_result = detector.detect()

        cols = max(2, layout_result.column_count)
        rows = max(2, int(self.page_height / self.page_width * cols))

        result = self.process_page_as_grid_blocks(rows=rows, cols=cols)

        if result.success and result.successful_blocks >= 2:
            logger.info(f"[BlockImageEngine] Smart: Using grid {rows}x{cols} "
                        f"({result.successful_blocks} blocks)")
            return result
    except Exception as e:
        # Best effort: the grid attempt is optional, but leave a trace
        # instead of swallowing the failure silently.
        logger.warning(f"[BlockImageEngine] Smart: Grid attempt failed: {e}")

    # 3. Full page fallback
    logger.info("[BlockImageEngine] Smart: Falling back to full page")
    return self._fallback_to_full_page()
574
+
575
def _fallback_to_full_page(self) -> MultiBlockResult:
    """Render the whole page as one image and wrap it in a MultiBlockResult.

    The wrapper always reports one block in total; whether it counts as
    successful or failed follows the full-page result.
    """
    page_result = self.process_full_page()
    ok = page_result.success

    if ok:
        blocks = [page_result]
        output = page_result.image_tag
        succeeded, failed = 1, 0
    else:
        blocks = []
        output = ""
        succeeded, failed = 0, 1

    return MultiBlockResult(
        page_num=self.page_num,
        strategy_used=BlockStrategy.FULL_PAGE,
        block_results=blocks,
        success=ok,
        combined_output=output,
        total_blocks=1,
        successful_blocks=succeeded,
        failed_blocks=failed
    )
589
+
590
def _is_empty_region(self, bbox: Tuple[float, float, float, float]) -> bool:
    """Check if region is empty (mostly white).

    The region is rendered, and a pixel counts as nearly white when its
    R, G and B values are all above 240. The region is empty when the
    share of such pixels reaches config.EMPTY_THRESHOLD.

    Args:
        bbox: Region to inspect (x0, y0, x1, y1)

    Returns:
        True when the region is considered empty (also for an image with
        zero pixels). False when it has content, when rendering yields no
        bytes, or when the analysis fails, so the caller goes on to
        process the region.
    """
    try:
        image_bytes, _, _ = self._render_region(bbox)
        if not image_bytes:
            return False

        # Local import: the module itself only imports PIL.Image.
        from PIL import ImageChops

        # Context manager so the decoded image is released afterwards.
        with Image.open(io.BytesIO(image_bytes)) as img:
            rgb = img if img.mode == 'RGB' else img.convert('RGB')

            total_pixels = rgb.width * rgb.height
            if total_pixels == 0:
                return True

            # Threshold every band (255 where the value is > 240, else 0),
            # then take the per-pixel minimum across the bands: a pixel
            # ends up 255 only if R, G and B are all > 240. Counting via
            # the histogram avoids building a Python tuple per pixel.
            thresholded = rgb.point(lambda value: 255 if value > 240 else 0)
            red, green, blue = thresholded.split()
            nearly_white = ImageChops.darker(ImageChops.darker(red, green), blue)
            white_pixels = nearly_white.histogram()[255]

        white_ratio = white_pixels / total_pixels
        return white_ratio >= self.config.EMPTY_THRESHOLD

    except Exception as e:
        # Best effort: an analysis failure must not drop the region.
        logger.debug(f"[BlockImageEngine] Empty-region check failed for {bbox}: {e}")
        return False
618
+
619
def _generate_combined_output(self, block_results: List[BlockImageResult]) -> str:
    """
    Build the combined output string for a list of block results.

    Results are emitted in the order given. Results that failed or have
    no image tag are left out. Certain block types get an HTML comment
    line in front of their tag as a context hint; every other type
    (ARTICLE, COLUMN_BLOCK, etc.) contributes the bare tag. Entries are
    joined with newlines.
    """
    if not block_results:
        return ""

    # Context hint placed before the tag, keyed by block type name.
    hint_by_type = {
        "HEADER": "<!-- Page Header -->\n",
        "FOOTER": "<!-- Page Footer -->\n",
        "TABLE": "<!-- Table -->\n",
        "IMAGE_WITH_CAPTION": "<!-- Figure -->\n",
        "STANDALONE_IMAGE": "<!-- Figure -->\n",
        "ADVERTISEMENT": "<!-- Advertisement -->\n",
        "SIDEBAR": "<!-- Sidebar -->\n",
    }

    pieces = []
    for entry in block_results:
        if not (entry.success and entry.image_tag):
            continue

        hint = hint_by_type.get(entry.block_type or "unknown")
        if hint is None:
            # General content block: tag only
            pieces.append(entry.image_tag)
        else:
            pieces.append(f"{hint}{entry.image_tag}")

    return "\n".join(pieces)
655
+
656
+
657
+ # ============================================================================
658
+ # Export
659
+ # ============================================================================
660
+
661
# Names exported by this module: the strategy enum, the configuration and
# result dataclasses, and the engine class.
__all__ = [
    'BlockStrategy',
    'BlockImageConfig',
    'BlockImageResult',
    'MultiBlockResult',
    'BlockImageEngine',
]