xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1244 @@
1
+ """
2
+ Layout Block Detector for PDF Handler
3
+
4
+ Divides complex multi-column layouts (newspapers, magazines, etc.) into semantic block units.
5
+
6
+ =============================================================================
7
+ =============================================================================
8
+ Instead of processing the entire page as a single image,
9
+ the page is divided into **semantic/logical block units** and saved as individual PNGs.
10
+
11
+ This enables:
12
+ 1. LLM can interpret each block **individually**
13
+ 2. Resolution issue resolved (maintains high resolution per block)
14
+ 3. Preserves reading order
15
+ 4. Context separation (distinguishes ads/articles/tables)
16
+
17
+ =============================================================================
18
+ Layout Analysis Algorithm:
19
+ =============================================================================
20
+
21
+ Phase 1: Basic Analysis
22
+ ┌─────────────────────────────────────────────────────────────────┐
23
+ │ 1. Extract text blocks │
24
+ │ 2. Extract image/graphic regions │
25
+ │ 3. Extract drawings (lines, boxes) │
26
+ │ 4. Identify table regions │
27
+ └─────────────────────────────────────────────────────────────────┘
28
+
29
+ Phase 2: Column Detection (Multi-column Layout)
30
+ ┌─────────────────────────────────────────────────────────────────┐
31
+ │ 1. X-coordinate based clustering │
32
+ │ 2. Identify column boundaries │
33
+ │ 3. Group content by column │
34
+ └─────────────────────────────────────────────────────────────────┘
35
+
36
+ Phase 3: Semantic Block Clustering
37
+ ┌─────────────────────────────────────────────────────────────────┐
38
+ │ 1. Connect adjacent elements (distance-based) │
39
+ │ 2. Connect headline-body (font size analysis) │
40
+ │ 3. Connect image-caption (positional relationship) │
41
+ │ 4. Separate regions based on dividers/boxes │
42
+ └─────────────────────────────────────────────────────────────────┘
43
+
44
+ Phase 4: Block Optimization and Ordering
45
+ ┌─────────────────────────────────────────────────────────────────┐
46
+ │ 1. Merge small blocks │
47
+ │ 2. Resolve overlaps │
48
+ │ 3. Determine reading order (column → top-to-bottom) │
49
+ │ 4. Normalize block bboxes │
50
+ └─────────────────────────────────────────────────────────────────┘
51
+
52
+ =============================================================================
53
+ Block Types:
54
+ =============================================================================
55
+ - ARTICLE: Article block (headline + body)
56
+ - IMAGE_WITH_CAPTION: Image + caption
57
+ - TABLE: Table region
58
+ - ADVERTISEMENT: Advertisement region (separated by box)
59
+ - SIDEBAR: Sidebar/infobox
60
+ - HEADER / FOOTER: Page header / page footer (separate block types)
61
+ - UNKNOWN: Unclassifiable
62
+ """
63
+
64
+ import logging
65
+ from dataclasses import dataclass, field
66
+ from typing import List, Dict, Optional, Tuple, Set
67
+ from enum import Enum, auto
68
+ from collections import defaultdict
69
+ import math
70
+
71
+ import fitz
72
+
73
+ logger = logging.getLogger(__name__)
74
+
75
+
76
+ # ============================================================================
77
+ # Types and Enums
78
+ # ============================================================================
79
+
80
class LayoutBlockType(Enum):
    """Category assigned to a detected layout block."""

    # Article: headline plus body text
    ARTICLE = auto()
    # Image together with its caption
    IMAGE_WITH_CAPTION = auto()
    # Image without any attached caption
    STANDALONE_IMAGE = auto()
    # Table region
    TABLE = auto()
    # Advertisement region
    ADVERTISEMENT = auto()
    # Sidebar / infobox
    SIDEBAR = auto()
    # Page header
    HEADER = auto()
    # Page footer
    FOOTER = auto()
    # Block that represents a whole column
    COLUMN_BLOCK = auto()
    # Could not be classified
    UNKNOWN = auto()
92
+
93
+
94
@dataclass
class ContentElement:
    """One piece of content found on a page (text, image, drawing or table)."""

    # Kind of element: 'text', 'image', 'drawing' or 'table'
    element_type: str
    # Bounding box as four floats, used elsewhere in this file as (x0, y0, x1, y1)
    bbox: Tuple[float, float, float, float]
    # Text payload, when the element carries any
    content: Optional[str] = None

    # --- Text properties ---
    font_size: float = 0.0
    is_bold: bool = False
    text_length: int = 0

    # --- Image properties ---
    image_area: float = 0.0

    # --- Clustering ---
    # Group ID; stays -1 until clustering assigns one
    group_id: int = -1
111
+
112
+
113
@dataclass
class LayoutBlock:
    """A semantic block of the page layout together with its geometry."""

    block_id: int
    block_type: LayoutBlockType
    # Bounding box indexed as (x0, y0, x1, y1) by the properties below
    bbox: Tuple[float, float, float, float]

    # Elements contained in this block
    elements: List[ContentElement] = field(default_factory=list)

    # Index of the column this block belongs to
    column_index: int = 0

    # Position in reading order (starts from 0)
    reading_order: int = 0

    # Confidence (0.0 ~ 1.0)
    confidence: float = 1.0

    # Free-form metadata
    metadata: Dict = field(default_factory=dict)

    @property
    def width(self) -> float:
        """Horizontal extent of the bounding box."""
        return self.bbox[2] - self.bbox[0]

    @property
    def height(self) -> float:
        """Vertical extent of the bounding box."""
        return self.bbox[3] - self.bbox[1]

    @property
    def area(self) -> float:
        """Bounding-box area (width times height)."""
        return self.width * self.height

    @property
    def center(self) -> Tuple[float, float]:
        """Midpoint of the bounding box as (x, y)."""
        box = self.bbox
        mid_x = (box[0] + box[2]) / 2
        mid_y = (box[1] + box[3]) / 2
        return (mid_x, mid_y)
155
+
156
+
157
@dataclass
class ColumnInfo:
    """Horizontal extent of one detected column plus the blocks placed in it."""

    index: int
    # Left and right x-coordinates of the column
    x_start: float
    x_end: float

    # Blocks that fall inside this column
    blocks: List[LayoutBlock] = field(default_factory=list)

    @property
    def width(self) -> float:
        """Column width (x_end minus x_start)."""
        return self.x_end - self.x_start
170
+
171
+
172
@dataclass
class LayoutAnalysisResult:
    """Everything produced by analysing the layout of a single page."""

    page_num: int
    # Page dimensions as (width, height)
    page_size: Tuple[float, float]

    # --- Columns ---
    columns: List[ColumnInfo] = field(default_factory=list)
    column_count: int = 1

    # --- Layout blocks (sorted by reading order) ---
    blocks: List[LayoutBlock] = field(default_factory=list)

    # --- Header/footer regions (None when not detected) ---
    header_region: Optional[Tuple[float, float, float, float]] = None
    footer_region: Optional[Tuple[float, float, float, float]] = None

    # --- Statistics ---
    total_text_elements: int = 0
    total_image_elements: int = 0

    # --- Overall analysis confidence ---
    confidence: float = 1.0
195
+
196
+
197
+ # ============================================================================
198
+ # Configuration
199
+ # ============================================================================
200
+
201
@dataclass
class LayoutDetectorConfig:
    """Tunable thresholds for layout detection (lengths in pt unless noted)."""

    # ---- Column detection ----
    # Minimum gap between columns (pt)
    MIN_COLUMN_GAP: float = 20.0
    # X-coordinate clustering tolerance (pt)
    COLUMN_CLUSTER_TOLERANCE: float = 30.0

    # ---- Block clustering ----
    # Element proximity threshold (pt)
    ELEMENT_PROXIMITY_THRESHOLD: float = 15.0
    # Vertical merge distance (pt) - aggressive merge
    VERTICAL_MERGE_THRESHOLD: float = 40.0
    # Horizontal merge distance (pt) - aggressive merge
    HORIZONTAL_MERGE_THRESHOLD: float = 15.0

    # ---- Headline detection ----
    # Headline font ratio vs. body text
    HEADLINE_FONT_RATIO: float = 1.3
    # Minimum headline font size (pt)
    HEADLINE_MIN_SIZE: float = 14.0

    # ---- Image-caption connection ----
    # Max image-caption distance (pt)
    CAPTION_MAX_DISTANCE: float = 30.0
    # Max caption height (pt)
    CAPTION_MAX_HEIGHT: float = 50.0

    # ---- Header/footer ----
    # Max header height (pt)
    HEADER_MAX_HEIGHT: float = 60.0
    # Max footer height (pt)
    FOOTER_MAX_HEIGHT: float = 60.0
    # Page top/bottom margin ratio
    HEADER_FOOTER_MARGIN: float = 0.1

    # ---- Minimum block size (small blocks are merge candidates) ----
    # Minimum block width (pt)
    MIN_BLOCK_WIDTH: float = 80.0
    # Minimum block height (pt)
    MIN_BLOCK_HEIGHT: float = 60.0
    # Minimum block area (pt²) (~100x150pt)
    MIN_BLOCK_AREA: float = 15000.0

    # ---- Block count target (prevents too many blocks) ----
    # Minimum blocks per page
    TARGET_MIN_BLOCKS: int = 3
    # Maximum blocks per page (considering 5-column newspapers)
    TARGET_MAX_BLOCKS: int = 10
    # Aggressive merge if more than this
    AGGRESSIVE_MERGE_THRESHOLD: int = 15

    # ---- Advertisement detection ----
    # Detect advertisements enclosed by boxes
    AD_BOX_DETECTION: bool = True
    # Minimum box area to be considered as advertisement
    AD_MIN_BOX_AREA: float = 10000.0

    # ---- Separator detection ----
    # Minimum separator length (relative to page width)
    SEPARATOR_MIN_LENGTH_RATIO: float = 0.3
    # Maximum separator thickness (pt)
    SEPARATOR_MAX_THICKNESS: float = 3.0
244
+
245
+
246
+ # ============================================================================
247
+ # Layout Block Detector
248
+ # ============================================================================
249
+
250
+ class LayoutBlockDetector:
251
+ """
252
+ Layout Block Detector
253
+
254
+ Divides complex multi-column layouts into semantic block units.
255
+ """
256
+
257
def __init__(
    self,
    page,
    page_num: int,
    config: Optional[LayoutDetectorConfig] = None
):
    """
    Store the page, its size and the configuration, and reset all state.

    Args:
        page: PyMuPDF page object
        page_num: Page number (0-indexed)
        config: Detection configuration; a default LayoutDetectorConfig
            is created when none is supplied
    """
    self.page = page
    self.page_num = page_num
    self.config = config if config else LayoutDetectorConfig()

    # Page dimensions come from the page rectangle
    self.page_width, self.page_height = page.rect.width, page.rect.height

    # Caches, all initially empty
    self._text_dict: Optional[Dict] = None
    self._drawings: Optional[List] = None
    self._images: Optional[List] = None

    # Working state, filled in by the extraction methods
    self._elements: List[ContentElement] = []
    self._separators: List[Tuple[float, float, float, float]] = []
    self._boxes: List[Tuple[float, float, float, float]] = []
285
+
286
def detect(self) -> LayoutAnalysisResult:
    """
    Run every detection phase for the page and collect the outcome.

    Each phase is guarded on its own: a failure is logged as a warning and
    replaced by a neutral fallback so the following phases still run. An
    error that escapes those guards yields a single low-confidence block
    covering the whole page.

    Returns:
        LayoutAnalysisResult object
    """
    def whole_page_column():
        # A single column spanning the full page width
        return [ColumnInfo(index=0, x_start=0, x_end=self.page_width)]

    def count_of(kind):
        # Number of extracted elements of the given type
        return sum(1 for item in self._elements if item.element_type == kind)

    columns = whole_page_column()
    header_region = footer_region = None
    blocks = []

    try:
        # Phase 1: Basic Analysis - content elements
        try:
            self._extract_elements()
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 1 (_extract_elements) failed: {exc}")
            self._elements = []

        # Phase 1 (continued): separators and boxes
        try:
            self._extract_separators_and_boxes()
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 1 (_extract_separators_and_boxes) failed: {exc}")
            self._separators = []
            self._boxes = []

        # Phase 2: Column detection
        try:
            columns = self._detect_columns()
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 2 (_detect_columns) failed: {exc}")
            columns = whole_page_column()

        # Phase 3: Header/footer detection
        try:
            header_region, footer_region = self._detect_header_footer()
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 3 (_detect_header_footer) failed: {exc}")
            header_region = footer_region = None

        # Phase 4: Semantic block clustering
        try:
            blocks = self._cluster_into_blocks(columns, header_region, footer_region)
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 4 (_cluster_into_blocks) failed: {exc}")
            # Fallback: one simple block per column
            blocks = self._create_column_based_blocks(columns)

        # Phase 5: Block classification (failure only logs a warning)
        try:
            self._classify_blocks(blocks)
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 5 (_classify_blocks) failed: {exc}")

        # Phase 6: Block optimization and sorting (failure keeps blocks as they are)
        try:
            blocks = self._optimize_and_sort_blocks(blocks, columns)
        except Exception as exc:
            logger.warning(f"[LayoutBlockDetector] Phase 6 (_optimize_and_sort_blocks) failed: {exc}")

    except Exception as exc:
        logger.error(f"[LayoutBlockDetector] Critical error during detection: {exc}")
        # Minimum fallback: the entire page as a single block
        whole_page = LayoutBlock(
            block_id=0,
            block_type=LayoutBlockType.UNKNOWN,
            bbox=(0, 0, self.page_width, self.page_height),
            elements=self._elements or [],
            column_index=0,
            reading_order=0,
            confidence=0.1
        )
        blocks = [whole_page]

    result = LayoutAnalysisResult(
        page_num=self.page_num,
        page_size=(self.page_width, self.page_height),
        columns=columns,
        column_count=len(columns),
        blocks=blocks,
        header_region=header_region,
        footer_region=footer_region,
        total_text_elements=count_of('text'),
        total_image_elements=count_of('image'),
        confidence=self._calculate_confidence(blocks, columns)
    )

    logger.info(
        f"[LayoutBlockDetector] Page {self.page_num + 1}: detected {len(blocks)} blocks in {len(columns)} columns"
    )

    return result
378
+
379
def _create_column_based_blocks(self, columns: List[ColumnInfo]) -> List[LayoutBlock]:
    """
    Fallback: build one block per column that contains elements.

    Used when clustering fails. Columns without any element are skipped;
    if no column yields a block, the whole page is returned as a single
    UNKNOWN block.
    """
    fallback_blocks = []

    for column in columns:
        # Elements that belong to this column
        members = [
            element for element in self._elements
            if self._element_in_column(element, column)
        ]
        if not members:
            continue

        merged_bbox = self._merge_bboxes([member.bbox for member in members])
        # IDs (and reading order) run consecutively over the blocks actually created
        next_id = len(fallback_blocks)
        fallback_blocks.append(LayoutBlock(
            block_id=next_id,
            block_type=LayoutBlockType.COLUMN_BLOCK,
            bbox=merged_bbox,
            elements=members,
            column_index=column.index,
            reading_order=next_id,
            confidence=0.5
        ))

    if fallback_blocks:
        return fallback_blocks

    # No elements anywhere: the entire page becomes a single block
    return [LayoutBlock(
        block_id=0,
        block_type=LayoutBlockType.UNKNOWN,
        bbox=(0, 0, self.page_width, self.page_height),
        elements=[],
        column_index=0,
        reading_order=0,
        confidence=0.1
    )]
421
+
422
+ # ========================================================================
423
+ # Phase 1: Basic Analysis
424
+ # ========================================================================
425
+
426
def _extract_elements(self):
    """
    Extract all content elements from the page into ``self._elements``.

    Text blocks (type 0 in the text dict) become 'text' elements carrying
    their stripped text, largest span font size and a bold flag. Images
    whose position can be found become 'image' elements with their area.
    Image extraction is best-effort: a failure for one image is logged at
    debug level and the remaining images are still processed.
    """
    bold_flag = 2 ** 4  # Bold bit in the span "flags" field

    self._elements = []

    # 1. Extract text blocks
    text_dict = self._get_text_dict()
    for block in text_dict.get("blocks", []):
        if block.get("type") != 0:  # Text blocks only
            continue

        bbox = tuple(block.get("bbox", (0, 0, 0, 0)))

        spans = [
            span
            for line in block.get("lines", [])
            for span in line.get("spans", [])
        ]

        # Join once instead of growing a string span by span
        text = "".join(span.get("text", "") for span in spans).strip()
        if not text:
            continue

        # Largest font size in the block, never below 0.0
        max_font_size = max(
            0.0,
            max((span.get("size", 0.0) for span in spans), default=0.0),
        )
        is_bold = any(span.get("flags", 0) & bold_flag for span in spans)

        self._elements.append(ContentElement(
            element_type='text',
            bbox=bbox,
            content=text,
            font_size=max_font_size,
            is_bold=is_bold,
            text_length=len(text)
        ))

    # 2. Extract images
    for img_info in self._get_images():
        xref = img_info[0]
        try:
            # Find image position; images without one are skipped
            img_bbox = self._find_image_position(xref)
            if img_bbox:
                area = (img_bbox[2] - img_bbox[0]) * (img_bbox[3] - img_bbox[1])
                self._elements.append(ContentElement(
                    element_type='image',
                    bbox=img_bbox,
                    image_area=area
                ))
        except Exception as e:
            # Keep going, but leave a trace instead of failing silently
            logger.debug(f"[LayoutBlockDetector] Error processing image xref={xref}: {e}")
481
+
482
+ def _extract_separators_and_boxes(self):
483
+ """Extract separators and boxes."""
484
+ self._separators = []
485
+ self._boxes = []
486
+
487
+ drawings = self._get_drawings()
488
+
489
+ for drawing in drawings:
490
+ try:
491
+ rect = drawing.get("rect")
492
+ if not rect:
493
+ continue
494
+
495
+ # Safely access rect attributes
496
+ try:
497
+ w = rect.width
498
+ h = rect.height
499
+ x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
500
+ except (AttributeError, TypeError):
501
+ # rect might be a tuple
502
+ if isinstance(rect, (list, tuple)) and len(rect) >= 4:
503
+ x0, y0, x1, y1 = rect[0], rect[1], rect[2], rect[3]
504
+ w = x1 - x0
505
+ h = y1 - y0
506
+ else:
507
+ continue
508
+
509
+ # Horizontal separator
510
+ if (h <= self.config.SEPARATOR_MAX_THICKNESS and
511
+ w >= self.page_width * self.config.SEPARATOR_MIN_LENGTH_RATIO):
512
+ self._separators.append((x0, y0, x1, y1))
513
+
514
+ # Vertical separator
515
+ elif (w <= self.config.SEPARATOR_MAX_THICKNESS and
516
+ h >= self.page_height * self.config.SEPARATOR_MIN_LENGTH_RATIO * 0.5):
517
+ self._separators.append((x0, y0, x1, y1))
518
+
519
+ # Box (Advertisement/infobox candidate)
520
+ elif w > 50 and h > 50:
521
+ area = w * h
522
+ if area >= self.config.AD_MIN_BOX_AREA:
523
+ # Check if it's a box with border
524
+ # NOTE: stroke_opacity can be None, so handle safely
525
+ stroke_opacity = drawing.get("stroke_opacity")
526
+ has_stroke = drawing.get("color") or (stroke_opacity is not None and stroke_opacity > 0)
527
+ if has_stroke:
528
+ self._boxes.append((x0, y0, x1, y1))
529
+ except Exception as e:
530
+ # Log and continue on individual drawing processing failure
531
+ logger.debug(f"[LayoutBlockDetector] Error processing drawing: {e}")
532
+ continue
533
+
534
+ # ========================================================================
535
+ # Phase 2: Column detection
536
+ # ========================================================================
537
+
538
def _detect_columns(self) -> List[ColumnInfo]:
    """
    Detect the column structure from the left edges of text elements.

    Left x-coordinates of text elements longer than 20 characters are
    clustered; boundaries are placed midway between neighbouring cluster
    centers that are at least MIN_COLUMN_GAP apart. A single full-width
    column is returned when fewer than two clusters are found.
    """
    def single_column():
        return [ColumnInfo(index=0, x_start=0, x_end=self.page_width)]

    if not self._elements:
        return single_column()

    # Sorted left edges of sufficiently long text elements
    left_edges = sorted(
        element.bbox[0]
        for element in self._elements
        if element.element_type == 'text' and element.text_length > 20
    )
    if not left_edges:
        return single_column()

    clusters = self._cluster_x_positions(left_edges)
    if len(clusters) <= 1:
        return single_column()

    # Mean x of every cluster
    centers = [sum(cluster) / len(cluster) for cluster in clusters]

    # Boundaries: page left edge, accepted midpoints, page right edge
    boundaries = [0]
    for left_center, right_center in zip(centers, centers[1:]):
        if right_center - left_center >= self.config.MIN_COLUMN_GAP:
            boundaries.append((left_center + right_center) / 2)
    boundaries.append(self.page_width)

    # One column per consecutive pair of boundaries
    columns = [
        ColumnInfo(index=position, x_start=start, x_end=end)
        for position, (start, end) in enumerate(zip(boundaries, boundaries[1:]))
    ]

    logger.debug(f"[LayoutBlockDetector] Detected {len(columns)} columns")
    return columns
585
+
586
+ def _cluster_x_positions(self, x_positions: List[float]) -> List[List[float]]:
587
+ """X-coordinate clustering (density-based)."""
588
+ if not x_positions:
589
+ return []
590
+
591
+ clusters = []
592
+ current_cluster = [x_positions[0]]
593
+
594
+ for x in x_positions[1:]:
595
+ if x - current_cluster[-1] <= self.config.COLUMN_CLUSTER_TOLERANCE:
596
+ current_cluster.append(x)
597
+ else:
598
+ if len(current_cluster) >= 3: # Minimum 3 elements
599
+ clusters.append(current_cluster)
600
+ current_cluster = [x]
601
+
602
+ if len(current_cluster) >= 3:
603
+ clusters.append(current_cluster)
604
+
605
+ return clusters
606
+
607
+ # ========================================================================
608
+ # Phase 3: Header/footer detection
609
+ # ========================================================================
610
+
611
+ def _detect_header_footer(self) -> Tuple[Optional[Tuple], Optional[Tuple]]:
612
+ """Detect header and footer regions."""
613
+ header_region = None
614
+ footer_region = None
615
+
616
+ header_boundary = self.page_height * self.config.HEADER_FOOTER_MARGIN
617
+ footer_boundary = self.page_height * (1 - self.config.HEADER_FOOTER_MARGIN)
618
+
619
+ # Analyze top region
620
+ header_elements = [
621
+ e for e in self._elements
622
+ if e.bbox[3] <= header_boundary and e.element_type == 'text'
623
+ ]
624
+
625
+ if header_elements:
626
+ min_y = min(e.bbox[1] for e in header_elements)
627
+ max_y = max(e.bbox[3] for e in header_elements)
628
+
629
+ if max_y - min_y <= self.config.HEADER_MAX_HEIGHT:
630
+ header_region = (0, min_y, self.page_width, max_y)
631
+
632
+ # Analyze bottom region
633
+ footer_elements = [
634
+ e for e in self._elements
635
+ if e.bbox[1] >= footer_boundary and e.element_type == 'text'
636
+ ]
637
+
638
+ if footer_elements:
639
+ min_y = min(e.bbox[1] for e in footer_elements)
640
+ max_y = max(e.bbox[3] for e in footer_elements)
641
+
642
+ if max_y - min_y <= self.config.FOOTER_MAX_HEIGHT:
643
+ footer_region = (0, min_y, self.page_width, max_y)
644
+
645
+ return header_region, footer_region
646
+
647
+ # ========================================================================
648
+ # Phase 4: Semantic block clustering
649
+ # ========================================================================
650
+
651
def _cluster_into_blocks(
    self,
    columns: List[ColumnInfo],
    header_region: Optional[Tuple],
    footer_region: Optional[Tuple]
) -> List[LayoutBlock]:
    """Group the page elements into layout blocks.

    Elements inside the header or footer region become a single HEADER or
    FOOTER block. The remaining elements are processed column by column:
    split vertically by separators, clustered by adjacency, and each
    cluster becomes an UNKNOWN block (classified later) unless its merged
    bbox is narrower than ``MIN_BLOCK_WIDTH`` or shorter than
    ``MIN_BLOCK_HEIGHT``.

    Args:
        columns: Columns to distribute the main elements over.
        header_region: Header bbox or None.
        footer_region: Footer bbox or None.

    Returns:
        Blocks in creation order: header, column blocks, footer.
        ``block_id`` values are assigned sequentially in that order.
    """
    top_items: list = []
    body_items: list = []
    bottom_items: list = []

    # Partition elements: header first, then footer, everything else is body.
    for item in self._elements:
        if header_region and self._is_inside(item.bbox, header_region):
            top_items.append(item)
        elif footer_region and self._is_inside(item.bbox, footer_region):
            bottom_items.append(item)
        else:
            body_items.append(item)

    blocks = []

    # Every append below is the only way ids advance, so the next id is
    # always the current number of blocks.
    if top_items:
        blocks.append(LayoutBlock(
            block_id=len(blocks),
            block_type=LayoutBlockType.HEADER,
            bbox=self._merge_bboxes([item.bbox for item in top_items]),
            elements=top_items
        ))

    for column in columns:
        members = [
            item for item in body_items
            if self._element_in_column(item, column)
        ]
        if not members:
            continue

        for band in self._split_by_separators(members, column):
            if not band:
                continue

            for group in self._cluster_adjacent_elements(band):
                if not group:
                    continue

                box = self._merge_bboxes([item.bbox for item in group])

                # Drop clusters that are too small to be a block.
                too_small = (
                    box[2] - box[0] < self.config.MIN_BLOCK_WIDTH or
                    box[3] - box[1] < self.config.MIN_BLOCK_HEIGHT
                )
                if too_small:
                    continue

                blocks.append(LayoutBlock(
                    block_id=len(blocks),
                    block_type=LayoutBlockType.UNKNOWN,  # Classify later
                    bbox=box,
                    elements=group,
                    column_index=column.index
                ))

    if bottom_items:
        blocks.append(LayoutBlock(
            block_id=len(blocks),
            block_type=LayoutBlockType.FOOTER,
            bbox=self._merge_bboxes([item.bbox for item in bottom_items]),
            elements=bottom_items
        ))

    return blocks
737
+
738
+ def _element_in_column(self, elem: ContentElement, col: ColumnInfo) -> bool:
739
+ """Check if element belongs to column."""
740
+ elem_center_x = (elem.bbox[0] + elem.bbox[2]) / 2
741
+ return col.x_start <= elem_center_x <= col.x_end
742
+
743
+ def _split_by_separators(
744
+ self,
745
+ elements: List[ContentElement],
746
+ col: ColumnInfo
747
+ ) -> List[List[ContentElement]]:
748
+ """Split vertically based on separators."""
749
+ if not elements:
750
+ return []
751
+
752
+ # Find horizontal separators within this column
753
+ col_separators = []
754
+ for sep in self._separators:
755
+ # Check if horizontal separator overlaps with this column
756
+ is_horizontal = abs(sep[3] - sep[1]) < 5
757
+ if is_horizontal:
758
+ sep_start_x = sep[0]
759
+ sep_end_x = sep[2]
760
+ if (sep_start_x <= col.x_end and sep_end_x >= col.x_start):
761
+ col_separators.append(sep[1]) # Y coordinate
762
+
763
+ if not col_separators:
764
+ return [elements]
765
+
766
+ # Sort separator positions
767
+ col_separators.sort()
768
+
769
+ # Split elements based on separators
770
+ groups = []
771
+ boundaries = [0] + col_separators + [self.page_height]
772
+
773
+ for i in range(len(boundaries) - 1):
774
+ y_start = boundaries[i]
775
+ y_end = boundaries[i + 1]
776
+
777
+ group = [
778
+ e for e in elements
779
+ if e.bbox[1] >= y_start - 5 and e.bbox[3] <= y_end + 5
780
+ ]
781
+
782
+ if group:
783
+ groups.append(group)
784
+
785
+ return groups if groups else [elements]
786
+
787
+ def _cluster_adjacent_elements(
788
+ self,
789
+ elements: List[ContentElement]
790
+ ) -> List[List[ContentElement]]:
791
+ """Adjacent element clustering."""
792
+ if not elements:
793
+ return []
794
+
795
+ if len(elements) == 1:
796
+ return [elements]
797
+
798
+ # Sort elements by Y coordinate
799
+ sorted_elements = sorted(elements, key=lambda e: (e.bbox[1], e.bbox[0]))
800
+
801
+ # Union-Find style clustering
802
+ clusters: List[List[ContentElement]] = []
803
+ used = set()
804
+
805
+ for elem in sorted_elements:
806
+ if id(elem) in used:
807
+ continue
808
+
809
+ # Start new cluster
810
+ cluster = [elem]
811
+ used.add(id(elem))
812
+ queue = [elem]
813
+
814
+ while queue:
815
+ current = queue.pop(0)
816
+
817
+ for other in sorted_elements:
818
+ if id(other) in used:
819
+ continue
820
+
821
+ if self._are_adjacent(current, other):
822
+ cluster.append(other)
823
+ used.add(id(other))
824
+ queue.append(other)
825
+
826
+ clusters.append(cluster)
827
+
828
+ return clusters
829
+
830
+ def _are_adjacent(self, e1: ContentElement, e2: ContentElement) -> bool:
831
+ """Check if two elements are adjacent."""
832
+ # Vertical gap
833
+ vertical_gap = max(0, e2.bbox[1] - e1.bbox[3], e1.bbox[1] - e2.bbox[3])
834
+
835
+ # Horizontal overlap
836
+ x_overlap_start = max(e1.bbox[0], e2.bbox[0])
837
+ x_overlap_end = min(e1.bbox[2], e2.bbox[2])
838
+ has_x_overlap = x_overlap_start < x_overlap_end
839
+
840
+ # Vertically adjacent (same X range, close Y)
841
+ if has_x_overlap and vertical_gap <= self.config.VERTICAL_MERGE_THRESHOLD:
842
+ return True
843
+
844
+ # Horizontally adjacent (same Y range)
845
+ horizontal_gap = max(0, e2.bbox[0] - e1.bbox[2], e1.bbox[0] - e2.bbox[2])
846
+
847
+ y_overlap_start = max(e1.bbox[1], e2.bbox[1])
848
+ y_overlap_end = min(e1.bbox[3], e2.bbox[3])
849
+ has_y_overlap = y_overlap_start < y_overlap_end
850
+
851
+ if has_y_overlap and horizontal_gap <= self.config.HORIZONTAL_MERGE_THRESHOLD:
852
+ return True
853
+
854
+ return False
855
+
856
+ # ========================================================================
857
+ # Phase 5: Block classification
858
+ # ========================================================================
859
+
860
def _classify_blocks(self, blocks: List[LayoutBlock]):
    """Assign a block type to every block, in place.

    HEADER and FOOTER blocks keep their type; all others receive the
    result of ``_determine_block_type``.
    """
    fixed_types = (LayoutBlockType.HEADER, LayoutBlockType.FOOTER)
    for candidate in blocks:
        if candidate.block_type not in fixed_types:
            candidate.block_type = self._determine_block_type(candidate)
867
+
868
def _determine_block_type(self, block: LayoutBlock) -> LayoutBlockType:
    """Classify a block from the kinds of elements it contains.

    Rules, in order:
      * image(s) and text      -> IMAGE_WITH_CAPTION
      * image(s) only          -> STANDALONE_IMAGE
      * text with a headline   -> ARTICLE (largest font is at least
        ``HEADLINE_MIN_SIZE`` and at least ``HEADLINE_FONT_RATIO`` times
        the average font size)
      * text inside a box      -> ADVERTISEMENT when the total text length
        is below 200, otherwise SIDEBAR
      * any other text         -> ARTICLE
      * neither text nor image -> UNKNOWN
    """
    texts = [item for item in block.elements if item.element_type == 'text']
    images = [item for item in block.elements if item.element_type == 'image']

    if images:
        if not texts:
            return LayoutBlockType.STANDALONE_IMAGE

        # A confirmed caption and the fallback currently map to the same
        # type; the check is kept so the two cases stay distinguishable.
        if any(
            self._is_caption_of_image(text_item, image_item)
            for image_item in images
            for text_item in texts
        ):
            return LayoutBlockType.IMAGE_WITH_CAPTION
        return LayoutBlockType.IMAGE_WITH_CAPTION  # Default assumption

    if not texts:
        return LayoutBlockType.UNKNOWN

    # Text-only block: look for a headline-sized font first.
    mean_size = sum(item.font_size for item in texts) / len(texts)
    largest_size = max(item.font_size for item in texts)

    if largest_size >= self.config.HEADLINE_MIN_SIZE:
        if largest_size >= mean_size * self.config.HEADLINE_FONT_RATIO:
            return LayoutBlockType.ARTICLE

    if self._is_inside_box(block.bbox):
        # Boxed text: short means advertisement, longer means sidebar.
        if sum(item.text_length for item in texts) < 200:
            return LayoutBlockType.ADVERTISEMENT
        return LayoutBlockType.SIDEBAR

    return LayoutBlockType.ARTICLE
911
+
912
+ def _is_caption_of_image(self, text_elem: ContentElement, img_elem: ContentElement) -> bool:
913
+ """Check if text is a caption for the image."""
914
+ # Directly below image
915
+ if (text_elem.bbox[1] > img_elem.bbox[3] - 5 and
916
+ text_elem.bbox[1] < img_elem.bbox[3] + self.config.CAPTION_MAX_DISTANCE):
917
+ # Similar X range
918
+ if (text_elem.bbox[0] >= img_elem.bbox[0] - 20 and
919
+ text_elem.bbox[2] <= img_elem.bbox[2] + 20):
920
+ # Height within caption range
921
+ if text_elem.bbox[3] - text_elem.bbox[1] <= self.config.CAPTION_MAX_HEIGHT:
922
+ return True
923
+
924
+ # Also possible directly above image
925
+ if (text_elem.bbox[3] < img_elem.bbox[1] + 5 and
926
+ text_elem.bbox[3] > img_elem.bbox[1] - self.config.CAPTION_MAX_DISTANCE):
927
+ if (text_elem.bbox[0] >= img_elem.bbox[0] - 20 and
928
+ text_elem.bbox[2] <= img_elem.bbox[2] + 20):
929
+ if text_elem.bbox[3] - text_elem.bbox[1] <= self.config.CAPTION_MAX_HEIGHT:
930
+ return True
931
+
932
+ return False
933
+
934
+ def _is_inside_box(self, bbox: Tuple) -> bool:
935
+ """Check if block is inside a box."""
936
+ for box in self._boxes:
937
+ if self._is_inside(bbox, box, margin=10):
938
+ return True
939
+ return False
940
+
941
+ # ========================================================================
942
+ # Phase 6: Block optimization and sorting
943
+ # ========================================================================
944
+
945
+ def _optimize_and_sort_blocks(
946
+ self,
947
+ blocks: List[LayoutBlock],
948
+ columns: List[ColumnInfo]
949
+ ) -> List[LayoutBlock]:
950
+ """Block optimization and reading order sorting."""
951
+ if not blocks:
952
+ return []
953
+
954
+ # 1. Merge small blocks
955
+ blocks = self._merge_small_blocks(blocks)
956
+
957
+ # 2. Resolve overlaps
958
+ blocks = self._resolve_overlaps(blocks)
959
+
960
+ # 3. Determine reading order
961
+ # - Header first
962
+ # - Column order (left → right)
963
+ # - Top to bottom within column
964
+ # - Footer last
965
+
966
+ header_blocks = [b for b in blocks if b.block_type == LayoutBlockType.HEADER]
967
+ footer_blocks = [b for b in blocks if b.block_type == LayoutBlockType.FOOTER]
968
+ main_blocks = [b for b in blocks if b.block_type not in (LayoutBlockType.HEADER, LayoutBlockType.FOOTER)]
969
+
970
+ # Sort by column
971
+ column_groups = defaultdict(list)
972
+ for block in main_blocks:
973
+ column_groups[block.column_index].append(block)
974
+
975
+ # Sort by Y coordinate within each column
976
+ for col_idx in column_groups:
977
+ column_groups[col_idx].sort(key=lambda b: b.bbox[1])
978
+
979
+ # Final order: Header → (by column) → Footer
980
+ sorted_blocks = []
981
+ order = 0
982
+
983
+ for block in header_blocks:
984
+ block.reading_order = order
985
+ sorted_blocks.append(block)
986
+ order += 1
987
+
988
+ for col_idx in sorted(column_groups.keys()):
989
+ for block in column_groups[col_idx]:
990
+ block.reading_order = order
991
+ sorted_blocks.append(block)
992
+ order += 1
993
+
994
+ for block in footer_blocks:
995
+ block.reading_order = order
996
+ sorted_blocks.append(block)
997
+ order += 1
998
+
999
+ return sorted_blocks
1000
+
1001
+ def _merge_small_blocks(self, blocks: List[LayoutBlock]) -> List[LayoutBlock]:
1002
+ """Merge adjacent blocks that are too small."""
1003
+ if len(blocks) <= 1:
1004
+ return blocks
1005
+
1006
+ # Skip merge if block count is within target range
1007
+ if len(blocks) <= self.config.TARGET_MAX_BLOCKS:
1008
+ return blocks
1009
+
1010
+ result = []
1011
+ skip_ids = set()
1012
+
1013
+ # Aggressive merge if too many blocks
1014
+ aggressive_merge = len(blocks) > self.config.AGGRESSIVE_MERGE_THRESHOLD
1015
+
1016
+ for block in blocks:
1017
+ if block.block_id in skip_ids:
1018
+ continue
1019
+
1020
+ # Check if small block (raise threshold for aggressive merge)
1021
+ min_area = self.config.MIN_BLOCK_AREA
1022
+ if aggressive_merge:
1023
+ min_area = self.config.MIN_BLOCK_AREA * 2 # 2x threshold
1024
+
1025
+ if block.area >= min_area:
1026
+ result.append(block)
1027
+ continue
1028
+
1029
+ # Find adjacent block
1030
+ merged = False
1031
+ for other in blocks:
1032
+ if other.block_id == block.block_id or other.block_id in skip_ids:
1033
+ continue
1034
+
1035
+ if self._should_merge_blocks(block, other, aggressive=aggressive_merge):
1036
+ # Merge
1037
+ merged_bbox = self._merge_bboxes([block.bbox, other.bbox])
1038
+ other.bbox = merged_bbox
1039
+ other.elements.extend(block.elements)
1040
+ skip_ids.add(block.block_id)
1041
+ merged = True
1042
+ break
1043
+
1044
+ if not merged:
1045
+ result.append(block)
1046
+
1047
+ # Try additional merge if still above target
1048
+ if len(result) > self.config.TARGET_MAX_BLOCKS:
1049
+ result = self._force_merge_to_target(result)
1050
+
1051
+ return result
1052
+
1053
+ def _should_merge_blocks(self, b1: LayoutBlock, b2: LayoutBlock, aggressive: bool = False) -> bool:
1054
+ """Check if two blocks should be merged."""
1055
+ # Same column (allow adjacent columns for aggressive merge)
1056
+ if not aggressive and b1.column_index != b2.column_index:
1057
+ return False
1058
+ if aggressive and abs(b1.column_index - b2.column_index) > 1:
1059
+ return False
1060
+
1061
+ # Close distance
1062
+ vertical_gap = max(0, b2.bbox[1] - b1.bbox[3], b1.bbox[1] - b2.bbox[3])
1063
+ threshold = self.config.VERTICAL_MERGE_THRESHOLD * (3 if aggressive else 2)
1064
+ if vertical_gap > threshold:
1065
+ return False
1066
+
1067
+ return True
1068
+
1069
+ def _force_merge_to_target(self, blocks: List[LayoutBlock]) -> List[LayoutBlock]:
1070
+ """
1071
+ Force merge when block count exceeds target.
1072
+ Merges adjacent blocks within the same column.
1073
+ """
1074
+ if len(blocks) <= self.config.TARGET_MAX_BLOCKS:
1075
+ return blocks
1076
+
1077
+ # Group by column
1078
+ column_groups: Dict[int, List[LayoutBlock]] = defaultdict(list)
1079
+ for block in blocks:
1080
+ column_groups[block.column_index].append(block)
1081
+
1082
+ result = []
1083
+
1084
+ for col_idx in sorted(column_groups.keys()):
1085
+ col_blocks = sorted(column_groups[col_idx], key=lambda b: b.bbox[1])
1086
+
1087
+ # If 2+ blocks in column, merge is possible
1088
+ if len(col_blocks) >= 2:
1089
+ # Merge adjacent blocks
1090
+ merged_blocks = self._merge_adjacent_in_column(col_blocks)
1091
+ result.extend(merged_blocks)
1092
+ else:
1093
+ result.extend(col_blocks)
1094
+
1095
+ logger.debug(f"[LayoutBlockDetector] Force merged: {len(blocks)} → {len(result)} blocks")
1096
+ return result
1097
+
1098
+ def _merge_adjacent_in_column(self, col_blocks: List[LayoutBlock]) -> List[LayoutBlock]:
1099
+ """
1100
+ Merge adjacent blocks within a column.
1101
+ Reduces to at most 2-3 blocks.
1102
+ """
1103
+ if len(col_blocks) <= 2:
1104
+ return col_blocks
1105
+
1106
+ # Divide blocks into 2-3 groups
1107
+ target_groups = max(2, min(3, len(col_blocks) // 2))
1108
+ blocks_per_group = max(1, len(col_blocks) // target_groups)
1109
+
1110
+ result = []
1111
+ current_group = []
1112
+
1113
+ for i, block in enumerate(col_blocks):
1114
+ current_group.append(block)
1115
+
1116
+ # When group is filled, merge
1117
+ if len(current_group) >= blocks_per_group and len(result) < target_groups - 1:
1118
+ merged = self._merge_block_group(current_group)
1119
+ result.append(merged)
1120
+ current_group = []
1121
+
1122
+ # Merge remaining blocks
1123
+ if current_group:
1124
+ merged = self._merge_block_group(current_group)
1125
+ result.append(merged)
1126
+
1127
+ return result
1128
+
1129
+ def _merge_block_group(self, blocks: List[LayoutBlock]) -> LayoutBlock:
1130
+ """Merge a group of blocks into one."""
1131
+ if len(blocks) == 1:
1132
+ return blocks[0]
1133
+
1134
+ merged_bbox = self._merge_bboxes([b.bbox for b in blocks])
1135
+ merged_elements = []
1136
+ for b in blocks:
1137
+ merged_elements.extend(b.elements)
1138
+
1139
+ return LayoutBlock(
1140
+ block_id=blocks[0].block_id,
1141
+ block_type=blocks[0].block_type,
1142
+ bbox=merged_bbox,
1143
+ elements=merged_elements,
1144
+ column_index=blocks[0].column_index,
1145
+ reading_order=blocks[0].reading_order,
1146
+ confidence=min(b.confidence for b in blocks)
1147
+ )
1148
+
1149
+ def _resolve_overlaps(self, blocks: List[LayoutBlock]) -> List[LayoutBlock]:
1150
+ """Resolve block overlaps."""
1151
+ # Currently simply returns (can be improved later)
1152
+ return blocks
1153
+
1154
+ # ========================================================================
1155
+ # Helper Methods
1156
+ # ========================================================================
1157
+
1158
+ def _get_text_dict(self) -> Dict:
1159
+ """Cached text dictionary."""
1160
+ if self._text_dict is None:
1161
+ self._text_dict = self.page.get_text("dict", sort=True)
1162
+ return self._text_dict
1163
+
1164
+ def _get_drawings(self) -> List:
1165
+ """Cached drawings."""
1166
+ if self._drawings is None:
1167
+ self._drawings = self.page.get_drawings()
1168
+ return self._drawings
1169
+
1170
+ def _get_images(self) -> List:
1171
+ """Cached images."""
1172
+ if self._images is None:
1173
+ self._images = self.page.get_images()
1174
+ return self._images
1175
+
1176
+ def _find_image_position(self, xref: int) -> Optional[Tuple[float, float, float, float]]:
1177
+ """Find image position."""
1178
+ try:
1179
+ for img in self.page.get_image_rects(xref):
1180
+ return (img.x0, img.y0, img.x1, img.y1)
1181
+ except Exception:
1182
+ pass
1183
+ return None
1184
+
1185
+ def _is_inside(
1186
+ self,
1187
+ inner: Tuple[float, float, float, float],
1188
+ outer: Tuple[float, float, float, float],
1189
+ margin: float = 0
1190
+ ) -> bool:
1191
+ """Check if inner is inside outer."""
1192
+ return (
1193
+ inner[0] >= outer[0] - margin and
1194
+ inner[1] >= outer[1] - margin and
1195
+ inner[2] <= outer[2] + margin and
1196
+ inner[3] <= outer[3] + margin
1197
+ )
1198
+
1199
+ def _merge_bboxes(self, bboxes: List[Tuple]) -> Tuple[float, float, float, float]:
1200
+ """Merge multiple bboxes."""
1201
+ if not bboxes:
1202
+ return (0, 0, 0, 0)
1203
+
1204
+ x0 = min(b[0] for b in bboxes)
1205
+ y0 = min(b[1] for b in bboxes)
1206
+ x1 = max(b[2] for b in bboxes)
1207
+ y1 = max(b[3] for b in bboxes)
1208
+
1209
+ return (x0, y0, x1, y1)
1210
+
1211
+ def _calculate_confidence(self, blocks: List[LayoutBlock], columns: List[ColumnInfo]) -> float:
1212
+ """Calculate analysis confidence."""
1213
+ if not blocks:
1214
+ return 0.5
1215
+
1216
+ # Ratio of blocks to total elements
1217
+ total_elements = len(self._elements)
1218
+ if total_elements == 0:
1219
+ return 0.5
1220
+
1221
+ covered_elements = sum(len(b.elements) for b in blocks)
1222
+ coverage = covered_elements / total_elements
1223
+
1224
+ # Ratio of UNKNOWN blocks
1225
+ unknown_ratio = sum(1 for b in blocks if b.block_type == LayoutBlockType.UNKNOWN) / max(1, len(blocks))
1226
+
1227
+ confidence = coverage * (1 - unknown_ratio * 0.3)
1228
+
1229
+ return min(1.0, max(0.0, confidence))
1230
+
1231
+
1232
+ # ============================================================================
1233
+ # Export
1234
+ # ============================================================================
1235
+
1236
# Public API of this module.
__all__ = [
    # Data types
    'LayoutBlockType',
    'ContentElement',
    'LayoutBlock',
    'ColumnInfo',
    'LayoutAnalysisResult',
    # Configuration and detector
    'LayoutDetectorConfig',
    'LayoutBlockDetector',
]