xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,750 @@
1
+ """
2
+ Table Quality Analyzer for PDF Handler
3
+
4
+ Analyzes table quality to determine whether text extraction is feasible.
5
+
6
+ =============================================================================
7
+ Core Concepts:
8
+ =============================================================================
9
+ Processing all tables as images is inefficient.
10
+ Normal tables (with complete borders and regular grids) should be extracted as text.
11
+
12
+ Evaluation Criteria:
13
+ 1. Border Completeness - Is the table fully enclosed on all sides?
14
+ 2. Grid Regularity - Is it composed of orthogonal horizontal/vertical lines?
15
+ 3. Cell Structure - Are cells in regular rectangular shapes?
16
+ 4. Absence of Complex Elements - No curves, diagonals, or complex graphics?
17
+
18
+ =============================================================================
19
+ Table Quality Grades:
20
+ =============================================================================
21
+ - EXCELLENT: Perfect table → Must use text extraction
22
+ - GOOD: Good table → Text extraction recommended
23
+ - MODERATE: Table with minor issues → Attempt text extraction, use image if it fails
24
+ - POOR: Table with major issues → Image conversion recommended
25
+ - UNPROCESSABLE: Cannot process → Must use image conversion
26
+ """
27
+
28
+ import logging
29
+ from dataclasses import dataclass, field
30
+ from typing import List, Dict, Optional, Tuple, Set, Any
31
+ from enum import Enum, auto
32
+
33
+ import fitz
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ # ============================================================================
39
+ # Types and Enums
40
+ # ============================================================================
41
+
42
class TableQuality(Enum):
    """Quality grade assigned to an analyzed table region (best grade first)."""

    # Perfect table: text extraction must be used.
    EXCELLENT = auto()
    # Good table: text extraction is recommended.
    GOOD = auto()
    # Medium quality: try text extraction and evaluate the outcome.
    MODERATE = auto()
    # Table has issues: image conversion is recommended.
    POOR = auto()
    # Table cannot be processed: image conversion must be used.
    UNPROCESSABLE = auto()
49
+
50
+
51
class BlockProcessability(Enum):
    """How a page block can be processed."""

    # Plain text extraction is possible.
    TEXT_EXTRACTABLE = auto()
    # Table extraction is possible.
    TABLE_EXTRACTABLE = auto()
    # The block has to go through OCR.
    NEEDS_OCR = auto()
    # The block has to be converted to an image.
    IMAGE_REQUIRED = auto()
57
+
58
+
59
@dataclass
class TableQualityResult:
    """Outcome of the quality analysis of a single table region."""

    # Bounding box of the analyzed region.
    bbox: Tuple[float, float, float, float]
    # Overall quality grade.
    quality: TableQuality
    # Overall score, 0.0 ~ 1.0 (higher is better).
    score: float

    # --- Detailed sub-scores (each defaults to 1.0) ---
    # Border completeness.
    border_completeness: float = 1.0
    # Grid regularity.
    grid_regularity: float = 1.0
    # Cell structure quality.
    cell_structure: float = 1.0
    # Absence of complex elements.
    no_complex_elements: float = 1.0

    # --- Recommended action ---
    recommended_action: BlockProcessability = BlockProcessability.TABLE_EXTRACTABLE

    # --- Issues found during the analysis (a fresh list per instance) ---
    issues: List[str] = field(default_factory=list)
77
+
78
+
79
+ # ============================================================================
80
+ # Configuration
81
+ # ============================================================================
82
+
83
@dataclass
class TableQualityConfig:
    """Tunable thresholds used by the table quality analysis."""

    # ----- Border completeness -----
    # Minimum sides for a complete table.
    BORDER_REQUIRED_SIDES: int = 4
    # Border alignment tolerance (pt).
    BORDER_TOLERANCE: float = 5.0

    # ----- Grid regularity -----
    # Horizontal/vertical angle tolerance, described as degrees.
    # NOTE(review): the analyzer compares this value against raw coordinate
    # differences rather than an angle - confirm the intended unit.
    LINE_ANGLE_TOLERANCE: float = 2.0
    # Grid alignment tolerance (pt).
    GRID_ALIGNMENT_TOLERANCE: float = 3.0
    # Minimum orthogonal line ratio (90%+ for a normal table).
    MIN_ORTHOGONAL_RATIO: float = 0.9

    # ----- Cell structure -----
    # Minimum cell size (pt).
    MIN_CELL_SIZE: float = 10.0
    # Maximum cell aspect ratio.
    MAX_CELL_ASPECT_RATIO: float = 20.0

    # ----- Complex elements -----
    # Curve ratio threshold (5% or less).
    MAX_CURVE_RATIO: float = 0.05
    # Diagonal line ratio threshold.
    MAX_DIAGONAL_RATIO: float = 0.05

    # ----- Quality grade thresholds (minimum total score per grade) -----
    # EXCELLENT threshold.
    QUALITY_EXCELLENT: float = 0.95
    # GOOD threshold.
    QUALITY_GOOD: float = 0.85
    # MODERATE threshold.
    QUALITY_MODERATE: float = 0.65
    # POOR threshold (anything below is UNPROCESSABLE).
    QUALITY_POOR: float = 0.40
108
+
109
+
110
+ # ============================================================================
111
+ # Table Quality Analyzer
112
+ # ============================================================================
113
+
114
+ class TableQualityAnalyzer:
115
+ """
116
+ Table Quality Analyzer
117
+
118
+ Analyzes table regions to determine whether text extraction is feasible.
119
+ """
120
+
121
def __init__(
    self,
    page,
    page_num: int = 0,
    config: Optional[TableQualityConfig] = None
):
    """
    Bind the analyzer to one page.

    Args:
        page: PyMuPDF page object
        page_num: Page number (0-indexed), default 0
        config: Analysis configuration; a default TableQualityConfig is
            created when none (or a falsy value) is given
    """
    self.page = page
    self.page_num = page_num
    self.config = config if config else TableQualityConfig()

    # Page dimensions are taken from the page rectangle.
    page_rect = page.rect
    self.page_width = page_rect.width
    self.page_height = page_rect.height

    # Lazily filled caches for the page drawings and the text dictionary.
    self._drawings = None
    self._text_dict = None
143
+
144
def analyze_table(
    self,
    bbox: Tuple[float, float, float, float]
) -> TableQualityResult:
    """
    Grade a table region.

    Four sub-scores are computed from the drawings overlapping ``bbox`` and
    combined into a weighted total (border 30%, grid 30%, cell structure 20%,
    element simplicity 20%). The total is then mapped onto a quality grade
    and a recommended action using the thresholds in ``self.config``.

    Args:
        bbox: Table region bounding box

    Returns:
        TableQualityResult carrying the grade, the total score, the four
        sub-scores, the recommended action and all collected issues
    """
    region_drawings = self._get_region_drawings(bbox)

    # Run the four analyses in a fixed order; issues keep that order.
    border_score, border_issues = self._analyze_border_completeness(bbox, region_drawings)
    grid_score, grid_issues = self._analyze_grid_regularity(bbox, region_drawings)
    cell_score, cell_issues = self._analyze_cell_structure(bbox, region_drawings)
    simple_score, simple_issues = self._analyze_element_simplicity(bbox, region_drawings)
    issues = [*border_issues, *grid_issues, *cell_issues, *simple_issues]

    # Weighted average of the sub-scores.
    total_score = (
        border_score * 0.30 +
        grid_score * 0.30 +
        cell_score * 0.20 +
        simple_score * 0.20
    )

    # Grade ladder, best grade first: (minimum score, grade, action).
    cfg = self.config
    grade_ladder = (
        (cfg.QUALITY_EXCELLENT, TableQuality.EXCELLENT, BlockProcessability.TABLE_EXTRACTABLE),
        (cfg.QUALITY_GOOD, TableQuality.GOOD, BlockProcessability.TABLE_EXTRACTABLE),
        (cfg.QUALITY_MODERATE, TableQuality.MODERATE, BlockProcessability.TABLE_EXTRACTABLE),
        (cfg.QUALITY_POOR, TableQuality.POOR, BlockProcessability.IMAGE_REQUIRED),
    )
    # Anything below the POOR threshold falls through to UNPROCESSABLE.
    quality = TableQuality.UNPROCESSABLE
    action = BlockProcessability.IMAGE_REQUIRED
    for minimum, grade, grade_action in grade_ladder:
        if total_score >= minimum:
            quality, action = grade, grade_action
            break

    logger.debug(f"[TableQualityAnalyzer] Table at {bbox}: "
                 f"quality={quality.name}, score={total_score:.2f}, "
                 f"border={border_score:.2f}, grid={grid_score:.2f}, "
                 f"cell={cell_score:.2f}, simple={simple_score:.2f}")

    return TableQualityResult(
        bbox=bbox,
        quality=quality,
        score=total_score,
        border_completeness=border_score,
        grid_regularity=grid_score,
        cell_structure=cell_score,
        no_complex_elements=simple_score,
        recommended_action=action,
        issues=issues
    )
218
+
219
+ def _get_region_drawings(
220
+ self,
221
+ bbox: Tuple[float, float, float, float]
222
+ ) -> List[Dict]:
223
+ """Extract drawings within the region"""
224
+ if self._drawings is None:
225
+ self._drawings = self.page.get_drawings()
226
+
227
+ result = []
228
+ for d in self._drawings:
229
+ rect = d.get("rect")
230
+ if rect and self._bbox_overlaps(bbox, (rect.x0, rect.y0, rect.x1, rect.y1)):
231
+ result.append(d)
232
+ return result
233
+
234
+ def _get_drawings_cached(self) -> List[Dict]:
235
+ """Return cached drawings for the entire page"""
236
+ if self._drawings is None:
237
+ self._drawings = self.page.get_drawings()
238
+ return self._drawings
239
+
240
+ def _get_region_text_blocks(
241
+ self,
242
+ bbox: Tuple[float, float, float, float]
243
+ ) -> List[Dict]:
244
+ """Extract text blocks within the region"""
245
+ if self._text_dict is None:
246
+ self._text_dict = self.page.get_text("dict", sort=True)
247
+
248
+ result = []
249
+ for block in self._text_dict.get("blocks", []):
250
+ if block.get("type") != 0:
251
+ continue
252
+ block_bbox = block.get("bbox", (0, 0, 0, 0))
253
+ if self._bbox_overlaps(bbox, block_bbox):
254
+ result.append(block)
255
+ return result
256
+
257
+ def _analyze_as_table(
258
+ self,
259
+ bbox: Tuple[float, float, float, float],
260
+ drawings: List[Dict]
261
+ ) -> Tuple[bool, Optional[TableQualityResult]]:
262
+ """Analyze if the region is a table"""
263
+ # Extract lines
264
+ lines = self._extract_lines(drawings)
265
+
266
+ # Minimum lines required for a table
267
+ if len(lines) < 4: # At least 4 lines (rectangle)
268
+ return False, None
269
+
270
+ # Separate horizontal and vertical lines
271
+ h_lines = [l for l in lines if l['is_horizontal']]
272
+ v_lines = [l for l in lines if l['is_vertical']]
273
+
274
+ # Both horizontal and vertical lines must exist for a table
275
+ if len(h_lines) < 2 or len(v_lines) < 2:
276
+ return False, None
277
+
278
+ # If identified as table, analyze quality
279
+ quality = self.analyze_table(bbox)
280
+ return True, quality
281
+
282
+ def _analyze_border_completeness(
283
+ self,
284
+ bbox: Tuple[float, float, float, float],
285
+ drawings: List[Dict]
286
+ ) -> Tuple[float, List[str]]:
287
+ """Analyze border completeness"""
288
+ issues = []
289
+ lines = self._extract_lines(drawings)
290
+
291
+ if not lines:
292
+ issues.append("No border lines detected")
293
+ return 0.0, issues
294
+
295
+ # Border detection
296
+ tolerance = self.config.BORDER_TOLERANCE
297
+ x0, y0, x1, y1 = bbox
298
+
299
+ has_top = False
300
+ has_bottom = False
301
+ has_left = False
302
+ has_right = False
303
+
304
+ for line in lines:
305
+ if line['is_horizontal']:
306
+ # Top border
307
+ if abs(line['y1'] - y0) <= tolerance and line['x1'] >= x0 and line['x2'] <= x1:
308
+ has_top = True
309
+ # Bottom border
310
+ elif abs(line['y1'] - y1) <= tolerance and line['x1'] >= x0 and line['x2'] <= x1:
311
+ has_bottom = True
312
+
313
+ if line['is_vertical']:
314
+ # Left border
315
+ if abs(line['x1'] - x0) <= tolerance and line['y1'] >= y0 and line['y2'] <= y1:
316
+ has_left = True
317
+ # Right border
318
+ elif abs(line['x1'] - x1) <= tolerance and line['y1'] >= y0 and line['y2'] <= y1:
319
+ has_right = True
320
+
321
+ sides = [has_top, has_bottom, has_left, has_right]
322
+ complete_sides = sum(sides)
323
+
324
+ if complete_sides < 4:
325
+ missing = []
326
+ if not has_top: missing.append("top")
327
+ if not has_bottom: missing.append("bottom")
328
+ if not has_left: missing.append("left")
329
+ if not has_right: missing.append("right")
330
+ issues.append(f"Missing borders: {', '.join(missing)}")
331
+
332
+ return complete_sides / 4.0, issues
333
+
334
+ def _analyze_grid_regularity(
335
+ self,
336
+ bbox: Tuple[float, float, float, float],
337
+ drawings: List[Dict]
338
+ ) -> Tuple[float, List[str]]:
339
+ """Analyze grid regularity"""
340
+ issues = []
341
+ lines = self._extract_lines(drawings)
342
+
343
+ if not lines:
344
+ return 0.0, ["No grid lines"]
345
+
346
+ # Calculate orthogonal line ratio
347
+ orthogonal_count = sum(1 for l in lines if l['is_horizontal'] or l['is_vertical'])
348
+ total_lines = len(lines)
349
+
350
+ orthogonal_ratio = orthogonal_count / total_lines if total_lines > 0 else 0
351
+
352
+ if orthogonal_ratio < self.config.MIN_ORTHOGONAL_RATIO:
353
+ issues.append(f"Non-orthogonal lines: {(1-orthogonal_ratio)*100:.1f}%")
354
+
355
+ # Analyze line alignment
356
+ h_lines = [l for l in lines if l['is_horizontal']]
357
+ v_lines = [l for l in lines if l['is_vertical']]
358
+
359
+ # Check Y-coordinate alignment of horizontal lines
360
+ h_alignment = self._check_line_alignment([l['y1'] for l in h_lines])
361
+ # Check X-coordinate alignment of vertical lines
362
+ v_alignment = self._check_line_alignment([l['x1'] for l in v_lines])
363
+
364
+ alignment_score = (h_alignment + v_alignment) / 2
365
+
366
+ if alignment_score < 0.8:
367
+ issues.append("Misaligned grid lines")
368
+
369
+ return (orthogonal_ratio * 0.6 + alignment_score * 0.4), issues
370
+
371
+ def _analyze_cell_structure(
372
+ self,
373
+ bbox: Tuple[float, float, float, float],
374
+ drawings: List[Dict]
375
+ ) -> Tuple[float, List[str]]:
376
+ """Analyze cell structure"""
377
+ issues = []
378
+ lines = self._extract_lines(drawings)
379
+
380
+ h_lines = sorted([l for l in lines if l['is_horizontal']], key=lambda l: l['y1'])
381
+ v_lines = sorted([l for l in lines if l['is_vertical']], key=lambda l: l['x1'])
382
+
383
+ if len(h_lines) < 2 or len(v_lines) < 2:
384
+ issues.append("Insufficient lines for cell structure")
385
+ return 0.5, issues
386
+
387
+ # Analyze cell sizes
388
+ cell_heights = []
389
+ for i in range(len(h_lines) - 1):
390
+ height = h_lines[i+1]['y1'] - h_lines[i]['y1']
391
+ if height > 0:
392
+ cell_heights.append(height)
393
+
394
+ cell_widths = []
395
+ for i in range(len(v_lines) - 1):
396
+ width = v_lines[i+1]['x1'] - v_lines[i]['x1']
397
+ if width > 0:
398
+ cell_widths.append(width)
399
+
400
+ # Check for cells that are too small
401
+ tiny_cells = 0
402
+ for h in cell_heights:
403
+ if h < self.config.MIN_CELL_SIZE:
404
+ tiny_cells += 1
405
+ for w in cell_widths:
406
+ if w < self.config.MIN_CELL_SIZE:
407
+ tiny_cells += 1
408
+
409
+ total_cells = len(cell_heights) + len(cell_widths)
410
+ if total_cells > 0 and tiny_cells / total_cells > 0.1:
411
+ issues.append("Too many tiny cells")
412
+
413
+ # Check for extreme aspect ratios
414
+ extreme_ratio_count = 0
415
+ for h in cell_heights:
416
+ for w in cell_widths:
417
+ if h > 0 and w > 0:
418
+ ratio = max(h/w, w/h)
419
+ if ratio > self.config.MAX_CELL_ASPECT_RATIO:
420
+ extreme_ratio_count += 1
421
+
422
+ if extreme_ratio_count > 0:
423
+ issues.append("Extreme cell aspect ratios")
424
+
425
+ # Calculate score
426
+ score = 1.0
427
+ if tiny_cells > 0:
428
+ score -= 0.2
429
+ if extreme_ratio_count > 0:
430
+ score -= 0.2
431
+
432
+ return max(0.0, score), issues
433
+
434
+ def _analyze_element_simplicity(
435
+ self,
436
+ bbox: Tuple[float, float, float, float],
437
+ drawings: List[Dict]
438
+ ) -> Tuple[float, List[str]]:
439
+ """Analyze element simplicity (absence of curves, diagonals, and other complex elements)"""
440
+ issues = []
441
+
442
+ if not drawings:
443
+ return 1.0, issues
444
+
445
+ curve_count = 0
446
+ diagonal_count = 0
447
+ fill_count = 0
448
+ total_items = 0
449
+
450
+ for d in drawings:
451
+ items = d.get("items", [])
452
+ total_items += len(items)
453
+
454
+ for item in items:
455
+ item_type = item[0]
456
+ if item_type == 'c': # curve
457
+ curve_count += 1
458
+ elif item_type == 'l': # line
459
+ # Check for diagonal
460
+ p1, p2 = item[1], item[2]
461
+ if not self._is_orthogonal_line(p1, p2):
462
+ diagonal_count += 1
463
+
464
+ if d.get("fill"):
465
+ fill_count += 1
466
+
467
+ # Calculate ratios
468
+ curve_ratio = curve_count / max(1, total_items)
469
+ diagonal_ratio = diagonal_count / max(1, total_items)
470
+ fill_ratio = fill_count / max(1, len(drawings))
471
+
472
+ # Detect issues
473
+ if curve_ratio > self.config.MAX_CURVE_RATIO:
474
+ issues.append(f"Too many curves: {curve_ratio*100:.1f}%")
475
+
476
+ if diagonal_ratio > self.config.MAX_DIAGONAL_RATIO:
477
+ issues.append(f"Too many diagonals: {diagonal_ratio*100:.1f}%")
478
+
479
+ if fill_ratio > 0.5:
480
+ issues.append("Heavy fill patterns")
481
+
482
+ # Calculate score
483
+ score = 1.0
484
+ score -= min(0.3, curve_ratio * 3)
485
+ score -= min(0.3, diagonal_ratio * 3)
486
+ score -= min(0.2, fill_ratio * 0.4)
487
+
488
+ return max(0.0, score), issues
489
+
490
+ def _extract_lines(self, drawings: List[Dict]) -> List[Dict]:
491
+ """Extract lines from drawings"""
492
+ lines = []
493
+
494
+ for d in drawings:
495
+ for item in d.get("items", []):
496
+ if item[0] == 'l': # straight line
497
+ p1, p2 = item[1], item[2]
498
+ x1, y1 = p1.x, p1.y
499
+ x2, y2 = p2.x, p2.y
500
+
501
+ # Determine horizontal/vertical
502
+ angle_tolerance = self.config.LINE_ANGLE_TOLERANCE
503
+ is_horizontal = abs(y2 - y1) <= angle_tolerance
504
+ is_vertical = abs(x2 - x1) <= angle_tolerance
505
+
506
+ lines.append({
507
+ 'x1': min(x1, x2),
508
+ 'y1': min(y1, y2),
509
+ 'x2': max(x1, x2),
510
+ 'y2': max(y1, y2),
511
+ 'is_horizontal': is_horizontal,
512
+ 'is_vertical': is_vertical,
513
+ 'length': ((x2-x1)**2 + (y2-y1)**2) ** 0.5
514
+ })
515
+ elif item[0] == 're': # rectangle
516
+ rect = item[1]
517
+ x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
518
+
519
+ # Add rectangle's 4 sides as lines
520
+ lines.extend([
521
+ {'x1': x0, 'y1': y0, 'x2': x1, 'y2': y0, 'is_horizontal': True, 'is_vertical': False, 'length': x1-x0}, # top
522
+ {'x1': x0, 'y1': y1, 'x2': x1, 'y2': y1, 'is_horizontal': True, 'is_vertical': False, 'length': x1-x0}, # bottom
523
+ {'x1': x0, 'y1': y0, 'x2': x0, 'y2': y1, 'is_horizontal': False, 'is_vertical': True, 'length': y1-y0}, # left
524
+ {'x1': x1, 'y1': y0, 'x2': x1, 'y2': y1, 'is_horizontal': False, 'is_vertical': True, 'length': y1-y0}, # right
525
+ ])
526
+
527
+ return lines
528
+
529
+ def _is_orthogonal_line(self, p1, p2) -> bool:
530
+ """Check if the line is horizontal or vertical"""
531
+ tolerance = self.config.LINE_ANGLE_TOLERANCE
532
+ return abs(p2.x - p1.x) <= tolerance or abs(p2.y - p1.y) <= tolerance
533
+
534
+ def _check_line_alignment(self, positions: List[float]) -> float:
535
+ """Check line alignment quality"""
536
+ if len(positions) < 2:
537
+ return 1.0
538
+
539
+ # Clustering
540
+ tolerance = self.config.GRID_ALIGNMENT_TOLERANCE
541
+ sorted_pos = sorted(positions)
542
+
543
+ clusters = []
544
+ current_cluster = [sorted_pos[0]]
545
+
546
+ for pos in sorted_pos[1:]:
547
+ if pos - current_cluster[-1] <= tolerance:
548
+ current_cluster.append(pos)
549
+ else:
550
+ clusters.append(current_cluster)
551
+ current_cluster = [pos]
552
+ clusters.append(current_cluster)
553
+
554
+ # Ratio of well-aligned lines
555
+ well_aligned = sum(len(c) for c in clusters if len(c) > 1)
556
+ return well_aligned / len(positions) if positions else 1.0
557
+
558
+ def _analyze_text_quality(self, text_blocks: List[Dict]) -> float:
559
+ """Analyze text quality"""
560
+ if not text_blocks:
561
+ return 0.0
562
+
563
+ total_chars = 0
564
+ bad_chars = 0
565
+
566
+ for block in text_blocks:
567
+ for line in block.get("lines", []):
568
+ for span in line.get("spans", []):
569
+ text = span.get("text", "")
570
+ total_chars += len(text)
571
+
572
+ for char in text:
573
+ code = ord(char)
574
+ if 0xE000 <= code <= 0xF8FF: # PUA
575
+ bad_chars += 1
576
+
577
+ if total_chars == 0:
578
+ return 0.0
579
+
580
+ return 1.0 - (bad_chars / total_chars)
581
+
582
+ def _bbox_overlaps(self, bbox1: Tuple, bbox2: Tuple) -> bool:
583
+ """Check if two bounding boxes overlap"""
584
+ return not (
585
+ bbox1[2] <= bbox2[0] or
586
+ bbox1[0] >= bbox2[2] or
587
+ bbox1[3] <= bbox2[1] or
588
+ bbox1[1] >= bbox2[3]
589
+ )
590
+
591
def analyze_page_tables(self) -> Dict[str, Any]:
    """
    Analyzes all table candidate regions on the page.

    Line ("l") and rectangle ("re") drawing items are turned into
    horizontal and vertical segments, grouped into candidate regions by
    ``_find_table_regions``, and each candidate is scored by
    ``analyze_table``.

    Drawing items are accepted in two layouts:
        - flat scalars: ``("l", x0, y0, x1, y1)`` / ``("re", x, y, w, h)``
        - point/rect based: ``("l", p1, p2)`` / ``("re", rect, ...)``, where
          points are indexable as ``(x, y)`` and rects as
          ``(x0, y0, x1, y1)``.

    NOTE(review): ``_get_drawings_cached`` is not visible from here. If it
    returns raw PyMuPDF ``page.get_drawings()`` output, items use the
    point/rect layout and indexing ``item[3]`` / ``item[4]`` directly would
    raise IndexError -- confirm which layout the cache actually yields.

    Returns:
        Dict containing:
        - table_candidates: List of table candidates (each with quality info)
        - has_processable_tables: Whether processable tables exist
        - summary: Analysis summary
    """
    # Search for table candidate regions from drawings
    drawings = self._get_drawings_cached()

    # Extract lines as (x0, y0, x1, y1) tuples
    h_lines = []
    v_lines = []

    for d in drawings:
        for item in d.get("items", []):
            if not item:
                continue
            cmd = item[0]

            if cmd == "l":  # line
                if len(item) >= 5:
                    x0, y0, x1, y1 = item[1], item[2], item[3], item[4]
                elif len(item) >= 3:
                    p1, p2 = item[1], item[2]
                    x0, y0, x1, y1 = p1[0], p1[1], p2[0], p2[1]
                else:
                    continue  # malformed item: not enough data for a segment

                if abs(y1 - y0) < 3:  # horizontal line
                    h_lines.append((min(x0, x1), y0, max(x0, x1), y1))
                elif abs(x1 - x0) < 3:  # vertical line
                    v_lines.append((x0, min(y0, y1), x1, max(y0, y1)))

            elif cmd == "re":  # rect
                if len(item) >= 5:
                    x, y, w, h = item[1], item[2], item[3], item[4]
                elif len(item) >= 2:
                    rect = item[1]
                    # Normalise so width/height are non-negative.
                    x, y = min(rect[0], rect[2]), min(rect[1], rect[3])
                    w, h = abs(rect[2] - rect[0]), abs(rect[3] - rect[1])
                else:
                    continue  # malformed item: no geometry

                # Ignore small rectangles (e.g. bullets, thin rules)
                if w > 20 and h > 10:
                    # Add rectangle's four sides as lines
                    h_lines.append((x, y, x + w, y))          # top
                    h_lines.append((x, y + h, x + w, y + h))  # bottom
                    v_lines.append((x, y, x, y + h))          # left
                    v_lines.append((x + w, y, x + w, y + h))  # right

    # Find table candidate regions (areas with dense lines)
    table_candidates = self._find_table_regions(h_lines, v_lines)

    results = []
    for bbox in table_candidates:
        quality_result = self.analyze_table(bbox)
        results.append({
            'bbox': bbox,
            'quality': quality_result.quality,
            'score': quality_result.score,
            'is_processable': quality_result.recommended_action == BlockProcessability.TABLE_EXTRACTABLE,
            'issues': quality_result.issues
        })

    has_processable = any(r['is_processable'] for r in results)

    summary = {
        'total_candidates': len(results),
        'processable': sum(1 for r in results if r['is_processable']),
        'unprocessable': sum(1 for r in results if not r['is_processable']),
    }

    logger.info(f"[TableQualityAnalyzer] Page {self.page_num + 1}: "
                f"Found {summary['total_candidates']} table candidates, "
                f"{summary['processable']} processable")

    return {
        'table_candidates': results,
        'has_processable_tables': has_processable,
        'summary': summary
    }
661
+
662
+ def _find_table_regions(
663
+ self,
664
+ h_lines: List[Tuple],
665
+ v_lines: List[Tuple]
666
+ ) -> List[Tuple[float, float, float, float]]:
667
+ """
668
+ Search for table candidates in regions where horizontal and vertical lines intersect
669
+ """
670
+ if not h_lines or not v_lines:
671
+ return []
672
+
673
+ # Calculate bounding box of all lines
674
+ all_lines = h_lines + v_lines
675
+ if not all_lines:
676
+ return []
677
+
678
+ # Find table regions by clustering lines
679
+ clusters = []
680
+ used = set()
681
+
682
+ # Simplified approach: group lines that intersect or are close to each other
683
+ tolerance = 50 # pixels
684
+
685
+ for i, line1 in enumerate(all_lines):
686
+ if i in used:
687
+ continue
688
+
689
+ cluster = [line1]
690
+ used.add(i)
691
+
692
+ for j, line2 in enumerate(all_lines):
693
+ if j in used:
694
+ continue
695
+
696
+ # If two lines are close, put them in the same cluster
697
+ if self._lines_are_close(line1, line2, tolerance):
698
+ cluster.append(line2)
699
+ used.add(j)
700
+
701
+ if len(cluster) >= 4: # At least 4 lines required for a table candidate
702
+ clusters.append(cluster)
703
+
704
+ # Convert clusters to bounding boxes
705
+ table_regions = []
706
+ for cluster in clusters:
707
+ x0 = min(min(l[0], l[2]) for l in cluster)
708
+ y0 = min(min(l[1], l[3]) for l in cluster)
709
+ x1 = max(max(l[0], l[2]) for l in cluster)
710
+ y1 = max(max(l[1], l[3]) for l in cluster)
711
+
712
+ # Check minimum size
713
+ if (x1 - x0) > 100 and (y1 - y0) > 50:
714
+ table_regions.append((x0, y0, x1, y1))
715
+
716
+ return table_regions
717
+
718
+ def _lines_are_close(
719
+ self,
720
+ line1: Tuple,
721
+ line2: Tuple,
722
+ tolerance: float
723
+ ) -> bool:
724
+ """Check if two lines are close to each other"""
725
+ # Check distance between endpoints of line1 and line2
726
+ x1_min, y1_min = min(line1[0], line1[2]), min(line1[1], line1[3])
727
+ x1_max, y1_max = max(line1[0], line1[2]), max(line1[1], line1[3])
728
+ x2_min, y2_min = min(line2[0], line2[2]), min(line2[1], line2[3])
729
+ x2_max, y2_max = max(line2[0], line2[2]), max(line2[1], line2[3])
730
+
731
+ # True if bounding boxes of the two lines overlap or are close
732
+ return not (
733
+ x1_max + tolerance < x2_min or
734
+ x2_max + tolerance < x1_min or
735
+ y1_max + tolerance < y2_min or
736
+ y2_max + tolerance < y1_min
737
+ )
738
+
739
+
740
+ # ============================================================================
741
+ # Export
742
+ # ============================================================================
743
+
744
# Names exported by ``from <module> import *``.
__all__ = [
    "TableQuality",
    "BlockProcessability",
    "TableQualityResult",
    "TableQualityConfig",
    "TableQualityAnalyzer",
]