xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,750 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Table Quality Analyzer for PDF Handler
|
|
3
|
+
|
|
4
|
+
Analyzes table quality to determine whether text extraction is feasible.
|
|
5
|
+
|
|
6
|
+
=============================================================================
|
|
7
|
+
Core Concepts:
|
|
8
|
+
=============================================================================
|
|
9
|
+
Processing all tables as images is inefficient.
|
|
10
|
+
Normal tables (with complete borders and regular grids) should be extracted as text.
|
|
11
|
+
|
|
12
|
+
Evaluation Criteria:
|
|
13
|
+
1. Border Completeness - Is the table fully enclosed on all sides?
|
|
14
|
+
2. Grid Regularity - Is it composed of orthogonal horizontal/vertical lines?
|
|
15
|
+
3. Cell Structure - Are cells in regular rectangular shapes?
|
|
16
|
+
4. Absence of Complex Elements - No curves, diagonals, or complex graphics?
|
|
17
|
+
|
|
18
|
+
=============================================================================
|
|
19
|
+
Table Quality Grades:
|
|
20
|
+
=============================================================================
|
|
21
|
+
- EXCELLENT: Perfect table → Must use text extraction
|
|
22
|
+
- GOOD: Good table → Text extraction recommended
|
|
23
|
+
- MODERATE: Table with minor issues → Attempt text extraction, use image if it fails
|
|
24
|
+
- POOR: Table with major issues → Image conversion recommended
|
|
25
|
+
- UNPROCESSABLE: Cannot process → Must use image conversion
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
import logging
|
|
29
|
+
from dataclasses import dataclass, field
|
|
30
|
+
from typing import List, Dict, Optional, Tuple, Set, Any
|
|
31
|
+
from enum import Enum, auto
|
|
32
|
+
|
|
33
|
+
import fitz
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ============================================================================
|
|
39
|
+
# Types and Enums
|
|
40
|
+
# ============================================================================
|
|
41
|
+
|
|
42
|
+
class TableQuality(Enum):
    """Grade assigned to a table region, listed from best to worst.

    The members carry the values 1-5, which are the values ``auto()``
    assigns for this declaration order.
    """

    # Perfect table: text extraction must be used.
    EXCELLENT = 1
    # Good table: text extraction is recommended.
    GOOD = 2
    # Medium quality: try text extraction and evaluate the outcome.
    MODERATE = 3
    # Has issues: image conversion is recommended.
    POOR = 4
    # Cannot be processed as a table: image conversion must be used.
    UNPROCESSABLE = 5
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class BlockProcessability(Enum):
    """How a page block should be processed.

    The members carry the values 1-4, which are the values ``auto()``
    assigns for this declaration order.
    """

    # Plain text extraction is possible.
    TEXT_EXTRACTABLE = 1
    # Structured table extraction is possible.
    TABLE_EXTRACTABLE = 2
    # The block has to go through OCR.
    NEEDS_OCR = 3
    # The block has to be converted to an image.
    IMAGE_REQUIRED = 4
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class TableQualityResult:
    """Outcome of analysing one table region.

    The first three fields are required; every other field has a default,
    so a result can be built from just a bounding box, a grade and a score.
    """

    # Bounding box of the analysed region.
    bbox: Tuple[float, float, float, float]
    # Overall grade of the table.
    quality: TableQuality
    # Overall score in the range 0.0 ~ 1.0 (higher is better).
    score: float

    # --- Per-criterion scores (each defaults to 1.0) ---
    border_completeness: float = 1.0   # are the outer borders present?
    grid_regularity: float = 1.0       # are grid lines orthogonal and aligned?
    cell_structure: float = 1.0        # are cell sizes and proportions sane?
    no_complex_elements: float = 1.0   # absence of curves/diagonals/fills

    # --- What the caller is advised to do with the region ---
    recommended_action: BlockProcessability = BlockProcessability.TABLE_EXTRACTABLE

    # --- Human-readable problems found; a fresh list per instance ---
    issues: List[str] = field(default_factory=list)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ============================================================================
|
|
80
|
+
# Configuration
|
|
81
|
+
# ============================================================================
|
|
82
|
+
|
|
83
|
+
@dataclass
class TableQualityConfig:
    """Tunable thresholds for table quality analysis.

    Field order is part of the positional constructor interface and is
    kept exactly as declared.
    """

    # ---- Border completeness ----
    # Minimum number of sides for a complete table.
    BORDER_REQUIRED_SIDES: int = 4
    # How far (pt) a line may sit from a bbox edge and still count as a border.
    BORDER_TOLERANCE: float = 5.0

    # ---- Grid regularity ----
    # Horizontal/vertical angle tolerance (degrees).
    # NOTE(review): the line classifiers compare this value against
    # coordinate differences between line end points rather than an
    # angle - confirm the intended unit.
    LINE_ANGLE_TOLERANCE: float = 2.0
    # Maximum gap (pt) between line positions that are clustered together.
    GRID_ALIGNMENT_TOLERANCE: float = 3.0
    # Minimum share of orthogonal lines (90%+ for a normal table).
    MIN_ORTHOGONAL_RATIO: float = 0.9

    # ---- Cell structure ----
    # Row heights / column widths below this size (pt) count as tiny.
    MIN_CELL_SIZE: float = 10.0
    # Largest accepted long-side / short-side ratio of a cell.
    MAX_CELL_ASPECT_RATIO: float = 20.0

    # ---- Complex elements ----
    # Share of curve items above which curves are reported (5% or less is fine).
    MAX_CURVE_RATIO: float = 0.05
    # Share of diagonal line items above which diagonals are reported.
    MAX_DIAGONAL_RATIO: float = 0.05

    # ---- Quality grade thresholds (minimum total score per grade) ----
    QUALITY_EXCELLENT: float = 0.95
    QUALITY_GOOD: float = 0.85
    QUALITY_MODERATE: float = 0.65
    # Scores below this value are graded UNPROCESSABLE.
    QUALITY_POOR: float = 0.40
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# ============================================================================
|
|
111
|
+
# Table Quality Analyzer
|
|
112
|
+
# ============================================================================
|
|
113
|
+
|
|
114
|
+
class TableQualityAnalyzer:
|
|
115
|
+
"""
|
|
116
|
+
Table Quality Analyzer
|
|
117
|
+
|
|
118
|
+
Analyzes table regions to determine whether text extraction is feasible.
|
|
119
|
+
"""
|
|
120
|
+
|
|
121
|
+
def __init__(
    self,
    page,
    page_num: int = 0,
    config: Optional[TableQualityConfig] = None
):
    """
    Bind the analyzer to a single page and reset its lazy caches.

    Args:
        page: PyMuPDF page object; its ``rect`` supplies the page size
        page_num: Page number (0-indexed), default 0
        config: Analysis configuration; a falsy value (such as ``None``)
            is replaced by a default ``TableQualityConfig()``
    """
    if not config:
        config = TableQualityConfig()

    self.page = page
    self.page_num = page_num
    self.config = config

    # Page dimensions taken from the page rectangle.
    self.page_width, self.page_height = page.rect.width, page.rect.height

    # Lazily filled caches: both stay None until first needed.
    self._drawings = None
    self._text_dict = None
|
|
143
|
+
|
|
144
|
+
def analyze_table(
    self,
    bbox: Tuple[float, float, float, float]
) -> TableQualityResult:
    """
    Grade a table region and recommend how to process it.

    Four sub-scores are computed from the drawings overlapping ``bbox``
    and combined into a weighted total (border 30%, grid 30%, cell
    structure 20%, element simplicity 20%). The total is compared with
    the ``QUALITY_*`` thresholds of the configuration, best grade first.
    EXCELLENT, GOOD and MODERATE recommend table extraction; POOR and
    UNPROCESSABLE recommend image conversion.

    Args:
        bbox: Table region bounding box

    Returns:
        TableQualityResult holding the grade, the total score, the four
        sub-scores, the recommended action and all collected issues
    """
    drawings = self._get_region_drawings(bbox)

    # The analyses run in a fixed order; the issue list keeps that order.
    border_score, border_issues = self._analyze_border_completeness(bbox, drawings)
    grid_score, grid_issues = self._analyze_grid_regularity(bbox, drawings)
    cell_score, cell_issues = self._analyze_cell_structure(bbox, drawings)
    simple_score, simple_issues = self._analyze_element_simplicity(bbox, drawings)
    issues = [*border_issues, *grid_issues, *cell_issues, *simple_issues]

    # Weighted average of the four sub-scores.
    total_score = (
        border_score * 0.30 +
        grid_score * 0.30 +
        cell_score * 0.20 +
        simple_score * 0.20
    )

    # Threshold ladder, best grade first; the first threshold met wins.
    cfg = self.config
    grade_ladder = (
        (cfg.QUALITY_EXCELLENT, TableQuality.EXCELLENT, BlockProcessability.TABLE_EXTRACTABLE),
        (cfg.QUALITY_GOOD, TableQuality.GOOD, BlockProcessability.TABLE_EXTRACTABLE),
        (cfg.QUALITY_MODERATE, TableQuality.MODERATE, BlockProcessability.TABLE_EXTRACTABLE),
        (cfg.QUALITY_POOR, TableQuality.POOR, BlockProcessability.IMAGE_REQUIRED),
    )
    # Fallback when no threshold is reached.
    quality = TableQuality.UNPROCESSABLE
    action = BlockProcessability.IMAGE_REQUIRED
    for minimum, grade, grade_action in grade_ladder:
        if total_score >= minimum:
            quality, action = grade, grade_action
            break

    logger.debug(f"[TableQualityAnalyzer] Table at {bbox}: "
                 f"quality={quality.name}, score={total_score:.2f}, "
                 f"border={border_score:.2f}, grid={grid_score:.2f}, "
                 f"cell={cell_score:.2f}, simple={simple_score:.2f}")

    return TableQualityResult(
        bbox=bbox,
        quality=quality,
        score=total_score,
        border_completeness=border_score,
        grid_regularity=grid_score,
        cell_structure=cell_score,
        no_complex_elements=simple_score,
        recommended_action=action,
        issues=issues
    )
|
|
218
|
+
|
|
219
|
+
def _get_region_drawings(
|
|
220
|
+
self,
|
|
221
|
+
bbox: Tuple[float, float, float, float]
|
|
222
|
+
) -> List[Dict]:
|
|
223
|
+
"""Extract drawings within the region"""
|
|
224
|
+
if self._drawings is None:
|
|
225
|
+
self._drawings = self.page.get_drawings()
|
|
226
|
+
|
|
227
|
+
result = []
|
|
228
|
+
for d in self._drawings:
|
|
229
|
+
rect = d.get("rect")
|
|
230
|
+
if rect and self._bbox_overlaps(bbox, (rect.x0, rect.y0, rect.x1, rect.y1)):
|
|
231
|
+
result.append(d)
|
|
232
|
+
return result
|
|
233
|
+
|
|
234
|
+
def _get_drawings_cached(self) -> List[Dict]:
|
|
235
|
+
"""Return cached drawings for the entire page"""
|
|
236
|
+
if self._drawings is None:
|
|
237
|
+
self._drawings = self.page.get_drawings()
|
|
238
|
+
return self._drawings
|
|
239
|
+
|
|
240
|
+
def _get_region_text_blocks(
|
|
241
|
+
self,
|
|
242
|
+
bbox: Tuple[float, float, float, float]
|
|
243
|
+
) -> List[Dict]:
|
|
244
|
+
"""Extract text blocks within the region"""
|
|
245
|
+
if self._text_dict is None:
|
|
246
|
+
self._text_dict = self.page.get_text("dict", sort=True)
|
|
247
|
+
|
|
248
|
+
result = []
|
|
249
|
+
for block in self._text_dict.get("blocks", []):
|
|
250
|
+
if block.get("type") != 0:
|
|
251
|
+
continue
|
|
252
|
+
block_bbox = block.get("bbox", (0, 0, 0, 0))
|
|
253
|
+
if self._bbox_overlaps(bbox, block_bbox):
|
|
254
|
+
result.append(block)
|
|
255
|
+
return result
|
|
256
|
+
|
|
257
|
+
def _analyze_as_table(
    self,
    bbox: Tuple[float, float, float, float],
    drawings: List[Dict]
) -> Tuple[bool, Optional[TableQualityResult]]:
    """
    Decide whether the region looks like a table and, if so, grade it.

    A region qualifies when its drawings yield at least four lines, of
    which at least two are horizontal and at least two are vertical.

    Returns:
        ``(False, None)`` when the region does not qualify, otherwise
        ``(True, result)`` where ``result`` comes from ``analyze_table``
    """
    lines = self._extract_lines(drawings)

    # A rectangle alone already needs four lines.
    if len(lines) < 4:
        return False, None

    horizontal_count = sum(1 for segment in lines if segment['is_horizontal'])
    vertical_count = sum(1 for segment in lines if segment['is_vertical'])

    # A table needs lines running in both directions.
    if horizontal_count < 2 or vertical_count < 2:
        return False, None

    return True, self.analyze_table(bbox)
|
|
281
|
+
|
|
282
|
+
def _analyze_border_completeness(
|
|
283
|
+
self,
|
|
284
|
+
bbox: Tuple[float, float, float, float],
|
|
285
|
+
drawings: List[Dict]
|
|
286
|
+
) -> Tuple[float, List[str]]:
|
|
287
|
+
"""Analyze border completeness"""
|
|
288
|
+
issues = []
|
|
289
|
+
lines = self._extract_lines(drawings)
|
|
290
|
+
|
|
291
|
+
if not lines:
|
|
292
|
+
issues.append("No border lines detected")
|
|
293
|
+
return 0.0, issues
|
|
294
|
+
|
|
295
|
+
# Border detection
|
|
296
|
+
tolerance = self.config.BORDER_TOLERANCE
|
|
297
|
+
x0, y0, x1, y1 = bbox
|
|
298
|
+
|
|
299
|
+
has_top = False
|
|
300
|
+
has_bottom = False
|
|
301
|
+
has_left = False
|
|
302
|
+
has_right = False
|
|
303
|
+
|
|
304
|
+
for line in lines:
|
|
305
|
+
if line['is_horizontal']:
|
|
306
|
+
# Top border
|
|
307
|
+
if abs(line['y1'] - y0) <= tolerance and line['x1'] >= x0 and line['x2'] <= x1:
|
|
308
|
+
has_top = True
|
|
309
|
+
# Bottom border
|
|
310
|
+
elif abs(line['y1'] - y1) <= tolerance and line['x1'] >= x0 and line['x2'] <= x1:
|
|
311
|
+
has_bottom = True
|
|
312
|
+
|
|
313
|
+
if line['is_vertical']:
|
|
314
|
+
# Left border
|
|
315
|
+
if abs(line['x1'] - x0) <= tolerance and line['y1'] >= y0 and line['y2'] <= y1:
|
|
316
|
+
has_left = True
|
|
317
|
+
# Right border
|
|
318
|
+
elif abs(line['x1'] - x1) <= tolerance and line['y1'] >= y0 and line['y2'] <= y1:
|
|
319
|
+
has_right = True
|
|
320
|
+
|
|
321
|
+
sides = [has_top, has_bottom, has_left, has_right]
|
|
322
|
+
complete_sides = sum(sides)
|
|
323
|
+
|
|
324
|
+
if complete_sides < 4:
|
|
325
|
+
missing = []
|
|
326
|
+
if not has_top: missing.append("top")
|
|
327
|
+
if not has_bottom: missing.append("bottom")
|
|
328
|
+
if not has_left: missing.append("left")
|
|
329
|
+
if not has_right: missing.append("right")
|
|
330
|
+
issues.append(f"Missing borders: {', '.join(missing)}")
|
|
331
|
+
|
|
332
|
+
return complete_sides / 4.0, issues
|
|
333
|
+
|
|
334
|
+
def _analyze_grid_regularity(
|
|
335
|
+
self,
|
|
336
|
+
bbox: Tuple[float, float, float, float],
|
|
337
|
+
drawings: List[Dict]
|
|
338
|
+
) -> Tuple[float, List[str]]:
|
|
339
|
+
"""Analyze grid regularity"""
|
|
340
|
+
issues = []
|
|
341
|
+
lines = self._extract_lines(drawings)
|
|
342
|
+
|
|
343
|
+
if not lines:
|
|
344
|
+
return 0.0, ["No grid lines"]
|
|
345
|
+
|
|
346
|
+
# Calculate orthogonal line ratio
|
|
347
|
+
orthogonal_count = sum(1 for l in lines if l['is_horizontal'] or l['is_vertical'])
|
|
348
|
+
total_lines = len(lines)
|
|
349
|
+
|
|
350
|
+
orthogonal_ratio = orthogonal_count / total_lines if total_lines > 0 else 0
|
|
351
|
+
|
|
352
|
+
if orthogonal_ratio < self.config.MIN_ORTHOGONAL_RATIO:
|
|
353
|
+
issues.append(f"Non-orthogonal lines: {(1-orthogonal_ratio)*100:.1f}%")
|
|
354
|
+
|
|
355
|
+
# Analyze line alignment
|
|
356
|
+
h_lines = [l for l in lines if l['is_horizontal']]
|
|
357
|
+
v_lines = [l for l in lines if l['is_vertical']]
|
|
358
|
+
|
|
359
|
+
# Check Y-coordinate alignment of horizontal lines
|
|
360
|
+
h_alignment = self._check_line_alignment([l['y1'] for l in h_lines])
|
|
361
|
+
# Check X-coordinate alignment of vertical lines
|
|
362
|
+
v_alignment = self._check_line_alignment([l['x1'] for l in v_lines])
|
|
363
|
+
|
|
364
|
+
alignment_score = (h_alignment + v_alignment) / 2
|
|
365
|
+
|
|
366
|
+
if alignment_score < 0.8:
|
|
367
|
+
issues.append("Misaligned grid lines")
|
|
368
|
+
|
|
369
|
+
return (orthogonal_ratio * 0.6 + alignment_score * 0.4), issues
|
|
370
|
+
|
|
371
|
+
def _analyze_cell_structure(
|
|
372
|
+
self,
|
|
373
|
+
bbox: Tuple[float, float, float, float],
|
|
374
|
+
drawings: List[Dict]
|
|
375
|
+
) -> Tuple[float, List[str]]:
|
|
376
|
+
"""Analyze cell structure"""
|
|
377
|
+
issues = []
|
|
378
|
+
lines = self._extract_lines(drawings)
|
|
379
|
+
|
|
380
|
+
h_lines = sorted([l for l in lines if l['is_horizontal']], key=lambda l: l['y1'])
|
|
381
|
+
v_lines = sorted([l for l in lines if l['is_vertical']], key=lambda l: l['x1'])
|
|
382
|
+
|
|
383
|
+
if len(h_lines) < 2 or len(v_lines) < 2:
|
|
384
|
+
issues.append("Insufficient lines for cell structure")
|
|
385
|
+
return 0.5, issues
|
|
386
|
+
|
|
387
|
+
# Analyze cell sizes
|
|
388
|
+
cell_heights = []
|
|
389
|
+
for i in range(len(h_lines) - 1):
|
|
390
|
+
height = h_lines[i+1]['y1'] - h_lines[i]['y1']
|
|
391
|
+
if height > 0:
|
|
392
|
+
cell_heights.append(height)
|
|
393
|
+
|
|
394
|
+
cell_widths = []
|
|
395
|
+
for i in range(len(v_lines) - 1):
|
|
396
|
+
width = v_lines[i+1]['x1'] - v_lines[i]['x1']
|
|
397
|
+
if width > 0:
|
|
398
|
+
cell_widths.append(width)
|
|
399
|
+
|
|
400
|
+
# Check for cells that are too small
|
|
401
|
+
tiny_cells = 0
|
|
402
|
+
for h in cell_heights:
|
|
403
|
+
if h < self.config.MIN_CELL_SIZE:
|
|
404
|
+
tiny_cells += 1
|
|
405
|
+
for w in cell_widths:
|
|
406
|
+
if w < self.config.MIN_CELL_SIZE:
|
|
407
|
+
tiny_cells += 1
|
|
408
|
+
|
|
409
|
+
total_cells = len(cell_heights) + len(cell_widths)
|
|
410
|
+
if total_cells > 0 and tiny_cells / total_cells > 0.1:
|
|
411
|
+
issues.append("Too many tiny cells")
|
|
412
|
+
|
|
413
|
+
# Check for extreme aspect ratios
|
|
414
|
+
extreme_ratio_count = 0
|
|
415
|
+
for h in cell_heights:
|
|
416
|
+
for w in cell_widths:
|
|
417
|
+
if h > 0 and w > 0:
|
|
418
|
+
ratio = max(h/w, w/h)
|
|
419
|
+
if ratio > self.config.MAX_CELL_ASPECT_RATIO:
|
|
420
|
+
extreme_ratio_count += 1
|
|
421
|
+
|
|
422
|
+
if extreme_ratio_count > 0:
|
|
423
|
+
issues.append("Extreme cell aspect ratios")
|
|
424
|
+
|
|
425
|
+
# Calculate score
|
|
426
|
+
score = 1.0
|
|
427
|
+
if tiny_cells > 0:
|
|
428
|
+
score -= 0.2
|
|
429
|
+
if extreme_ratio_count > 0:
|
|
430
|
+
score -= 0.2
|
|
431
|
+
|
|
432
|
+
return max(0.0, score), issues
|
|
433
|
+
|
|
434
|
+
def _analyze_element_simplicity(
|
|
435
|
+
self,
|
|
436
|
+
bbox: Tuple[float, float, float, float],
|
|
437
|
+
drawings: List[Dict]
|
|
438
|
+
) -> Tuple[float, List[str]]:
|
|
439
|
+
"""Analyze element simplicity (absence of curves, diagonals, and other complex elements)"""
|
|
440
|
+
issues = []
|
|
441
|
+
|
|
442
|
+
if not drawings:
|
|
443
|
+
return 1.0, issues
|
|
444
|
+
|
|
445
|
+
curve_count = 0
|
|
446
|
+
diagonal_count = 0
|
|
447
|
+
fill_count = 0
|
|
448
|
+
total_items = 0
|
|
449
|
+
|
|
450
|
+
for d in drawings:
|
|
451
|
+
items = d.get("items", [])
|
|
452
|
+
total_items += len(items)
|
|
453
|
+
|
|
454
|
+
for item in items:
|
|
455
|
+
item_type = item[0]
|
|
456
|
+
if item_type == 'c': # curve
|
|
457
|
+
curve_count += 1
|
|
458
|
+
elif item_type == 'l': # line
|
|
459
|
+
# Check for diagonal
|
|
460
|
+
p1, p2 = item[1], item[2]
|
|
461
|
+
if not self._is_orthogonal_line(p1, p2):
|
|
462
|
+
diagonal_count += 1
|
|
463
|
+
|
|
464
|
+
if d.get("fill"):
|
|
465
|
+
fill_count += 1
|
|
466
|
+
|
|
467
|
+
# Calculate ratios
|
|
468
|
+
curve_ratio = curve_count / max(1, total_items)
|
|
469
|
+
diagonal_ratio = diagonal_count / max(1, total_items)
|
|
470
|
+
fill_ratio = fill_count / max(1, len(drawings))
|
|
471
|
+
|
|
472
|
+
# Detect issues
|
|
473
|
+
if curve_ratio > self.config.MAX_CURVE_RATIO:
|
|
474
|
+
issues.append(f"Too many curves: {curve_ratio*100:.1f}%")
|
|
475
|
+
|
|
476
|
+
if diagonal_ratio > self.config.MAX_DIAGONAL_RATIO:
|
|
477
|
+
issues.append(f"Too many diagonals: {diagonal_ratio*100:.1f}%")
|
|
478
|
+
|
|
479
|
+
if fill_ratio > 0.5:
|
|
480
|
+
issues.append("Heavy fill patterns")
|
|
481
|
+
|
|
482
|
+
# Calculate score
|
|
483
|
+
score = 1.0
|
|
484
|
+
score -= min(0.3, curve_ratio * 3)
|
|
485
|
+
score -= min(0.3, diagonal_ratio * 3)
|
|
486
|
+
score -= min(0.2, fill_ratio * 0.4)
|
|
487
|
+
|
|
488
|
+
return max(0.0, score), issues
|
|
489
|
+
|
|
490
|
+
def _extract_lines(self, drawings: List[Dict]) -> List[Dict]:
|
|
491
|
+
"""Extract lines from drawings"""
|
|
492
|
+
lines = []
|
|
493
|
+
|
|
494
|
+
for d in drawings:
|
|
495
|
+
for item in d.get("items", []):
|
|
496
|
+
if item[0] == 'l': # straight line
|
|
497
|
+
p1, p2 = item[1], item[2]
|
|
498
|
+
x1, y1 = p1.x, p1.y
|
|
499
|
+
x2, y2 = p2.x, p2.y
|
|
500
|
+
|
|
501
|
+
# Determine horizontal/vertical
|
|
502
|
+
angle_tolerance = self.config.LINE_ANGLE_TOLERANCE
|
|
503
|
+
is_horizontal = abs(y2 - y1) <= angle_tolerance
|
|
504
|
+
is_vertical = abs(x2 - x1) <= angle_tolerance
|
|
505
|
+
|
|
506
|
+
lines.append({
|
|
507
|
+
'x1': min(x1, x2),
|
|
508
|
+
'y1': min(y1, y2),
|
|
509
|
+
'x2': max(x1, x2),
|
|
510
|
+
'y2': max(y1, y2),
|
|
511
|
+
'is_horizontal': is_horizontal,
|
|
512
|
+
'is_vertical': is_vertical,
|
|
513
|
+
'length': ((x2-x1)**2 + (y2-y1)**2) ** 0.5
|
|
514
|
+
})
|
|
515
|
+
elif item[0] == 're': # rectangle
|
|
516
|
+
rect = item[1]
|
|
517
|
+
x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
|
|
518
|
+
|
|
519
|
+
# Add rectangle's 4 sides as lines
|
|
520
|
+
lines.extend([
|
|
521
|
+
{'x1': x0, 'y1': y0, 'x2': x1, 'y2': y0, 'is_horizontal': True, 'is_vertical': False, 'length': x1-x0}, # top
|
|
522
|
+
{'x1': x0, 'y1': y1, 'x2': x1, 'y2': y1, 'is_horizontal': True, 'is_vertical': False, 'length': x1-x0}, # bottom
|
|
523
|
+
{'x1': x0, 'y1': y0, 'x2': x0, 'y2': y1, 'is_horizontal': False, 'is_vertical': True, 'length': y1-y0}, # left
|
|
524
|
+
{'x1': x1, 'y1': y0, 'x2': x1, 'y2': y1, 'is_horizontal': False, 'is_vertical': True, 'length': y1-y0}, # right
|
|
525
|
+
])
|
|
526
|
+
|
|
527
|
+
return lines
|
|
528
|
+
|
|
529
|
+
def _is_orthogonal_line(self, p1, p2) -> bool:
    """
    Tell whether the segment p1 -> p2 is (nearly) axis-aligned.

    The segment qualifies when its extent along x or along y does not
    exceed ``config.LINE_ANGLE_TOLERANCE``.
    """
    limit = self.config.LINE_ANGLE_TOLERANCE
    if abs(p2.x - p1.x) <= limit:
        return True
    return abs(p2.y - p1.y) <= limit
|
|
533
|
+
|
|
534
|
+
def _check_line_alignment(self, positions: List[float]) -> float:
    """
    Score how well line positions snap onto shared grid coordinates.

    Positions are sorted and chained into groups: a position joins the
    current group when its gap to the previous position is at most
    ``config.GRID_ALIGNMENT_TOLERANCE``. The score is the fraction of
    positions that ended up in a group of two or more.

    Returns:
        1.0 when fewer than two positions are given, otherwise a ratio
        in [0, 1].
    """
    if len(positions) < 2:
        return 1.0

    gap_limit = self.config.GRID_ALIGNMENT_TOLERANCE
    ordered = sorted(positions)

    aligned_count = 0   # positions belonging to groups of size >= 2
    run_length = 1      # size of the group currently being built
    previous = ordered[0]

    for value in ordered[1:]:
        if value - previous <= gap_limit:
            run_length += 1
        else:
            # Group closed: it only counts if it has company.
            if run_length > 1:
                aligned_count += run_length
            run_length = 1
        previous = value

    # Account for the final, still-open group.
    if run_length > 1:
        aligned_count += run_length

    return aligned_count / len(positions)
|
|
557
|
+
|
|
558
|
+
def _analyze_text_quality(self, text_blocks: List[Dict]) -> float:
    """
    Rate text quality as the share of characters outside the Private Use Area.

    Walks ``block["lines"][*]["spans"][*]["text"]`` and counts characters
    whose code point lies in U+E000..U+F8FF as bad.

    Returns:
        0.0 when there are no blocks or no characters at all, otherwise
        ``1.0 - bad / total``.
    """
    if not text_blocks:
        return 0.0

    char_total = 0
    pua_total = 0

    for blk in text_blocks:
        for row in blk.get("lines", []):
            for piece in row.get("spans", []):
                content = piece.get("text", "")
                char_total += len(content)
                # Private Use Area code points
                pua_total += sum(
                    1 for ch in content if 0xE000 <= ord(ch) <= 0xF8FF
                )

    if char_total == 0:
        return 0.0

    return 1.0 - (pua_total / char_total)
|
|
581
|
+
|
|
582
|
+
def _bbox_overlaps(self, bbox1: Tuple, bbox2: Tuple) -> bool:
    """
    Report whether two ``(x0, y0, x1, y1)`` boxes overlap.

    Boxes that merely touch along an edge or at a corner are treated as
    non-overlapping.
    """
    # Separated (or only touching) along the x axis
    if bbox1[2] <= bbox2[0]:
        return False
    if bbox1[0] >= bbox2[2]:
        return False
    # Separated (or only touching) along the y axis
    if bbox1[3] <= bbox2[1]:
        return False
    if bbox1[1] >= bbox2[3]:
        return False
    return True
|
|
590
|
+
|
|
591
|
+
def analyze_page_tables(self) -> Dict[str, Any]:
    """
    Analyzes all table candidate regions on the page.

    Horizontal and vertical segments are collected from the cached
    drawings, grouped into candidate regions by ``_find_table_regions``
    and each region is rated with ``analyze_table``.

    Drawing items are accepted in two layouts:
        - geometry objects, the layout ``_extract_lines`` of this class
          reads: ``("l", p1, p2)`` with ``.x``/``.y`` points and
          ``("re", rect, ...)`` with ``.x0/.y0/.x1/.y1``
        - flat numbers: ``("l", x0, y0, x1, y1)`` and ``("re", x, y, w, h)``

    Returns:
        Dict containing:
        - table_candidates: List of table candidates (each with quality info)
        - has_processable_tables: Whether processable tables exist
        - summary: Analysis summary
    """
    # Search for table candidate regions from drawings
    drawings = self._get_drawings_cached()

    # Extract lines
    h_lines = []
    v_lines = []

    for d in drawings:
        items = d.get("items", [])
        for item in items:
            cmd = item[0] if item else None

            if cmd == "l":  # line
                if len(item) >= 5:
                    # Flat layout: ("l", x0, y0, x1, y1)
                    x0, y0, x1, y1 = item[1], item[2], item[3], item[4]
                else:
                    # Point layout: ("l", p1, p2). Indexing item[3]/item[4]
                    # here would raise IndexError.
                    # NOTE(review): presumably PyMuPDF get_drawings() output - confirm.
                    p1, p2 = item[1], item[2]
                    x0, y0, x1, y1 = p1.x, p1.y, p2.x, p2.y

                if abs(y1 - y0) < 3:  # horizontal line
                    h_lines.append((min(x0, x1), y0, max(x0, x1), y1))
                elif abs(x1 - x0) < 3:  # vertical line
                    v_lines.append((x0, min(y0, y1), x1, max(y0, y1)))

            elif cmd == "re":  # rect
                if len(item) >= 5:
                    # Flat layout: ("re", x, y, w, h)
                    x, y, w, h = item[1], item[2], item[3], item[4]
                else:
                    # Rect layout: ("re", rect, ...) - derive origin and size
                    rect = item[1]
                    x, y = rect.x0, rect.y0
                    w, h = rect.x1 - rect.x0, rect.y1 - rect.y0

                # Skip small rectangles (at most 20 wide or 10 high)
                if w > 20 and h > 10:
                    # Add rectangle's four sides as lines
                    h_lines.append((x, y, x + w, y))  # top
                    h_lines.append((x, y + h, x + w, y + h))  # bottom
                    v_lines.append((x, y, x, y + h))  # left
                    v_lines.append((x + w, y, x + w, y + h))  # right

    # Find table candidate regions (areas with dense lines)
    table_candidates = self._find_table_regions(h_lines, v_lines)

    results = []
    for bbox in table_candidates:
        quality_result = self.analyze_table(bbox)
        results.append({
            'bbox': bbox,
            'quality': quality_result.quality,
            'score': quality_result.score,
            'is_processable': quality_result.recommended_action == BlockProcessability.TABLE_EXTRACTABLE,
            'issues': quality_result.issues
        })

    has_processable = any(r['is_processable'] for r in results)

    summary = {
        'total_candidates': len(results),
        'processable': sum(1 for r in results if r['is_processable']),
        'unprocessable': sum(1 for r in results if not r['is_processable']),
    }

    logger.info(f"[TableQualityAnalyzer] Page {self.page_num + 1}: "
                f"Found {summary['total_candidates']} table candidates, "
                f"{summary['processable']} processable")

    return {
        'table_candidates': results,
        'has_processable_tables': has_processable,
        'summary': summary
    }
|
|
661
|
+
|
|
662
|
+
def _find_table_regions(
    self,
    h_lines: List[Tuple],
    v_lines: List[Tuple]
) -> List[Tuple[float, float, float, float]]:
    """
    Derive table candidate boxes from horizontal and vertical lines.

    Lines are grouped around seed lines: each not-yet-claimed line becomes
    a seed and claims every later unclaimed line that
    ``_lines_are_close`` reports as near it (50-unit tolerance). Closeness
    is only tested against the seed, not propagated between members.
    Groups with fewer than 4 lines are dropped, but their lines stay claimed.

    Returns:
        ``(x0, y0, x1, y1)`` boxes of the groups that are wider than 100
        and taller than 50. Empty when either input list is empty.
    """
    if not h_lines or not v_lines:
        return []

    segments = h_lines + v_lines
    proximity = 50  # pixels
    min_group_size = 4  # At least 4 lines required for a table candidate

    claimed = set()
    groups = []

    for seed_idx, seed in enumerate(segments):
        if seed_idx in claimed:
            continue
        claimed.add(seed_idx)

        # Every index before the seed is already claimed, so only later
        # lines can still join this group.
        joined = [
            idx
            for idx in range(seed_idx + 1, len(segments))
            if idx not in claimed
            and self._lines_are_close(seed, segments[idx], proximity)
        ]
        claimed.update(joined)

        if 1 + len(joined) >= min_group_size:
            groups.append([seed] + [segments[idx] for idx in joined])

    # Convert groups to bounding boxes, keeping only sufficiently large ones
    regions = []
    for group in groups:
        left = min(min(seg[0], seg[2]) for seg in group)
        top = min(min(seg[1], seg[3]) for seg in group)
        right = max(max(seg[0], seg[2]) for seg in group)
        bottom = max(max(seg[1], seg[3]) for seg in group)

        if (right - left) > 100 and (bottom - top) > 50:
            regions.append((left, top, right, bottom))

    return regions
|
|
717
|
+
|
|
718
|
+
def _lines_are_close(
    self,
    line1: Tuple,
    line2: Tuple,
    tolerance: float
) -> bool:
    """
    Decide whether two ``(x0, y0, x1, y1)`` lines lie near each other.

    Each line is reduced to its bounding box; the lines are close when the
    boxes overlap or are separated by no more than ``tolerance`` on both
    axes.
    """
    # Bounding-box extents of each line (endpoints may come in any order)
    a_left, a_top = min(line1[0], line1[2]), min(line1[1], line1[3])
    a_right, a_bottom = max(line1[0], line1[2]), max(line1[1], line1[3])
    b_left, b_top = min(line2[0], line2[2]), min(line2[1], line2[3])
    b_right, b_bottom = max(line2[0], line2[2]), max(line2[1], line2[3])

    # Gap along x larger than the tolerance
    if a_right + tolerance < b_left:
        return False
    if b_right + tolerance < a_left:
        return False
    # Gap along y larger than the tolerance
    if a_bottom + tolerance < b_top:
        return False
    if b_bottom + tolerance < a_top:
        return False
    return True
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
# ============================================================================
|
|
741
|
+
# Export
|
|
742
|
+
# ============================================================================
|
|
743
|
+
|
|
744
|
+
# Public names of this module (what a star-import exposes).
__all__ = [
    "TableQuality",
    "BlockProcessability",
    "TableQualityResult",
    "TableQualityConfig",
    "TableQualityAnalyzer",
]
|