xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py
@@ -0,0 +1,1346 @@
"""
Table Detection Engine for PDF Handler

Detects tables using multiple strategies and selects the best results.
Includes graphic region exclusion and fake table filtering capabilities.
Improved cell extraction accuracy.
"""

import logging
from typing import List, Dict, Optional, Tuple, Any, Set

import fitz
import pdfplumber

from xgen_doc2chunk.core.processor.pdf_helpers.types import (
    PDFConfig,
    TableDetectionStrategy,
    GridInfo,
    CellInfo,
    TableCandidate,
)
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_line_analysis import LineAnalysisEngine
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_graphic_detector import GraphicRegionDetector
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_validator import TableQualityValidator

logger = logging.getLogger(__name__)


# ============================================================================
# Table Detection Engine
# ============================================================================

class TableDetectionEngine:
    """
    Table Detection Engine

    Detects tables using multiple strategies and selects the best results.

    Features:
    - GraphicRegionDetector integration to exclude vector graphic regions
    - TableQualityValidator integration to filter fake tables

    Supported Strategies:
    1. PyMuPDF find_tables() - Most accurate, preferred
    2. pdfplumber - Line-based detection
    3. Line-based - Direct line analysis
    """

    # Configuration constants
    CONFIDENCE_THRESHOLD = getattr(PDFConfig, 'CONFIDENCE_THRESHOLD', 0.5)
    MIN_TABLE_ROWS = getattr(PDFConfig, 'MIN_TABLE_ROWS', 2)
    MIN_TABLE_COLS = getattr(PDFConfig, 'MIN_TABLE_COLS', 2)

    def __init__(self, page, page_num: int, file_path: str):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
            file_path: PDF file path
        """
        self.page = page
        self.page_num = page_num
        self.file_path = file_path
        self.page_width = page.rect.width
        self.page_height = page.rect.height

        # Line analysis engine
        self.line_engine = LineAnalysisEngine(page, self.page_width, self.page_height)
        self.h_lines, self.v_lines = self.line_engine.analyze()

        # Graphic region detector
        self.graphic_detector = GraphicRegionDetector(page, page_num)
        self.graphic_regions = self.graphic_detector.detect()

        # Table quality validator
        self.quality_validator = TableQualityValidator(page, self.graphic_detector)

    def detect_tables(self) -> List[TableCandidate]:
        """
        Detect tables using all strategies.

        Returns:
            List of table candidates sorted by confidence.
        """
        candidates: List[TableCandidate] = []

        # Strategy 1: PyMuPDF
        pymupdf_candidates = self._detect_with_pymupdf()

        # Pre-merge adjacent header-data tables (before validation)
        pymupdf_candidates = self._merge_header_data_tables(pymupdf_candidates)
        candidates.extend(pymupdf_candidates)

        # Strategy 2: pdfplumber
        pdfplumber_candidates = self._detect_with_pdfplumber()
        pdfplumber_candidates = self._merge_header_data_tables(pdfplumber_candidates)
        candidates.extend(pdfplumber_candidates)

        # Strategy 3: Line-based (HYBRID_ANALYSIS)
        # Used only when PyMuPDF and pdfplumber don't find tables
        # Or used additionally with stricter validation
        line_candidates = self._detect_with_lines()

        # Enhanced cross-validation for HYBRID results
        if line_candidates and not pymupdf_candidates:
            # When PyMuPDF didn't find tables but HYBRID did
            # Apply higher confidence threshold (0.65 or above)
            line_candidates = [
                c for c in line_candidates
                if c.confidence >= 0.65
            ]
            logger.debug(f"[TableDetection] HYBRID-only detection: "
                         f"{len(line_candidates)} candidates passed stricter threshold (0.65)")

        candidates.extend(line_candidates)

        # Filter fake tables through quality validation
        validated_candidates = self._validate_candidates(candidates)

        # Select best candidates based on confidence
        selected = self._select_best_candidates(validated_candidates)

        return selected

    def _merge_header_data_tables(self, candidates: List[TableCandidate]) -> List[TableCandidate]:
        """
        Merge adjacent header-data tables.

        Conditions:
        1. First table has 1-2 rows (assumed to be header)
        2. Second table is directly below (Y gap < 30pt)
        3. X range is similar (80% or more overlap)
        4. Column count relationship: header columns <= data columns
        """
        if len(candidates) < 2:
            return candidates

        # Sort by Y position
        sorted_candidates = sorted(candidates, key=lambda c: c.bbox[1])
        merged = []
        skip_indices = set()

        for i, header_cand in enumerate(sorted_candidates):
            if i in skip_indices:
                continue

            # Check header candidate condition (1-2 rows)
            if len(header_cand.data) > 2:
                merged.append(header_cand)
                continue

            # Check if can merge with next table
            merged_cand = header_cand
            for j in range(i + 1, len(sorted_candidates)):
                if j in skip_indices:
                    continue

                data_cand = sorted_candidates[j]

                if self._can_merge_header_data(merged_cand, data_cand):
                    merged_cand = self._do_merge_header_data(merged_cand, data_cand)
                    skip_indices.add(j)
                    logger.debug(f"[TableDetection] Merged header with data table: "
                                 f"header rows={len(header_cand.data)}, "
                                 f"data rows={len(data_cand.data)}")
                else:
                    break

            merged.append(merged_cand)

        return merged

    def _can_merge_header_data(self, header: TableCandidate, data: TableCandidate) -> bool:
        """Determine if header and data tables can be merged."""
        # Check Y gap
        y_gap = data.bbox[1] - header.bbox[3]
        if y_gap < -5 or y_gap > 40:  # Allow slight overlap, max 40pt gap
            return False

        # Check X range overlap
        x_overlap_start = max(header.bbox[0], data.bbox[0])
        x_overlap_end = min(header.bbox[2], data.bbox[2])
        x_overlap = max(0, x_overlap_end - x_overlap_start)

        header_width = header.bbox[2] - header.bbox[0]
        data_width = data.bbox[2] - data.bbox[0]
        max_width = max(header_width, data_width)

        if max_width > 0 and x_overlap / max_width < 0.7:
            return False

        # Check column count relationship
        header_cols = max(len(row) for row in header.data) if header.data else 0
        data_cols = max(len(row) for row in data.data) if data.data else 0

        # Don't merge if header has more columns than data
        if header_cols > data_cols + 1:
            return False

        return True

    def _do_merge_header_data(self, header: TableCandidate, data: TableCandidate) -> TableCandidate:
        """Perform header and data table merge (includes subheader detection)."""
        # New bbox
        merged_bbox = (
            min(header.bbox[0], data.bbox[0]),
            header.bbox[1],
            max(header.bbox[2], data.bbox[2]),
            data.bbox[3]
        )

        # Determine column count
        header_cols = max(len(row) for row in header.data) if header.data else 0
        data_cols = max(len(row) for row in data.data) if data.data else 0
        merged_cols = max(header_cols, data_cols)

        # Detect subheader between header and data
        subheader_row = self._detect_subheader_between(header, data, merged_cols)

        # Merge data
        merged_data = []
        merged_cells = []

        # Process header rows
        for row_idx, row in enumerate(header.data):
            if len(row) < merged_cols:
                # Apply colspan if header has fewer columns
                adjusted_row = list(row)
                col_diff = merged_cols - len(row)

                # Apply colspan to second column
                if len(row) >= 2 and col_diff > 0:
                    # Store colspan info
                    merged_cells.append({
                        'row': row_idx,
                        'col': 1,
                        'rowspan': 1,
                        'colspan': 1 + col_diff,
                        'bbox': None
                    })
                    # Add empty columns
                    for _ in range(col_diff):
                        adjusted_row.insert(2, '')
                else:
                    adjusted_row.extend([''] * col_diff)

                merged_data.append(adjusted_row)
            else:
                merged_data.append(list(row))

        # Insert subheader row (header cell info)
        header_row_count = len(header.data)
        if subheader_row:
            merged_data.append(subheader_row)
            # Add cell info for subheader row (each cell has colspan=1)
            subheader_row_idx = header_row_count  # Row after header
            for col_idx, cell_value in enumerate(subheader_row):
                merged_cells.append({
                    'row': subheader_row_idx,
                    'col': col_idx,
                    'rowspan': 1,
                    'colspan': 1,
                    'bbox': None
                })
            header_row_count += 1
            logger.debug(f"[TableDetection] Added subheader row with cell info: {subheader_row}")

        # Header cell info
        if header.cells:
            for cell in header.cells:
                if not any(c['row'] == cell.row and c['col'] == cell.col for c in merged_cells):
                    merged_cells.append({
                        'row': cell.row,
                        'col': cell.col,
                        'rowspan': cell.rowspan,
                        'colspan': cell.colspan,
                        'bbox': cell.bbox
                    })

        # Process data rows
        for row_idx, row in enumerate(data.data):
            if len(row) < merged_cols:
                adjusted_row = list(row) + [''] * (merged_cols - len(row))
            else:
                adjusted_row = list(row)
            merged_data.append(adjusted_row)

        # Data cell info (apply row offset)
        if data.cells:
            for cell in data.cells:
                merged_cells.append({
                    'row': cell.row + header_row_count,
                    'col': cell.col,
                    'rowspan': cell.rowspan,
                    'colspan': cell.colspan,
                    'bbox': cell.bbox
                })

        # Convert cell info to CellInfo objects
        cell_objects = [
            CellInfo(
                row=c['row'],
                col=c['col'],
                rowspan=c.get('rowspan', 1),
                colspan=c.get('colspan', 1),
                # Use default value if bbox is None or missing
                bbox=c.get('bbox') or (0, 0, 0, 0)
            )
            for c in merged_cells
        ]

        return TableCandidate(
            strategy=header.strategy,
            confidence=max(header.confidence, data.confidence),
            bbox=merged_bbox,
            grid=header.grid or data.grid,
            cells=cell_objects,
            data=merged_data,
            raw_table=None
        )

    def _detect_subheader_between(self, header: TableCandidate, data: TableCandidate,
                                  num_cols: int) -> Optional[List[str]]:
        """
        Detect subheader row between header and data tables.

        Example: Sub-column headers like (A), (B), etc.
        """
        header_bottom = header.bbox[3]
        data_top = data.bbox[1]

        # Must have sufficient gap between header and data
        gap = data_top - header_bottom
        if gap < 5 or gap > 50:
            return None

        # Extract text from the region on the page
        page_dict = self.page.get_text("dict", sort=True)

        subheader_texts = []
        for block in page_dict.get("blocks", []):
            if block.get("type") != 0:
                continue

            for line in block.get("lines", []):
                line_bbox = line.get("bbox", (0, 0, 0, 0))
                line_y = (line_bbox[1] + line_bbox[3]) / 2

                # Check if located between header and data
                if header_bottom - 5 <= line_y <= data_top + 5:
                    # Check if within table X range
                    if line_bbox[0] >= header.bbox[0] - 10 and line_bbox[2] <= data.bbox[2] + 10:
                        for span in line.get("spans", []):
                            text = span.get("text", "").strip()
                            span_bbox = span.get("bbox", (0, 0, 0, 0))
                            if text and text not in [' ', '']:
                                subheader_texts.append({
                                    'text': text,
                                    'x0': span_bbox[0],
                                    'x1': span_bbox[2]
                                })

        if not subheader_texts:
            return None

        # Check subheader pattern: (A), (B), etc.
        has_subheader_pattern = any('(' in t['text'] and ')' in t['text'] for t in subheader_texts)
        if not has_subheader_pattern:
            return None

        # Construct subheader row
        table_left = min(header.bbox[0], data.bbox[0])
        table_width = max(header.bbox[2], data.bbox[2]) - table_left
        col_width = table_width / num_cols

        subheader_row = [''] * num_cols
        for item in sorted(subheader_texts, key=lambda x: x['x0']):
            relative_x = item['x0'] - table_left
            col_idx = min(int(relative_x / col_width), num_cols - 1)
            col_idx = max(0, col_idx)

            if subheader_row[col_idx]:
                subheader_row[col_idx] += ' ' + item['text']
            else:
                subheader_row[col_idx] = item['text']

        # Validate subheader (must have at least one (A), (B) pattern)
        valid_count = sum(1 for s in subheader_row if '(' in s and ')' in s)
        if valid_count < 1:
            return None

        return subheader_row

    def _validate_candidates(self, candidates: List[TableCandidate]) -> List[TableCandidate]:
        """
        Validate table candidates for quality.

        Validation criteria:
        1. Not overlapping with graphic regions (except PyMuPDF - text-based, high reliability)
        2. Sufficient filled cell ratio
        3. Has meaningful data

        Tables detected with PyMuPDF strategy skip graphic region check.
        Reason: PyMuPDF detects tables based on text, so it accurately recognizes
        tables even when cells with background colors are mistaken as graphics.
        """
        validated = []

        for candidate in candidates:
            # PyMuPDF strategy skips graphic region check
            skip_graphic_check = (candidate.strategy == TableDetectionStrategy.PYMUPDF_NATIVE)

            is_valid, new_confidence, reason = self.quality_validator.validate(
                data=candidate.data,
                bbox=candidate.bbox,
                cells_info=candidate.cells,
                skip_graphic_check=skip_graphic_check  # New parameter
            )

            if is_valid:
                # Adjust confidence based on validation result
                adjusted_confidence = min(candidate.confidence, new_confidence)

                validated.append(TableCandidate(
                    strategy=candidate.strategy,
                    confidence=adjusted_confidence,
                    bbox=candidate.bbox,
                    grid=candidate.grid,
                    cells=candidate.cells,
                    data=candidate.data,
                    raw_table=candidate.raw_table
                ))
            else:
                logger.debug(f"[TableDetection] Filtered out candidate: page={self.page_num+1}, "
                             f"bbox={candidate.bbox}, reason={reason}")

        return validated

    def _detect_with_pymupdf(self) -> List[TableCandidate]:
        """Use PyMuPDF find_tables() (tolerance settings to resolve double-line issues)."""
        candidates = []

        if not hasattr(self.page, 'find_tables'):
            return candidates

        try:
            # Apply same tolerance settings as pdf_handler.py
            # Resolves fake column creation due to double/triple line borders
            # snap_tolerance: Snaps nearby coordinates together
            # join_tolerance: Joins nearby lines together
            # edge_min_length: Ignores short lines (border lines)
            # intersection_tolerance: Intersection detection tolerance
            tabs = self.page.find_tables(
                snap_tolerance=7,
                join_tolerance=7,
                edge_min_length=10,
                intersection_tolerance=7,
            )

            for table_idx, table in enumerate(tabs.tables):
                try:
                    table_data = table.extract()

                    if not table_data or not any(any(cell for cell in row if cell) for row in table_data):
                        continue

                    # Narrow column merge processing
                    merged_data, col_mapping = self._merge_narrow_columns(
                        table_data, table.cells if hasattr(table, 'cells') else None
                    )

                    # Calculate confidence (with merged data)
                    confidence = self._calculate_pymupdf_confidence(table, merged_data)

                    if confidence < self.CONFIDENCE_THRESHOLD:
                        continue

                    # Extract cell info (apply col_mapping)
                    cells = self._extract_cells_from_pymupdf_with_mapping(table, col_mapping)

                    candidates.append(TableCandidate(
                        strategy=TableDetectionStrategy.PYMUPDF_NATIVE,
                        confidence=confidence,
                        bbox=table.bbox,
                        grid=None,
                        cells=cells,
                        data=merged_data,
                        raw_table=table
                    ))

                except Exception as e:
                    logger.debug(f"[PDF] PyMuPDF table extraction error: {e}")
                    continue

        except Exception as e:
            logger.debug(f"[PDF] PyMuPDF find_tables error: {e}")

        return candidates

    def _merge_narrow_columns(
        self,
        data: List[List],
        cells: List[Tuple] = None,
        min_col_width: float = 15.0
    ) -> Tuple[List[List[str]], Dict[int, int]]:
        """
        Merge narrow columns with adjacent columns.

        Removes fake columns generated by double/triple line borders in PDF.

        Args:
            data: Table data
            cells: PyMuPDF cell bbox list
            min_col_width: Minimum column width (pt)

        Returns:
            (merged data, original column -> new column mapping)
        """
        if not data or not data[0]:
            return data, {}

        num_cols = max(len(row) for row in data)

        # Analyze columns based on text if no cell info
        if not cells:
            return self._merge_columns_by_content(data)

        # Calculate width per column
        col_widths = self._calculate_column_widths(cells, num_cols)

        # Determine column groups to merge
        col_groups = self._determine_column_groups(col_widths, min_col_width)

        if len(col_groups) == num_cols:
            # No merge needed
            return data, {i: i for i in range(num_cols)}

        # Create column mapping
        col_mapping = {}
        for new_idx, group in enumerate(col_groups):
            for old_idx in group:
                col_mapping[old_idx] = new_idx

        # Merge data
        merged_data = []
        for row in data:
            new_row = [''] * len(col_groups)
            for old_idx, cell_val in enumerate(row):
                if old_idx in col_mapping:
                    new_idx = col_mapping[old_idx]
                    if cell_val and str(cell_val).strip():
                        if new_row[new_idx]:
                            new_row[new_idx] += str(cell_val).strip()
                        else:
                            new_row[new_idx] = str(cell_val).strip()
            merged_data.append(new_row)

        logger.debug(f"[TableDetection] Merged {num_cols} columns -> {len(col_groups)} columns")

        return merged_data, col_mapping

    def _calculate_column_widths(self, cells: List[Tuple], num_cols: int) -> List[float]:
        """Calculate column widths from cell bbox."""
        if not cells:
            return [0.0] * num_cols

        # Collect X coordinates
        x_coords = sorted(set([c[0] for c in cells if c] + [c[2] for c in cells if c]))

        if len(x_coords) < 2:
            return [0.0] * num_cols

        # Calculate column widths
        widths = []
        for i in range(len(x_coords) - 1):
            widths.append(x_coords[i + 1] - x_coords[i])

        # Match num_cols
        if len(widths) < num_cols:
            widths.extend([0.0] * (num_cols - len(widths)))
        elif len(widths) > num_cols:
            widths = widths[:num_cols]

        return widths

    def _determine_column_groups(
        self,
        col_widths: List[float],
        min_width: float
    ) -> List[List[int]]:
        """
        Determine column groups to merge based on column widths.

        Narrow columns are merged with the next wider column.
        """
        groups = []
        current_group = []

        for idx, width in enumerate(col_widths):
            current_group.append(idx)

            # Finalize group when total width meets minimum
            group_width = sum(col_widths[i] for i in current_group)

            if group_width >= min_width:
                groups.append(current_group)
                current_group = []

        # Handle last group
        if current_group:
            if groups:
                # Merge with previous group
                groups[-1].extend(current_group)
            else:
                groups.append(current_group)

        return groups

    def _merge_columns_by_content(self, data: List[List]) -> Tuple[List[List[str]], Dict[int, int]]:
        """
        Merge empty columns based on text content.

        Columns empty in most rows are merged with adjacent columns.
        """
        if not data or not data[0]:
            return data, {}

        num_cols = max(len(row) for row in data)
        num_rows = len(data)

        # Calculate "emptiness" ratio for each column
        empty_ratios = []
        for col_idx in range(num_cols):
            empty_count = 0
            for row in data:
                if col_idx >= len(row) or not row[col_idx] or not str(row[col_idx]).strip():
                    empty_count += 1
            empty_ratios.append(empty_count / num_rows if num_rows > 0 else 1.0)

        # Find columns with 90%+ empty ratio and merge with adjacent
        groups = []
        current_group = []

        for col_idx, empty_ratio in enumerate(empty_ratios):
            current_group.append(col_idx)

            # Finalize group for non-empty columns
            if empty_ratio < 0.9:
                groups.append(current_group)
                current_group = []

        # Handle last group
        if current_group:
            if groups:
                groups[-1].extend(current_group)
            else:
                groups.append(current_group)

        if len(groups) == num_cols:
            return data, {i: i for i in range(num_cols)}

        # Create column mapping
        col_mapping = {}
        for new_idx, group in enumerate(groups):
            for old_idx in group:
                col_mapping[old_idx] = new_idx

        # Merge data
        merged_data = []
        for row in data:
            new_row = [''] * len(groups)
            for old_idx, cell_val in enumerate(row):
                if old_idx in col_mapping:
                    new_idx = col_mapping[old_idx]
                    if cell_val and str(cell_val).strip():
                        if new_row[new_idx]:
                            new_row[new_idx] += str(cell_val).strip()
                        else:
                            new_row[new_idx] = str(cell_val).strip()
            merged_data.append(new_row)

        logger.debug(f"[TableDetection] Content-based merge: {num_cols} -> {len(groups)} columns")

        return merged_data, col_mapping

    def _extract_cells_from_pymupdf_with_mapping(
        self,
        table,
        col_mapping: Dict[int, int]
    ) -> List[CellInfo]:
        """
        Extract cell info with column mapping applied.
        """
        if not col_mapping:
            return self._extract_cells_from_pymupdf(table)

        cells = self._extract_cells_from_pymupdf(table)

        if not cells:
            return cells

        # Calculate mapped column count
        new_col_count = max(col_mapping.values()) + 1 if col_mapping else 0

        # Remap cell info
        remapped_cells = []
        processed_positions = set()

        for cell in cells:
            old_col = cell.col
            new_col = col_mapping.get(old_col, old_col)

            # Skip if cell already exists at same position
            if (cell.row, new_col) in processed_positions:
                continue

            # Recalculate colspan: consider merged columns
            new_colspan = 1
            for c in range(cell.col, cell.col + cell.colspan):
                mapped_c = col_mapping.get(c, c)
                if mapped_c != new_col:
                    new_colspan = max(new_colspan, mapped_c - new_col + 1)

            new_colspan = min(new_colspan, new_col_count - new_col)

            remapped_cells.append(CellInfo(
                row=cell.row,
                col=new_col,
                rowspan=cell.rowspan,
                colspan=max(1, new_colspan),
                bbox=cell.bbox
            ))

            # Record covered positions
            for r in range(cell.row, cell.row + cell.rowspan):
                for c in range(new_col, new_col + max(1, new_colspan)):
                    processed_positions.add((r, c))

        return remapped_cells

    def _calculate_pymupdf_confidence(self, table, data: List[List]) -> float:
        """
        Calculate PyMuPDF result confidence.

        Features:
        - Higher base score (trusting PyMuPDF results)
        - Relaxed penalties
        - Stronger cell info bonus
        """
        score = 0.0

        # Higher base score (PyMuPDF is highly reliable)
        score += 0.6

        # Score based on row/column count
        num_rows = len(data)
        if num_rows >= self.MIN_TABLE_ROWS:
            score += 0.1
        if table.col_count >= self.MIN_TABLE_COLS:
            score += 0.1

        # Score based on data density (relaxed penalties)
        total_cells = sum(len(row) for row in data)
        filled_cells = sum(1 for row in data for cell in row if cell and str(cell).strip())

        if total_cells > 0:
            density = filled_cells / total_cells

            if density < 0.05:
                # Penalty only for very low density
                score -= 0.2
            elif density < 0.1:
                score -= 0.1
            else:
                score += density * 0.15

        # Additional score for cell info (stronger bonus)
        if hasattr(table, 'cells') and table.cells:
            score += 0.15

        # Check meaningful cell count (relaxed penalty)
        meaningful_count = sum(
            1 for row in data for cell in row
            if cell and len(str(cell).strip()) >= 2
        )

        if meaningful_count < 2:
            score -= 0.1

        # Check valid row count (relaxed penalty)
        valid_rows = sum(1 for row in data if any(cell and str(cell).strip() for cell in row))
        if valid_rows <= 1:
            score -= 0.1

        # Check graphic region overlap (relaxed penalty)
        if self.graphic_detector:
            if self.graphic_detector.is_bbox_in_graphic_region(table.bbox, threshold=0.5):
                score -= 0.15

        return max(0.0, min(1.0, score))

    def _extract_cells_from_pymupdf(self, table) -> List[CellInfo]:
        """
        Extract cell info from PyMuPDF table.

        Applies logic from pdf_handler_default's _extract_cell_spans_from_table():
        1. Extract physical bbox for each cell from table.cells
        2. Map Y coordinates to row indices, X coordinates to column indices
        3. Calculate rowspan/colspan if cell bbox spans multiple grid cells
        """
        cells = []

        if not hasattr(table, 'cells') or not table.cells:
            # Return empty list if no cell info (handled by CellAnalysisEngine)
            return cells

        raw_cells = table.cells
        if not raw_cells:
            return cells

        # Extract X, Y boundary lines (same approach as pdf_handler_default)
        x_coords = sorted(set([c[0] for c in raw_cells if c] + [c[2] for c in raw_cells if c]))
        y_coords = sorted(set([c[1] for c in raw_cells if c] + [c[3] for c in raw_cells if c]))

        if len(x_coords) < 2 or len(y_coords) < 2:
            # Return basic cell info if grid cannot be constructed
            for idx, cell_bbox in enumerate(raw_cells):
                if cell_bbox is None:
                    continue
                num_rows = len(table.extract()) if hasattr(table, 'extract') else 0
                row_idx = idx // max(1, table.col_count) if hasattr(table, 'col_count') else 0
                col_idx = idx % max(1, table.col_count) if hasattr(table, 'col_count') else 0
                cells.append(CellInfo(
                    row=row_idx,
                    col=col_idx,
                    rowspan=1,
                    colspan=1,
                    bbox=cell_bbox
                ))
            return cells

        # Function to map coordinates to grid indices (same as pdf_handler_default)
        def coord_to_index(coord: float, coords: List[float], tolerance: float = 3.0) -> int:
            for i, c in enumerate(coords):
                if abs(coord - c) <= tolerance:
                    return i
            # Return closest index
            return min(range(len(coords)), key=lambda i: abs(coords[i] - coord))

        # Track processed grid positions
        processed_positions: Set[Tuple[int, int]] = set()

        for cell_bbox in raw_cells:
            if cell_bbox is None:
                continue

            x0, y0, x1, y1 = cell_bbox[:4]

            col_start = coord_to_index(x0, x_coords)
            col_end = coord_to_index(x1, x_coords)
            row_start = coord_to_index(y0, y_coords)
            row_end = coord_to_index(y1, y_coords)

            colspan = max(1, col_end - col_start)
            rowspan = max(1, row_end - row_start)

            if (row_start, col_start) in processed_positions:
                continue

            processed_positions.add((row_start, col_start))

            cells.append(CellInfo(
                row=row_start,
                col=col_start,
                rowspan=rowspan,
                colspan=colspan,
                bbox=cell_bbox
            ))

            # Mark other cells in merged area
            for r in range(row_start, row_start + rowspan):
                for c in range(col_start, col_start + colspan):
                    if (r, c) != (row_start, col_start):
                        processed_positions.add((r, c))

        return cells

    def _cluster_grid_positions(self, positions: List[float], tolerance: float = 3.0) -> List[float]:
        """
        Cluster grid positions.

        Merge nearby lines into one.
        """
        if not positions:
            return []

        sorted_pos = sorted(set(positions))
        if len(sorted_pos) == 0:
            return []

        clusters: List[List[float]] = [[sorted_pos[0]]]

        for pos in sorted_pos[1:]:
            if pos - clusters[-1][-1] <= tolerance:
                clusters[-1].append(pos)
            else:
                clusters.append([pos])

        # Return average value of each cluster
        return [sum(c) / len(c) for c in clusters]

    def _find_grid_index_v2(self, value: float, grid_lines: List[float],
                            tolerance: float = 5.0) -> Optional[int]:
        """
        Find index of value in grid lines (improved version).

        If exact matching fails, select the closest line.
        """
        if not grid_lines:
            return None

        # Try exact matching
        for i, line in enumerate(grid_lines):
            if abs(value - line) <= tolerance:
                return i

        # Find closest line
        min_diff = float('inf')
        closest_idx = 0

        for i, line in enumerate(grid_lines):
            diff = abs(value - line)
            if diff < min_diff:
                min_diff = diff
                closest_idx = i

        # Return if within 3x tolerance
        if min_diff <= tolerance * 3:
            return closest_idx

        return None

    def _find_grid_index(self, value: float, grid_lines: List[float], tolerance: float = 3.0) -> Optional[int]:
        """Find index of value in grid lines (for compatibility)."""
        return self._find_grid_index_v2(value, grid_lines, tolerance)

    def _detect_with_pdfplumber(self) -> List[TableCandidate]:
        """Use pdfplumber for table detection."""
        candidates = []

        try:
            with pdfplumber.open(self.file_path) as pdf:
                if self.page_num >= len(pdf.pages):
                    return candidates

                plumber_page = pdf.pages[self.page_num]

                # Table settings
                settings = {
                    "vertical_strategy": "lines",
                    "horizontal_strategy": "lines",
                    "snap_tolerance": 5,
                    "join_tolerance": 5,
                }

                tables = plumber_page.extract_tables(settings)

                for table_idx, table_data in enumerate(tables):
                    if not table_data or not any(any(cell for cell in row if cell) for row in table_data):
                        continue

                    # Estimate bbox
                    bbox = self._estimate_table_bbox_pdfplumber(plumber_page, table_data)

                    if not bbox:
                        continue

                    confidence = self._calculate_pdfplumber_confidence(table_data)

                    if confidence < self.CONFIDENCE_THRESHOLD:
                        continue

                    candidates.append(TableCandidate(
                        strategy=TableDetectionStrategy.PDFPLUMBER_LINES,
                        confidence=confidence,
                        bbox=bbox,
                        grid=None,
                        cells=[],
                        data=table_data,
                        raw_table=None
                    ))

        except Exception as e:
            logger.debug(f"[PDF] pdfplumber error: {e}")

        return candidates

    def _estimate_table_bbox_pdfplumber(self, page, data: List[List]) -> Optional[Tuple[float, float, float, float]]:
        """Estimate pdfplumber table bbox."""
        try:
            words = page.extract_words()
            if not words:
                return None

            table_texts = set()
            for row in data:
                for cell in row:
                    if cell and str(cell).strip():
                        table_texts.add(str(cell).strip()[:20])

            matching_words = []
            for word in words:
                if any(word['text'] in text or text in word['text'] for text in table_texts):
                    matching_words.append(word)

            if not matching_words:
                return None

            x0 = min(w['x0'] for w in matching_words)
            y0 = min(w['top'] for w in matching_words)
            x1 = max(w['x1'] for w in matching_words)
            y1 = max(w['bottom'] for w in matching_words)

            margin = 5
            return (x0 - margin, y0 - margin, x1 + margin, y1 + margin)

        except Exception:
            return None

    def _calculate_pdfplumber_confidence(self, data: List[List]) -> float:
        """Calculate pdfplumber result confidence."""
        score = 0.0

        # Base score (slightly lower than PyMuPDF)
        score += 0.4

        num_rows = len(data)
        col_count = max(len(row) for row in data) if data else 0

        if num_rows >= self.MIN_TABLE_ROWS:
            score += 0.1
        if col_count >= self.MIN_TABLE_COLS:
            score += 0.1

        # Data density
        total_cells = sum(len(row) for row in data)
        filled_cells = sum(1 for row in data for cell in row if cell and str(cell).strip())

        if total_cells > 0:
            density = filled_cells / total_cells

            if density < 0.1:
                score -= 0.5
            elif density < 0.2:
                score -= 0.3
            else:
                score += density * 0.2

        # Meaningful cell count
        meaningful_count = sum(
            1 for row in data for cell in row
            if cell and len(str(cell).strip()) >= 2
        )

        if meaningful_count < 2:
            score -= 0.3

        # Valid row count
        valid_rows = sum(1 for row in data if any(cell and str(cell).strip() for cell in row))
        if valid_rows <= 1:
            score -= 0.2

        # Empty row ratio
        empty_rows = num_rows - valid_rows
        if num_rows > 0 and empty_rows / num_rows > 0.5:
            score -= 0.2

        return max(0.0, min(1.0, score))

    def _detect_with_lines(self) -> List[TableCandidate]:
        """Line analysis based table detection."""
        candidates = []

        # Build grid
        grid = self.line_engine.build_grid()

        if not grid:
            return candidates

        # Recover incomplete border
        if not grid.is_complete:
            grid = self.line_engine.reconstruct_incomplete_border(grid)
            if not grid.is_complete:
                return candidates

        # Check if grid is valid
        if grid.row_count < self.MIN_TABLE_ROWS or grid.col_count < self.MIN_TABLE_COLS:
            return candidates

        # Extract text from cells
        data = self._extract_text_from_grid(grid)

        if not data or not any(any(cell for cell in row if cell) for row in data):
            return candidates

        # Create cell info
        cells = self._create_cells_from_grid(grid)

        # Calculate confidence
        confidence = self._calculate_line_based_confidence(grid, data)

        if confidence < self.CONFIDENCE_THRESHOLD:
            return candidates

        candidates.append(TableCandidate(
            strategy=TableDetectionStrategy.HYBRID_ANALYSIS,
            confidence=confidence,
            bbox=grid.bbox,
            grid=grid,
            cells=cells,
            data=data,
            raw_table=None
        ))

        return candidates

    def _extract_text_from_grid(self, grid: GridInfo) -> List[List[Optional[str]]]:
        """Extract text from grid cells."""
        data = []

        page_dict = self.page.get_text("dict", sort=True)

        for row_idx in range(grid.row_count):
            row_data = []
            y0 = grid.h_lines[row_idx]
            y1 = grid.h_lines[row_idx + 1]

            for col_idx in range(grid.col_count):
                x0 = grid.v_lines[col_idx]
                x1 = grid.v_lines[col_idx + 1]

                cell_bbox = (x0, y0, x1, y1)
                cell_text = self._get_text_in_bbox(page_dict, cell_bbox)
                row_data.append(cell_text)

            data.append(row_data)

        return data

    def _get_text_in_bbox(self, page_dict: dict, bbox: Tuple[float, float, float, float]) -> str:
        """Extract text within bbox."""
        x0, y0, x1, y1 = bbox
        texts = []

        for block in page_dict.get("blocks", []):
            if block.get("type") != 0:
                continue

            for line in block.get("lines", []):
                line_bbox = line.get("bbox", (0, 0, 0, 0))

                if self._bbox_overlaps(line_bbox, bbox):
                    line_text = ""
                    for span in line.get("spans", []):
                        span_bbox = span.get("bbox", (0, 0, 0, 0))
                        if self._bbox_overlaps(span_bbox, bbox):
                            line_text += span.get("text", "")

                    if line_text.strip():
                        texts.append(line_text.strip())

        return " ".join(texts)

    def _bbox_overlaps(self, bbox1: Tuple, bbox2: Tuple, threshold: float = 0.3) -> bool:
        """Check if two bboxes overlap."""
        x0 = max(bbox1[0], bbox2[0])
        y0 = max(bbox1[1], bbox2[1])
        x1 = min(bbox1[2], bbox2[2])
        y1 = min(bbox1[3], bbox2[3])

        if x1 <= x0 or y1 <= y0:
            return False

        overlap_area = (x1 - x0) * (y1 - y0)
        bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])

        if bbox1_area <= 0:
            return False

        return overlap_area / bbox1_area >= threshold

    def _create_cells_from_grid(self, grid: GridInfo) -> List[CellInfo]:
        """Create cell info from grid."""
        cells = []

        for row_idx in range(grid.row_count):
            y0 = grid.h_lines[row_idx]
            y1 = grid.h_lines[row_idx + 1]

            for col_idx in range(grid.col_count):
                x0 = grid.v_lines[col_idx]
                x1 = grid.v_lines[col_idx + 1]

                cells.append(CellInfo(
                    row=row_idx,
                    col=col_idx,
                    rowspan=1,
                    colspan=1,
                    bbox=(x0, y0, x1, y1)
                ))

        return cells

    def _calculate_line_based_confidence(self, grid: GridInfo, data: List[List]) -> float:
        """Calculate line-based result confidence."""
        score = 0.0

        # Base score (lower than other strategies)
        score += 0.3

        # Grid completeness
        if grid.is_complete:
            score += 0.2
        elif grid.reconstructed:
            score += 0.1

        # Row/column count
        if grid.row_count >= self.MIN_TABLE_ROWS:
            score += 0.1
        if grid.col_count >= self.MIN_TABLE_COLS:
            score += 0.1

        # Data density
        total_cells = sum(len(row) for row in data)
        filled_cells = sum(1 for row in data for cell in row if cell and str(cell).strip())

        if total_cells > 0:
            density = filled_cells / total_cells

            if density < 0.1:
                score -= 0.4
            elif density < 0.2:
                score -= 0.2
            else:
                score += density * 0.2

        # Meaningful cell count
        meaningful_count = sum(
            1 for row in data for cell in row
            if cell and len(str(cell).strip()) >= 2
        )

        if meaningful_count < 2:
            score -= 0.2

        # Valid row count
        valid_rows = sum(1 for row in data if any(cell and str(cell).strip() for cell in row))
        if valid_rows <= 1:
            score -= 0.2

        # Check graphic region overlap
        if self.graphic_detector:
            if self.graphic_detector.is_bbox_in_graphic_region(grid.bbox, threshold=0.3):
                score -= 0.3

        return max(0.0, min(1.0, score))

    def _select_best_candidates(self, candidates: List[TableCandidate]) -> List[TableCandidate]:
        """
        Select best table candidates.

        Strongly reflects strategy priority:
        - PyMuPDF is most accurate, so PyMuPDF results are preferred in the same region
        - If confidence difference is less than 0.2, select by strategy priority
        """
        if not candidates:
            return []

        # Strategy priority: PYMUPDF > PDFPLUMBER > HYBRID
        priority_order = {
            TableDetectionStrategy.PYMUPDF_NATIVE: 0,
            TableDetectionStrategy.PDFPLUMBER_LINES: 1,
            TableDetectionStrategy.HYBRID_ANALYSIS: 2,
            TableDetectionStrategy.BORDERLESS_HEURISTIC: 3,
        }

        # Changed sort key - prioritize strategy order more
        # If confidence difference is not large, decide by strategy priority
        def sort_key(c):
            # Subtract strategy priority * 0.15 from confidence
            # This makes PyMuPDF (priority=0) more favorable than pdfplumber (priority=1)
            adjusted_confidence = c.confidence - (priority_order.get(c.strategy, 99) * 0.15)
            return (-adjusted_confidence, priority_order.get(c.strategy, 99))

        candidates_sorted = sorted(candidates, key=sort_key)

        selected = []

        for candidate in candidates_sorted:
            overlaps = False

            for selected_candidate in selected:
                if self._tables_overlap_any(candidate.bbox, selected_candidate.bbox):
                    overlaps = True
                    break

            if not overlaps:
                selected.append(candidate)

        return selected

    def _tables_overlap_any(self, bbox1: Tuple, bbox2: Tuple, threshold: float = 0.3) -> bool:
        """
        Check if two tables overlap (improved version).

        Returns True if either one is covered by the other by threshold or more.
        """
        x0 = max(bbox1[0], bbox2[0])
        y0 = max(bbox1[1], bbox2[1])
        x1 = min(bbox1[2], bbox2[2])
        y1 = min(bbox1[3], bbox2[3])

        if x1 <= x0 or y1 <= y0:
            return False

        overlap_area = (x1 - x0) * (y1 - y0)
        bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
        bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])

        if bbox1_area <= 0 or bbox2_area <= 0:
            return False

        # Consider overlapping if either side is covered by threshold or more
        ratio1 = overlap_area / bbox1_area
        ratio2 = overlap_area / bbox2_area

        return ratio1 >= threshold or ratio2 >= threshold


# ============================================================================
# Export
# ============================================================================

__all__ = [
    'TableDetectionEngine',
]
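
For orientation, a minimal driving sketch follows. It is inferred only from the constructor and detect_tables() signatures in the hunk above; the sample path is hypothetical, and the package's real call site (pdf_handler.py, listed but not shown in this diff) may differ.

# Usage sketch, not the package's own entry point: assumes a readable PDF at
# "sample.pdf" (hypothetical) and relies only on the signatures shown above.
import fitz  # PyMuPDF

from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine

pdf_path = "sample.pdf"
doc = fitz.open(pdf_path)

for page_num, page in enumerate(doc):
    # One engine per page: the constructor runs line analysis and graphic
    # region detection eagerly, so construction is not free.
    engine = TableDetectionEngine(page, page_num, pdf_path)
    for cand in engine.detect_tables():
        print(f"page {page_num + 1}: strategy={cand.strategy}, "
              f"confidence={cand.confidence:.2f}, bbox={cand.bbox}")
        for row in cand.data:
            print(row)

doc.close()

Note that detect_tables() already applies quality validation and overlap deduplication, so the candidates it returns are the final per-page selection rather than raw per-strategy output.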