xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1346 @@
1
+ """
2
+ Table Detection Engine for PDF Handler
3
+
4
+ Detects tables using multiple strategies and selects the best results.
5
+ Includes graphic region exclusion and fake table filtering capabilities.
6
+ Improved cell extraction accuracy.
7
+ """
8
+
9
+ import logging
10
+ from typing import List, Dict, Optional, Tuple, Any, Set
11
+
12
+ import fitz
13
+ import pdfplumber
14
+
15
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
16
+ PDFConfig,
17
+ TableDetectionStrategy,
18
+ GridInfo,
19
+ CellInfo,
20
+ TableCandidate,
21
+ )
22
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_line_analysis import LineAnalysisEngine
23
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_graphic_detector import GraphicRegionDetector
24
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_validator import TableQualityValidator
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ============================================================================
30
+ # Table Detection Engine
31
+ # ============================================================================
32
+
33
+ class TableDetectionEngine:
34
+ """
35
+ Table Detection Engine
36
+
37
+ Detects tables using multiple strategies and selects the best results.
38
+
39
+ Features:
40
+ - GraphicRegionDetector integration to exclude vector graphic regions
41
+ - TableQualityValidator integration to filter fake tables
42
+
43
+ Supported Strategies:
44
+ 1. PyMuPDF find_tables() - Most accurate, preferred
45
+ 2. pdfplumber - Line-based detection
46
+ 3. Line-based - Direct line analysis
47
+ """
48
+
49
+ # Configuration constants
50
+ CONFIDENCE_THRESHOLD = getattr(PDFConfig, 'CONFIDENCE_THRESHOLD', 0.5)
51
+ MIN_TABLE_ROWS = getattr(PDFConfig, 'MIN_TABLE_ROWS', 2)
52
+ MIN_TABLE_COLS = getattr(PDFConfig, 'MIN_TABLE_COLS', 2)
53
+
54
    def __init__(self, page, page_num: int, file_path: str):
        """
        Initialize the detection engine for a single PDF page.

        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
            file_path: PDF file path
        """
        self.page = page
        self.page_num = page_num
        self.file_path = file_path
        # Cached page dimensions (points) used by the line analysis below.
        self.page_width = page.rect.width
        self.page_height = page.rect.height

        # Line analysis engine: extract horizontal/vertical ruling lines once
        # so every detection strategy can reuse them.
        self.line_engine = LineAnalysisEngine(page, self.page_width, self.page_height)
        self.h_lines, self.v_lines = self.line_engine.analyze()

        # Graphic region detector: vector-graphic areas are later used to
        # filter out fake tables (charts/diagrams mistaken for grids).
        self.graphic_detector = GraphicRegionDetector(page, page_num)
        self.graphic_regions = self.graphic_detector.detect()

        # Table quality validator (shares the graphic detector above).
        self.quality_validator = TableQualityValidator(page, self.graphic_detector)
78
    def detect_tables(self) -> List[TableCandidate]:
        """
        Detect tables using all strategies.

        Runs PyMuPDF, pdfplumber, and line-based detection in turn, merges
        adjacent header/data fragments before validation, filters fake
        tables via the quality validator, and keeps the best candidates.

        Returns:
            List of table candidates sorted by confidence.
        """
        candidates: List[TableCandidate] = []

        # Strategy 1: PyMuPDF
        pymupdf_candidates = self._detect_with_pymupdf()

        # Pre-merge adjacent header-data tables (before validation) so a
        # split header/body pair is scored as one table.
        pymupdf_candidates = self._merge_header_data_tables(pymupdf_candidates)
        candidates.extend(pymupdf_candidates)

        # Strategy 2: pdfplumber
        pdfplumber_candidates = self._detect_with_pdfplumber()
        pdfplumber_candidates = self._merge_header_data_tables(pdfplumber_candidates)
        candidates.extend(pdfplumber_candidates)

        # Strategy 3: Line-based (HYBRID_ANALYSIS)
        # Used only when PyMuPDF and pdfplumber don't find tables,
        # or used additionally with stricter validation.
        line_candidates = self._detect_with_lines()

        # Enhanced cross-validation for HYBRID results
        if line_candidates and not pymupdf_candidates:
            # When PyMuPDF didn't find tables but HYBRID did,
            # apply a higher confidence threshold (0.65 or above).
            line_candidates = [
                c for c in line_candidates
                if c.confidence >= 0.65
            ]
            logger.debug(f"[TableDetection] HYBRID-only detection: "
                         f"{len(line_candidates)} candidates passed stricter threshold (0.65)")

        candidates.extend(line_candidates)

        # Filter fake tables through quality validation
        validated_candidates = self._validate_candidates(candidates)

        # Select best candidates based on confidence
        selected = self._select_best_candidates(validated_candidates)

        return selected
125
    def _merge_header_data_tables(self, candidates: List[TableCandidate]) -> List[TableCandidate]:
        """
        Merge adjacent header-data tables.

        Some detectors split one logical table into a short "header" table
        and the data grid below it; this stitches such pairs back together.

        Merge conditions (enforced by _can_merge_header_data):
        1. First table has 1-2 rows (assumed to be header)
        2. Second table is directly below (slight overlap allowed, Y gap <= 40pt)
        3. X ranges overlap by 70% or more of the wider table
        4. Column count relationship: header columns <= data columns + 1

        Args:
            candidates: Table candidates from a single detection strategy.

        Returns:
            Candidates with mergeable header/data pairs fused into one.
        """
        if len(candidates) < 2:
            return candidates

        # Sort by Y position so a header is always visited before its data.
        sorted_candidates = sorted(candidates, key=lambda c: c.bbox[1])
        merged = []
        skip_indices = set()  # indices already consumed as data parts

        for i, header_cand in enumerate(sorted_candidates):
            if i in skip_indices:
                continue

            # Check header candidate condition (1-2 rows)
            if len(header_cand.data) > 2:
                merged.append(header_cand)
                continue

            # Greedily absorb consecutive tables below while they qualify.
            merged_cand = header_cand
            for j in range(i + 1, len(sorted_candidates)):
                if j in skip_indices:
                    continue

                data_cand = sorted_candidates[j]

                if self._can_merge_header_data(merged_cand, data_cand):
                    merged_cand = self._do_merge_header_data(merged_cand, data_cand)
                    skip_indices.add(j)
                    logger.debug(f"[TableDetection] Merged header with data table: "
                                 f"header rows={len(header_cand.data)}, "
                                 f"data rows={len(data_cand.data)}")
                else:
                    # Tables are Y-sorted; the first non-mergeable one ends the run.
                    break

            merged.append(merged_cand)

        return merged
173
+ def _can_merge_header_data(self, header: TableCandidate, data: TableCandidate) -> bool:
174
+ """Determine if header and data tables can be merged."""
175
+ # Check Y gap
176
+ y_gap = data.bbox[1] - header.bbox[3]
177
+ if y_gap < -5 or y_gap > 40: # Allow slight overlap, max 40pt gap
178
+ return False
179
+
180
+ # Check X range overlap
181
+ x_overlap_start = max(header.bbox[0], data.bbox[0])
182
+ x_overlap_end = min(header.bbox[2], data.bbox[2])
183
+ x_overlap = max(0, x_overlap_end - x_overlap_start)
184
+
185
+ header_width = header.bbox[2] - header.bbox[0]
186
+ data_width = data.bbox[2] - data.bbox[0]
187
+ max_width = max(header_width, data_width)
188
+
189
+ if max_width > 0 and x_overlap / max_width < 0.7:
190
+ return False
191
+
192
+ # Check column count relationship
193
+ header_cols = max(len(row) for row in header.data) if header.data else 0
194
+ data_cols = max(len(row) for row in data.data) if data.data else 0
195
+
196
+ # Don't merge if header has more columns than data
197
+ if header_cols > data_cols + 1:
198
+ return False
199
+
200
+ return True
201
+
202
    def _do_merge_header_data(self, header: TableCandidate, data: TableCandidate) -> TableCandidate:
        """
        Perform header and data table merge (includes subheader detection).

        Builds a single TableCandidate whose rows are: header rows (padded or
        colspan-adjusted to the merged column count), an optional detected
        subheader row, then the data rows. Cell span metadata is carried over
        with row offsets applied to the data cells.

        Args:
            header: Upper candidate (already vetted by _can_merge_header_data).
            data: Lower candidate supplying the body rows.

        Returns:
            Merged TableCandidate covering both bounding boxes.
        """
        # New bbox: top of header to bottom of data, widest X extent of both.
        merged_bbox = (
            min(header.bbox[0], data.bbox[0]),
            header.bbox[1],
            max(header.bbox[2], data.bbox[2]),
            data.bbox[3]
        )

        # Determine merged column count (widest of the two tables).
        header_cols = max(len(row) for row in header.data) if header.data else 0
        data_cols = max(len(row) for row in data.data) if data.data else 0
        merged_cols = max(header_cols, data_cols)

        # Detect a sub-column header row (e.g. "(A)", "(B)") that may sit in
        # the gap between header and data.
        subheader_row = self._detect_subheader_between(header, data, merged_cols)

        # Merge data
        merged_data = []
        merged_cells = []  # dicts with row/col/rowspan/colspan/bbox keys

        # Process header rows: widen short rows up to merged_cols.
        for row_idx, row in enumerate(header.data):
            if len(row) < merged_cols:
                # Apply colspan if header has fewer columns
                adjusted_row = list(row)
                col_diff = merged_cols - len(row)

                # Apply colspan to the second column (treats it as one wide
                # merged header cell spanning the missing columns).
                if len(row) >= 2 and col_diff > 0:
                    # Store colspan info
                    merged_cells.append({
                        'row': row_idx,
                        'col': 1,
                        'rowspan': 1,
                        'colspan': 1 + col_diff,
                        'bbox': None
                    })
                    # Add empty columns after the spanning cell
                    for _ in range(col_diff):
                        adjusted_row.insert(2, '')
                else:
                    adjusted_row.extend([''] * col_diff)

                merged_data.append(adjusted_row)
            else:
                merged_data.append(list(row))

        # Insert subheader row (header cell info)
        header_row_count = len(header.data)
        if subheader_row:
            merged_data.append(subheader_row)
            # Add cell info for subheader row (each cell has colspan=1)
            subheader_row_idx = header_row_count  # Row after header
            for col_idx, cell_value in enumerate(subheader_row):
                merged_cells.append({
                    'row': subheader_row_idx,
                    'col': col_idx,
                    'rowspan': 1,
                    'colspan': 1,
                    'bbox': None
                })
            header_row_count += 1  # data rows shift down by one extra row
            logger.debug(f"[TableDetection] Added subheader row with cell info: {subheader_row}")

        # Carry over header cell info, skipping positions already claimed by
        # the synthetic colspan/subheader entries above.
        if header.cells:
            for cell in header.cells:
                if not any(c['row'] == cell.row and c['col'] == cell.col for c in merged_cells):
                    merged_cells.append({
                        'row': cell.row,
                        'col': cell.col,
                        'rowspan': cell.rowspan,
                        'colspan': cell.colspan,
                        'bbox': cell.bbox
                    })

        # Process data rows: pad short rows to the merged width.
        for row_idx, row in enumerate(data.data):
            if len(row) < merged_cols:
                adjusted_row = list(row) + [''] * (merged_cols - len(row))
            else:
                adjusted_row = list(row)
            merged_data.append(adjusted_row)

        # Data cell info (apply row offset past header + subheader rows)
        if data.cells:
            for cell in data.cells:
                merged_cells.append({
                    'row': cell.row + header_row_count,
                    'col': cell.col,
                    'rowspan': cell.rowspan,
                    'colspan': cell.colspan,
                    'bbox': cell.bbox
                })

        # Convert cell info to CellInfo objects
        cell_objects = [
            CellInfo(
                row=c['row'],
                col=c['col'],
                rowspan=c.get('rowspan', 1),
                colspan=c.get('colspan', 1),
                # Use default value if bbox is None or missing
                bbox=c.get('bbox') or (0, 0, 0, 0)
            )
            for c in merged_cells
        ]

        return TableCandidate(
            strategy=header.strategy,
            confidence=max(header.confidence, data.confidence),
            bbox=merged_bbox,
            grid=header.grid or data.grid,
            cells=cell_objects,
            data=merged_data,
            raw_table=None
        )
322
    def _detect_subheader_between(self, header: TableCandidate, data: TableCandidate,
                                  num_cols: int) -> Optional[List[str]]:
        """
        Detect subheader row between header and data tables.

        Example: Sub-column headers like (A), (B), etc.

        Args:
            header: Upper (header) table candidate.
            data: Lower (data) table candidate.
            num_cols: Column count of the merged table; the returned row has
                exactly this many entries.
                NOTE(review): assumes num_cols > 0 — a zero value would divide
                by zero below; confirm callers never pass empty tables.

        Returns:
            The reconstructed subheader row, or None when no plausible
            "(X)"-style text is found in the gap.
        """
        header_bottom = header.bbox[3]
        data_top = data.bbox[1]

        # Must have a sufficient (but not excessive) gap between header and data.
        gap = data_top - header_bottom
        if gap < 5 or gap > 50:
            return None

        # Extract text from the region on the page (sorted reading order).
        page_dict = self.page.get_text("dict", sort=True)

        subheader_texts = []
        for block in page_dict.get("blocks", []):
            # type 0 == text block in PyMuPDF's "dict" output
            if block.get("type") != 0:
                continue

            for line in block.get("lines", []):
                line_bbox = line.get("bbox", (0, 0, 0, 0))
                line_y = (line_bbox[1] + line_bbox[3]) / 2  # vertical center

                # Check if located between header and data (5pt slack)
                if header_bottom - 5 <= line_y <= data_top + 5:
                    # Check if within table X range (10pt slack)
                    if line_bbox[0] >= header.bbox[0] - 10 and line_bbox[2] <= data.bbox[2] + 10:
                        for span in line.get("spans", []):
                            text = span.get("text", "").strip()
                            span_bbox = span.get("bbox", (0, 0, 0, 0))
                            if text and text not in [' ', '']:
                                subheader_texts.append({
                                    'text': text,
                                    'x0': span_bbox[0],
                                    'x1': span_bbox[2]
                                })

        if not subheader_texts:
            return None

        # Check subheader pattern: (A), (B), etc. — require at least one
        # parenthesized token before treating the gap text as a subheader.
        has_subheader_pattern = any('(' in t['text'] and ')' in t['text'] for t in subheader_texts)
        if not has_subheader_pattern:
            return None

        # Construct subheader row by binning each span into an equal-width
        # column slot across the merged table width.
        table_left = min(header.bbox[0], data.bbox[0])
        table_width = max(header.bbox[2], data.bbox[2]) - table_left
        col_width = table_width / num_cols

        subheader_row = [''] * num_cols
        for item in sorted(subheader_texts, key=lambda x: x['x0']):
            relative_x = item['x0'] - table_left
            col_idx = min(int(relative_x / col_width), num_cols - 1)
            col_idx = max(0, col_idx)  # clamp spans that start left of the table

            if subheader_row[col_idx]:
                subheader_row[col_idx] += ' ' + item['text']
            else:
                subheader_row[col_idx] = item['text']

        # Validate subheader (must have at least one (A), (B) pattern)
        valid_count = sum(1 for s in subheader_row if '(' in s and ')' in s)
        if valid_count < 1:
            return None

        return subheader_row
394
+ def _validate_candidates(self, candidates: List[TableCandidate]) -> List[TableCandidate]:
395
+ """
396
+ Validate table candidates for quality.
397
+
398
+ Validation criteria:
399
+ 1. Not overlapping with graphic regions (except PyMuPDF - text-based, high reliability)
400
+ 2. Sufficient filled cell ratio
401
+ 3. Has meaningful data
402
+
403
+ Tables detected with PyMuPDF strategy skip graphic region check.
404
+ Reason: PyMuPDF detects tables based on text, so it accurately recognizes
405
+ tables even when cells with background colors are mistaken as graphics.
406
+ """
407
+ validated = []
408
+
409
+ for candidate in candidates:
410
+ # PyMuPDF strategy skips graphic region check
411
+ skip_graphic_check = (candidate.strategy == TableDetectionStrategy.PYMUPDF_NATIVE)
412
+
413
+ is_valid, new_confidence, reason = self.quality_validator.validate(
414
+ data=candidate.data,
415
+ bbox=candidate.bbox,
416
+ cells_info=candidate.cells,
417
+ skip_graphic_check=skip_graphic_check # New parameter
418
+ )
419
+
420
+ if is_valid:
421
+ # Adjust confidence based on validation result
422
+ adjusted_confidence = min(candidate.confidence, new_confidence)
423
+
424
+ validated.append(TableCandidate(
425
+ strategy=candidate.strategy,
426
+ confidence=adjusted_confidence,
427
+ bbox=candidate.bbox,
428
+ grid=candidate.grid,
429
+ cells=candidate.cells,
430
+ data=candidate.data,
431
+ raw_table=candidate.raw_table
432
+ ))
433
+ else:
434
+ logger.debug(f"[TableDetection] Filtered out candidate: page={self.page_num+1}, "
435
+ f"bbox={candidate.bbox}, reason={reason}")
436
+
437
+ return validated
438
+
439
    def _detect_with_pymupdf(self) -> List[TableCandidate]:
        """
        Use PyMuPDF find_tables() (tolerance settings to resolve double-line issues).

        Returns:
            Table candidates above the confidence threshold (may be empty).
        """
        candidates = []

        # Older PyMuPDF releases lack find_tables(); degrade gracefully.
        if not hasattr(self.page, 'find_tables'):
            return candidates

        try:
            # Apply same tolerance settings as pdf_handler.py.
            # Resolves fake column creation due to double/triple line borders:
            #   snap_tolerance: snaps nearby coordinates together
            #   join_tolerance: joins nearby lines together
            #   edge_min_length: ignores short lines (border lines)
            #   intersection_tolerance: intersection detection tolerance
            tabs = self.page.find_tables(
                snap_tolerance=7,
                join_tolerance=7,
                edge_min_length=10,
                intersection_tolerance=7,
            )

            for table_idx, table in enumerate(tabs.tables):
                try:
                    table_data = table.extract()

                    # Skip tables with no non-empty cell at all.
                    if not table_data or not any(any(cell for cell in row if cell) for row in table_data):
                        continue

                    # Narrow column merge processing
                    merged_data, col_mapping = self._merge_narrow_columns(
                        table_data, table.cells if hasattr(table, 'cells') else None
                    )

                    # Calculate confidence (with merged data)
                    confidence = self._calculate_pymupdf_confidence(table, merged_data)

                    if confidence < self.CONFIDENCE_THRESHOLD:
                        continue

                    # Extract cell info (apply col_mapping)
                    cells = self._extract_cells_from_pymupdf_with_mapping(table, col_mapping)

                    candidates.append(TableCandidate(
                        strategy=TableDetectionStrategy.PYMUPDF_NATIVE,
                        confidence=confidence,
                        bbox=table.bbox,
                        grid=None,
                        cells=cells,
                        data=merged_data,
                        raw_table=table
                    ))

                except Exception as e:
                    # Per-table failures must not abort the rest of the page.
                    logger.debug(f"[PDF] PyMuPDF table extraction error: {e}")
                    continue

        except Exception as e:
            logger.debug(f"[PDF] PyMuPDF find_tables error: {e}")

        return candidates
500
    def _merge_narrow_columns(
        self,
        data: List[List],
        cells: Optional[List[Tuple]] = None,
        min_col_width: float = 15.0
    ) -> Tuple[List[List[str]], Dict[int, int]]:
        """
        Merge narrow columns with adjacent columns.

        Removes fake columns generated by double/triple line borders in PDF.

        Args:
            data: Table data (rows of cell values).
            cells: PyMuPDF cell bbox list; when absent, falls back to a
                content-based (column emptiness) merge.
            min_col_width: Minimum column width (pt) a real column must reach.

        Returns:
            (merged data, original column -> new column mapping)
        """
        if not data or not data[0]:
            return data, {}

        num_cols = max(len(row) for row in data)

        # Analyze columns based on text if no cell info is available.
        if not cells:
            return self._merge_columns_by_content(data)

        # Calculate width per column from the cell bboxes.
        col_widths = self._calculate_column_widths(cells, num_cols)

        # Determine column groups to merge (narrow columns fold forward).
        col_groups = self._determine_column_groups(col_widths, min_col_width)

        if len(col_groups) == num_cols:
            # No merge needed: identity mapping.
            return data, {i: i for i in range(num_cols)}

        # Create original-column -> merged-column mapping.
        col_mapping = {}
        for new_idx, group in enumerate(col_groups):
            for old_idx in group:
                col_mapping[old_idx] = new_idx

        # Merge data: concatenate stripped non-empty values within a group.
        merged_data = []
        for row in data:
            new_row = [''] * len(col_groups)
            for old_idx, cell_val in enumerate(row):
                if old_idx in col_mapping:
                    new_idx = col_mapping[old_idx]
                    if cell_val and str(cell_val).strip():
                        if new_row[new_idx]:
                            new_row[new_idx] += str(cell_val).strip()
                        else:
                            new_row[new_idx] = str(cell_val).strip()
            merged_data.append(new_row)

        logger.debug(f"[TableDetection] Merged {num_cols} columns -> {len(col_groups)} columns")

        return merged_data, col_mapping
562
+ def _calculate_column_widths(self, cells: List[Tuple], num_cols: int) -> List[float]:
563
+ """Calculate column widths from cell bbox."""
564
+ if not cells:
565
+ return [0.0] * num_cols
566
+
567
+ # Collect X coordinates
568
+ x_coords = sorted(set([c[0] for c in cells if c] + [c[2] for c in cells if c]))
569
+
570
+ if len(x_coords) < 2:
571
+ return [0.0] * num_cols
572
+
573
+ # Calculate column widths
574
+ widths = []
575
+ for i in range(len(x_coords) - 1):
576
+ widths.append(x_coords[i + 1] - x_coords[i])
577
+
578
+ # Match num_cols
579
+ if len(widths) < num_cols:
580
+ widths.extend([0.0] * (num_cols - len(widths)))
581
+ elif len(widths) > num_cols:
582
+ widths = widths[:num_cols]
583
+
584
+ return widths
585
+
586
+ def _determine_column_groups(
587
+ self,
588
+ col_widths: List[float],
589
+ min_width: float
590
+ ) -> List[List[int]]:
591
+ """
592
+ Determine column groups to merge based on column widths.
593
+
594
+ Narrow columns are merged with the next wider column.
595
+ """
596
+ groups = []
597
+ current_group = []
598
+
599
+ for idx, width in enumerate(col_widths):
600
+ current_group.append(idx)
601
+
602
+ # Finalize group when total width meets minimum
603
+ group_width = sum(col_widths[i] for i in current_group)
604
+
605
+ if group_width >= min_width:
606
+ groups.append(current_group)
607
+ current_group = []
608
+
609
+ # Handle last group
610
+ if current_group:
611
+ if groups:
612
+ # Merge with previous group
613
+ groups[-1].extend(current_group)
614
+ else:
615
+ groups.append(current_group)
616
+
617
+ return groups
618
+
619
+ def _merge_columns_by_content(self, data: List[List]) -> Tuple[List[List[str]], Dict[int, int]]:
620
+ """
621
+ Merge empty columns based on text content.
622
+
623
+ Columns empty in most rows are merged with adjacent columns.
624
+ """
625
+ if not data or not data[0]:
626
+ return data, {}
627
+
628
+ num_cols = max(len(row) for row in data)
629
+ num_rows = len(data)
630
+
631
+ # Calculate "emptiness" ratio for each column
632
+ empty_ratios = []
633
+ for col_idx in range(num_cols):
634
+ empty_count = 0
635
+ for row in data:
636
+ if col_idx >= len(row) or not row[col_idx] or not str(row[col_idx]).strip():
637
+ empty_count += 1
638
+ empty_ratios.append(empty_count / num_rows if num_rows > 0 else 1.0)
639
+
640
+ # Find columns with 90%+ empty ratio and merge with adjacent
641
+ groups = []
642
+ current_group = []
643
+
644
+ for col_idx, empty_ratio in enumerate(empty_ratios):
645
+ current_group.append(col_idx)
646
+
647
+ # Finalize group for non-empty columns
648
+ if empty_ratio < 0.9:
649
+ groups.append(current_group)
650
+ current_group = []
651
+
652
+ # Handle last group
653
+ if current_group:
654
+ if groups:
655
+ groups[-1].extend(current_group)
656
+ else:
657
+ groups.append(current_group)
658
+
659
+ if len(groups) == num_cols:
660
+ return data, {i: i for i in range(num_cols)}
661
+
662
+ # Create column mapping
663
+ col_mapping = {}
664
+ for new_idx, group in enumerate(groups):
665
+ for old_idx in group:
666
+ col_mapping[old_idx] = new_idx
667
+
668
+ # Merge data
669
+ merged_data = []
670
+ for row in data:
671
+ new_row = [''] * len(groups)
672
+ for old_idx, cell_val in enumerate(row):
673
+ if old_idx in col_mapping:
674
+ new_idx = col_mapping[old_idx]
675
+ if cell_val and str(cell_val).strip():
676
+ if new_row[new_idx]:
677
+ new_row[new_idx] += str(cell_val).strip()
678
+ else:
679
+ new_row[new_idx] = str(cell_val).strip()
680
+ merged_data.append(new_row)
681
+
682
+ logger.debug(f"[TableDetection] Content-based merge: {num_cols} -> {len(groups)} columns")
683
+
684
+ return merged_data, col_mapping
685
+
686
def _extract_cells_from_pymupdf_with_mapping(
    self,
    table,
    col_mapping: Dict[int, int]
) -> List[CellInfo]:
    """
    Extract cell info with column mapping applied.

    Runs the plain PyMuPDF extraction and then rewrites each cell's column
    index and colspan through ``col_mapping`` (old column index -> merged
    column index), so the cells line up with data produced by the
    column-merge helpers.  Cells that land on an already-covered
    (row, col) position after remapping are dropped.

    Args:
        table: PyMuPDF table object.
        col_mapping: Old -> new column index mapping; an empty dict means
            "no merging" and the raw extraction is returned unchanged.

    Returns:
        List of remapped CellInfo entries (may be empty).
    """
    if not col_mapping:
        return self._extract_cells_from_pymupdf(table)

    cells = self._extract_cells_from_pymupdf(table)

    if not cells:
        return cells

    # Calculate mapped column count (new indices are assumed dense 0..max)
    new_col_count = max(col_mapping.values()) + 1 if col_mapping else 0

    # Remap cell info
    remapped_cells = []
    processed_positions = set()

    for cell in cells:
        old_col = cell.col
        new_col = col_mapping.get(old_col, old_col)

        # Skip if cell already exists at same position
        # (two old columns merged into one new column collide here)
        if (cell.row, new_col) in processed_positions:
            continue

        # Recalculate colspan: consider merged columns.  The new span is the
        # furthest new index any covered old column maps to, relative to
        # new_col.
        new_colspan = 1
        for c in range(cell.col, cell.col + cell.colspan):
            mapped_c = col_mapping.get(c, c)
            if mapped_c != new_col:
                new_colspan = max(new_colspan, mapped_c - new_col + 1)

        # Clamp so the span never extends past the last merged column
        new_colspan = min(new_colspan, new_col_count - new_col)

        remapped_cells.append(CellInfo(
            row=cell.row,
            col=new_col,
            rowspan=cell.rowspan,
            colspan=max(1, new_colspan),
            bbox=cell.bbox
        ))

        # Record covered positions so later cells mapping into the same
        # merged area are skipped
        for r in range(cell.row, cell.row + cell.rowspan):
            for c in range(new_col, new_col + max(1, new_colspan)):
                processed_positions.add((r, c))

    return remapped_cells
+ def _calculate_pymupdf_confidence(self, table, data: List[List]) -> float:
742
+ """
743
+ Calculate PyMuPDF result confidence.
744
+
745
+ Features:
746
+ - Higher base score (trusting PyMuPDF results)
747
+ - Relaxed penalties
748
+ - Stronger cell info bonus
749
+ """
750
+ score = 0.0
751
+
752
+ # Higher base score (PyMuPDF is highly reliable)
753
+ score += 0.6
754
+
755
+ # Score based on row/column count
756
+ num_rows = len(data)
757
+ if num_rows >= self.MIN_TABLE_ROWS:
758
+ score += 0.1
759
+ if table.col_count >= self.MIN_TABLE_COLS:
760
+ score += 0.1
761
+
762
+ # Score based on data density (relaxed penalties)
763
+ total_cells = sum(len(row) for row in data)
764
+ filled_cells = sum(1 for row in data for cell in row if cell and str(cell).strip())
765
+
766
+ if total_cells > 0:
767
+ density = filled_cells / total_cells
768
+
769
+ if density < 0.05:
770
+ # Penalty only for very low density
771
+ score -= 0.2
772
+ elif density < 0.1:
773
+ score -= 0.1
774
+ else:
775
+ score += density * 0.15
776
+
777
+ # Additional score for cell info (stronger bonus)
778
+ if hasattr(table, 'cells') and table.cells:
779
+ score += 0.15
780
+
781
+ # Check meaningful cell count (relaxed penalty)
782
+ meaningful_count = sum(
783
+ 1 for row in data for cell in row
784
+ if cell and len(str(cell).strip()) >= 2
785
+ )
786
+
787
+ if meaningful_count < 2:
788
+ score -= 0.1
789
+
790
+ # Check valid row count (relaxed penalty)
791
+ valid_rows = sum(1 for row in data if any(cell and str(cell).strip() for cell in row))
792
+ if valid_rows <= 1:
793
+ score -= 0.1
794
+
795
+ # Check graphic region overlap (relaxed penalty)
796
+ if self.graphic_detector:
797
+ if self.graphic_detector.is_bbox_in_graphic_region(table.bbox, threshold=0.5):
798
+ score -= 0.15
799
+
800
+ return max(0.0, min(1.0, score))
801
+
802
def _extract_cells_from_pymupdf(self, table) -> List[CellInfo]:
    """
    Extract cell info from PyMuPDF table.

    Applies logic from pdf_handler_default's _extract_cell_spans_from_table():
    1. Extract physical bbox for each cell from table.cells
    2. Map Y coordinates to row indices, X coordinates to column indices
    3. Calculate rowspan/colspan if cell bbox spans multiple grid cells

    Args:
        table: PyMuPDF table object; only ``cells`` (and, on the fallback
            path, ``col_count``) are read.

    Returns:
        List of CellInfo with grid positions and spans; empty when the
        table exposes no cell geometry.
    """
    cells = []

    if not hasattr(table, 'cells') or not table.cells:
        # Return empty list if no cell info (handled by CellAnalysisEngine)
        return cells

    raw_cells = table.cells
    if not raw_cells:
        return cells

    # Extract X, Y boundary lines (same approach as pdf_handler_default):
    # every cell contributes its left/right (x) and top/bottom (y) edges.
    x_coords = sorted(set([c[0] for c in raw_cells if c] + [c[2] for c in raw_cells if c]))
    y_coords = sorted(set([c[1] for c in raw_cells if c] + [c[3] for c in raw_cells if c]))

    if len(x_coords) < 2 or len(y_coords) < 2:
        # Return basic cell info if grid cannot be constructed: position
        # cells by their flat index, assuming row-major order.
        for idx, cell_bbox in enumerate(raw_cells):
            if cell_bbox is None:
                continue
            # NOTE(review): num_rows is computed but never used below —
            # looks like leftover code; confirm before removing.
            num_rows = len(table.extract()) if hasattr(table, 'extract') else 0
            row_idx = idx // max(1, table.col_count) if hasattr(table, 'col_count') else 0
            col_idx = idx % max(1, table.col_count) if hasattr(table, 'col_count') else 0
            cells.append(CellInfo(
                row=row_idx,
                col=col_idx,
                rowspan=1,
                colspan=1,
                bbox=cell_bbox
            ))
        return cells

    # Function to map coordinates to grid indices (same as pdf_handler_default):
    # exact match within tolerance first, else nearest boundary line.
    def coord_to_index(coord: float, coords: List[float], tolerance: float = 3.0) -> int:
        for i, c in enumerate(coords):
            if abs(coord - c) <= tolerance:
                return i
        # Return closest index
        return min(range(len(coords)), key=lambda i: abs(coords[i] - coord))

    # Track processed grid positions so cells covered by an earlier span
    # (or duplicates at the same anchor) are emitted only once.
    processed_positions: Set[Tuple[int, int]] = set()

    for cell_bbox in raw_cells:
        if cell_bbox is None:
            continue

        x0, y0, x1, y1 = cell_bbox[:4]

        col_start = coord_to_index(x0, x_coords)
        col_end = coord_to_index(x1, x_coords)
        row_start = coord_to_index(y0, y_coords)
        row_end = coord_to_index(y1, y_coords)

        # A bbox spanning multiple boundary lines is a merged cell.
        colspan = max(1, col_end - col_start)
        rowspan = max(1, row_end - row_start)

        if (row_start, col_start) in processed_positions:
            continue

        processed_positions.add((row_start, col_start))

        cells.append(CellInfo(
            row=row_start,
            col=col_start,
            rowspan=rowspan,
            colspan=colspan,
            bbox=cell_bbox
        ))

        # Mark other cells in merged area
        for r in range(row_start, row_start + rowspan):
            for c in range(col_start, col_start + colspan):
                if (r, c) != (row_start, col_start):
                    processed_positions.add((r, c))

    return cells
+ def _cluster_grid_positions(self, positions: List[float], tolerance: float = 3.0) -> List[float]:
889
+ """
890
+ Cluster grid positions.
891
+
892
+ Merge nearby lines into one.
893
+ """
894
+ if not positions:
895
+ return []
896
+
897
+ sorted_pos = sorted(set(positions))
898
+ if len(sorted_pos) == 0:
899
+ return []
900
+
901
+ clusters: List[List[float]] = [[sorted_pos[0]]]
902
+
903
+ for pos in sorted_pos[1:]:
904
+ if pos - clusters[-1][-1] <= tolerance:
905
+ clusters[-1].append(pos)
906
+ else:
907
+ clusters.append([pos])
908
+
909
+ # Return average value of each cluster
910
+ return [sum(c) / len(c) for c in clusters]
911
+
912
+ def _find_grid_index_v2(self, value: float, grid_lines: List[float],
913
+ tolerance: float = 5.0) -> Optional[int]:
914
+ """
915
+ Find index of value in grid lines (improved version).
916
+
917
+ If exact matching fails, select the closest line.
918
+ """
919
+ if not grid_lines:
920
+ return None
921
+
922
+ # Try exact matching
923
+ for i, line in enumerate(grid_lines):
924
+ if abs(value - line) <= tolerance:
925
+ return i
926
+
927
+ # Find closest line
928
+ min_diff = float('inf')
929
+ closest_idx = 0
930
+
931
+ for i, line in enumerate(grid_lines):
932
+ diff = abs(value - line)
933
+ if diff < min_diff:
934
+ min_diff = diff
935
+ closest_idx = i
936
+
937
+ # Return if within 3x tolerance
938
+ if min_diff <= tolerance * 3:
939
+ return closest_idx
940
+
941
+ return None
942
+
943
def _find_grid_index(self, value: float, grid_lines: List[float], tolerance: float = 3.0) -> Optional[int]:
    """Find index of value in grid lines (for compatibility).

    Thin wrapper kept for callers of the original API; delegates to
    ``_find_grid_index_v2`` with a tighter default tolerance (3.0 instead
    of 5.0).
    """
    return self._find_grid_index_v2(value, grid_lines, tolerance)
def _detect_with_pdfplumber(self) -> List[TableCandidate]:
    """Use pdfplumber for table detection.

    Re-opens the document with pdfplumber (line-based strategies), scores
    each extracted table, and keeps those above the confidence threshold.
    Any pdfplumber failure is logged at debug level and yields an empty
    result rather than raising.

    Returns:
        List of TableCandidate with strategy PDFPLUMBER_LINES (possibly empty).
    """
    candidates = []

    try:
        with pdfplumber.open(self.file_path) as pdf:
            # Guard against page index past the end of the document.
            if self.page_num >= len(pdf.pages):
                return candidates

            plumber_page = pdf.pages[self.page_num]

            # Table settings: detect tables strictly from ruled lines
            # (both axes), with 5pt snapping/joining of nearby edges.
            settings = {
                "vertical_strategy": "lines",
                "horizontal_strategy": "lines",
                "snap_tolerance": 5,
                "join_tolerance": 5,
            }

            tables = plumber_page.extract_tables(settings)

            for table_idx, table_data in enumerate(tables):
                # Skip tables with no non-empty cell at all.
                if not table_data or not any(any(cell for cell in row if cell) for row in table_data):
                    continue

                # Estimate bbox (pdfplumber's extract_tables returns data
                # only, so the bbox is reconstructed from matching words).
                bbox = self._estimate_table_bbox_pdfplumber(plumber_page, table_data)

                if not bbox:
                    continue

                confidence = self._calculate_pdfplumber_confidence(table_data)

                if confidence < self.CONFIDENCE_THRESHOLD:
                    continue

                # No grid/cell geometry is available from this strategy.
                candidates.append(TableCandidate(
                    strategy=TableDetectionStrategy.PDFPLUMBER_LINES,
                    confidence=confidence,
                    bbox=bbox,
                    grid=None,
                    cells=[],
                    data=table_data,
                    raw_table=None
                ))

    except Exception as e:
        # Best-effort: pdfplumber problems must not break the other strategies.
        logger.debug(f"[PDF] pdfplumber error: {e}")

    return candidates
+ def _estimate_table_bbox_pdfplumber(self, page, data: List[List]) -> Optional[Tuple[float, float, float, float]]:
999
+ """Estimate pdfplumber table bbox."""
1000
+ try:
1001
+ words = page.extract_words()
1002
+ if not words:
1003
+ return None
1004
+
1005
+ table_texts = set()
1006
+ for row in data:
1007
+ for cell in row:
1008
+ if cell and str(cell).strip():
1009
+ table_texts.add(str(cell).strip()[:20])
1010
+
1011
+ matching_words = []
1012
+ for word in words:
1013
+ if any(word['text'] in text or text in word['text'] for text in table_texts):
1014
+ matching_words.append(word)
1015
+
1016
+ if not matching_words:
1017
+ return None
1018
+
1019
+ x0 = min(w['x0'] for w in matching_words)
1020
+ y0 = min(w['top'] for w in matching_words)
1021
+ x1 = max(w['x1'] for w in matching_words)
1022
+ y1 = max(w['bottom'] for w in matching_words)
1023
+
1024
+ margin = 5
1025
+ return (x0 - margin, y0 - margin, x1 + margin, y1 + margin)
1026
+
1027
+ except Exception:
1028
+ return None
1029
+
1030
+ def _calculate_pdfplumber_confidence(self, data: List[List]) -> float:
1031
+ """Calculate pdfplumber result confidence."""
1032
+ score = 0.0
1033
+
1034
+ # Base score (slightly lower than PyMuPDF)
1035
+ score += 0.4
1036
+
1037
+ num_rows = len(data)
1038
+ col_count = max(len(row) for row in data) if data else 0
1039
+
1040
+ if num_rows >= self.MIN_TABLE_ROWS:
1041
+ score += 0.1
1042
+ if col_count >= self.MIN_TABLE_COLS:
1043
+ score += 0.1
1044
+
1045
+ # Data density
1046
+ total_cells = sum(len(row) for row in data)
1047
+ filled_cells = sum(1 for row in data for cell in row if cell and str(cell).strip())
1048
+
1049
+ if total_cells > 0:
1050
+ density = filled_cells / total_cells
1051
+
1052
+ if density < 0.1:
1053
+ score -= 0.5
1054
+ elif density < 0.2:
1055
+ score -= 0.3
1056
+ else:
1057
+ score += density * 0.2
1058
+
1059
+ # Meaningful cell count
1060
+ meaningful_count = sum(
1061
+ 1 for row in data for cell in row
1062
+ if cell and len(str(cell).strip()) >= 2
1063
+ )
1064
+
1065
+ if meaningful_count < 2:
1066
+ score -= 0.3
1067
+
1068
+ # Valid row count
1069
+ valid_rows = sum(1 for row in data if any(cell and str(cell).strip() for cell in row))
1070
+ if valid_rows <= 1:
1071
+ score -= 0.2
1072
+
1073
+ # Empty row ratio
1074
+ empty_rows = num_rows - valid_rows
1075
+ if num_rows > 0 and empty_rows / num_rows > 0.5:
1076
+ score -= 0.2
1077
+
1078
+ return max(0.0, min(1.0, score))
1079
+
1080
def _detect_with_lines(self) -> List[TableCandidate]:
    """Line analysis based table detection.

    Builds a grid from the page's ruled lines via the line engine, attempts
    to reconstruct incomplete borders, extracts the text inside the grid
    cells, and emits at most one HYBRID_ANALYSIS candidate when the grid is
    valid and the confidence threshold is met.

    Returns:
        A list with zero or one TableCandidate.
    """
    candidates = []

    # Build grid
    grid = self.line_engine.build_grid()

    if not grid:
        return candidates

    # Recover incomplete border; give up if reconstruction still fails.
    if not grid.is_complete:
        grid = self.line_engine.reconstruct_incomplete_border(grid)
        if not grid.is_complete:
            return candidates

    # Check if grid is valid (minimum table dimensions)
    if grid.row_count < self.MIN_TABLE_ROWS or grid.col_count < self.MIN_TABLE_COLS:
        return candidates

    # Extract text from cells
    data = self._extract_text_from_grid(grid)

    # Reject grids containing no text at all.
    if not data or not any(any(cell for cell in row if cell) for row in data):
        return candidates

    # Create cell info (1x1 cells over the grid; no span detection here)
    cells = self._create_cells_from_grid(grid)

    # Calculate confidence
    confidence = self._calculate_line_based_confidence(grid, data)

    if confidence < self.CONFIDENCE_THRESHOLD:
        return candidates

    candidates.append(TableCandidate(
        strategy=TableDetectionStrategy.HYBRID_ANALYSIS,
        confidence=confidence,
        bbox=grid.bbox,
        grid=grid,
        cells=cells,
        data=data,
        raw_table=None
    ))

    return candidates
+ def _extract_text_from_grid(self, grid: GridInfo) -> List[List[Optional[str]]]:
1128
+ """Extract text from grid cells."""
1129
+ data = []
1130
+
1131
+ page_dict = self.page.get_text("dict", sort=True)
1132
+
1133
+ for row_idx in range(grid.row_count):
1134
+ row_data = []
1135
+ y0 = grid.h_lines[row_idx]
1136
+ y1 = grid.h_lines[row_idx + 1]
1137
+
1138
+ for col_idx in range(grid.col_count):
1139
+ x0 = grid.v_lines[col_idx]
1140
+ x1 = grid.v_lines[col_idx + 1]
1141
+
1142
+ cell_bbox = (x0, y0, x1, y1)
1143
+ cell_text = self._get_text_in_bbox(page_dict, cell_bbox)
1144
+ row_data.append(cell_text)
1145
+
1146
+ data.append(row_data)
1147
+
1148
+ return data
1149
+
1150
+ def _get_text_in_bbox(self, page_dict: dict, bbox: Tuple[float, float, float, float]) -> str:
1151
+ """Extract text within bbox."""
1152
+ x0, y0, x1, y1 = bbox
1153
+ texts = []
1154
+
1155
+ for block in page_dict.get("blocks", []):
1156
+ if block.get("type") != 0:
1157
+ continue
1158
+
1159
+ for line in block.get("lines", []):
1160
+ line_bbox = line.get("bbox", (0, 0, 0, 0))
1161
+
1162
+ if self._bbox_overlaps(line_bbox, bbox):
1163
+ line_text = ""
1164
+ for span in line.get("spans", []):
1165
+ span_bbox = span.get("bbox", (0, 0, 0, 0))
1166
+ if self._bbox_overlaps(span_bbox, bbox):
1167
+ line_text += span.get("text", "")
1168
+
1169
+ if line_text.strip():
1170
+ texts.append(line_text.strip())
1171
+
1172
+ return " ".join(texts)
1173
+
1174
+ def _bbox_overlaps(self, bbox1: Tuple, bbox2: Tuple, threshold: float = 0.3) -> bool:
1175
+ """Check if two bboxes overlap."""
1176
+ x0 = max(bbox1[0], bbox2[0])
1177
+ y0 = max(bbox1[1], bbox2[1])
1178
+ x1 = min(bbox1[2], bbox2[2])
1179
+ y1 = min(bbox1[3], bbox2[3])
1180
+
1181
+ if x1 <= x0 or y1 <= y0:
1182
+ return False
1183
+
1184
+ overlap_area = (x1 - x0) * (y1 - y0)
1185
+ bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
1186
+
1187
+ if bbox1_area <= 0:
1188
+ return False
1189
+
1190
+ return overlap_area / bbox1_area >= threshold
1191
+
1192
def _create_cells_from_grid(self, grid: GridInfo) -> List[CellInfo]:
    """Materialize one 1x1 CellInfo per grid cell.

    Span detection is not attempted here; every cell is emitted with
    rowspan == colspan == 1 and a bbox taken from the grid lines.
    """
    cells: List[CellInfo] = []

    for r in range(grid.row_count):
        top, bottom = grid.h_lines[r], grid.h_lines[r + 1]

        for c in range(grid.col_count):
            left, right = grid.v_lines[c], grid.v_lines[c + 1]
            cells.append(CellInfo(
                row=r,
                col=c,
                rowspan=1,
                colspan=1,
                bbox=(left, top, right, bottom)
            ))

    return cells
+ def _calculate_line_based_confidence(self, grid: GridInfo, data: List[List]) -> float:
1215
+ """Calculate line-based result confidence."""
1216
+ score = 0.0
1217
+
1218
+ # Base score (lower than other strategies)
1219
+ score += 0.3
1220
+
1221
+ # Grid completeness
1222
+ if grid.is_complete:
1223
+ score += 0.2
1224
+ elif grid.reconstructed:
1225
+ score += 0.1
1226
+
1227
+ # Row/column count
1228
+ if grid.row_count >= self.MIN_TABLE_ROWS:
1229
+ score += 0.1
1230
+ if grid.col_count >= self.MIN_TABLE_COLS:
1231
+ score += 0.1
1232
+
1233
+ # Data density
1234
+ total_cells = sum(len(row) for row in data)
1235
+ filled_cells = sum(1 for row in data for cell in row if cell and str(cell).strip())
1236
+
1237
+ if total_cells > 0:
1238
+ density = filled_cells / total_cells
1239
+
1240
+ if density < 0.1:
1241
+ score -= 0.4
1242
+ elif density < 0.2:
1243
+ score -= 0.2
1244
+ else:
1245
+ score += density * 0.2
1246
+
1247
+ # Meaningful cell count
1248
+ meaningful_count = sum(
1249
+ 1 for row in data for cell in row
1250
+ if cell and len(str(cell).strip()) >= 2
1251
+ )
1252
+
1253
+ if meaningful_count < 2:
1254
+ score -= 0.2
1255
+
1256
+ # Valid row count
1257
+ valid_rows = sum(1 for row in data if any(cell and str(cell).strip() for cell in row))
1258
+ if valid_rows <= 1:
1259
+ score -= 0.2
1260
+
1261
+ # Check graphic region overlap
1262
+ if self.graphic_detector:
1263
+ if self.graphic_detector.is_bbox_in_graphic_region(grid.bbox, threshold=0.3):
1264
+ score -= 0.3
1265
+
1266
+ return max(0.0, min(1.0, score))
1267
+
1268
def _select_best_candidates(self, candidates: List[TableCandidate]) -> List[TableCandidate]:
    """Pick a non-overlapping subset of candidates, preferring trusted strategies.

    Each candidate's confidence is handicapped by 0.15 per step of strategy
    priority (PyMuPDF > pdfplumber > hybrid > borderless), so a PyMuPDF
    result beats another strategy in the same region unless the latter's
    raw confidence is substantially higher.  Candidates are then accepted
    greedily, skipping any whose bbox overlaps an already-accepted one.
    """
    if not candidates:
        return []

    # Strategy priority: PYMUPDF > PDFPLUMBER > HYBRID > BORDERLESS.
    rank = {
        TableDetectionStrategy.PYMUPDF_NATIVE: 0,
        TableDetectionStrategy.PDFPLUMBER_LINES: 1,
        TableDetectionStrategy.HYBRID_ANALYSIS: 2,
        TableDetectionStrategy.BORDERLESS_HEURISTIC: 3,
    }

    def handicap(cand):
        # Lower-priority strategies pay 0.15 confidence per rank step;
        # ties are broken by the raw priority.
        level = rank.get(cand.strategy, 99)
        return (-(cand.confidence - level * 0.15), level)

    chosen: List[TableCandidate] = []
    for cand in sorted(candidates, key=handicap):
        if any(self._tables_overlap_any(cand.bbox, kept.bbox) for kept in chosen):
            continue
        chosen.append(cand)

    return chosen
+ def _tables_overlap_any(self, bbox1: Tuple, bbox2: Tuple, threshold: float = 0.3) -> bool:
1313
+ """
1314
+ Check if two tables overlap (improved version).
1315
+
1316
+ Returns True if either one is covered by the other by threshold or more.
1317
+ """
1318
+ x0 = max(bbox1[0], bbox2[0])
1319
+ y0 = max(bbox1[1], bbox2[1])
1320
+ x1 = min(bbox1[2], bbox2[2])
1321
+ y1 = min(bbox1[3], bbox2[3])
1322
+
1323
+ if x1 <= x0 or y1 <= y0:
1324
+ return False
1325
+
1326
+ overlap_area = (x1 - x0) * (y1 - y0)
1327
+ bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
1328
+ bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
1329
+
1330
+ if bbox1_area <= 0 or bbox2_area <= 0:
1331
+ return False
1332
+
1333
+ # Consider overlapping if either side is covered by threshold or more
1334
+ ratio1 = overlap_area / bbox1_area
1335
+ ratio2 = overlap_area / bbox2_area
1336
+
1337
+ return ratio1 >= threshold or ratio2 >= threshold
1338
+
1339

# ============================================================================
# Export
# ============================================================================

# Public API of this module; everything else above is internal detail.
__all__ = [
    'TableDetectionEngine',
]