xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,401 @@
1
+ """
2
+ Table Quality Validator for PDF Handler
3
+
4
+ Validates whether detected table candidates are actual tables.
5
+ Prevents graphic regions from being misidentified as tables.
6
+ """
7
+
8
+ import logging
9
+ from typing import List, Tuple, Optional
10
+
11
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import PDFConfig
12
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_graphic_detector import GraphicRegionDetector
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ # ============================================================================
18
+ # Table Quality Validator
19
+ # ============================================================================
20
+
21
class TableQualityValidator:
    """
    Table Quality Validator

    Validates whether detected table candidates are actual tables, so that
    graphic regions and body text are not emitted as tables.

    Validation Criteria:
    1. Filled cell ratio (too low indicates fake table)
    2. Empty row/column ratio
    3. Text density
    4. Data validity (meaningful text)
    5. Grid regularity
    6. Long text cell detection (text blocks misidentified as tables)
    7. Paragraph text detection (body text misidentified as tables)
    8. Two-column table special validation (body text easily misidentified as table)

    Most criteria subtract a penalty from a confidence score; a few
    (graphic region, empty data, too few rows/cols, extremely long cells,
    paragraph-heavy content, invalid two-column layout) reject immediately.
    """

    # Characters that do not make a cell meaningful on their own.
    _SIMPLE_SYMBOLS = frozenset('-–—.:;|/\\*#@!?, ')

    # Approximate area per character (approximately 50 pt² for 10pt font).
    _APPROX_CHAR_AREA = 50

    # Candidates whose final confidence is below this value are rejected.
    # Deliberately lower than 0.5 so that normal tables are not filtered out.
    _MIN_CONFIDENCE = 0.35

    def __init__(self, page, graphic_detector: Optional[GraphicRegionDetector] = None):
        """
        Args:
            page: PyMuPDF page object
            graphic_detector: Graphic region detector (optional)
        """
        self.page = page
        self.page_width = page.rect.width
        self.page_height = page.rect.height
        self.graphic_detector = graphic_detector

    @staticmethod
    def _cell_text(cell) -> str:
        """Return the stripped text of a cell, or "" for None/empty cells."""
        return str(cell).strip() if cell else ""

    def validate(self,
                 data: List[List[Optional[str]]],
                 bbox: Tuple[float, float, float, float],
                 cells_info: Optional[List] = None,
                 skip_graphic_check: bool = False) -> Tuple[bool, float, str]:
        """
        Validates a table candidate.

        Penalties accumulate on a confidence score that starts at 1.0
        (1.1 when cell information is supplied, which acts as a small buffer
        against penalties). The score is clamped to [0.0, 1.0] and the
        candidate is rejected when it ends up below 0.35.

        Args:
            data: Table data (2D list)
            bbox: Table bounding box (x0, y0, x1, y1)
            cells_info: Cell information (optional)
            skip_graphic_check: Skip graphic region check.
                PyMuPDF strategy is text-based, so it has high reliability.

        Returns:
            Tuple of (is_valid, confidence, reason). ``reason`` is "valid",
            a comma-separated list of the penalties applied, or the single
            cause of an immediate rejection (confidence 0.0).
        """
        penalties: List[str] = []

        # If cell information was provided, increase base confidence slightly.
        confidence = 1.1 if cells_info else 1.0

        # 0. Graphic region check
        if (not skip_graphic_check
                and self.graphic_detector
                and self.graphic_detector.is_bbox_in_graphic_region(bbox, threshold=0.5)):
            return False, 0.0, "in_graphic_region"

        # 1. Basic data validation
        if not data:
            return False, 0.0, "empty_data"

        num_rows = len(data)
        num_cols = max(len(row) for row in data)

        if num_rows < PDFConfig.MIN_TABLE_ROWS:
            return False, 0.0, f"too_few_rows({num_rows})"

        if num_cols < PDFConfig.MIN_TABLE_COLS:
            return False, 0.0, f"too_few_cols({num_cols})"

        # 2. Filled cell ratio validation
        total_cells = sum(len(row) for row in data)
        filled_cells = sum(
            1 for row in data for cell in row if self._cell_text(cell)
        )
        filled_ratio = filled_cells / total_cells if total_cells > 0 else 0

        # Progressive penalty based on fill ratio
        if filled_ratio < PDFConfig.TABLE_MIN_FILLED_CELL_RATIO:
            if filled_ratio < 0.05:
                penalties.append(f"very_low_fill_ratio({filled_ratio:.2f})")
                confidence -= 0.3
            else:
                penalties.append(f"low_fill_ratio({filled_ratio:.2f})")
                confidence -= 0.15

        # 3. Empty row ratio validation
        empty_rows = sum(
            1 for row in data if not any(self._cell_text(cell) for cell in row)
        )
        empty_row_ratio = empty_rows / num_rows

        if empty_row_ratio >= PDFConfig.TABLE_MAX_EMPTY_ROW_RATIO:
            penalties.append(f"too_many_empty_rows({empty_row_ratio:.2f})")
            confidence -= 0.15

        # 4. Meaningful cell count validation
        meaningful_cells = self._count_meaningful_cells(data)
        if meaningful_cells < PDFConfig.TABLE_MIN_MEANINGFUL_CELLS:
            penalties.append(f"few_meaningful_cells({meaningful_cells})")
            confidence -= 0.15

        # 5. Valid row count validation (every row that is not empty)
        valid_rows = num_rows - empty_rows
        if valid_rows < PDFConfig.TABLE_MIN_VALID_ROWS:
            penalties.append(f"few_valid_rows({valid_rows})")
            confidence -= 0.15

        # 6. Text density validation
        text_density = self._calculate_text_density(data, bbox)
        if text_density < PDFConfig.TABLE_MIN_TEXT_DENSITY:
            penalties.append(f"low_text_density({text_density:.3f})")
            confidence -= 0.1

        # 7. Single row/column tables get a stricter fill requirement
        if (num_rows == 1 or num_cols == 1) and filled_ratio < 0.5:
            penalties.append("single_row_col_low_fill")
            confidence -= 0.2

        # 8. Abnormal row/column ratio (more than 5 times as many columns as rows)
        if num_cols > num_rows * 5:
            penalties.append(f"abnormal_ratio(cols/rows={num_cols}/{num_rows})")
            confidence -= 0.1

        # 9. Long text cell detection (text blocks misidentified as tables)
        long_cell_count, extreme_cell_count = self._analyze_cell_lengths(data)

        # Fail immediately if there are extremely long cells
        if extreme_cell_count > 0:
            return False, 0.0, f"extreme_long_cell({extreme_cell_count})"

        if filled_cells > 0:
            long_cell_ratio = long_cell_count / filled_cells
            if long_cell_ratio > PDFConfig.TABLE_MAX_LONG_CELLS_RATIO:
                penalties.append(f"too_many_long_cells({long_cell_ratio:.2f})")
                confidence -= 0.2

        # 10. Paragraph text detection (body text misidentified as tables)
        paragraph_count = self._count_paragraph_cells(data)
        if paragraph_count > 0:
            paragraph_ratio = paragraph_count / max(1, filled_cells)
            if paragraph_ratio > 0.25:
                # More than a quarter of the filled cells read like prose.
                return False, 0.0, f"contains_paragraph_text({paragraph_count})"
            if paragraph_ratio > 0.1:
                penalties.append(f"has_paragraph_cells({paragraph_count})")
                confidence -= 0.15

        # 11. Two-column table special validation
        if num_cols == 2:
            is_valid_2col, reason_2col = self._validate_two_column_table(data, bbox)
            if not is_valid_2col:
                return False, 0.0, f"invalid_2col_table({reason_2col})"

        # 12. A tall two-column candidate with many rows is likely body text
        bbox_height = bbox[3] - bbox[1]
        page_coverage = bbox_height / self.page_height if self.page_height > 0 else 0
        if page_coverage > 0.7 and num_rows > 15 and num_cols == 2:
            penalties.append(
                f"suspicious_large_2col(coverage={page_coverage:.2f}, rows={num_rows})"
            )
            confidence -= 0.15

        # Final judgment: clamp, then compare against the rejection threshold.
        confidence = max(0.0, min(1.0, confidence))
        is_valid = confidence >= self._MIN_CONFIDENCE

        reason = ", ".join(penalties) if penalties else "valid"

        if not is_valid:
            logger.debug(
                "[TableValidator] Rejected: %s, reason=%s, conf=%.2f",
                bbox, reason, confidence,
            )

        return is_valid, confidence, reason

    def _analyze_cell_lengths(self, data: List[List[Optional[str]]]) -> Tuple[int, int]:
        """
        Analyzes cell text lengths.

        Returns:
            Tuple of (long_cell_count, extreme_cell_count)
            - long_cell_count: Number of cells exceeding TABLE_MAX_CELL_TEXT_LENGTH
              (extremely long cells are counted here as well)
            - extreme_cell_count: Number of cells exceeding TABLE_EXTREME_CELL_LENGTH
        """
        long_count = 0
        extreme_count = 0

        for row in data:
            for cell in row:
                if not cell:
                    continue

                text_len = len(self._cell_text(cell))
                if text_len > PDFConfig.TABLE_EXTREME_CELL_LENGTH:
                    extreme_count += 1
                    long_count += 1
                elif text_len > PDFConfig.TABLE_MAX_CELL_TEXT_LENGTH:
                    long_count += 1

        return long_count, extreme_count

    def _count_meaningful_cells(self, data: List[List[Optional[str]]]) -> int:
        """
        Counts the number of meaningful cells.

        Meaningful cells:
        - Text with 2 or more characters (after stripping)
        - Not made up solely of simple symbols / whitespace (e.g. "--", "...")
        """
        symbols = self._SIMPLE_SYMBOLS
        count = 0

        for row in data:
            for cell in row:
                text = self._cell_text(cell)
                if len(text) < 2:
                    continue
                # A cell such as "--" or "* *" is a placeholder, not data.
                if all(ch in symbols or ch.isspace() for ch in text):
                    continue
                count += 1

        return count

    def _calculate_text_density(self,
                                data: List[List[Optional[str]]],
                                bbox: Tuple[float, float, float, float]) -> float:
        """
        Calculates text density relative to the region area.

        The text area is estimated as (total stripped character count) times
        a fixed per-character area, divided by the bbox area.

        Returns:
            Estimated text area / bbox area, or 0.0 for a degenerate bbox.
        """
        area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        if area <= 0:
            return 0.0

        total_text_len = sum(
            len(self._cell_text(cell)) for row in data for cell in row
        )

        return total_text_len * self._APPROX_CHAR_AREA / area

    def _count_paragraph_cells(self, data: List[List[Optional[str]]]) -> int:
        """
        Counts cells containing paragraph-style text.

        Cells shorter than 50 characters are never paragraphs. A longer cell
        is counted when any of the following holds:
        - 100+ characters, 8+ whitespace-separated words, and sentence punctuation
        - 150+ characters and sentence punctuation
        - 80+ characters and 10+ whitespace-separated words

        If many such cells exist, body text has likely been misidentified as a table.
        """
        sentence_marks = ('.', '。', '?', '!', ',', '、')
        paragraph_count = 0

        for row in data:
            for cell in row:
                if not cell:
                    continue

                text = self._cell_text(cell)
                text_len = len(text)
                if text_len < 50:
                    continue

                word_count = len(text.split())
                has_sentence_marks = any(mark in text for mark in sentence_marks)

                punctuated_prose = has_sentence_marks and (
                    (text_len >= 100 and word_count >= 8) or text_len >= 150
                )
                wordy_text = text_len >= 80 and word_count >= 10

                if punctuated_prose or wordy_text:
                    paragraph_count += 1

        return paragraph_count

    def _validate_two_column_table(self, data: List[List[Optional[str]]],
                                   bbox: Tuple[float, float, float, float]) -> Tuple[bool, str]:
        """
        Validates the validity of a two-column table.

        Two-column tables are easily misidentified from body text.
        Example: Chart Y-axis labels + body text can be detected as a 2-column table.

        Rows with fewer than two cells are not analyzed, but ratios are still
        taken over the total row count. ``bbox`` is currently not used.

        Returns:
            Tuple of (is_valid, reason)
        """
        num_rows = len(data)

        col1_empty_count = 0
        col1_short_count = 0
        col2_long_count = 0
        col2_has_paragraphs = 0

        for row in data:
            if len(row) < 2:
                continue

            col1 = self._cell_text(row[0])
            col2 = self._cell_text(row[1])

            # First column: empty, or short label (10 characters or fewer)
            if not col1:
                col1_empty_count += 1
            elif len(col1) <= 10:
                col1_short_count += 1

            # Second column: long text, optionally with sentence structure
            if len(col2) > 80:
                col2_long_count += 1
                if any(p in col2 for p in ('.', '。', ',', '、')) and len(col2.split()) >= 5:
                    col2_has_paragraphs += 1

        # Pattern 1: first column 60%+ empty and second column 30%+ long text
        if num_rows > 0:
            col1_empty_ratio = col1_empty_count / num_rows
            col2_long_ratio = col2_long_count / num_rows

            if col1_empty_ratio >= 0.6 and col2_long_ratio >= 0.3:
                return False, f"col1_empty({col1_empty_ratio:.0%})_col2_long({col2_long_ratio:.0%})"

        # Pattern 2: several paragraph-style entries in the second column
        if num_rows > 5 and col2_has_paragraphs >= 2:
            return False, f"col2_paragraphs({col2_has_paragraphs})"

        # Pattern 3: first column short overall and second column long,
        # which looks like body text rather than key-value pairs
        if num_rows > 10:
            col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
            if col1_short_ratio >= 0.8 and col2_long_count >= 5:
                return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"

        return True, "valid"
393
+
394
+
395
+ # ============================================================================
396
+ # Export
397
+ # ============================================================================
398
+
399
# Public API of this module: only the validator class is exported.
__all__ = ["TableQualityValidator"]
@@ -0,0 +1,155 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py
2
+ """
3
+ PDF Text Extraction Module
4
+
5
+ Provides functions for extracting text blocks from PDF pages.
6
+ """
7
+ import logging
8
+ from typing import List, Tuple
9
+
10
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
11
+ ElementType,
12
+ PageElement,
13
+ PageBorderInfo,
14
+ )
15
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import is_inside_any_bbox
16
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
17
+ TextQualityAnalyzer,
18
+ QualityAwareTextExtractor,
19
+ PageOCRFallbackEngine,
20
+ )
21
+
22
+ logger = logging.getLogger("document-processor")
23
+
24
+
25
def extract_text_blocks(
    page,
    page_num: int,
    table_bboxes: List[Tuple[float, float, float, float]],
    border_info: PageBorderInfo,
    use_quality_check: bool = True
) -> List[PageElement]:
    """
    Extract text blocks from a page, skipping blocks inside table regions.

    When quality checking is enabled the page is analyzed first; if the
    analysis says OCR is needed and the OCR extraction yields non-blank text,
    that text is returned via ``split_ocr_text_to_blocks`` and no per-block
    extraction happens. Otherwise each text block is read from the page's
    "dict" output, and any block whose own text is flagged as needing OCR is
    replaced by the OCR result for its region (if that result is non-blank).

    Args:
        page: PyMuPDF page object
        page_num: Page number (0-indexed)
        table_bboxes: List of table bounding boxes to exclude
        border_info: Page border information (not read by this function)
        use_quality_check: Whether to perform quality checks

    Returns:
        List of PageElement for extracted text
    """
    if use_quality_check:
        page_analysis = TextQualityAnalyzer(page, page_num).analyze_page()
        page_quality = page_analysis.quality_result

        # Whole-page OCR fallback when the embedded text is too broken.
        if page_quality.needs_ocr:
            logger.info(
                f"[PDF] Page {page_num + 1}: Low text quality "
                f"({page_quality.quality_score:.2f}), "
                f"PUA={page_quality.pua_count}, "
                f"using OCR fallback"
            )

            ocr_text, _ = QualityAwareTextExtractor(page, page_num).extract()
            if ocr_text.strip():
                return split_ocr_text_to_blocks(ocr_text, page, table_bboxes)

    # Regular extraction path.
    text_elements = []
    raw_blocks = page.get_text("dict", sort=True).get("blocks", [])

    for raw_block in raw_blocks:
        # Only type 0 blocks are processed here.
        if raw_block.get("type") != 0:
            continue

        region = raw_block.get("bbox", (0, 0, 0, 0))
        if is_inside_any_bbox(region, table_bboxes):
            continue

        # Join the spans of each line, keeping only lines with visible text.
        kept_lines = []
        for line in raw_block.get("lines", []):
            joined = "".join(span.get("text", "") for span in line.get("spans", []))
            trimmed = joined.strip()
            if trimmed:
                kept_lines.append(trimmed)

        if not kept_lines:
            continue

        content = "\n".join(kept_lines)

        if use_quality_check:
            block_quality = TextQualityAnalyzer(page, page_num).analyze_text(content)
            if block_quality.needs_ocr:
                # Re-read just this block's region through OCR.
                recovered = PageOCRFallbackEngine(page, page_num).ocr_region(region)
                if recovered.strip():
                    content = recovered
                    logger.debug(f"[PDF] Block OCR: '{recovered[:50]}...'")

        text_elements.append(PageElement(
            element_type=ElementType.TEXT,
            content=content,
            bbox=region,
            page_num=page_num
        ))

    return text_elements
119
+
120
+
121
def split_ocr_text_to_blocks(
    ocr_text: str,
    page,
    table_bboxes: List[Tuple[float, float, float, float]]
) -> List[PageElement]:
    """
    Convert OCR text to page elements.

    Since OCR lacks position info, the entire text is returned as a single
    TEXT element whose bounding box covers the whole page.
    ``table_bboxes`` is accepted but not read here, so no table filtering
    is applied to the OCR text by this function.

    Args:
        ocr_text: OCR extracted text
        page: PyMuPDF page object
        table_bboxes: List of table bounding boxes (currently unused)

    Returns:
        Empty list for blank text, otherwise a one-element list of PageElement
    """
    if ocr_text.strip() == "":
        return []

    # Position covers the entire page.
    # For actual position info, pytesseract's image_to_data can be used.
    page_rect = page.rect
    full_page_bbox = (0, 0, page_rect.width, page_rect.height)

    whole_page_element = PageElement(
        element_type=ElementType.TEXT,
        content=ocr_text,
        bbox=full_page_bbox,
        page_num=page.number
    )
    return [whole_page_element]
155
+