xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,493 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py
2
+ """
3
+ Cell Analysis Engine
4
+
5
+ Analyzes physical cell information and text positions to calculate accurate rowspan/colspan.
6
+
7
+ - Precise grid analysis based on bbox
8
+ - Accurate distinction between merged cells and empty cells
9
+ - Enhanced merge validation based on text position
10
+ - Improved span inference through adjacent cell analysis
11
+ """
12
+ import logging
13
+ from typing import Any, Dict, List, Optional, Tuple, Set
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class CellAnalysisEngine:
    """
    Cell Analysis Engine.

    Analyzes physical cell information and text positions to calculate
    accurate rowspan/colspan.

    - Precise bbox-based analysis when PyMuPDF cell info is available
    - Cell position recalculation based on grid lines
    - Accurate distinction between empty cells and merged cells

    Every cell record produced by this class is a dict with the keys
    ``row``, ``col``, ``rowspan``, ``colspan`` and ``bbox``.
    """

    # Tolerance constants
    GRID_TOLERANCE = 5.0  # Grid line matching tolerance (pt)
    OVERLAP_THRESHOLD = 0.3  # Bbox overlap threshold

    def __init__(self, table_info: Any, page: Any):
        """
        Args:
            table_info: TableInfo object (requires data, cells_info, bbox attributes)
            page: PyMuPDF page object
        """
        self.table_info = table_info
        self.page = page
        self.data = table_info.data or []
        self.cells_info = table_info.cells_info or []
        self.table_bbox = getattr(table_info, 'bbox', None)

        # Grid line cache (filled by _analyze_with_bbox_grid)
        self._h_grid_lines: List[float] = []
        self._v_grid_lines: List[float] = []

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _value_or_default(value: Any, default: int) -> Any:
        """Return ``default`` when ``value`` is None, otherwise ``value``.

        ``dict.get(key, default)`` only falls back when the key is absent;
        this also covers keys that are present but explicitly set to None.
        """
        return default if value is None else value

    @staticmethod
    def _has_usable_bbox(bbox: Any) -> bool:
        """Return True when ``bbox`` is non-empty and has at least 4 coordinates."""
        return bool(bbox) and len(bbox) >= 4

    @staticmethod
    def _default_cell(row: int, col: int) -> Dict:
        """Build a plain 1x1 cell record without bbox."""
        return {'row': row, 'col': col, 'rowspan': 1, 'colspan': 1, 'bbox': None}

    @staticmethod
    def _mark_covered(
        covered: Set[Tuple[int, int]],
        row: int,
        col: int,
        rowspan: int,
        colspan: int,
        num_rows: int,
        num_cols: int,
    ) -> None:
        """Record every data position occupied by a cell, clipped to the data range."""
        for r in range(row, min(row + rowspan, num_rows)):
            for c in range(col, min(col + colspan, num_cols)):
                covered.add((r, c))

    def _data_shape(self) -> Tuple[int, int]:
        """Return (row count, widest row length) of ``self.data``; (0, 0) when empty."""
        if not self.data:
            return 0, 0
        return len(self.data), max(len(data_row) for data_row in self.data)

    def _append_uncovered(
        self,
        cells: List[Dict],
        covered: Set[Tuple[int, int]],
        num_rows: int,
        num_cols: int,
    ) -> None:
        """Append a default 1x1 cell for every data position not in ``covered``."""
        for row_idx in range(num_rows):
            for col_idx in range(num_cols):
                if (row_idx, col_idx) not in covered:
                    cells.append(self._default_cell(row_idx, col_idx))

    def _text_at(self, row: int, col: int) -> str:
        """Return the stripped string form of ``self.data[row][col]`` ('' for falsy values)."""
        return str(self.data[row][col] or "").strip()

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def analyze(self) -> List[Dict]:
        """
        Perform cell analysis.

        Returns:
            List of cell info (row, col, rowspan, colspan, bbox).
            Empty list when the table data has no rows or no columns.

        - Uses existing rowspan/colspan info from TableDetectionEngine if available
        - Improves accuracy by avoiding unnecessary recalculation
        """
        num_rows, num_cols = self._data_shape()
        if num_rows == 0 or num_cols == 0:
            return []

        # Use existing cells with validation if valid rowspan/colspan info exists
        if self.cells_info and self._has_valid_span_info():
            result = self._use_existing_cells_with_validation(num_rows, num_cols)
            if result:
                return result

        # 1. If cell info exists, perform precise bbox-based analysis
        if self.cells_info and any(c.get('bbox') for c in self.cells_info):
            result = self._analyze_with_bbox_grid()
            if result:
                return result

        # 2. If cell info exists but no bbox, validate existing info
        if self.cells_info:
            result = self._validate_and_enhance_cells()
            if result:
                return result

        # 3. If no cell info, create default cells based on data
        return self._create_default_cells(num_rows, num_cols)

    def _has_valid_span_info(self) -> bool:
        """Check if cell info has usable rowspan/colspan or position data.

        Returns True when either condition holds:
        - At least one cell has rowspan > 1 or colspan > 1
        - All cells have row and col information

        A rowspan/colspan that is absent or None counts as 1.
        """
        if not self.cells_info:
            return False

        has_span = any(
            self._value_or_default(cell.get('rowspan'), 1) > 1
            or self._value_or_default(cell.get('colspan'), 1) > 1
            for cell in self.cells_info
        )
        if has_span:
            return True

        return all(
            cell.get('row') is not None and cell.get('col') is not None
            for cell in self.cells_info
        )

    def _use_existing_cells_with_validation(self, num_rows: int, num_cols: int) -> List[Dict]:
        """Use existing cell info after validation.

        Uses already correctly calculated rowspan/colspan from TableDetectionEngine
        without recalculating, only validates the range.

        Args:
            num_rows: Number of data rows.
            num_cols: Number of data columns.

        Returns:
            Validated cells followed by default 1x1 cells for every position
            that no validated cell covers.
        """
        validated_cells: List[Dict] = []
        covered_positions: Set[Tuple[int, int]] = set()

        for cell in self.cells_info:
            # A row/col that is absent or None falls back to 0, so a cell
            # without position info cannot raise on the comparisons below.
            row = self._value_or_default(cell.get('row'), 0)
            col = self._value_or_default(cell.get('col'), 0)

            # Validate data range (negative anchors are invalid as well)
            if not (0 <= row < num_rows and 0 <= col < num_cols):
                continue

            # Check if position is already covered
            if (row, col) in covered_positions:
                continue

            # Normalise span to >= 1, then fit it within the data range
            rowspan = max(1, self._value_or_default(cell.get('rowspan'), 1))
            colspan = max(1, self._value_or_default(cell.get('colspan'), 1))
            rowspan = min(rowspan, num_rows - row)
            colspan = min(colspan, num_cols - col)

            validated_cells.append({
                'row': row,
                'col': col,
                'rowspan': rowspan,
                'colspan': colspan,
                'bbox': cell.get('bbox'),
            })
            self._mark_covered(
                covered_positions, row, col, rowspan, colspan, num_rows, num_cols
            )

        # Add missing cells (positions not covered by span)
        self._append_uncovered(validated_cells, covered_positions, num_rows, num_cols)
        return validated_cells

    def _analyze_with_bbox_grid(self) -> List[Dict]:
        """
        Perform precise grid analysis using bbox information.

        Algorithm:
        1. Extract grid lines from all cell bboxes
        2. Calculate how many grid cells each cell's bbox covers
        3. Determine rowspan/colspan

        Only cells whose bbox has at least 4 coordinates take part; the same
        set of cells is used for building the grid and for placement.

        Returns:
            Analyzed cells followed by default 1x1 cells for uncovered
            positions, or an empty list when no usable grid can be built.
        """
        boxed_cells = [
            cell for cell in self.cells_info
            if self._has_usable_bbox(cell.get('bbox'))
        ]

        # Extract grid lines
        h_lines: Set[float] = set()
        v_lines: Set[float] = set()
        for cell in boxed_cells:
            bbox = cell['bbox']
            # Y coordinates (horizontal lines)
            h_lines.add(round(bbox[1], 1))
            h_lines.add(round(bbox[3], 1))
            # X coordinates (vertical lines)
            v_lines.add(round(bbox[0], 1))
            v_lines.add(round(bbox[2], 1))

        if len(h_lines) < 2 or len(v_lines) < 2:
            return []

        # Sort and cluster
        self._h_grid_lines = self._cluster_and_sort(list(h_lines))
        self._v_grid_lines = self._cluster_and_sort(list(v_lines))

        # At least one grid row and one grid column are required
        if len(self._h_grid_lines) < 2 or len(self._v_grid_lines) < 2:
            return []

        # The data shape does not change while cells are processed
        num_data_rows, num_data_cols = self._data_shape()

        analyzed_cells: List[Dict] = []
        covered_positions: Set[Tuple[int, int]] = set()

        for cell in boxed_cells:
            bbox = cell['bbox']

            # Determine grid position from bbox
            row_start = self._find_grid_index(bbox[1], self._h_grid_lines)
            row_end = self._find_grid_index(bbox[3], self._h_grid_lines)
            col_start = self._find_grid_index(bbox[0], self._v_grid_lines)
            col_end = self._find_grid_index(bbox[2], self._v_grid_lines)

            if row_start is None or col_start is None:
                # Use original values if grid matching fails
                row_start = self._value_or_default(cell.get('row'), 0)
                col_start = self._value_or_default(cell.get('col'), 0)
                row_end = row_start + self._value_or_default(cell.get('rowspan'), 1)
                col_end = col_start + self._value_or_default(cell.get('colspan'), 1)
            else:
                # If end index is less than or equal to start, span is 1
                if row_end is None or row_end <= row_start:
                    row_end = row_start + 1
                if col_end is None or col_end <= col_start:
                    col_end = col_start + 1

            rowspan = max(1, row_end - row_start)
            colspan = max(1, col_end - col_start)

            # Grid rows/cols may differ from data rows/cols: clamp the
            # anchor into the data range (this yields 0 when data is empty).
            data_row = max(0, min(row_start, num_data_rows - 1))
            data_col = max(0, min(col_start, num_data_cols - 1))

            # Adjust span to data range
            rowspan = min(rowspan, num_data_rows - data_row)
            colspan = min(colspan, num_data_cols - data_col)

            # Check if position is already covered
            if (data_row, data_col) in covered_positions:
                continue

            analyzed_cells.append({
                'row': data_row,
                'col': data_col,
                'rowspan': max(1, rowspan),
                'colspan': max(1, colspan),
                'bbox': bbox,
            })
            self._mark_covered(
                covered_positions, data_row, data_col, rowspan, colspan,
                num_data_rows, num_data_cols,
            )

        # Add default cells for uncovered positions
        self._append_uncovered(
            analyzed_cells, covered_positions, num_data_rows, num_data_cols
        )
        return analyzed_cells

    def _cluster_and_sort(self, values: List[float],
                          tolerance: Optional[float] = None) -> List[float]:
        """Cluster and sort values.

        Sorted values are chained into one cluster while each value lies
        within ``tolerance`` of the previous one.

        Args:
            values: Coordinates to cluster.
            tolerance: Maximum gap between neighbours in a cluster;
                defaults to GRID_TOLERANCE.

        Returns:
            Average value of each cluster, in ascending order.
        """
        if not values:
            return []

        if tolerance is None:
            tolerance = self.GRID_TOLERANCE

        sorted_vals = sorted(values)
        clusters: List[List[float]] = [[sorted_vals[0]]]

        for val in sorted_vals[1:]:
            if val - clusters[-1][-1] <= tolerance:
                clusters[-1].append(val)
            else:
                clusters.append([val])

        # Return average value of each cluster
        return [sum(cluster) / len(cluster) for cluster in clusters]

    def _find_grid_index(self, value: float, grid_lines: List[float],
                         tolerance: Optional[float] = None) -> Optional[int]:
        """Find grid index corresponding to the value.

        The closest grid line wins (the first one on ties), provided it lies
        within twice the tolerance.

        Args:
            value: Coordinate to look up.
            grid_lines: Grid line coordinates.
            tolerance: Matching tolerance; defaults to GRID_TOLERANCE.

        Returns:
            Index of the closest grid line, or None when ``grid_lines`` is
            empty or the closest line is farther than 2x tolerance away.
        """
        if tolerance is None:
            tolerance = self.GRID_TOLERANCE

        if not grid_lines:
            return None

        # min() keeps the first index among equally close lines
        closest_idx = min(
            range(len(grid_lines)),
            key=lambda idx: abs(value - grid_lines[idx]),
        )

        if abs(value - grid_lines[closest_idx]) <= tolerance * 2:
            return closest_idx

        return None

    def _validate_and_enhance_cells(self) -> List[Dict]:
        """
        Validate and enhance existing cell info.

        - Fix spans that exceed data range
        - Remove duplicate cell info
        - Add missing cells

        Returns:
            Enhanced cells followed by default 1x1 cells for uncovered positions.
        """
        num_rows, num_cols = self._data_shape()

        enhanced_cells: List[Dict] = []
        covered_positions: Set[Tuple[int, int]] = set()

        # Process existing cell info
        for cell in self.cells_info:
            row = self._value_or_default(cell.get('row'), 0)
            col = self._value_or_default(cell.get('col'), 0)
            bbox = cell.get('bbox')

            # Validate range
            if not (0 <= row < num_rows and 0 <= col < num_cols):
                continue

            # Check if position is already covered
            if (row, col) in covered_positions:
                continue

            # Clamp the span to >= 1 before it is used for coverage; otherwise
            # a span <= 0 would leave the anchor uncovered and a second,
            # default cell would be emitted for the same position.
            rowspan = max(1, self._value_or_default(cell.get('rowspan'), 1))
            colspan = max(1, self._value_or_default(cell.get('colspan'), 1))
            rowspan = min(rowspan, num_rows - row)
            colspan = min(colspan, num_cols - col)

            # Text-based span verification (when bbox exists)
            if bbox and self.data:
                verified_rowspan, verified_colspan = self._verify_span_with_text_v2(
                    row, col, rowspan, colspan, bbox
                )
                rowspan = max(rowspan, verified_rowspan)
                colspan = max(colspan, verified_colspan)

            enhanced_cells.append({
                'row': row,
                'col': col,
                'rowspan': rowspan,
                'colspan': colspan,
                'bbox': bbox,
            })
            self._mark_covered(
                covered_positions, row, col, rowspan, colspan, num_rows, num_cols
            )

        # Add missing cells
        self._append_uncovered(enhanced_cells, covered_positions, num_rows, num_cols)
        return enhanced_cells

    def _verify_span_with_text_v2(
        self,
        row: int,
        col: int,
        rowspan: int,
        colspan: int,
        bbox: Tuple[float, float, float, float]
    ) -> Tuple[int, int]:
        """
        Verify span using text position.

        Logic:
        - If current cell has text
        - And adjacent cells (right / below) are empty
        - Extend span

        ``bbox`` is accepted for interface compatibility; the check itself
        only looks at cell text.

        Returns:
            (rowspan, colspan), possibly extended; the inputs are returned
            unchanged when the current cell has no text.
        """
        num_rows, num_cols = self._data_shape()

        # Check current cell value
        current_value = ""
        if row < len(self.data) and col < len(self.data[row]):
            current_value = self._text_at(row, col)

        if not current_value:
            return rowspan, colspan

        verified_rowspan = rowspan
        verified_colspan = colspan

        # Colspan verification: check empty cells to the right in same row
        for c in range(col + colspan, num_cols):
            if c >= len(self.data[row]):
                break
            if self._text_at(row, c):
                break
            # Empty cell -> treated as part of the merge. Whether the bbox
            # really extends to that column is not verified here.
            verified_colspan += 1

        # Rowspan verification: check empty cells below in same column
        for r in range(row + rowspan, num_rows):
            if col >= len(self.data[r]):
                break
            if self._text_at(r, col):
                break
            # Only extend when other columns of that row have values
            has_value_in_row = any(
                self._text_at(r, c)
                for c in range(len(self.data[r]))
                if c != col
            )
            if not has_value_in_row:
                break
            verified_rowspan += 1

        return verified_rowspan, verified_colspan

    def _create_default_cells(self, num_rows: int, num_cols: int) -> List[Dict]:
        """
        Create default cell info. Creates all cells as 1x1 without value-based inference.
        Value-based inference is disabled due to high error rates,
        prioritizing PyMuPDF's physical cell information instead.

        Empty cells are rendered as empty <td> elements in HTML generation.
        (Having empty cells is normal in table structures)
        """
        return [
            self._default_cell(row_idx, col_idx)
            for row_idx in range(num_rows)
            for col_idx in range(num_cols)
        ]
486
+
487
# ----------------------------------------------------------------------------
# Public API of this module
# ----------------------------------------------------------------------------

__all__ = ['CellAnalysisEngine']