xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,897 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py
2
+ """
3
+ PDF Table Processing Module
4
+
5
+ Provides functions for table extraction, merging, annotation integration,
6
+ and HTML conversion from PDF documents.
7
+ """
8
+ import copy
9
+ import logging
10
+ from typing import Any, Dict, List, Optional, Tuple, Set
11
+ from dataclasses import dataclass
12
+ from collections import defaultdict
13
+
14
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
15
+ TableDetectionStrategy,
16
+ ElementType,
17
+ PDFConfig,
18
+ PageElement,
19
+ PageBorderInfo,
20
+ CellInfo,
21
+ )
22
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
23
+ escape_html,
24
+ get_text_lines_with_positions,
25
+ )
26
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
27
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
28
+
29
+ logger = logging.getLogger("document-processor")
30
+
31
+
32
+ # ============================================================================
33
+ # Data Classes
34
+ # ============================================================================
35
+
36
@dataclass
class AnnotationInfo:
    """A single annotation (footnote, endnote or table note) found in a PDF."""

    # Annotation text content.
    text: str
    # Bounding box as a 4-tuple of floats.
    # NOTE(review): presumably (x0, y0, x1, y1) like TableInfo.bbox - confirm.
    bbox: Tuple[float, float, float, float]
    # Annotation kind: 'footnote', 'endnote' or 'table_note'.
    type: str
    # Index of the table this annotation belongs to, when known.
    related_table_idx: Optional[int] = None
43
+
44
+
45
@dataclass
class TableInfo:
    """Consolidated description of one detected table."""

    # Zero-based index of the page the table was detected on.
    page_num: int
    # Index of the table among the candidates detected on its page.
    table_idx: int
    # Table bounding box; used in this module as (x0, y0, x1, y1).
    bbox: Tuple[float, float, float, float]
    # Cell texts, one inner list per row.
    data: List[List[Optional[str]]]
    # Number of columns.
    col_count: int
    # Number of rows.
    row_count: int
    # Height of the page the table sits on.
    page_height: float
    # Per-cell layout dicts with keys 'row', 'col', 'rowspan', 'colspan', 'bbox'.
    cells_info: Optional[List[Dict]] = None
    # Annotations attached to the table, if any.
    annotations: Optional[List[AnnotationInfo]] = None
    # Detection strategy that produced the table.
    detection_strategy: Optional[TableDetectionStrategy] = None
    # Detection confidence reported for the table.
    confidence: float = 1.0
59
+
60
+
61
+ # ============================================================================
62
+ # Table Extraction
63
+ # ============================================================================
64
+
65
def extract_all_tables(
    doc,
    file_path: str,
    detect_page_border_func,
    is_table_likely_border_func
) -> Dict[int, List[PageElement]]:
    """
    Detect every table in a document and turn it into page elements.

    Pipeline:
    1. Run TableDetectionEngine on each page, dropping candidates that the
       supplied callbacks identify as the page border.
    2. Merge adjacent tables on the same page.
    3. Attach annotation / sub-header rows found in the page text.
    4. Propagate categories across tables continued on the next page.
    5. Emit a TEXT element for single-column tables and an HTML TABLE
       element for everything else.

    Failures while detecting on a page or converting a table are logged at
    debug level and skipped; they do not abort the extraction.

    Args:
        doc: PyMuPDF document object
        file_path: PDF file path
        detect_page_border_func: Callable taking a page and returning border info
        is_table_likely_border_func: Callable (bbox, border_info, page) -> truthy
            when the candidate is really the page border

    Returns:
        Dictionary mapping page numbers to the list of PageElements produced
        for that page
    """
    tables_by_page: Dict[int, List[PageElement]] = {}
    all_table_infos: List[TableInfo] = []

    # --- Detection -----------------------------------------------------------
    for page_num in range(len(doc)):
        page = doc[page_num]
        page_height = page.rect.height
        border_info = detect_page_border_func(page)

        try:
            engine = TableDetectionEngine(page, page_num, file_path)

            for idx, candidate in enumerate(engine.detect_tables()):
                is_page_border = border_info.has_border and is_table_likely_border_func(
                    candidate.bbox, border_info, page
                )
                if is_page_border:
                    logger.debug(f"[PDF] Skipping page border table: {candidate.bbox}")
                    continue

                # Flatten cell objects into plain dicts (None when no cells).
                cells_info = [
                    {
                        'row': cell.row,
                        'col': cell.col,
                        'rowspan': cell.rowspan,
                        'colspan': cell.colspan,
                        'bbox': cell.bbox
                    }
                    for cell in candidate.cells
                ] if candidate.cells else None

                all_table_infos.append(TableInfo(
                    page_num=page_num,
                    table_idx=idx,
                    bbox=candidate.bbox,
                    data=candidate.data,
                    col_count=candidate.col_count,
                    row_count=candidate.row_count,
                    page_height=page_height,
                    cells_info=cells_info,
                    detection_strategy=candidate.strategy,
                    confidence=candidate.confidence
                ))

        except Exception as e:
            # Best effort: tables already collected for this page are kept.
            logger.debug(f"[PDF] Error detecting tables on page {page_num}: {e}")
            continue

    # --- Merge, annotate, cross-page continuity --------------------------------
    merged_tables = merge_adjacent_tables(all_table_infos)
    annotated_tables = find_and_insert_annotations(doc, merged_tables)
    processed_tables = process_table_continuity(annotated_tables)

    # --- Conversion to page elements -------------------------------------------
    single_col_count = 0
    real_table_count = 0

    for table_info in processed_tables:
        try:
            page_num = table_info.page_num
            page_elements = tables_by_page.setdefault(page_num, [])

            as_text = is_single_column_table(table_info)
            if as_text:
                # n rows x 1 column reads better as a text list.
                content = convert_single_column_to_text(table_info)
            else:
                content = convert_table_to_html(table_info)

            if not content:
                continue

            page_elements.append(PageElement(
                element_type=ElementType.TEXT if as_text else ElementType.TABLE,
                content=content,
                bbox=table_info.bbox,
                page_num=page_num
            ))

            if as_text:
                single_col_count += 1
            else:
                real_table_count += 1

        except Exception as e:
            logger.debug(f"[PDF] Error converting table to HTML: {e}")
            continue

    if single_col_count > 0:
        logger.info(f"[PDF] Converted {single_col_count} single-column tables to text")
    logger.info(f"[PDF] Extracted {real_table_count} tables from {len(tables_by_page)} pages")
    return tables_by_page
202
+
203
+
204
+ # ============================================================================
205
+ # Annotation Integration
206
+ # ============================================================================
207
+
208
def find_and_insert_annotations(doc, tables: List[TableInfo]) -> List[TableInfo]:
    """
    Attach footer notes and sub-header rows found in the page text to tables.

    Two detection passes are run for every table:
    1. Footer notes: text lines whose top lies between 3 units above the
       table's bottom edge and PDFConfig.ANNOTATION_Y_MARGIN below it, that
       fit inside the table's horizontal extent (10-unit tolerance), start
       more than 20 units above the next table on the page, and begin with
       one of PDFConfig.ANNOTATION_PATTERNS.
    2. Sub-header rows: for tables with at least two rows whose second row
       does not already contain "(A)" or "(B)", text lines located in the
       estimated band between the first and second row that contain "(A)"
       or "(B)".

    Sub-header detection always works on the table as it was detected
    (rows, row count and bbox before any footer row is appended), so the
    row-height estimate is not skewed by a footer added in pass 1.

    Args:
        doc: PyMuPDF document object
        tables: List of TableInfo

    Returns:
        List of TableInfo (with annotation rows added where found), sorted
        by page number and top coordinate. An empty input is returned as is.
    """
    if not tables:
        return tables

    tables_by_page: Dict[int, List[TableInfo]] = defaultdict(list)
    for table in tables:
        tables_by_page[table.page_num].append(table)

    # str.startswith accepts a tuple of prefixes; build it once.
    annotation_prefixes = tuple(PDFConfig.ANNOTATION_PATTERNS)

    result = []

    for page_num, page_tables in tables_by_page.items():
        page = doc[page_num]
        page_height = page.rect.height

        sorted_tables = sorted(page_tables, key=lambda t: t.bbox[1])
        text_lines = get_text_lines_with_positions(page)

        for i, table in enumerate(sorted_tables):
            table_left, table_top, table_right, table_bottom = table.bbox
            x_min = table_left - 10
            x_max = table_right + 10

            # Snapshot the detected grid: the geometry above describes these
            # rows only, not a footer row that pass 1 may append.
            detected_rows = table.data
            detected_row_count = table.row_count

            next_table_top = (
                sorted_tables[i + 1].bbox[1] if i + 1 < len(sorted_tables) else page_height
            )

            # 1. Annotation rows right below the table, before the next table
            footer_max_y = table_bottom + PDFConfig.ANNOTATION_Y_MARGIN
            annotation_lines = [
                line for line in text_lines
                if table_bottom - 3 <= line['y0'] <= footer_max_y
                and line['x0'] >= x_min and line['x1'] <= x_max
                and line['y0'] < next_table_top - 20
                and line['text'].startswith(annotation_prefixes)
            ]

            if annotation_lines:
                table = add_annotation_to_table(table, annotation_lines, 'footer')
                logger.debug(f"[PDF] Added annotation to table on page {page_num + 1}")

            # 2. Sub-header rows (e.g., (A), (B)) - only when none exists yet
            if detected_row_count >= 2 and detected_rows:
                has_subheader = len(detected_rows) >= 2 and any(
                    cell and ('(A)' in str(cell) or '(B)' in str(cell))
                    for cell in detected_rows[1]
                )

                if not has_subheader:
                    row_height_estimate = (table_bottom - table_top) / detected_row_count
                    band_top = table_top + row_height_estimate - 5
                    band_bottom = table_top + row_height_estimate * 2 - 5

                    subheader_lines = [
                        line for line in text_lines
                        if band_top <= line['y0'] <= band_bottom
                        and line['x0'] >= x_min and line['x1'] <= x_max
                        and ('(A)' in line['text'] or '(B)' in line['text'])
                    ]

                    if subheader_lines:
                        table = add_annotation_to_table(table, subheader_lines, 'subheader')
                        logger.debug(f"[PDF] Added subheader to table on page {page_num + 1}")

            result.append(table)

    result.sort(key=lambda t: (t.page_num, t.bbox[1]))
    return result
296
+
297
+
298
def add_annotation_to_table(
    table: TableInfo,
    text_lines: List[Dict],
    position: str
) -> TableInfo:
    """
    Return a copy of the table with one extra row built from text lines.

    Each line is assigned to a column by comparing its left edge with
    equal-width column slots derived from the table bbox; lines landing in
    the same column are joined with a space. The row is inserted as the
    second row for position 'subheader' (appended when the table has no
    rows) and appended as the last row otherwise. The bbox is extended
    vertically to cover the lines. The input table is not modified.

    Degenerate tables are handled without raising: a table with no columns
    is returned unchanged, and a zero-width table places every line in the
    first column.

    Args:
        table: TableInfo object
        text_lines: List of text line dictionaries with keys
            'text', 'x0', 'y0', 'y1'
        position: 'footer' or 'subheader'

    Returns:
        Updated TableInfo (the original object when there is nothing to add)
    """
    from dataclasses import replace

    if not text_lines:
        return table

    col_count = table.col_count
    if col_count <= 0:
        # No column to place the text in; previously this raised IndexError.
        return table

    text_lines_sorted = sorted(text_lines, key=lambda line: line['x0'])

    table_left = table.bbox[0]
    col_width = (table.bbox[2] - table_left) / col_count

    new_row = [''] * col_count

    for line in text_lines_sorted:
        # A zero-width bbox cannot be split into slots: use the first column.
        raw_idx = int((line['x0'] - table_left) / col_width) if col_width else 0
        col_idx = max(0, min(raw_idx, col_count - 1))

        if new_row[col_idx]:
            new_row[col_idx] += " " + line['text']
        else:
            new_row[col_idx] = line['text']

    non_empty_cols = sum(1 for c in new_row if c)
    if non_empty_cols == 1 and new_row[0]:
        combined_text = " ".join(line['text'] for line in text_lines_sorted)
        new_row = [combined_text] + [''] * (col_count - 1)

    new_data = list(table.data)
    new_cells_info = list(table.cells_info) if table.cells_info else []

    if position == 'subheader' and new_data:
        new_data.insert(1, new_row)

        # Existing cells from row 1 downwards move one row down.
        new_cells_info = [
            dict(cell, row=cell['row'] + 1) if cell['row'] >= 1 else cell
            for cell in new_cells_info
        ]
        # The new sub-header row is made of plain 1x1 cells.
        # NOTE(review): when the table had no cells_info this leaves cell
        # info for row 1 only - confirm the HTML generator copes with that.
        new_cells_info.extend(
            {
                'row': 1,
                'col': col_idx,
                'rowspan': 1,
                'colspan': 1,
                'bbox': None
            }
            for col_idx in range(col_count)
        )
    else:
        # Footer row cell info is handled in generate_html_from_cells
        new_data.append(new_row)

    all_y = [line['y0'] for line in text_lines] + [line['y1'] for line in text_lines]

    new_bbox = (
        table.bbox[0],
        min(table.bbox[1], min(all_y)),
        table.bbox[2],
        max(table.bbox[3], max(all_y))
    )

    # replace() copies every other field, so none can be forgotten.
    return replace(
        table,
        bbox=new_bbox,
        data=new_data,
        row_count=len(new_data),
        cells_info=new_cells_info or None,
    )
400
+
401
+
402
+ # ============================================================================
403
+ # Table Merging
404
+ # ============================================================================
405
+
406
def merge_adjacent_tables(tables: List[TableInfo]) -> List[TableInfo]:
    """
    Collapse runs of vertically adjacent tables on the same page.

    Tables are grouped per page and walked top to bottom. Each table is
    folded into the running merge result for as long as should_merge_tables
    accepts the pair; the first rejected table starts a new run.

    Args:
        tables: List of TableInfo

    Returns:
        List of TableInfo sorted by page number and top coordinate
        (the input itself when it is empty)
    """
    if not tables:
        return tables

    per_page: Dict[int, List[TableInfo]] = defaultdict(list)
    for entry in tables:
        per_page[entry.page_num].append(entry)

    merged_result = []

    for page_num, group in per_page.items():
        top_to_bottom = iter(sorted(group, key=lambda t: t.bbox[1]))

        # Every group holds at least one table, so next() cannot fail here.
        running = next(top_to_bottom)
        for candidate in top_to_bottom:
            if should_merge_tables(running, candidate):
                running = do_merge_tables(running, candidate)
                logger.debug(f"[PDF] Merged adjacent tables on page {page_num + 1}")
            else:
                merged_result.append(running)
                running = candidate
        merged_result.append(running)

    merged_result.sort(key=lambda t: (t.page_num, t.bbox[1]))
    return merged_result
448
+
449
+
450
def should_merge_tables(t1: TableInfo, t2: TableInfo) -> bool:
    """
    Decide whether t2 is a direct continuation of t1 on the same page.

    The tables are merged when all of the following hold:
    - both are on the same page;
    - t2 starts between 0 and 30 units below the bottom of t1;
    - their horizontal overlap is at least 80% of the wider table
      (the width is floored at 1 to avoid dividing by zero);
    - they have the same column count, or t1 is a single row with fewer
      columns than t2.

    Args:
        t1: Upper table
        t2: Lower table

    Returns:
        True if the tables should be merged, False otherwise
    """
    if t1.page_num != t2.page_num:
        return False

    left1, _, right1, bottom1 = t1.bbox
    left2, top2, right2, _ = t2.bbox

    # Vertical gap: t2 must start at or just below the end of t1.
    vertical_gap = top2 - bottom1
    if vertical_gap < 0 or vertical_gap > 30:
        return False

    # Horizontal alignment: shared width relative to the wider table.
    shared_width = max(0, min(right1, right2) - max(left1, left2))
    widest = max(right1 - left1, right2 - left2, 1)
    if shared_width / widest < 0.8:
        return False

    same_columns = t1.col_count == t2.col_count
    narrow_single_row = t1.row_count == 1 and t1.col_count < t2.col_count
    return same_columns or narrow_single_row
485
+
486
+
487
def do_merge_tables(t1: TableInfo, t2: TableInfo) -> TableInfo:
    """
    Merge two vertically adjacent tables into one.

    The merged table spans both bounding boxes, has the larger of the two
    column counts, and contains the rows of t1 followed by the rows of t2,
    each right-padded with '' up to the merged column count.

    When t1 is a single header row with fewer columns than t2, the missing
    columns are absorbed by a spanning header cell: the second header cell,
    or the only cell when the header has a single column. Synthetic cell
    info (bbox=None) is generated for that header row and replaces t1's
    own cell info. Otherwise t1's cell info is copied as is. t2's cell
    info is copied with its row indices shifted below t1's rows.

    Page number, table index, page height and detection strategy are taken
    from t1; the confidence is the higher of the two; annotations of both
    tables are concatenated.

    Args:
        t1: Upper table
        t2: Lower table

    Returns:
        Merged TableInfo (cells_info is None when no cell info is available)
    """
    from dataclasses import replace

    merged_bbox = (
        min(t1.bbox[0], t2.bbox[0]),
        t1.bbox[1],
        max(t1.bbox[2], t2.bbox[2]),
        t2.bbox[3]
    )
    merged_col_count = max(t1.col_count, t2.col_count)

    def pad_row(row):
        """Return a copy of the row right-padded to the merged width."""
        cells = list(row)
        return cells + [''] * (merged_col_count - len(cells))

    merged_data = []
    merged_cells = []

    if t1.col_count < merged_col_count and t1.row_count == 1 and t1.data:
        # Header row with fewer columns: widen one cell via colspan.
        header_row = list(t1.data[0])
        extra_cols = merged_col_count - t1.col_count
        # The second cell takes the extra columns; a one-cell header spans
        # the whole table (previously it was left one cell wide).
        span_idx = 1 if len(header_row) > 1 else 0

        new_header = []
        col_position = 0
        for orig_col_idx, value in enumerate(header_row):
            colspan = 1 + extra_cols if orig_col_idx == span_idx else 1
            new_header.append(value)
            new_header.extend([''] * (colspan - 1))
            merged_cells.append({
                'row': 0,
                'col': col_position,
                'rowspan': 1,
                'colspan': colspan,
                'bbox': None
            })
            col_position += colspan

        # Guarantee the full width even for an empty or short header row.
        merged_data.append(pad_row(new_header))
    else:
        merged_data.extend(pad_row(row) for row in t1.data)
        if t1.cells_info:
            merged_cells.extend(t1.cells_info)

    # Rows of t2 start right after the rows actually emitted for t1.
    row_offset = len(merged_data)
    merged_data.extend(pad_row(row) for row in t2.data)

    if t2.cells_info:
        merged_cells.extend(
            dict(cell, row=cell.get('row', 0) + row_offset)
            for cell in t2.cells_info
        )

    # NOTE(review): when only one of the tables carries cell info the merged
    # cell info is partial - confirm downstream consumers tolerate that.
    merged_annotations = [*(t1.annotations or []), *(t2.annotations or [])] or None

    # If cell info is empty, set to None (handled by CellAnalysisEngine)
    return replace(
        t1,
        bbox=merged_bbox,
        data=merged_data,
        col_count=merged_col_count,
        row_count=len(merged_data),
        cells_info=merged_cells or None,
        annotations=merged_annotations,
        confidence=max(t1.confidence, t2.confidence),
    )
596
+
597
+
598
+ # ============================================================================
599
+ # Table Continuity Processing
600
+ # ============================================================================
601
+
602
def process_table_continuity(all_tables: List[TableInfo]) -> List[TableInfo]:
    """
    Carry first-column categories over to tables continued on a later page.

    A table is treated as the continuation of the previous one when it is
    on a later page, the previous table ends in the bottom 30% of its page,
    this table starts in the top 30% of its page, and both have the same
    column count. In a continuation, rows whose first cell is blank but
    whose second cell is not receive the last category seen so far; rows
    with a non-blank first cell update that category.

    The category is always kept as a stripped string, matching what
    extract_last_category returns, so filled-in cells look the same
    whichever row the category came from.

    The input tables are not modified: each one is copied with a deep copy
    of its row data before any cell is filled in.

    Args:
        all_tables: List of all TableInfo, in document order

    Returns:
        Processed list of TableInfo (the input itself when it is empty)
    """
    from dataclasses import replace

    if not all_tables:
        return all_tables

    result = []
    last_category = None

    for i, source_table in enumerate(all_tables):
        # replace() keeps every other field, so new fields are never lost.
        table_info = replace(source_table, data=copy.deepcopy(source_table.data))
        curr_data = table_info.data

        if i == 0:
            last_category = extract_last_category(curr_data)
            result.append(table_info)
            continue

        prev_table = all_tables[i - 1]

        is_continuation = (
            table_info.page_num > prev_table.page_num and
            prev_table.bbox[3] > prev_table.page_height * 0.7 and
            table_info.bbox[1] < table_info.page_height * 0.3 and
            table_info.col_count == prev_table.col_count
        )

        if is_continuation and last_category:
            for row in curr_data:
                if len(row) < 2:
                    continue

                first_text = str(row[0]).strip() if row[0] else ''
                second_text = str(row[1]).strip() if row[1] else ''

                if first_text:
                    last_category = first_text
                elif second_text:
                    # Blank category cell on a continued table: inherit it.
                    row[0] = last_category
        else:
            new_last = extract_last_category(curr_data)
            if new_last:
                last_category = new_last

        result.append(table_info)

    return result
667
+
668
+
669
def extract_last_category(table_data: List[List[Optional[str]]]) -> Optional[str]:
    """
    Extract last category from table.

    The category of a row is its first cell, converted to ``str`` and
    stripped. Rows that are empty or whose first cell is falsy or
    whitespace-only do not contribute.

    Args:
        table_data: Table data

    Returns:
        Category of the last contributing row, or None when there is none
    """
    if not table_data:
        return None

    # Collect every usable first-column label in row order; the answer is
    # simply the final one.
    labels = [
        str(row[0]).strip()
        for row in table_data
        if len(row) >= 1 and row[0] and str(row[0]).strip()
    ]

    return labels[-1] if labels else None
689
+
690
+
691
+ # ============================================================================
692
+ # HTML Conversion
693
+ # ============================================================================
694
+
695
def is_single_column_table(table_info: TableInfo) -> bool:
    """
    Report whether a table consists of a single column (n rows x 1 column).

    Such tables are often not real tables, so callers may prefer to render
    them as a plain text list.

    Args:
        table_info: Table information; only ``table_info.data`` is read

    Returns:
        True when the widest row has exactly one cell, False otherwise
        (including when the table has no data)
    """
    rows = table_info.data

    if not rows:
        return False

    # The table is single-column only if no row is wider than one cell
    # and at least one row actually has a cell.
    widest_row = max(map(len, rows))
    return widest_row == 1
718
+
719
+
720
def convert_single_column_to_text(table_info: TableInfo) -> str:
    """
    Render a single-column table as newline-separated text.

    Only the first cell of each row is used. Empty rows, falsy cells and
    cells that are blank after stripping are left out.

    Args:
        table_info: Table information; only ``table_info.data`` is read

    Returns:
        The stripped first-cell texts joined with newlines, or an empty
        string when the table has no data
    """
    rows = table_info.data

    if not rows:
        return ""

    # Lazily narrow down: non-empty rows -> truthy first cells -> stripped
    # text -> non-blank text.
    first_cells = (row[0] for row in rows if row)
    stripped_texts = (str(cell).strip() for cell in first_cells if cell)

    return '\n'.join(text for text in stripped_texts if text)
746
+
747
+
748
def convert_table_to_html(table_info: TableInfo) -> str:
    """
    Converts a table to HTML.

    The table is analysed with CellAnalysisEngine and the resulting cell
    list is handed to generate_html_from_cells together with the table's
    row count and the width of its widest row.

    Args:
        table_info: Table information

    Returns:
        HTML string, or an empty string when the table has no data or
        every row is empty
    """
    rows = table_info.data

    if not rows:
        return ""

    # Rows may be ragged, so the widest one defines the column count.
    widest_row = max(map(len, rows))
    if widest_row == 0:
        return ""

    # Cell analysis (spans etc.) is delegated to CellAnalysisEngine.
    analyzed_cells = CellAnalysisEngine(table_info, None).analyze()

    return generate_html_from_cells(rows, analyzed_cells, len(rows), widest_row)
782
+
783
+
784
def generate_html_from_cells(
    data: List[List[Optional[str]]],
    cells_info: List[Dict],
    num_rows: int,
    num_cols: int
) -> str:
    """
    Build an HTML ``<table>`` from table data and per-cell span info.

    Every grid position in ``num_rows`` x ``num_cols`` is emitted unless it
    is covered by another cell's span. Positions without an entry in
    ``cells_info`` are rendered with no span attributes, and positions
    beyond the end of a short row are rendered as empty cells. Cell text
    is stripped and passed through ``escape_html``.

    Rows whose first cell starts with one of
    ``PDFConfig.ANNOTATION_PATTERNS`` are rendered as a single cell
    spanning all columns. Cells in the first row use ``<th>``; all other
    rows use ``<td>``.

    Args:
        data: Table data
        cells_info: Cell information list; each dict may provide 'row',
            'col', 'rowspan' and 'colspan'
        num_rows: Number of rows
        num_cols: Number of columns

    Returns:
        HTML string
    """
    # Pass 1: origin position -> span sizes, clamped to the table bounds.
    # A later entry for the same position replaces an earlier one.
    span_map: Dict[Tuple[int, int], Dict] = {}

    for cell in cells_info:
        origin_row = cell.get('row', 0)
        origin_col = cell.get('col', 0)
        rowspan = max(1, cell.get('rowspan', 1))
        colspan = max(1, cell.get('colspan', 1))

        # Cells that start outside the grid are ignored.
        if origin_row >= num_rows or origin_col >= num_cols:
            continue

        span_map[(origin_row, origin_col)] = {
            'rowspan': min(rowspan, num_rows - origin_row),
            'colspan': min(colspan, num_cols - origin_col)
        }

    # Pass 2: every position covered by a span, except the span's origin,
    # must not be emitted as its own cell.
    skip_set: Set[Tuple[int, int]] = set()

    for (origin_row, origin_col), spans in span_map.items():
        row_end = min(origin_row + spans['rowspan'], num_rows)
        col_end = min(origin_col + spans['colspan'], num_cols)
        skip_set.update(
            (r, c)
            for r in range(origin_row, row_end)
            for c in range(origin_col, col_end)
            if (r, c) != (origin_row, origin_col)
        )

    # Pass 3: annotation rows override whatever span the first cell had and
    # swallow the rest of their row.
    for row_idx, row in enumerate(data):
        if not row:
            continue

        first_val = str(row[0]).strip() if row[0] else ""
        if any(first_val.startswith(pattern) for pattern in PDFConfig.ANNOTATION_PATTERNS):
            span_map[(row_idx, 0)] = {'rowspan': 1, 'colspan': num_cols}
            skip_set.update((row_idx, col_idx) for col_idx in range(1, num_cols))

    # Pass 4: emit the markup row by row.
    no_span = {'rowspan': 1, 'colspan': 1}
    html_parts = ["<table>"]

    for row_idx in range(num_rows):
        html_parts.append(" <tr>")

        row_data = data[row_idx] if row_idx < len(data) else []
        # First row is treated as header
        tag = "th" if row_idx == 0 else "td"

        for col_idx in range(num_cols):
            if (row_idx, col_idx) in skip_set:
                continue

            # Short rows are padded with empty cells.
            raw_value = row_data[col_idx] if col_idx < len(row_data) else ""
            content = escape_html(str(raw_value).strip() if raw_value else "")

            spans = span_map.get((row_idx, col_idx), no_span)
            attrs = []
            if spans['rowspan'] > 1:
                attrs.append(f'rowspan="{spans["rowspan"]}"')
            if spans['colspan'] > 1:
                attrs.append(f'colspan="{spans["colspan"]}"')

            attr_str = " " + " ".join(attrs) if attrs else ""
            html_parts.append(f" <{tag}{attr_str}>{content}</{tag}>")

        html_parts.append(" </tr>")

    html_parts.append("</table>")
    return "\n".join(html_parts)
897
+