xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,897 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py
|
|
2
|
+
"""
|
|
3
|
+
PDF Table Processing Module
|
|
4
|
+
|
|
5
|
+
Provides functions for table extraction, merging, annotation integration,
|
|
6
|
+
and HTML conversion from PDF documents.
|
|
7
|
+
"""
|
|
8
|
+
import copy
|
|
9
|
+
import logging
|
|
10
|
+
from typing import Any, Dict, List, Optional, Tuple, Set
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from collections import defaultdict
|
|
13
|
+
|
|
14
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.types import (
|
|
15
|
+
TableDetectionStrategy,
|
|
16
|
+
ElementType,
|
|
17
|
+
PDFConfig,
|
|
18
|
+
PageElement,
|
|
19
|
+
PageBorderInfo,
|
|
20
|
+
CellInfo,
|
|
21
|
+
)
|
|
22
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
|
|
23
|
+
escape_html,
|
|
24
|
+
get_text_lines_with_positions,
|
|
25
|
+
)
|
|
26
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
|
|
27
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger("document-processor")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ============================================================================
|
|
33
|
+
# Data Classes
|
|
34
|
+
# ============================================================================
|
|
35
|
+
|
|
36
|
+
@dataclass
class AnnotationInfo:
    """Annotation/footnote/endnote info attached to a detected table."""
    # Raw annotation text as extracted from the page.
    text: str
    # (x0, y0, x1, y1) position on the page, in PDF points.
    bbox: Tuple[float, float, float, float]
    # One of: 'footnote', 'endnote', 'table_note'.
    type: str
    # Index of the table this note belongs to, when it could be associated.
    related_table_idx: Optional[int] = None
|
|
44
|
+
|
|
45
|
+
@dataclass
class TableInfo:
    """Final table info produced by the detection/merge/annotation pipeline."""
    # 0-based page index the table was found on.
    page_num: int
    # Index of the table within its page (detection order).
    table_idx: int
    # (x0, y0, x1, y1) of the table region, in PDF points.
    bbox: Tuple[float, float, float, float]
    # Row-major cell text; None for cells with no extracted content.
    data: List[List[Optional[str]]]
    col_count: int
    row_count: int
    # Height of the owning page; used by cross-page continuity checks.
    page_height: float
    # Per-cell span dicts: {'row', 'col', 'rowspan', 'colspan', 'bbox'}.
    cells_info: Optional[List[Dict]] = None
    # Notes/footnotes associated with this table, if any were found.
    annotations: Optional[List[AnnotationInfo]] = None
    # Which detection strategy produced this table.
    detection_strategy: Optional[TableDetectionStrategy] = None
    # Detection confidence score (presumably in [0, 1] — TODO confirm).
    confidence: float = 1.0
|
|
60
|
+
|
|
61
|
+
# ============================================================================
|
|
62
|
+
# Table Extraction
|
|
63
|
+
# ============================================================================
|
|
64
|
+
|
|
65
|
+
def extract_all_tables(
    doc,
    file_path: str,
    detect_page_border_func,
    is_table_likely_border_func
) -> Dict[int, List[PageElement]]:
    """
    Extracts tables from the entire document.

    Strategy:
        1. Multi-strategy table detection (per page)
        2. Select best result based on confidence
        3. Cell analysis and merge cell processing
        4. Annotation integration
        5. Cross-page continuity handling

    Args:
        doc: PyMuPDF document object
        file_path: PDF file path
        detect_page_border_func: Function to detect page borders
        is_table_likely_border_func: Function to check if table is a border

    Returns:
        Dictionary mapping page numbers to list of table PageElements
    """
    tables_by_page: Dict[int, List[PageElement]] = {}
    all_table_infos: List[TableInfo] = []

    # Step 1: Detect tables on each page
    for page_num in range(len(doc)):
        page = doc[page_num]
        page_height = page.rect.height

        # Detect page border so border rectangles are not mistaken for tables
        border_info = detect_page_border_func(page)

        try:
            # Use table detection engine
            detection_engine = TableDetectionEngine(page, page_num, file_path)
            candidates = detection_engine.detect_tables()

            for idx, candidate in enumerate(candidates):
                # Check if the candidate overlaps with the page border
                if border_info.has_border and is_table_likely_border_func(
                    candidate.bbox, border_info, page
                ):
                    logger.debug(f"[PDF] Skipping page border table: {candidate.bbox}")
                    continue

                # Convert cell info objects to plain dictionaries
                cells_info = None
                if candidate.cells:
                    cells_info = [
                        {
                            'row': cell.row,
                            'col': cell.col,
                            'rowspan': cell.rowspan,
                            'colspan': cell.colspan,
                            'bbox': cell.bbox
                        }
                        for cell in candidate.cells
                    ]

                table_info = TableInfo(
                    page_num=page_num,
                    table_idx=idx,
                    bbox=candidate.bbox,
                    data=candidate.data,
                    col_count=candidate.col_count,
                    row_count=candidate.row_count,
                    page_height=page_height,
                    cells_info=cells_info,
                    detection_strategy=candidate.strategy,
                    confidence=candidate.confidence
                )

                all_table_infos.append(table_info)

        except Exception as e:
            # A detection failure on one page must not abort the whole document.
            logger.debug(f"[PDF] Error detecting tables on page {page_num}: {e}")
            continue

    # Step 2: Merge adjacent tables
    merged_tables = merge_adjacent_tables(all_table_infos)

    # Step 3: Find and insert annotations
    merged_tables = find_and_insert_annotations(doc, merged_tables)

    # Step 4: Handle table continuity across pages
    processed_tables = process_table_continuity(merged_tables)

    # Step 5: HTML conversion and PageElement creation.
    # Single-column tables are emitted as TEXT, 2+ columns as TABLE.
    single_col_count = 0
    real_table_count = 0

    for table_info in processed_tables:
        try:
            page_num = table_info.page_num

            if page_num not in tables_by_page:
                tables_by_page[page_num] = []

            # Check if single-column table
            if is_single_column_table(table_info):
                # Single-column table: convert to text list as TEXT type
                text_content = convert_single_column_to_text(table_info)

                if text_content:
                    tables_by_page[page_num].append(PageElement(
                        element_type=ElementType.TEXT,
                        content=text_content,
                        bbox=table_info.bbox,
                        page_num=page_num
                    ))
                    single_col_count += 1
            else:
                # 2+ columns: convert to an HTML table
                html_table = convert_table_to_html(table_info)

                if html_table:
                    tables_by_page[page_num].append(PageElement(
                        element_type=ElementType.TABLE,
                        content=html_table,
                        bbox=table_info.bbox,
                        page_num=page_num
                    ))
                    real_table_count += 1

        except Exception as e:
            # Best-effort conversion: skip tables that fail to render.
            logger.debug(f"[PDF] Error converting table to HTML: {e}")
            continue

    if single_col_count > 0:
        logger.info(f"[PDF] Converted {single_col_count} single-column tables to text")
    logger.info(f"[PDF] Extracted {real_table_count} tables from {len(tables_by_page)} pages")
    return tables_by_page
|
|
203
|
+
|
|
204
|
+
# ============================================================================
|
|
205
|
+
# Annotation Integration
|
|
206
|
+
# ============================================================================
|
|
207
|
+
|
|
208
|
+
def find_and_insert_annotations(doc, tables: List[TableInfo]) -> List[TableInfo]:
    """
    Finds and integrates annotations/footnotes/endnotes inside and after tables.

    Detection patterns:
        1. Rows starting with "Note)" etc. right after a table
        2. Subheader rows inside a table (e.g., (A), (B))
        3. Footnote/endnote markers (e.g. '*' — original marker examples were
           mojibake in the source; presumably asterisks/daggers — TODO confirm)

    Args:
        doc: PyMuPDF document object
        tables: List of TableInfo

    Returns:
        Updated list of TableInfo with annotations
    """
    if not tables:
        return tables

    result = []
    tables_by_page: Dict[int, List[TableInfo]] = defaultdict(list)

    for table in tables:
        tables_by_page[table.page_num].append(table)

    for page_num, page_tables in tables_by_page.items():
        page = doc[page_num]
        page_height = page.rect.height

        # Top-to-bottom order so "next table" lookups are meaningful.
        sorted_tables = sorted(page_tables, key=lambda t: t.bbox[1])
        text_lines = get_text_lines_with_positions(page)

        for i, table in enumerate(sorted_tables):
            table_top = table.bbox[1]
            table_bottom = table.bbox[3]
            table_left = table.bbox[0]
            table_right = table.bbox[2]

            # Lower search bound: the next table's top, or the page bottom.
            next_table_top = sorted_tables[i + 1].bbox[1] if i + 1 < len(sorted_tables) else page_height

            # 1. Find annotation rows right after the table
            annotation_lines = []
            for line in text_lines:
                # Right below the table, before the next table
                if table_bottom - 3 <= line['y0'] <= table_bottom + PDFConfig.ANNOTATION_Y_MARGIN:
                    # Horizontally contained within the table (10pt tolerance)
                    if line['x0'] >= table_left - 10 and line['x1'] <= table_right + 10:
                        if line['y0'] < next_table_top - 20:
                            # Check annotation pattern
                            for pattern in PDFConfig.ANNOTATION_PATTERNS:
                                if line['text'].startswith(pattern):
                                    annotation_lines.append(line)
                                    break

            if annotation_lines:
                table = add_annotation_to_table(table, annotation_lines, 'footer')
                logger.debug(f"[PDF] Added annotation to table on page {page_num + 1}")

            # 2. Find subheader rows (e.g., (A), (B)) - only when no subheader exists
            has_subheader = False
            if table.row_count >= 2 and table.data and len(table.data) >= 2:
                # Check if the second row already matches the subheader pattern
                second_row = table.data[1] if len(table.data) > 1 else []
                for cell in second_row:
                    if cell and ('(A)' in str(cell) or '(B)' in str(cell)):
                        has_subheader = True
                        break

            if not has_subheader and table.row_count >= 2 and table.data:
                # Estimate row geometry by dividing the table height evenly;
                # approximate since real row heights may vary.
                row_height_estimate = (table_bottom - table_top) / table.row_count
                header_bottom_estimate = table_top + row_height_estimate
                second_row_top_estimate = table_top + row_height_estimate * 2

                subheader_lines = []
                for line in text_lines:
                    if header_bottom_estimate - 5 <= line['y0'] <= second_row_top_estimate - 5:
                        if line['x0'] >= table_left - 10 and line['x1'] <= table_right + 10:
                            # Check (A), (B) pattern
                            if '(A)' in line['text'] or '(B)' in line['text']:
                                subheader_lines.append(line)

                if subheader_lines:
                    table = add_annotation_to_table(table, subheader_lines, 'subheader')
                    logger.debug(f"[PDF] Added subheader to table on page {page_num + 1}")

            result.append(table)

    # Restore global document order (page, then vertical position).
    result.sort(key=lambda t: (t.page_num, t.bbox[1]))
    return result
|
|
297
|
+
|
|
298
|
+
def add_annotation_to_table(
    table: TableInfo,
    text_lines: List[Dict],
    position: str
) -> TableInfo:
    """
    Adds annotation rows to a table, returning a new TableInfo.

    The text lines are distributed across the table's columns by their
    horizontal position (assuming evenly-spaced columns), then inserted
    either after the header row ('subheader') or appended at the end
    ('footer').

    Args:
        table: TableInfo object
        text_lines: List of text line dictionaries (keys: 'text', 'x0',
            'x1', 'y0', 'y1')
        position: 'footer' or 'subheader'

    Returns:
        Updated TableInfo (the input object is not mutated)
    """
    if not text_lines:
        return table

    # Left-to-right order so multi-fragment lines land in column order.
    text_lines_sorted = sorted(text_lines, key=lambda l: l['x0'])

    table_width = table.bbox[2] - table.bbox[0]
    # Assumes uniform column widths — approximate; real columns may differ.
    col_width = table_width / table.col_count if table.col_count > 0 else table_width

    new_row = [''] * table.col_count

    for line in text_lines_sorted:
        # Map the line's left edge to a column index, clamped to valid range.
        relative_x = line['x0'] - table.bbox[0]
        col_idx = min(int(relative_x / col_width), table.col_count - 1)
        col_idx = max(0, col_idx)

        if new_row[col_idx]:
            new_row[col_idx] += " " + line['text']
        else:
            new_row[col_idx] = line['text']

    # If everything landed in the first column, merge all fragments into one
    # cell so the annotation reads as a single sentence.
    non_empty_cols = sum(1 for c in new_row if c)
    if non_empty_cols == 1 and new_row[0]:
        combined_text = " ".join(line['text'] for line in text_lines_sorted)
        new_row = [combined_text] + [''] * (table.col_count - 1)

    new_data = list(table.data)

    # Update cell info
    new_cells_info = None
    if table.cells_info:
        new_cells_info = list(table.cells_info)
    else:
        new_cells_info = []

    if position == 'subheader':
        if len(new_data) > 0:
            # Insert the subheader as the second row (index 1).
            new_data.insert(1, new_row)
            # Adjust existing cell info row indices (+1 for row >= 1)
            adjusted_cells = []
            for cell in new_cells_info:
                if cell['row'] >= 1:
                    adjusted_cell = dict(cell)
                    adjusted_cell['row'] = cell['row'] + 1
                    adjusted_cells.append(adjusted_cell)
                else:
                    adjusted_cells.append(cell)
            new_cells_info = adjusted_cells
            # Add cell info for new subheader row (each cell has colspan=1)
            for col_idx in range(table.col_count):
                new_cells_info.append({
                    'row': 1,
                    'col': col_idx,
                    'rowspan': 1,
                    'colspan': 1,
                    'bbox': None
                })
        else:
            new_data.append(new_row)
    else:
        new_data.append(new_row)
        # Footer row cell info is handled in generate_html_from_cells

    # Grow the bbox to cover the annotation lines' vertical extent.
    all_y = [line['y0'] for line in text_lines] + [line['y1'] for line in text_lines]
    min_y = min(all_y)
    max_y = max(all_y)

    new_bbox = (
        table.bbox[0],
        min(table.bbox[1], min_y),
        table.bbox[2],
        max(table.bbox[3], max_y)
    )

    return TableInfo(
        page_num=table.page_num,
        table_idx=table.table_idx,
        bbox=new_bbox,
        data=new_data,
        col_count=table.col_count,
        row_count=len(new_data),
        page_height=table.page_height,
        cells_info=new_cells_info if new_cells_info else None,
        annotations=table.annotations,
        detection_strategy=table.detection_strategy,
        confidence=table.confidence
    )
|
|
401
|
+
|
|
402
|
+
# ============================================================================
|
|
403
|
+
# Table Merging
|
|
404
|
+
# ============================================================================
|
|
405
|
+
|
|
406
|
+
def merge_adjacent_tables(tables: List[TableInfo]) -> List[TableInfo]:
    """
    Merge adjacent tables.

    Tables on the same page are scanned top-to-bottom and greedily combined
    while consecutive pairs satisfy the merge criteria.

    Args:
        tables: List of TableInfo

    Returns:
        Merged list of TableInfo
    """
    if not tables:
        return tables

    # Group tables by the page they appear on.
    per_page: Dict[int, List[TableInfo]] = defaultdict(list)
    for tbl in tables:
        per_page[tbl.page_num].append(tbl)

    combined: List[TableInfo] = []

    for pg, group in per_page.items():
        ordered = sorted(group, key=lambda t: t.bbox[1])
        total = len(ordered)

        pos = 0
        while pos < total:
            # Start a new accumulator from the current table.
            acc = ordered[pos]

            # Greedily absorb following tables while they qualify.
            while pos + 1 < total:
                candidate = ordered[pos + 1]
                if not should_merge_tables(acc, candidate):
                    break
                acc = do_merge_tables(acc, candidate)
                pos += 1
                logger.debug(f"[PDF] Merged adjacent tables on page {pg + 1}")

            combined.append(acc)
            pos += 1

    # Restore global document order (page, then vertical position).
    combined.sort(key=lambda t: (t.page_num, t.bbox[1]))
    return combined
|
+
|
|
450
|
+
def should_merge_tables(t1: TableInfo, t2: TableInfo) -> bool:
    """
    Determine whether two tables should be merged.

    Two tables qualify when they sit on the same page, are vertically close,
    overlap horizontally almost completely, and have compatible column counts.

    Args:
        t1: First table
        t2: Second table

    Returns:
        True if should merge, False otherwise
    """
    # Guard: merging only makes sense within a single page.
    if t1.page_num != t2.page_num:
        return False

    # The gap between t1's bottom and t2's top must be small and non-negative.
    vertical_gap = t2.bbox[1] - t1.bbox[3]
    if not (0 <= vertical_gap <= 30):
        return False

    # Horizontal extents must overlap by at least 80% of the wider table.
    left_edge = max(t1.bbox[0], t2.bbox[0])
    right_edge = min(t1.bbox[2], t2.bbox[2])
    shared_width = max(0, right_edge - left_edge)

    width_a = t1.bbox[2] - t1.bbox[0]
    width_b = t2.bbox[2] - t2.bbox[0]
    if shared_width / max(width_a, width_b, 1) < 0.8:
        return False

    # Either the column counts match, or t1 is a one-row header that is
    # narrower than the body table below it.
    same_cols = t1.col_count == t2.col_count
    header_like = t1.row_count == 1 and t1.col_count < t2.col_count
    return same_cols or header_like
|
|
487
|
+
def do_merge_tables(t1: TableInfo, t2: TableInfo) -> TableInfo:
    """
    Perform table merging: stack t2's rows under t1's.

    Improvements:
        - Maintain basic cell info even without cells_info
        - Accurately adjust cell indices after merging

    Args:
        t1: First (upper) table
        t2: Second (lower) table

    Returns:
        Merged TableInfo
    """
    # Union of the two bboxes; t1 supplies the top edge, t2 the bottom.
    merged_bbox = (
        min(t1.bbox[0], t2.bbox[0]),
        t1.bbox[1],
        max(t1.bbox[2], t2.bbox[2]),
        t2.bbox[3]
    )

    merged_col_count = max(t1.col_count, t2.col_count)

    merged_data = []
    merged_cells = []

    # Process t1 data
    t1_row_count = len(t1.data)

    if t1.col_count < merged_col_count and t1.row_count == 1 and t1.data:
        # Handle colspan when the single header row has fewer columns:
        # one header cell is widened to absorb the missing columns.
        extra_cols = merged_col_count - t1.col_count
        header_row = list(t1.data[0])

        new_header = []
        col_position = 0

        for orig_col_idx, value in enumerate(header_row):
            new_header.append(value)

            # NOTE(review): the extra colspan is always attached to the cell
            # at original index 1 — presumably a heuristic for a typical
            # "label | wide value" header layout; verify against real inputs.
            if orig_col_idx == 1 and extra_cols > 0:
                colspan = 1 + extra_cols
                merged_cells.append({
                    'row': 0,
                    'col': col_position,
                    'rowspan': 1,
                    'colspan': colspan,
                    'bbox': None
                })
                # Pad the data row so it reaches merged_col_count columns.
                for _ in range(extra_cols):
                    new_header.append('')
                col_position += colspan
            else:
                merged_cells.append({
                    'row': 0,
                    'col': col_position,
                    'rowspan': 1,
                    'colspan': 1,
                    'bbox': None
                })
                col_position += 1

        merged_data.append(new_header)
    else:
        # Process regular rows: right-pad short rows to the merged width.
        for row_idx, row in enumerate(t1.data):
            if len(row) < merged_col_count:
                adjusted_row = list(row) + [''] * (merged_col_count - len(row))
            else:
                adjusted_row = list(row)
            merged_data.append(adjusted_row)

        # Copy t1 cell info (row indices unchanged — t1 stays on top)
        if t1.cells_info:
            merged_cells.extend(t1.cells_info)

    # Process t2 data
    row_offset = t1_row_count

    for row in t2.data:
        if len(row) < merged_col_count:
            adjusted_row = list(row) + [''] * (merged_col_count - len(row))
        else:
            adjusted_row = list(row)
        merged_data.append(adjusted_row)

    # Copy t2 cell info (with row offset applied)
    if t2.cells_info:
        for cell in t2.cells_info:
            adjusted_cell = dict(cell)
            adjusted_cell['row'] = cell.get('row', 0) + row_offset
            merged_cells.append(adjusted_cell)

    # If cell info is empty, set to None (handled by CellAnalysisEngine)
    final_cells_info = merged_cells if merged_cells else None

    return TableInfo(
        page_num=t1.page_num,
        table_idx=t1.table_idx,
        bbox=merged_bbox,
        data=merged_data,
        col_count=merged_col_count,
        row_count=len(merged_data),
        page_height=t1.page_height,
        cells_info=final_cells_info,
        detection_strategy=t1.detection_strategy,
        confidence=max(t1.confidence, t2.confidence)
    )
|
|
597
|
+
|
|
598
|
+
# ============================================================================
|
|
599
|
+
# Table Continuity Processing
|
|
600
|
+
# ============================================================================
|
|
601
|
+
|
|
602
|
+
def process_table_continuity(all_tables: List[TableInfo]) -> List[TableInfo]:
    """
    Handle table continuity across pages.

    A table is treated as a continuation of the previous one when the
    previous table ends near the bottom of its page, this one begins
    near the top of a later page, and the column counts agree.  For
    continuations, blank first-column cells with real content in the
    second column inherit the category carried over from the previous
    table.

    Args:
        all_tables: List of all TableInfo

    Returns:
        Processed list of TableInfo
    """
    if not all_tables:
        return all_tables

    processed: List[TableInfo] = []
    carried_category = None

    for idx, original in enumerate(all_tables):
        # Work on a copy so callers' tables stay untouched; only `data`
        # is deep-copied because it is the only field mutated below.
        working = TableInfo(
            page_num=original.page_num,
            table_idx=original.table_idx,
            bbox=original.bbox,
            data=copy.deepcopy(original.data),
            col_count=original.col_count,
            row_count=original.row_count,
            page_height=original.page_height,
            cells_info=original.cells_info,
            annotations=original.annotations,
            detection_strategy=original.detection_strategy,
            confidence=original.confidence,
        )

        rows = working.data

        if idx == 0:
            carried_category = extract_last_category(rows)
            processed.append(working)
            continue

        previous = all_tables[idx - 1]

        # Heuristic: previous table reaches into the bottom 30% of its
        # page, this one starts in the top 30% of a later page, and the
        # widths match.
        continues_previous = (
            working.page_num > previous.page_num and
            previous.bbox[3] > previous.page_height * 0.7 and
            working.bbox[1] < working.page_height * 0.3 and
            working.col_count == previous.col_count
        )

        if continues_previous and carried_category:
            for row in rows:
                if len(row) < 2:
                    continue

                head = row[0]
                follower = row[1] if len(row) > 1 else ""
                head_blank = not head or not str(head).strip()

                if head_blank and follower and str(follower).strip():
                    # Blank category cell next to real content: inherit
                    # the category from the previous page.
                    row[0] = carried_category
                elif head and str(head).strip():
                    carried_category = head
        else:
            latest = extract_last_category(rows)
            if latest:
                carried_category = latest

        processed.append(working)

    return processed
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def extract_last_category(table_data: List[List[Optional[str]]]) -> Optional[str]:
    """
    Extract last category from table.

    Scans the first column and returns the stripped text of the last
    row whose leading cell holds non-blank content.

    Args:
        table_data: Table data

    Returns:
        Last category string or None
    """
    if not table_data:
        return None

    # Walk backwards: the first non-blank leading cell we meet is the
    # last category that appears in the table.
    for row in reversed(table_data):
        if row and row[0] and str(row[0]).strip():
            return str(row[0]).strip()

    return None
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
# ============================================================================
|
|
692
|
+
# HTML Conversion
|
|
693
|
+
# ============================================================================
|
|
694
|
+
|
|
695
|
+
def is_single_column_table(table_info: "TableInfo") -> bool:
    """
    Determine whether a table has an n rows x 1 column layout.

    Tables with n rows x 1 column are often not actual tables,
    so converting them to a text list is more appropriate.

    Args:
        table_info: Table information (only its ``data`` attribute is read)

    Returns:
        True if single-column table, False otherwise
    """
    data = table_info.data

    if not data:
        return False

    # Width is the widest row, since extracted rows may be ragged.
    # (The former "if data else 0" guard was dead code: data is already
    # known to be non-empty at this point.)
    max_cols = max(len(row) for row in data)

    return max_cols == 1
|
|
718
|
+
|
|
719
|
+
|
|
720
|
+
def convert_single_column_to_text(table_info: "TableInfo") -> str:
    """
    Convert a single-column table to a text list.

    Data with n rows x 1 column format is semantically more
    appropriate to express as structured text rather than a table.

    Fix: a cell is now treated as empty only when it is None or strips
    to nothing; legitimate falsy values such as 0 or 0.0 are kept
    (the previous truthiness check silently dropped them).

    Args:
        table_info: Table information (only its ``data`` attribute is read)

    Returns:
        String in text list format, one non-empty cell per line
    """
    data = table_info.data

    if not data:
        return ""

    lines = []
    for row in data:
        if not row:
            continue
        cell = row[0]
        # `is not None` rather than truthiness so 0 / 0.0 survive.
        cell_text = str(cell).strip() if cell is not None else ""
        if cell_text:
            lines.append(cell_text)

    return '\n'.join(lines)
|
|
746
|
+
|
|
747
|
+
|
|
748
|
+
def convert_table_to_html(table_info: TableInfo) -> str:
    """
    Converts a table to HTML.

    Improvements:
    1. Prioritize using PyMuPDF cell info
    2. Apply CellAnalysisEngine
    3. Accurate rowspan/colspan handling
    4. Full colspan for annotation rows
    5. Semantic HTML with accessibility considerations

    Args:
        table_info: Table information

    Returns:
        HTML string
    """
    rows = table_info.data

    # Nothing to render at all.
    if not rows:
        return ""

    row_total = len(rows)
    col_total = max(len(r) for r in rows) if rows else 0

    # Degenerate table whose rows are all empty lists.
    if col_total == 0:
        return ""

    # Resolve merged-cell structure via the analysis engine, then render.
    analyzed = CellAnalysisEngine(table_info, None).analyze()

    return generate_html_from_cells(rows, analyzed, row_total, col_total)
|
|
782
|
+
|
|
783
|
+
|
|
784
|
+
def generate_html_from_cells(
    data: List[List[Optional[str]]],
    cells_info: List[Dict],
    num_rows: int,
    num_cols: int
) -> str:
    """
    Improved HTML generation.

    Improvements:
    - Process all cells even with incomplete cell info
    - Render empty cells correctly
    - Enhanced data range validation

    Args:
        data: Table data
        cells_info: Cell information list
        num_rows: Number of rows
        num_cols: Number of columns

    Returns:
        HTML string
    """
    # (row, col) -> (rowspan, colspan), clipped to the table bounds.
    spans_at: Dict[Tuple[int, int], Tuple[int, int]] = {}

    for info in cells_info:
        r0 = info.get('row', 0)
        c0 = info.get('col', 0)

        # Ignore anchors that fall entirely outside the data grid.
        if r0 >= num_rows or c0 >= num_cols:
            continue

        rs = min(max(1, info.get('rowspan', 1)), num_rows - r0)
        cs = min(max(1, info.get('colspan', 1)), num_cols - c0)
        spans_at[(r0, c0)] = (rs, cs)

    # Positions shadowed by a merged cell (everything but its anchor).
    covered: Set[Tuple[int, int]] = set()

    for (r0, c0), (rs, cs) in spans_at.items():
        for rr in range(r0, min(r0 + rs, num_rows)):
            for cc in range(c0, min(c0 + cs, num_cols)):
                if (rr, cc) != (r0, c0):
                    covered.add((rr, cc))

    # Annotation rows (footnote markers etc.) take the full table width.
    for r_idx, row_cells in enumerate(data):
        if not row_cells:
            continue
        leading = str(row_cells[0]).strip() if row_cells[0] else ""

        if leading.startswith(tuple(PDFConfig.ANNOTATION_PATTERNS)):
            spans_at[(r_idx, 0)] = (1, num_cols)
            for c_idx in range(1, num_cols):
                covered.add((r_idx, c_idx))

    # Render the grid.
    out = ["<table>"]

    for r_idx in range(num_rows):
        out.append(" <tr>")
        cells = data[r_idx] if r_idx < len(data) else []
        # First row is treated as header.
        cell_tag = "th" if r_idx == 0 else "td"

        for c_idx in range(num_cols):
            # Shadowed by a merged cell -> no element emitted.
            if (r_idx, c_idx) in covered:
                continue

            raw = cells[c_idx] if c_idx < len(cells) else ""
            text = escape_html(str(raw).strip() if raw else "")

            # Missing span info means a plain 1x1 cell.
            rs, cs = spans_at.get((r_idx, c_idx), (1, 1))
            attr_bits = []
            if rs > 1:
                attr_bits.append(f'rowspan="{rs}"')
            if cs > 1:
                attr_bits.append(f'colspan="{cs}"')
            attr_text = " " + " ".join(attr_bits) if attr_bits else ""

            out.append(f" <{cell_tag}{attr_text}>{text}</{cell_tag}>")

        out.append(" </tr>")

    out.append("</table>")
    return "\n".join(out)
|
|
897
|
+
|