xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/table_extractor.py
|
|
2
|
+
"""
|
|
3
|
+
Table Extractor - Abstract Interface for Table Extraction
|
|
4
|
+
|
|
5
|
+
Provides abstract base classes and data structures for table extraction.
|
|
6
|
+
Format-specific implementations should be placed in respective helper modules.
|
|
7
|
+
|
|
8
|
+
================================================================================
|
|
9
|
+
TABLE EXTRACTION ARCHITECTURE
|
|
10
|
+
================================================================================
|
|
11
|
+
|
|
12
|
+
This module defines the common interface for all format-specific table extractors.
|
|
13
|
+
There are TWO main extraction approaches supported:
|
|
14
|
+
|
|
15
|
+
--------------------------------------------------------------------------------
|
|
16
|
+
APPROACH 1: Batch Processing (Entire Document Processing)
|
|
17
|
+
--------------------------------------------------------------------------------
|
|
18
|
+
Method: extract_tables(content) -> List[TableData]
|
|
19
|
+
|
|
20
|
+
Description:
|
|
21
|
+
- Extracts ALL tables from the entire document at once
|
|
22
|
+
- Uses 2-Pass approach internally:
|
|
23
|
+
Pass 1: detect_table_regions() - Find table locations
|
|
24
|
+
Pass 2: extract_table_from_region() - Extract from each region
|
|
25
|
+
|
|
26
|
+
Use Cases:
|
|
27
|
+
- PDF: Tables detected via layout analysis, extracted in batch
|
|
28
|
+
- Excel: All sheets processed together
|
|
29
|
+
- Scanned documents: OCR-based table detection
|
|
30
|
+
|
|
31
|
+
Implemented By:
|
|
32
|
+
- PDFTableExtractor (planned)
|
|
33
|
+
- ExcelTableExtractor (planned)
|
|
34
|
+
|
|
35
|
+
--------------------------------------------------------------------------------
|
|
36
|
+
APPROACH 2: Streaming/Element Processing (Element-wise Real-time Processing)
|
|
37
|
+
--------------------------------------------------------------------------------
|
|
38
|
+
Method: extract_table(element, context) -> Optional[TableData]
|
|
39
|
+
|
|
40
|
+
Description:
|
|
41
|
+
- Extracts a SINGLE table from an element/node
|
|
42
|
+
- Called in real-time as document is traversed
|
|
43
|
+
- More memory efficient for large documents
|
|
44
|
+
- Preserves document order naturally
|
|
45
|
+
|
|
46
|
+
Use Cases:
|
|
47
|
+
- DOCX: Tables are explicit <w:tbl> elements
|
|
48
|
+
- PPTX: Tables are shape elements in slides
|
|
49
|
+
- HTML: Tables are <table> elements
|
|
50
|
+
|
|
51
|
+
Implemented By:
|
|
52
|
+
- DOCXTableExtractor (xgen_doc2chunk.core.processor.docx_helper)
|
|
53
|
+
- PPTXTableExtractor (planned)
|
|
54
|
+
- HTMLTableExtractor (planned)
|
|
55
|
+
|
|
56
|
+
================================================================================
|
|
57
|
+
IMPLEMENTATION STATUS BY FORMAT
|
|
58
|
+
================================================================================
|
|
59
|
+
|
|
60
|
+
| Format | Extractor Class | Approach | Status | Location |
|
|
61
|
+
|--------|---------------------|-----------|-------------|------------------------------|
|
|
62
|
+
| DOCX | DOCXTableExtractor | Streaming | Complete | docx_helper/docx_table_extractor.py |
|
|
63
|
+
| DOC | DOCTableExtractor | Batch | Planned | doc_helpers/ |
|
|
64
|
+
| PDF | PDFTableExtractor | Batch | Planned | pdf_helpers/ |
|
|
65
|
+
| XLSX | ExcelTableExtractor | Batch | Planned | excel_helper/ |
|
|
66
|
+
| PPTX | PPTXTableExtractor | Streaming | Planned | ppt_helper/ |
|
|
67
|
+
| HTML | HTMLTableExtractor | Streaming | Planned | html_helper/ |
|
|
68
|
+
| HWP | HWPTableExtractor | Batch | Planned | hwp_helper/ |
|
|
69
|
+
|
|
70
|
+
================================================================================
|
|
71
|
+
MODULE COMPONENTS
|
|
72
|
+
================================================================================
|
|
73
|
+
|
|
74
|
+
- TableCell: Data class for table cell information
|
|
75
|
+
- TableData: Data class for complete table information
|
|
76
|
+
- TableRegion: Data class for detected table regions (Batch approach)
|
|
77
|
+
- TableExtractorConfig: Configuration for extraction behavior
|
|
78
|
+
- BaseTableExtractor: Abstract base class for format-specific extractors
|
|
79
|
+
- NullTableExtractor: No-op extractor for unsupported formats
|
|
80
|
+
|
|
81
|
+
================================================================================
|
|
82
|
+
USAGE EXAMPLES
|
|
83
|
+
================================================================================
|
|
84
|
+
|
|
85
|
+
Example 1: Batch Processing (PDF, Excel)
|
|
86
|
+
|
|
87
|
+
from xgen_doc2chunk.core.functions.table_extractor import BaseTableExtractor, TableData, TableRegion
|
|
88
|
+
|
|
89
|
+
class PDFTableExtractor(BaseTableExtractor):
|
|
90
|
+
def detect_table_regions(self, content):
|
|
91
|
+
# Scan PDF for table-like regions
|
|
92
|
+
return [TableRegion(...), ...]
|
|
93
|
+
|
|
94
|
+
def extract_table_from_region(self, content, region):
|
|
95
|
+
# Extract table from specific region
|
|
96
|
+
return TableData(...)
|
|
97
|
+
|
|
98
|
+
# Use inherited extract_tables() for batch processing
|
|
99
|
+
|
|
100
|
+
extractor = PDFTableExtractor()
|
|
101
|
+
tables = extractor.extract_tables(pdf_content) # Returns List[TableData]
|
|
102
|
+
|
|
103
|
+
Example 2: Streaming Processing (DOCX, PPTX)
|
|
104
|
+
|
|
105
|
+
from xgen_doc2chunk.core.functions.table_extractor import BaseTableExtractor, TableData
|
|
106
|
+
|
|
107
|
+
class DOCXTableExtractor(BaseTableExtractor):
|
|
108
|
+
def extract_table(self, element, context=None):
|
|
109
|
+
# Extract single table from <w:tbl> element
|
|
110
|
+
return TableData(...) # or None if invalid
|
|
111
|
+
|
|
112
|
+
extractor = DOCXTableExtractor()
|
|
113
|
+
|
|
114
|
+
# Called during document traversal:
|
|
115
|
+
for elem in doc.body:
|
|
116
|
+
if is_table(elem):
|
|
117
|
+
table = extractor.extract_table(elem, doc) # Returns Optional[TableData]
|
|
118
|
+
if table:
|
|
119
|
+
process(table)
|
|
120
|
+
"""
|
|
121
|
+
import logging
|
|
122
|
+
from abc import ABC, abstractmethod
|
|
123
|
+
from dataclasses import dataclass, field
|
|
124
|
+
from typing import Any, Dict, List, Optional
|
|
125
|
+
|
|
126
|
+
# Module-level logger. It is looked up by the fixed name "document-processor"
# (not __name__), the same name BaseTableExtractor.__init__ requests.
logger = logging.getLogger("document-processor")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@dataclass
class TableCell:
    """One cell of an extracted table.

    Every field has a default, so ``TableCell()`` yields an empty,
    non-header cell at position (0, 0) that spans a single row and column.

    Attributes:
        content: Text held by the cell.
        row_span: Number of rows the cell spans (defaults to 1).
        col_span: Number of columns the cell spans (defaults to 1).
        is_header: True when the cell is a header cell.
        row_index: Row position of the cell in its table.
        col_index: Column position of the cell in its table.
        nested_table: TableData of a table contained in this cell, or None
            when the cell holds no nested table.
    """

    # Cell payload.
    content: str = ""

    # Merge extents.
    row_span: int = 1
    col_span: int = 1

    # Header flag and position inside the owning table.
    is_header: bool = False
    row_index: int = 0
    col_index: int = 0

    # Forward reference: TableData is declared after this class.
    nested_table: Optional['TableData'] = None
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
@dataclass
class TableData:
    """Container for one extracted table.

    Attributes:
        rows: Table body as a list of rows, each row a list of TableCell.
        num_rows: Number of rows. Stored independently of ``rows``; this
            class does not derive it.
        num_cols: Number of columns. Stored independently of ``rows``; this
            class does not derive it.
        has_header: True when the table has a header row.
        start_offset: Byte offset where the table starts (for binary formats).
        end_offset: Byte offset where the table ends (for binary formats).
        source_format: Identifier of the source format, e.g. "doc", "docx",
            "xlsx".
        metadata: Free-form extra information about the table.
        col_widths_percent: Column widths as percentages,
            e.g. [25.0, 50.0, 25.0].
    """

    # Cell grid; a fresh list per instance.
    rows: List[List[TableCell]] = field(default_factory=list)

    # Declared dimensions (what is_valid() checks).
    num_rows: int = 0
    num_cols: int = 0

    has_header: bool = False

    # Location of the table in the source document.
    start_offset: int = 0
    end_offset: int = 0

    source_format: str = ""

    # Per-instance mutable containers.
    metadata: Dict[str, Any] = field(default_factory=dict)
    col_widths_percent: List[float] = field(default_factory=list)

    def is_valid(self, min_rows: int = 2, min_cols: int = 2) -> bool:
        """Tell whether the declared dimensions reach the given minimums.

        Only ``num_rows`` and ``num_cols`` are consulted; ``rows`` itself is
        not inspected.

        Args:
            min_rows: Smallest acceptable ``num_rows``.
            min_cols: Smallest acceptable ``num_cols``.

        Returns:
            True when both dimensions are at least the requested minimum.
        """
        if not self.num_rows >= min_rows:
            return False
        return self.num_cols >= min_cols
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass
class TableRegion:
    """A candidate table location found in a document.

    Used by the 2-pass (batch) extraction flow:
        - Pass 1 produces TableRegion objects (detection).
        - Pass 2 turns each region into a TableData object (extraction).

    Attributes:
        start_offset: Start position in the document.
        end_offset: End position in the document.
        row_count: Estimated number of rows.
        col_count: Estimated number of columns.
        confidence: Confidence score of the detection (0.0 - 1.0).
        metadata: Optional free-form extra information about the region.
    """

    start_offset: int = 0
    end_offset: int = 0
    row_count: int = 0
    col_count: int = 0
    confidence: float = 0.0
    # Annotated like TableData.metadata so both metadata fields share one
    # declared type; each instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def is_confident(self, threshold: float = 0.5) -> bool:
        """Tell whether the detection confidence reaches ``threshold``.

        Args:
            threshold: Minimum acceptable confidence (inclusive).

        Returns:
            True when ``confidence`` is greater than or equal to
            ``threshold``.
        """
        return self.confidence >= threshold
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@dataclass
class TableExtractorConfig:
    """Settings that control table extraction.

    Attributes:
        min_rows: Minimum number of rows for something to count as a table.
        min_cols: Minimum number of columns for something to count as a table.
        confidence_threshold: Minimum confidence for a detected table region
            to be accepted.
        include_header_row: Whether the first row should be marked as a
            header.
    """

    # Size limits for accepting a table.
    min_rows: int = 2
    min_cols: int = 2

    # Acceptance limit for detected regions.
    confidence_threshold: float = 0.5

    # Header handling.
    include_header_row: bool = True
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
class BaseTableExtractor(ABC):
    """Base class for format-specific table extractors.

    Each document format (DOC, DOCX, XLSX, etc.) should implement a subclass
    of BaseTableExtractor with format-specific logic.

    The class derives from ABC but declares no abstract methods: every hook
    has a no-op default, so a subclass only overrides the hooks that belong
    to the approach it uses.

    ============================================================================
    SUPPORTED EXTRACTION APPROACHES
    ============================================================================

    APPROACH 1: Batch Processing (Entire Document)
    ------------------------------------------------
    Uses 2-pass detection and extraction:
        - detect_table_regions(): Find all table locations in the document
        - extract_table_from_region(): Extract the table at one location
        - extract_tables(): Combines both passes (main entry point)

    Suitable for: PDF, DOC, Excel, HWP (where tables need detection)

    APPROACH 2: Streaming Processing (Element-wise Real-time)
    -------------------------------------------------------
    Uses direct element extraction:
        - extract_table(): Extract a single table from an element/node

    Suitable for: DOCX, PPTX, HTML (where tables are explicit elements)

    ============================================================================
    IMPLEMENTATION GUIDE
    ============================================================================

    For Batch Processing (PDF, Excel, etc.):
        - Override detect_table_regions() - REQUIRED
        - Override extract_table_from_region() - REQUIRED
        - Use extract_tables() as main entry point

    For Streaming Processing (DOCX, PPTX, etc.):
        - Override extract_table() - REQUIRED
        - detect_table_regions() can return empty list
        - extract_table_from_region() can return None
        - Call extract_table() directly during document traversal

    ============================================================================
    """

    def __init__(self, config: Optional[TableExtractorConfig] = None):
        """Initialize the extractor.

        Args:
            config: Table extraction configuration. When omitted, a new
                TableExtractorConfig with default values is created.
        """
        self.config = config or TableExtractorConfig()
        self.logger = logging.getLogger("document-processor")

    # ==========================================================================
    # APPROACH 1: Batch Processing Methods (PDF, DOC, Excel, HWP)
    # ==========================================================================

    def detect_table_regions(self, content: Any) -> List[TableRegion]:
        """Detect table regions in the document content.

        [BATCH PROCESSING - Pass 1]
        Scan the document to find potential table locations.

        Override this method for formats that require table detection:
            - PDF: Layout analysis to find table-like structures
            - DOC: Binary format parsing for table markers
            - Excel: Sheet enumeration

        Args:
            content: Document content (bytes, str, or format-specific object)

        Returns:
            List of TableRegion objects representing detected table
            locations. The base implementation always returns an empty list.

        Note:
            For streaming formats (DOCX, PPTX), this can return an empty list
            as tables are processed via extract_table() instead.
        """
        # No detection by default; batch-style subclasses override this.
        return []

    def extract_table_from_region(
        self,
        content: Any,
        region: TableRegion
    ) -> Optional[TableData]:
        """Extract table data from a detected region.

        [BATCH PROCESSING - Pass 2]
        Extract actual table content from a specific region.

        Override this method for formats that use region-based extraction:
            - PDF: Extract from page coordinates
            - DOC: Extract from byte offsets
            - Excel: Extract from sheet/cell ranges

        Args:
            content: Document content (bytes, str, or format-specific object)
            region: TableRegion identifying where the table is

        Returns:
            TableData object or None if extraction fails. The base
            implementation always returns None.

        Note:
            For streaming formats (DOCX, PPTX), this can return None
            as tables are processed via extract_table() instead.
        """
        # No extraction by default; batch-style subclasses override this.
        return None

    def extract_tables(self, content: Any) -> List[TableData]:
        """Extract all tables from document content using batch processing.

        [BATCH PROCESSING - Main Entry Point]
        Combines both passes for complete extraction:
            1. Detect all table regions
            2. Extract a table from each sufficiently confident region

        A region is skipped when its confidence is below
        ``config.confidence_threshold``. An extracted table is dropped when
        it is falsy (e.g. None) or fails
        ``is_valid(config.min_rows, config.min_cols)``.

        Used by: PDF, DOC, Excel, HWP extractors

        Args:
            content: Document content

        Returns:
            List of TableData objects, in the order their regions were
            returned by detect_table_regions().
        """
        # Pass 1: Detect regions
        regions = self.detect_table_regions(content)
        # Lazy %-style arguments: the message is only formatted when DEBUG
        # is actually enabled for this logger.
        self.logger.debug("Detected %d table regions", len(regions))

        # Pass 2: Extract from each region
        tables: List[TableData] = []
        for region in regions:
            if not region.is_confident(self.config.confidence_threshold):
                continue
            table = self.extract_table_from_region(content, region)
            if table and table.is_valid(self.config.min_rows, self.config.min_cols):
                tables.append(table)

        self.logger.debug("Extracted %d valid tables", len(tables))
        return tables

    # ==========================================================================
    # APPROACH 2: Streaming Processing Methods (DOCX, PPTX, HTML)
    # ==========================================================================

    def extract_table(
        self,
        element: Any,
        context: Any = None
    ) -> Optional[TableData]:
        """Extract a single table from an element/node.

        [STREAMING PROCESSING - Main Entry Point]
        Extract table data from a specific element during document traversal.
        Called in real-time as the document is being processed.

        Override this method for formats with explicit table elements:
            - DOCX: <w:tbl> XML element -> TableData
            - PPTX: Table shape element -> TableData
            - HTML: <table> DOM element -> TableData

        Used by:
            - DOCXTableExtractor: Extracts from <w:tbl> elements
            - PPTXTableExtractor: Extracts from slide table shapes (planned)
            - HTMLTableExtractor: Extracts from <table> elements (planned)

        Args:
            element: Table element/node (format-specific)
                - DOCX: lxml Element (<w:tbl>)
                - PPTX: Shape object
                - HTML: DOM Element
            context: Optional context object for additional information
                - DOCX: Document object
                - PPTX: Slide object
                - HTML: Parent document

        Returns:
            TableData object or None if extraction fails/invalid. The base
            implementation always returns None.

        Example (DOCX):
            for elem in doc.body:
                if elem.tag.endswith('tbl'):
                    table_data = extractor.extract_table(elem, doc)
                    if table_data:
                        html = processor.format_table_as_html(table_data)
        """
        # No extraction by default; streaming-style subclasses override this.
        return None

    # ==========================================================================
    # Common Methods
    # ==========================================================================

    def supports_format(self, format_type: str) -> bool:
        """Check if this extractor supports the given format.

        Args:
            format_type: Format identifier (e.g., "doc", "docx")

        Returns:
            True if the format is supported. The base implementation
            supports no format and always returns False.
        """
        return False
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
class NullTableExtractor(BaseTableExtractor):
    """Fallback extractor that never finds or extracts a table.

    Intended for formats that have no format-specific extractor: every
    operation yields an empty result ([] or None) regardless of its input.
    """

    def detect_table_regions(self, content: Any) -> List[TableRegion]:
        """Report no table regions, whatever ``content`` is."""
        return []

    def extract_table_from_region(self, content: Any, region: TableRegion) -> Optional[TableData]:
        """Extract nothing from ``region``; the result is always None."""
        return None

    def extract_tables(self, content: Any) -> List[TableData]:
        """Report no tables; skips the inherited two-pass flow entirely."""
        return []

    def extract_table(self, element: Any, context: Any = None) -> Optional[TableData]:
        """Extract nothing from ``element``; the result is always None."""
        return None
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
# Default configuration
|
|
467
|
+
# Shared module-level TableExtractorConfig built from the dataclass defaults
# (min_rows=2, min_cols=2, confidence_threshold=0.5, include_header_row=True).
# TableExtractorConfig is not frozen, so mutating this instance is visible to
# every holder of it.
DEFAULT_EXTRACTOR_CONFIG = TableExtractorConfig()
|
|
468
|
+
|