xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,468 @@
1
+ # xgen_doc2chunk/core/functions/table_extractor.py
2
+ """
3
+ Table Extractor - Abstract Interface for Table Extraction
4
+
5
+ Provides abstract base classes and data structures for table extraction.
6
+ Format-specific implementations should be placed in respective helper modules.
7
+
8
+ ================================================================================
9
+ TABLE EXTRACTION ARCHITECTURE
10
+ ================================================================================
11
+
12
+ This module defines the common interface for all format-specific table extractors.
13
+ There are TWO main extraction approaches supported:
14
+
15
+ --------------------------------------------------------------------------------
16
+ APPROACH 1: Batch Processing (Entire Document Processing)
17
+ --------------------------------------------------------------------------------
18
+ Method: extract_tables(content) -> List[TableData]
19
+
20
+ Description:
21
+ - Extracts ALL tables from the entire document at once
22
+ - Uses 2-Pass approach internally:
23
+ Pass 1: detect_table_regions() - Find table locations
24
+ Pass 2: extract_table_from_region() - Extract from each region
25
+
26
+ Use Cases:
27
+ - PDF: Tables detected via layout analysis, extracted in batch
28
+ - Excel: All sheets processed together
29
+ - Scanned documents: OCR-based table detection
30
+
31
+ Implemented By:
32
+ - PDFTableExtractor (planned)
33
+ - ExcelTableExtractor (planned)
34
+
35
+ --------------------------------------------------------------------------------
36
+ APPROACH 2: Streaming/Element Processing (Element-wise Real-time Processing)
37
+ --------------------------------------------------------------------------------
38
+ Method: extract_table(element, context) -> Optional[TableData]
39
+
40
+ Description:
41
+ - Extracts a SINGLE table from an element/node
42
+ - Called in real-time as document is traversed
43
+ - More memory efficient for large documents
44
+ - Preserves document order naturally
45
+
46
+ Use Cases:
47
+ - DOCX: Tables are explicit <w:tbl> elements
48
+ - PPTX: Tables are shape elements in slides
49
+ - HTML: Tables are <table> elements
50
+
51
+ Implemented By:
52
+ - DOCXTableExtractor (xgen_doc2chunk.core.processor.docx_helper)
53
+ - PPTXTableExtractor (planned)
54
+ - HTMLTableExtractor (planned)
55
+
56
+ ================================================================================
57
+ IMPLEMENTATION STATUS BY FORMAT
58
+ ================================================================================
59
+
60
+ | Format | Extractor Class | Approach | Status | Location |
61
+ |--------|---------------------|-----------|-------------|------------------------------|
62
+ | DOCX | DOCXTableExtractor | Streaming | Complete | docx_helper/docx_table_extractor.py |
63
+ | DOC | DOCTableExtractor | Batch | Planned | doc_helpers/ |
64
+ | PDF | PDFTableExtractor | Batch | Planned | pdf_helpers/ |
65
+ | XLSX | ExcelTableExtractor | Batch | Planned | excel_helper/ |
66
+ | PPTX | PPTXTableExtractor | Streaming | Planned | ppt_helper/ |
67
+ | HTML | HTMLTableExtractor | Streaming | Planned | html_helper/ |
68
+ | HWP | HWPTableExtractor | Batch | Planned | hwp_helper/ |
69
+
70
+ ================================================================================
71
+ MODULE COMPONENTS
72
+ ================================================================================
73
+
74
+ - TableCell: Data class for table cell information
75
+ - TableData: Data class for complete table information
76
+ - TableRegion: Data class for detected table regions (Batch approach)
77
+ - TableExtractorConfig: Configuration for extraction behavior
78
+ - BaseTableExtractor: Abstract base class for format-specific extractors
79
+ - NullTableExtractor: No-op extractor for unsupported formats
80
+
81
+ ================================================================================
82
+ USAGE EXAMPLES
83
+ ================================================================================
84
+
85
+ Example 1: Batch Processing (PDF, Excel)
86
+
87
+ from xgen_doc2chunk.core.functions.table_extractor import BaseTableExtractor, TableData, TableRegion
88
+
89
+ class PDFTableExtractor(BaseTableExtractor):
90
+ def detect_table_regions(self, content):
91
+ # Scan PDF for table-like regions
92
+ return [TableRegion(...), ...]
93
+
94
+ def extract_table_from_region(self, content, region):
95
+ # Extract table from specific region
96
+ return TableData(...)
97
+
98
+ # Use inherited extract_tables() for batch processing
99
+
100
+ extractor = PDFTableExtractor()
101
+ tables = extractor.extract_tables(pdf_content) # Returns List[TableData]
102
+
103
+ Example 2: Streaming Processing (DOCX, PPTX)
104
+
105
+ from xgen_doc2chunk.core.functions.table_extractor import BaseTableExtractor, TableData
106
+
107
+ class DOCXTableExtractor(BaseTableExtractor):
108
+ def extract_table(self, element, context=None):
109
+ # Extract single table from <w:tbl> element
110
+ return TableData(...) # or None if invalid
111
+
112
+ extractor = DOCXTableExtractor()
113
+
114
+ # Called during document traversal:
115
+ for elem in doc.body:
116
+ if is_table(elem):
117
+ table = extractor.extract_table(elem, doc) # Returns Optional[TableData]
118
+ if table:
119
+ process(table)
120
+ """
121
+ import logging
122
+ from abc import ABC, abstractmethod
123
+ from dataclasses import dataclass, field
124
+ from typing import Any, Dict, List, Optional
125
+
126
# Module-level logger. The name is the fixed string "document-processor"
# (not __name__), the same name BaseTableExtractor passes to getLogger().
logger = logging.getLogger("document-processor")
127
+
128
+
129
@dataclass
class TableCell:
    """One cell of an extracted table.

    Attributes:
        content: Text content of the cell.
        row_span: Number of rows this cell spans.
        col_span: Number of columns this cell spans.
        is_header: True when the cell is a header cell.
        row_index: Row position of the cell within its table.
        col_index: Column position of the cell within its table.
        nested_table: Table contained inside this cell, or None when the
            cell holds no nested table.
    """

    content: str = ""
    row_span: int = 1
    col_span: int = 1
    is_header: bool = False
    row_index: int = 0
    col_index: int = 0
    # String forward reference: TableData is declared after this class.
    nested_table: Optional['TableData'] = None
149
+
150
+
151
@dataclass
class TableData:
    """Container for one extracted table.

    Attributes:
        rows: 2D list of TableCell objects (one inner list per row).
        num_rows: Number of rows.
        num_cols: Number of columns.
        has_header: True when the table has a header row.
        start_offset: Byte offset where the table starts (for binary formats).
        end_offset: Byte offset where the table ends (for binary formats).
        source_format: Source format identifier (e.g., "doc", "docx", "xlsx").
        metadata: Additional metadata about the table.
        col_widths_percent: Column widths as percentages
            (e.g., [25.0, 50.0, 25.0]).
    """

    rows: List[List[TableCell]] = field(default_factory=list)
    num_rows: int = 0
    num_cols: int = 0
    has_header: bool = False
    start_offset: int = 0
    end_offset: int = 0
    source_format: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)
    col_widths_percent: List[float] = field(default_factory=list)

    def is_valid(self, min_rows: int = 2, min_cols: int = 2) -> bool:
        """Check whether the table meets the minimum size requirements.

        The check reads the stored ``num_rows`` / ``num_cols`` counters; it
        does not inspect ``rows``. Whoever builds the instance must keep the
        counters in sync with the cell data.

        Args:
            min_rows: Minimum number of rows required.
            min_cols: Minimum number of columns required.

        Returns:
            True when both counters reach their minimum, False otherwise.
        """
        return self.num_rows >= min_rows and self.num_cols >= min_cols
179
+
180
+
181
@dataclass
class TableRegion:
    """A detected table region in a document.

    Used by the 2-pass (batch) approach:
      - Pass 1 produces TableRegion objects (where tables are).
      - Pass 2 turns each region into a TableData object.

    Attributes:
        start_offset: Start position in the document.
        end_offset: End position in the document.
        row_count: Estimated number of rows.
        col_count: Estimated number of columns.
        confidence: Confidence score (0.0 - 1.0).
        metadata: Additional metadata (optional).
    """

    start_offset: int = 0
    end_offset: int = 0
    row_count: int = 0
    col_count: int = 0
    confidence: float = 0.0
    # Annotated like TableData.metadata; default_factory gives every
    # instance its own dict instead of a shared one.
    metadata: Dict[str, Any] = field(default_factory=dict)

    def is_confident(self, threshold: float = 0.5) -> bool:
        """Check whether the detection confidence reaches ``threshold``.

        Args:
            threshold: Minimum acceptable confidence (inclusive).

        Returns:
            True when ``confidence >= threshold``.
        """
        return self.confidence >= threshold
207
+
208
+
209
@dataclass
class TableExtractorConfig:
    """Settings that control table extraction.

    Attributes:
        min_rows: Minimum rows to consider as a table.
        min_cols: Minimum columns to consider as a table.
        confidence_threshold: Minimum confidence to accept a table region.
        include_header_row: Whether to mark the first row as header. This
            flag is not read by BaseTableExtractor.extract_tables() in this
            module, which uses only the three fields above.
    """

    min_rows: int = 2
    min_cols: int = 2
    confidence_threshold: float = 0.5
    include_header_row: bool = True
223
+
224
+
225
class BaseTableExtractor(ABC):
    """Common base class for format-specific table extractors.

    Each document format (DOC, DOCX, XLSX, etc.) implements a subclass with
    format-specific logic. The class derives from ``ABC`` but declares no
    abstract methods: every hook has a harmless default (empty list,
    ``None`` or ``False``), so a subclass overrides only what it needs.

    Two extraction approaches are supported.

    Approach 1 - Batch processing (entire document):
        - detect_table_regions(): find all table locations in the document
        - extract_table_from_region(): extract the table at one location
        - extract_tables(): runs both passes (main entry point)

        Suitable for PDF, DOC, Excel, HWP (where tables need detection).
        Subclasses override detect_table_regions() and
        extract_table_from_region() and call extract_tables().

    Approach 2 - Streaming processing (element-wise, real-time):
        - extract_table(): extract a single table from an element/node

        Suitable for DOCX, PPTX, HTML (where tables are explicit elements).
        Subclasses override extract_table() and call it directly during
        document traversal; the batch hooks can keep their defaults.
    """

    def __init__(self, config: Optional[TableExtractorConfig] = None):
        """Initialize the extractor.

        Args:
            config: Table extraction configuration. A new
                TableExtractorConfig with default values is created when
                omitted.
        """
        self.config = config or TableExtractorConfig()
        self.logger = logging.getLogger("document-processor")

    # ==========================================================================
    # APPROACH 1: Batch Processing Methods (PDF, DOC, Excel, HWP)
    # ==========================================================================

    def detect_table_regions(self, content: Any) -> List[TableRegion]:
        """Detect table regions in the document content.

        [BATCH PROCESSING - Pass 1]
        Scan the document to find potential table locations.

        Override this method for formats that require table detection:
            - PDF: Layout analysis to find table-like structures
            - DOC: Binary format parsing for table markers
            - Excel: Sheet enumeration

        Args:
            content: Document content (bytes, str, or format-specific object)

        Returns:
            List of TableRegion objects representing detected table locations.
            The default implementation returns an empty list.

        Note:
            For streaming formats (DOCX, PPTX) the default can be kept, as
            tables are processed via extract_table() instead.
        """
        return []

    def extract_table_from_region(
        self,
        content: Any,
        region: TableRegion
    ) -> Optional[TableData]:
        """Extract table data from a detected region.

        [BATCH PROCESSING - Pass 2]
        Extract the actual table content from a specific region.

        Override this method for formats that use region-based extraction:
            - PDF: Extract from page coordinates
            - DOC: Extract from byte offsets
            - Excel: Extract from sheet/cell ranges

        Args:
            content: Document content (bytes, str, or format-specific object)
            region: TableRegion identifying where the table is

        Returns:
            TableData object, or None if extraction fails. The default
            implementation returns None.

        Note:
            For streaming formats (DOCX, PPTX) the default can be kept, as
            tables are processed via extract_table() instead.
        """
        return None

    def extract_tables(self, content: Any) -> List[TableData]:
        """Extract all tables from document content using batch processing.

        [BATCH PROCESSING - Main Entry Point]
        Combines both passes:
            1. Detect all table regions.
            2. Extract a table from each region whose confidence reaches
               ``config.confidence_threshold`` and keep it when it satisfies
               ``config.min_rows`` / ``config.min_cols``.

        Used by: PDF, DOC, Excel, HWP extractors

        Args:
            content: Document content

        Returns:
            List of valid TableData objects, in region order.
        """
        # Pass 1: detect regions. Materialize into a list so that an
        # override returning None or a generator does not break len() below.
        regions = list(self.detect_table_regions(content) or ())
        self.logger.debug("Detected %d table regions", len(regions))

        # Pass 2: extract from each sufficiently confident region.
        tables: List[TableData] = []
        for region in regions:
            if not region.is_confident(self.config.confidence_threshold):
                continue
            table = self.extract_table_from_region(content, region)
            if table and table.is_valid(self.config.min_rows, self.config.min_cols):
                tables.append(table)

        self.logger.debug("Extracted %d valid tables", len(tables))
        return tables

    # ==========================================================================
    # APPROACH 2: Streaming Processing Methods (DOCX, PPTX, HTML)
    # ==========================================================================

    def extract_table(
        self,
        element: Any,
        context: Any = None
    ) -> Optional[TableData]:
        """Extract a single table from an element/node.

        [STREAMING PROCESSING - Main Entry Point]
        Extract table data from a specific element during document traversal.
        Called in real-time as the document is being processed.

        Override this method for formats with explicit table elements:
            - DOCX: <w:tbl> XML element -> TableData
            - PPTX: Table shape element -> TableData
            - HTML: <table> DOM element -> TableData

        Used by:
            - DOCXTableExtractor: Extracts from <w:tbl> elements
            - PPTXTableExtractor: Extracts from slide table shapes (planned)
            - HTMLTableExtractor: Extracts from <table> elements (planned)

        Args:
            element: Table element/node (format-specific)
                - DOCX: lxml Element (<w:tbl>)
                - PPTX: Shape object
                - HTML: DOM Element
            context: Optional context object for additional information
                - DOCX: Document object
                - PPTX: Slide object
                - HTML: Parent document

        Returns:
            TableData object, or None if extraction fails or the table is
            invalid. The default implementation returns None.

        Example (DOCX):
            for elem in doc.body:
                if elem.tag.endswith('tbl'):
                    table_data = extractor.extract_table(elem, doc)
                    if table_data:
                        html = processor.format_table_as_html(table_data)
        """
        return None

    # ==========================================================================
    # Common Methods
    # ==========================================================================

    def supports_format(self, format_type: str) -> bool:
        """Check if this extractor supports the given format.

        Args:
            format_type: Format identifier (e.g., "doc", "docx")

        Returns:
            True if the format is supported. The default implementation
            returns False for every format.
        """
        return False
432
+
433
+
434
class NullTableExtractor(BaseTableExtractor):
    """No-op table extractor for unsupported formats.

    Every operation returns an empty result. Used as a fallback when no
    format-specific extractor is available.
    """

    def detect_table_regions(self, content: Any) -> List[TableRegion]:
        """Return an empty list (no table detection)."""
        return []

    def extract_table_from_region(
        self,
        content: Any,
        region: TableRegion
    ) -> Optional[TableData]:
        """Return None (no table extraction)."""
        return None

    def extract_tables(self, content: Any) -> List[TableData]:
        """Return an empty list without running either batch pass."""
        return []

    def extract_table(
        self,
        element: Any,
        context: Any = None
    ) -> Optional[TableData]:
        """Return None (no table extraction)."""
        return None
464
+
465
+
466
# Default configuration: a TableExtractorConfig built with its field defaults.
# NOTE: this is one shared, mutable instance. BaseTableExtractor.__init__ does
# not use it; it creates its own TableExtractorConfig() when none is passed.
DEFAULT_EXTRACTOR_CONFIG = TableExtractorConfig()
468
+