xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,527 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py
2
+ """
3
+ DOCX Table Extractor
4
+
5
+ Extracts tables from DOCX documents using the BaseTableExtractor interface.
6
+ Converts DOCX table elements to TableData objects for further processing.
7
+
8
+ ================================================================================
9
+ EXTRACTION APPROACH: Streaming Processing (per-element, real-time processing) - APPROACH 2
10
+ ================================================================================
11
+
12
+ DOCX uses the Streaming Processing approach because:
13
+ - Tables are explicit <w:tbl> XML elements
14
+ - Tables can be processed one-by-one during document traversal
15
+ - Preserves natural document order
16
+ - Memory efficient for large documents
17
+
18
+ External Interface: extract_table(element, context) -> Optional[TableData]
19
+ - Called from docx_handler.py during body element traversal
20
+ - Each <w:tbl> element is passed to extract_table()
21
+ - Returns TableData or None
22
+ - ALL internal processing is encapsulated within this single method
23
+
24
+ ================================================================================
25
+ APPROACH 2 Pure Implementation:
26
+ ================================================================================
27
+ Per table_extractor.py structure, APPROACH 2 exposes ONLY extract_table().
28
+ All sub-functions are private and called only from within extract_table().
29
+
30
+ External (Public):
31
+ extract_table(element, context) → Optional[TableData]
32
+
33
+ Internal (Private) - All called from extract_table():
34
+ _estimate_column_count() - Grid column count calculation
35
+ _calculate_column_widths() - Column width percentages
36
+ _calculate_all_rowspans() - vMerge rowspan calculation
37
+ _extract_cell_text() - Cell content extraction
38
+
39
+ ================================================================================
40
+ Key Features:
41
+ - Full support for rowspan/colspan (vMerge/gridSpan)
42
+ - Column width calculation
43
+ - Header row detection
44
+ - Nested table support: not implemented yet (TODO); text inside a nested table is flattened into the parent cell's text
45
+
46
+ OOXML Table Structure:
47
+ - w:tblGrid: Table grid column definitions
48
+ - w:tr: Table row
49
+ - w:tc: Table cell
50
+ - w:tcPr/w:gridSpan: colspan (horizontal merge)
51
+ - w:tcPr/w:vMerge val="restart": rowspan start
52
+ - w:tcPr/w:vMerge (no val): rowspan continue (merged cell)
53
+ """
54
+ import logging
55
+ import traceback
56
+ from typing import Any, Dict, List, Optional, Tuple
57
+
58
+ from docx import Document
59
+ from docx.oxml.ns import qn
60
+
61
+ from xgen_doc2chunk.core.functions.table_extractor import (
62
+ BaseTableExtractor,
63
+ TableCell,
64
+ TableData,
65
+ TableExtractorConfig,
66
+ )
67
+ from xgen_doc2chunk.core.processor.docx_helper.docx_constants import NAMESPACES
68
+
69
+ logger = logging.getLogger("document-processor")
70
+
71
+
72
+ class DOCXTableExtractor(BaseTableExtractor):
73
+ """
74
+ DOCX-specific table extractor implementation.
75
+
76
+ Uses STREAMING PROCESSING approach (APPROACH 2 - per-element, real-time processing).
77
+
78
+ Extracts tables from DOCX documents and converts them to TableData objects.
79
+ Supports complex table structures including merged cells (rowspan/colspan).
80
+
81
+ ============================================================================
82
+ External Interface (Public):
83
+ ============================================================================
84
+ extract_table(element, context) -> Optional[TableData]
85
+
86
+ This is the ONLY public method for table extraction.
87
+ All other methods are private and called internally from extract_table().
88
+
89
+ ============================================================================
90
+ Usage:
91
+ ============================================================================
92
+ extractor = DOCXTableExtractor()
93
+
94
+ # Streaming approach (APPROACH 2):
95
+ for elem in doc.element.body:
96
+ if elem.tag.endswith('tbl'):
97
+ table_data = extractor.extract_table(elem, doc)
98
+ if table_data:
99
+ process(table_data)
100
+ """
101
+
102
def __init__(self, config: Optional[TableExtractorConfig] = None):
    """Set up the extractor by delegating to the base-class initializer.

    Args:
        config: Table extraction configuration. It is forwarded unchanged
            (including ``None``) to the parent initializer.
    """
    # Explicit two-argument form of super(); resolves to the same parent
    # initializer as the zero-argument form inside this class.
    super(DOCXTableExtractor, self).__init__(config)
109
+
110
def supports_format(self, format_type: str) -> bool:
    """Report whether this extractor handles the given format.

    The identifier is lowercased before comparison, so the check is
    case-insensitive. No other normalization (trimming, dot removal)
    is applied.

    Args:
        format_type: Format identifier

    Returns:
        True if the lowercased identifier is exactly 'docx', else False
    """
    normalized = format_type.lower()
    return normalized == 'docx'
120
+
121
+ # ==========================================================================
122
+ # STREAMING PROCESSING - APPROACH 2 (per-element, real-time processing)
123
+ # ==========================================================================
124
+ #
125
+ # DOCX uses APPROACH 2, so extract_table() is the only method exposed externally.
126
+ # All helper functions are called only from within extract_table().
127
+ #
128
+ # ==========================================================================
129
+
130
def extract_table(
    self,
    element: Any,
    context: Any = None
) -> Optional[TableData]:
    """Extract a single table from a <w:tbl> XML element.

    This is the streaming (APPROACH 2) entry point: it is handed one
    <w:tbl> element at a time and performs column counting, column width
    calculation, rowspan resolution and cell text extraction through the
    private helpers of this class.

    Vertically merged "continue" cells are omitted from the output only
    when a cell emitted in an earlier row actually spans over them. A
    continue cell that no row span covers (for example one with no
    'restart' cell above it) is emitted as an ordinary cell with
    ``row_span=1`` so that its text is not lost.

    Args:
        element: <w:tbl> XML element (lxml Element)
        context: Document object for additional context (optional).
            It is not read by this implementation.

    Returns:
        TableData object, or None if the element has no <w:tr> rows or
        if any exception is raised during extraction (the error is
        logged, not propagated).

    Example:
        for elem in doc.element.body:
            if etree.QName(elem).localname == 'tbl':
                table_data = extractor.extract_table(elem, doc)
                if table_data:
                    html = processor.format_table_as_html(table_data)
    """
    try:
        # ------------------------------------------------------------
        # Step 1: Validate input and get row elements
        # ------------------------------------------------------------
        rows_elem = element.findall('w:tr', NAMESPACES)
        if not rows_elem:
            return None

        num_rows = len(rows_elem)

        # ------------------------------------------------------------
        # Step 2: Calculate column count and widths
        # ------------------------------------------------------------
        num_cols = self._estimate_column_count(element, rows_elem)
        col_widths = self._calculate_column_widths(element, num_cols)

        # ------------------------------------------------------------
        # Step 3: Calculate all rowspans and cell positions
        # ------------------------------------------------------------
        rowspan_map, cell_grid_col = self._calculate_all_rowspans(
            element, rows_elem, num_rows
        )

        # ------------------------------------------------------------
        # Step 4: Build TableCell grid
        # ------------------------------------------------------------
        table_rows: List[List[TableCell]] = []

        # Grid slots (row, col) that are occupied by a cell emitted in an
        # earlier row with row_span > 1. Only continue cells that fall
        # into one of these slots are genuinely merged away.
        covered = set()

        for row_idx, row in enumerate(rows_elem):
            cells_elem = row.findall('w:tc', NAMESPACES)
            row_cells: List[TableCell] = []

            for cell_idx, cell in enumerate(cells_elem):
                # Get cell properties
                tcPr = cell.find('w:tcPr', NAMESPACES)
                colspan = 1
                is_vmerge_continue = False

                if tcPr is not None:
                    # Get colspan (gridSpan)
                    gs = tcPr.find('w:gridSpan', NAMESPACES)
                    if gs is not None:
                        try:
                            colspan = int(gs.get(qn('w:val'), 1))
                        except (ValueError, TypeError):
                            colspan = 1

                    # A vMerge element whose val is anything other than
                    # 'restart' (including a missing val) marks a
                    # continuation of a vertical merge.
                    vMerge = tcPr.find('w:vMerge', NAMESPACES)
                    if vMerge is not None:
                        val = vMerge.get(qn('w:val'))
                        if val != 'restart':
                            is_vmerge_continue = True

                # Get grid column position; fall back to the cell's
                # ordinal when no position was computed for it.
                if cell_idx < len(cell_grid_col[row_idx]):
                    start_col = cell_grid_col[row_idx][cell_idx][0]
                else:
                    start_col = cell_idx

                # Skip continue cells only when a span from above really
                # covers them; orphaned continue cells keep their text.
                if is_vmerge_continue and (row_idx, start_col) in covered:
                    continue

                # Get rowspan from pre-calculated map (continue cells have
                # no entry and therefore default to 1)
                rowspan = rowspan_map.get((row_idx, start_col), 1)

                # Remember which slots in the following rows this cell
                # occupies, so their continue cells can be dropped.
                for below in range(row_idx + 1, row_idx + rowspan):
                    for col in range(start_col, start_col + colspan):
                        covered.add((below, col))

                # Extract cell content
                content = self._extract_cell_text(cell)

                # Create TableCell
                table_cell = TableCell(
                    content=content,
                    row_span=rowspan,
                    col_span=colspan,
                    is_header=(row_idx == 0 and self.config.include_header_row),
                    row_index=row_idx,
                    col_index=start_col,
                    nested_table=None  # TODO: Handle nested tables if needed
                )
                row_cells.append(table_cell)

            # Rows whose cells were all merged away contribute nothing.
            if row_cells:
                table_rows.append(row_cells)

        # ------------------------------------------------------------
        # Step 5: Create and return TableData
        # ------------------------------------------------------------
        actual_rows = len(table_rows)
        actual_cols = num_cols

        table_data = TableData(
            rows=table_rows,
            num_rows=actual_rows,
            num_cols=actual_cols,
            has_header=self.config.include_header_row and actual_rows > 0,
            start_offset=0,
            end_offset=0,
            source_format='docx',
            metadata={},
            col_widths_percent=col_widths
        )

        return table_data

    except Exception as e:
        # Best-effort extraction: a failure on one table is logged and
        # reported as None rather than aborting the caller's traversal.
        self.logger.error(f"Error extracting table from element: {e}")
        self.logger.debug(traceback.format_exc())
        return None
270
+
271
+ # ==========================================================================
272
+ # Private Helper Methods (Called internally from extract_table)
273
+ # ==========================================================================
274
+
275
def _estimate_column_count(
    self,
    table_elem: Any,
    rows: List[Any]
) -> int:
    """Estimate the number of grid columns in the table.

    The count is taken from <w:tblGrid> when it declares at least one
    <w:gridCol>. Otherwise it is derived from the rows: each row's width
    is the sum of its cells' gridSpan values (1 when absent or
    unparsable), and the widest row wins. Using every row instead of
    only the first avoids under-counting when the first row is shorter
    than later ones.

    Args:
        table_elem: Table XML element
        rows: List of row elements

    Returns:
        Number of columns (0 when there is no grid and no rows)
    """
    # Try to get from tblGrid first
    tblGrid = table_elem.find('w:tblGrid', NAMESPACES)
    if tblGrid is not None:
        grid_cols = tblGrid.findall('w:gridCol', NAMESPACES)
        if grid_cols:
            return len(grid_cols)

    # Fallback: widest row, measured in grid columns
    widest = 0
    for row in rows:
        row_width = 0
        for cell in row.findall('w:tc', NAMESPACES):
            colspan = 1
            tcPr = cell.find('w:tcPr', NAMESPACES)
            if tcPr is not None:
                gs = tcPr.find('w:gridSpan', NAMESPACES)
                if gs is not None:
                    try:
                        colspan = int(gs.get(qn('w:val'), 1))
                    except (ValueError, TypeError):
                        colspan = 1
            row_width += colspan
        widest = max(widest, row_width)

    return widest
314
+
315
def _calculate_column_widths(
    self,
    table_elem: Any,
    num_cols: int
) -> List[float]:
    """Return each column's width as a percentage of the table width.

    Widths are read from the ``w:w`` attribute of every <w:gridCol> in
    <w:tblGrid>; a missing, empty or non-integer value counts as 0.
    When the grid is absent or its widths sum to 0, the result falls
    back to ``num_cols`` equal shares.

    Args:
        table_elem: Table XML element
        num_cols: Number of columns

    Returns:
        List of column widths as percentages (empty when no grid widths
        are usable and ``num_cols`` is not positive)
    """
    percentages: List[float] = []

    grid = table_elem.find('w:tblGrid', NAMESPACES)
    if grid is not None:
        # Collect the raw width of each declared grid column
        raw_values: List[int] = []
        for grid_col in grid.findall('w:gridCol', NAMESPACES):
            attr = grid_col.get(qn('w:w'))
            try:
                raw_values.append(int(attr) if attr else 0)
            except ValueError:
                raw_values.append(0)

        # Convert to percentages only when there is something to divide by
        grand_total = sum(raw_values)
        if grand_total > 0:
            percentages = [(value / grand_total) * 100 for value in raw_values]

    # Fallback: equal widths
    if not percentages and num_cols > 0:
        percentages = [100.0 / num_cols] * num_cols

    return percentages
357
+
358
def _calculate_all_rowspans(
    self,
    table_elem: Any,
    rows: List[Any],
    num_rows: int
) -> Tuple[Dict[Tuple[int, int], int], List[List[Tuple[int, int]]]]:
    """Resolve grid positions for every cell and rowspans for merge heads.

    The work happens in three passes:
    1. Read (colspan, vMerge status) for each cell of each row.
    2. Walk the rows left to right, assigning grid columns and filling an
       ownership matrix: a 'continue' cell inherits the owner recorded
       in the nearest row above that has an entry at its start column;
       every other cell (and a continue cell with nothing above it)
       owns its own slots.
    3. For each 'restart' cell, count the consecutive rows below whose
       slot at the same start column is owned by that cell.

    Args:
        table_elem: Table XML element (not read by this method)
        rows: List of row elements
        num_rows: Number of rows

    Returns:
        Tuple of (rowspan_map, cell_grid_col)
        - rowspan_map: Dict[(row_idx, grid_col), rowspan]; holds entries
          for 'restart' cells and cells without vMerge only
        - cell_grid_col: List[List[(start_col, end_col)]], one inner list
          per row, one tuple per cell
    """
    # Pass 1: gather (colspan, vmerge_status) per cell
    cell_specs: List[List[Tuple[int, str]]] = []
    for row in rows:
        specs = []
        for cell in row.findall('w:tc', NAMESPACES):
            span = 1
            status = 'none'
            props = cell.find('w:tcPr', NAMESPACES)

            if props is not None:
                grid_span = props.find('w:gridSpan', NAMESPACES)
                if grid_span is not None:
                    try:
                        span = int(grid_span.get(qn('w:val'), 1))
                    except (ValueError, TypeError):
                        span = 1

                v_merge = props.find('w:vMerge', NAMESPACES)
                if v_merge is not None:
                    if v_merge.get(qn('w:val')) == 'restart':
                        status = 'restart'
                    else:
                        status = 'continue'

            specs.append((span, status))
        cell_specs.append(specs)

    # Pass 2: assign grid columns and record slot ownership.
    # ownership[row][col] = (owner_row, owner_col, colspan)
    width = 30  # initial matrix width; grown in steps of 10 on demand
    ownership: List[List[Optional[Tuple[int, int, int]]]] = [
        [None] * width for _ in range(num_rows)
    ]
    cell_grid_col: List[List[Tuple[int, int]]] = []

    for r, specs in enumerate(cell_specs):
        cursor = 0
        positions: List[Tuple[int, int]] = []

        for span, status in specs:
            # Guard: step past slots of this row that are already taken
            while cursor < width and ownership[r][cursor] is not None:
                cursor += 1

            # Widen every row of the matrix until the cell fits
            while cursor + span > width:
                for matrix_row in ownership:
                    matrix_row.extend([None] * 10)
                width += 10

            first = cursor
            positions.append((first, cursor + span - 1))

            # Default owner is the cell itself (restart, normal, or a
            # continue cell with nothing above it).
            owner = (r, first, span)
            if status == 'continue':
                # Inherit from the nearest row above that has an entry
                for above in range(r - 1, -1, -1):
                    found = ownership[above][first]
                    if found is not None:
                        owner = found
                        break

            for c in range(first, first + span):
                ownership[r][c] = owner

            cursor += span

        cell_grid_col.append(positions)

    # Pass 3: derive rowspans
    rowspan_map: Dict[Tuple[int, int], int] = {}
    for r, specs in enumerate(cell_specs):
        # positions and specs were built in lockstep, so zip pairs them 1:1
        for (span, status), (first, _last) in zip(specs, cell_grid_col[r]):
            if status == 'restart':
                expected_owner = (r, first, span)
                height = 1
                for below in range(r + 1, num_rows):
                    if first < width and ownership[below][first] == expected_owner:
                        height += 1
                    else:
                        break
                rowspan_map[(r, first)] = height
            elif status == 'none':
                rowspan_map[(r, first)] = 1

    return rowspan_map, cell_grid_col
486
+
487
def _extract_cell_text(self, cell_elem: Any) -> str:
    """Extract text content from a cell element.

    Every descendant <w:p> contributes one line made of the concatenated
    text of its descendant <w:t> elements; paragraphs without text are
    skipped and lines are joined with a newline.

    The descendant search also matches paragraphs nested inside another
    paragraph (e.g. text-box content), whose <w:t> elements are already
    reachable from the outer paragraph. Each <w:t> element is therefore
    emitted only once, as part of the first paragraph that reaches it,
    so nested text is not duplicated.

    Args:
        cell_elem: Cell XML element

    Returns:
        Cell text content ('' when the cell holds no text)
    """
    lines = []
    # Holding the elements themselves (not their ids) keeps them alive,
    # so identity-based membership stays valid for the whole traversal.
    emitted = set()

    for paragraph in cell_elem.findall('.//w:p', NAMESPACES):
        fragments = []
        for text_node in paragraph.findall('.//w:t', NAMESPACES):
            if text_node in emitted:
                # Already output through an enclosing paragraph
                continue
            emitted.add(text_node)
            if text_node.text:
                fragments.append(text_node.text)
        if fragments:
            lines.append(''.join(fragments))

    return '\n'.join(lines)
507
+
508
+
509
+ # Factory function
510
def create_docx_table_extractor(
    config: Optional[TableExtractorConfig] = None
) -> DOCXTableExtractor:
    """Build a new DOCXTableExtractor.

    Args:
        config: Table extraction configuration, handed to the
            constructor as-is (``None`` included)

    Returns:
        A freshly constructed DOCXTableExtractor instance
    """
    extractor = DOCXTableExtractor(config)
    return extractor
522
+
523
+
524
# Public names exported by this module: the extractor class and its factory.
__all__ = ['DOCXTableExtractor', 'create_docx_table_extractor']