xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,482 @@
1
+ # xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py
2
+ """
3
+ RTF Table Extractor
4
+
5
+ Extracts and parses tables from RTF content.
6
+ Includes RTFCellInfo and RTFTable data models.
7
+ """
8
+ import logging
9
+ import re
10
+ from dataclasses import dataclass, field
11
+ from typing import List, NamedTuple, Optional, Tuple
12
+
13
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_decoder import (
14
+ decode_hex_escapes,
15
+ )
16
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_text_cleaner import (
17
+ clean_rtf_text,
18
+ )
19
+ from xgen_doc2chunk.core.processor.rtf_helper.rtf_region_finder import (
20
+ find_excluded_regions,
21
+ is_in_excluded_region,
22
+ )
23
+
24
+ logger = logging.getLogger("xgen_doc2chunk.rtf.table")
25
+
26
+
27
+ # =============================================================================
28
+ # Data Models
29
+ # =============================================================================
30
+
31
class RTFCellInfo(NamedTuple):
    """One RTF table cell: its text together with the merge flags of its definition.

    Instances are plain tuples, so the field order below is part of the
    interface and must not change.
    """
    # Text content of the cell.
    text: str
    # Set when the cell definition carries \clmgf (first cell of a horizontal merge).
    h_merge_first: bool
    # Set when the cell definition carries \clmrg (cell continues a horizontal merge).
    h_merge_cont: bool
    # Set when the cell definition carries \clvmgf (first cell of a vertical merge).
    v_merge_first: bool
    # Set when the cell definition carries \clvmrg (cell continues a vertical merge).
    v_merge_cont: bool
    # Right edge of the cell taken from \cellxN, in twips.
    right_boundary: int
39
+
40
+
41
@dataclass
class RTFTable:
    """A parsed RTF table: rows of cells plus where the table sits in the document.

    Merged cells are kept in the grid (one entry per cell definition); the
    merge flags on each cell are turned into colspan/rowspan values when the
    table is rendered.
    """
    # Grid of cells, one inner list per table row.
    rows: List[List["RTFCellInfo"]] = field(default_factory=list)
    # Largest number of cells seen in any row.
    col_count: int = 0
    # Start offset of the table in the RTF content.
    position: int = 0
    # End offset of the table in the RTF content.
    end_position: int = 0

    def is_real_table(self) -> bool:
        """Return True when the table has at least two effective columns.

        A table with n rows but a single column is treated as a list, not
        as a table.
        """
        return bool(self.rows) and self._get_effective_col_count() >= 2

    def _get_effective_col_count(self) -> int:
        """Return the widest row width, ignoring trailing empty columns.

        For each row the width is the index of the last cell that has text
        (or starts a vertical merge) plus one; horizontal-merge continuation
        cells never count.
        """
        widest = 0
        for row in self.rows:
            occupied = [
                idx
                for idx, cell in enumerate(row)
                if not cell.h_merge_cont
                and (cell.text.strip() or cell.v_merge_first)
            ]
            if occupied:
                # Indices are collected in ascending order, so the last is the max.
                widest = max(widest, occupied[-1] + 1)
        return widest

    def to_html(self) -> str:
        """Render the table as an HTML ``<table>`` with colspan/rowspan attributes.

        Cells swallowed by a merge are omitted; runs of whitespace inside a
        cell are collapsed to a single space. Returns "" for a table with no
        rows.
        """
        if not self.rows:
            return ""

        spans = self._calculate_merge_info()
        out = ['<table border="1">']

        for r, row in enumerate(self.rows):
            out.append('<tr>')
            for c, cell in enumerate(row):
                if c < len(spans[r]):
                    colspan, rowspan = spans[r][c]
                    if colspan == 0 or rowspan == 0:
                        # Cell is covered by a neighbouring merged cell.
                        continue
                else:
                    # No span entry for this column: render as a plain cell.
                    colspan = rowspan = 1

                attr_str = ''
                if colspan > 1:
                    attr_str += f' colspan="{colspan}"'
                if rowspan > 1:
                    attr_str += f' rowspan="{rowspan}"'

                cell_text = re.sub(r'\s+', ' ', cell.text).strip()
                out.append(f'<td{attr_str}>{cell_text}</td>')
            out.append('</tr>')

        out.append('</table>')
        return '\n'.join(out)

    def to_text_list(self) -> str:
        """Flatten a single-column table into plain text.

        - 1x1 table: the text of the only cell is returned unchanged.
        - otherwise: the first cell of every row, skipping empty ones,
          joined by blank lines.
        """
        if not self.rows:
            return ""

        if len(self.rows) == 1 and len(self.rows[0]) == 1:
            return self.rows[0][0].text

        return '\n\n'.join(
            row[0].text for row in self.rows if row and row[0].text
        )

    def _calculate_merge_info(self) -> List[List[tuple]]:
        """Compute a (colspan, rowspan) pair for every grid position.

        Returns a list with one entry per row, each padded to the width of
        the widest row. (0, 0) marks a cell that is covered by a merge and
        must not be rendered. Returns [] when there are no cells at all.
        """
        width = max((len(row) for row in self.rows), default=0)
        if width == 0:
            return []

        height = len(self.rows)
        spans = [[(1, 1)] * width for _ in range(height)]

        # Horizontal pass: a merge start absorbs the continuation cells that
        # immediately follow it in the same row.
        for r, row in enumerate(self.rows):
            for c, cell in enumerate(row):
                if not cell.h_merge_first:
                    continue
                run = 1
                for follower in row[c + 1:]:
                    if not follower.h_merge_cont:
                        break
                    spans[r][c + run] = (0, 0)
                    run += 1
                spans[r][c] = (run, 1)

        # Vertical pass: walk each column top to bottom.
        for c in range(width):
            r = 0
            while r < height:
                row = self.rows[r]
                if c >= len(row):
                    # Short row: nothing at this column.
                    r += 1
                    continue

                cell = row[c]
                if cell.v_merge_first:
                    depth = 1
                    while (
                        r + depth < height
                        and c < len(self.rows[r + depth])
                        and self.rows[r + depth][c].v_merge_cont
                    ):
                        spans[r + depth][c] = (0, 0)
                        depth += 1
                    # Keep the colspan decided by the horizontal pass.
                    spans[r][c] = (spans[r][c][0], depth)
                    r += depth
                    continue

                if cell.v_merge_cont:
                    # Continuation with no merge start directly above it.
                    spans[r][c] = (0, 0)
                r += 1

        return spans
200
+
201
+
202
+ # =============================================================================
203
+ # Table Extraction Functions
204
+ # =============================================================================
205
+
206
def extract_tables_with_positions(
    content: str,
    encoding: str = "cp949"
) -> Tuple[List[RTFTable], List[Tuple[int, int, RTFTable]]]:
    """
    Locate every table in RTF content and parse it, keeping its position.

    A table row is the text from a \\trowd control word up to the end of the
    following \\row control word. Rows whose start lies in an excluded region
    (as reported by find_excluded_regions) are dropped. Rows that start less
    than 150 characters after the previous kept row ended are grouped into
    the same table.

    Args:
        content: RTF string content
        encoding: Encoding passed on to the cell text decoder

    Returns:
        Tuple of (table list, table region list [(start, end, table), ...])
    """
    tables = []
    table_regions = []

    # Regions (header, footer, footnote, etc.) whose rows must be ignored.
    excluded_regions = find_excluded_regions(content)

    # End offset of every \row control word.
    row_ends = [m.end() for m in re.finditer(r'\\row(?![a-z])', content)]
    if not row_ends:
        return tables, table_regions

    # For each \row, look for the first \trowd since the previous \row.
    located_rows = []
    segment_start = 0
    for row_end in row_ends:
        row_start = content.find('\\trowd', segment_start, row_end)
        segment_start = row_end
        if row_start == -1:
            continue

        if is_in_excluded_region(row_start, excluded_regions):
            logger.debug(f"Skipping table row at {row_start} (in header/footer/footnote)")
            continue

        located_rows.append((row_start, row_end, content[row_start:row_end]))

    if not located_rows:
        return tables, table_regions

    # Each group is [start, end, row_texts]; the end of the last group is
    # always the end of the most recently kept row.
    table_groups = []
    for row_start, row_end, row_text in located_rows:
        if table_groups and row_start - table_groups[-1][1] < 150:
            table_groups[-1][1] = row_end
            table_groups[-1][2].append(row_text)
        else:
            table_groups.append([row_start, row_end, [row_text]])

    logger.info(f"Found {len(table_groups)} table groups")

    for start_pos, end_pos, row_texts in table_groups:
        table = _parse_table_with_merge(row_texts, encoding)
        if not (table and table.rows):
            continue
        table.position = start_pos
        table.end_position = end_pos
        tables.append(table)
        table_regions.append((start_pos, end_pos, table))

    logger.info(f"Extracted {len(tables)} tables")
    return tables, table_regions
308
+
309
+
310
def _parse_table_with_merge(rows: List[str], encoding: str = "cp949") -> Optional[RTFTable]:
    """
    Build an RTFTable from the raw RTF text of its rows.

    Rows that yield no cells are dropped; col_count is the cell count of the
    widest remaining row.

    Args:
        rows: Table row text list
        encoding: Encoding to use

    Returns:
        RTFTable object, or None when no row produced any cell
    """
    parsed_rows = [
        cells
        for cells in (_extract_cells_with_merge(row_text, encoding) for row_text in rows)
        if cells
    ]
    if not parsed_rows:
        return None

    return RTFTable(
        rows=parsed_rows,
        col_count=max(len(cells) for cells in parsed_rows),
    )
331
+
332
+
333
def _extract_cells_with_merge(row_text: str, encoding: str = "cp949") -> List[RTFCellInfo]:
    """
    Extract cell content and merge information from table row.

    The row definition (everything before the first \\cell terminator) is
    scanned for merge flags and \\cellxN boundaries; each \\cellxN closes one
    cell definition. The definitions are then paired, in order, with the
    cell texts returned by _extract_cell_texts. A text without a matching
    definition gets all flags False and a boundary of 0.

    Merge control words are matched as whole words, so longer control words
    that merely share a prefix (for example \\clmrgd) are not mistaken for
    \\clmrg.

    Args:
        row_text: Table row RTF text
        encoding: Encoding to use

    Returns:
        List of RTFCellInfo
    """
    blank_def = {
        'h_merge_first': False,
        'h_merge_cont': False,
        'v_merge_first': False,
        'v_merge_cont': False,
        'right_boundary': 0
    }
    flag_for_token = {
        '\\clmgf': 'h_merge_first',
        '\\clmrg': 'h_merge_cont',
        '\\clvmgf': 'v_merge_first',
        '\\clvmrg': 'v_merge_cont',
    }

    # Step 1: Parse cell definitions (attributes before cellx).
    # The definition part ends at the first \cell that is not a \cellx.
    first_cell = re.search(r'\\cell(?!x)', row_text)
    def_part = row_text[:first_cell.start()] if first_cell else row_text

    # The lookahead keeps a merge flag from matching the prefix of a longer
    # control word; it mirrors the \row(?![a-z]) guard used for row ends.
    cell_def_pattern = r'\\cl(?:mgf|mrg|vmgf|vmrg)(?![a-z])|\\cellx(-?\d+)'

    cell_defs = []
    current_def = dict(blank_def)
    for match in re.finditer(cell_def_pattern, def_part):
        flag = flag_for_token.get(match.group())
        if flag is not None:
            current_def[flag] = True
            continue

        # \cellxN: record the boundary and close the current definition.
        current_def['right_boundary'] = int(match.group(1))
        cell_defs.append(current_def)
        current_def = dict(blank_def)

    # Step 2: Extract cell texts
    cell_texts = _extract_cell_texts(row_text, encoding)

    # Step 3: Match cell definitions with content
    cells = []
    for i, cell_text in enumerate(cell_texts):
        cell_def = cell_defs[i] if i < len(cell_defs) else blank_def
        cells.append(RTFCellInfo(
            text=cell_text,
            h_merge_first=cell_def['h_merge_first'],
            h_merge_cont=cell_def['h_merge_cont'],
            v_merge_first=cell_def['v_merge_first'],
            v_merge_cont=cell_def['v_merge_cont'],
            right_boundary=cell_def['right_boundary']
        ))

    return cells
423
+
424
+
425
def _extract_cell_texts(row_text: str, encoding: str = "cp949") -> List[str]:
    """
    Return the cleaned text of every cell in a table row.

    Cell content is the text between consecutive \\cell terminators; the
    first cell starts right after the last \\cellxN found before the first
    terminator (or at the beginning of the row when there is none). Each
    slice is passed through decode_hex_escapes and then clean_rtf_text.

    Args:
        row_text: Table row RTF text
        encoding: Encoding to use

    Returns:
        List of cell texts (empty when the row has no \\cell terminator)
    """
    # Offsets of every \cell terminator; \cellx boundaries are not terminators.
    terminators = [m.start() for m in re.finditer(r'\\cell(?!x)', row_text)]
    if not terminators:
        return []

    # Content of the first cell begins after the last boundary definition.
    content_start = 0
    for boundary in re.finditer(r'\\cellx-?\d+', row_text[:terminators[0]]):
        content_start = boundary.end()

    texts = []
    for stop in terminators:
        raw = row_text[content_start:stop]
        decoded = decode_hex_escapes(raw, encoding)
        texts.append(clean_rtf_text(decoded, encoding))
        content_start = stop + 5  # skip past '\cell' (5 characters)

    return texts
476
+
477
+
478
+ __all__ = [
479
+ 'RTFCellInfo',
480
+ 'RTFTable',
481
+ 'extract_tables_with_positions',
482
+ ]