xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,832 @@
1
+ # chunking_helper/table_chunker.py
2
+ """
3
+ Table Chunker - Core table chunking logic
4
+
5
+ Main Features:
6
+ - Split large HTML tables to fit chunk_size
7
+ - Split large Markdown tables to fit chunk_size
8
+ - Preserve and restore table structure (headers)
9
+ - rowspan/colspan aware splitting for HTML
10
+ - rowspan adjustment
11
+ - NO OVERLAP for table chunks (intentional to prevent data duplication)
12
+ """
13
+ import logging
14
+ import re
15
+ from typing import Dict, List, Optional
16
+
17
+ from xgen_doc2chunk.chunking.constants import (
18
+ ParsedTable, TableRow, ParsedMarkdownTable,
19
+ TABLE_WRAPPER_OVERHEAD, CHUNK_INDEX_OVERHEAD,
20
+ MARKDOWN_TABLE_SEPARATOR_PATTERN
21
+ )
22
+ from xgen_doc2chunk.chunking.table_parser import (
23
+ parse_html_table, extract_cell_spans_with_positions, has_complex_spans
24
+ )
25
+
26
+ logger = logging.getLogger("document-processor")
27
+
28
+
29
def calculate_available_space(
    chunk_size: int,
    header_size: int,
    chunk_index: int = 0,
    total_chunks: int = 1
) -> int:
    """
    Compute how many characters of a chunk remain for data rows.

    The budget is chunk_size minus the fixed table-wrapper overhead, the
    header size (headers are repeated in every chunk), and — when the table
    spans multiple chunks — the chunk-index metadata line.

    Args:
        chunk_size: Total chunk size.
        header_size: Size of the header that is repeated per chunk.
        chunk_index: Current chunk index (0-based); accepted for interface
            compatibility but not used in the calculation.
        total_chunks: Expected total number of chunks.

    Returns:
        Characters available for data rows (never less than 100).
    """
    reserved = TABLE_WRAPPER_OVERHEAD + header_size

    # The "[Table Chunk i/N]" marker is only emitted for multi-chunk tables.
    if total_chunks > 1:
        reserved += CHUNK_INDEX_OVERHEAD

    # Floor of 100 characters so pathological inputs still make progress.
    return max(chunk_size - reserved, 100)
60
+
61
+
62
def adjust_rowspan_in_chunk(rows_html: List[str], total_rows_in_chunk: int) -> List[str]:
    """
    Clip rowspan values so they never extend past the end of a chunk.

    A cell whose rowspan would run beyond the last row of the chunk is
    clamped to the number of rows actually remaining; a clamped value of 1
    (or less) drops the rowspan attribute entirely so the table renders
    correctly on its own.

    Args:
        rows_html: HTML row strings (<tr>...</tr>) included in the chunk.
        total_rows_in_chunk: Total number of rows in the chunk.

    Returns:
        Row strings with rowspan values adjusted to fit the chunk.
    """
    if not rows_html:
        return rows_html

    # Matches <td ...>...</td> and <th ...>...</th>, including multi-line content.
    cell_re = r'<(td|th)([^>]*)>(.*?)</\1>'

    result: List[str] = []
    for position, markup in enumerate(rows_html):
        rows_left = total_rows_in_chunk - position

        def _fix_cell(m, limit=rows_left):
            """Clamp a single cell's rowspan to the rows left in the chunk."""
            tag, attrs, body = m.group(1), m.group(2), m.group(3)

            found = re.search(r'rowspan=["\']?(\d+)["\']?', attrs, re.IGNORECASE)
            if not found:
                # Cell has no rowspan — leave it untouched.
                return m.group(0)

            clipped = min(int(found.group(1)), limit)
            if clipped <= 1:
                # A rowspan of 1 is the default; remove the attribute.
                attrs = re.sub(r'\s*rowspan=["\']?\d+["\']?', '', attrs, flags=re.IGNORECASE)
            else:
                attrs = re.sub(
                    r'rowspan=["\']?\d+["\']?',
                    f"rowspan='{clipped}'",
                    attrs,
                    flags=re.IGNORECASE
                )
            return f'<{tag}{attrs}>{body}</{tag}>'

        result.append(re.sub(cell_re, _fix_cell, markup, flags=re.DOTALL | re.IGNORECASE))

    return result
121
+
122
+
123
def build_table_chunk(
    header_html: str,
    data_rows: List[TableRow],
    chunk_index: int = 0,
    total_chunks: int = 1,
    context_prefix: str = ""
) -> str:
    """
    Assemble one complete, self-contained table HTML chunk.

    The chunk carries the optional context prefix, a "[Table Chunk i/N]"
    marker for multi-chunk tables, the (repeated) header rows, and the data
    rows with any rowspan clipped to the chunk boundary.

    Args:
        header_html: HTML of the header rows (may be empty).
        data_rows: Data rows to include in this chunk.
        chunk_index: Current chunk index (0-based).
        total_chunks: Total number of chunks.
        context_prefix: Context info (metadata, sheet info, etc.) repeated
            in every chunk.

    Returns:
        Complete table HTML for the chunk.
    """
    segments: List[str] = []

    if context_prefix:
        segments.append(context_prefix)

    # Marker only appears when the table was actually split.
    if total_chunks > 1:
        segments.append(f"[Table Chunk {chunk_index + 1}/{total_chunks}]")

    segments.append("<table border='1'>")
    if header_html:
        segments.append(header_html)

    # Clip rowspans that would cross the chunk boundary before emitting rows.
    raw_rows = [row.html for row in data_rows]
    segments.extend(adjust_rowspan_in_chunk(raw_rows, len(data_rows)))

    segments.append("</table>")
    return "\n".join(segments)
176
+
177
+
178
def update_chunk_metadata(chunks: List[str], total_chunks: int) -> List[str]:
    """
    Rewrite every chunk's "[Table Chunk i/N]" marker with the final count.

    Chunks that already carry a marker have it replaced in place; chunks
    without one get a marker prepended on its own line.

    Args:
        chunks: Table chunks to relabel.
        total_chunks: Actual total number of chunks.

    Returns:
        Chunks with consistent index/total markers.
    """
    marker_re = re.compile(r'\[Table Chunk \d+/\d+\]')

    result: List[str] = []
    for position, text in enumerate(chunks):
        label = f"[Table Chunk {position + 1}/{total_chunks}]"
        if marker_re.search(text):
            result.append(marker_re.sub(label, text))
        else:
            # No marker yet — prepend one so every chunk is labeled.
            result.append(f"{label}\n{text}")

    return result
198
+
199
+
200
def split_table_into_chunks(
    parsed_table: ParsedTable,
    chunk_size: int,
    chunk_overlap: int = 0,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a parsed table to fit chunk_size.
    Each chunk has a complete table structure (including headers).

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Row splitting rules:
    - Minimum 1 row per chunk (rows are NEVER split)
    - Chunks can expand up to 1.5x of chunk_size to include more rows
    - Only exceeds chunk_size when necessary to maintain row integrity

    Args:
        parsed_table: Parsed table information
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of split table HTML chunks
    """
    data_rows = parsed_table.data_rows
    header_html = parsed_table.header_html
    header_size = parsed_table.header_size

    # +2 covers the newline joining the prefix to the table body.
    context_size = len(context_prefix) + 2 if context_prefix else 0

    if not data_rows:
        # Nothing to split — return original as a single chunk.
        return [parsed_table.original_html]

    # First pass: approximate chunk count from total data size, then
    # recompute the per-chunk budget (a multi-chunk table also pays the
    # chunk-index marker overhead inside calculate_available_space).
    total_data_size = sum(row.char_length for row in data_rows)
    available_per_chunk = calculate_available_space(chunk_size, header_size + context_size, 0, 1)
    estimated_chunks = max(1, (total_data_size + available_per_chunk - 1) // available_per_chunk)
    available_per_chunk = calculate_available_space(chunk_size, header_size + context_size, 0, estimated_chunks)

    # Hard ceiling: a chunk may stretch to 1.5x chunk_size before flushing.
    max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - CHUNK_INDEX_OVERHEAD

    chunks: List[str] = []
    current_rows: List[TableRow] = []
    current_size = 0
    # Table chunking does not apply overlap (prevent data duplication)

    for row in data_rows:
        row_size = row.char_length + 1  # +1 for the joining newline

        # A row joins the current chunk unless the chunk is non-empty AND
        # adding it would blow past the hard 1.5x ceiling; rows are never split.
        over_target = bool(current_rows) and (current_size + row_size > available_per_chunk)
        if over_target and current_size + row_size > max_chunk_data_size:
            # Exceeds the 1.5x ceiling — flush and start a new chunk with
            # this row (minimum 1 row per chunk guaranteed).
            chunks.append(build_table_chunk(
                header_html,
                current_rows,
                chunk_index=len(chunks),
                total_chunks=estimated_chunks,
                context_prefix=context_prefix
            ))
            current_rows = [row]
            current_size = row_size
        else:
            current_rows.append(row)
            current_size += row_size

    # Flush the trailing chunk.
    if current_rows:
        chunks.append(build_table_chunk(
            header_html,
            current_rows,
            chunk_index=len(chunks),
            total_chunks=max(len(chunks) + 1, estimated_chunks),
            context_prefix=context_prefix
        ))

    # The estimate may be off — relabel markers with the actual count.
    if len(chunks) != estimated_chunks and len(chunks) > 1:
        chunks = update_chunk_metadata(chunks, len(chunks))

    logger.info(f"Table split into {len(chunks)} chunks (original: {len(parsed_table.original_html)} chars)")

    return chunks
301
+
302
+
303
def split_table_preserving_rowspan(
    parsed_table: ParsedTable,
    chunk_size: int,
    chunk_overlap: int,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a table considering rowspan.

    Rows connected by rowspan are kept together as semantic blocks.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Algorithm:
    1. Track active rowspan for each row (by column position, considering colspan)
    2. If all rowspans from previous row end and new rowspan starts, create new block
    3. Combine blocks to fit chunk_size (a chunk may stretch to 1.5x before flushing)

    Args:
        parsed_table: Parsed table
        chunk_size: Chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.)

    Returns:
        List of split table chunks
    """
    data_rows = parsed_table.data_rows
    header_html = parsed_table.header_html
    header_size = parsed_table.header_size

    # Context prefix size; +2 accounts for the newline joining it to the table.
    context_size = len(context_prefix) + 2 if context_prefix else 0

    if not data_rows:
        # Nothing to split — return the original (with prefix if present).
        if context_prefix:
            return [f"{context_prefix}\n{parsed_table.original_html}"]
        return [parsed_table.original_html]

    # === Identify rowspan blocks ===
    # Block = group of consecutive rows connected by rowspan.
    active_rowspans: Dict[int, int] = {}  # column_position -> remaining_rows (including current row)
    row_block_ids: List[int] = []  # Block ID assigned to each row, in order
    current_block_id = -1

    for row_idx, row in enumerate(data_rows):
        # 1. Decrease remaining rowspan carried over from the previous row
        #    (skipped for the first row — nothing is carried in yet).
        if row_idx > 0:
            finished_cols = []
            for col in list(active_rowspans.keys()):
                active_rowspans[col] -= 1
                if active_rowspans[col] <= 0:
                    finished_cols.append(col)
            for col in finished_cols:
                del active_rowspans[col]

        # Snapshot taken after the decrement but BEFORE this row's new spans
        # are added — this ordering is what detects block boundaries.
        had_active_before_new = len(active_rowspans) > 0

        # 2. Add new rowspans starting from the current row.
        new_spans = extract_cell_spans_with_positions(row.html)
        for col, span in new_spans.items():
            # Update if larger than existing rowspan (longer span takes priority)
            if col not in active_rowspans or span > active_rowspans[col]:
                active_rowspans[col] = span

        has_active_now = len(active_rowspans) > 0
        has_new_span = len(new_spans) > 0

        # Block determination logic:
        # - No active rowspan at all -> independent single-row block
        # - All carried-over rowspans ended and a new span starts here -> new block
        # - Otherwise the row continues the current block
        if not has_active_now:
            # No rowspan - independent row
            current_block_id += 1
            row_block_ids.append(current_block_id)
        elif not had_active_before_new and has_new_span:
            # All previous rowspans ended and new rowspan starts - new block
            current_block_id += 1
            row_block_ids.append(current_block_id)
        else:
            # Maintain existing block
            row_block_ids.append(current_block_id)

    # Group row indices by block ID.
    block_groups: Dict[int, List[int]] = {}
    for row_idx, block_id in enumerate(row_block_ids):
        if block_id not in block_groups:
            block_groups[block_id] = []
        block_groups[block_id].append(row_idx)

    # Create row_groups in sorted block order (IDs are assigned ascending,
    # so this preserves the original row order).
    row_groups: List[List[int]] = [
        block_groups[block_id]
        for block_id in sorted(block_groups.keys())
    ]

    # === Combine groups into chunks ===
    chunks: List[str] = []
    current_rows: List[TableRow] = []
    current_size = 0

    available_space = calculate_available_space(chunk_size, header_size + context_size, 0, 1)
    # Maximum allowed chunk size (1.5x of chunk_size)
    max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - CHUNK_INDEX_OVERHEAD

    for group in row_groups:
        group_rows = [data_rows[idx] for idx in group]
        group_size = sum(row.char_length + 1 for row in group_rows)  # +1 per joining newline

        if current_rows and current_size + group_size > available_space:
            # Over the soft budget — check the hard 1.5x ceiling.
            if current_size + group_size <= max_chunk_data_size:
                # Still within 1.5x limit - add group to current chunk
                current_rows.extend(group_rows)
                current_size += group_size
            else:
                # Exceeds 1.5x limit - flush current chunk and start a new one.
                # total_chunks here is a provisional guess; markers are fixed
                # up by update_chunk_metadata below.
                chunks.append(build_table_chunk(
                    header_html, current_rows, len(chunks), len(chunks) + 2,
                    context_prefix=context_prefix
                ))
                current_rows = group_rows[:]
                current_size = group_size
        else:
            current_rows.extend(group_rows)
            current_size += group_size

    # Flush the last chunk.
    if current_rows:
        chunks.append(build_table_chunk(
            header_html, current_rows, len(chunks), len(chunks) + 1,
            context_prefix=context_prefix
        ))

    # Relabel all markers with the actual chunk count.
    if len(chunks) > 1:
        chunks = update_chunk_metadata(chunks, len(chunks))

    return chunks
445
+
446
+
447
def chunk_large_table(
    table_html: str,
    chunk_size: int,
    chunk_overlap: int,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a large HTML table to fit chunk_size, restoring the table
    structure (headers) in each chunk.

    Tables with rowspan are routed to span-aware splitting so rows that
    belong together stay in the same chunk.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Args:
        table_html: HTML table string.
        chunk_size: Maximum chunk size.
        chunk_overlap: Not used (kept for compatibility).
        context_prefix: Context info (metadata, sheet info, etc.) included
            in all chunks.

    Returns:
        List of split table HTML chunks.
    """
    def _single(content: str) -> List[str]:
        """Return the content unsplit, prefixed with context if present."""
        if context_prefix:
            return [f"{context_prefix}\n{content}"]
        return [content]

    parsed = parse_html_table(table_html)
    if not parsed:
        logger.warning("Failed to parse table, returning original")
        return _single(table_html)

    # Small enough (including prefix) — no split required.
    if len(table_html) + len(context_prefix) <= chunk_size:
        return _single(table_html)

    # Header-only table — nothing to split.
    if not parsed.data_rows:
        return _single(table_html)

    # Rowspan present — use the span-aware splitter.
    if has_complex_spans(table_html):
        logger.info("Complex table with rowspan detected, using span-aware splitting")
        return split_table_preserving_rowspan(parsed, chunk_size, chunk_overlap, context_prefix)

    # Standard table splitting.
    return split_table_into_chunks(parsed, chunk_size, chunk_overlap, context_prefix)
501
+
502
+
503
+ # ============================================================================
504
+ # Markdown Table Chunking Functions
505
+ # ============================================================================
506
+
507
def parse_markdown_table(table_text: str) -> Optional[ParsedMarkdownTable]:
    """
    Parse a Markdown table and extract structural information.

    A Markdown table has:
    - Header row: | col1 | col2 | col3 |
    - Separator row: |---|---|---| or |:---:|:---|---:|
    - Data rows: | data1 | data2 | data3 |

    Args:
        table_text: Markdown table text.

    Returns:
        ParsedMarkdownTable object, or None if the text cannot be parsed
        as a table.
    """
    try:
        # Normalize: strip each line and drop blank ones.
        lines = [ln.strip() for ln in table_text.strip().split('\n') if ln.strip()]

        if len(lines) < 2:
            logger.debug("Not enough lines for a valid Markdown table")
            return None

        header_row: Optional[str] = None
        separator_row: Optional[str] = None
        separator_idx = -1

        # Locate the first separator line; the header is the line before it.
        first_sep = next(
            (
                (idx, ln) for idx, ln in enumerate(lines)
                if re.match(MARKDOWN_TABLE_SEPARATOR_PATTERN, ln)
            ),
            None,
        )
        if first_sep is not None:
            separator_idx, separator_row = first_sep
            if separator_idx > 0:
                header_row = lines[separator_idx - 1]

        if not separator_row or not header_row:
            # Fallback heuristic: first line is the header, second the separator.
            if len(lines) >= 2 and lines[0].startswith('|') and '---' in lines[1]:
                header_row, separator_row, separator_idx = lines[0], lines[1], 1
            else:
                logger.debug("Could not identify header/separator in Markdown table")
                return None

        # |---|---| has n+1 pipes for n columns.
        total_cols = separator_row.count('|') - 1

        # Everything after the separator is data.
        data_rows = lines[separator_idx + 1:]

        # Header text (header + separator) restored at the top of each chunk.
        header_text = f"{header_row}\n{separator_row}"
        header_size = len(header_text) + 1  # +1 for the trailing newline

        return ParsedMarkdownTable(
            header_row=header_row,
            separator_row=separator_row,
            data_rows=data_rows,
            total_cols=total_cols,
            original_text=table_text,
            header_text=header_text,
            header_size=header_size
        )

    except Exception as e:
        # Best-effort parser: log and signal failure rather than raise.
        logger.warning(f"Failed to parse Markdown table: {e}")
        return None
578
+
579
+
580
def build_markdown_table_chunk(
    header_text: str,
    data_rows: List[str],
    chunk_index: int = 0,
    total_chunks: int = 1,
    context_prefix: str = ""
) -> str:
    """
    Assemble one complete Markdown table chunk with its header restored.

    Args:
        header_text: Header row plus separator row.
        data_rows: Data row strings for this chunk.
        chunk_index: Current chunk index (0-based).
        total_chunks: Total number of chunks.
        context_prefix: Context info (metadata, sheet info, etc.) repeated
            in every chunk.

    Returns:
        Complete Markdown table chunk.
    """
    pieces: List[str] = []

    if context_prefix:
        pieces.append(context_prefix)

    # Marker only appears when the table was actually split.
    if total_chunks > 1:
        pieces.append(f"[Table Chunk {chunk_index + 1}/{total_chunks}]")

    # Header (header row + separator row), then the data rows.
    pieces.append(header_text)
    pieces.extend(data_rows)

    return "\n".join(pieces)
618
+
619
+
620
def update_markdown_chunk_metadata(chunks: List[str], total_chunks: int) -> List[str]:
    """
    Update chunk metadata (total chunk count) in Markdown table chunks.

    Args:
        chunks: List of chunks
        total_chunks: Actual total number of chunks

    Returns:
        Updated chunks with correct metadata; chunks without a metadata
        marker get one prepended.
    """
    # Compile once instead of on every iteration.  Matches the marker
    # emitted by build_markdown_table_chunk, e.g. "[Table Chunk 2/7]".
    pattern = re.compile(r'\[Table Chunk \d+/\d+\]')

    updated_chunks = []

    for idx, chunk in enumerate(chunks):
        new_metadata = f"[Table Chunk {idx + 1}/{total_chunks}]"

        if pattern.search(chunk):
            # count=1: only the first (leading) metadata marker is rewritten.
            # Replacing every occurrence would corrupt table data that
            # happens to contain the same "[Table Chunk i/N]" text.
            updated_chunk = pattern.sub(new_metadata, chunk, count=1)
        else:
            # No metadata found - add it at the top of the chunk
            updated_chunk = f"{new_metadata}\n{chunk}"

        updated_chunks.append(updated_chunk)

    return updated_chunks
647
+
648
+
649
def split_markdown_table_into_chunks(
    parsed_table: ParsedMarkdownTable,
    chunk_size: int,
    chunk_overlap: int = 0,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a parsed Markdown table into chunks that fit chunk_size.
    Each chunk is a complete Markdown table with headers restored.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Args:
        parsed_table: Parsed Markdown table information
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of Markdown table chunk strings
    """
    data_rows = parsed_table.data_rows
    header_text = parsed_table.header_text
    header_size = parsed_table.header_size

    # Calculate context size
    context_size = len(context_prefix) + 2 if context_prefix else 0  # +2 for newline

    if not data_rows:
        # No data rows - return original (nothing to split row-wise)
        if context_prefix:
            return [f"{context_prefix}\n{parsed_table.original_text}"]
        return [parsed_table.original_text]

    # Calculate available space per chunk
    # Overhead: chunk index metadata (~25 chars) + header + context
    estimated_chunks = 1
    total_data_size = sum(len(row) + 1 for row in data_rows)  # +1 for newline
    available_per_chunk = chunk_size - header_size - context_size - CHUNK_INDEX_OVERHEAD

    if available_per_chunk > 0:
        # Ceiling division: how many chunks the data rows roughly need.
        # This is only an estimate; the real count is fixed up after the loop.
        estimated_chunks = max(1, (total_data_size + available_per_chunk - 1) // available_per_chunk)

    # Recalculate with estimated chunks: the "[Table Chunk i/N]" marker is
    # only emitted when there is more than one chunk, so a single-chunk
    # table gets its overhead budget back.
    if estimated_chunks > 1:
        available_per_chunk = chunk_size - header_size - context_size - CHUNK_INDEX_OVERHEAD
    else:
        available_per_chunk = chunk_size - header_size - context_size

    # Maximum allowed chunk size (1.5x of chunk_size) - soft limit that lets
    # a chunk overflow a little rather than splitting into a tiny remainder.
    max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - CHUNK_INDEX_OVERHEAD

    chunks: List[str] = []
    current_rows: List[str] = []   # data rows accumulated for the chunk in progress
    current_size = 0               # running size (chars + newlines) of current_rows

    for row in data_rows:
        row_size = len(row) + 1  # +1 for newline

        # Check if adding this row exceeds available space
        if current_rows and (current_size + row_size > available_per_chunk):
            # Check if we can still fit within 1.5x limit
            if current_size + row_size <= max_chunk_data_size:
                # Still within 1.5x limit - add row to current chunk
                current_rows.append(row)
                current_size += row_size
            else:
                # Exceeds 1.5x limit - flush current chunk and start new one
                chunk_text = build_markdown_table_chunk(
                    header_text,
                    current_rows,
                    chunk_index=len(chunks),
                    total_chunks=estimated_chunks,
                    context_prefix=context_prefix
                )
                chunks.append(chunk_text)

                # Start new chunk with this row (minimum 1 row guaranteed)
                current_rows = [row]
                current_size = row_size
        else:
            # Row fits - add to current chunk
            current_rows.append(row)
            current_size += row_size

    # Handle last chunk (flush whatever rows remain)
    if current_rows:
        chunk_text = build_markdown_table_chunk(
            header_text,
            current_rows,
            chunk_index=len(chunks),
            # Guard against the estimate undercounting: the total can never
            # be smaller than the chunks actually produced.
            total_chunks=max(len(chunks) + 1, estimated_chunks),
            context_prefix=context_prefix
        )
        chunks.append(chunk_text)

    # Update total chunk count in metadata if different from estimate
    if len(chunks) != estimated_chunks and len(chunks) > 1:
        chunks = update_markdown_chunk_metadata(chunks, len(chunks))

    logger.info(f"Markdown table split into {len(chunks)} chunks (original: {len(parsed_table.original_text)} chars)")

    return chunks
753
+
754
+
755
def chunk_large_markdown_table(
    table_text: str,
    chunk_size: int,
    chunk_overlap: int,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a large Markdown table to fit chunk_size.
    Restores table structure (header + separator) in each chunk.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Args:
        table_text: Markdown table text
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of split Markdown table chunks
    """
    def _passthrough() -> List[str]:
        # Single-chunk fallback: return the table unchanged, with the
        # context prefix prepended when one was supplied.
        if context_prefix:
            return [f"{context_prefix}\n{table_text}"]
        return [table_text]

    # Fast path: table (plus context) already fits in one chunk.
    # Checked before parsing so small tables skip the parse entirely;
    # the returned value is identical either way.
    if len(table_text) + len(context_prefix) <= chunk_size:
        return _passthrough()

    # Parse table
    parsed = parse_markdown_table(table_text)

    if not parsed:
        logger.warning("Failed to parse Markdown table, returning original")
        return _passthrough()

    # No need to split if no data rows
    if not parsed.data_rows:
        return _passthrough()

    # Split table into chunks
    return split_markdown_table_into_chunks(parsed, chunk_size, chunk_overlap, context_prefix)
802
+
803
+
804
def is_markdown_table(text: str) -> bool:
    """
    Check if text is a Markdown table.

    A Markdown table has:
    - Lines starting with |
    - A separator line with |---|

    Args:
        text: Text to check

    Returns:
        True if text is a Markdown table
    """
    rows = text.strip().split('\n')

    # A table needs at least a header line and a separator line.
    if len(rows) < 2:
        return False

    # Scan for a pipe-prefixed row and a separator-style row; the two
    # conditions may be satisfied by different lines.
    pipe_row_found = False
    separator_found = False
    for row in rows:
        if row.strip().startswith('|'):
            pipe_row_found = True
        if '---' in row and '|' in row:
            separator_found = True

    return pipe_row_found and separator_found
827
+
828
+
829
+ # Note: detect_table_type and chunk_large_table_unified were removed because they
830
+ # were not referenced anywhere in the codebase and duplicated logic handled elsewhere
831
+ # (e.g., via _chunk_table_unified in chunking.py). Keeping a single authoritative
832
+ # implementation reduces the risk of divergent behavior.