xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
xgen_doc2chunk/chunking/table_chunker.py (new file)

@@ -0,0 +1,832 @@

```python
# chunking_helper/table_chunker.py
"""
Table Chunker - Core table chunking logic

Main Features:
- Split large HTML tables to fit chunk_size
- Split large Markdown tables to fit chunk_size
- Preserve and restore table structure (headers)
- rowspan/colspan aware splitting for HTML
- rowspan adjustment
- NO OVERLAP for table chunks (intentional to prevent data duplication)
"""
import logging
import re
from typing import Dict, List, Optional

from xgen_doc2chunk.chunking.constants import (
    ParsedTable, TableRow, ParsedMarkdownTable,
    TABLE_WRAPPER_OVERHEAD, CHUNK_INDEX_OVERHEAD,
    MARKDOWN_TABLE_SEPARATOR_PATTERN
)
from xgen_doc2chunk.chunking.table_parser import (
    parse_html_table, extract_cell_spans_with_positions, has_complex_spans
)

logger = logging.getLogger("document-processor")
```
```python
def calculate_available_space(
    chunk_size: int,
    header_size: int,
    chunk_index: int = 0,
    total_chunks: int = 1
) -> int:
    """
    Calculate available space for data rows in a chunk.

    Args:
        chunk_size: Total chunk size
        header_size: Header size
        chunk_index: Current chunk index (0-based)
        total_chunks: Expected total number of chunks

    Returns:
        Number of characters available for data rows
    """
    # Fixed overhead
    overhead = TABLE_WRAPPER_OVERHEAD

    # Chunk index metadata overhead (only when total chunks > 1)
    if total_chunks > 1:
        overhead += CHUNK_INDEX_OVERHEAD

    # Header overhead (include header even for non-first chunks)
    overhead += header_size

    available = chunk_size - overhead

    return max(available, 100)  # Guarantee at least 100 characters
```
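A worked example of the arithmetic, using placeholder values for the two overhead constants (their real values are defined in `xgen_doc2chunk/chunking/constants.py`):

```python
# Placeholder figures: TABLE_WRAPPER_OVERHEAD and CHUNK_INDEX_OVERHEAD are
# defined in xgen_doc2chunk/chunking/constants.py; 25 is assumed here (the
# module's own comments suggest the chunk-index marker is ~25 chars).
TABLE_WRAPPER_OVERHEAD, CHUNK_INDEX_OVERHEAD = 25, 25

chunk_size, header_size, total_chunks = 1000, 120, 3
overhead = TABLE_WRAPPER_OVERHEAD + CHUNK_INDEX_OVERHEAD + header_size  # 170
print(max(chunk_size - overhead, 100))  # 830 characters left for data rows
```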
```python
def adjust_rowspan_in_chunk(rows_html: List[str], total_rows_in_chunk: int) -> List[str]:
    """
    Readjust rowspan values for rows in a chunk.

    Adjusts rowspan values to match the number of rows included in the chunk
    so that the table renders correctly.

    Args:
        rows_html: List of HTML row strings included in the chunk
        total_rows_in_chunk: Total number of rows in the chunk

    Returns:
        List of HTML row strings with adjusted rowspan values
    """
    if not rows_html:
        return rows_html

    adjusted_rows = []

    for row_idx, row_html in enumerate(rows_html):
        remaining_rows = total_rows_in_chunk - row_idx

        def adjust_cell_rowspan(match):
            """Callback function to adjust cell rowspan"""
            tag = match.group(1)  # td or th
            attrs = match.group(2)
            content = match.group(3)

            # Extract current rowspan
            rowspan_match = re.search(r'rowspan=["\']?(\d+)["\']?', attrs, re.IGNORECASE)
            if rowspan_match:
                original_rowspan = int(rowspan_match.group(1))

                # Adjust if greater than remaining rows
                adjusted_rowspan = min(original_rowspan, remaining_rows)

                if adjusted_rowspan <= 1:
                    # Remove attribute if rowspan=1
                    new_attrs = re.sub(r'\s*rowspan=["\']?\d+["\']?', '', attrs, flags=re.IGNORECASE)
                else:
                    # Adjust rowspan value
                    new_attrs = re.sub(
                        r'rowspan=["\']?\d+["\']?',
                        f"rowspan='{adjusted_rowspan}'",
                        attrs,
                        flags=re.IGNORECASE
                    )

                return f'<{tag}{new_attrs}>{content}</{tag}>'

            return match.group(0)

        # Cell pattern: <td ...>...</td> or <th ...>...</th>
        cell_pattern = r'<(td|th)([^>]*)>(.*?)</\1>'
        adjusted_row = re.sub(cell_pattern, adjust_cell_rowspan, row_html, flags=re.DOTALL | re.IGNORECASE)

        adjusted_rows.append(adjusted_row)

    return adjusted_rows
```
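For example, when a cell spanning three rows lands in a chunk that only received two of them, the span is clamped so the fragment still renders (a minimal sketch; the import path follows the package layout listed above):

```python
from xgen_doc2chunk.chunking.table_chunker import adjust_rowspan_in_chunk

rows = [
    "<tr><td rowspan='3'>Group A</td><td>row 1</td></tr>",
    "<tr><td>row 2</td></tr>",
]
adjusted = adjust_rowspan_in_chunk(rows, total_rows_in_chunk=2)
print(adjusted[0])  # <tr><td rowspan='2'>Group A</td><td>row 1</td></tr>
print(adjusted[1])  # <tr><td>row 2</td></tr>  (no rowspan, unchanged)
```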
```python
def build_table_chunk(
    header_html: str,
    data_rows: List[TableRow],
    chunk_index: int = 0,
    total_chunks: int = 1,
    context_prefix: str = ""
) -> str:
    """
    Build a complete table HTML for a chunk.

    Automatically adjusts rowspan if it exceeds the chunk boundary.

    Args:
        header_html: HTML of header rows
        data_rows: Data rows
        chunk_index: Current chunk index (0-based)
        total_chunks: Total number of chunks
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        Complete table HTML
    """
    parts = []

    # Context info (metadata, sheet info, etc.) - included in all chunks
    if context_prefix:
        parts.append(context_prefix)

    # Chunk index metadata (only when more than 1 chunk)
    if total_chunks > 1:
        parts.append(f"[Table Chunk {chunk_index + 1}/{total_chunks}]")

    # Table start
    parts.append("<table border='1'>")

    # Header (if exists)
    if header_html:
        parts.append(header_html)

    # Extract HTML for data rows
    rows_html = [row.html for row in data_rows]

    # Adjust rowspan
    adjusted_rows = adjust_rowspan_in_chunk(rows_html, len(data_rows))

    # Add adjusted rows
    for row_html in adjusted_rows:
        parts.append(row_html)

    # Table end
    parts.append("</table>")

    return "\n".join(parts)
```
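The assembled output looks like the following (a sketch; it assumes `TableRow` is constructible from the `html`/`char_length` fields the code above reads):

```python
from xgen_doc2chunk.chunking.constants import TableRow
from xgen_doc2chunk.chunking.table_chunker import build_table_chunk

row = TableRow(html="<tr><td>1</td><td>Seoul</td></tr>", char_length=33)
print(build_table_chunk("<tr><th>id</th><th>city</th></tr>", [row],
                        chunk_index=0, total_chunks=2,
                        context_prefix="[Sheet: Sales]"))
# [Sheet: Sales]
# [Table Chunk 1/2]
# <table border='1'>
# <tr><th>id</th><th>city</th></tr>
# <tr><td>1</td><td>Seoul</td></tr>
# </table>
```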
```python
def update_chunk_metadata(chunks: List[str], total_chunks: int) -> List[str]:
    """
    Update chunk metadata (total chunk count).
    """
    updated_chunks = []

    for idx, chunk in enumerate(chunks):
        # Existing metadata pattern
        old_pattern = r'\[Table Chunk \d+/\d+\]'
        new_metadata = f"[Table Chunk {idx + 1}/{total_chunks}]"

        if re.search(old_pattern, chunk):
            updated_chunk = re.sub(old_pattern, new_metadata, chunk)
        else:
            # Add metadata if not present
            updated_chunk = f"{new_metadata}\n{chunk}"

        updated_chunks.append(updated_chunk)

    return updated_chunks


def split_table_into_chunks(
    parsed_table: ParsedTable,
    chunk_size: int,
    chunk_overlap: int = 0,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a parsed table to fit chunk_size.
    Each chunk has a complete table structure (including headers).

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Row splitting rules:
    - Minimum 1 row per chunk (rows are NEVER split)
    - Chunks can expand up to 1.5x of chunk_size to include more rows
    - Only exceeds chunk_size when necessary to maintain row integrity

    Args:
        parsed_table: Parsed table information
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of split table HTML chunks
    """
    data_rows = parsed_table.data_rows
    header_html = parsed_table.header_html
    header_size = parsed_table.header_size

    # Calculate context size
    context_size = len(context_prefix) + 2 if context_prefix else 0  # Including newline

    if not data_rows:
        # Return original if no data rows
        return [parsed_table.original_html]

    # Calculate estimated chunk count (approximate)
    total_data_size = sum(row.char_length for row in data_rows)
    available_per_chunk = calculate_available_space(chunk_size, header_size + context_size, 0, 1)
    estimated_chunks = max(1, (total_data_size + available_per_chunk - 1) // available_per_chunk)

    # Recalculate with actual chunk count
    available_per_chunk = calculate_available_space(chunk_size, header_size + context_size, 0, estimated_chunks)

    # Maximum allowed chunk size (1.5x of chunk_size)
    max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - CHUNK_INDEX_OVERHEAD

    chunks: List[str] = []
    current_rows: List[TableRow] = []
    current_size = 0
    # Table chunking does not apply overlap (prevent data duplication)

    for row_idx, row in enumerate(data_rows):
        row_size = row.char_length + 1  # Including newline

        # Check if adding this row exceeds available space
        if current_rows and (current_size + row_size > available_per_chunk):
            # Check if we can still fit within 1.5x limit
            if current_size + row_size <= max_chunk_data_size:
                # Still within 1.5x limit - add row to current chunk
                current_rows.append(row)
                current_size += row_size
            else:
                # Exceeds 1.5x limit - flush current chunk and start new one
                chunk_html = build_table_chunk(
                    header_html,
                    current_rows,
                    chunk_index=len(chunks),
                    total_chunks=estimated_chunks,
                    context_prefix=context_prefix
                )
                chunks.append(chunk_html)

                # Start new chunk with this row (minimum 1 row guaranteed)
                current_rows = [row]
                current_size = row_size
        else:
            # Row fits - add to current chunk
            current_rows.append(row)
            current_size += row_size

    # Process last chunk
    if current_rows:
        chunk_html = build_table_chunk(
            header_html,
            current_rows,
            chunk_index=len(chunks),
            total_chunks=max(len(chunks) + 1, estimated_chunks),
            context_prefix=context_prefix
        )
        chunks.append(chunk_html)

    # Update metadata with actual total chunk count
    if len(chunks) != estimated_chunks and len(chunks) > 1:
        chunks = update_chunk_metadata(chunks, len(chunks))

    logger.info(f"Table split into {len(chunks)} chunks (original: {len(parsed_table.original_html)} chars)")

    return chunks
```
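The soft/hard limit interplay in the loop above, with made-up sizes (`CHUNK_INDEX_OVERHEAD` again assumed to be 25, matching the "~25 chars" noted later in this module):

```python
chunk_size, header_size, context_size = 1000, 60, 0
max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - 25  # 1415
current_size, row_size, available_per_chunk = 780, 150, 800

# 930 > 800: the soft limit is exceeded...
print(current_size + row_size > available_per_chunk)    # True
# ...but 930 <= 1415, so the row stays in the current chunk
# instead of forcing an early split.
print(current_size + row_size <= max_chunk_data_size)   # True
```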
```python
def split_table_preserving_rowspan(
    parsed_table: ParsedTable,
    chunk_size: int,
    chunk_overlap: int,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a table considering rowspan.

    Rows connected by rowspan are kept together as semantic blocks.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Algorithm:
    1. Track active rowspan for each row (by column position, considering colspan)
    2. If all rowspans from previous row end and new rowspan starts, create new block
    3. Combine blocks to fit chunk_size

    Args:
        parsed_table: Parsed table
        chunk_size: Chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.)

    Returns:
        List of split table chunks
    """
    data_rows = parsed_table.data_rows
    header_html = parsed_table.header_html
    header_size = parsed_table.header_size

    # Calculate context size
    context_size = len(context_prefix) + 2 if context_prefix else 0

    if not data_rows:
        if context_prefix:
            return [f"{context_prefix}\n{parsed_table.original_html}"]
        return [parsed_table.original_html]

    # === Identify rowspan blocks ===
    # Block = group of consecutive rows connected by rowspan
    active_rowspans: Dict[int, int] = {}  # column_position -> remaining_rows (including current row)
    row_block_ids: List[int] = []  # Block ID for each row
    current_block_id = -1

    for row_idx, row in enumerate(data_rows):
        # 1. Decrease remaining rowspan from previous row (except first row)
        if row_idx > 0:
            finished_cols = []
            for col in list(active_rowspans.keys()):
                active_rowspans[col] -= 1
                if active_rowspans[col] <= 0:
                    finished_cols.append(col)
            for col in finished_cols:
                del active_rowspans[col]

        # State after decrease (before adding new spans)
        had_active_before_new = len(active_rowspans) > 0

        # 2. Add new rowspans starting from current row
        new_spans = extract_cell_spans_with_positions(row.html)
        for col, span in new_spans.items():
            # Update if larger than existing rowspan (longer span takes priority)
            if col not in active_rowspans or span > active_rowspans[col]:
                active_rowspans[col] = span

        has_active_now = len(active_rowspans) > 0
        has_new_span = len(new_spans) > 0

        # Block determination logic:
        # - No active rowspan -> independent block
        # - No active after previous row processing but new span starts -> new block
        # - Otherwise maintain existing block
        if not has_active_now:
            # No rowspan - independent row
            current_block_id += 1
            row_block_ids.append(current_block_id)
        elif not had_active_before_new and has_new_span:
            # All previous rowspans ended and new rowspan starts - new block
            current_block_id += 1
            row_block_ids.append(current_block_id)
        else:
            # Maintain existing block
            row_block_ids.append(current_block_id)

    # Group rows by block
    block_groups: Dict[int, List[int]] = {}
    for row_idx, block_id in enumerate(row_block_ids):
        if block_id not in block_groups:
            block_groups[block_id] = []
        block_groups[block_id].append(row_idx)

    # Create row_groups in sorted block order
    row_groups: List[List[int]] = [
        block_groups[block_id]
        for block_id in sorted(block_groups.keys())
    ]

    # === Combine groups into chunks ===
    chunks: List[str] = []
    current_rows: List[TableRow] = []
    current_size = 0

    available_space = calculate_available_space(chunk_size, header_size + context_size, 0, 1)
    # Maximum allowed chunk size (1.5x of chunk_size)
    max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - CHUNK_INDEX_OVERHEAD

    for group in row_groups:
        group_rows = [data_rows[idx] for idx in group]
        group_size = sum(row.char_length + 1 for row in group_rows)

        if current_rows and current_size + group_size > available_space:
            # Check if we can still fit within 1.5x limit
            if current_size + group_size <= max_chunk_data_size:
                # Still within 1.5x limit - add group to current chunk
                current_rows.extend(group_rows)
                current_size += group_size
            else:
                # Exceeds 1.5x limit - flush current chunk and start new one
                chunks.append(build_table_chunk(
                    header_html, current_rows, len(chunks), len(chunks) + 2,
                    context_prefix=context_prefix
                ))
                current_rows = group_rows[:]
                current_size = group_size
        else:
            current_rows.extend(group_rows)
            current_size += group_size

    # Last chunk
    if current_rows:
        chunks.append(build_table_chunk(
            header_html, current_rows, len(chunks), len(chunks) + 1,
            context_prefix=context_prefix
        ))

    # Update chunk count
    if len(chunks) > 1:
        chunks = update_chunk_metadata(chunks, len(chunks))

    return chunks
```
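The block-identification pass (steps 1-2) is easiest to see with the span extraction stubbed out. A self-contained sketch on hypothetical data, where each entry maps column position to rowspan for one row:

```python
rows_spans = [{0: 3}, {}, {}, {0: 2}, {}]  # two rowspan groups: rows 0-2 and rows 3-4

active = {}                      # column position -> remaining rows
block_ids, current = [], -1
for i, new_spans in enumerate(rows_spans):
    if i > 0:                    # expire spans carried over from earlier rows
        for col in list(active):
            active[col] -= 1
            if active[col] <= 0:
                del active[col]
    had_active = bool(active)
    for col, span in new_spans.items():
        if span > active.get(col, 0):
            active[col] = span
    if not active or (not had_active and new_spans):
        current += 1             # independent row, or a fresh block begins
    block_ids.append(current)

print(block_ids)  # [0, 0, 0, 1, 1] -- rows sharing a span share a block
```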
```python
def chunk_large_table(
    table_html: str,
    chunk_size: int,
    chunk_overlap: int,
    context_prefix: str = ""
) -> List[str]:
    """
    Split large HTML table to fit chunk_size.
    Restores table structure (headers) in each chunk.

    Also handles complex tables with rowspan.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Args:
        table_html: HTML table string
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of split table HTML chunks
    """
    # Parse table
    parsed = parse_html_table(table_html)

    if not parsed:
        logger.warning("Failed to parse table, returning original")
        if context_prefix:
            return [f"{context_prefix}\n{table_html}"]
        return [table_html]

    # No need to split if table fits in chunk_size
    if len(table_html) + len(context_prefix) <= chunk_size:
        if context_prefix:
            return [f"{context_prefix}\n{table_html}"]
        return [table_html]

    # No need to split if no data rows
    if not parsed.data_rows:
        if context_prefix:
            return [f"{context_prefix}\n{table_html}"]
        return [table_html]

    # Check for complex spans (rowspan)
    if has_complex_spans(table_html):
        logger.info("Complex table with rowspan detected, using span-aware splitting")
        return split_table_preserving_rowspan(parsed, chunk_size, chunk_overlap, context_prefix)

    # Standard table splitting
    chunks = split_table_into_chunks(parsed, chunk_size, chunk_overlap, context_prefix)

    return chunks
```
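Typical entry-point usage (a sketch; `chunk_overlap` is accepted but ignored, per the note above):

```python
from xgen_doc2chunk.chunking.table_chunker import chunk_large_table

table_html = (
    "<table><tr><th>id</th><th>value</th></tr>"
    + "".join(f"<tr><td>{i}</td><td>value {i}</td></tr>" for i in range(200))
    + "</table>"
)
chunks = chunk_large_table(table_html, chunk_size=1000, chunk_overlap=0,
                           context_prefix="[Doc: report.xlsx / Sheet1]")
print(len(chunks))  # several chunks, each a self-contained <table>
# Each chunk repeats the header row and carries the context prefix plus a
# "[Table Chunk i/N]" marker, so chunks stay searchable on their own.
```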
```python
# ============================================================================
# Markdown Table Chunking Functions
# ============================================================================

def parse_markdown_table(table_text: str) -> Optional[ParsedMarkdownTable]:
    """
    Parse a Markdown table and extract structural information.

    A Markdown table has:
    - Header row: | col1 | col2 | col3 |
    - Separator row: |---|---|---| or |:---:|:---|---:|
    - Data rows: | data1 | data2 | data3 |

    Args:
        table_text: Markdown table text

    Returns:
        ParsedMarkdownTable object or None if parsing fails
    """
    try:
        # Split into lines and filter empty lines
        lines = [line.strip() for line in table_text.strip().split('\n') if line.strip()]

        if len(lines) < 2:
            logger.debug("Not enough lines for a valid Markdown table")
            return None

        # Find header and separator rows
        header_row = None
        separator_row = None
        separator_idx = -1

        for idx, line in enumerate(lines):
            # Check if this line is a separator (contains only |, -, :, and spaces)
            if re.match(MARKDOWN_TABLE_SEPARATOR_PATTERN, line):
                separator_row = line
                separator_idx = idx
                # Header is the line before separator
                if idx > 0:
                    header_row = lines[idx - 1]
                break

        if not separator_row or not header_row:
            # Try simpler detection: first row is header, second row is separator
            if len(lines) >= 2 and lines[0].startswith('|') and '---' in lines[1]:
                header_row = lines[0]
                separator_row = lines[1]
                separator_idx = 1
            else:
                logger.debug("Could not identify header/separator in Markdown table")
                return None

        # Count columns from separator
        total_cols = separator_row.count('|') - 1  # -1 because |---|---| has n+1 pipes for n columns

        # Data rows are all rows after separator
        data_rows = lines[separator_idx + 1:]

        # Construct header text (header + separator) for restoration in each chunk
        header_text = f"{header_row}\n{separator_row}"
        header_size = len(header_text) + 1  # +1 for newline

        return ParsedMarkdownTable(
            header_row=header_row,
            separator_row=separator_row,
            data_rows=data_rows,
            total_cols=total_cols,
            original_text=table_text,
            header_text=header_text,
            header_size=header_size
        )

    except Exception as e:
        logger.warning(f"Failed to parse Markdown table: {e}")
        return None
```
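A minimal sketch of the parser on a small table (import path assumed from the package layout):

```python
from xgen_doc2chunk.chunking.table_chunker import parse_markdown_table

parsed = parse_markdown_table(
    "| id | name |\n"
    "|----|------|\n"
    "| 1  | foo  |\n"
    "| 2  | bar  |"
)
print(parsed.total_cols)   # 2
print(parsed.data_rows)    # ['| 1  | foo  |', '| 2  | bar  |']
print(parsed.header_text)  # '| id | name |\n|----|------|'
```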
```python
def build_markdown_table_chunk(
    header_text: str,
    data_rows: List[str],
    chunk_index: int = 0,
    total_chunks: int = 1,
    context_prefix: str = ""
) -> str:
    """
    Build a complete Markdown table chunk with header restored.

    Args:
        header_text: Header row + separator row
        data_rows: List of data row strings
        chunk_index: Current chunk index (0-based)
        total_chunks: Total number of chunks
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        Complete Markdown table chunk
    """
    parts = []

    # Add context prefix if provided
    if context_prefix:
        parts.append(context_prefix)

    # Add chunk index metadata (only if more than 1 chunk)
    if total_chunks > 1:
        parts.append(f"[Table Chunk {chunk_index + 1}/{total_chunks}]")

    # Add header (header row + separator row)
    parts.append(header_text)

    # Add data rows
    for row in data_rows:
        parts.append(row)

    return "\n".join(parts)


def update_markdown_chunk_metadata(chunks: List[str], total_chunks: int) -> List[str]:
    """
    Update chunk metadata (total chunk count) in Markdown table chunks.

    Args:
        chunks: List of chunks
        total_chunks: Actual total number of chunks

    Returns:
        Updated chunks with correct metadata
    """
    updated_chunks = []

    for idx, chunk in enumerate(chunks):
        # Pattern for existing metadata
        old_pattern = r'\[Table Chunk \d+/\d+\]'
        new_metadata = f"[Table Chunk {idx + 1}/{total_chunks}]"

        if re.search(old_pattern, chunk):
            updated_chunk = re.sub(old_pattern, new_metadata, chunk)
        else:
            # No metadata found - add it
            updated_chunk = f"{new_metadata}\n{chunk}"

        updated_chunks.append(updated_chunk)

    return updated_chunks
```
```python
def split_markdown_table_into_chunks(
    parsed_table: ParsedMarkdownTable,
    chunk_size: int,
    chunk_overlap: int = 0,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a parsed Markdown table into chunks that fit chunk_size.
    Each chunk is a complete Markdown table with headers restored.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Args:
        parsed_table: Parsed Markdown table information
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of Markdown table chunk strings
    """
    data_rows = parsed_table.data_rows
    header_text = parsed_table.header_text
    header_size = parsed_table.header_size

    # Calculate context size
    context_size = len(context_prefix) + 2 if context_prefix else 0  # +2 for newline

    if not data_rows:
        # No data rows - return original
        if context_prefix:
            return [f"{context_prefix}\n{parsed_table.original_text}"]
        return [parsed_table.original_text]

    # Calculate available space per chunk
    # Overhead: chunk index metadata (~25 chars) + header + context
    estimated_chunks = 1
    total_data_size = sum(len(row) + 1 for row in data_rows)  # +1 for newline
    available_per_chunk = chunk_size - header_size - context_size - CHUNK_INDEX_OVERHEAD

    if available_per_chunk > 0:
        estimated_chunks = max(1, (total_data_size + available_per_chunk - 1) // available_per_chunk)

    # Recalculate with estimated chunks
    if estimated_chunks > 1:
        available_per_chunk = chunk_size - header_size - context_size - CHUNK_INDEX_OVERHEAD
    else:
        available_per_chunk = chunk_size - header_size - context_size

    # Maximum allowed chunk size (1.5x of chunk_size)
    max_chunk_data_size = int(chunk_size * 1.5) - header_size - context_size - CHUNK_INDEX_OVERHEAD

    chunks: List[str] = []
    current_rows: List[str] = []
    current_size = 0

    for row in data_rows:
        row_size = len(row) + 1  # +1 for newline

        # Check if adding this row exceeds available space
        if current_rows and (current_size + row_size > available_per_chunk):
            # Check if we can still fit within 1.5x limit
            if current_size + row_size <= max_chunk_data_size:
                # Still within 1.5x limit - add row to current chunk
                current_rows.append(row)
                current_size += row_size
            else:
                # Exceeds 1.5x limit - flush current chunk and start new one
                chunk_text = build_markdown_table_chunk(
                    header_text,
                    current_rows,
                    chunk_index=len(chunks),
                    total_chunks=estimated_chunks,
                    context_prefix=context_prefix
                )
                chunks.append(chunk_text)

                # Start new chunk with this row (minimum 1 row guaranteed)
                current_rows = [row]
                current_size = row_size
        else:
            # Row fits - add to current chunk
            current_rows.append(row)
            current_size += row_size

    # Handle last chunk
    if current_rows:
        chunk_text = build_markdown_table_chunk(
            header_text,
            current_rows,
            chunk_index=len(chunks),
            total_chunks=max(len(chunks) + 1, estimated_chunks),
            context_prefix=context_prefix
        )
        chunks.append(chunk_text)

    # Update total chunk count in metadata if different from estimate
    if len(chunks) != estimated_chunks and len(chunks) > 1:
        chunks = update_markdown_chunk_metadata(chunks, len(chunks))

    logger.info(f"Markdown table split into {len(chunks)} chunks (original: {len(parsed_table.original_text)} chars)")

    return chunks
```
```python
def chunk_large_markdown_table(
    table_text: str,
    chunk_size: int,
    chunk_overlap: int,
    context_prefix: str = ""
) -> List[str]:
    """
    Split a large Markdown table to fit chunk_size.
    Restores table structure (header + separator) in each chunk.

    NOTE: Table chunking does NOT apply overlap.
    Data duplication degrades search quality, so overlap is intentionally excluded.

    Args:
        table_text: Markdown table text
        chunk_size: Maximum chunk size
        chunk_overlap: Not used (kept for compatibility)
        context_prefix: Context info (metadata, sheet info, etc.) - included in all chunks

    Returns:
        List of split Markdown table chunks
    """
    # Parse table
    parsed = parse_markdown_table(table_text)

    if not parsed:
        logger.warning("Failed to parse Markdown table, returning original")
        if context_prefix:
            return [f"{context_prefix}\n{table_text}"]
        return [table_text]

    # No need to split if table fits in chunk_size
    if len(table_text) + len(context_prefix) <= chunk_size:
        if context_prefix:
            return [f"{context_prefix}\n{table_text}"]
        return [table_text]

    # No need to split if no data rows
    if not parsed.data_rows:
        if context_prefix:
            return [f"{context_prefix}\n{table_text}"]
        return [table_text]

    # Split table into chunks
    chunks = split_markdown_table_into_chunks(parsed, chunk_size, chunk_overlap, context_prefix)

    return chunks
```
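And the Markdown counterpart (same sketch conventions as the HTML entry point above):

```python
from xgen_doc2chunk.chunking.table_chunker import chunk_large_markdown_table

md = "| id | value |\n|----|-------|\n" + "\n".join(
    f"| {i} | value {i} |" for i in range(200)
)
chunks = chunk_large_markdown_table(md, chunk_size=800, chunk_overlap=0)
print(len(chunks))
# Each chunk re-opens with the "| id | value |" header and its separator
# row, so every piece renders as a valid Markdown table on its own.
```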
```python
def is_markdown_table(text: str) -> bool:
    """
    Check if text is a Markdown table.

    A Markdown table has:
    - Lines starting with |
    - A separator line with |---|

    Args:
        text: Text to check

    Returns:
        True if text is a Markdown table
    """
    lines = text.strip().split('\n')
    if len(lines) < 2:
        return False

    # Check for | at start of lines and separator pattern
    has_pipe_rows = any(line.strip().startswith('|') for line in lines)
    has_separator = any('---' in line and '|' in line for line in lines)

    return has_pipe_rows and has_separator
```
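A quick check of the detector (a minimal sketch):

```python
from xgen_doc2chunk.chunking.table_chunker import is_markdown_table

print(is_markdown_table("| a | b |\n|---|---|\n| 1 | 2 |"))  # True
print(is_markdown_table("just a | stray pipe\nand text"))    # False
```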
```python
# Note: detect_table_type and chunk_large_table_unified were removed because they
# were not referenced anywhere in the codebase and duplicated logic handled elsewhere
# (e.g., via _chunk_table_unified in chunking.py). Keeping a single authoritative
# implementation reduces the risk of divergent behavior.
```