xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
# xgen_doc2chunk/chunking/protected_regions.py
|
|
2
|
+
"""
|
|
3
|
+
Protected Regions - Protected region detection and processing
|
|
4
|
+
|
|
5
|
+
Main Features:
|
|
6
|
+
- Detect protected regions that should not be split during chunking
|
|
7
|
+
- Split text while protecting protected regions
|
|
8
|
+
- Efficient handling of large tables (HTML and Markdown)
|
|
9
|
+
- Row-level chunking for tables with NO overlap
|
|
10
|
+
- Support for dynamic tag patterns from processors (Image, Chart, Page, Slide, Metadata)
|
|
11
|
+
- Protected regions NEVER overlap when splitting chunks
|
|
12
|
+
"""
|
|
13
|
+
import logging
|
|
14
|
+
import re
|
|
15
|
+
from typing import Any, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.chunking.constants import (
|
|
18
|
+
HTML_TABLE_PATTERN, CHART_BLOCK_PATTERN, TEXTBOX_BLOCK_PATTERN,
|
|
19
|
+
IMAGE_TAG_PATTERN, MARKDOWN_TABLE_PATTERN,
|
|
20
|
+
PAGE_TAG_PATTERN, SLIDE_TAG_PATTERN, SHEET_TAG_PATTERN,
|
|
21
|
+
PAGE_TAG_OCR_PATTERN, SLIDE_TAG_OCR_PATTERN,
|
|
22
|
+
METADATA_BLOCK_PATTERN, DATA_ANALYSIS_PATTERN
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger("document-processor")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def find_protected_regions(
    text: str,
    is_table_based: bool = False,
    force_chunking: bool = False,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    page_tag_processor: Optional[Any] = None,
    metadata_pattern: Optional[str] = None
) -> List[Tuple[int, int, str]]:
    """
    Find protected regions that should not be split during chunking.

    Protected Regions (NEVER split or overlap):
    1. HTML tables: <table>...</table> (row-level only when force_chunking/table-based)
    2. Chart blocks: [chart]...[/chart] or custom - always protected (never split)
    3. Textbox blocks: [textbox]...[/textbox] - always protected (never split)
    4. Image tags: [image:...] or custom - always protected (never split, no overlap)
    5. Markdown tables: |...|\\n|---|...| (row-level only when force_chunking/table-based)
    6. Page/Slide/Sheet tags: [Page Number: n], [Slide Number: n], [Sheet: name] - always protected (no overlap)
    7. Metadata blocks: <Document-Metadata>...</Document-Metadata> or custom - always protected (no overlap)
    8. Data analysis blocks: [Data Analysis]...[/Data Analysis] - always protected

    Args:
        text: Text to search
        is_table_based: Whether file is table-based (if True, tables are not registered
            as protected regions so they can be chunked row by row elsewhere)
        force_chunking: Force chunking mode (if True, same effect as is_table_based)
        image_pattern: Image tag pattern (if None, uses default IMAGE_TAG_PATTERN)
        chart_pattern: Chart block pattern (if None, uses default CHART_BLOCK_PATTERN)
        page_tag_processor: PageTagProcessor instance for custom page/slide/sheet patterns
        metadata_pattern: Metadata block pattern (if None, uses default METADATA_BLOCK_PATTERN)

    Returns:
        [(start, end, type), ...] - List of protected regions sorted by start position.
        Overlapping regions are merged into one entry whose type is the
        "+"-joined types of the merged regions (e.g. "html_table+image_tag").
    """
    regions: List[Tuple[int, int, str]] = []

    # Tables are left unprotected when is_table_based or force_chunking is True,
    # so that they can be split at row level by the table chunkers.
    disable_table_protection = is_table_based or force_chunking

    # 1. HTML tables (only protected as a whole when table protection is enabled)
    if not disable_table_protection:
        for match in re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE):
            regions.append((match.start(), match.end(), 'html_table'))

    # 2. Chart blocks - always protected (never split under any condition)
    chart_pat = chart_pattern if chart_pattern is not None else CHART_BLOCK_PATTERN
    for match in re.finditer(chart_pat, text, re.DOTALL):
        regions.append((match.start(), match.end(), 'chart'))

    # 3. Textbox blocks - always protected (never split under any condition)
    for match in re.finditer(TEXTBOX_BLOCK_PATTERN, text, re.DOTALL):
        regions.append((match.start(), match.end(), 'textbox'))

    # 4. Image tags - always protected (never split under any condition, no overlap)
    img_pattern = image_pattern if image_pattern is not None else IMAGE_TAG_PATTERN
    for match in re.finditer(img_pattern, text):
        regions.append((match.start(), match.end(), 'image_tag'))

    # 5. Markdown tables (only protected as a whole when table protection is enabled)
    if not disable_table_protection:
        for match in re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE):
            table_start = match.start()
            # The match may begin with the newline preceding the table; that
            # newline is not part of the table itself.
            if match.group(0).startswith('\n'):
                table_start += 1
            regions.append((table_start, match.end(), 'markdown_table'))

    # 6. Page/Slide/Sheet tags - always protected (no overlap).
    # Tag regions are gathered in a scratch list first: if the processor fails
    # part-way through, the partial custom results are discarded before the
    # defaults are added, so the same tag is never registered twice.
    tag_regions: List[Tuple[int, int, str]] = []
    if page_tag_processor is not None:
        try:
            from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType

            # Page tags (custom pattern), then OCR variants (stable default pattern)
            page_pattern = page_tag_processor.get_pattern_string(PageTagType.PAGE)
            for match in re.finditer(page_pattern, text, re.IGNORECASE):
                tag_regions.append((match.start(), match.end(), 'page_tag'))
            for match in re.finditer(PAGE_TAG_OCR_PATTERN, text, re.IGNORECASE):
                tag_regions.append((match.start(), match.end(), 'page_tag'))

            # Slide tags (custom pattern), then OCR variants (stable default pattern)
            slide_pattern = page_tag_processor.get_pattern_string(PageTagType.SLIDE)
            for match in re.finditer(slide_pattern, text, re.IGNORECASE):
                tag_regions.append((match.start(), match.end(), 'slide_tag'))
            for match in re.finditer(SLIDE_TAG_OCR_PATTERN, text, re.IGNORECASE):
                tag_regions.append((match.start(), match.end(), 'slide_tag'))

            # Sheet tags (custom pattern)
            sheet_pattern = page_tag_processor.get_pattern_string(PageTagType.SHEET)
            for match in re.finditer(sheet_pattern, text, re.IGNORECASE):
                tag_regions.append((match.start(), match.end(), 'sheet_tag'))
        except Exception as e:
            # Best-effort: any failure in the custom path falls back to defaults.
            logger.warning(
                "Error getting patterns from page_tag_processor: %s, using defaults", e
            )
            tag_regions = []
            _add_default_page_tag_regions(text, tag_regions)
    else:
        _add_default_page_tag_regions(text, tag_regions)
    regions.extend(tag_regions)

    # 7. Metadata blocks - always protected (no overlap)
    meta_pattern = metadata_pattern if metadata_pattern is not None else METADATA_BLOCK_PATTERN
    for match in re.finditer(meta_pattern, text, re.DOTALL):
        regions.append((match.start(), match.end(), 'metadata'))

    # 8. Data analysis blocks - always protected
    for match in re.finditer(DATA_ANALYSIS_PATTERN, text, re.DOTALL):
        regions.append((match.start(), match.end(), 'data_analysis'))

    # Sort by start position (stable, so detection order breaks ties)
    regions.sort(key=lambda x: x[0])

    # Merge overlapping regions into a single span with a combined type label
    merged_regions: List[Tuple[int, int, str]] = []
    for start, end, region_type in regions:
        if merged_regions and start < merged_regions[-1][1]:
            prev_start, prev_end, prev_type = merged_regions[-1]
            merged_regions[-1] = (prev_start, max(prev_end, end), f"{prev_type}+{region_type}")
        else:
            merged_regions.append((start, end, region_type))

    return merged_regions
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _add_default_page_tag_regions(text: str, regions: List[Tuple[int, int, str]]) -> None:
    """
    Append page/slide/sheet tag regions found with the module's default patterns.

    The patterns are scanned in a fixed order (page, page OCR variant, slide,
    slide OCR variant, sheet), all case-insensitively, and every match is
    appended to ``regions`` in place as ``(start, end, type)``.

    Args:
        text: Text to search
        regions: List that receives the found regions (mutated in place)
    """
    # (pattern, region type) pairs, in the order they must be scanned.
    default_tag_patterns = (
        (PAGE_TAG_PATTERN, 'page_tag'),
        (PAGE_TAG_OCR_PATTERN, 'page_tag'),
        (SLIDE_TAG_PATTERN, 'slide_tag'),
        (SLIDE_TAG_OCR_PATTERN, 'slide_tag'),
        (SHEET_TAG_PATTERN, 'sheet_tag'),
    )

    for tag_pattern, tag_type in default_tag_patterns:
        regions.extend(
            (found.start(), found.end(), tag_type)
            for found in re.finditer(tag_pattern, text, re.IGNORECASE)
        )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def _add_no_overlap_tag_regions_default(text: str, regions: List[Tuple[int, int, str]]) -> None:
    """
    Add default no-overlap tag regions (page/slide/sheet) using default patterns.

    The default tag scan is exactly the one performed by
    ``_add_default_page_tag_regions`` (page, page OCR, slide, slide OCR, sheet;
    all case-insensitive), so this function delegates to it instead of keeping
    a second copy of the same loops.

    Args:
        text: Text to search
        regions: List to append found regions to (mutated in place)
    """
    _add_default_page_tag_regions(text, regions)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def get_protected_region_positions(regions: List[Tuple[int, int, str]]) -> List[Tuple[int, int]]:
    """
    Drop the type label from each protected region.

    Args:
        regions: Protected regions as (start, end, type) triples

    Returns:
        The (start, end) pair of every region, in the same order
    """
    positions: List[Tuple[int, int]] = []
    for region in regions:
        begin, finish, _label = region
        positions.append((begin, finish))
    return positions
|
|
206
|
+
|
|
207
|
+
def ensure_protected_region_integrity(content: str) -> str:
    """
    Check that HTML tables and default chart blocks in a chunk are balanced.

    Opening and closing tags are counted for HTML tables (case-insensitive)
    and for the default ``[chart]`` / ``[/chart]`` markers (case-sensitive).
    A mismatch is only logged as a warning; the content is never modified.
    Markdown tables and custom chart patterns are not checked here.

    Args:
        content: Chunk text to inspect

    Returns:
        The content, unchanged
    """
    # HTML table integrity check.
    # The opening tag must be exactly "table" followed by ">" or whitespace,
    # so unrelated tags that merely start with "table" (e.g. <tablet>) are not
    # counted; the closing tag may carry whitespace before ">".
    open_tables = len(re.findall(r'<table(?:\s[^>]*)?>', content, re.IGNORECASE))
    close_tables = len(re.findall(r'</table\s*>', content, re.IGNORECASE))
    if open_tables != close_tables:
        logger.warning(
            "Incomplete HTML table detected in chunk: %d open, %d close tags",
            open_tables, close_tables
        )

    # Chart block integrity check (default markers only)
    open_charts = len(re.findall(r'\[chart\]', content))
    close_charts = len(re.findall(r'\[/chart\]', content))
    if open_charts != close_charts:
        logger.warning(
            "Incomplete chart block detected in chunk: %d open, %d close tags",
            open_charts, close_charts
        )

    return content
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _is_markdown_table(text: str) -> bool:
|
|
228
|
+
"""
|
|
229
|
+
Check if text contains a Markdown table pattern.
|
|
230
|
+
"""
|
|
231
|
+
lines = text.strip().split('\n')
|
|
232
|
+
if len(lines) < 2:
|
|
233
|
+
return False
|
|
234
|
+
has_pipe_rows = any(line.strip().startswith('|') for line in lines)
|
|
235
|
+
has_separator = any('---' in line and '|' in line for line in lines)
|
|
236
|
+
return has_pipe_rows and has_separator
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def split_with_protected_regions(
|
|
240
|
+
text: str,
|
|
241
|
+
protected_regions: List[Tuple[int, int]],
|
|
242
|
+
chunk_size: int,
|
|
243
|
+
chunk_overlap: int,
|
|
244
|
+
force_chunking: bool = False,
|
|
245
|
+
image_pattern: Optional[str] = None,
|
|
246
|
+
chart_pattern: Optional[str] = None,
|
|
247
|
+
page_tag_processor: Optional[Any] = None,
|
|
248
|
+
metadata_pattern: Optional[str] = None
|
|
249
|
+
) -> List[str]:
|
|
250
|
+
"""
|
|
251
|
+
Split text into chunks while protecting regions (HTML tables, charts, Markdown tables, tags).
|
|
252
|
+
|
|
253
|
+
Algorithm:
|
|
254
|
+
1. Move forward by chunk_size from current position
|
|
255
|
+
2. If that point is inside a protected region -> cut before region start or include until region end
|
|
256
|
+
3. If protected region is larger than chunk_size:
|
|
257
|
+
- HTML table -> split efficiently with chunk_large_table (row-level, NO overlap)
|
|
258
|
+
- Markdown table -> split efficiently with chunk_large_markdown_table (row-level, NO overlap)
|
|
259
|
+
- Other (charts, metadata, page tags, etc.) -> single chunk for protected region
|
|
260
|
+
4. Apply overlap for next chunk start ONLY for plain text
|
|
261
|
+
- Tables, images, charts, page/slide tags, metadata blocks: NO overlap
|
|
262
|
+
|
|
263
|
+
Protected regions that NEVER overlap:
|
|
264
|
+
- Image tags: [Image:...] or custom pattern
|
|
265
|
+
- Page/Slide/Sheet tags: [Page Number: n], etc.
|
|
266
|
+
- Chart blocks: [chart]...[/chart] or custom
|
|
267
|
+
- Metadata blocks: <Document-Metadata>...</Document-Metadata> or custom
|
|
268
|
+
- Tables: Split by rows, each chunk has NO overlap
|
|
269
|
+
|
|
270
|
+
force_chunking handling:
|
|
271
|
+
- When force_chunking=True, even if tables are not in protected_regions
|
|
272
|
+
- Directly scan for HTML/Markdown tables to avoid cutting in the middle
|
|
273
|
+
- Large tables are split by chunk_large_table/chunk_large_markdown_table with NO overlap
|
|
274
|
+
|
|
275
|
+
Args:
|
|
276
|
+
text: Text to split
|
|
277
|
+
protected_regions: List of (start, end) tuples for protected regions
|
|
278
|
+
chunk_size: Maximum chunk size
|
|
279
|
+
chunk_overlap: Overlap size (NOT applied to protected regions)
|
|
280
|
+
force_chunking: Force chunking mode
|
|
281
|
+
image_pattern: Custom image tag pattern
|
|
282
|
+
chart_pattern: Custom chart block pattern
|
|
283
|
+
page_tag_processor: PageTagProcessor instance for custom patterns
|
|
284
|
+
metadata_pattern: Custom metadata block pattern
|
|
285
|
+
|
|
286
|
+
Returns:
|
|
287
|
+
List of chunks
|
|
288
|
+
"""
|
|
289
|
+
# Get image pattern (custom or default)
|
|
290
|
+
img_pattern = image_pattern if image_pattern is not None else IMAGE_TAG_PATTERN
|
|
291
|
+
|
|
292
|
+
# Extract image tag positions separately (to prevent mid-split and no overlap)
|
|
293
|
+
image_regions = []
|
|
294
|
+
for match in re.finditer(img_pattern, text):
|
|
295
|
+
image_regions.append((match.start(), match.end()))
|
|
296
|
+
|
|
297
|
+
# Extract all "no-overlap" tag regions (page, slide, sheet, chart, metadata)
|
|
298
|
+
no_overlap_regions: List[Tuple[int, int, str]] = []
|
|
299
|
+
|
|
300
|
+
# Page/Slide/Sheet tags
|
|
301
|
+
if page_tag_processor is not None:
|
|
302
|
+
try:
|
|
303
|
+
from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
|
|
304
|
+
for match in re.finditer(page_tag_processor.get_pattern_string(PageTagType.PAGE), text, re.IGNORECASE):
|
|
305
|
+
no_overlap_regions.append((match.start(), match.end(), 'page_tag'))
|
|
306
|
+
for match in re.finditer(page_tag_processor.get_pattern_string(PageTagType.SLIDE), text, re.IGNORECASE):
|
|
307
|
+
no_overlap_regions.append((match.start(), match.end(), 'slide_tag'))
|
|
308
|
+
for match in re.finditer(page_tag_processor.get_pattern_string(PageTagType.SHEET), text, re.IGNORECASE):
|
|
309
|
+
no_overlap_regions.append((match.start(), match.end(), 'sheet_tag'))
|
|
310
|
+
except Exception:
|
|
311
|
+
_add_no_overlap_tag_regions_default(text, no_overlap_regions)
|
|
312
|
+
else:
|
|
313
|
+
_add_no_overlap_tag_regions_default(text, no_overlap_regions)
|
|
314
|
+
|
|
315
|
+
# Chart blocks
|
|
316
|
+
chart_pat = chart_pattern if chart_pattern is not None else CHART_BLOCK_PATTERN
|
|
317
|
+
for match in re.finditer(chart_pat, text, re.DOTALL):
|
|
318
|
+
no_overlap_regions.append((match.start(), match.end(), 'chart'))
|
|
319
|
+
|
|
320
|
+
# Metadata blocks
|
|
321
|
+
meta_pat = metadata_pattern if metadata_pattern is not None else METADATA_BLOCK_PATTERN
|
|
322
|
+
for match in re.finditer(meta_pat, text, re.DOTALL):
|
|
323
|
+
no_overlap_regions.append((match.start(), match.end(), 'metadata'))
|
|
324
|
+
|
|
325
|
+
# Data analysis blocks
|
|
326
|
+
for match in re.finditer(DATA_ANALYSIS_PATTERN, text, re.DOTALL):
|
|
327
|
+
no_overlap_regions.append((match.start(), match.end(), 'data_analysis'))
|
|
328
|
+
|
|
329
|
+
# Block protected regions (excluding images - handled separately)
|
|
330
|
+
block_regions = []
|
|
331
|
+
for t_start, t_end in protected_regions:
|
|
332
|
+
is_image = False
|
|
333
|
+
for img_start, img_end in image_regions:
|
|
334
|
+
if t_start == img_start and t_end == img_end:
|
|
335
|
+
is_image = True
|
|
336
|
+
break
|
|
337
|
+
if not is_image:
|
|
338
|
+
block_regions.append((t_start, t_end))
|
|
339
|
+
|
|
340
|
+
# When force_chunking, directly scan for HTML tables
|
|
341
|
+
# (to handle tables not registered in protected_regions)
|
|
342
|
+
html_table_regions = []
|
|
343
|
+
markdown_table_regions = []
|
|
344
|
+
|
|
345
|
+
if force_chunking:
|
|
346
|
+
# Scan for HTML tables
|
|
347
|
+
for match in re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE):
|
|
348
|
+
t_start, t_end = match.start(), match.end()
|
|
349
|
+
# Check if already in block_regions
|
|
350
|
+
already_in_block = any(
|
|
351
|
+
bs <= t_start and be >= t_end
|
|
352
|
+
for bs, be in block_regions
|
|
353
|
+
)
|
|
354
|
+
if not already_in_block:
|
|
355
|
+
html_table_regions.append((t_start, t_end, 'html'))
|
|
356
|
+
|
|
357
|
+
# Scan for Markdown tables
|
|
358
|
+
for match in re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE):
|
|
359
|
+
table_start = match.start()
|
|
360
|
+
if match.group(0).startswith('\n'):
|
|
361
|
+
table_start += 1
|
|
362
|
+
t_start, t_end = table_start, match.end()
|
|
363
|
+
# Check if already in block_regions
|
|
364
|
+
already_in_block = any(
|
|
365
|
+
bs <= t_start and be >= t_end
|
|
366
|
+
for bs, be in block_regions
|
|
367
|
+
)
|
|
368
|
+
if not already_in_block:
|
|
369
|
+
markdown_table_regions.append((t_start, t_end, 'markdown'))
|
|
370
|
+
|
|
371
|
+
# Combine all block regions with type info
|
|
372
|
+
# Convert existing block_regions to include type
|
|
373
|
+
all_block_regions_with_type = [(s, e, 'block') for s, e in block_regions]
|
|
374
|
+
all_block_regions_with_type.extend(html_table_regions)
|
|
375
|
+
all_block_regions_with_type.extend(markdown_table_regions)
|
|
376
|
+
|
|
377
|
+
# Sort by start position
|
|
378
|
+
all_block_regions_with_type.sort(key=lambda x: x[0])
|
|
379
|
+
|
|
380
|
+
# Extract just positions for compatibility
|
|
381
|
+
all_block_regions = [(s, e) for s, e, _ in all_block_regions_with_type]
|
|
382
|
+
|
|
383
|
+
# Create mapping from position to type
|
|
384
|
+
region_type_map = {(s, e): t for s, e, t in all_block_regions_with_type}
|
|
385
|
+
|
|
386
|
+
chunks = []
|
|
387
|
+
current_pos = 0
|
|
388
|
+
text_len = len(text)
|
|
389
|
+
|
|
390
|
+
while current_pos < text_len:
|
|
391
|
+
# If remaining text is <= chunk_size, it's the last chunk
|
|
392
|
+
remaining = text_len - current_pos
|
|
393
|
+
if remaining <= chunk_size:
|
|
394
|
+
chunk = text[current_pos:].strip()
|
|
395
|
+
if chunk:
|
|
396
|
+
chunks.append(chunk)
|
|
397
|
+
break
|
|
398
|
+
|
|
399
|
+
# Calculate chunk_size endpoint
|
|
400
|
+
tentative_end = current_pos + chunk_size
|
|
401
|
+
|
|
402
|
+
# Check if there's a block protected region in this range
|
|
403
|
+
block_in_range = None
|
|
404
|
+
block_type = None
|
|
405
|
+
for t_start, t_end in all_block_regions:
|
|
406
|
+
if t_start < tentative_end and t_end > current_pos:
|
|
407
|
+
block_in_range = (t_start, t_end)
|
|
408
|
+
block_type = region_type_map.get((t_start, t_end), 'block')
|
|
409
|
+
break
|
|
410
|
+
|
|
411
|
+
if block_in_range:
|
|
412
|
+
t_start, t_end = block_in_range
|
|
413
|
+
table_size = t_end - t_start
|
|
414
|
+
|
|
415
|
+
if t_start <= current_pos:
|
|
416
|
+
# Current position is inside or at start of table/block
|
|
417
|
+
if table_size > chunk_size:
|
|
418
|
+
# Table/block is larger than chunk_size
|
|
419
|
+
table_content = text[t_start:t_end].strip()
|
|
420
|
+
|
|
421
|
+
# CRITICAL: Only split tables when force_chunking=True
|
|
422
|
+
# When force_chunking=False, tables are protected and should NOT be split
|
|
423
|
+
if force_chunking:
|
|
424
|
+
# Check type and split efficiently
|
|
425
|
+
if block_type == 'html' or table_content.startswith('<table'):
|
|
426
|
+
# HTML table - split by rows with NO overlap
|
|
427
|
+
from .table_chunker import chunk_large_table
|
|
428
|
+
table_chunks = chunk_large_table(table_content, chunk_size, 0, "")
|
|
429
|
+
chunks.extend(table_chunks)
|
|
430
|
+
elif block_type == 'markdown' or _is_markdown_table(table_content):
|
|
431
|
+
# Markdown table - split by rows with NO overlap
|
|
432
|
+
from .table_chunker import chunk_large_markdown_table
|
|
433
|
+
table_chunks = chunk_large_markdown_table(table_content, chunk_size, 0, "")
|
|
434
|
+
chunks.extend(table_chunks)
|
|
435
|
+
else:
|
|
436
|
+
# Charts, textboxes, etc. -> single chunk (never split)
|
|
437
|
+
if table_content:
|
|
438
|
+
chunks.append(table_content)
|
|
439
|
+
else:
|
|
440
|
+
# force_chunking=False: Keep entire block as single chunk
|
|
441
|
+
# Tables, charts, textboxes, etc. are protected and never split
|
|
442
|
+
if table_content:
|
|
443
|
+
chunks.append(table_content)
|
|
444
|
+
|
|
445
|
+
# Protected blocks have NO overlap - move to end
|
|
446
|
+
current_pos = t_end
|
|
447
|
+
else:
|
|
448
|
+
# Table fits in chunk_size -> try to include table + text after
|
|
449
|
+
end_pos = min(t_end + (chunk_size - table_size), text_len)
|
|
450
|
+
|
|
451
|
+
# Check for collision with next block region (excluding images)
|
|
452
|
+
for next_t_start, next_t_end in all_block_regions:
|
|
453
|
+
if next_t_start > t_end and next_t_start < end_pos:
|
|
454
|
+
end_pos = next_t_start
|
|
455
|
+
break
|
|
456
|
+
|
|
457
|
+
# Adjust if end_pos is in the middle of an image or protected tag
|
|
458
|
+
end_pos, ends_with_image = _adjust_for_image_boundary(end_pos, image_regions, text_len)
|
|
459
|
+
ends_with_no_overlap = _check_ends_with_no_overlap_region(end_pos, no_overlap_regions)
|
|
460
|
+
|
|
461
|
+
chunk = text[current_pos:end_pos].strip()
|
|
462
|
+
if chunk:
|
|
463
|
+
chunks.append(chunk)
|
|
464
|
+
|
|
465
|
+
# Determine if this chunk contains a table (for overlap decision)
|
|
466
|
+
chunk_has_table = (block_type in ('html', 'markdown') or
|
|
467
|
+
text[t_start:t_end].strip().startswith('<table') or
|
|
468
|
+
_is_markdown_table(text[t_start:t_end]))
|
|
469
|
+
|
|
470
|
+
# NO overlap for: tables, images, page/slide tags, charts, metadata
|
|
471
|
+
if ends_with_image or ends_with_no_overlap or chunk_has_table:
|
|
472
|
+
current_pos = end_pos
|
|
473
|
+
else:
|
|
474
|
+
current_pos = max(t_end, end_pos - chunk_overlap)
|
|
475
|
+
else:
|
|
476
|
+
# Table is in the middle of potential chunk
|
|
477
|
+
space_before_table = t_start - current_pos
|
|
478
|
+
space_with_table = t_end - current_pos
|
|
479
|
+
|
|
480
|
+
if space_with_table <= chunk_size:
|
|
481
|
+
# Can include entire table -> include up to table end
|
|
482
|
+
end_pos = t_end
|
|
483
|
+
|
|
484
|
+
# Check if we can add text after table with remaining space
|
|
485
|
+
remaining_space = chunk_size - space_with_table
|
|
486
|
+
if remaining_space > 0:
|
|
487
|
+
potential_end = min(t_end + remaining_space, text_len)
|
|
488
|
+
|
|
489
|
+
# Check for collision with next block region (excluding images)
|
|
490
|
+
for next_t_start, next_t_end in all_block_regions:
|
|
491
|
+
if next_t_start > t_end and next_t_start < potential_end:
|
|
492
|
+
potential_end = next_t_start
|
|
493
|
+
break
|
|
494
|
+
|
|
495
|
+
end_pos = potential_end
|
|
496
|
+
|
|
497
|
+
# Adjust if end_pos is in the middle of an image or protected tag
|
|
498
|
+
end_pos, ends_with_image = _adjust_for_image_boundary(end_pos, image_regions, text_len)
|
|
499
|
+
ends_with_no_overlap = _check_ends_with_no_overlap_region(end_pos, no_overlap_regions)
|
|
500
|
+
|
|
501
|
+
chunk = text[current_pos:end_pos].strip()
|
|
502
|
+
if chunk:
|
|
503
|
+
chunks.append(chunk)
|
|
504
|
+
|
|
505
|
+
# Determine if this chunk ends with a table
|
|
506
|
+
chunk_ends_with_table = (end_pos == t_end or
|
|
507
|
+
(block_type in ('html', 'markdown')))
|
|
508
|
+
|
|
509
|
+
# NO overlap for: tables, images, page/slide tags, charts, metadata
|
|
510
|
+
if ends_with_image or ends_with_no_overlap or chunk_ends_with_table:
|
|
511
|
+
current_pos = end_pos
|
|
512
|
+
else:
|
|
513
|
+
current_pos = max(t_end, end_pos - chunk_overlap)
|
|
514
|
+
else:
|
|
515
|
+
# Cannot include entire table
|
|
516
|
+
if space_before_table > chunk_overlap:
|
|
517
|
+
# Split text before table first
|
|
518
|
+
end_pos = t_start
|
|
519
|
+
# Adjust if end_pos is in the middle of an image or protected tag
|
|
520
|
+
end_pos, ends_with_image = _adjust_for_image_boundary(end_pos, image_regions, text_len)
|
|
521
|
+
ends_with_no_overlap = _check_ends_with_no_overlap_region(end_pos, no_overlap_regions)
|
|
522
|
+
|
|
523
|
+
chunk = text[current_pos:end_pos].strip()
|
|
524
|
+
if chunk:
|
|
525
|
+
chunks.append(chunk)
|
|
526
|
+
|
|
527
|
+
# NO overlap for: images, page/slide tags, charts, metadata
|
|
528
|
+
if ends_with_image or ends_with_no_overlap:
|
|
529
|
+
current_pos = end_pos
|
|
530
|
+
else:
|
|
531
|
+
current_pos = max(current_pos + 1, t_start - chunk_overlap)
|
|
532
|
+
else:
|
|
533
|
+
# Space before table too small -> handle table
|
|
534
|
+
table_content = text[t_start:t_end].strip()
|
|
535
|
+
|
|
536
|
+
# CRITICAL: Only split tables when force_chunking=True
|
|
537
|
+
# When force_chunking=False, tables are protected and should NOT be split
|
|
538
|
+
if table_size > chunk_size and force_chunking:
|
|
539
|
+
if block_type == 'html' or table_content.startswith('<table'):
|
|
540
|
+
# HTML table - split by rows with NO overlap
|
|
541
|
+
from .table_chunker import chunk_large_table
|
|
542
|
+
table_chunks = chunk_large_table(table_content, chunk_size, 0, "")
|
|
543
|
+
chunks.extend(table_chunks)
|
|
544
|
+
elif block_type == 'markdown' or _is_markdown_table(table_content):
|
|
545
|
+
# Markdown table - split by rows with NO overlap
|
|
546
|
+
from .table_chunker import chunk_large_markdown_table
|
|
547
|
+
table_chunks = chunk_large_markdown_table(table_content, chunk_size, 0, "")
|
|
548
|
+
chunks.extend(table_chunks)
|
|
549
|
+
else:
|
|
550
|
+
# Charts, textboxes, etc. -> single chunk
|
|
551
|
+
if table_content:
|
|
552
|
+
chunks.append(table_content)
|
|
553
|
+
else:
|
|
554
|
+
# force_chunking=False OR table fits in chunk_size: single chunk
|
|
555
|
+
if table_content:
|
|
556
|
+
chunks.append(table_content)
|
|
557
|
+
# Tables have NO overlap
|
|
558
|
+
current_pos = t_end
|
|
559
|
+
else:
|
|
560
|
+
# No block protected region -> find best split point
|
|
561
|
+
best_split = tentative_end
|
|
562
|
+
|
|
563
|
+
# Look for paragraph separator
|
|
564
|
+
search_start = max(current_pos, tentative_end - 200)
|
|
565
|
+
para_match = None
|
|
566
|
+
for m in re.finditer(r'\n\s*\n', text[search_start:tentative_end]):
|
|
567
|
+
para_match = m
|
|
568
|
+
|
|
569
|
+
if para_match:
|
|
570
|
+
best_split = search_start + para_match.end()
|
|
571
|
+
else:
|
|
572
|
+
# Look for newline
|
|
573
|
+
newline_pos = text.rfind('\n', current_pos, tentative_end)
|
|
574
|
+
if newline_pos > current_pos + chunk_size // 2:
|
|
575
|
+
best_split = newline_pos + 1
|
|
576
|
+
else:
|
|
577
|
+
# Look for space
|
|
578
|
+
space_pos = text.rfind(' ', current_pos, tentative_end)
|
|
579
|
+
if space_pos > current_pos + chunk_size // 2:
|
|
580
|
+
best_split = space_pos + 1
|
|
581
|
+
|
|
582
|
+
# Adjust if best_split is in the middle of an image or protected tag
|
|
583
|
+
best_split, ends_with_image = _adjust_for_image_boundary(best_split, image_regions, text_len)
|
|
584
|
+
ends_with_no_overlap = _check_ends_with_no_overlap_region(best_split, no_overlap_regions)
|
|
585
|
+
|
|
586
|
+
chunk = text[current_pos:best_split].strip()
|
|
587
|
+
if chunk:
|
|
588
|
+
chunks.append(chunk)
|
|
589
|
+
|
|
590
|
+
# NO overlap for: images, page/slide tags, charts, metadata
|
|
591
|
+
if ends_with_image or ends_with_no_overlap:
|
|
592
|
+
current_pos = best_split
|
|
593
|
+
else:
|
|
594
|
+
current_pos = best_split - chunk_overlap
|
|
595
|
+
if current_pos < 0:
|
|
596
|
+
current_pos = best_split
|
|
597
|
+
|
|
598
|
+
return chunks
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def _adjust_for_image_boundary(
|
|
602
|
+
pos: int,
|
|
603
|
+
image_regions: List[Tuple[int, int]],
|
|
604
|
+
text_len: int
|
|
605
|
+
) -> Tuple[int, bool]:
|
|
606
|
+
"""
|
|
607
|
+
Check if position is in the middle of an image tag and adjust to image end if so.
|
|
608
|
+
|
|
609
|
+
Args:
|
|
610
|
+
pos: Current split position
|
|
611
|
+
image_regions: Image tag position list [(start, end), ...]
|
|
612
|
+
text_len: Total text length
|
|
613
|
+
|
|
614
|
+
Returns:
|
|
615
|
+
(adjusted_pos, ends_with_image): Adjusted position and whether it ends with image
|
|
616
|
+
"""
|
|
617
|
+
for img_start, img_end in image_regions:
|
|
618
|
+
# If split position is in the middle of an image tag
|
|
619
|
+
if img_start < pos < img_end:
|
|
620
|
+
# Extend to image end
|
|
621
|
+
return min(img_end, text_len), True
|
|
622
|
+
# If split position is right after an image tag (including space/newline)
|
|
623
|
+
if img_end <= pos <= img_end + 5:
|
|
624
|
+
return pos, True
|
|
625
|
+
return pos, False
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def _check_ends_with_no_overlap_region(
|
|
629
|
+
end_pos: int,
|
|
630
|
+
no_overlap_regions: List[Tuple[int, int, str]],
|
|
631
|
+
tolerance: int = 5
|
|
632
|
+
) -> bool:
|
|
633
|
+
"""
|
|
634
|
+
Check if position ends with or is right after a no-overlap region.
|
|
635
|
+
|
|
636
|
+
Args:
|
|
637
|
+
end_pos: End position of chunk
|
|
638
|
+
no_overlap_regions: List of (start, end, type) for no-overlap regions
|
|
639
|
+
tolerance: Number of characters after region end to still consider it as ending with region
|
|
640
|
+
|
|
641
|
+
Returns:
|
|
642
|
+
True if position ends with a no-overlap region
|
|
643
|
+
"""
|
|
644
|
+
for region_start, region_end, _ in no_overlap_regions:
|
|
645
|
+
# If end_pos is exactly at or just after the region end (within tolerance)
|
|
646
|
+
if region_end <= end_pos <= region_end + tolerance:
|
|
647
|
+
return True
|
|
648
|
+
return False
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
def split_large_chunk_with_protected_regions(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    is_table_based: bool = False,
    force_chunking: bool = False,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    page_tag_processor: Optional[Any] = None,
    metadata_pattern: Optional[str] = None
) -> List[str]:
    """
    Split oversized text into chunks while keeping protected regions intact.

    This is a convenience entry point that runs three steps in order:
        1. find_protected_regions() locates the protected regions in the text
        2. get_protected_region_positions() reduces them to positions
        3. split_with_protected_regions() performs the actual split

    Regions that are never overlapped between chunks:
        - Image tags, Page/Slide/Sheet tags, Chart blocks, Metadata blocks
        - Tables (when split, they are split by rows with NO overlap)

    Notes on force_chunking=True:
        - Tables are not registered as protected regions in find_protected_regions
        - split_with_protected_regions still scans for tables itself and
          splits the oversized ones by rows with NO overlap
        - Charts and other non-table blocks are still kept whole

    Args:
        text: Text to split
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to protected regions)
        is_table_based: Whether file is table-based
        force_chunking: Force chunking mode
        image_pattern: Custom image tag pattern
        chart_pattern: Custom chart block pattern
        page_tag_processor: PageTagProcessor instance for custom page/slide/sheet patterns
        metadata_pattern: Custom metadata block pattern

    Returns:
        List of chunks
    """
    # Step 1 + 2: discover the regions, then keep only their positions.
    regions = find_protected_regions(
        text,
        is_table_based,
        force_chunking,
        image_pattern,
        chart_pattern,
        page_tag_processor,
        metadata_pattern
    )
    positions = get_protected_region_positions(regions)

    # Step 3: the splitter receives force_chunking as well, because it handles
    # tables on its own in that mode (see the notes in the docstring).
    chunks = split_with_protected_regions(
        text,
        positions,
        chunk_size,
        chunk_overlap,
        force_chunking,
        image_pattern,
        chart_pattern,
        page_tag_processor,
        metadata_pattern
    )
    return chunks
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
# Backward compatibility aliases
|
|
704
|
+
def ensure_table_integrity(content: str, table_pattern: str) -> str:
    """
    Deprecated: Use ensure_protected_region_integrity instead.

    Kept as a backward compatibility alias. The call is forwarded with
    ``content`` only; ``table_pattern`` is accepted so that existing callers
    keep working, but it is not used.

    Args:
        content: Content passed through to ensure_protected_region_integrity
        table_pattern: Ignored

    Returns:
        Whatever ensure_protected_region_integrity(content) returns
    """
    checked_content = ensure_protected_region_integrity(content)
    return checked_content
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
def split_large_chunk_with_table_protection(
    text: str,
    chunk_size: int,
    chunk_overlap: int
) -> List[str]:
    """
    Deprecated: Use split_large_chunk_with_protected_regions instead.

    Kept as a backward compatibility alias. Forwards to
    split_large_chunk_with_protected_regions with is_table_based=False and
    every other option left at its default.

    Args:
        text: Text to split
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks

    Returns:
        List of chunks
    """
    return split_large_chunk_with_protected_regions(
        text,
        chunk_size,
        chunk_overlap,
        is_table_based=False
    )
|