xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,715 @@
1
+ # xgen_doc2chunk/chunking/protected_regions.py
2
+ """
3
+ Protected Regions - Protected region detection and processing
4
+
5
+ Main Features:
6
+ - Detect protected regions that should not be split during chunking
7
+ - Split text while protecting protected regions
8
+ - Efficient handling of large tables (HTML and Markdown)
9
+ - Row-level chunking for tables with NO overlap
10
+ - Support for dynamic tag patterns from processors (Image, Chart, Page, Slide, Metadata)
11
+ - Protected regions NEVER overlap when splitting chunks
12
+ """
13
+ import logging
14
+ import re
15
+ from typing import Any, List, Optional, Tuple
16
+
17
+ from xgen_doc2chunk.chunking.constants import (
18
+ HTML_TABLE_PATTERN, CHART_BLOCK_PATTERN, TEXTBOX_BLOCK_PATTERN,
19
+ IMAGE_TAG_PATTERN, MARKDOWN_TABLE_PATTERN,
20
+ PAGE_TAG_PATTERN, SLIDE_TAG_PATTERN, SHEET_TAG_PATTERN,
21
+ PAGE_TAG_OCR_PATTERN, SLIDE_TAG_OCR_PATTERN,
22
+ METADATA_BLOCK_PATTERN, DATA_ANALYSIS_PATTERN
23
+ )
24
+
25
+ logger = logging.getLogger("document-processor")
26
+
27
+
28
def find_protected_regions(
    text: str,
    is_table_based: bool = False,
    force_chunking: bool = False,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    page_tag_processor: Optional[Any] = None,
    metadata_pattern: Optional[str] = None
) -> List[Tuple[int, int, str]]:
    """
    Collect the spans of ``text`` that chunking must treat as indivisible.

    The text is scanned in this order:
        1. HTML tables (HTML_TABLE_PATTERN) - skipped when ``is_table_based``
           or ``force_chunking`` is set
        2. Chart blocks (``chart_pattern`` or CHART_BLOCK_PATTERN)
        3. Textbox blocks (TEXTBOX_BLOCK_PATTERN)
        4. Image tags (``image_pattern`` or IMAGE_TAG_PATTERN)
        5. Markdown tables (MARKDOWN_TABLE_PATTERN) - skipped under the same
           condition as HTML tables; a newline at the very start of a match
           is left out of the recorded region
        6. Page / slide / sheet tags - patterns supplied by
           ``page_tag_processor`` plus the OCR page/slide patterns; the default
           patterns are used when no processor is given or when it raises
        7. Metadata blocks (``metadata_pattern`` or METADATA_BLOCK_PATTERN)
        8. Data analysis blocks (DATA_ANALYSIS_PATTERN)

    Afterwards the regions are sorted by start offset, and a region that
    starts before the previous one ends is merged into it; the merged entry
    carries both type labels joined with "+".

    Args:
        text: Text to search
        is_table_based: When truthy, whole-table regions are not recorded
        force_chunking: When truthy, whole-table regions are not recorded
        image_pattern: Image tag pattern (None -> IMAGE_TAG_PATTERN)
        chart_pattern: Chart block pattern (None -> CHART_BLOCK_PATTERN)
        page_tag_processor: Object exposing ``get_pattern_string(PageTagType)``
        metadata_pattern: Metadata block pattern (None -> METADATA_BLOCK_PATTERN)

    Returns:
        [(start, end, type), ...] sorted by start, with overlaps merged
    """
    found: List[Tuple[int, int, str]] = []

    def collect(pattern: Any, label: str, flags: int = 0) -> None:
        # Record every match of `pattern` in `text` under the given label.
        found.extend(
            (hit.start(), hit.end(), label)
            for hit in re.finditer(pattern, text, flags)
        )

    # Whole tables are only protected when neither table-based mode nor
    # forced chunking is active.
    protect_whole_tables = not (is_table_based or force_chunking)

    # 1. HTML tables
    if protect_whole_tables:
        collect(HTML_TABLE_PATTERN, 'html_table', re.DOTALL | re.IGNORECASE)

    # 2. Chart blocks
    collect(CHART_BLOCK_PATTERN if chart_pattern is None else chart_pattern, 'chart', re.DOTALL)

    # 3. Textbox blocks
    collect(TEXTBOX_BLOCK_PATTERN, 'textbox', re.DOTALL)

    # 4. Image tags
    collect(IMAGE_TAG_PATTERN if image_pattern is None else image_pattern, 'image_tag')

    # 5. Markdown tables
    if protect_whole_tables:
        for hit in re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE):
            begin = hit.start()
            if hit.group(0).startswith('\n'):
                # The newline in front of the table is not part of the region.
                begin += 1
            found.append((begin, hit.end(), 'markdown_table'))

    # 6. Page / slide / sheet tags
    if page_tag_processor is None:
        _add_default_page_tag_regions(text, found)
    else:
        try:
            from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
            collect(page_tag_processor.get_pattern_string(PageTagType.PAGE), 'page_tag', re.IGNORECASE)
            collect(PAGE_TAG_OCR_PATTERN, 'page_tag', re.IGNORECASE)
            collect(page_tag_processor.get_pattern_string(PageTagType.SLIDE), 'slide_tag', re.IGNORECASE)
            collect(SLIDE_TAG_OCR_PATTERN, 'slide_tag', re.IGNORECASE)
            collect(page_tag_processor.get_pattern_string(PageTagType.SHEET), 'sheet_tag', re.IGNORECASE)
        except Exception as e:
            # Regions gathered before the failure are kept; the default
            # patterns are scanned in addition.
            logger.warning(f"Error getting patterns from page_tag_processor: {e}, using defaults")
            _add_default_page_tag_regions(text, found)

    # 7. Metadata blocks
    collect(METADATA_BLOCK_PATTERN if metadata_pattern is None else metadata_pattern, 'metadata', re.DOTALL)

    # 8. Data analysis blocks
    collect(DATA_ANALYSIS_PATTERN, 'data_analysis', re.DOTALL)

    # Stable sort: regions sharing a start offset stay in scan order.
    found.sort(key=lambda region: region[0])

    merged: List[Tuple[int, int, str]] = []
    for begin, finish, label in found:
        if not merged or begin >= merged[-1][1]:
            merged.append((begin, finish, label))
            continue
        # Starts inside the previous region: widen it and join the labels.
        last_begin, last_finish, last_label = merged[-1]
        merged[-1] = (last_begin, max(last_finish, finish), f"{last_label}+{label}")

    return merged
149
+
150
+
151
def _add_default_page_tag_regions(text: str, regions: List[Tuple[int, int, str]]) -> None:
    """
    Append page/slide/sheet tag regions found with the default patterns.

    Each pattern is matched case-insensitively and every match is appended
    to ``regions`` as (start, end, type).

    Args:
        text: Text to search
        regions: List that receives the found regions (mutated in place)
    """
    # Scan order: page, page (OCR), slide, slide (OCR), sheet.
    default_scans = (
        (PAGE_TAG_PATTERN, 'page_tag'),
        (PAGE_TAG_OCR_PATTERN, 'page_tag'),
        (SLIDE_TAG_PATTERN, 'slide_tag'),
        (SLIDE_TAG_OCR_PATTERN, 'slide_tag'),
        (SHEET_TAG_PATTERN, 'sheet_tag'),
    )
    for pattern, label in default_scans:
        regions.extend(
            (hit.start(), hit.end(), label)
            for hit in re.finditer(pattern, text, re.IGNORECASE)
        )
174
+
175
+
176
def _add_no_overlap_tag_regions_default(text: str, regions: List[Tuple[int, int, str]]) -> None:
    """
    Append default no-overlap tag regions (page/slide/sheet) to ``regions``.

    The scan is the same one `_add_default_page_tag_regions` performs, so this
    function delegates to it instead of keeping a second copy of the loops.

    Args:
        text: Text to search
        regions: List that receives the found regions (mutated in place)
    """
    _add_default_page_tag_regions(text, regions)
199
+
200
+
201
def get_protected_region_positions(regions: List[Tuple[int, int, str]]) -> List[Tuple[int, int]]:
    """
    Drop the type label from each protected region.

    Args:
        regions: (start, end, type) triples

    Returns:
        The (start, end) pairs, in the same order as ``regions``
    """
    positions: List[Tuple[int, int]] = []
    for begin, finish, _label in regions:
        positions.append((begin, finish))
    return positions
206
+
207
def ensure_protected_region_integrity(content: str) -> str:
    """
    Check that HTML tables and default chart blocks in a chunk are balanced.

    Opening and closing ``<table>`` tags are counted case-insensitively, and
    ``[chart]`` / ``[/chart]`` markers are counted case-sensitively. A warning
    is logged for each kind whose counts differ. Markdown tables and custom
    chart tags are not checked. The content itself is never modified.

    Args:
        content: Chunk text to inspect

    Returns:
        ``content``, unchanged
    """
    # HTML table integrity check.
    # \b stops tags that merely begin with "table" (e.g. <tablet>) from being
    # counted as table openers, which would trigger a false warning.
    open_tables = len(re.findall(r'<table\b[^>]*>', content, re.IGNORECASE))
    close_tables = len(re.findall(r'</table>', content, re.IGNORECASE))
    if open_tables != close_tables:
        logger.warning(f"Incomplete HTML table detected in chunk: {open_tables} open, {close_tables} close tags")

    # Chart block integrity check (default [chart] tag only)
    open_charts = len(re.findall(r'\[chart\]', content))
    close_charts = len(re.findall(r'\[/chart\]', content))
    if open_charts != close_charts:
        logger.warning(f"Incomplete chart block detected in chunk: {open_charts} open, {close_charts} close tags")

    return content
225
+
226
+
227
+ def _is_markdown_table(text: str) -> bool:
228
+ """
229
+ Check if text contains a Markdown table pattern.
230
+ """
231
+ lines = text.strip().split('\n')
232
+ if len(lines) < 2:
233
+ return False
234
+ has_pipe_rows = any(line.strip().startswith('|') for line in lines)
235
+ has_separator = any('---' in line and '|' in line for line in lines)
236
+ return has_pipe_rows and has_separator
237
+
238
+
239
+ def split_with_protected_regions(
240
+ text: str,
241
+ protected_regions: List[Tuple[int, int]],
242
+ chunk_size: int,
243
+ chunk_overlap: int,
244
+ force_chunking: bool = False,
245
+ image_pattern: Optional[str] = None,
246
+ chart_pattern: Optional[str] = None,
247
+ page_tag_processor: Optional[Any] = None,
248
+ metadata_pattern: Optional[str] = None
249
+ ) -> List[str]:
250
+ """
251
+ Split text into chunks while protecting regions (HTML tables, charts, Markdown tables, tags).
252
+
253
+ Algorithm:
254
+ 1. Move forward by chunk_size from current position
255
+ 2. If that point is inside a protected region -> cut before region start or include until region end
256
+ 3. If protected region is larger than chunk_size:
257
+ - HTML table -> split efficiently with chunk_large_table (row-level, NO overlap)
258
+ - Markdown table -> split efficiently with chunk_large_markdown_table (row-level, NO overlap)
259
+ - Other (charts, metadata, page tags, etc.) -> single chunk for protected region
260
+ 4. Apply overlap for next chunk start ONLY for plain text
261
+ - Tables, images, charts, page/slide tags, metadata blocks: NO overlap
262
+
263
+ Protected regions that NEVER overlap:
264
+ - Image tags: [Image:...] or custom pattern
265
+ - Page/Slide/Sheet tags: [Page Number: n], etc.
266
+ - Chart blocks: [chart]...[/chart] or custom
267
+ - Metadata blocks: <Document-Metadata>...</Document-Metadata> or custom
268
+ - Tables: Split by rows, each chunk has NO overlap
269
+
270
+ force_chunking handling:
271
+ - When force_chunking=True, even if tables are not in protected_regions
272
+ - Directly scan for HTML/Markdown tables to avoid cutting in the middle
273
+ - Large tables are split by chunk_large_table/chunk_large_markdown_table with NO overlap
274
+
275
+ Args:
276
+ text: Text to split
277
+ protected_regions: List of (start, end) tuples for protected regions
278
+ chunk_size: Maximum chunk size
279
+ chunk_overlap: Overlap size (NOT applied to protected regions)
280
+ force_chunking: Force chunking mode
281
+ image_pattern: Custom image tag pattern
282
+ chart_pattern: Custom chart block pattern
283
+ page_tag_processor: PageTagProcessor instance for custom patterns
284
+ metadata_pattern: Custom metadata block pattern
285
+
286
+ Returns:
287
+ List of chunks
288
+ """
289
+ # Get image pattern (custom or default)
290
+ img_pattern = image_pattern if image_pattern is not None else IMAGE_TAG_PATTERN
291
+
292
+ # Extract image tag positions separately (to prevent mid-split and no overlap)
293
+ image_regions = []
294
+ for match in re.finditer(img_pattern, text):
295
+ image_regions.append((match.start(), match.end()))
296
+
297
+ # Extract all "no-overlap" tag regions (page, slide, sheet, chart, metadata)
298
+ no_overlap_regions: List[Tuple[int, int, str]] = []
299
+
300
+ # Page/Slide/Sheet tags
301
+ if page_tag_processor is not None:
302
+ try:
303
+ from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
304
+ for match in re.finditer(page_tag_processor.get_pattern_string(PageTagType.PAGE), text, re.IGNORECASE):
305
+ no_overlap_regions.append((match.start(), match.end(), 'page_tag'))
306
+ for match in re.finditer(page_tag_processor.get_pattern_string(PageTagType.SLIDE), text, re.IGNORECASE):
307
+ no_overlap_regions.append((match.start(), match.end(), 'slide_tag'))
308
+ for match in re.finditer(page_tag_processor.get_pattern_string(PageTagType.SHEET), text, re.IGNORECASE):
309
+ no_overlap_regions.append((match.start(), match.end(), 'sheet_tag'))
310
+ except Exception:
311
+ _add_no_overlap_tag_regions_default(text, no_overlap_regions)
312
+ else:
313
+ _add_no_overlap_tag_regions_default(text, no_overlap_regions)
314
+
315
+ # Chart blocks
316
+ chart_pat = chart_pattern if chart_pattern is not None else CHART_BLOCK_PATTERN
317
+ for match in re.finditer(chart_pat, text, re.DOTALL):
318
+ no_overlap_regions.append((match.start(), match.end(), 'chart'))
319
+
320
+ # Metadata blocks
321
+ meta_pat = metadata_pattern if metadata_pattern is not None else METADATA_BLOCK_PATTERN
322
+ for match in re.finditer(meta_pat, text, re.DOTALL):
323
+ no_overlap_regions.append((match.start(), match.end(), 'metadata'))
324
+
325
+ # Data analysis blocks
326
+ for match in re.finditer(DATA_ANALYSIS_PATTERN, text, re.DOTALL):
327
+ no_overlap_regions.append((match.start(), match.end(), 'data_analysis'))
328
+
329
+ # Block protected regions (excluding images - handled separately)
330
+ block_regions = []
331
+ for t_start, t_end in protected_regions:
332
+ is_image = False
333
+ for img_start, img_end in image_regions:
334
+ if t_start == img_start and t_end == img_end:
335
+ is_image = True
336
+ break
337
+ if not is_image:
338
+ block_regions.append((t_start, t_end))
339
+
340
+ # When force_chunking, directly scan for HTML tables
341
+ # (to handle tables not registered in protected_regions)
342
+ html_table_regions = []
343
+ markdown_table_regions = []
344
+
345
+ if force_chunking:
346
+ # Scan for HTML tables
347
+ for match in re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE):
348
+ t_start, t_end = match.start(), match.end()
349
+ # Check if already in block_regions
350
+ already_in_block = any(
351
+ bs <= t_start and be >= t_end
352
+ for bs, be in block_regions
353
+ )
354
+ if not already_in_block:
355
+ html_table_regions.append((t_start, t_end, 'html'))
356
+
357
+ # Scan for Markdown tables
358
+ for match in re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE):
359
+ table_start = match.start()
360
+ if match.group(0).startswith('\n'):
361
+ table_start += 1
362
+ t_start, t_end = table_start, match.end()
363
+ # Check if already in block_regions
364
+ already_in_block = any(
365
+ bs <= t_start and be >= t_end
366
+ for bs, be in block_regions
367
+ )
368
+ if not already_in_block:
369
+ markdown_table_regions.append((t_start, t_end, 'markdown'))
370
+
371
+ # Combine all block regions with type info
372
+ # Convert existing block_regions to include type
373
+ all_block_regions_with_type = [(s, e, 'block') for s, e in block_regions]
374
+ all_block_regions_with_type.extend(html_table_regions)
375
+ all_block_regions_with_type.extend(markdown_table_regions)
376
+
377
+ # Sort by start position
378
+ all_block_regions_with_type.sort(key=lambda x: x[0])
379
+
380
+ # Extract just positions for compatibility
381
+ all_block_regions = [(s, e) for s, e, _ in all_block_regions_with_type]
382
+
383
+ # Create mapping from position to type
384
+ region_type_map = {(s, e): t for s, e, t in all_block_regions_with_type}
385
+
386
+ chunks = []
387
+ current_pos = 0
388
+ text_len = len(text)
389
+
390
+ while current_pos < text_len:
391
+ # If remaining text is <= chunk_size, it's the last chunk
392
+ remaining = text_len - current_pos
393
+ if remaining <= chunk_size:
394
+ chunk = text[current_pos:].strip()
395
+ if chunk:
396
+ chunks.append(chunk)
397
+ break
398
+
399
+ # Calculate chunk_size endpoint
400
+ tentative_end = current_pos + chunk_size
401
+
402
+ # Check if there's a block protected region in this range
403
+ block_in_range = None
404
+ block_type = None
405
+ for t_start, t_end in all_block_regions:
406
+ if t_start < tentative_end and t_end > current_pos:
407
+ block_in_range = (t_start, t_end)
408
+ block_type = region_type_map.get((t_start, t_end), 'block')
409
+ break
410
+
411
+ if block_in_range:
412
+ t_start, t_end = block_in_range
413
+ table_size = t_end - t_start
414
+
415
+ if t_start <= current_pos:
416
+ # Current position is inside or at start of table/block
417
+ if table_size > chunk_size:
418
+ # Table/block is larger than chunk_size
419
+ table_content = text[t_start:t_end].strip()
420
+
421
+ # CRITICAL: Only split tables when force_chunking=True
422
+ # When force_chunking=False, tables are protected and should NOT be split
423
+ if force_chunking:
424
+ # Check type and split efficiently
425
+ if block_type == 'html' or table_content.startswith('<table'):
426
+ # HTML table - split by rows with NO overlap
427
+ from .table_chunker import chunk_large_table
428
+ table_chunks = chunk_large_table(table_content, chunk_size, 0, "")
429
+ chunks.extend(table_chunks)
430
+ elif block_type == 'markdown' or _is_markdown_table(table_content):
431
+ # Markdown table - split by rows with NO overlap
432
+ from .table_chunker import chunk_large_markdown_table
433
+ table_chunks = chunk_large_markdown_table(table_content, chunk_size, 0, "")
434
+ chunks.extend(table_chunks)
435
+ else:
436
+ # Charts, textboxes, etc. -> single chunk (never split)
437
+ if table_content:
438
+ chunks.append(table_content)
439
+ else:
440
+ # force_chunking=False: Keep entire block as single chunk
441
+ # Tables, charts, textboxes, etc. are protected and never split
442
+ if table_content:
443
+ chunks.append(table_content)
444
+
445
+ # Protected blocks have NO overlap - move to end
446
+ current_pos = t_end
447
+ else:
448
+ # Table fits in chunk_size -> try to include table + text after
449
+ end_pos = min(t_end + (chunk_size - table_size), text_len)
450
+
451
+ # Check for collision with next block region (excluding images)
452
+ for next_t_start, next_t_end in all_block_regions:
453
+ if next_t_start > t_end and next_t_start < end_pos:
454
+ end_pos = next_t_start
455
+ break
456
+
457
+ # Adjust if end_pos is in the middle of an image or protected tag
458
+ end_pos, ends_with_image = _adjust_for_image_boundary(end_pos, image_regions, text_len)
459
+ ends_with_no_overlap = _check_ends_with_no_overlap_region(end_pos, no_overlap_regions)
460
+
461
+ chunk = text[current_pos:end_pos].strip()
462
+ if chunk:
463
+ chunks.append(chunk)
464
+
465
+ # Determine if this chunk contains a table (for overlap decision)
466
+ chunk_has_table = (block_type in ('html', 'markdown') or
467
+ text[t_start:t_end].strip().startswith('<table') or
468
+ _is_markdown_table(text[t_start:t_end]))
469
+
470
+ # NO overlap for: tables, images, page/slide tags, charts, metadata
471
+ if ends_with_image or ends_with_no_overlap or chunk_has_table:
472
+ current_pos = end_pos
473
+ else:
474
+ current_pos = max(t_end, end_pos - chunk_overlap)
475
+ else:
476
+ # Table is in the middle of potential chunk
477
+ space_before_table = t_start - current_pos
478
+ space_with_table = t_end - current_pos
479
+
480
+ if space_with_table <= chunk_size:
481
+ # Can include entire table -> include up to table end
482
+ end_pos = t_end
483
+
484
+ # Check if we can add text after table with remaining space
485
+ remaining_space = chunk_size - space_with_table
486
+ if remaining_space > 0:
487
+ potential_end = min(t_end + remaining_space, text_len)
488
+
489
+ # Check for collision with next block region (excluding images)
490
+ for next_t_start, next_t_end in all_block_regions:
491
+ if next_t_start > t_end and next_t_start < potential_end:
492
+ potential_end = next_t_start
493
+ break
494
+
495
+ end_pos = potential_end
496
+
497
+ # Adjust if end_pos is in the middle of an image or protected tag
498
+ end_pos, ends_with_image = _adjust_for_image_boundary(end_pos, image_regions, text_len)
499
+ ends_with_no_overlap = _check_ends_with_no_overlap_region(end_pos, no_overlap_regions)
500
+
501
+ chunk = text[current_pos:end_pos].strip()
502
+ if chunk:
503
+ chunks.append(chunk)
504
+
505
+ # Determine if this chunk ends with a table
506
+ chunk_ends_with_table = (end_pos == t_end or
507
+ (block_type in ('html', 'markdown')))
508
+
509
+ # NO overlap for: tables, images, page/slide tags, charts, metadata
510
+ if ends_with_image or ends_with_no_overlap or chunk_ends_with_table:
511
+ current_pos = end_pos
512
+ else:
513
+ current_pos = max(t_end, end_pos - chunk_overlap)
514
+ else:
515
+ # Cannot include entire table
516
+ if space_before_table > chunk_overlap:
517
+ # Split text before table first
518
+ end_pos = t_start
519
+ # Adjust if end_pos is in the middle of an image or protected tag
520
+ end_pos, ends_with_image = _adjust_for_image_boundary(end_pos, image_regions, text_len)
521
+ ends_with_no_overlap = _check_ends_with_no_overlap_region(end_pos, no_overlap_regions)
522
+
523
+ chunk = text[current_pos:end_pos].strip()
524
+ if chunk:
525
+ chunks.append(chunk)
526
+
527
+ # NO overlap for: images, page/slide tags, charts, metadata
528
+ if ends_with_image or ends_with_no_overlap:
529
+ current_pos = end_pos
530
+ else:
531
+ current_pos = max(current_pos + 1, t_start - chunk_overlap)
532
+ else:
533
+ # Space before table too small -> handle table
534
+ table_content = text[t_start:t_end].strip()
535
+
536
+ # CRITICAL: Only split tables when force_chunking=True
537
+ # When force_chunking=False, tables are protected and should NOT be split
538
+ if table_size > chunk_size and force_chunking:
539
+ if block_type == 'html' or table_content.startswith('<table'):
540
+ # HTML table - split by rows with NO overlap
541
+ from .table_chunker import chunk_large_table
542
+ table_chunks = chunk_large_table(table_content, chunk_size, 0, "")
543
+ chunks.extend(table_chunks)
544
+ elif block_type == 'markdown' or _is_markdown_table(table_content):
545
+ # Markdown table - split by rows with NO overlap
546
+ from .table_chunker import chunk_large_markdown_table
547
+ table_chunks = chunk_large_markdown_table(table_content, chunk_size, 0, "")
548
+ chunks.extend(table_chunks)
549
+ else:
550
+ # Charts, textboxes, etc. -> single chunk
551
+ if table_content:
552
+ chunks.append(table_content)
553
+ else:
554
+ # force_chunking=False OR table fits in chunk_size: single chunk
555
+ if table_content:
556
+ chunks.append(table_content)
557
+ # Tables have NO overlap
558
+ current_pos = t_end
559
+ else:
560
+ # No block protected region -> find best split point
561
+ best_split = tentative_end
562
+
563
+ # Look for paragraph separator
564
+ search_start = max(current_pos, tentative_end - 200)
565
+ para_match = None
566
+ for m in re.finditer(r'\n\s*\n', text[search_start:tentative_end]):
567
+ para_match = m
568
+
569
+ if para_match:
570
+ best_split = search_start + para_match.end()
571
+ else:
572
+ # Look for newline
573
+ newline_pos = text.rfind('\n', current_pos, tentative_end)
574
+ if newline_pos > current_pos + chunk_size // 2:
575
+ best_split = newline_pos + 1
576
+ else:
577
+ # Look for space
578
+ space_pos = text.rfind(' ', current_pos, tentative_end)
579
+ if space_pos > current_pos + chunk_size // 2:
580
+ best_split = space_pos + 1
581
+
582
+ # Adjust if best_split is in the middle of an image or protected tag
583
+ best_split, ends_with_image = _adjust_for_image_boundary(best_split, image_regions, text_len)
584
+ ends_with_no_overlap = _check_ends_with_no_overlap_region(best_split, no_overlap_regions)
585
+
586
+ chunk = text[current_pos:best_split].strip()
587
+ if chunk:
588
+ chunks.append(chunk)
589
+
590
+ # NO overlap for: images, page/slide tags, charts, metadata
591
+ if ends_with_image or ends_with_no_overlap:
592
+ current_pos = best_split
593
+ else:
594
+ current_pos = best_split - chunk_overlap
595
+ if current_pos < 0:
596
+ current_pos = best_split
597
+
598
+ return chunks
599
+
600
+
601
+ def _adjust_for_image_boundary(
602
+ pos: int,
603
+ image_regions: List[Tuple[int, int]],
604
+ text_len: int
605
+ ) -> Tuple[int, bool]:
606
+ """
607
+ Check if position is in the middle of an image tag and adjust to image end if so.
608
+
609
+ Args:
610
+ pos: Current split position
611
+ image_regions: Image tag position list [(start, end), ...]
612
+ text_len: Total text length
613
+
614
+ Returns:
615
+ (adjusted_pos, ends_with_image): Adjusted position and whether it ends with image
616
+ """
617
+ for img_start, img_end in image_regions:
618
+ # If split position is in the middle of an image tag
619
+ if img_start < pos < img_end:
620
+ # Extend to image end
621
+ return min(img_end, text_len), True
622
+ # If split position is right after an image tag (including space/newline)
623
+ if img_end <= pos <= img_end + 5:
624
+ return pos, True
625
+ return pos, False
626
+
627
+
628
+ def _check_ends_with_no_overlap_region(
629
+ end_pos: int,
630
+ no_overlap_regions: List[Tuple[int, int, str]],
631
+ tolerance: int = 5
632
+ ) -> bool:
633
+ """
634
+ Check if position ends with or is right after a no-overlap region.
635
+
636
+ Args:
637
+ end_pos: End position of chunk
638
+ no_overlap_regions: List of (start, end, type) for no-overlap regions
639
+ tolerance: Number of characters after region end to still consider it as ending with region
640
+
641
+ Returns:
642
+ True if position ends with a no-overlap region
643
+ """
644
+ for region_start, region_end, _ in no_overlap_regions:
645
+ # If end_pos is exactly at or just after the region end (within tolerance)
646
+ if region_end <= end_pos <= region_end + tolerance:
647
+ return True
648
+ return False
649
+
650
+
651
def split_large_chunk_with_protected_regions(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    is_table_based: bool = False,
    force_chunking: bool = False,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    page_tag_processor: Optional[Any] = None,
    metadata_pattern: Optional[str] = None
) -> List[str]:
    """
    Split an oversized chunk while keeping protected regions intact.

    This is a two-stage convenience wrapper:
      1. find_protected_regions + get_protected_region_positions locate the regions.
      2. split_with_protected_regions performs the actual split around them.

    Overlap policy applied by the splitter:
      - No overlap after image tags, no-overlap regions (page/slide tags, charts,
        metadata) or tables.
      - Oversized HTML/Markdown tables are split by rows only when
        force_chunking=True; otherwise the whole block is kept as one chunk.
      - Other oversized blocks (charts, textboxes, ...) are always kept whole.

    NOTE(review): per the module's earlier documentation, find_protected_regions
    does not register tables when force_chunking=True and the splitter scans for
    them itself; that part is outside this function - confirm there.

    Args:
        text: Text to split
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to protected regions)
        is_table_based: Whether file is table-based (only used for region detection)
        force_chunking: Force chunking mode
        image_pattern: Custom image tag pattern
        chart_pattern: Custom chart block pattern
        page_tag_processor: PageTagProcessor instance for custom page/slide/sheet patterns
        metadata_pattern: Custom metadata block pattern

    Returns:
        List of chunks
    """
    # The same four pattern arguments are forwarded, in this order, to both stages
    pattern_args = (image_pattern, chart_pattern, page_tag_processor, metadata_pattern)

    region_positions = get_protected_region_positions(
        find_protected_regions(text, is_table_based, force_chunking, *pattern_args)
    )

    return split_with_protected_regions(
        text, region_positions, chunk_size, chunk_overlap, force_chunking,
        *pattern_args
    )
701
+
702
+
703
+ # Backward compatibility aliases
704
def ensure_table_integrity(content: str, table_pattern: str) -> str:
    """
    Deprecated: Use ensure_protected_region_integrity instead.

    Kept for backward compatibility. `table_pattern` is accepted so old call
    sites keep working, but it is not used.
    """
    # table_pattern is intentionally ignored; only content is forwarded
    checked_content = ensure_protected_region_integrity(content)
    return checked_content
707
+
708
+
709
def split_large_chunk_with_table_protection(
    text: str,
    chunk_size: int,
    chunk_overlap: int
) -> List[str]:
    """
    Deprecated: Use split_large_chunk_with_protected_regions instead.

    Kept for backward compatibility. Forwards to the replacement with
    is_table_based=False and every other option left at its default.
    """
    return split_large_chunk_with_protected_regions(
        text=text,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        is_table_based=False,
    )