xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,406 @@
1
+ # xgen_doc2chunk/chunking/sheet_processor.py
2
+ """
3
+ Sheet Processor - Sheet and metadata processing
4
+
5
+ Main Features:
6
+ - Document metadata extraction
7
+ - Sheet section extraction
8
+ - Multi-sheet content chunking
9
+ - Single table content chunking
10
+ - NO overlap for table chunks (intentional for search quality)
11
+ """
12
+ import logging
13
+ import re
14
+ from typing import List, Optional, Tuple
15
+
16
+ from xgen_doc2chunk.chunking.constants import (
17
+ HTML_TABLE_PATTERN,
18
+ MARKDOWN_TABLE_PATTERN,
19
+ IMAGE_TAG_PATTERN,
20
+ CHART_BLOCK_PATTERN,
21
+ TEXTBOX_BLOCK_PATTERN
22
+ )
23
+
24
+ logger = logging.getLogger("document-processor")
25
+
26
+
27
def extract_document_metadata(
    text: str,
    metadata_pattern: Optional[str] = None
) -> Tuple[Optional[str], str]:
    """
    Pull the first Document-Metadata block out of a text.

    Args:
        text: Text to search
        metadata_pattern: Regex for the metadata block; when None the default
            ``<Document-Metadata>...</Document-Metadata>`` pattern is used

    Returns:
        (metadata_block, remaining_text). When a block is found both values are
        stripped of surrounding whitespace. When nothing matches the result is
        (None, text) with the text returned untouched.
    """
    if metadata_pattern is None:
        metadata_pattern = r'<Document-Metadata>.*?</Document-Metadata>\s*'

    # DOTALL lets the block span several lines
    found = re.search(metadata_pattern, text, re.DOTALL)
    if found is None:
        return None, text

    # Cut the matched span out and glue the surrounding text back together
    block_start, block_end = found.span()
    leftover = text[:block_start] + text[block_end:]
    return found.group(0).strip(), leftover.strip()
51
+
52
+
53
def prepend_metadata_to_chunks(chunks: List[str], metadata: Optional[str]) -> List[str]:
    """
    Put the metadata block in front of every chunk.

    Args:
        chunks: Chunks to decorate
        metadata: Metadata block; None or an empty string means "nothing to add"

    Returns:
        A new list where each chunk is preceded by the metadata and a blank
        line. Without metadata the input list itself is returned unchanged.
    """
    if metadata:
        decorated: List[str] = []
        for piece in chunks:
            decorated.append(f"{metadata}\n\n{piece}")
        return decorated
    return chunks
67
+
68
+
69
def extract_sheet_sections(text: str) -> List[Tuple[str, str]]:
    """
    Split a text into its Excel sheet sections.

    A section starts right after a ``[Sheet: name]`` marker and runs up to the
    next marker (or the end of the text). Text before the first marker is not
    part of any section, and sections whose body is blank are left out.

    Args:
        text: Full text

    Returns:
        [(sheet_name, sheet_content), ...] where sheet_content is a normalised
        ``[Sheet: name]`` marker line followed by the stripped section body
    """
    # Only the standard marker format is recognised
    markers = list(re.finditer(r'\[Sheet:\s*([^\]]+)\]', text))

    # Each section ends where the next marker begins; the last one ends with the text
    section_ends = [marker.start() for marker in markers[1:]] + [len(text)]

    sections: List[Tuple[str, str]] = []
    for marker, section_end in zip(markers, section_ends):
        body = text[marker.end():section_end].strip()
        if not body:
            continue
        name = marker.group(1).strip()
        # Re-emit the marker in canonical form at the top of the section
        header = '[Sheet: {name}]'.format(name=name)
        sections.append((name, f"{header}\n{body}"))

    return sections
103
+
104
+
105
def extract_content_segments(
    content: str,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None
) -> List[Tuple[str, str]]:
    """
    Extract various types of segments from content.

    Segment Types:
    - table: HTML table or Markdown table (including table markers)
    - textbox: block matched by TEXTBOX_BLOCK_PATTERN
    - chart: block matched by the chart pattern
    - image: tag matched by the image pattern
    - text: Plain text between the special blocks

    Overlapping matches are resolved so that the match starting first is kept;
    among matches that start at the same offset the longest one is kept.

    Args:
        content: Content to parse
        image_pattern: Custom image tag pattern (if None, uses default IMAGE_TAG_PATTERN)
        chart_pattern: Custom chart block pattern (if None, uses default CHART_BLOCK_PATTERN)

    Returns:
        [(segment_type, segment_content), ...] list in document order, with
        every segment stripped of surrounding whitespace
    """
    segments: List[Tuple[str, str]] = []

    # Use custom patterns or defaults
    img_pat = image_pattern if image_pattern is not None else IMAGE_TAG_PATTERN
    chart_pat = chart_pattern if chart_pattern is not None else CHART_BLOCK_PATTERN

    # Define special block patterns
    # Recognize [Table N] marker together with table as a single block
    patterns = [
        # [Table N] + HTML table
        ('table', r'(?:\[Table\s*\d+\]\s*)?<table\s+border=["\']1["\']>.*?</table>'),
        # [Table N] + Markdown table (multiple lines starting with |, last row matches even without newline)
        ('table', r'\[Table\s*\d+\]\s*\n(?:\|[^\n]*\|(?:\s*\n|$))+'),
        # Standalone Markdown table (starts with | and has --- separator, last row matches even without newline)
        ('table', r'(?:^|\n)(\|[^\n]*\|\s*\n\|[\s\-:]*\|[^\n]*(?:\n\|[^\n]*\|)*)'),
        ('textbox', TEXTBOX_BLOCK_PATTERN),
        ('chart', chart_pat),
        ('image', img_pat),
    ]
    search_flags = re.DOTALL | re.IGNORECASE | re.MULTILINE

    # A bare "[Table N]" marker is never emitted as a text segment
    marker_only = re.compile(r'^\[Table\s*\d+\]\s*$')

    # Find all special block positions
    all_matches: List[Tuple[int, int, str, str]] = []  # (start, end, type, content)

    for segment_type, pattern in patterns:
        for match in re.finditer(pattern, content, search_flags):
            matched_content = match.group(0).strip()
            # Ignore empty matches
            if matched_content:
                all_matches.append((match.start(), match.end(), segment_type, matched_content))

    # Earliest start first; for equal starts the longest match comes first so
    # that it is the one kept by the overlap filter below. The sort is stable,
    # so fully identical spans keep the pattern-list order.
    all_matches.sort(key=lambda m: (m[0], -m[1]))

    # Remove overlapping matches (a match is dropped if it starts inside a kept one)
    filtered_matches: List[Tuple[int, int, str, str]] = []
    last_end = 0
    for start, end, segment_type, segment_content in all_matches:
        if start >= last_end:
            filtered_matches.append((start, end, segment_type, segment_content))
            last_end = end

    # Build segments (special blocks + plain text between them)
    current_pos = 0
    for start, end, segment_type, segment_content in filtered_matches:
        # Plain text before special block
        if start > current_pos:
            text_between = content[current_pos:start].strip()
            if text_between and not marker_only.match(text_between):
                segments.append(('text', text_between))

        # Special block
        segments.append((segment_type, segment_content))
        current_pos = end

    # Plain text after last special block (an empty slice strips to "")
    remaining_text = content[current_pos:].strip()
    if remaining_text and not marker_only.match(remaining_text):
        segments.append(('text', remaining_text))

    return segments
192
+
193
+
194
def chunk_multi_sheet_content(
    sheets: List[Tuple[str, str]],
    metadata_block: Optional[str],
    analysis_block: str,
    chunk_size: int,
    chunk_overlap: int,
    chunk_plain_text_func,
    chunk_large_table_func,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    metadata_pattern: Optional[str] = None
) -> List[str]:
    """
    Chunk the content of several sheets.

    Every sheet is handled on its own: its content is broken into segments
    (tables, textbox/chart/image blocks, plain text) and each segment becomes
    one or more chunks. Each chunk built here starts with the metadata block,
    the analysis block and the sheet marker.

    Args:
        sheets: [(sheet_name, sheet_content), ...] list
        metadata_block: Metadata block (skipped when empty)
        analysis_block: Analysis block (skipped when empty)
        chunk_size: Chunk size
        chunk_overlap: Chunk overlap, applied to plain text only
        chunk_plain_text_func: Called as (text, chunk_size, chunk_overlap)
        chunk_large_table_func: Called as (table, chunk_size, 0, context_prefix=...)
        image_pattern: Custom image tag pattern (if None, uses default)
        chart_pattern: Custom chart block pattern (if None, uses default)
        metadata_pattern: Accepted but not used by this function

    Returns:
        List of chunks
    """
    # Header shared by every sheet: metadata and analysis, whichever are present
    shared_header = "\n\n".join(
        part for part in (metadata_block, analysis_block) if part
    )

    output: List[str] = []

    for sheet_name, sheet_content in sheets:
        # Use the marker found at the start of the content, or rebuild it from the name
        marker_match = re.match(r'(\[Sheet:\s*[^\]]+\])', sheet_content)
        if marker_match:
            sheet_marker = marker_match.group(1)
            body = sheet_content[marker_match.end():].strip()
        else:
            sheet_marker = f"[Sheet: {sheet_name}]"
            body = sheet_content

        # Per-sheet prefix = shared header (if any) + sheet marker
        if shared_header:
            context_prefix = "\n\n".join([shared_header, sheet_marker])
        else:
            context_prefix = sheet_marker
        prefix_len = len(context_prefix)

        segments = extract_content_segments(
            body,
            image_pattern=image_pattern,
            chart_pattern=chart_pattern
        )
        if not segments:
            # Nothing to emit for an empty sheet
            continue

        for segment_type, segment_content in segments:
            if not segment_content.strip():
                continue

            # NOTE(review): the check ignores the single "\n" that joins prefix and segment
            fits = prefix_len + len(segment_content) <= chunk_size

            if segment_type == 'table':
                if fits:
                    output.append(f"{context_prefix}\n{segment_content}")
                else:
                    # Large table: the splitter receives overlap 0 on purpose
                    output.extend(
                        chunk_large_table_func(
                            segment_content, chunk_size, 0,
                            context_prefix=context_prefix
                        )
                    )
                continue

            if segment_type in ('textbox', 'chart', 'image'):
                # Protected blocks are emitted whole even when they are too big
                if not fits:
                    logger.warning(f"{segment_type} block exceeds chunk_size, but keeping it intact")
                output.append(f"{context_prefix}\n{segment_content}")
                continue

            # Plain text
            if fits:
                output.append(f"{context_prefix}\n{segment_content}")
                continue

            # NOTE(review): the prefix is added after splitting, so these chunks
            # are longer than what chunk_plain_text_func produced
            for piece in chunk_plain_text_func(segment_content, chunk_size, chunk_overlap):
                output.append(f"{context_prefix}\n{piece}")

    logger.info(f"Multi-sheet content split into {len(output)} chunks")

    return output
306
+
307
+
308
def chunk_single_table_content(
    text: str,
    metadata_block: Optional[str],
    analysis_block: str,
    chunk_size: int,
    chunk_overlap: int,
    chunk_plain_text_func,
    chunk_large_table_func,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    metadata_pattern: Optional[str] = None
) -> List[str]:
    """
    Chunk text that consists of table content.

    Each HTML or Markdown table found in the text becomes one chunk when it
    fits together with the context, otherwise it is handed to the large-table
    splitter with overlap 0. When no table is found, the context is placed in
    front of the text and the whole thing goes through plain-text chunking.

    NOTE(review): when at least one table is found, only the tables are
    emitted; text outside the matched tables does not appear in the result.
    NOTE(review): in the no-table fallback the context is prepended once to the
    text before splitting, not to each resulting chunk.

    Args:
        text: Text containing table
        metadata_block: Metadata block (skipped when empty)
        analysis_block: Analysis block (skipped when empty)
        chunk_size: Chunk size
        chunk_overlap: Chunk overlap (only used by the no-table fallback)
        chunk_plain_text_func: Called as (text, chunk_size, chunk_overlap)
        chunk_large_table_func: Called as (table, chunk_size, 0, context_prefix=...)
        image_pattern: Accepted but not used by this function
        chart_pattern: Accepted but not used by this function
        metadata_pattern: Accepted but not used by this function

    Returns:
        List of chunks
    """
    # Context shared by the chunks built here
    context_prefix = "\n\n".join(
        part for part in (metadata_block, analysis_block) if part
    )

    # Candidate tables as (start, end, kind, content); HTML first, then Markdown
    candidates: List[Tuple[int, int, str, str]] = [
        (found.start(), found.end(), 'html', found.group(0))
        for found in re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE)
    ]
    for found in re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE):
        raw = found.group(0)
        # A match may begin with the newline preceding the table; skip over it
        begin = found.start() + 1 if raw.startswith('\n') else found.start()
        candidates.append((begin, found.end(), 'markdown', raw.strip()))

    # Walk candidates in start order and drop any that begin inside a kept one
    tables: List[Tuple[int, int, str, str]] = []
    cursor = 0
    for candidate in sorted(candidates, key=lambda item: item[0]):
        if candidate[0] < cursor:
            continue
        tables.append(candidate)
        cursor = candidate[1]

    if not tables:
        # No tables found - use plain text chunking
        body = f"{context_prefix}\n\n{text}" if context_prefix else text
        return chunk_plain_text_func(body, chunk_size, chunk_overlap)

    result: List[str] = []
    prefix_len = len(context_prefix)

    for _start, _end, table_kind, table_text in tables:
        table_len = len(table_text)

        logger.debug(f"Processing {table_kind} table: {table_len} chars")

        if table_len + prefix_len > chunk_size:
            # Large table: split with overlap 0; the splitter gets the context to include
            result.extend(
                chunk_large_table_func(
                    table_text, chunk_size, 0,
                    context_prefix=context_prefix
                )
            )
            continue

        # Small table: a single chunk, with the context in front when there is one
        result.append(f"{context_prefix}\n\n{table_text}" if context_prefix else table_text)

    logger.info(f"Single table content split into {len(result)} chunks")

    return result