xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,786 @@
1
+ # xgen_doc2chunk/chunking/chunking.py
2
+ """
3
+ Document Chunking Module - Advanced Text Chunking System
4
+
5
+ Main Features:
6
+ - HTML table-preserving chunking with row-level splitting
7
+ - Markdown table-preserving chunking with row-level splitting
8
+ - Intelligent splitting for large table data (CSV/TSV/Excel)
9
+ - Table structure restoration (header preservation for both HTML and Markdown)
10
+ - Page-based chunking
11
+ - Language-specific code file chunking
12
+
13
+ Key Improvements (Table Chunking Enhancement):
14
+ - Split large tables (HTML and Markdown) by rows to fit chunk_size
15
+ - Automatically restore table headers in each chunk
16
+ - Ensure table structure integrity
17
+ - Add chunk indexing metadata
18
+ - NO OVERLAP for table chunks (intentional to prevent data duplication)
19
+
20
+ Critical Rules for Table-Based Files (CSV, TSV, XLSX, XLS):
21
+ - Always use force_chunking=True
22
+ - Always split by rows (never cut in the middle of a row)
23
+ - Never apply overlap between table chunks
24
+ - Restore headers in each chunk for context
25
+
26
+ Refactoring:
27
+ - Core logic is separated into chunking_helper submodules
28
+ - This file maintains only the public API and integration logic
29
+ """
30
+ import bisect
31
+ import logging
32
+ import re
33
+ from typing import Any, Dict, List, Optional, Union
34
+
35
+ # Import from individual modules
36
+ from xgen_doc2chunk.chunking.constants import (
37
+ TABLE_SIZE_THRESHOLD_MULTIPLIER,
38
+ TABLE_BASED_FILE_TYPES,
39
+ HTML_TABLE_PATTERN,
40
+ MARKDOWN_TABLE_PATTERN,
41
+ )
42
+ from xgen_doc2chunk.chunking.table_chunker import (
43
+ chunk_large_table as _chunk_large_table,
44
+ chunk_large_markdown_table as _chunk_large_markdown_table,
45
+ is_markdown_table as _is_markdown_table,
46
+ )
47
+
48
+ from xgen_doc2chunk.chunking.protected_regions import (
49
+ find_protected_regions as _find_protected_regions,
50
+ get_protected_region_positions as _get_protected_region_positions,
51
+ split_with_protected_regions as _split_with_protected_regions,
52
+ )
53
+
54
+ from xgen_doc2chunk.chunking.page_chunker import (
55
+ chunk_by_pages as _chunk_by_pages,
56
+ )
57
+
58
+ from xgen_doc2chunk.chunking.text_chunker import (
59
+ chunk_plain_text as _chunk_plain_text,
60
+ chunk_text_without_tables,
61
+ chunk_with_row_protection,
62
+ clean_chunks as _clean_chunks,
63
+ reconstruct_text_from_chunks,
64
+ find_overlap_length,
65
+ )
66
+
67
+ from xgen_doc2chunk.chunking.sheet_processor import (
68
+ extract_document_metadata as _extract_document_metadata,
69
+ prepend_metadata_to_chunks as _prepend_metadata_to_chunks,
70
+ extract_sheet_sections as _extract_sheet_sections,
71
+ chunk_multi_sheet_content,
72
+ chunk_single_table_content,
73
+ )
74
+
75
+ logger = logging.getLogger("document-processor")
76
+
77
+
78
+ # ============================================================================
79
+ # Helper Functions for PageTagProcessor integration
80
+ # ============================================================================
81
+
82
def _get_page_marker_patterns(page_tag_processor: Optional[Any] = None) -> List[str]:
    """
    Return the regex patterns used to recognise page and slide markers.

    Args:
        page_tag_processor: Optional processor object. When given, its
            ``config`` and ``get_pattern_string`` are used to build the
            patterns; when None, the built-in marker formats are returned.

    Returns:
        List of regex pattern strings. With a processor this is the page
        pattern, followed by the slide pattern only when the configured slide
        prefix differs from the page tag prefix.
    """
    # Guard clause: no processor means the built-in marker formats apply.
    if page_tag_processor is None:
        return [
            r'\[Page Number:\s*(\d+)\]',
            r'\[Slide Number:\s*(\d+)\]',
        ]

    tag_config = page_tag_processor.config
    marker_patterns = [page_tag_processor.get_pattern_string()]

    # A dedicated slide pattern is only added when slides use their own prefix.
    if tag_config.slide_prefix != tag_config.tag_prefix:
        # Imported lazily, exactly where the enum member is needed.
        from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
        marker_patterns.append(page_tag_processor.get_pattern_string(PageTagType.SLIDE))

    return marker_patterns
109
+
110
+
111
def _get_sheet_marker_pattern(page_tag_processor: Optional[Any] = None) -> str:
    """
    Return the regex pattern used to recognise sheet markers.

    Args:
        page_tag_processor: Optional processor object. When given, its
            ``get_pattern_string(PageTagType.SHEET)`` result is returned.

    Returns:
        Regex pattern string for sheet markers; the built-in
        ``[Sheet: name]`` pattern when no processor is supplied.
    """
    if page_tag_processor is None:
        return r'\[Sheet:\s*([^\]]+)\]'

    # Imported lazily, only on the path that needs the enum.
    from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
    return page_tag_processor.get_pattern_string(PageTagType.SHEET)
126
+
127
+
128
def _get_image_tag_pattern(image_processor: Optional[Any] = None) -> str:
    """
    Return the regex pattern used to recognise image tags.

    Args:
        image_processor: Optional processor object. When given, the result of
            its ``get_pattern_string()`` is returned unchanged.

    Returns:
        Regex pattern string for image tags; the package default
        ``IMAGE_TAG_PATTERN`` constant when no processor is supplied.
    """
    if image_processor is None:
        # Fall back to the default declared in the chunking constants module.
        from xgen_doc2chunk.chunking.constants import IMAGE_TAG_PATTERN
        return IMAGE_TAG_PATTERN

    return image_processor.get_pattern_string()
144
+
145
+
146
def _get_chart_block_pattern(chart_processor: Optional[Any] = None) -> str:
    """
    Return the regex pattern used to recognise chart blocks.

    When a processor is supplied, the pattern is built from its
    ``config.tag_prefix`` and ``config.tag_suffix`` (both regex-escaped) with a
    non-greedy ``.*?`` between them. If those attributes cannot be read or
    escaped, the failure is logged and the package default is used instead, so
    a misconfigured processor never aborts chunking.

    Args:
        chart_processor: Optional processor object exposing
            ``config.tag_prefix`` and ``config.tag_suffix``.

    Returns:
        Regex pattern string for chart blocks; the package default
        ``CHART_BLOCK_PATTERN`` constant when no usable processor is supplied.
    """
    if chart_processor is not None:
        try:
            chart_config = chart_processor.config
            prefix = re.escape(chart_config.tag_prefix)
            suffix = re.escape(chart_config.tag_suffix)
        except Exception:
            # Deliberate best effort: keep the fallback, but make the
            # misconfiguration visible instead of swallowing it silently.
            logger.warning(
                "Could not build chart block pattern from %s; "
                "falling back to the default pattern",
                type(chart_processor).__name__,
                exc_info=True,
            )
        else:
            # NOTE(review): '.' only spans newlines if the consumer compiles
            # this pattern with re.DOTALL - confirm at the call sites.
            return f'{prefix}.*?{suffix}'

    # Default pattern comes from the chunking constants module.
    from xgen_doc2chunk.chunking.constants import CHART_BLOCK_PATTERN
    return CHART_BLOCK_PATTERN
167
+
168
+
169
def _get_metadata_block_pattern(metadata_formatter: Optional[Any] = None) -> str:
    """
    Return the regex pattern used to recognise metadata blocks.

    When a formatter is supplied, the pattern is built from its
    ``metadata_tag_prefix`` and ``metadata_tag_suffix`` (both regex-escaped)
    with a non-greedy ``.*?`` between them. If those attributes cannot be read
    or escaped, the failure is logged and the package default is used instead,
    so a misconfigured formatter never aborts chunking.

    Args:
        metadata_formatter: Optional formatter object exposing
            ``metadata_tag_prefix`` and ``metadata_tag_suffix``.

    Returns:
        Regex pattern string for metadata blocks; the package default
        ``METADATA_BLOCK_PATTERN`` constant when no usable formatter is
        supplied.
    """
    if metadata_formatter is not None:
        try:
            prefix = re.escape(metadata_formatter.metadata_tag_prefix)
            suffix = re.escape(metadata_formatter.metadata_tag_suffix)
        except Exception:
            # Deliberate best effort: keep the fallback, but make the
            # misconfiguration visible instead of swallowing it silently.
            logger.warning(
                "Could not build metadata block pattern from %s; "
                "falling back to the default pattern",
                type(metadata_formatter).__name__,
                exc_info=True,
            )
        else:
            # NOTE(review): '.' only spans newlines if the consumer compiles
            # this pattern with re.DOTALL - confirm at the call sites.
            return f'{prefix}.*?{suffix}'

    # Default pattern comes from the chunking constants module.
    from xgen_doc2chunk.chunking.constants import METADATA_BLOCK_PATTERN
    return METADATA_BLOCK_PATTERN
190
+
191
+
192
+ # ============================================================================
193
+ # Public API - Single entry point for external use
194
+ # ============================================================================
195
+
196
def create_chunks(
    text: str,
    file_extension: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    force_chunking: bool = False,
    include_position_metadata: bool = True,
    chunking_strategy: str = "recursive",
    page_tag_processor: Optional[Any] = None,
    image_processor: Optional[Any] = None,
    chart_processor: Optional[Any] = None,
    metadata_formatter: Optional[Any] = None,
    stride: Optional[int] = None,
    parent_chunk_size: Optional[int] = None,
    child_chunk_size: Optional[int] = None,
    **kwargs
) -> Union[List[str], List[Dict[str, Any]]]:
    """
    Split text into chunks, optionally annotated with position metadata.

    This is the single public entry point of the module. The splitting itself
    is delegated to ``_split_text``; this function only adds the optional
    per-chunk position bookkeeping.

    Args:
        text: Original text to split.
        file_extension: File extension of the source document; forwarded to
            the splitter and to the line-offset table builder.
        chunk_size: Maximum chunk size.
        chunk_overlap: Overlap size between chunks (not applied to protected
            regions, per the splitter's contract).
        force_chunking: Force chunking (disable table protection).
        include_position_metadata: When True, return a list of dicts with
            position metadata; when False, return only the chunk strings.
        chunking_strategy: Only "recursive" is implemented. Any other value
            logs a warning and falls back to "recursive".
        page_tag_processor: Optional PageTagProcessor supplying custom
            page/slide/sheet tag patterns; defaults are used when None.
        image_processor: Optional ImageProcessor supplying a custom image tag
            pattern; the default is used when None.
        chart_processor: Optional ChartProcessor supplying custom chart block
            tags; the default is used when None.
        metadata_formatter: Optional MetadataFormatter supplying custom
            metadata block tags; the default is used when None.
        stride: Reserved for a future sliding-window strategy; not read here.
        parent_chunk_size: Reserved for a future hierarchical strategy; not
            read here.
        child_chunk_size: Reserved for a future hierarchical strategy; not
            read here.
        **kwargs: Accepted for forward compatibility; not read here.

    Returns:
        When include_position_metadata is True, a list of dicts with the keys
        "text", "page_number", "line_start", "line_end", "global_start",
        "global_end" and "chunk_index". Otherwise the list of chunk strings
        produced by the splitter.
    """
    # TODO: Implement the other strategies selected by chunking_strategy.
    if chunking_strategy != "recursive":
        logger.warning(
            f"Chunking strategy '{chunking_strategy}' is not yet implemented, "
            "falling back to 'recursive'"
        )

    chunks = _split_text(
        text,
        chunk_size,
        chunk_overlap,
        file_extension=file_extension,
        force_chunking=force_chunking,
        page_tag_processor=page_tag_processor,
        image_processor=image_processor,
        chart_processor=chart_processor,
        metadata_formatter=metadata_formatter,
    )

    # Callers that only want the text get the splitter output untouched.
    if not include_position_metadata:
        return chunks

    # Positions are resolved against the text rebuilt from the chunks.
    rebuilt_text = reconstruct_text_from_chunks(chunks, chunk_overlap)
    line_table = _build_line_offset_table(rebuilt_text, file_extension, page_tag_processor)

    annotated: List[Dict[str, Any]] = []
    cursor = 0
    final_index = len(chunks) - 1

    for chunk_index, chunk_text in enumerate(chunks):
        chunk_length = len(chunk_text)
        first_offset = cursor
        last_offset = cursor + chunk_length - 1  # inclusive end offset

        first_line_idx = _find_line_index_by_pos(first_offset, line_table)
        last_line_idx = _find_line_index_by_pos(last_offset, line_table)

        first_entry = line_table[first_line_idx]
        line_start = first_entry["line_num"]
        line_end = line_table[last_line_idx]["line_num"]
        # The page is taken from the line on which the chunk starts.
        page_number = first_entry.get("page", 1)

        annotated.append({
            "text": chunk_text,
            "page_number": page_number,
            "line_start": line_start,
            "line_end": line_end,
            "global_start": first_offset,
            "global_end": last_offset,
            "chunk_index": chunk_index
        })

        # Advance past this chunk, then step back over whatever it shares
        # with the next chunk so the next start offset is not double-counted.
        cursor += chunk_length
        if chunk_index < final_index:
            cursor -= find_overlap_length(chunk_text, chunks[chunk_index + 1], chunk_overlap)

    logger.info(f"Created {len(annotated)} chunks with position metadata")
    return annotated
317
+
318
+
319
+ # ============================================================================
320
+ # Internal Functions - Table-based content processing
321
+ # ============================================================================
322
+
323
def _split_table_based_content(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    page_tag_processor: Optional[Any] = None,
    image_processor: Optional[Any] = None,
    chart_processor: Optional[Any] = None,
    metadata_formatter: Optional[Any] = None
) -> List[str]:
    """
    Chunk table-based content (CSV/TSV/Excel).

    The metadata block and an optional ``[Data Analysis]...[/Data Analysis]``
    block are separated from the text. The remainder is handed to the
    multi-sheet chunker when sheet sections are found, otherwise to the
    single-table chunker.

    CRITICAL: an overlap of 0 is always passed downstream, so table chunks
    have NO overlap (intentional, to prevent data duplication).

    Args:
        text: Full text (metadata + table).
        chunk_size: Maximum chunk size.
        chunk_overlap: Not used for tables (kept for API compatibility).
        page_tag_processor: Not read by this function (kept for API
            compatibility).
        image_processor: ImageProcessor for image tag patterns.
        chart_processor: ChartProcessor for chart block patterns.
        metadata_formatter: MetadataFormatter for metadata block patterns.

    Returns:
        List of chunks; ``[""]`` when the text is empty or whitespace-only.
    """
    if not text or not text.strip():
        return [""]

    # Resolve the metadata pattern once; it is needed both for extraction
    # here and for protected-region detection downstream.
    metadata_pattern = _get_metadata_block_pattern(metadata_formatter)
    metadata_block, text_without_metadata = _extract_document_metadata(text, metadata_pattern)

    # Pull out the data analysis block. Only the English
    # "[Data Analysis]" tag is matched by this pattern.
    analysis_pattern = r'(\[Data Analysis\].*?\[/Data Analysis\])\s*'
    analysis_match = re.search(analysis_pattern, text_without_metadata, re.DOTALL)

    if analysis_match:
        analysis_block = analysis_match.group(1)
        text_without_analysis = (
            text_without_metadata[:analysis_match.start()] +
            text_without_metadata[analysis_match.end():]
        ).strip()
    else:
        analysis_block = ""
        text_without_analysis = text_without_metadata

    # Check for multi-sheet (Excel) content.
    # NOTE(review): page_tag_processor is not forwarded here, so a custom
    # sheet tag configured on it may not be honoured - confirm against
    # _extract_sheet_sections.
    sheets = _extract_sheet_sections(text_without_analysis)

    # Patterns the downstream chunkers use for protected-region detection.
    pattern_kwargs = {
        "image_pattern": _get_image_tag_pattern(image_processor),
        "chart_pattern": _get_chart_block_pattern(chart_processor),
        "metadata_pattern": metadata_pattern,
    }

    if sheets:
        logger.info(f"Multi-sheet Excel detected: {len(sheets)} sheets")
        # Pass 0 for overlap since tables should not have overlap.
        return chunk_multi_sheet_content(
            sheets, metadata_block, analysis_block, chunk_size, 0,
            _chunk_plain_text, _chunk_table_unified,
            **pattern_kwargs
        )

    # Single table/sheet processing, again with an overlap of 0.
    return chunk_single_table_content(
        text_without_analysis, metadata_block, analysis_block, chunk_size, 0,
        _chunk_plain_text, _chunk_table_unified,
        **pattern_kwargs
    )
406
+
407
+
408
def _chunk_table_unified(table_text: str, chunk_size: int, chunk_overlap: int, context_prefix: str = "") -> List[str]:
    """
    Chunk a table with the chunker that matches its format.

    The text is passed to the Markdown table chunker when
    ``_is_markdown_table`` reports a Markdown table, and to the other table
    chunker otherwise. An overlap of 0 is always passed on.

    Args:
        table_text: Table content (HTML or Markdown).
        chunk_size: Maximum chunk size.
        chunk_overlap: Ignored (tables have no overlap).
        context_prefix: Context to prepend to each chunk.

    Returns:
        List of table chunks.
    """
    # Choose the chunker first, then make the single shared call.
    table_chunker = (
        _chunk_large_markdown_table
        if _is_markdown_table(table_text)
        else _chunk_large_table
    )
    return table_chunker(table_text, chunk_size, 0, context_prefix)
427
+
428
+
429
+ def _split_text(
430
+ text: str,
431
+ chunk_size: int,
432
+ chunk_overlap: int,
433
+ file_extension: Optional[str] = None,
434
+ force_chunking: Optional[bool] = False,
435
+ page_tag_processor: Optional[Any] = None,
436
+ image_processor: Optional[Any] = None,
437
+ chart_processor: Optional[Any] = None,
438
+ metadata_formatter: Optional[Any] = None
439
+ ) -> List[str]:
440
+ """
441
+ Split text into chunks. (Internal use)
442
+
443
+ Preserves HTML and Markdown tables with proper row-level chunking.
444
+ Considers page boundaries for chunking.
445
+ Protects all tag regions (image, page, slide, chart, metadata) with NO overlap.
446
+
447
+ Core Strategy:
448
+ 1. Apply table-based chunking if file_extension is CSV/TSV/Excel (NO overlap for tables)
449
+ 2. Apply page-based chunking first if page markers exist
450
+ 3. Merge pages based on chunk_size (allow up to 1.5x)
451
+ 4. Never cut in the middle of a table or protected tag
452
+ 5. Apply overlap ONLY for plain text (NOT for protected regions)
453
+
454
+ Protected Regions (NEVER split or overlap):
455
+ - Image tags, Page/Slide/Sheet tags, Chart blocks, Metadata blocks
456
+ - Tables (split by rows with NO overlap)
457
+
458
+ Args:
459
+ text: Original text
460
+ chunk_size: Maximum chunk size
461
+ chunk_overlap: Overlap size between chunks (NOT applied to protected regions)
462
+ file_extension: File extension (csv, xlsx, pdf, etc.) - used for table-based processing
463
+ force_chunking: Force chunking (disable table protection except for table-based files)
464
+ page_tag_processor: PageTagProcessor instance for custom tag patterns
465
+ image_processor: ImageProcessor instance for custom image tag patterns
466
+ chart_processor: ChartProcessor instance for custom chart tag patterns
467
+ metadata_formatter: MetadataFormatter instance for custom metadata tag patterns
468
+
469
+ Returns:
470
+ List of chunks
471
+ """
472
+ if not text or not text.strip():
473
+ logger.warning("Empty text provided for chunking")
474
+ return [""]
475
+
476
+ # === Check for table-based content (CSV/Excel files only) ===
477
+ # Explicitly determine based on file_extension (no text content guessing)
478
+ is_table_based = file_extension and file_extension.lower() in TABLE_BASED_FILE_TYPES
479
+
480
+ # Disable table protection if is_table_based or force_chunking is True
481
+ disable_table_protection = is_table_based or force_chunking
482
+
483
+ if is_table_based:
484
+ # For table-based files (CSV/Excel), always use table-based chunking
485
+ # This handles both HTML tables and Markdown tables properly
486
+ logger.info(f"Table-based file detected ({file_extension}), using table-based chunking")
487
+ return _split_table_based_content(
488
+ text, chunk_size, chunk_overlap,
489
+ page_tag_processor=page_tag_processor,
490
+ image_processor=image_processor,
491
+ chart_processor=chart_processor,
492
+ metadata_formatter=metadata_formatter
493
+ )
494
+
495
+ # Get tag patterns from processors or use defaults (needed for metadata extraction)
496
+ metadata_pattern = _get_metadata_block_pattern(metadata_formatter)
497
+
498
+ # Extract metadata using custom pattern
499
+ metadata_block, text_without_metadata = _extract_document_metadata(text, metadata_pattern)
500
+ text = text_without_metadata
501
+
502
+ # === Check for page markers ===
503
+ # Build patterns from PageTagProcessor or use defaults
504
+ page_marker_patterns = _get_page_marker_patterns(page_tag_processor)
505
+ has_page_markers = any(re.search(pattern, text) for pattern in page_marker_patterns)
506
+
507
+ # Get remaining tag patterns from processors or use defaults
508
+ image_pattern = _get_image_tag_pattern(image_processor)
509
+ chart_pattern = _get_chart_block_pattern(chart_processor)
510
+
511
+ if has_page_markers:
512
+ # Page-based chunking
513
+ logger.debug("Page markers found, using page-based chunking")
514
+ chunks = _chunk_by_pages(
515
+ text, chunk_size, chunk_overlap, is_table_based, force_chunking,
516
+ page_tag_processor, image_pattern, chart_pattern, metadata_pattern
517
+ )
518
+ else:
519
+ # Find protected regions (HTML tables, chart blocks, Markdown tables, all tags)
520
+ # Disable table protection on force_chunking (other regions are always protected)
521
+ protected_regions = _find_protected_regions(
522
+ text, is_table_based, force_chunking, image_pattern,
523
+ chart_pattern, page_tag_processor, metadata_pattern
524
+ )
525
+ protected_positions = _get_protected_region_positions(protected_regions)
526
+
527
+ if protected_positions:
528
+ region_types = set(r[2] for r in protected_regions)
529
+ logger.info(f"Found {len(protected_positions)} protected regions in document: {region_types}")
530
+ chunks = _split_with_protected_regions(
531
+ text, protected_positions, chunk_size, chunk_overlap, force_chunking,
532
+ image_pattern, chart_pattern, page_tag_processor, metadata_pattern
533
+ )
534
+ else:
535
+ # No protected regions: apply row-level chunking if force_chunking
536
+ if disable_table_protection:
537
+ logger.debug("Force chunking enabled, using row-preserving chunking")
538
+ chunks = _chunk_with_row_protection(text, chunk_size, chunk_overlap, force_chunking)
539
+ else:
540
+ logger.debug("No protected blocks found, using standard chunking")
541
+ return _chunk_text_without_tables(text, chunk_size, chunk_overlap, metadata_block, page_tag_processor)
542
+
543
+ # Clean chunks
544
+ cleaned_chunks = _clean_chunks(chunks, page_tag_processor)
545
+
546
+ # Add metadata
547
+ cleaned_chunks = _prepend_metadata_to_chunks(cleaned_chunks, metadata_block)
548
+
549
+ logger.info(f"Final text split into {len(cleaned_chunks)} chunks")
550
+
551
+ return cleaned_chunks
552
+
553
+ # ============================================================================
554
+ # Internal Wrapper Functions
555
+ # ============================================================================
556
+
557
def _chunk_text_without_tables(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    metadata: Optional[str],
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Chunk text that goes through the "no tables" code path.

    Thin adapter around chunk_text_without_tables: every argument is
    forwarded positionally and unchanged, with _prepend_metadata_to_chunks
    inserted as the fifth positional argument (presumably the callback the
    helper uses to attach the metadata block -- confirm against the helper).

    Args:
        text: Text to chunk
        chunk_size: Chunk size forwarded to the helper
        chunk_overlap: Chunk overlap forwarded to the helper
        metadata: Metadata block forwarded to the helper (may be None)
        page_tag_processor: Optional processor forwarded to the helper

    Returns:
        The value returned by chunk_text_without_tables
    """
    chunks = chunk_text_without_tables(
        text,
        chunk_size,
        chunk_overlap,
        metadata,
        _prepend_metadata_to_chunks,
        page_tag_processor,
    )
    return chunks
573
+
574
+
575
def _chunk_with_row_protection(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    force_chunking: bool = False
) -> List[str]:
    """
    Chunk text through the row-preserving path used when table protection
    is disabled.

    Thin adapter around chunk_with_row_protection. It hands that helper two
    callables: a closure over _split_with_protected_regions that pins the
    caller's force_chunking flag, and _chunk_table_unified.

    Args:
        text: Text to chunk
        chunk_size: Chunk size forwarded to the helper
        chunk_overlap: Chunk overlap forwarded to the helper
        force_chunking: Flag bound into the region-splitting callback

    Returns:
        The value returned by chunk_with_row_protection
    """
    def _split_regions_with_flag(text, regions, chunk_size, chunk_overlap):
        # The helper invokes this callback with four arguments only; the
        # enclosing call's force_chunking is appended as the fifth.
        return _split_with_protected_regions(
            text, regions, chunk_size, chunk_overlap, force_chunking
        )

    return chunk_with_row_protection(
        text,
        chunk_size,
        chunk_overlap,
        _split_regions_with_flag,
        _chunk_table_unified,
    )
596
+
597
+
598
+ # ============================================================================
599
+ # Internal Functions - Page/Line Mapping
600
+ # ============================================================================
601
+
602
+ def _extract_page_mapping(
603
+ text: str,
604
+ file_extension: str,
605
+ page_tag_processor: Optional[Any] = None
606
+ ) -> List[Dict[str, Any]]:
607
+ """
608
+ Extract page/slide mapping information from text.
609
+
610
+ Recognizes page markers for various file formats:
611
+ - PDF/PPT/DOCX: Page/slide markers
612
+ - Excel: Sheet markers
613
+ - Others: Line-based estimation
614
+
615
+ Args:
616
+ text: Original text
617
+ file_extension: File extension
618
+ page_tag_processor: PageTagProcessor instance for custom patterns
619
+
620
+ Returns:
621
+ Page mapping list [{"page_num": int, "start_pos": int, "end_pos": int, ...}, ...]
622
+ """
623
+ try:
624
+ page_mapping: List[Dict[str, Any]] = []
625
+ ext_lower = file_extension.lower() if file_extension else ""
626
+
627
+ if ext_lower in ['pdf', 'ppt', 'pptx', 'doc', 'docx']:
628
+ # Build patterns from PageTagProcessor or use defaults
629
+ patterns = _get_page_marker_patterns(page_tag_processor)
630
+ # Add OCR variants
631
+ ocr_patterns = []
632
+ for p in patterns:
633
+ # Add (OCR) and (OCR+Ref) variants
634
+ base_pattern = p.rstrip(']').rstrip(')')
635
+ if base_pattern.endswith('\\d+'):
636
+ ocr_patterns.append(p[:-1] + r'\s*\(OCR\)\]')
637
+ ocr_patterns.append(p[:-1] + r'\s*\(OCR\+Ref\)\]')
638
+ patterns.extend(ocr_patterns)
639
+
640
+ for pattern in patterns:
641
+ matches = list(re.finditer(pattern, text))
642
+ if matches:
643
+ for i, match in enumerate(matches):
644
+ page_num = int(match.group(1))
645
+ start = match.end()
646
+ end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
647
+ page_mapping.append({
648
+ "page_num": page_num,
649
+ "start_pos": start,
650
+ "end_pos": end
651
+ })
652
+ page_mapping.sort(key=lambda x: x["page_num"])
653
+ break
654
+
655
+ # Estimate pages for doc/docx if no markers found
656
+ if not page_mapping and ext_lower in ['doc', 'docx']:
657
+ chars_per_page = 1500
658
+ text_len = len(text)
659
+ if text_len > chars_per_page:
660
+ estimated_pages = (text_len + chars_per_page - 1) // chars_per_page
661
+ for page_num in range(1, estimated_pages + 1):
662
+ start = (page_num - 1) * chars_per_page
663
+ end = min(page_num * chars_per_page, text_len)
664
+ page_mapping.append({
665
+ "page_num": page_num,
666
+ "start_pos": start,
667
+ "end_pos": end
668
+ })
669
+
670
+ if not page_mapping:
671
+ page_mapping = [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]
672
+
673
+ elif ext_lower in ['xlsx', 'xls']:
674
+ # Build sheet pattern from PageTagProcessor or use default
675
+ sheet_pattern = _get_sheet_marker_pattern(page_tag_processor)
676
+ matches = list(re.finditer(sheet_pattern, text))
677
+
678
+ if matches:
679
+ for i, match in enumerate(matches):
680
+ start = match.end()
681
+ end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
682
+ page_mapping.append({
683
+ "page_num": i + 1,
684
+ "start_pos": start,
685
+ "end_pos": end,
686
+ "sheet_name": match.group(1).strip()
687
+ })
688
+ else:
689
+ page_mapping = [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]
690
+
691
+ else:
692
+ # Line-based estimation for other file types
693
+ lines = text.split('\n')
694
+ lines_per_page = 1000
695
+
696
+ if len(lines) > lines_per_page:
697
+ page_count = (len(lines) + lines_per_page - 1) // lines_per_page
698
+ current_pos = 0
699
+
700
+ for page_num in range(1, page_count + 1):
701
+ start_line = (page_num - 1) * lines_per_page
702
+ end_line = min(page_num * lines_per_page, len(lines))
703
+ page_text = '\n'.join(lines[start_line:end_line])
704
+ start = current_pos
705
+ end = current_pos + len(page_text)
706
+ page_mapping.append({
707
+ "page_num": page_num,
708
+ "start_pos": start,
709
+ "end_pos": end
710
+ })
711
+ current_pos = end + 1
712
+ else:
713
+ page_mapping = [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]
714
+
715
+ return page_mapping
716
+
717
+ except Exception:
718
+ return [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]
719
+
720
+
721
+ def _find_line_index_by_pos(pos: int, line_table: List[Dict[str, int]]) -> int:
722
+ """
723
+ Find the line index corresponding to the given position.
724
+
725
+ Args:
726
+ pos: Position in text
727
+ line_table: Line offset table
728
+
729
+ Returns:
730
+ Line index (0-based)
731
+ """
732
+ try:
733
+ if not line_table:
734
+ return 0
735
+ starts = [line["start"] for line in line_table]
736
+ idx = bisect.bisect_right(starts, pos) - 1
737
+ return 0 if idx < 0 else min(idx, len(line_table) - 1)
738
+ except Exception:
739
+ return 0
740
+
741
+
742
+ def _build_line_offset_table(
743
+ text: str,
744
+ file_extension: str,
745
+ page_tag_processor: Optional[Any] = None
746
+ ) -> List[Dict[str, int]]:
747
+ """
748
+ Build an offset table for each line in the text.
749
+
750
+ Args:
751
+ text: Original text
752
+ file_extension: File extension
753
+ page_tag_processor: PageTagProcessor instance for custom patterns
754
+
755
+ Returns:
756
+ Line offset table [{"line_num": int, "start": int, "end": int, "page": int}, ...]
757
+ """
758
+ try:
759
+ lines = text.split('\n')
760
+ table: List[Dict[str, int]] = []
761
+ pos = 0
762
+ page_mapping = _extract_page_mapping(text, file_extension, page_tag_processor)
763
+
764
+ def _page_for_pos(p: int) -> int:
765
+ for info in page_mapping:
766
+ if info["start_pos"] <= p < info["end_pos"]:
767
+ return info["page_num"]
768
+ return 1
769
+
770
+ for i, line in enumerate(lines):
771
+ start = pos
772
+ end = pos + len(line)
773
+ mid = start + max(0, (end - start) // 2)
774
+ page = _page_for_pos(mid)
775
+ table.append({
776
+ "line_num": i + 1,
777
+ "start": start,
778
+ "end": end,
779
+ "page": page
780
+ })
781
+ pos = end + 1
782
+
783
+ return table
784
+
785
+ except Exception:
786
+ return [{"line_num": 1, "start": 0, "end": len(text), "page": 1}]