xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,134 @@
1
+ # xgen_doc2chunk/chunking/constants.py
2
+ """
3
+ Chunking Module Constants - Definition of constants, patterns, and dataclasses for chunking
4
+
5
+ This module defines all constants and data structures used throughout the chunking system.
6
+ """
7
+ import logging
8
+ from dataclasses import dataclass
9
+ from typing import List
10
+ from langchain_text_splitters import Language
11
+
12
+ logger = logging.getLogger("document-processor")
13
+
14
+
15
+ # ============================================================================
16
+ # Code Language Mapping
17
+ # ============================================================================
18
+
19
# Lookup table: source-file extension (no leading dot) -> ``Language`` member
# from ``langchain_text_splitters``.
# Several extensions deliberately share a member in this table:
# 'c' reuses CPP, 'jsx' reuses JS and 'tsx' reuses TS.
LANGCHAIN_CODE_LANGUAGE_MAP = {
    'py': Language.PYTHON,
    'js': Language.JS,
    'ts': Language.TS,
    'java': Language.JAVA,
    'cpp': Language.CPP,
    'c': Language.CPP,
    'cs': Language.CSHARP,
    'go': Language.GO,
    'rs': Language.RUST,
    'php': Language.PHP,
    'rb': Language.RUBY,
    'swift': Language.SWIFT,
    'kt': Language.KOTLIN,
    'scala': Language.SCALA,
    'html': Language.HTML,
    'jsx': Language.JS,
    'tsx': Language.TS,
}
27
+
28
+
29
+ # ============================================================================
30
+ # Protected Region Patterns (Blocks that should not be split during chunking)
31
+ # ============================================================================
32
+
33
# All values below are regex *source strings*; nothing is compiled here, so
# flags (e.g. re.DOTALL so that `.` spans newlines inside a block, or
# re.MULTILINE for the `^`/`$` anchored patterns) are up to the caller.

# --- Block-level regions ----------------------------------------------------

# HTML table: from an opening <table ...> (any attributes) to the nearest </table>.
HTML_TABLE_PATTERN: str = r'<table[^>]*>.*?</table>'

# Chart block in its default form [chart]...[/chart].
# Always protected; the tag format can be customized via ChartProcessor.
CHART_BLOCK_PATTERN: str = r'\[chart\].*?\[/chart\]'

# Textbox block [textbox]...[/textbox]. Always protected.
TEXTBOX_BLOCK_PATTERN: str = r'\[textbox\].*?\[/textbox\]'

# Image tag. Always protected.
# Accepts e.g. [image:path], [Image: {path}], [image : path]:
# the word "image" is case-insensitive, whitespace around ':' is allowed,
# and the path may optionally be wrapped in {}.
IMAGE_TAG_PATTERN: str = r'\[(?i:image)\s*:\s*\{?[^\]\}]+\}?\]'

# --- Page / slide / sheet tags (always protected, NEVER overlapped) ---------

# Default tag formats produced by PageTagProcessor.
# Note: these patterns contain no capturing group for the number.
PAGE_TAG_PATTERN: str = r'\[Page Number:\s*\d+\]'
SLIDE_TAG_PATTERN: str = r'\[Slide Number:\s*\d+\]'
SHEET_TAG_PATTERN: str = r'\[Sheet:\s*[^\]]+\]'

# OCR variants of the page/slide tags: the number is followed by
# "(OCR)" or "(OCR+Ref)".
PAGE_TAG_OCR_PATTERN: str = r'\[Page Number:\s*\d+\s*\(OCR(?:\+Ref)?\)\]'
SLIDE_TAG_OCR_PATTERN: str = r'\[Slide Number:\s*\d+\s*\(OCR(?:\+Ref)?\)\]'

# --- Metadata and analysis blocks -------------------------------------------

# Document metadata block in its default form
# <Document-Metadata>...</Document-Metadata>. Always protected, NEVER overlapped.
# The tag format can be customized via MetadataFormatter.
METADATA_BLOCK_PATTERN: str = r'<Document-Metadata>.*?</Document-Metadata>'

# Data analysis block, with either the English or the Korean tag name.
# Always protected.
DATA_ANALYSIS_PATTERN: str = r'\[(?:Data Analysis|데이터 분석)\].*?\[/(?:Data Analysis|데이터 분석)\]'

# --- Markdown tables --------------------------------------------------------

# Whole Markdown table: a header row, a separator row (|---|---|) and one or
# more body rows, each starting and ending with '|'. Group 1 is the table text
# without the newline that may precede it.
MARKDOWN_TABLE_PATTERN: str = r'(?:^|\n)(\|[^\n]+\|\n\|[-:|\s]+\|\n(?:\|[^\n]+\|(?:\n|$))+)'

# A single Markdown table row (used for row-level protection).
MARKDOWN_TABLE_ROW_PATTERN: str = r'\|[^\n]+\|'

# Header separator row, e.g. |---|---| or |:---:|---|.
MARKDOWN_TABLE_SEPARATOR_PATTERN: str = r'^\|[\s\-:]+\|[\s\-:|]*$'

# Header detection: group 1 is the first row (with its newline),
# group 2 is the separator row that follows it.
MARKDOWN_TABLE_HEADER_PATTERN: str = r'^(\|[^\n]+\|\n)(\|[-:|\s]+\|)'
76
+
77
+
78
+ # ============================================================================
79
+ # Table Chunking Related Constants
80
+ # ============================================================================
81
+
82
# Character budgets reserved for markup when estimating the size of a table
# chunk. The values are fixed allowances, not exact lengths of the markup.

# Wrapper around a whole table (table tags, line breaks, etc.),
# e.g. <table border='1'>\n</table>
TABLE_WRAPPER_OVERHEAD: int = 30

# Minimum allowance per row, e.g. <tr>\n</tr>
ROW_OVERHEAD: int = 12

# Allowance per cell, e.g. <td></td> or <th></th>
CELL_OVERHEAD: int = 10

# Allowance for the chunk index line, e.g. "[Table chunk 1/10]\n"
CHUNK_INDEX_OVERHEAD: int = 30

# A table becomes subject to splitting once it is larger than
# chunk_size multiplied by this factor (1.2x of chunk_size).
TABLE_SIZE_THRESHOLD_MULTIPLIER: float = 1.2

# File types treated as table-based (CSV, TSV, Excel).
TABLE_BASED_FILE_TYPES: set = {
    'csv',
    'tsv',
    'xlsx',
    'xls',
}
99
+
100
+
101
+ # ============================================================================
102
+ # Dataclasses
103
+ # ============================================================================
104
+
105
@dataclass
class TableRow:
    """
    One row of a table, kept as raw markup (HTML or Markdown).

    Attributes:
        html: Raw row content. Despite the name it may hold either HTML
            or Markdown.
        is_header: Header-row flag.
        cell_count: Cell count recorded for the row.
        char_length: Character length recorded for the row (presumably
            ``len(html)``; this class does not enforce it).
    """
    html: str
    is_header: bool
    cell_count: int
    char_length: int
112
+
113
+
114
@dataclass
class ParsedTable:
    """
    Result of parsing an HTML table.

    Attributes:
        header_rows: Header rows.
        data_rows: Data rows.
        total_cols: Total number of columns.
        original_html: The original HTML of the table.
        header_html: Header HTML, kept so it can be reused.
        header_size: Header size in characters.
    """
    header_rows: List[TableRow]
    data_rows: List[TableRow]
    total_cols: int
    original_html: str
    header_html: str
    header_size: int
123
+
124
+
125
@dataclass
class ParsedMarkdownTable:
    """
    Result of parsing a Markdown table.

    Attributes:
        header_row: Header row (the first row, holding the column names).
        separator_row: Separator row (``|---|---|``).
        data_rows: Data rows.
        total_cols: Total number of columns.
        original_text: The original Markdown text of the table.
        header_text: Header plus separator, kept so it can be reused.
        header_size: Header size in characters.
    """
    header_row: str
    separator_row: str
    data_rows: List[str]
    total_cols: int
    original_text: str
    header_text: str
    header_size: int
@@ -0,0 +1,248 @@
1
+ # xgen_doc2chunk/chunking/page_chunker.py
2
+ """
3
+ Page Chunker - Page-based chunking
4
+
5
+ Main Features:
6
+ - Split text by pages
7
+ - Page merging and chunking
8
+ - Overlap handling
9
+ - Table protection (HTML and Markdown) with NO overlap for tables
10
+ """
11
+ import logging
12
+ import re
13
+ from typing import List, Optional, Tuple
14
+
15
+ from xgen_doc2chunk.chunking.protected_regions import (
16
+ find_protected_regions, get_protected_region_positions,
17
+ ensure_protected_region_integrity, split_large_chunk_with_protected_regions
18
+ )
19
+
20
+ logger = logging.getLogger("document-processor")
21
+
22
+
23
def _page_number_from_marker(match, ordinal: int) -> int:
    """Return the page number carried by a marker match, or ``ordinal`` if it has none."""
    if match.re.groups:
        try:
            # Preferred source: the first capturing group of the marker pattern.
            return int(match.group(1))
        except (TypeError, ValueError):
            # Group did not participate (None) or is not numeric.
            pass

    # Pattern has no usable group: take the first run of digits in the marker
    # text, and only then fall back to the marker's 1-based position.
    digits = re.search(r'\d+', match.group(0))
    return int(digits.group()) if digits else ordinal


def split_into_pages(text: str, page_marker_pattern: str) -> List[Tuple[int, str]]:
    """
    Split text into pages at every match of ``page_marker_pattern``.

    Each page runs from its marker (inclusive) up to the next marker or the
    end of the text, with surrounding whitespace stripped. Pages that hold
    nothing but page markers are dropped. Non-blank text in front of the
    first marker is returned as page 0.

    Args:
        text: Text containing page markers.
        page_marker_pattern: Regex for a page marker. The page number is read
            from capturing group 1. If the pattern has no capturing group, or
            the group is not an integer, the first digits found in the marker
            are used, and failing that the 1-based position of the marker.

    Returns:
        [(page_num, page_content), ...] list in document order; an empty list
        when the text contains no marker at all.
    """
    # Compile once: the pattern is used both for locating and for stripping markers.
    marker_re = re.compile(page_marker_pattern)
    markers = list(marker_re.finditer(text))

    if not markers:
        return []

    pages: List[Tuple[int, str]] = []

    # Content before the first page marker becomes page 0
    before_content = text[:markers[0].start()].strip()
    if before_content:
        pages.append((0, before_content))

    # Each page ends where the next marker starts; the last one ends with the text.
    page_ends = [marker.start() for marker in markers[1:]] + [len(text)]

    for ordinal, (match, end) in enumerate(zip(markers, page_ends), start=1):
        page_num = _page_number_from_marker(match, ordinal)

        # Page content (including marker)
        page_content = text[match.start():end].strip()
        if not page_content:
            continue

        # Empty page check: nothing is left once the markers are removed
        if marker_re.sub('', page_content).strip():
            pages.append((page_num, page_content))
        else:
            logger.debug("Skipping empty page %s", page_num)

    return pages
70
+
71
+
72
def merge_pages(pages: List[Tuple[int, str]]) -> str:
    """
    Concatenate the contents of ``pages`` into one string.

    Page numbers are ignored; consecutive page contents are separated by a
    blank line (two newline characters). An empty list yields an empty string.
    """
    page_texts = [page_text for _page_num, page_text in pages]
    separator = '\n\n'
    return separator.join(page_texts)
77
+
78
+
79
def get_overlap_content(pages: List[Tuple[int, str]], overlap_size: int) -> str:
    """
    Extract up to ``overlap_size`` trailing characters from the last page.

    Args:
        pages: [(page_num, page_content), ...] list; only the last entry is used.
        overlap_size: Maximum number of characters to return.

    Returns:
        The tail of the last page's content (the whole content if it is not
        longer than ``overlap_size``). An empty string when ``pages`` is empty
        or ``overlap_size`` is not positive.
    """
    # Guard non-positive sizes explicitly: ``s[-0:]`` is the WHOLE string, so
    # an overlap of 0 would otherwise duplicate the entire last page.
    if not pages or overlap_size <= 0:
        return ""

    _, last_content = pages[-1]

    # A negative start index larger than the string simply yields the full string.
    return last_content[-overlap_size:]
91
+
92
+
93
def _finalize_page_chunk(chunk_pages: List[Tuple[int, str]]) -> str:
    """Merge the collected pages and pass the result through the protected-region integrity check."""
    return ensure_protected_region_integrity(merge_pages(chunk_pages))


def chunk_by_pages(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    is_table_based: bool = False,
    force_chunking: bool = False,
    page_tag_processor = None,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    metadata_pattern: Optional[str] = None
) -> List[str]:
    """
    Page-based text chunking.

    Algorithm:
    1. Split text by pages, using the first marker pattern that yields pages.
       If none does, fall back to plain text chunking.
    2. Merge pages sequentially (pages are joined with a blank line).
    3. While the merged size stays <= chunk_size, keep merging.
    4. If adding a page exceeds chunk_size:
       - up to 1.5x chunk_size the page is still added and the chunk is
         closed; the tail of that page (chunk_overlap characters) is
         prepended to the next page as overlap
       - beyond 1.5x the current chunk is closed WITHOUT the page and the
         page starts the next chunk (no overlap is carried in this case)
    5. Every closed chunk is passed through ensure_protected_region_integrity.
    6. Chunks longer than 1.5 * (1.5 * chunk_size) are split further with
       split_large_chunk_with_protected_regions.

    Args:
        text: Original text
        chunk_size: Target chunk size (a merged chunk may reach 1.5x of it)
        chunk_overlap: Overlap size between chunks; values <= 0 disable overlap
        is_table_based: Whether the file is table-based (forwarded to the
            large-chunk splitter)
        force_chunking: Force chunking (forwarded to the large-chunk splitter)
        page_tag_processor: PageTagProcessor instance for custom patterns;
            when None the default "[Page Number: N]" / "[Slide Number: N]"
            markers are used
        image_pattern: Custom regex pattern for image tags (forwarded)
        chart_pattern: Custom regex pattern for chart blocks (forwarded)
        metadata_pattern: Custom regex pattern for metadata blocks (forwarded)

    Returns:
        List of chunk strings in document order.
    """
    # Build page marker patterns from PageTagProcessor or use defaults
    if page_tag_processor is not None:
        page_marker_patterns = [
            page_tag_processor.get_pattern_string(),  # Page pattern
        ]
        config = page_tag_processor.config
        # A separate slide pattern is only needed when its prefix differs
        if config.slide_prefix != config.tag_prefix:
            from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
            page_marker_patterns.append(page_tag_processor.get_pattern_string(PageTagType.SLIDE))
    else:
        page_marker_patterns = [
            r'\[Page Number:\s*(\d+)\]',   # Default page format
            r'\[Slide Number:\s*(\d+)\]',  # Default slide format
        ]

    # Use the first pattern that produces any page
    pages = []
    for page_marker_pattern in page_marker_patterns:
        pages = split_into_pages(text, page_marker_pattern)
        if pages:
            break

    if not pages:
        # Page split failed, fall back to plain text chunking
        from .text_chunker import chunk_plain_text
        return chunk_plain_text(text, chunk_size, chunk_overlap)

    logger.debug("Split into %d pages", len(pages))

    # merge_pages joins pages with '\n\n', which is 2 characters (not 4)
    separator_len = len('\n\n')
    max_size = int(chunk_size * 1.5)  # Allow up to 1.5x

    chunks = []
    current_chunk_pages = []  # Pages included in current chunk
    current_size = 0
    pending_overlap = ""      # Overlap content to prepend to next chunk

    for page_num, page_content in pages:
        # Apply pending overlap to page content
        if pending_overlap:
            page_content = pending_overlap + "\n\n" + page_content
            pending_overlap = ""
        page_size = len(page_content)

        if not current_chunk_pages:
            # First page of a new chunk
            current_chunk_pages.append((page_num, page_content))
            current_size = page_size
            continue

        # Size of the chunk if this page were merged into it
        potential_size = current_size + separator_len + page_size

        if potential_size <= chunk_size:
            # Within chunk_size: merge and keep collecting
            current_chunk_pages.append((page_num, page_content))
            current_size = potential_size
        elif potential_size <= max_size:
            # Exceeds chunk_size but within 1.5x: allow merge, then close the chunk
            current_chunk_pages.append((page_num, page_content))
            chunks.append(_finalize_page_chunk(current_chunk_pages))

            # Overlap handling: tail of the last page goes to the next chunk.
            # Guarded here so that a zero/negative overlap never carries text over.
            if chunk_overlap > 0:
                pending_overlap = get_overlap_content(current_chunk_pages, chunk_overlap)

            current_chunk_pages = []
            current_size = 0
        else:
            # Exceeds 1.5x: finalize current chunk, new page goes to next chunk
            chunks.append(_finalize_page_chunk(current_chunk_pages))
            current_chunk_pages = [(page_num, page_content)]
            current_size = page_size

    # Process remaining pages
    if current_chunk_pages:
        chunks.append(_finalize_page_chunk(current_chunk_pages))

    # Split very large chunks (protect protected regions)
    oversize_limit = max_size * 1.5
    final_chunks = []
    for chunk in chunks:
        if len(chunk) > oversize_limit:
            # Very large chunk: split while protecting regions
            final_chunks.extend(
                split_large_chunk_with_protected_regions(
                    chunk, chunk_size, chunk_overlap, is_table_based, force_chunking,
                    image_pattern, chart_pattern, page_tag_processor, metadata_pattern
                )
            )
        else:
            final_chunks.append(chunk)

    return final_chunks