xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# xgen_doc2chunk/chunking/constants.py
|
|
2
|
+
"""
|
|
3
|
+
Chunking Module Constants - Definition of constants, patterns, and dataclasses for chunking
|
|
4
|
+
|
|
5
|
+
This module defines all constants and data structures used throughout the chunking system.
|
|
6
|
+
"""
|
|
7
|
+
import logging
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import List
|
|
10
|
+
from langchain_text_splitters import Language
|
|
11
|
+
|
|
12
|
+
# Module-level logger bound to the "document-processor" logger name.
logger = logging.getLogger("document-processor")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ============================================================================
|
|
16
|
+
# Code Language Mapping
|
|
17
|
+
# ============================================================================
|
|
18
|
+
|
|
19
|
+
# Lookup from a short language key to a langchain ``Language`` member.
# NOTE(review): the keys look like file extensions without the leading dot -
# confirm against the callers. Several keys deliberately share one member
# ('c' -> CPP, 'jsx' -> JS, 'tsx' -> TS).
LANGCHAIN_CODE_LANGUAGE_MAP = {
    "py": Language.PYTHON,
    "js": Language.JS,
    "ts": Language.TS,
    "java": Language.JAVA,
    "cpp": Language.CPP,
    "c": Language.CPP,
    "cs": Language.CSHARP,
    "go": Language.GO,
    "rs": Language.RUST,
    "php": Language.PHP,
    "rb": Language.RUBY,
    "swift": Language.SWIFT,
    "kt": Language.KOTLIN,
    "scala": Language.SCALA,
    "html": Language.HTML,
    "jsx": Language.JS,
    "tsx": Language.TS,
}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# ============================================================================
|
|
30
|
+
# Protected Region Patterns (Blocks that should not be split during chunking)
|
|
31
|
+
# ============================================================================
|
|
32
|
+
|
|
33
|
+
# NOTE(review): every block pattern below uses a lazy ``.*?``; ``.`` only
# crosses line breaks when the pattern is compiled with re.DOTALL, so the
# flags are expected to be supplied at the use site - confirm there.

# HTML table: a whole <table ...> ... </table> span, whatever its attributes.
HTML_TABLE_PATTERN = r"<table[^>]*>.*?</table>"

# Chart block: [chart] ... [/chart]. Always protected, never chunked.
# This is the default format; it can be customized via ChartProcessor.
CHART_BLOCK_PATTERN = r"\[chart\].*?\[/chart\]"

# Textbox block: [textbox] ... [/textbox]. Always protected, never chunked.
TEXTBOX_BLOCK_PATTERN = r"\[textbox\].*?\[/textbox\]"

# Image tag. Always protected, never chunked.
# Accepts [image:path], [Image: {path}], [image : path] and similar: the word
# "image" is case-insensitive, whitespace around ':' is optional, and the
# path may be wrapped in {}.
IMAGE_TAG_PATTERN = r"\[(?i:image)\s*:\s*\{?[^\]\}]+\}?\]"

# Page / slide / sheet tags. Always protected and NEVER used as overlap.
# These are the default formats produced by PageTagProcessor.
PAGE_TAG_PATTERN = r"\[Page Number:\s*\d+\]"
SLIDE_TAG_PATTERN = r"\[Slide Number:\s*\d+\]"
SHEET_TAG_PATTERN = r"\[Sheet:\s*[^\]]+\]"

# The same page / slide tags carrying an "(OCR)" or "(OCR+Ref)" suffix.
PAGE_TAG_OCR_PATTERN = r"\[Page Number:\s*\d+\s*\(OCR(?:\+Ref)?\)\]"
SLIDE_TAG_OCR_PATTERN = r"\[Slide Number:\s*\d+\s*\(OCR(?:\+Ref)?\)\]"

# Document metadata block. Always protected and NEVER used as overlap.
# Default format <Document-Metadata>...</Document-Metadata>; it can be
# customized via MetadataFormatter.
METADATA_BLOCK_PATTERN = r"<Document-Metadata>.*?</Document-Metadata>"

# Data analysis block (English or Korean tag name). Always protected.
DATA_ANALYSIS_PATTERN = r"\[(?:Data Analysis|데이터 분석)\].*?\[/(?:Data Analysis|데이터 분석)\]"

# ---- Markdown tables -------------------------------------------------------

# A complete Markdown table: a header row, the |---|---| separator row, then
# one or more data rows, each starting and ending with '|'. Group 1 holds the
# table text without the leading newline.
MARKDOWN_TABLE_PATTERN = r"(?:^|\n)(\|[^\n]+\|\n\|[-:|\s]+\|\n(?:\|[^\n]+\|(?:\n|$))+)"

# A single Markdown table row (used for row-level protection).
MARKDOWN_TABLE_ROW_PATTERN = r"\|[^\n]+\|"

# The header separator row on its own: |---|---|, |:---:|---| and so on.
MARKDOWN_TABLE_SEPARATOR_PATTERN = r"^\|[\s\-:]+\|[\s\-:|]*$"

# Header detection: group 1 is the first row (with its newline), group 2 the
# separator row that follows it.
MARKDOWN_TABLE_HEADER_PATTERN = r"^(\|[^\n]+\|\n)(\|[-:|\s]+\|)"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ============================================================================
|
|
79
|
+
# Table Chunking Related Constants
|
|
80
|
+
# ============================================================================
|
|
81
|
+
|
|
82
|
+
# Character budget reserved for the table wrapper itself
# (<table border='1'>\n</table>, including the line break).
TABLE_WRAPPER_OVERHEAD = 30

# Minimum character budget reserved per row (<tr>\n</tr>).
ROW_OVERHEAD = 12

# Character budget reserved per cell (<td></td> or <th></th>).
CELL_OVERHEAD = 10

# Character budget reserved for the chunk index line, e.g. "[Table chunk 1/10]\n".
CHUNK_INDEX_OVERHEAD = 30

# A table becomes a candidate for splitting once it is larger than
# chunk_size multiplied by this factor (1.2x).
TABLE_SIZE_THRESHOLD_MULTIPLIER = 1.2

# File types treated as table-based (CSV, TSV, Excel).
TABLE_BASED_FILE_TYPES = {"csv", "tsv", "xlsx", "xls"}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ============================================================================
|
|
102
|
+
# Dataclasses
|
|
103
|
+
# ============================================================================
|
|
104
|
+
|
|
105
|
+
@dataclass
class TableRow:
    """
    A single table row kept as raw text (HTML or Markdown).

    Attributes:
        html: Raw row content; despite the name it may hold Markdown too.
        is_header: True when the row is a header row.
        cell_count: Number of cells recorded for the row.
        char_length: Character length recorded for the row
            (presumably that of ``html`` - confirm with the parser).
    """

    html: str
    is_header: bool
    cell_count: int
    char_length: int
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class ParsedTable:
    """
    Result of parsing an HTML table.

    Attributes:
        header_rows: Rows classified as header rows.
        data_rows: Rows classified as data rows.
        total_cols: Total number of columns.
        original_html: The table's original HTML.
        header_html: Header HTML, kept so it can be reused.
        header_size: Size of the header in characters.
    """

    header_rows: List[TableRow]
    data_rows: List[TableRow]
    total_cols: int
    original_html: str
    header_html: str
    header_size: int
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
@dataclass
class ParsedMarkdownTable:
    """
    Result of parsing a Markdown table.

    Attributes:
        header_row: First row, holding the column names.
        separator_row: The separator row (|---|---|).
        data_rows: The data rows, one string per row.
        total_cols: Total number of columns.
        original_text: The table's original Markdown text.
        header_text: Header plus separator, kept so it can be reused.
        header_size: Size of the header in characters.
    """

    header_row: str
    separator_row: str
    data_rows: List[str]
    total_cols: int
    original_text: str
    header_text: str
    header_size: int
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# xgen_doc2chunk/chunking/page_chunker.py
|
|
2
|
+
"""
|
|
3
|
+
Page Chunker - Page-based chunking
|
|
4
|
+
|
|
5
|
+
Main Features:
|
|
6
|
+
- Split text by pages
|
|
7
|
+
- Page merging and chunking
|
|
8
|
+
- Overlap handling
|
|
9
|
+
- Table protection (HTML and Markdown) with NO overlap for tables
|
|
10
|
+
"""
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
13
|
+
from typing import List, Optional, Tuple
|
|
14
|
+
|
|
15
|
+
from xgen_doc2chunk.chunking.protected_regions import (
|
|
16
|
+
find_protected_regions, get_protected_region_positions,
|
|
17
|
+
ensure_protected_region_integrity, split_large_chunk_with_protected_regions
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
# Module-level logger bound to the "document-processor" logger name.
logger = logging.getLogger("document-processor")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def split_into_pages(text: str, page_marker_pattern: str) -> List[Tuple[int, str]]:
    """
    Split text into pages at every match of ``page_marker_pattern``.

    Each page runs from its marker up to the next marker (or the end of the
    text) and keeps the marker itself. Pages that contain nothing but the
    marker are dropped. Non-blank text in front of the first marker is
    returned as page 0.

    Args:
        text: Text to split.
        page_marker_pattern: Regex for a page marker. Capture group 1 is
            read as the page number. When the pattern has no such group, or
            the group is missing/non-numeric for a match, the 1-based
            position of the marker in the text is used instead of raising.

    Returns:
        [(page_num, page_content), ...] in document order; an empty list
        when no marker is found.
    """
    # Compile once: the pattern is used for finding markers and again for
    # every empty-page check.
    marker_re = re.compile(page_marker_pattern)
    markers = list(marker_re.finditer(text))

    if not markers:
        return []

    pages: List[Tuple[int, str]] = []

    # Content before the first page marker, if any, becomes page 0.
    before_content = text[:markers[0].start()].strip()
    if before_content:
        pages.append((0, before_content))

    for i, match in enumerate(markers):
        try:
            page_num = int(match.group(1))
        except (IndexError, TypeError, ValueError):
            # IndexError: pattern has no group 1; TypeError: the group did
            # not take part in the match; ValueError: it is not numeric.
            page_num = i + 1

        # Until next page marker or end of text
        end = markers[i + 1].start() if i + 1 < len(markers) else len(text)

        # Page content (including marker)
        page_content = text[match.start():end].strip()
        if not page_content:
            continue

        # Empty page check: nothing is left once the marker is removed
        if marker_re.sub('', page_content).strip():
            pages.append((page_num, page_content))
        else:
            logger.debug(f"Skipping empty page {page_num}")

    return pages
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def merge_pages(pages: List[Tuple[int, str]]) -> str:
    """
    Join the content of the given pages into one string.

    Page numbers are ignored; contents are concatenated in list order with
    a blank line ('\\n\\n') between consecutive pages.
    """
    page_texts = [page_text for _page_num, page_text in pages]
    return '\n\n'.join(page_texts)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def get_overlap_content(pages: List[Tuple[int, str]], overlap_size: int) -> str:
    """
    Return the trailing ``overlap_size`` characters of the last page.

    Args:
        pages: [(page_num, page_content), ...]; only the last entry is read.
        overlap_size: Number of characters to take from the end of the last
            page's content.

    Returns:
        The tail of the last page, the whole page when it is shorter than
        ``overlap_size``, or "" when there are no pages or ``overlap_size``
        is zero or negative.
    """
    # A non-positive size must mean "no overlap": s[-0:] is the whole string
    # and s[-(-k):] drops the first k characters, neither of which is a tail.
    if not pages or overlap_size <= 0:
        return ""

    _, last_content = pages[-1]

    # Slicing already yields the whole string when it is shorter than the
    # requested overlap, so no separate length check is needed.
    return last_content[-overlap_size:]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def chunk_by_pages(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    is_table_based: bool = False,
    force_chunking: bool = False,
    page_tag_processor = None,
    image_pattern: Optional[str] = None,
    chart_pattern: Optional[str] = None,
    metadata_pattern: Optional[str] = None
) -> List[str]:
    """
    Page-based text chunking.

    Algorithm:
    1. Split text by pages (first marker pattern that yields pages wins);
       fall back to plain text chunking when no page marker is found
    2. Merge pages sequentially while the merged size stays <= chunk_size
    3. If a page pushes the size past chunk_size but within 1.5x, merge it,
       close the chunk, and carry the tail of that page (chunk_overlap
       characters) over to the start of the next chunk
    4. If a page would push the size past 1.5x, close the current chunk
       without it and start a new chunk with that page (no overlap)
    5. Chunks longer than 1.5 * int(1.5 * chunk_size) are split again with
       split_large_chunk_with_protected_regions

    NOTE(review): the overlap is the raw tail of the last page as returned
    by get_overlap_content; this function does not itself check that tail
    against protected regions. Each finished chunk is passed through
    ensure_protected_region_integrity (presumably a check/repair for chunks
    ending inside a protected region - confirm in protected_regions).

    Args:
        text: Original text
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks; 0 or less disables overlap
        is_table_based: Whether the file is table-based
        force_chunking: Force chunking (disable table protection)
        page_tag_processor: PageTagProcessor instance for custom patterns
        image_pattern: Custom regex pattern for image tags
        chart_pattern: Custom regex pattern for chart blocks
        metadata_pattern: Custom regex pattern for metadata blocks

    Returns:
        List of chunk strings in document order.
    """
    # Build page marker patterns from PageTagProcessor or use defaults
    if page_tag_processor is not None:
        page_marker_patterns = [
            page_tag_processor.get_pattern_string(),  # Page pattern
        ]
        config = page_tag_processor.config
        if config.slide_prefix != config.tag_prefix:
            from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
            page_marker_patterns.append(page_tag_processor.get_pattern_string(PageTagType.SLIDE))
    else:
        page_marker_patterns = [
            r'\[Page Number:\s*(\d+)\]',   # Default page format
            r'\[Slide Number:\s*(\d+)\]',  # Default slide format
        ]

    # Use the first pattern that actually splits the text into pages
    pages = []
    for page_marker_pattern in page_marker_patterns:
        pages = split_into_pages(text, page_marker_pattern)
        if pages:
            break

    if not pages:
        # Page split failed, fall back to plain text chunking
        from .text_chunker import chunk_plain_text
        return chunk_plain_text(text, chunk_size, chunk_overlap)

    logger.debug(f"Split into {len(pages)} pages")

    # Identify protected region positions (HTML tables, chart blocks, Markdown tables, image tags)
    # force_chunking disables table protection (charts are always protected)
    # NOTE(review): protected_positions is computed but not read anywhere in
    # this function; the calls are kept so any behavior they have is unchanged.
    protected_regions = find_protected_regions(
        text, is_table_based, force_chunking, image_pattern,
        chart_pattern, page_tag_processor, metadata_pattern
    )
    protected_positions = get_protected_region_positions(protected_regions)

    # Merge pages to create chunks
    chunks: List[str] = []
    max_size = int(chunk_size * 1.5)  # Allow up to 1.5x

    # merge_pages joins pages with '\n\n', which is 2 characters (not 4), so
    # current_size tracks the real length of the merged chunk.
    separator_len = len('\n\n')

    current_chunk_pages = []  # Pages included in current chunk
    current_size = 0
    pending_overlap = ""      # Overlap content to prepend to next chunk

    for page_num, page_content in pages:
        # Apply pending overlap to page content
        if pending_overlap:
            page_content = pending_overlap + "\n\n" + page_content
            pending_overlap = ""
        page_size = len(page_content)

        if not current_chunk_pages:
            # First page of a new chunk
            current_chunk_pages.append((page_num, page_content))
            current_size = page_size
            continue

        potential_size = current_size + separator_len + page_size

        if potential_size <= chunk_size:
            # Within chunk_size: merge and keep going
            current_chunk_pages.append((page_num, page_content))
            current_size = potential_size
        elif potential_size <= max_size:
            # Exceeds chunk_size but within 1.5x: merge, then close the chunk
            current_chunk_pages.append((page_num, page_content))
            chunks.append(_finalize_page_chunk(current_chunk_pages))

            # Overlap handling: carry the tail of the last page into the next
            # chunk. Skipped for chunk_overlap <= 0, where a negative-index
            # slice of size 0 would otherwise duplicate the whole page.
            if chunk_overlap > 0:
                pending_overlap = get_overlap_content(current_chunk_pages, chunk_overlap)

            current_chunk_pages = []
            current_size = 0
        else:
            # Exceeds 1.5x: close the current chunk (never empty here, the
            # empty case is handled above); the new page starts the next chunk
            chunks.append(_finalize_page_chunk(current_chunk_pages))
            current_chunk_pages = [(page_num, page_content)]
            current_size = page_size

    # Process remaining pages
    if current_chunk_pages:
        chunks.append(_finalize_page_chunk(current_chunk_pages))

    # Split very large chunks (protect protected regions)
    oversize_limit = max_size * 1.5
    final_chunks = []
    for chunk in chunks:
        if len(chunk) > oversize_limit:
            # Very large chunk: split while protecting regions
            sub_chunks = split_large_chunk_with_protected_regions(
                chunk, chunk_size, chunk_overlap, is_table_based, force_chunking,
                image_pattern, chart_pattern, page_tag_processor, metadata_pattern
            )
            final_chunks.extend(sub_chunks)
        else:
            final_chunks.append(chunk)

    return final_chunks


def _finalize_page_chunk(chunk_pages: List[Tuple[int, str]]) -> str:
    """Merge the pages of one chunk and run the protected-region integrity pass on the result."""
    return ensure_protected_region_integrity(merge_pages(chunk_pages))
|