xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,786 @@
|
|
|
1
|
+
# xgen_doc2chunk/chunking/chunking.py
|
|
2
|
+
"""
|
|
3
|
+
Document Chunking Module - Advanced Text Chunking System
|
|
4
|
+
|
|
5
|
+
Main Features:
|
|
6
|
+
- HTML table-preserving chunking with row-level splitting
|
|
7
|
+
- Markdown table-preserving chunking with row-level splitting
|
|
8
|
+
- Intelligent splitting for large table data (CSV/TSV/Excel)
|
|
9
|
+
- Table structure restoration (header preservation for both HTML and Markdown)
|
|
10
|
+
- Page-based chunking
|
|
11
|
+
- Language-specific code file chunking
|
|
12
|
+
|
|
13
|
+
Key Improvements (Table Chunking Enhancement):
|
|
14
|
+
- Split large tables (HTML and Markdown) by rows to fit chunk_size
|
|
15
|
+
- Automatically restore table headers in each chunk
|
|
16
|
+
- Ensure table structure integrity
|
|
17
|
+
- Add chunk indexing metadata
|
|
18
|
+
- NO OVERLAP for table chunks (intentional to prevent data duplication)
|
|
19
|
+
|
|
20
|
+
Critical Rules for Table-Based Files (CSV, TSV, XLSX, XLS):
|
|
21
|
+
- Always use force_chunking=True
|
|
22
|
+
- Always split by rows (never cut in the middle of a row)
|
|
23
|
+
- Never apply overlap between table chunks
|
|
24
|
+
- Restore headers in each chunk for context
|
|
25
|
+
|
|
26
|
+
Refactoring:
|
|
27
|
+
- Core logic is separated into chunking_helper submodules
|
|
28
|
+
- This file maintains only the public API and integration logic
|
|
29
|
+
"""
|
|
30
|
+
import bisect
|
|
31
|
+
import logging
|
|
32
|
+
import re
|
|
33
|
+
from typing import Any, Dict, List, Optional, Union
|
|
34
|
+
|
|
35
|
+
# Import from individual modules
|
|
36
|
+
from xgen_doc2chunk.chunking.constants import (
|
|
37
|
+
TABLE_SIZE_THRESHOLD_MULTIPLIER,
|
|
38
|
+
TABLE_BASED_FILE_TYPES,
|
|
39
|
+
HTML_TABLE_PATTERN,
|
|
40
|
+
MARKDOWN_TABLE_PATTERN,
|
|
41
|
+
)
|
|
42
|
+
from xgen_doc2chunk.chunking.table_chunker import (
|
|
43
|
+
chunk_large_table as _chunk_large_table,
|
|
44
|
+
chunk_large_markdown_table as _chunk_large_markdown_table,
|
|
45
|
+
is_markdown_table as _is_markdown_table,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
from xgen_doc2chunk.chunking.protected_regions import (
|
|
49
|
+
find_protected_regions as _find_protected_regions,
|
|
50
|
+
get_protected_region_positions as _get_protected_region_positions,
|
|
51
|
+
split_with_protected_regions as _split_with_protected_regions,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
from xgen_doc2chunk.chunking.page_chunker import (
|
|
55
|
+
chunk_by_pages as _chunk_by_pages,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
from xgen_doc2chunk.chunking.text_chunker import (
|
|
59
|
+
chunk_plain_text as _chunk_plain_text,
|
|
60
|
+
chunk_text_without_tables,
|
|
61
|
+
chunk_with_row_protection,
|
|
62
|
+
clean_chunks as _clean_chunks,
|
|
63
|
+
reconstruct_text_from_chunks,
|
|
64
|
+
find_overlap_length,
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
from xgen_doc2chunk.chunking.sheet_processor import (
|
|
68
|
+
extract_document_metadata as _extract_document_metadata,
|
|
69
|
+
prepend_metadata_to_chunks as _prepend_metadata_to_chunks,
|
|
70
|
+
extract_sheet_sections as _extract_sheet_sections,
|
|
71
|
+
chunk_multi_sheet_content,
|
|
72
|
+
chunk_single_table_content,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
logger = logging.getLogger("document-processor")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ============================================================================
|
|
79
|
+
# Helper Functions for PageTagProcessor integration
|
|
80
|
+
# ============================================================================
|
|
81
|
+
|
|
82
|
+
def _get_page_marker_patterns(page_tag_processor: Optional[Any] = None) -> List[str]:
|
|
83
|
+
"""
|
|
84
|
+
Get page marker regex patterns from PageTagProcessor or use defaults.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
page_tag_processor: PageTagProcessor instance (optional)
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
List of regex patterns for page/slide markers
|
|
91
|
+
"""
|
|
92
|
+
if page_tag_processor is not None:
|
|
93
|
+
# Build patterns from processor's config
|
|
94
|
+
config = page_tag_processor.config
|
|
95
|
+
patterns = [
|
|
96
|
+
page_tag_processor.get_pattern_string(), # Page pattern
|
|
97
|
+
]
|
|
98
|
+
# Add slide pattern if different prefix
|
|
99
|
+
if config.slide_prefix != config.tag_prefix:
|
|
100
|
+
from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
|
|
101
|
+
patterns.append(page_tag_processor.get_pattern_string(PageTagType.SLIDE))
|
|
102
|
+
return patterns
|
|
103
|
+
else:
|
|
104
|
+
# Default patterns
|
|
105
|
+
return [
|
|
106
|
+
r'\[Page Number:\s*(\d+)\]',
|
|
107
|
+
r'\[Slide Number:\s*(\d+)\]',
|
|
108
|
+
]
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _get_sheet_marker_pattern(page_tag_processor: Optional[Any] = None) -> str:
|
|
112
|
+
"""
|
|
113
|
+
Get sheet marker regex pattern from PageTagProcessor or use default.
|
|
114
|
+
|
|
115
|
+
Args:
|
|
116
|
+
page_tag_processor: PageTagProcessor instance (optional)
|
|
117
|
+
|
|
118
|
+
Returns:
|
|
119
|
+
Regex pattern for sheet markers
|
|
120
|
+
"""
|
|
121
|
+
if page_tag_processor is not None:
|
|
122
|
+
from xgen_doc2chunk.core.functions.page_tag_processor import PageTagType
|
|
123
|
+
return page_tag_processor.get_pattern_string(PageTagType.SHEET)
|
|
124
|
+
else:
|
|
125
|
+
return r'\[Sheet:\s*([^\]]+)\]'
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _get_image_tag_pattern(image_processor: Optional[Any] = None) -> str:
|
|
129
|
+
"""
|
|
130
|
+
Get image tag regex pattern from ImageProcessor or use default.
|
|
131
|
+
|
|
132
|
+
Args:
|
|
133
|
+
image_processor: ImageProcessor instance (optional)
|
|
134
|
+
|
|
135
|
+
Returns:
|
|
136
|
+
Regex pattern for image tags
|
|
137
|
+
"""
|
|
138
|
+
if image_processor is not None:
|
|
139
|
+
return image_processor.get_pattern_string()
|
|
140
|
+
else:
|
|
141
|
+
# Default pattern: [Image:...] or [image:...] with optional spaces and braces
|
|
142
|
+
from xgen_doc2chunk.chunking.constants import IMAGE_TAG_PATTERN
|
|
143
|
+
return IMAGE_TAG_PATTERN
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _get_chart_block_pattern(chart_processor: Optional[Any] = None) -> str:
|
|
147
|
+
"""
|
|
148
|
+
Get chart block regex pattern from ChartProcessor or use default.
|
|
149
|
+
|
|
150
|
+
Args:
|
|
151
|
+
chart_processor: ChartProcessor instance (optional)
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
Regex pattern for chart blocks
|
|
155
|
+
"""
|
|
156
|
+
if chart_processor is not None:
|
|
157
|
+
try:
|
|
158
|
+
# Build pattern from processor's config
|
|
159
|
+
prefix = re.escape(chart_processor.config.tag_prefix)
|
|
160
|
+
suffix = re.escape(chart_processor.config.tag_suffix)
|
|
161
|
+
return f'{prefix}.*?{suffix}'
|
|
162
|
+
except Exception:
|
|
163
|
+
pass
|
|
164
|
+
# Default pattern: [chart]...[/chart]
|
|
165
|
+
from xgen_doc2chunk.chunking.constants import CHART_BLOCK_PATTERN
|
|
166
|
+
return CHART_BLOCK_PATTERN
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _get_metadata_block_pattern(metadata_formatter: Optional[Any] = None) -> str:
|
|
170
|
+
"""
|
|
171
|
+
Get metadata block regex pattern from MetadataFormatter or use default.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
metadata_formatter: MetadataFormatter instance (optional)
|
|
175
|
+
|
|
176
|
+
Returns:
|
|
177
|
+
Regex pattern for metadata blocks
|
|
178
|
+
"""
|
|
179
|
+
if metadata_formatter is not None:
|
|
180
|
+
try:
|
|
181
|
+
# Build pattern from formatter's config
|
|
182
|
+
prefix = re.escape(metadata_formatter.metadata_tag_prefix)
|
|
183
|
+
suffix = re.escape(metadata_formatter.metadata_tag_suffix)
|
|
184
|
+
return f'{prefix}.*?{suffix}'
|
|
185
|
+
except Exception:
|
|
186
|
+
pass
|
|
187
|
+
# Default pattern: <Document-Metadata>...</Document-Metadata>
|
|
188
|
+
from xgen_doc2chunk.chunking.constants import METADATA_BLOCK_PATTERN
|
|
189
|
+
return METADATA_BLOCK_PATTERN
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ============================================================================
|
|
193
|
+
# Public API - Single entry point for external use
|
|
194
|
+
# ============================================================================
|
|
195
|
+
|
|
196
|
+
def create_chunks(
    text: str,
    file_extension: str,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    force_chunking: bool = False,
    include_position_metadata: bool = True,
    chunking_strategy: str = "recursive",
    page_tag_processor: Optional[Any] = None,
    image_processor: Optional[Any] = None,
    chart_processor: Optional[Any] = None,
    metadata_formatter: Optional[Any] = None,
    stride: Optional[int] = None,
    parent_chunk_size: Optional[int] = None,
    child_chunk_size: Optional[int] = None,
    **kwargs
) -> Union[List[str], List[Dict[str, Any]]]:
    """
    Split text into chunks. (Single public API)

    Args:
        text: Original text
        file_extension: File extension
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to protected regions)
        force_chunking: Force chunking (disable table protection)
        include_position_metadata: Whether to include position metadata
            - True: Include metadata like page_number, line_start, line_end (List[Dict])
            - False: Return only chunk text (List[str])
        chunking_strategy: Chunking strategy (recursive, sliding, hierarchical);
            only "recursive" is implemented today - anything else warns and
            falls back to it
        page_tag_processor: PageTagProcessor instance for custom page/slide/sheet
            tag patterns; None means the built-in defaults
        image_processor: ImageProcessor instance for custom image tag patterns;
            None means the built-in default
        chart_processor: ChartProcessor instance for custom chart tag patterns;
            None means the built-in default
        metadata_formatter: MetadataFormatter instance for custom metadata tag
            patterns; None means the built-in default
        stride: Stride for sliding window strategy - future implementation
        parent_chunk_size: Parent chunk size for hierarchical strategy - future implementation
        child_chunk_size: Child chunk size for hierarchical strategy - future implementation

    Returns:
        List[Dict] with text/page/line/offset metadata when
        include_position_metadata is True, otherwise List[str].

    Protected Regions (NEVER split or overlap):
        - Image tags, Page/Slide/Sheet tags, Chart blocks, Metadata blocks
        - Tables: split by rows with NO overlap between table chunks
    """
    # TODO: Implement various chunking strategies based on chunking_strategy
    if chunking_strategy != "recursive":
        logger.warning(
            f"Chunking strategy '{chunking_strategy}' is not yet implemented, "
            "falling back to 'recursive'"
        )

    chunks = _split_text(
        text, chunk_size, chunk_overlap,
        file_extension=file_extension,
        force_chunking=force_chunking,
        page_tag_processor=page_tag_processor,
        image_processor=image_processor,
        chart_processor=chart_processor,
        metadata_formatter=metadata_formatter
    )

    # Plain-text mode: hand back the chunk strings as-is.
    if not include_position_metadata:
        return chunks

    # Rebuild the de-duplicated document so that global offsets and line
    # numbers refer to a single canonical text rather than overlapping chunks.
    reconstructed = reconstruct_text_from_chunks(chunks, chunk_overlap)
    line_table = _build_line_offset_table(reconstructed, file_extension, page_tag_processor)

    enriched: List[Dict[str, Any]] = []
    offset = 0
    last_index = len(chunks) - 1

    for index, chunk_text in enumerate(chunks):
        span_start = offset
        span_end = offset + len(chunk_text) - 1

        first_row = line_table[_find_line_index_by_pos(span_start, line_table)]
        last_row = line_table[_find_line_index_by_pos(span_end, line_table)]

        enriched.append({
            "text": chunk_text,
            "page_number": first_row.get("page", 1),
            "line_start": first_row["line_num"],
            "line_end": last_row["line_num"],
            "global_start": span_start,
            "global_end": span_end,
            "chunk_index": index
        })

        offset += len(chunk_text)
        if index < last_index:
            # The next chunk repeats up to chunk_overlap trailing characters;
            # rewind so positions stay aligned with the reconstructed text.
            offset -= find_overlap_length(chunk_text, chunks[index + 1], chunk_overlap)

    logger.info(f"Created {len(enriched)} chunks with position metadata")
    return enriched
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
# ============================================================================
|
|
320
|
+
# Internal Functions - Table-based content processing
|
|
321
|
+
# ============================================================================
|
|
322
|
+
|
|
323
|
+
def _split_table_based_content(
|
|
324
|
+
text: str,
|
|
325
|
+
chunk_size: int,
|
|
326
|
+
chunk_overlap: int,
|
|
327
|
+
page_tag_processor: Optional[Any] = None,
|
|
328
|
+
image_processor: Optional[Any] = None,
|
|
329
|
+
chart_processor: Optional[Any] = None,
|
|
330
|
+
metadata_formatter: Optional[Any] = None
|
|
331
|
+
) -> List[str]:
|
|
332
|
+
"""
|
|
333
|
+
Chunk table-based content (CSV/TSV/Excel).
|
|
334
|
+
|
|
335
|
+
Split large tables (HTML or Markdown) to fit chunk_size and restore
|
|
336
|
+
table structure in each chunk.
|
|
337
|
+
|
|
338
|
+
For multi-sheet Excel files, process each sheet separately.
|
|
339
|
+
|
|
340
|
+
CRITICAL: Table chunks have NO overlap to prevent data duplication.
|
|
341
|
+
This is intentional for search/retrieval quality.
|
|
342
|
+
|
|
343
|
+
Args:
|
|
344
|
+
text: Full text (metadata + table)
|
|
345
|
+
chunk_size: Maximum chunk size
|
|
346
|
+
chunk_overlap: Not used for tables (kept for API compatibility)
|
|
347
|
+
page_tag_processor: PageTagProcessor for page/sheet tag patterns
|
|
348
|
+
image_processor: ImageProcessor for image tag patterns
|
|
349
|
+
chart_processor: ChartProcessor for chart block patterns
|
|
350
|
+
metadata_formatter: MetadataFormatter for metadata block patterns
|
|
351
|
+
|
|
352
|
+
Returns:
|
|
353
|
+
List of chunks
|
|
354
|
+
"""
|
|
355
|
+
if not text or not text.strip():
|
|
356
|
+
return [""]
|
|
357
|
+
|
|
358
|
+
# Get metadata pattern from processor
|
|
359
|
+
metadata_pattern = _get_metadata_block_pattern(metadata_formatter)
|
|
360
|
+
|
|
361
|
+
# Extract metadata using custom pattern
|
|
362
|
+
metadata_block, text_without_metadata = _extract_document_metadata(text, metadata_pattern)
|
|
363
|
+
|
|
364
|
+
# Extract data analysis block (supports both English and Korean tags)
|
|
365
|
+
analysis_pattern = r'(\[Data Analysis\].*?\[/Data Analysis\])\s*'
|
|
366
|
+
analysis_match = re.search(analysis_pattern, text_without_metadata, re.DOTALL)
|
|
367
|
+
analysis_block = ""
|
|
368
|
+
|
|
369
|
+
if analysis_match:
|
|
370
|
+
analysis_block = analysis_match.group(1)
|
|
371
|
+
text_without_analysis = (
|
|
372
|
+
text_without_metadata[:analysis_match.start()] +
|
|
373
|
+
text_without_metadata[analysis_match.end():]
|
|
374
|
+
).strip()
|
|
375
|
+
else:
|
|
376
|
+
text_without_analysis = text_without_metadata
|
|
377
|
+
|
|
378
|
+
# Check for multi-sheet (Excel)
|
|
379
|
+
sheets = _extract_sheet_sections(text_without_analysis)
|
|
380
|
+
|
|
381
|
+
# Get patterns from processors for protected region detection
|
|
382
|
+
image_pattern = _get_image_tag_pattern(image_processor)
|
|
383
|
+
chart_pattern = _get_chart_block_pattern(chart_processor)
|
|
384
|
+
metadata_pattern = _get_metadata_block_pattern(metadata_formatter)
|
|
385
|
+
|
|
386
|
+
if sheets:
|
|
387
|
+
logger.info(f"Multi-sheet Excel detected: {len(sheets)} sheets")
|
|
388
|
+
# Pass 0 for overlap since tables should not have overlap
|
|
389
|
+
return chunk_multi_sheet_content(
|
|
390
|
+
sheets, metadata_block, analysis_block, chunk_size, 0,
|
|
391
|
+
_chunk_plain_text, _chunk_table_unified,
|
|
392
|
+
image_pattern=image_pattern,
|
|
393
|
+
chart_pattern=chart_pattern,
|
|
394
|
+
metadata_pattern=metadata_pattern
|
|
395
|
+
)
|
|
396
|
+
|
|
397
|
+
# Single table/sheet processing
|
|
398
|
+
# Pass 0 for overlap since tables should not have overlap
|
|
399
|
+
return chunk_single_table_content(
|
|
400
|
+
text_without_analysis, metadata_block, analysis_block, chunk_size, 0,
|
|
401
|
+
_chunk_plain_text, _chunk_table_unified,
|
|
402
|
+
image_pattern=image_pattern,
|
|
403
|
+
chart_pattern=chart_pattern,
|
|
404
|
+
metadata_pattern=metadata_pattern
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def _chunk_table_unified(table_text: str, chunk_size: int, chunk_overlap: int, context_prefix: str = "") -> List[str]:
    """
    Unified table chunking function that handles both HTML and Markdown tables.

    Detects table type and applies appropriate chunking with NO overlap.

    Args:
        table_text: Table content (HTML or Markdown)
        chunk_size: Maximum chunk size
        chunk_overlap: Ignored (tables have no overlap)
        context_prefix: Context to prepend to each chunk

    Returns:
        List of table chunks
    """
    # Pick the row-level chunker matching the table format; overlap is
    # forced to 0 so table rows are never duplicated across chunks.
    chunker = (
        _chunk_large_markdown_table
        if _is_markdown_table(table_text)
        else _chunk_large_table
    )
    return chunker(table_text, chunk_size, 0, context_prefix)
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _split_text(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    file_extension: Optional[str] = None,
    force_chunking: Optional[bool] = False,
    page_tag_processor: Optional[Any] = None,
    image_processor: Optional[Any] = None,
    chart_processor: Optional[Any] = None,
    metadata_formatter: Optional[Any] = None
) -> List[str]:
    """
    Split text into chunks. (Internal use)

    Preserves HTML and Markdown tables with proper row-level chunking.
    Considers page boundaries for chunking.
    Protects all tag regions (image, page, slide, chart, metadata) with NO overlap.

    Core Strategy:
        1. Apply table-based chunking if file_extension is CSV/TSV/Excel (NO overlap for tables)
        2. Apply page-based chunking first if page markers exist
        3. Merge pages based on chunk_size (allow up to 1.5x)
        4. Never cut in the middle of a table or protected tag
        5. Apply overlap ONLY for plain text (NOT for protected regions)

    Protected Regions (NEVER split or overlap):
        - Image tags, Page/Slide/Sheet tags, Chart blocks, Metadata blocks
        - Tables (split by rows with NO overlap)

    Args:
        text: Original text
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to protected regions)
        file_extension: File extension (csv, xlsx, pdf, etc.) - used for table-based processing
        force_chunking: Force chunking (disable table protection except for table-based files)
        page_tag_processor: PageTagProcessor instance for custom tag patterns
        image_processor: ImageProcessor instance for custom image tag patterns
        chart_processor: ChartProcessor instance for custom chart tag patterns
        metadata_formatter: MetadataFormatter instance for custom metadata tag patterns

    Returns:
        List of chunks
    """
    # Guard clause: chunking empty/whitespace-only input yields a single
    # empty chunk rather than an empty list (callers rely on >= 1 chunk).
    if not text or not text.strip():
        logger.warning("Empty text provided for chunking")
        return [""]

    # === Check for table-based content (CSV/Excel files only) ===
    # Explicitly determine based on file_extension (no text content guessing)
    is_table_based = file_extension and file_extension.lower() in TABLE_BASED_FILE_TYPES

    # Disable table protection if is_table_based or force_chunking is True
    disable_table_protection = is_table_based or force_chunking

    if is_table_based:
        # For table-based files (CSV/Excel), always use table-based chunking.
        # This handles both HTML tables and Markdown tables properly.
        # NOTE: this path returns immediately — metadata extraction and
        # chunk cleaning below are handled inside _split_table_based_content.
        logger.info(f"Table-based file detected ({file_extension}), using table-based chunking")
        return _split_table_based_content(
            text, chunk_size, chunk_overlap,
            page_tag_processor=page_tag_processor,
            image_processor=image_processor,
            chart_processor=chart_processor,
            metadata_formatter=metadata_formatter
        )

    # Get tag patterns from processors or use defaults (needed for metadata extraction)
    metadata_pattern = _get_metadata_block_pattern(metadata_formatter)

    # Extract metadata using custom pattern; the metadata block is removed
    # here and re-prepended to every chunk at the end of this function.
    metadata_block, text_without_metadata = _extract_document_metadata(text, metadata_pattern)
    text = text_without_metadata

    # === Check for page markers ===
    # Build patterns from PageTagProcessor or use defaults
    page_marker_patterns = _get_page_marker_patterns(page_tag_processor)
    has_page_markers = any(re.search(pattern, text) for pattern in page_marker_patterns)

    # Get remaining tag patterns from processors or use defaults
    image_pattern = _get_image_tag_pattern(image_processor)
    chart_pattern = _get_chart_block_pattern(chart_processor)

    if has_page_markers:
        # Page-based chunking: page boundaries take priority over raw size.
        logger.debug("Page markers found, using page-based chunking")
        chunks = _chunk_by_pages(
            text, chunk_size, chunk_overlap, is_table_based, force_chunking,
            page_tag_processor, image_pattern, chart_pattern, metadata_pattern
        )
    else:
        # Find protected regions (HTML tables, chart blocks, Markdown tables, all tags).
        # Disable table protection on force_chunking (other regions are always protected).
        protected_regions = _find_protected_regions(
            text, is_table_based, force_chunking, image_pattern,
            chart_pattern, page_tag_processor, metadata_pattern
        )
        protected_positions = _get_protected_region_positions(protected_regions)

        if protected_positions:
            # r[2] is assumed to be the region-type label — TODO confirm
            # against _find_protected_regions' return shape.
            region_types = set(r[2] for r in protected_regions)
            logger.info(f"Found {len(protected_positions)} protected regions in document: {region_types}")
            chunks = _split_with_protected_regions(
                text, protected_positions, chunk_size, chunk_overlap, force_chunking,
                image_pattern, chart_pattern, page_tag_processor, metadata_pattern
            )
        else:
            # No protected regions: apply row-level chunking if force_chunking
            if disable_table_protection:
                logger.debug("Force chunking enabled, using row-preserving chunking")
                chunks = _chunk_with_row_protection(text, chunk_size, chunk_overlap, force_chunking)
            else:
                # NOTE(review): early return — this path skips _clean_chunks;
                # the helper is expected to prepend metadata itself via
                # _prepend_metadata_to_chunks, which it receives as a callback.
                logger.debug("No protected blocks found, using standard chunking")
                return _chunk_text_without_tables(text, chunk_size, chunk_overlap, metadata_block, page_tag_processor)

    # Clean chunks
    cleaned_chunks = _clean_chunks(chunks, page_tag_processor)

    # Add metadata (re-attach the block stripped earlier to every chunk)
    cleaned_chunks = _prepend_metadata_to_chunks(cleaned_chunks, metadata_block)

    logger.info(f"Final text split into {len(cleaned_chunks)} chunks")

    return cleaned_chunks
|
|
552
|
+
|
|
553
|
+
# ============================================================================
|
|
554
|
+
# Internal Wrapper Functions
|
|
555
|
+
# ============================================================================
|
|
556
|
+
|
|
557
|
+
def _chunk_text_without_tables(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    metadata: Optional[str],
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Chunk table-free text by delegating to ``chunk_text_without_tables``.

    Supplies the module-level metadata-prepending callback so the helper
    can attach the extracted metadata block to each produced chunk.
    """
    return chunk_text_without_tables(
        text,
        chunk_size,
        chunk_overlap,
        metadata,
        _prepend_metadata_to_chunks,
        page_tag_processor,
    )
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
def _chunk_with_row_protection(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    force_chunking: bool = False
) -> List[str]:
    """
    Chunk text while never cutting through a table row once table
    protection has been disabled.

    Both HTML and Markdown tables are split at row boundaries with NO
    overlap. Thin wrapper around ``chunk_with_row_protection``.
    """
    # Bind force_chunking into the region-splitting callback expected
    # by the underlying helper (which only passes four arguments).
    def _regions_splitter(inner_text, regions, size, overlap):
        return _split_with_protected_regions(inner_text, regions, size, overlap, force_chunking)

    # _chunk_table_unified handles both HTML and Markdown tables.
    return chunk_with_row_protection(
        text,
        chunk_size,
        chunk_overlap,
        _regions_splitter,
        _chunk_table_unified,
    )
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
# ============================================================================
|
|
599
|
+
# Internal Functions - Page/Line Mapping
|
|
600
|
+
# ============================================================================
|
|
601
|
+
|
|
602
|
+
def _extract_page_mapping(
    text: str,
    file_extension: str,
    page_tag_processor: Optional[Any] = None
) -> List[Dict[str, Any]]:
    """
    Extract page/slide mapping information from text.

    Recognizes page markers for various file formats:
    - PDF/PPT/DOCX: Page/slide markers
    - Excel: Sheet markers
    - Others: Line-based estimation

    Args:
        text: Original text
        file_extension: File extension
        page_tag_processor: PageTagProcessor instance for custom patterns

    Returns:
        Page mapping list [{"page_num": int, "start_pos": int, "end_pos": int, ...}, ...]
        Always non-empty: falls back to a single page covering the whole
        text when no markers are found or any error occurs.
    """
    try:
        page_mapping: List[Dict[str, Any]] = []
        ext_lower = file_extension.lower() if file_extension else ""

        if ext_lower in ['pdf', 'ppt', 'pptx', 'doc', 'docx']:
            # Build patterns from PageTagProcessor or use defaults
            patterns = _get_page_marker_patterns(page_tag_processor)
            # Add OCR variants
            ocr_patterns = []
            for p in patterns:
                # Add (OCR) and (OCR+Ref) variants.
                # NOTE(review): p[:-1] assumes every pattern ends with a
                # literal ']' and base ends with the page-number group
                # '\d+' — TODO confirm _get_page_marker_patterns always
                # returns patterns of that shape.
                base_pattern = p.rstrip(']').rstrip(')')
                if base_pattern.endswith('\\d+'):
                    ocr_patterns.append(p[:-1] + r'\s*\(OCR\)\]')
                    ocr_patterns.append(p[:-1] + r'\s*\(OCR\+Ref\)\]')
            # NOTE(review): extend() mutates the list returned by the
            # helper; harmless if a fresh list is returned each call —
            # verify the helper does not cache/share it.
            patterns.extend(ocr_patterns)

            for pattern in patterns:
                matches = list(re.finditer(pattern, text))
                if matches:
                    for i, match in enumerate(matches):
                        # group(1) is expected to capture the page number.
                        page_num = int(match.group(1))
                        # Page content starts right after its marker and
                        # runs to the next marker (or end of text).
                        start = match.end()
                        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
                        page_mapping.append({
                            "page_num": page_num,
                            "start_pos": start,
                            "end_pos": end
                        })
                    page_mapping.sort(key=lambda x: x["page_num"])
                    # First pattern that matches wins; remaining patterns
                    # (including OCR variants) are not tried.
                    break

            # Estimate pages for doc/docx if no markers found
            if not page_mapping and ext_lower in ['doc', 'docx']:
                # Heuristic: ~1500 characters per page.
                chars_per_page = 1500
                text_len = len(text)
                if text_len > chars_per_page:
                    estimated_pages = (text_len + chars_per_page - 1) // chars_per_page
                    for page_num in range(1, estimated_pages + 1):
                        start = (page_num - 1) * chars_per_page
                        end = min(page_num * chars_per_page, text_len)
                        page_mapping.append({
                            "page_num": page_num,
                            "start_pos": start,
                            "end_pos": end
                        })

            if not page_mapping:
                page_mapping = [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]

        elif ext_lower in ['xlsx', 'xls']:
            # Build sheet pattern from PageTagProcessor or use default
            sheet_pattern = _get_sheet_marker_pattern(page_tag_processor)
            matches = list(re.finditer(sheet_pattern, text))

            if matches:
                for i, match in enumerate(matches):
                    start = match.end()
                    end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
                    page_mapping.append({
                        # Sheets are numbered by order of appearance,
                        # not by any number inside the marker.
                        "page_num": i + 1,
                        "start_pos": start,
                        "end_pos": end,
                        # group(1) is assumed to capture the sheet name.
                        "sheet_name": match.group(1).strip()
                    })
            else:
                page_mapping = [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]

        else:
            # Line-based estimation for other file types
            lines = text.split('\n')
            lines_per_page = 1000

            if len(lines) > lines_per_page:
                page_count = (len(lines) + lines_per_page - 1) // lines_per_page
                current_pos = 0

                for page_num in range(1, page_count + 1):
                    start_line = (page_num - 1) * lines_per_page
                    end_line = min(page_num * lines_per_page, len(lines))
                    page_text = '\n'.join(lines[start_line:end_line])
                    start = current_pos
                    end = current_pos + len(page_text)
                    page_mapping.append({
                        "page_num": page_num,
                        "start_pos": start,
                        "end_pos": end
                    })
                    # +1 accounts for the '\n' dropped by split/join.
                    current_pos = end + 1
            else:
                page_mapping = [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]

        return page_mapping

    except Exception:
        # Deliberate best-effort: any failure degrades to "one page".
        return [{"page_num": 1, "start_pos": 0, "end_pos": len(text)}]
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def _find_line_index_by_pos(pos: int, line_table: List[Dict[str, int]]) -> int:
|
|
722
|
+
"""
|
|
723
|
+
Find the line index corresponding to the given position.
|
|
724
|
+
|
|
725
|
+
Args:
|
|
726
|
+
pos: Position in text
|
|
727
|
+
line_table: Line offset table
|
|
728
|
+
|
|
729
|
+
Returns:
|
|
730
|
+
Line index (0-based)
|
|
731
|
+
"""
|
|
732
|
+
try:
|
|
733
|
+
if not line_table:
|
|
734
|
+
return 0
|
|
735
|
+
starts = [line["start"] for line in line_table]
|
|
736
|
+
idx = bisect.bisect_right(starts, pos) - 1
|
|
737
|
+
return 0 if idx < 0 else min(idx, len(line_table) - 1)
|
|
738
|
+
except Exception:
|
|
739
|
+
return 0
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def _build_line_offset_table(
    text: str,
    file_extension: str,
    page_tag_processor: Optional[Any] = None
) -> List[Dict[str, int]]:
    """
    Build an offset table describing every line in the text.

    Args:
        text: Original text
        file_extension: File extension
        page_tag_processor: PageTagProcessor instance for custom patterns

    Returns:
        Line offset table [{"line_num": int, "start": int, "end": int, "page": int}, ...]
    """
    try:
        page_mapping = _extract_page_mapping(text, file_extension, page_tag_processor)

        def locate_page(offset: int) -> int:
            # Linear scan is fine: page_mapping has one entry per page.
            for entry in page_mapping:
                if entry["start_pos"] <= offset < entry["end_pos"]:
                    return entry["page_num"]
            return 1

        table: List[Dict[str, int]] = []
        offset = 0
        for line_no, line in enumerate(text.split('\n'), start=1):
            line_end = offset + len(line)
            # Classify the line by its midpoint so a line straddling a
            # page boundary is attributed to the page holding most of it.
            midpoint = offset + max(0, (line_end - offset) // 2)
            table.append({
                "line_num": line_no,
                "start": offset,
                "end": line_end,
                "page": locate_page(midpoint),
            })
            offset = line_end + 1  # +1 skips the '\n' removed by split()

        return table

    except Exception:
        # Defensive fallback: treat the whole text as one line on page 1.
        return [{"line_num": 1, "start": 0, "end": len(text), "page": 1}]
|