xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/page_tag_processor.py
|
|
2
|
+
"""
|
|
3
|
+
Page Tag Processor Module
|
|
4
|
+
|
|
5
|
+
Provides functionality for generating and parsing page/slide/sheet markers in extracted text.
|
|
6
|
+
This module standardizes page numbering format across all document handlers.
|
|
7
|
+
|
|
8
|
+
=== Architecture Overview ===
|
|
9
|
+
|
|
10
|
+
1. Creation:
|
|
11
|
+
- PageTagProcessor instance is created when DocumentProcessor is initialized.
|
|
12
|
+
- Created via DocumentProcessor.__init__() calling _create_page_tag_processor() method.
|
|
13
|
+
|
|
14
|
+
2. Propagation:
|
|
15
|
+
- The created PageTagProcessor is passed to ALL handlers.
|
|
16
|
+
- In DocumentProcessor._get_handler_registry(), each handler is created with
|
|
17
|
+
page_tag_processor=self._page_tag_processor parameter.
|
|
18
|
+
- Even handlers that don't use page tags receive it for consistency.
|
|
19
|
+
|
|
20
|
+
3. Access from Handlers:
|
|
21
|
+
- Each Handler inherits from BaseHandler and can access via self.page_tag_processor.
|
|
22
|
+
- Convenience methods: self.create_page_tag(n), self.create_slide_tag(n), self.create_sheet_tag(name)
|
|
23
|
+
|
|
24
|
+
4. Components:
|
|
25
|
+
- PageTagConfig: Dataclass holding tag prefix/suffix settings
|
|
26
|
+
- PageTagProcessor: Main class for tag generation and parsing
|
|
27
|
+
- PageTagType: Enum distinguishing PAGE, SLIDE, SHEET types
|
|
28
|
+
|
|
29
|
+
=== Usage Examples ===
|
|
30
|
+
|
|
31
|
+
# Custom settings at DocumentProcessor level
|
|
32
|
+
from xgen_doc2chunk.core.document_processor import DocumentProcessor
|
|
33
|
+
|
|
34
|
+
processor = DocumentProcessor(
|
|
35
|
+
page_tag_prefix="<page>",
|
|
36
|
+
page_tag_suffix="</page>",
|
|
37
|
+
slide_tag_prefix="<slide>",
|
|
38
|
+
slide_tag_suffix="</slide>"
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Usage inside Handler (BaseHandler subclass)
|
|
42
|
+
class MyHandler(BaseHandler):
|
|
43
|
+
def extract_text(self, ...):
|
|
44
|
+
tag = self.create_page_tag(1) # "[Page Number: 1]" or custom format
|
|
45
|
+
slide_tag = self.create_slide_tag(1) # "[Slide Number: 1]"
|
|
46
|
+
sheet_tag = self.create_sheet_tag("Sheet1") # "[Sheet: Sheet1]"
|
|
47
|
+
|
|
48
|
+
=== Default Tag Formats ===
|
|
49
|
+
|
|
50
|
+
- Page: [Page Number: 1]
|
|
51
|
+
- Slide: [Slide Number: 1]
|
|
52
|
+
- Sheet: [Sheet: Sheet1]
|
|
53
|
+
|
|
54
|
+
=== Supported Handlers ===
|
|
55
|
+
|
|
56
|
+
- PDFHandler: Uses create_page_tag()
|
|
57
|
+
- DOCXHandler: Uses create_page_tag()
|
|
58
|
+
- DOCHandler: Uses create_page_tag()
|
|
59
|
+
- PPTHandler: Uses create_slide_tag()
|
|
60
|
+
- ExcelHandler: Uses create_sheet_tag()
|
|
61
|
+
- HWPHandler, HWPXHandler, CSVHandler, TextHandler: Propagated but not used
|
|
62
|
+
|
|
63
|
+
"""
|
|
64
|
+
import logging
|
|
65
|
+
import re
|
|
66
|
+
from dataclasses import dataclass
|
|
67
|
+
from enum import Enum
|
|
68
|
+
from typing import List, Optional, Pattern, Tuple
|
|
69
|
+
|
|
70
|
+
logger = logging.getLogger("document-processor")
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class PageTagType(Enum):
    """Kinds of position marker emitted for the different document formats."""

    # Paged documents: PDF, DOCX, DOC, HWP
    PAGE = "page"
    # Presentations: PPT, PPTX
    SLIDE = "slide"
    # Spreadsheets: Excel (XLSX, XLS)
    SHEET = "sheet"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
class PageTagConfig:
    """
    Prefix/suffix settings consumed by PageTagProcessor.

    A tag is rendered as ``<prefix><value><suffix>``; with the defaults below
    that yields "[Page Number: 1]", "[Slide Number: 1]" and "[Sheet: Sheet1]".

    Attributes:
        tag_prefix: Text placed before a page number
        tag_suffix: Text placed after a page number
        slide_prefix: Text placed before a slide number (presentations)
        slide_suffix: Text placed after a slide number
        sheet_prefix: Text placed before a sheet name (spreadsheets)
        sheet_suffix: Text placed after a sheet name
    """

    # Page markers
    tag_prefix: str = "[Page Number: "
    tag_suffix: str = "]"

    # Slide markers
    slide_prefix: str = "[Slide Number: "
    slide_suffix: str = "]"

    # Sheet markers
    sheet_prefix: str = "[Sheet: "
    sheet_suffix: str = "]"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class PageTagProcessor:
    """
    Page Tag Processor Class

    Generates and parses page/slide/sheet markers for document text extraction.
    Provides a standardized interface for all document handlers.

    Page and slide tags carry a number; sheet tags carry a sheet name. The
    regex for every tag type is built in one place (get_pattern_string), so the
    compiled ``*_pattern`` properties and the pattern string handed to callers
    always agree.

    Args:
        tag_prefix: Page tag prefix (default: "[Page Number: ")
        tag_suffix: Page tag suffix (default: "]")
        slide_prefix: Slide tag prefix (default: "[Slide Number: ")
        slide_suffix: Slide tag suffix (default: "]")
        sheet_prefix: Sheet tag prefix (default: "[Sheet: ")
        sheet_suffix: Sheet tag suffix (default: "]")
        config: PageTagConfig instance (overrides individual parameters)
    """

    # Capture group for page and slide numbers.
    _NUMBER_GROUP = r"(\d+)"
    # Capture group for sheet names when the sheet suffix is "]" (or empty):
    # everything up to the next closing bracket.
    _BRACKET_NAME_GROUP = r"([^\]]+)"

    def __init__(
        self,
        tag_prefix: Optional[str] = None,
        tag_suffix: Optional[str] = None,
        slide_prefix: Optional[str] = None,
        slide_suffix: Optional[str] = None,
        sheet_prefix: Optional[str] = None,
        sheet_suffix: Optional[str] = None,
        config: Optional[PageTagConfig] = None
    ):
        """Initialize PageTagProcessor with configuration.

        When ``config`` is given it is used as-is and the individual
        prefix/suffix arguments are ignored. Otherwise each argument that is
        None falls back to the corresponding PageTagConfig default.
        """
        if config is not None:
            self._config = config
        else:
            self._config = PageTagConfig(
                tag_prefix=tag_prefix if tag_prefix is not None else PageTagConfig.tag_prefix,
                tag_suffix=tag_suffix if tag_suffix is not None else PageTagConfig.tag_suffix,
                slide_prefix=slide_prefix if slide_prefix is not None else PageTagConfig.slide_prefix,
                slide_suffix=slide_suffix if slide_suffix is not None else PageTagConfig.slide_suffix,
                sheet_prefix=sheet_prefix if sheet_prefix is not None else PageTagConfig.sheet_prefix,
                sheet_suffix=sheet_suffix if sheet_suffix is not None else PageTagConfig.sheet_suffix,
            )

        # Compiled lazily on first access of the matching property and then
        # cached; later changes to the config object are not picked up.
        self._page_pattern: Optional[Pattern] = None
        self._slide_pattern: Optional[Pattern] = None
        self._sheet_pattern: Optional[Pattern] = None

    @property
    def config(self) -> PageTagConfig:
        """Current configuration."""
        return self._config

    @property
    def page_pattern(self) -> Pattern:
        """Compiled, case-insensitive regex matching page tags (group 1 = number)."""
        if self._page_pattern is None:
            self._page_pattern = re.compile(
                self.get_pattern_string(PageTagType.PAGE),
                re.IGNORECASE
            )
        return self._page_pattern

    @property
    def slide_pattern(self) -> Pattern:
        """Compiled, case-insensitive regex matching slide tags (group 1 = number)."""
        if self._slide_pattern is None:
            self._slide_pattern = re.compile(
                self.get_pattern_string(PageTagType.SLIDE),
                re.IGNORECASE
            )
        return self._slide_pattern

    @property
    def sheet_pattern(self) -> Pattern:
        """Compiled, case-insensitive regex matching sheet tags (group 1 = sheet name)."""
        if self._sheet_pattern is None:
            self._sheet_pattern = re.compile(
                self.get_pattern_string(PageTagType.SHEET),
                re.IGNORECASE
            )
        return self._sheet_pattern

    def _affixes(self, tag_type: PageTagType) -> Tuple[str, str]:
        """Return the configured (prefix, suffix) pair for ``tag_type``.

        Any value other than SLIDE or SHEET is treated as PAGE.
        """
        if tag_type == PageTagType.SLIDE:
            return self._config.slide_prefix, self._config.slide_suffix
        if tag_type == PageTagType.SHEET:
            return self._config.sheet_prefix, self._config.sheet_suffix
        return self._config.tag_prefix, self._config.tag_suffix

    @classmethod
    def _sheet_name_group(cls, suffix: str) -> str:
        """Return the capture group that matches a sheet name before ``suffix``."""
        if suffix in ("", "]"):
            # Default/legacy form: the name is everything up to the next "]".
            return cls._BRACKET_NAME_GROUP
        # Custom suffix: consume characters one at a time, stopping at the
        # first position where the suffix starts. This keeps a match from
        # running across several tags and allows "]" inside the name.
        # (?s:...) lets "." match newlines, as the bracket form does.
        return f"(?s:((?:(?!{re.escape(suffix)}).)+))"

    def create_tag(self, page_number: int, tag_type: PageTagType = PageTagType.PAGE) -> str:
        """
        Create a page/slide/sheet tag.

        Args:
            page_number: Page, slide, or sheet number
            tag_type: Type of tag (PAGE, SLIDE, SHEET)

        Returns:
            Formatted tag string

        Example:
            >>> processor = PageTagProcessor()
            >>> processor.create_tag(1)
            '[Page Number: 1]'
            >>> processor.create_tag(1, PageTagType.SLIDE)
            '[Slide Number: 1]'
        """
        prefix, suffix = self._affixes(tag_type)
        return f"{prefix}{page_number}{suffix}"

    def create_page_tag(self, page_number: int) -> str:
        """
        Create a page tag (convenience method).

        Args:
            page_number: Page number

        Returns:
            Formatted page tag string
        """
        return self.create_tag(page_number, PageTagType.PAGE)

    def create_slide_tag(self, slide_number: int) -> str:
        """
        Create a slide tag (convenience method).

        Args:
            slide_number: Slide number

        Returns:
            Formatted slide tag string
        """
        return self.create_tag(slide_number, PageTagType.SLIDE)

    def create_sheet_tag(self, sheet_name: str) -> str:
        """
        Create a sheet tag with name.

        Args:
            sheet_name: Sheet name

        Returns:
            Formatted sheet tag string
        """
        prefix, suffix = self._affixes(PageTagType.SHEET)
        return f"{prefix}{sheet_name}{suffix}"

    def find_page_numbers(self, text: str) -> List[Tuple[int, int, int]]:
        """
        Find all page numbers in text.

        Args:
            text: Text to search

        Returns:
            List of tuples: (page_number, start_pos, end_pos), in order of
            appearance; start/end are the offsets of the whole tag in ``text``
        """
        return [
            (int(match.group(1)), match.start(), match.end())
            for match in self.page_pattern.finditer(text)
        ]

    def find_slide_numbers(self, text: str) -> List[Tuple[int, int, int]]:
        """
        Find all slide numbers in text.

        Args:
            text: Text to search

        Returns:
            List of tuples: (slide_number, start_pos, end_pos), in order of
            appearance; start/end are the offsets of the whole tag in ``text``
        """
        return [
            (int(match.group(1)), match.start(), match.end())
            for match in self.slide_pattern.finditer(text)
        ]

    def has_page_markers(self, text: str) -> bool:
        """
        Check if text contains page markers.

        Args:
            text: Text to check

        Returns:
            True if page markers found
        """
        return self.page_pattern.search(text) is not None

    def has_slide_markers(self, text: str) -> bool:
        """
        Check if text contains slide markers.

        Args:
            text: Text to check

        Returns:
            True if slide markers found
        """
        return self.slide_pattern.search(text) is not None

    def get_pattern_string(self, tag_type: PageTagType = PageTagType.PAGE) -> str:
        """
        Get the regex pattern string for the specified tag type.

        The prefix and suffix are regex-escaped. Group 1 captures the number
        for PAGE and SLIDE tags and the sheet name for SHEET tags, so the
        returned string matches what create_tag / create_sheet_tag produce.
        The string carries no case-insensitivity flag; the ``*_pattern``
        properties add re.IGNORECASE when compiling it.

        Args:
            tag_type: Type of tag

        Returns:
            Regex pattern string
        """
        prefix, suffix = self._affixes(tag_type)
        if tag_type == PageTagType.SHEET:
            group = self._sheet_name_group(suffix)
        else:
            group = self._NUMBER_GROUP
        return f"{re.escape(prefix)}{group}{re.escape(suffix)}"

    def remove_page_markers(self, text: str) -> str:
        """
        Remove all page and slide markers from text.

        Sheet markers are left in place.

        Args:
            text: Text with page markers

        Returns:
            Text with page and slide markers removed
        """
        text = self.page_pattern.sub('', text)
        text = self.slide_pattern.sub('', text)
        return text

    def __repr__(self) -> str:
        # Only the page prefix/suffix are shown; slide and sheet settings are
        # available through the config property.
        return (
            f"PageTagProcessor(tag_prefix={self._config.tag_prefix!r}, "
            f"tag_suffix={self._config.tag_suffix!r})"
        )
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
# Default instance for convenience
|
|
348
|
+
_default_processor: Optional[PageTagProcessor] = None
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def get_default_page_tag_processor() -> PageTagProcessor:
    """Return the shared module-level PageTagProcessor, creating it on first use.

    The instance is built with default settings and stored in the module
    global ``_default_processor`` so later calls return the same object.
    """
    global _default_processor
    if _default_processor is not None:
        return _default_processor
    _default_processor = PageTagProcessor()
    return _default_processor
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def create_page_tag(page_number: int) -> str:
    """
    Build a page tag with the shared default processor.

    Args:
        page_number: Page number to embed in the tag

    Returns:
        The formatted page tag string
    """
    processor = get_default_page_tag_processor()
    return processor.create_page_tag(page_number)
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def create_slide_tag(slide_number: int) -> str:
    """
    Build a slide tag with the shared default processor.

    Args:
        slide_number: Slide number to embed in the tag

    Returns:
        The formatted slide tag string
    """
    processor = get_default_page_tag_processor()
    return processor.create_slide_tag(slide_number)
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
__all__ = [
|
|
386
|
+
"PageTagType",
|
|
387
|
+
"PageTagConfig",
|
|
388
|
+
"PageTagProcessor",
|
|
389
|
+
"get_default_page_tag_processor",
|
|
390
|
+
"create_page_tag",
|
|
391
|
+
"create_slide_tag",
|
|
392
|
+
]
|
|
393
|
+
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/preprocessor.py
|
|
2
|
+
"""
|
|
3
|
+
BasePreprocessor - Abstract base class for data preprocessing
|
|
4
|
+
|
|
5
|
+
Defines the interface for preprocessing data after file conversion.
|
|
6
|
+
Used when converted data needs special handling before content extraction.
|
|
7
|
+
|
|
8
|
+
The preprocessor's job is to:
|
|
9
|
+
1. Clean/normalize converted data
|
|
10
|
+
2. Extract embedded resources (images, etc.)
|
|
11
|
+
3. Detect encoding information
|
|
12
|
+
4. Return preprocessed data ready for further processing
|
|
13
|
+
|
|
14
|
+
Processing Pipeline Position:
|
|
15
|
+
1. FileConverter.convert() → Format-specific object
|
|
16
|
+
2. Preprocessor.preprocess() → Cleaned/processed data (THIS STEP)
|
|
17
|
+
3. MetadataExtractor.extract() → Metadata
|
|
18
|
+
4. Content extraction
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
class PDFPreprocessor(BasePreprocessor):
|
|
22
|
+
def preprocess(self, converted_data: Any, **kwargs) -> PreprocessedData:
|
|
23
|
+
# Process the fitz.Document, normalize pages, etc.
|
|
24
|
+
return PreprocessedData(
|
|
25
|
+
clean_content=b"",
|
|
26
|
+
encoding="utf-8",
|
|
27
|
+
extracted_resources={"document": converted_data}
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
def get_format_name(self) -> str:
|
|
31
|
+
return "PDF Preprocessor"
|
|
32
|
+
"""
|
|
33
|
+
from abc import ABC, abstractmethod
|
|
34
|
+
from dataclasses import dataclass, field
|
|
35
|
+
from typing import Any, Dict
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class PreprocessedData:
    """
    Container returned by a preprocessor.

    Bundles the processed content with whatever was pulled out of the input
    along the way.

    Attributes:
        raw_content: Original input data (kept for reference)
        clean_content: Processed content ready for use - THIS IS THE TRUE SOURCE.
            Can be any type: bytes, str, Document, Workbook, OleFileIO, etc.
        encoding: Detected or default encoding (for text-based content)
        extracted_resources: Dict of extracted resources (images, etc.)
        metadata: Any metadata discovered during preprocessing
    """

    # Input as received, before any processing.
    raw_content: Any = None
    # TRUE SOURCE - the processed result that later stages should consume.
    clean_content: Any = None
    encoding: str = "utf-8"
    # default_factory gives every instance its own dict.
    extracted_resources: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class BasePreprocessor(ABC):
    """
    Abstract base class for data preprocessors.

    A preprocessor receives the output of FileConverter.convert() and
    normalizes or otherwise prepares it before content extraction.

    Processing Pipeline:
        1. FileConverter.convert() → Format-specific object
        2. Preprocessor.preprocess() → Cleaned/processed data (THIS STEP)
        3. MetadataExtractor.extract() → Metadata
        4. Content extraction

    Subclasses must implement:
        - preprocess(): Process converted data and return PreprocessedData
        - get_format_name(): Return human-readable format name
    """

    @abstractmethod
    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Preprocess converted data.

        Args:
            converted_data: Data from FileConverter.convert()
                (format-specific object, bytes, or other type)
            **kwargs: Additional format-specific options

        Returns:
            PreprocessedData containing cleaned content and extracted resources

        Raises:
            PreprocessingError: If preprocessing fails
                (NOTE: this exception type is not defined in this module)
        """

    @abstractmethod
    def get_format_name(self) -> str:
        """
        Return human-readable format name.

        Returns:
            Format name string (e.g., "PDF Preprocessor")
        """

    def validate(self, data: Any) -> bool:
        """
        Report whether this preprocessor can handle ``data``.

        The base implementation accepts everything; override it to add
        format-specific validation.

        Args:
            data: Data to validate (converted data or raw bytes)

        Returns:
            True if data can be preprocessed, False otherwise
        """
        del data  # Intentionally unused in the default implementation
        return True
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class NullPreprocessor(BasePreprocessor):
    """
    Pass-through preprocessor used as the default when no preprocessing is needed.

    The input is returned untouched: both raw_content and clean_content of the
    result refer to the object that was passed in.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """Wrap ``converted_data`` unchanged in a PreprocessedData.

        Args:
            converted_data: Data to pass through
            **kwargs: Only "encoding" is read (default "utf-8"); other
                options are ignored

        Returns:
            PreprocessedData whose raw_content and clean_content are both
            ``converted_data``
        """
        # clean_content is the TRUE SOURCE; for a pass-through it is simply
        # the input itself.
        return PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding=kwargs.get("encoding", "utf-8"),
        )

    def get_format_name(self) -> str:
        """Return the human-readable name of this preprocessor."""
        return "Null Preprocessor (pass-through)"
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
__all__ = [
|
|
158
|
+
'BasePreprocessor',
|
|
159
|
+
'NullPreprocessor',
|
|
160
|
+
'PreprocessedData',
|
|
161
|
+
]
|
|
162
|
+
|