xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,393 @@
1
+ # xgen_doc2chunk/core/functions/page_tag_processor.py
2
+ """
3
+ Page Tag Processor Module
4
+
5
+ Provides functionality for generating and parsing page/slide/sheet markers in extracted text.
6
+ This module standardizes page numbering format across all document handlers.
7
+
8
+ === Architecture Overview ===
9
+
10
+ 1. Creation:
11
+ - PageTagProcessor instance is created when DocumentProcessor is initialized.
12
+ - Created via DocumentProcessor.__init__() calling _create_page_tag_processor() method.
13
+
14
+ 2. Propagation:
15
+ - The created PageTagProcessor is passed to ALL handlers.
16
+ - In DocumentProcessor._get_handler_registry(), each handler is created with
17
+ page_tag_processor=self._page_tag_processor parameter.
18
+ - Even handlers that don't use page tags receive it for consistency.
19
+
20
+ 3. Access from Handlers:
21
+ - Each Handler inherits from BaseHandler and can access via self.page_tag_processor.
22
+ - Convenience methods: self.create_page_tag(n), self.create_slide_tag(n), self.create_sheet_tag(name)
23
+
24
+ 4. Components:
25
+ - PageTagConfig: Dataclass holding tag prefix/suffix settings
26
+ - PageTagProcessor: Main class for tag generation and parsing
27
+ - PageTagType: Enum distinguishing PAGE, SLIDE, SHEET types
28
+
29
+ === Usage Examples ===
30
+
31
+ # Custom settings at DocumentProcessor level
32
+ from xgen_doc2chunk.core.document_processor import DocumentProcessor
33
+
34
+ processor = DocumentProcessor(
35
+ page_tag_prefix="<page>",
36
+ page_tag_suffix="</page>",
37
+ slide_tag_prefix="<slide>",
38
+ slide_tag_suffix="</slide>"
39
+ )
40
+
41
+ # Usage inside Handler (BaseHandler subclass)
42
+ class MyHandler(BaseHandler):
43
+ def extract_text(self, ...):
44
+ tag = self.create_page_tag(1) # "[Page Number: 1]" or custom format
45
+ slide_tag = self.create_slide_tag(1) # "[Slide Number: 1]"
46
+ sheet_tag = self.create_sheet_tag("Sheet1") # "[Sheet: Sheet1]"
47
+
48
+ === Default Tag Formats ===
49
+
50
+ - Page: [Page Number: 1]
51
+ - Slide: [Slide Number: 1]
52
+ - Sheet: [Sheet: Sheet1]
53
+
54
+ === Supported Handlers ===
55
+
56
+ - PDFHandler: Uses create_page_tag()
57
+ - DOCXHandler: Uses create_page_tag()
58
+ - DOCHandler: Uses create_page_tag()
59
+ - PPTHandler: Uses create_slide_tag()
60
+ - ExcelHandler: Uses create_sheet_tag()
61
+ - HWPHandler, HWPXHandler, CSVHandler, TextHandler: Propagated but not used
62
+
63
+ """
64
+ import logging
65
+ import re
66
+ from dataclasses import dataclass
67
+ from enum import Enum
68
+ from typing import List, Optional, Pattern, Tuple
69
+
70
+ logger = logging.getLogger("document-processor")
71
+
72
+
73
class PageTagType(Enum):
    """Kind of position marker, one member per family of document formats."""

    # Paginated documents: PDF, DOCX, DOC, HWP
    PAGE = "page"
    # Presentations: PPT, PPTX
    SLIDE = "slide"
    # Spreadsheets: Excel (XLSX, XLS)
    SHEET = "sheet"
78
+
79
+
80
@dataclass
class PageTagConfig:
    """
    Prefix/suffix strings used to build and recognise position tags.

    A tag is always rendered as ``prefix + value + suffix``.

    Attributes:
        tag_prefix: Text placed before a page number (e.g., "[Page Number: ")
        tag_suffix: Text placed after a page number (e.g., "]")
        slide_prefix: Text placed before a slide number (e.g., "[Slide Number: ")
        slide_suffix: Text placed after a slide number (e.g., "]")
        sheet_prefix: Text placed before a sheet name (e.g., "[Sheet: ")
        sheet_suffix: Text placed after a sheet name (e.g., "]")
    """
    # Page tags
    tag_prefix: str = "[Page Number: "
    tag_suffix: str = "]"
    # Slide tags
    slide_prefix: str = "[Slide Number: "
    slide_suffix: str = "]"
    # Sheet tags
    sheet_prefix: str = "[Sheet: "
    sheet_suffix: str = "]"
99
+
100
+
101
class PageTagProcessor:
    """
    Page Tag Processor Class

    Generates and parses page/slide/sheet markers for document text extraction.
    Every tag is rendered as ``prefix + value + suffix`` using the strings held
    in a PageTagConfig.

    The matching regexes are compiled on first access and then cached on the
    instance, so a change made to the config after a pattern has been used is
    not reflected in that cached pattern.

    Args:
        tag_prefix: Page tag prefix (default: "[Page Number: ")
        tag_suffix: Page tag suffix (default: "]")
        slide_prefix: Slide tag prefix (default: "[Slide Number: ")
        slide_suffix: Slide tag suffix (default: "]")
        sheet_prefix: Sheet tag prefix (default: "[Sheet: ")
        sheet_suffix: Sheet tag suffix (default: "]")
        config: PageTagConfig instance. When given it is used as-is and the
            individual prefix/suffix arguments are ignored.
    """

    # Capture group for page and slide tags, which carry a number.
    _NUMBER_GROUP = r'(\d+)'
    # Capture group for sheet tags, which carry a name: everything up to the
    # next "]" character.
    _SHEET_NAME_GROUP = r'([^\]]+)'

    def __init__(
        self,
        tag_prefix: Optional[str] = None,
        tag_suffix: Optional[str] = None,
        slide_prefix: Optional[str] = None,
        slide_suffix: Optional[str] = None,
        sheet_prefix: Optional[str] = None,
        sheet_suffix: Optional[str] = None,
        config: Optional[PageTagConfig] = None
    ):
        """Initialize PageTagProcessor with configuration."""
        if config is not None:
            self._config = config
        else:
            # Any argument left as None falls back to the PageTagConfig default.
            self._config = PageTagConfig(
                tag_prefix=tag_prefix if tag_prefix is not None else PageTagConfig.tag_prefix,
                tag_suffix=tag_suffix if tag_suffix is not None else PageTagConfig.tag_suffix,
                slide_prefix=slide_prefix if slide_prefix is not None else PageTagConfig.slide_prefix,
                slide_suffix=slide_suffix if slide_suffix is not None else PageTagConfig.slide_suffix,
                sheet_prefix=sheet_prefix if sheet_prefix is not None else PageTagConfig.sheet_prefix,
                sheet_suffix=sheet_suffix if sheet_suffix is not None else PageTagConfig.sheet_suffix,
            )

        # Regex cache: filled lazily by the *_pattern properties.
        self._page_pattern: Optional[Pattern] = None
        self._slide_pattern: Optional[Pattern] = None
        self._sheet_pattern: Optional[Pattern] = None

    def _affixes(self, tag_type: PageTagType) -> Tuple[str, str]:
        """Return the configured (prefix, suffix) pair for a tag type."""
        if tag_type == PageTagType.SLIDE:
            return self._config.slide_prefix, self._config.slide_suffix
        if tag_type == PageTagType.SHEET:
            return self._config.sheet_prefix, self._config.sheet_suffix
        return self._config.tag_prefix, self._config.tag_suffix

    def _build_pattern_string(self, tag_type: PageTagType) -> str:
        """Build the regex source for a tag type (single source of truth)."""
        prefix, suffix = self._affixes(tag_type)
        # Sheet tags hold a name, page/slide tags hold a number.
        if tag_type == PageTagType.SHEET:
            group = self._SHEET_NAME_GROUP
        else:
            group = self._NUMBER_GROUP
        return f'{re.escape(prefix)}{group}{re.escape(suffix)}'

    @property
    def config(self) -> PageTagConfig:
        """Current configuration."""
        return self._config

    @property
    def page_pattern(self) -> Pattern:
        """Compiled, case-insensitive regex matching page tags (group 1 = number)."""
        if self._page_pattern is None:
            self._page_pattern = re.compile(
                self._build_pattern_string(PageTagType.PAGE),
                re.IGNORECASE
            )
        return self._page_pattern

    @property
    def slide_pattern(self) -> Pattern:
        """Compiled, case-insensitive regex matching slide tags (group 1 = number)."""
        if self._slide_pattern is None:
            self._slide_pattern = re.compile(
                self._build_pattern_string(PageTagType.SLIDE),
                re.IGNORECASE
            )
        return self._slide_pattern

    @property
    def sheet_pattern(self) -> Pattern:
        """Compiled, case-insensitive regex matching sheet tags (group 1 = name)."""
        if self._sheet_pattern is None:
            self._sheet_pattern = re.compile(
                self._build_pattern_string(PageTagType.SHEET),
                re.IGNORECASE
            )
        return self._sheet_pattern

    def create_tag(self, page_number: int, tag_type: PageTagType = PageTagType.PAGE) -> str:
        """
        Create a page/slide/sheet tag.

        Args:
            page_number: Page, slide, or sheet number
            tag_type: Type of tag (PAGE, SLIDE, SHEET)

        Returns:
            Formatted tag string

        Example:
            >>> processor = PageTagProcessor()
            >>> processor.create_tag(1)
            '[Page Number: 1]'
            >>> processor.create_tag(1, PageTagType.SLIDE)
            '[Slide Number: 1]'
        """
        prefix, suffix = self._affixes(tag_type)
        return f"{prefix}{page_number}{suffix}"

    def create_page_tag(self, page_number: int) -> str:
        """
        Create a page tag (convenience method).

        Args:
            page_number: Page number

        Returns:
            Formatted page tag string
        """
        return self.create_tag(page_number, PageTagType.PAGE)

    def create_slide_tag(self, slide_number: int) -> str:
        """
        Create a slide tag (convenience method).

        Args:
            slide_number: Slide number

        Returns:
            Formatted slide tag string
        """
        return self.create_tag(slide_number, PageTagType.SLIDE)

    def create_sheet_tag(self, sheet_name: str) -> str:
        """
        Create a sheet tag with name.

        Args:
            sheet_name: Sheet name

        Returns:
            Formatted sheet tag string
        """
        return f"{self._config.sheet_prefix}{sheet_name}{self._config.sheet_suffix}"

    def find_page_numbers(self, text: str) -> List[Tuple[int, int, int]]:
        """
        Find all page numbers in text.

        Args:
            text: Text to search

        Returns:
            List of tuples: (page_number, start_pos, end_pos), in order of
            appearance. Empty list when no page tag is present.
        """
        return [
            (int(match.group(1)), match.start(), match.end())
            for match in self.page_pattern.finditer(text)
        ]

    def find_slide_numbers(self, text: str) -> List[Tuple[int, int, int]]:
        """
        Find all slide numbers in text.

        Args:
            text: Text to search

        Returns:
            List of tuples: (slide_number, start_pos, end_pos), in order of
            appearance. Empty list when no slide tag is present.
        """
        return [
            (int(match.group(1)), match.start(), match.end())
            for match in self.slide_pattern.finditer(text)
        ]

    def has_page_markers(self, text: str) -> bool:
        """
        Check if text contains page markers.

        Args:
            text: Text to check

        Returns:
            True if page markers found
        """
        return bool(self.page_pattern.search(text))

    def has_slide_markers(self, text: str) -> bool:
        """
        Check if text contains slide markers.

        Args:
            text: Text to check

        Returns:
            True if slide markers found
        """
        return bool(self.slide_pattern.search(text))

    def get_pattern_string(self, tag_type: PageTagType = PageTagType.PAGE) -> str:
        """
        Get the regex pattern string for the specified tag type.

        The returned string is the same source the matching ``*_pattern``
        property compiles: PAGE and SLIDE capture a number in group 1, SHEET
        captures the sheet name in group 1 (so it matches tags produced by
        create_sheet_tag as well as numeric sheet tags from create_tag).

        Args:
            tag_type: Type of tag

        Returns:
            Regex pattern string
        """
        return self._build_pattern_string(tag_type)

    def remove_page_markers(self, text: str) -> str:
        """
        Remove all page and slide markers from text.

        Sheet tags are not touched by this method.

        Args:
            text: Text with page markers

        Returns:
            Text with page and slide markers removed
        """
        text = self.page_pattern.sub('', text)
        text = self.slide_pattern.sub('', text)
        return text

    def __repr__(self) -> str:
        return (
            f"PageTagProcessor(tag_prefix={self._config.tag_prefix!r}, "
            f"tag_suffix={self._config.tag_suffix!r})"
        )
345
+
346
+
347
+ # Default instance for convenience
348
+ _default_processor: Optional[PageTagProcessor] = None
349
+
350
+
351
def get_default_page_tag_processor() -> PageTagProcessor:
    """Return the shared module-level PageTagProcessor, creating it on first use."""
    global _default_processor
    if _default_processor is not None:
        return _default_processor
    # First call: build the instance with default settings and keep it.
    _default_processor = PageTagProcessor()
    return _default_processor
357
+
358
+
359
def create_page_tag(page_number: int) -> str:
    """
    Build a page tag with the module's default processor.

    Args:
        page_number: Page number

    Returns:
        Formatted page tag string
    """
    default_processor = get_default_page_tag_processor()
    return default_processor.create_page_tag(page_number)
370
+
371
+
372
def create_slide_tag(slide_number: int) -> str:
    """
    Build a slide tag with the module's default processor.

    Args:
        slide_number: Slide number

    Returns:
        Formatted slide tag string
    """
    default_processor = get_default_page_tag_processor()
    return default_processor.create_slide_tag(slide_number)
383
+
384
+
385
+ __all__ = [
386
+ "PageTagType",
387
+ "PageTagConfig",
388
+ "PageTagProcessor",
389
+ "get_default_page_tag_processor",
390
+ "create_page_tag",
391
+ "create_slide_tag",
392
+ ]
393
+
@@ -0,0 +1,162 @@
1
+ # xgen_doc2chunk/core/functions/preprocessor.py
2
+ """
3
+ BasePreprocessor - Abstract base class for data preprocessing
4
+
5
+ Defines the interface for preprocessing data after file conversion.
6
+ Used when converted data needs special handling before content extraction.
7
+
8
+ The preprocessor's job is to:
9
+ 1. Clean/normalize converted data
10
+ 2. Extract embedded resources (images, etc.)
11
+ 3. Detect encoding information
12
+ 4. Return preprocessed data ready for further processing
13
+
14
+ Processing Pipeline Position:
15
+ 1. FileConverter.convert() -> Format-specific object
16
+ 2. Preprocessor.preprocess() -> Cleaned/processed data (THIS STEP)
17
+ 3. MetadataExtractor.extract() -> Metadata
18
+ 4. Content extraction
19
+
20
+ Usage:
21
+ class PDFPreprocessor(BasePreprocessor):
22
+ def preprocess(self, converted_data: Any, **kwargs) -> PreprocessedData:
23
+ # Process the fitz.Document, normalize pages, etc.
24
+ return PreprocessedData(
25
+ clean_content=b"",
26
+ encoding="utf-8",
27
+ extracted_resources={"document": converted_data}
28
+ )
29
+
30
+ def get_format_name(self) -> str:
31
+ return "PDF Preprocessor"
32
+ """
33
+ from abc import ABC, abstractmethod
34
+ from dataclasses import dataclass, field
35
+ from typing import Any, Dict
36
+
37
+
38
@dataclass
class PreprocessedData:
    """
    Container returned by a preprocessing step.

    Holds the processed content together with anything extracted along the way.

    Attributes:
        raw_content: Original input data (kept for reference)
        clean_content: Processed content ready for use - THIS IS THE TRUE SOURCE.
            May be any type: bytes, str, Document, Workbook, OleFileIO, etc.
        encoding: Detected or default encoding (for text-based content)
        extracted_resources: Dict of extracted resources (images, etc.)
        metadata: Any metadata discovered during preprocessing
    """
    raw_content: Any = None
    # TRUE SOURCE - the processed result
    clean_content: Any = None
    encoding: str = "utf-8"
    # default_factory gives every instance its own dict
    extracted_resources: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
58
+
59
+
60
class BasePreprocessor(ABC):
    """
    Interface for preprocessors that run on the output of FileConverter.convert().

    A preprocessor is used when converted data needs normalization or other
    special handling before content extraction.

    Processing Pipeline:
        1. FileConverter.convert() -> Format-specific object
        2. Preprocessor.preprocess() -> Cleaned/processed data (THIS STEP)
        3. MetadataExtractor.extract() -> Metadata
        4. Content extraction

    Subclasses must implement:
        - preprocess(): Process converted data and return PreprocessedData
        - get_format_name(): Return human-readable format name
    """

    @abstractmethod
    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """
        Turn converted data into a PreprocessedData result.

        Args:
            converted_data: Data from FileConverter.convert()
                (format-specific object, bytes, or other type)
            **kwargs: Additional format-specific options

        Returns:
            PreprocessedData containing cleaned content and extracted resources

        Raises:
            PreprocessingError: If preprocessing fails (the exception type is
                not defined in this module; see the concrete implementations)
        """
        ...

    @abstractmethod
    def get_format_name(self) -> str:
        """
        Give the human-readable name of this preprocessor's format.

        Returns:
            Format name string (e.g., "PDF Preprocessor")
        """
        ...

    def validate(self, data: Any) -> bool:
        """
        Tell whether this preprocessor can handle the given data.

        The base implementation accepts everything; override it to add
        format-specific validation.

        Args:
            data: Data to validate (converted data or raw bytes)

        Returns:
            True if data can be preprocessed, False otherwise
        """
        del data  # Unused here; kept in the signature for overriding subclasses
        return True
126
+
127
+
128
class NullPreprocessor(BasePreprocessor):
    """
    Pass-through preprocessor: the data comes out exactly as it went in.

    Used as the default when no preprocessing is needed. clean_content still
    holds the processed result, which for a pass-through is the input itself.
    """

    def preprocess(
        self,
        converted_data: Any,
        **kwargs
    ) -> PreprocessedData:
        """Wrap converted_data unchanged; an "encoding" kwarg overrides "utf-8"."""
        # clean_content is ALWAYS the True Source; here it is simply the input,
        # which is also recorded as raw_content.
        result = PreprocessedData(
            raw_content=converted_data,
            clean_content=converted_data,
            encoding=kwargs.get("encoding", "utf-8"),
        )
        return result

    def get_format_name(self) -> str:
        """Return the display name of this preprocessor."""
        return "Null Preprocessor (pass-through)"
155
+
156
+
157
+ __all__ = [
158
+ 'BasePreprocessor',
159
+ 'NullPreprocessor',
160
+ 'PreprocessedData',
161
+ ]
162
+