xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,1307 @@
1
+ # xgen_doc2chunk/core/document_processor.py
2
+ """DocumentProcessor - Document Processing Class
3
+
4
+ Main document processing class for the xgen_doc2chunk library.
5
+ Provides a unified interface for extracting text from various document formats
6
+ (PDF, DOCX, PPT, Excel, HWP, etc.) and performing text chunking.
7
+
8
+ This class is the recommended entry point when using the library.
9
+
10
+ Usage Example:
11
+ from xgen_doc2chunk.core.document_processor import DocumentProcessor
12
+ from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR
13
+
14
+ # Create instance (with optional OCR engine)
15
+ ocr_engine = OpenAIOCR(api_key="sk-...", model="gpt-4o")
16
+ processor = DocumentProcessor(ocr_engine=ocr_engine)
17
+
18
+ # Extract text from file
19
+ text = processor.extract_text(file_path, file_extension)
20
+
21
+ # Extract text with OCR processing
22
+ text = processor.extract_text(file_path, file_extension, ocr_processing=True)
23
+
24
+ # Chunk text
25
+ chunks = processor.chunk_text(text, chunk_size=1000)
26
+ """
27
+
28
+ import io
29
+ import logging
30
+ import os
31
+ from pathlib import Path
32
+ from typing import Any, Callable, Dict, List, Optional, Union, TypedDict
33
+
34
+ logger = logging.getLogger("xgen_doc2chunk")
35
+
36
+
37
class CurrentFile(TypedDict, total=False):
    """
    Description of one input file, carried at binary level.

    This is the standard structure used to read a file as bytes and pass it
    on to the handlers. Working from the in-memory data resolves file system
    issues such as non-ASCII (Korean, etc.) paths.

    The type is declared with ``total=False``, so no key is required.

    Keys:
        file_path: Absolute path of the original file
        file_name: File name (including extension)
        file_extension: File extension (lowercase, without dot)
        file_data: Binary data of the file
        file_stream: BytesIO stream (reusable)
        file_size: File size in bytes
    """
    # Absolute path of the original file.
    file_path: str
    # File name, extension included.
    file_name: str
    # Extension in lowercase, without the leading dot.
    file_extension: str
    # Raw binary content of the file.
    file_data: bytes
    # Reusable in-memory stream over the file content.
    file_stream: io.BytesIO
    # Size of the file in bytes.
    file_size: int
58
+
59
+
60
class ChunkResult:
    """
    Holds the chunks produced for one document and can write them to disk.

    The constructor accepts either plain text chunks or chunk dictionaries
    carrying position metadata; in both cases ``chunks`` exposes the plain
    text. Instances also support ``len()``, iteration and indexing over the
    chunk texts.

    Attributes:
        chunks: List of text chunks
        chunks_with_metadata: List of chunk dictionaries with position
            metadata, or None when plain text chunks were supplied
        source_file: Original source file path (if available)
        has_metadata: Whether position metadata is available

    Example:
        >>> result = processor.extract_chunks("document.pdf")
        >>> print(len(result.chunks))
        >>> result.save_to_md("output/chunks")
        >>>
        >>> # Access position metadata (if available)
        >>> if result.has_metadata:
        ...     for chunk_data in result.chunks_with_metadata:
        ...         print(f"Page {chunk_data['page_number']}: {chunk_data['text'][:50]}")
    """

    def __init__(
        self,
        chunks: Union[List[str], List[Dict[str, Any]]],
        source_file: Optional[str] = None
    ):
        """
        Store the chunks and work out whether they carry metadata.

        Args:
            chunks: List of text chunks, or list of chunk dictionaries that
                each hold the chunk text under the 'text' key
            source_file: Original source file path
        """
        self._source_file = source_file

        # Only the first element is inspected to decide which form was given.
        carries_metadata = (
            bool(chunks)
            and isinstance(chunks[0], dict)
            and 'text' in chunks[0]
        )

        if not carries_metadata:
            # Plain text chunks; a falsy input (None, empty list) becomes [].
            self._chunks = chunks if chunks else []
            self._chunks_with_metadata = None
            self._has_metadata = False
            return

        self._chunks_with_metadata = chunks
        self._chunks = [entry['text'] for entry in chunks]
        self._has_metadata = True

    @property
    def chunks(self) -> List[str]:
        """Text of every chunk, in order."""
        return self._chunks

    @property
    def chunks_with_metadata(self) -> Optional[List[Dict[str, Any]]]:
        """
        Chunk dictionaries with position metadata, when they were supplied.

        Each chunk dictionary contains:
            - text: Chunk text content
            - page_number: Page number where chunk starts
            - line_start: Starting line number
            - line_end: Ending line number
            - global_start: Global character start position
            - global_end: Global character end position
            - chunk_index: Index of this chunk

        Returns:
            The list of chunk dictionaries, or None when the result was
            built from plain text chunks
        """
        return self._chunks_with_metadata

    @property
    def has_metadata(self) -> bool:
        """True when the chunks were supplied with position metadata."""
        return self._has_metadata

    @property
    def source_file(self) -> Optional[str]:
        """Path of the file the chunks came from, if it was given."""
        return self._source_file

    def save_to_md(
        self,
        path: Optional[Union[str, Path]] = None,
        *,
        filename: str = "chunks.md",
        separator: str = "---",
        include_metadata: bool = True
    ) -> str:
        """
        Write all chunks into one markdown file, divided by a separator.

        An existing file is never overwritten: when the target already
        exists, ``_1``, ``_2``, ... is appended to the file stem until an
        unused name is found.

        Args:
            path: File path or directory to save (default: current directory)
                - If path ends with .md, it is used as the file path
                - Otherwise it is treated as a directory (created if
                  missing) and combined with the filename parameter
            filename: Filename to use when path is a directory (default: "chunks.md")
            separator: Separator string between chunks (default: "---")
            include_metadata: Whether to write the metadata header
                (total chunk count and, if known, the source file)

        Returns:
            Path of the file that was written, as a string

        Example:
            >>> result = processor.extract_chunks("document.pdf")
            >>> saved_path = result.save_to_md()
            >>> # Creates: ./chunks.md

            >>> result.save_to_md("output/my_chunks.md")
            >>> # Creates: output/my_chunks.md

            >>> result.save_to_md("output/", filename="document_chunks.md")
            >>> # Creates: output/document_chunks.md
        """
        # Work out where the file goes.
        if path is None:
            target = Path.cwd() / filename
        else:
            given = Path(path)
            if given.suffix.lower() != ".md":
                # Anything not ending in .md is taken to be a directory.
                given.mkdir(parents=True, exist_ok=True)
                target = given / filename
            else:
                target = given

        target.parent.mkdir(parents=True, exist_ok=True)

        # Pick an unused name instead of overwriting an existing file.
        stem, extension, folder = target.stem, target.suffix, target.parent
        attempt = 0
        while target.exists():
            attempt += 1
            target = folder / f"{stem}_{attempt}{extension}"

        count = len(self._chunks)
        lines = []

        # Optional header block.
        if include_metadata:
            lines.extend(("---", f"total_chunks: {count}"))
            if self._source_file:
                lines.append(f"source_file: {self._source_file}")
            lines.extend(("---", ""))

        # One section per chunk; the separator goes between sections only.
        for position, body in enumerate(self._chunks, start=1):
            lines.extend((f"## Chunk {position}/{count}", "", body, ""))
            if position < count:
                lines.extend((separator, ""))

        rendered = "\n".join(lines)
        # Lone surrogates cannot be written as UTF-8; this round trip turns
        # them into replacement characters so write_text does not fail.
        rendered = rendered.encode('utf-8', errors='surrogatepass').decode('utf-8', errors='replace')
        target.write_text(rendered, encoding="utf-8")

        logger.info(f"Saved {count} chunks to {target}")
        return str(target)

    def __len__(self) -> int:
        """Number of chunks held."""
        return len(self._chunks)

    def __iter__(self):
        """Iterate over the chunk texts."""
        return iter(self._chunks)

    def __getitem__(self, index: int) -> str:
        """Chunk text at the given index."""
        return self._chunks[index]

    def __repr__(self) -> str:
        return f"ChunkResult(chunks={len(self._chunks)}, source_file={self._source_file!r})"

    def __str__(self) -> str:
        return f"ChunkResult with {len(self._chunks)} chunks"
250
+
251
+
252
+ class DocumentProcessor:
253
+ """
254
+ xgen_doc2chunk Main Document Processing Class
255
+
256
+ A unified interface for processing various document formats and extracting text.
257
+
258
+ Attributes:
259
+ config: Configuration dictionary or ConfigComposer instance
260
+ supported_extensions: List of supported file extensions
261
+
262
+ Example:
263
+ >>> processor = DocumentProcessor()
264
+ >>> text = processor.extract_text("document.pdf", "pdf")
265
+ >>> chunks = processor.chunk_text(text, chunk_size=1000)
266
+ """
267
+
268
+ # === Supported File Type Classifications ===
269
+ DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'rtf', 'pptx', 'ppt', 'hwp', 'hwpx'])
270
+ TEXT_TYPES = frozenset(['txt', 'md', 'markdown'])
271
+ CODE_TYPES = frozenset([
272
+ 'py', 'js', 'ts', 'java', 'cpp', 'c', 'h', 'cs', 'go', 'rs',
273
+ 'php', 'rb', 'swift', 'kt', 'scala', 'dart', 'r', 'sql',
274
+ 'html', 'css', 'jsx', 'tsx', 'vue', 'svelte'
275
+ ])
276
+ CONFIG_TYPES = frozenset(['json', 'yaml', 'yml', 'xml', 'toml', 'ini', 'cfg', 'conf', 'properties', 'env'])
277
+ DATA_TYPES = frozenset(['csv', 'tsv', 'xlsx', 'xls'])
278
+ SCRIPT_TYPES = frozenset(['sh', 'bat', 'ps1', 'zsh', 'fish'])
279
+ LOG_TYPES = frozenset(['log'])
280
+ WEB_TYPES = frozenset(['htm', 'xhtml'])
281
+ IMAGE_TYPES = frozenset(['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp'])
282
+
283
def __init__(
    self,
    config: Optional[Union[Dict[str, Any], Any]] = None,
    ocr_engine: Optional[Any] = None,
    *,
    image_directory: Optional[str] = None,
    image_tag_prefix: Optional[str] = None,
    image_tag_suffix: Optional[str] = None,
    page_tag_prefix: Optional[str] = None,
    page_tag_suffix: Optional[str] = None,
    slide_tag_prefix: Optional[str] = None,
    slide_tag_suffix: Optional[str] = None,
    chart_tag_prefix: Optional[str] = None,
    chart_tag_suffix: Optional[str] = None,
    metadata_tag_prefix: Optional[str] = None,
    metadata_tag_suffix: Optional[str] = None,
    **kwargs
):
    """
    Set up a DocumentProcessor and its per-instance helper processors.

    Builds one image processor, page tag processor, chart processor and
    metadata formatter for this instance. When the configuration is a
    dict, the four objects are also stored in it under the keys
    "image_processor", "page_tag_processor", "chart_processor" and
    "metadata_formatter" so handlers can reach them.

    Args:
        config: Configuration dictionary or ConfigComposer instance
            - Dict: Configuration dictionary passed directly; note that
              a non-empty dict is kept by reference and gets the four
              processor keys added to it
            - ConfigComposer: Existing config_composer instance
            - None: Use default settings (an empty dict is created)
        ocr_engine: OCR engine instance (BaseOCR subclass)
            - If provided, OCR processing can be enabled in extract_text
            - Example: OpenAIOCR, AnthropicOCR, GeminiOCR, VllmOCR
        image_directory: Directory path for saving extracted images
            - Documented default: "temp/images"
        image_tag_prefix: Prefix for image tags in extracted text
            - Documented default: "[Image:"
            - Example: "<img src='" for HTML format
        image_tag_suffix: Suffix for image tags in extracted text
            - Documented default: "]"
            - Example: "'/>" for HTML format
        page_tag_prefix: Prefix for page number tags in extracted text
            - Documented default: "[Page Number: "
            - Example: "<page>" for XML format
        page_tag_suffix: Suffix for page number tags in extracted text
            - Documented default: "]"
            - Example: "</page>" for XML format
        slide_tag_prefix: Prefix for slide number tags (presentations)
            - Documented default: "[Slide Number: "
        slide_tag_suffix: Suffix for slide number tags
            - Documented default: "]"
        chart_tag_prefix: Prefix for chart tags in extracted text
            - Documented default: "[chart]"
            - Example: "<chart>" for XML format
        chart_tag_suffix: Suffix for chart tags in extracted text
            - Documented default: "[/chart]"
            - Example: "</chart>" for XML format
        metadata_tag_prefix: Opening tag for metadata section
            - Documented default: "<Document-Metadata>"
            - Example: "<metadata>" for custom format
        metadata_tag_suffix: Closing tag for metadata section
            - Documented default: "</Document-Metadata>"
            - Example: "</metadata>" for custom format
        **kwargs: Additional configuration options (stored as given)

    Example:
        >>> # Default tags: [Image:...], [Page Number: 1]
        >>> processor = DocumentProcessor()

        >>> # Custom HTML format
        >>> processor = DocumentProcessor(
        ...     image_directory="output/images",
        ...     image_tag_prefix="<img src='",
        ...     image_tag_suffix="'/>",
        ...     page_tag_prefix="<page>",
        ...     page_tag_suffix="</page>",
        ...     chart_tag_prefix="<chart>",
        ...     chart_tag_suffix="</chart>",
        ...     metadata_tag_prefix="<meta>",
        ...     metadata_tag_suffix="</meta>"
        ... )

        >>> # Markdown format
        >>> processor = DocumentProcessor(
        ...     image_tag_prefix="![image](",
        ...     image_tag_suffix=")",
        ...     page_tag_prefix="<!-- Page ",
        ...     page_tag_suffix=" -->",
        ...     chart_tag_prefix="```chart",
        ...     chart_tag_suffix="```"
        ... )
    """
    # Plain instance state; a falsy config (None, empty dict) becomes {}.
    self._config = config or {}
    self._ocr_engine = ocr_engine
    self._kwargs = kwargs
    self._supported_extensions: Optional[List[str]] = None

    # Metadata tag settings are kept on the instance as well.
    self._metadata_tag_prefix = metadata_tag_prefix
    self._metadata_tag_suffix = metadata_tag_suffix

    self._logger = logging.getLogger("xgen_doc2chunk.processor")

    # Both caches start empty (None) and are filled elsewhere in the class.
    self._library_availability: Optional[Dict[str, bool]] = None
    self._handler_registry: Optional[Dict[str, Callable]] = None

    # Per-instance helper processors, created in a fixed order.
    image_options = dict(
        directory=image_directory,
        tag_prefix=image_tag_prefix,
        tag_suffix=image_tag_suffix,
    )
    self._image_processor = self._create_image_processor(**image_options)

    page_tag_options = dict(
        page_tag_prefix=page_tag_prefix,
        page_tag_suffix=page_tag_suffix,
        slide_tag_prefix=slide_tag_prefix,
        slide_tag_suffix=slide_tag_suffix,
    )
    self._page_tag_processor = self._create_page_tag_processor(**page_tag_options)

    chart_options = dict(
        chart_tag_prefix=chart_tag_prefix,
        chart_tag_suffix=chart_tag_suffix,
    )
    self._chart_processor = self._create_chart_processor(**chart_options)

    metadata_options = dict(
        metadata_tag_prefix=metadata_tag_prefix,
        metadata_tag_suffix=metadata_tag_suffix,
    )
    self._metadata_formatter = self._create_metadata_formatter(**metadata_options)

    # Expose the processors through the config, but only for dict configs.
    if isinstance(self._config, dict):
        shared_processors = (
            ("image_processor", self._image_processor),
            ("page_tag_processor", self._page_tag_processor),
            ("chart_processor", self._chart_processor),
            ("metadata_formatter", self._metadata_formatter),
        )
        for key, processor in shared_processors:
            self._config[key] = processor
422
+
423
+ # =========================================================================
424
+ # Public Properties
425
+ # =========================================================================
426
+
427
@property
def supported_extensions(self) -> List[str]:
    """
    All supported file extensions.

    The list is built on first access via _build_supported_extensions()
    and cached on the instance; every call returns a fresh copy, so
    callers cannot modify the cached list.
    """
    cached = self._supported_extensions
    if cached is None:
        cached = self._build_supported_extensions()
        self._supported_extensions = cached
    return cached.copy()
433
+
434
@property
def config(self) -> Optional[Union[Dict[str, Any], Any]]:
    """Configuration object held by this processor (returned by reference, not copied)."""
    active_config = self._config
    return active_config
438
+
439
@property
def image_config(self) -> Dict[str, Any]:
    """
    Snapshot of the image processor settings.

    Returns:
        A new dictionary with these keys:
            - directory_path: Image save directory
            - tag_prefix: Image tag prefix
            - tag_suffix: Image tag suffix
            - naming_strategy: The ``.value`` of the configured file
              naming strategy
    """
    settings = self._image_processor.config
    snapshot = {
        field: getattr(settings, field)
        for field in ("directory_path", "tag_prefix", "tag_suffix")
    }
    # The naming strategy is reported by its value, not as the object.
    snapshot["naming_strategy"] = settings.naming_strategy.value
    return snapshot
457
+
458
@property
def image_processor(self) -> Any:
    """ImageProcessor instance owned by this DocumentProcessor."""
    processor = self._image_processor
    return processor
462
+
463
@property
def page_tag_config(self) -> Dict[str, Any]:
    """
    Snapshot of the page tag processor settings.

    Returns:
        A new dictionary with these keys:
            - tag_prefix: Page tag prefix
            - tag_suffix: Page tag suffix
            - slide_prefix: Slide tag prefix
            - slide_suffix: Slide tag suffix
            - sheet_prefix: Sheet tag prefix
            - sheet_suffix: Sheet tag suffix
    """
    settings = self._page_tag_processor.config
    exposed_fields = (
        "tag_prefix",
        "tag_suffix",
        "slide_prefix",
        "slide_suffix",
        "sheet_prefix",
        "sheet_suffix",
    )
    return {field: getattr(settings, field) for field in exposed_fields}
485
+
486
@property
def page_tag_processor(self) -> Any:
    """PageTagProcessor instance owned by this DocumentProcessor."""
    processor = self._page_tag_processor
    return processor
490
+
491
@property
def chart_tag_config(self) -> Dict[str, Any]:
    """
    Snapshot of the chart processor settings.

    Returns:
        A new dictionary with these keys:
            - tag_prefix: Chart tag prefix
            - tag_suffix: Chart tag suffix
    """
    settings = self._chart_processor.config
    return {
        field: getattr(settings, field)
        for field in ("tag_prefix", "tag_suffix")
    }
505
+
506
@property
def chart_processor(self) -> Any:
    """ChartProcessor instance owned by this DocumentProcessor."""
    processor = self._chart_processor
    return processor
510
+
511
@property
def metadata_tag_config(self) -> Dict[str, Any]:
    """
    Snapshot of the metadata formatter's tag settings.

    The values are read from the formatter object itself, not from the
    arguments originally passed to the constructor.

    Returns:
        A new dictionary with these keys:
            - metadata_tag_prefix: Opening tag for metadata section
            - metadata_tag_suffix: Closing tag for metadata section
    """
    formatter = self._metadata_formatter
    opening_tag = formatter.metadata_tag_prefix
    closing_tag = formatter.metadata_tag_suffix
    return dict(metadata_tag_prefix=opening_tag, metadata_tag_suffix=closing_tag)
525
+
526
@property
def metadata_formatter(self) -> Any:
    """MetadataFormatter instance owned by this DocumentProcessor."""
    formatter = self._metadata_formatter
    return formatter
530
+
531
@property
def ocr_engine(self) -> Optional[Any]:
    """OCR engine currently in use, or None when none is configured."""
    engine = self._ocr_engine
    return engine

@ocr_engine.setter
def ocr_engine(self, engine: Optional[Any]) -> None:
    """
    Replace the OCR engine.

    Besides storing the engine, this resets the handler registry to None.
    The registry is invalidated so that it gets rebuilt with the new
    engine, which ensures ImageFileHandler receives the updated one.
    """
    self._ocr_engine = engine
    # Drop the cached registry; it still refers to the previous engine.
    self._handler_registry = None
547
+
548
+ # =========================================================================
549
+ # Public Methods - Text Extraction
550
+ # =========================================================================
551
+
552
def extract_text(
    self,
    file_path: Union[str, Path],
    file_extension: Optional[str] = None,
    *,
    extract_metadata: bool = True,
    ocr_processing: bool = False,
    **kwargs
) -> str:
    """
    Extract the text of one file through the handler for its extension.

    The extension is normalised (lowercased, leading dots removed), the
    file is wrapped into a current_file structure, and the handler
    registered for the extension is invoked. Optionally the result is
    passed through the OCR engine afterwards.

    Args:
        file_path: File path
        file_extension: File extension (if None, taken from file_path)
        extract_metadata: Whether to extract metadata
        ocr_processing: Whether to perform OCR on image tags in the
            extracted text
            - If True and ocr_engine is set, the text is processed by the
              engine using the image processor's tag pattern
            - If True but ocr_engine is None, a warning is logged and OCR
              is skipped
        **kwargs: Additional handler-specific options, forwarded to the
            handler invocation

    Returns:
        Extracted text string

    Raises:
        FileNotFoundError: If the path does not exist
        ValueError: If the file format is not supported
    """
    source = str(file_path)

    if not os.path.exists(source):
        raise FileNotFoundError(f"File not found: {source}")

    # Without an explicit extension, fall back to the suffix of the path.
    raw_extension = (
        os.path.splitext(source)[1].lstrip('.')
        if file_extension is None
        else file_extension
    )
    ext = raw_extension.lower().lstrip('.')

    if not self.is_supported(ext):
        raise ValueError(f"Unsupported file format: {ext}")

    self._logger.info(f"Extracting text from: {source} (ext={ext})")

    # Build the current_file structure first, then look up and run the handler.
    current_file = self._create_current_file(source, ext)
    text = self._invoke_handler(
        self._get_handler(ext), current_file, ext, extract_metadata, **kwargs
    )

    if not ocr_processing:
        return text

    if self._ocr_engine is None:
        self._logger.warning("OCR processing requested but no ocr_engine is configured. Skipping OCR.")
        return text

    self._logger.info(f"Applying OCR processing with {self._ocr_engine}")
    # The engine is given the image tag pattern of this instance's ImageProcessor.
    import re
    image_pattern = re.compile(self._image_processor.get_pattern_string())
    return self._ocr_engine.process_text(text, image_pattern=image_pattern)
617
+
618
+ # =========================================================================
619
+ # Public Methods - Text Chunking
620
+ # =========================================================================
621
+
622
def chunk_text(
    self,
    text: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    file_extension: Optional[str] = None,
    preserve_tables: bool = True,
    include_position_metadata: bool = False,
) -> Union[List[str], List[Dict[str, Any]]]:
    """
    Break a text into chunks by delegating to ``create_chunks``.

    Args:
        text: Text to split
        chunk_size: Chunk size (character count)
        chunk_overlap: Overlap size between chunks
        file_extension: File extension forwarded to the chunker
            (an empty string is sent when None or empty)
        preserve_tables: When False, the chunker is called with
            force_chunking=True
        include_position_metadata: Forwarded to the chunker; when True
            the chunker is expected to return dicts instead of strings

    Returns:
        Whatever ``create_chunks`` returns, or ``[""]`` when the text is
        empty or whitespace-only
    """
    from xgen_doc2chunk.chunking.chunking import create_chunks

    # NOTE(review): this early exit returns [""] even when
    # include_position_metadata is True - confirm callers cope with that.
    if not (text and text.strip()):
        return [""]

    chunker_options = dict(
        text=text,
        file_extension=file_extension if file_extension else "",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Table protection is switched off by forcing chunking.
        force_chunking=not preserve_tables,
        include_position_metadata=include_position_metadata,
        page_tag_processor=self._page_tag_processor,
        image_processor=self._image_processor,
        chart_processor=self._chart_processor,
        metadata_formatter=self._metadata_formatter,
    )
    return create_chunks(**chunker_options)
670
+
671
def extract_chunks(
    self,
    file_path: Union[str, Path],
    file_extension: Optional[str] = None,
    *,
    extract_metadata: bool = True,
    ocr_processing: bool = False,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    preserve_tables: bool = True,
    include_position_metadata: bool = False,
    **kwargs
) -> ChunkResult:
    """
    Extract text from a file and split it into chunks in one step.

    Convenience wrapper that calls extract_text() and then chunk_text(),
    and wraps the outcome in a ChunkResult together with the source path.

    Args:
        file_path: File path
        file_extension: File extension (if None, derived from file_path).
            Case and a leading dot are ignored for both extraction and
            chunking.
        extract_metadata: Whether to extract metadata
        ocr_processing: Whether to perform OCR on image tags in extracted text
        chunk_size: Chunk size (character count)
        chunk_overlap: Overlap size between chunks
        preserve_tables: Whether to preserve table structure
        include_position_metadata: Whether to include position metadata
            - True: Each chunk includes page_number, line_start, line_end, etc.
            - False: Standard text chunks only (default)
        **kwargs: Additional handler-specific options (forwarded to
            extract_text only)

    Returns:
        ChunkResult object containing chunks with utility methods
            - .chunks: Access list of chunk strings
            - .chunks_with_metadata: Access chunks with position metadata (if enabled)
            - .has_metadata: Check if position metadata is available
            - .save_to_md(path): Save chunks as markdown files

    Raises:
        FileNotFoundError: If file cannot be found
        ValueError: If file format is not supported

    Example:
        >>> processor = DocumentProcessor()
        >>> result = processor.extract_chunks("document.pdf", chunk_size=1000)
        >>> for i, chunk in enumerate(result.chunks):
        ...     print(f"Chunk {i+1}: {len(chunk)} chars")
        >>> # Save chunks to markdown files
        >>> result.save_to_md("output/chunks")
        >>>
        >>> # With position metadata
        >>> result = processor.extract_chunks("doc.pdf", include_position_metadata=True)
        >>> if result.has_metadata:
        ...     for chunk_data in result.chunks_with_metadata:
        ...         print(f"Page {chunk_data['page_number']}: lines {chunk_data['line_start']}-{chunk_data['line_end']}")
    """
    text = self.extract_text(
        file_path=file_path,
        file_extension=file_extension,
        extract_metadata=extract_metadata,
        ocr_processing=ocr_processing,
        **kwargs
    )

    # Resolve the extension for chunking the same way extract_text()
    # resolves it for handler dispatch: lowercase, no leading dot.
    # Previously "DATA.CSV" or an explicit ".Csv" reached the chunker
    # verbatim, unlike every other extension check in this class.
    if file_extension is None:
        file_extension = os.path.splitext(str(file_path))[1]
    chunking_extension = file_extension.lower().lstrip('.')

    chunks = self.chunk_text(
        text=text,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        file_extension=chunking_extension,
        preserve_tables=preserve_tables,
        include_position_metadata=include_position_metadata
    )

    return ChunkResult(
        chunks=chunks,
        source_file=str(file_path)
    )
757
+
758
+ # =========================================================================
759
+ # Public Methods - Utilities
760
+ # =========================================================================
761
+
762
def get_file_category(self, file_extension: str) -> str:
    """
    Map a file extension to its category name.

    The extension is lowercased and stripped of leading dots, then
    checked against the class's type sets in a fixed order; the first
    set containing it decides the category.

    Args:
        file_extension: File extension, with or without a leading dot

    Returns:
        One of 'document', 'text', 'code', 'config', 'data', 'script',
        'log', 'web', 'image', or 'unknown' when no set matches
    """
    normalized = file_extension.lower().lstrip('.')

    # Order matters: an extension listed in several sets gets the
    # category of the earliest entry here.
    ordered_categories = (
        ("DOCUMENT_TYPES", 'document'),
        ("TEXT_TYPES", 'text'),
        ("CODE_TYPES", 'code'),
        ("CONFIG_TYPES", 'config'),
        ("DATA_TYPES", 'data'),
        ("SCRIPT_TYPES", 'script'),
        ("LOG_TYPES", 'log'),
        ("WEB_TYPES", 'web'),
        ("IMAGE_TYPES", 'image'),
    )
    for attribute, label in ordered_categories:
        if normalized in getattr(self, attribute):
            return label

    return 'unknown'
794
+
795
def is_supported(self, file_extension: str) -> bool:
    """
    Tell whether this processor lists the given extension as supported.

    Args:
        file_extension: File extension; case and leading dots are ignored

    Returns:
        True when the normalized extension is in ``supported_extensions``
    """
    return file_extension.lower().lstrip('.') in self.supported_extensions
807
+
808
@staticmethod
def clean_text(text: str) -> str:
    """
    Clean text by delegating to ``core.functions.utils.clean_text``.

    Args:
        text: Text to clean

    Returns:
        The value returned by the utility function
    """
    # Imported on use, mirroring the other lazy imports in this class.
    from xgen_doc2chunk.core.functions.utils import clean_text as text_cleaner
    cleaned = text_cleaner(text)
    return cleaned
821
+
822
@staticmethod
def clean_code_text(text: str) -> str:
    """
    Clean code text by delegating to ``core.functions.utils.clean_code_text``.

    Args:
        text: Code text to clean

    Returns:
        The value returned by the utility function
    """
    # Imported on use, mirroring the other lazy imports in this class.
    from xgen_doc2chunk.core.functions.utils import clean_code_text as code_cleaner
    cleaned = code_cleaner(text)
    return cleaned
835
+
836
+ # =========================================================================
837
+ # Private Methods
838
+ # =========================================================================
839
+
840
def _create_image_processor(
    self,
    directory: Optional[str] = None,
    tag_prefix: Optional[str] = None,
    tag_suffix: Optional[str] = None
) -> Any:
    """
    Build the ImageProcessor used by this DocumentProcessor.

    Delegates to ``create_image_processor`` from the img_processor
    module, forwarding all three arguments (None values included).

    Args:
        directory: Image save directory (sent as ``directory_path``)
        tag_prefix: Image tag prefix
        tag_suffix: Image tag suffix

    Returns:
        The object produced by ``create_image_processor``
    """
    from xgen_doc2chunk.core.functions.img_processor import create_image_processor

    factory_args = {
        "directory_path": directory,
        "tag_prefix": tag_prefix,
        "tag_suffix": tag_suffix,
    }
    return create_image_processor(**factory_args)
867
+
868
def _create_page_tag_processor(
    self,
    page_tag_prefix: Optional[str] = None,
    page_tag_suffix: Optional[str] = None,
    slide_tag_prefix: Optional[str] = None,
    slide_tag_suffix: Optional[str] = None
) -> Any:
    """
    Build the PageTagProcessor used by this DocumentProcessor.

    All four arguments are forwarded to the PageTagProcessor constructor
    (None values included); defaults for None are decided there.

    Args:
        page_tag_prefix: Page tag prefix (sent as ``tag_prefix``)
        page_tag_suffix: Page tag suffix (sent as ``tag_suffix``)
        slide_tag_prefix: Slide tag prefix (sent as ``slide_prefix``)
        slide_tag_suffix: Slide tag suffix (sent as ``slide_suffix``)

    Returns:
        PageTagProcessor instance
    """
    from xgen_doc2chunk.core.functions.page_tag_processor import PageTagProcessor

    constructor_args = {
        "tag_prefix": page_tag_prefix,
        "tag_suffix": page_tag_suffix,
        "slide_prefix": slide_tag_prefix,
        "slide_suffix": slide_tag_suffix,
    }
    return PageTagProcessor(**constructor_args)
898
+
899
def _create_chart_processor(
    self,
    chart_tag_prefix: Optional[str] = None,
    chart_tag_suffix: Optional[str] = None
) -> Any:
    """
    Build the ChartProcessor used by this DocumentProcessor.

    Both arguments are forwarded to the ChartProcessor constructor
    (None values included); defaults for None are decided there.

    Args:
        chart_tag_prefix: Chart tag prefix (sent as ``tag_prefix``)
        chart_tag_suffix: Chart tag suffix (sent as ``tag_suffix``)

    Returns:
        ChartProcessor instance
    """
    from xgen_doc2chunk.core.functions.chart_processor import ChartProcessor

    processor = ChartProcessor(tag_prefix=chart_tag_prefix, tag_suffix=chart_tag_suffix)
    return processor
923
+
924
def _create_metadata_formatter(
    self,
    metadata_tag_prefix: Optional[str] = None,
    metadata_tag_suffix: Optional[str] = None
) -> Any:
    """
    Build the MetadataFormatter used by this DocumentProcessor.

    Only arguments that are not None are passed to the constructor, so
    the formatter's own defaults apply to anything left unset.

    Args:
        metadata_tag_prefix: Opening tag override
        metadata_tag_suffix: Closing tag override

    Returns:
        MetadataFormatter instance
    """
    from xgen_doc2chunk.core.functions.metadata_extractor import MetadataFormatter

    candidates = (
        ("metadata_tag_prefix", metadata_tag_prefix),
        ("metadata_tag_suffix", metadata_tag_suffix),
    )
    overrides = {name: value for name, value in candidates if value is not None}
    return MetadataFormatter(**overrides)
951
+
952
def _build_supported_extensions(self) -> List[str]:
    """Return every extension from all type sets, de-duplicated and sorted."""
    type_groups = (
        self.DOCUMENT_TYPES,
        self.TEXT_TYPES,
        self.CODE_TYPES,
        self.CONFIG_TYPES,
        self.DATA_TYPES,
        self.SCRIPT_TYPES,
        self.LOG_TYPES,
        self.WEB_TYPES,
        self.IMAGE_TYPES,
    )
    combined = set()
    for group in type_groups:
        combined.update(group)
    return sorted(combined)
967
+
968
def _get_handler_registry(self) -> Dict[str, Callable]:
    """Build (once) and return the extension -> handler callable registry.

    Each registry value is the bound ``extract_text`` method of a handler
    instance. Every handler is constructed with this processor's config,
    image processor, page tag processor and chart processor; the image
    file handler additionally receives the OCR engine.

    A handler whose module (or class) cannot be imported is skipped with a
    warning, and the remaining handlers are still registered.

    The registry is cached on ``self._handler_registry`` only after the
    whole build has finished. If anything other than ImportError is raised
    part-way through, nothing is cached and the next call starts over,
    instead of silently serving a half-populated registry.

    Returns:
        Mapping of lowercase extension (without dot) to handler callable
    """
    if self._handler_registry is not None:
        return self._handler_registry

    # Local import: the table below names modules as strings so that all
    # handlers share one import/construct/register code path.
    from importlib import import_module

    shared_kwargs = {
        "config": self._config,
        "image_processor": self._image_processor,
        "page_tag_processor": self._page_tag_processor,
        "chart_processor": self._chart_processor,
    }

    # Everything the plain-text handler is responsible for.
    text_extensions = (
        self.TEXT_TYPES |
        self.CODE_TYPES |
        self.CONFIG_TYPES |
        self.SCRIPT_TYPES |
        self.LOG_TYPES |
        self.WEB_TYPES
    )

    package = "xgen_doc2chunk.core.processor"
    # (log label, module, class, extensions, extra constructor kwargs).
    # Order is significant: a later entry overwrites an earlier one when
    # both claim the same extension.
    handler_specs = (
        ("PDF", "pdf_handler", "PDFHandler", ("pdf",), {}),
        ("DOCX", "docx_handler", "DOCXHandler", ("docx",), {}),
        ("DOC", "doc_handler", "DOCHandler", ("doc",), {}),
        ("RTF", "rtf_handler", "RTFHandler", ("rtf",), {}),
        ("PPT", "ppt_handler", "PPTHandler", ("ppt", "pptx"), {}),
        ("Excel", "excel_handler", "ExcelHandler", ("xlsx", "xls"), {}),
        ("CSV", "csv_handler", "CSVHandler", ("csv", "tsv"), {}),
        ("HWP", "hwp_handler", "HWPHandler", ("hwp",), {}),
        ("HWPX", "hwpx_handler", "HWPXHandler", ("hwpx",), {}),
        ("Text", "text_handler", "TextHandler", text_extensions, {}),
        # Standalone image files need the OCR engine for text extraction.
        (
            "Image file",
            "image_file_handler",
            "ImageFileHandler",
            self.IMAGE_TYPES,
            {"ocr_engine": self._ocr_engine},
        ),
    )

    registry: Dict[str, Callable] = {}
    for label, module_name, class_name, extensions, extra_kwargs in handler_specs:
        try:
            module = import_module(f"{package}.{module_name}")
            try:
                handler_cls = getattr(module, class_name)
            except AttributeError as exc:
                # Same failure category as "from module import Name".
                raise ImportError(
                    f"cannot import name {class_name!r} from {module.__name__!r}"
                ) from exc
            handler = handler_cls(**shared_kwargs, **extra_kwargs)
            extract = handler.extract_text
        except ImportError as e:
            self._logger.warning(f"{label} handler not available: {e}")
        else:
            for ext in extensions:
                registry[ext] = extract

    # Publish only a completely built registry.
    self._handler_registry = registry
    return registry
1137
+
1138
def _create_current_file(self, file_path: str, ext: str) -> CurrentFile:
    """
    Load a file into memory and describe it as a CurrentFile dict.

    The file is opened in binary mode and read in full; handlers then
    work from the bytes or the seekable stream rather than the path
    (the original note cites path-encoding problems, e.g. Korean
    characters in Windows paths, as the motivation).

    Args:
        file_path: Path to the file; it is made absolute here
        ext: File extension (lowercase, without dot), stored as given

    Returns:
        Plain dict with keys file_path, file_name, file_extension,
        file_data (bytes), file_stream (BytesIO over the same bytes)
        and file_size (byte count)

    Raises:
        IOError: If file cannot be read
    """
    absolute_path = os.path.abspath(file_path)

    with open(absolute_path, 'rb') as source:
        payload = source.read()

    # A plain dict is returned; the TypedDict only serves type checking.
    current_file = dict(
        file_path=absolute_path,
        file_name=os.path.basename(absolute_path),
        file_extension=ext,
        file_data=payload,
        # Separate seekable view for handlers that want a stream.
        file_stream=io.BytesIO(payload),
        file_size=len(payload),
    )
    return current_file
1174
+
1175
def _get_handler(self, ext: str) -> Optional[Callable]:
    """Look up the handler callable for an extension; None when unregistered."""
    return self._get_handler_registry().get(ext)
1179
+
1180
def _invoke_handler(
    self,
    handler: Optional[Callable],
    current_file: CurrentFile,
    ext: str,
    extract_metadata: bool,
    **kwargs
) -> str:
    """
    Call a handler with the arguments appropriate for the extension.

    Extensions belonging to the text, code, config, script, log or web
    sets are called with two extra keyword arguments: ``file_type``
    (the extension) and ``is_code`` (whether it is in CODE_TYPES).
    Everything else is called with only ``extract_metadata`` and the
    caller's extra options.

    Args:
        handler: Handler callable, or None when none is registered
        current_file: CurrentFile dict containing file info and binary data
        ext: File extension
        extract_metadata: Whether to extract metadata
        **kwargs: Additional options forwarded to the handler

    Returns:
        Extracted text

    Raises:
        ValueError: If handler is None
    """
    if handler is None:
        raise ValueError(f"No handler available for extension: {ext}")

    text_like_groups = (
        self.TEXT_TYPES,
        self.CODE_TYPES,
        self.CONFIG_TYPES,
        self.SCRIPT_TYPES,
        self.LOG_TYPES,
        self.WEB_TYPES,
    )
    is_text_like = any(ext in group for group in text_like_groups)

    if not is_text_like:
        # Standard signature shared by all non-text handlers.
        return handler(current_file, extract_metadata=extract_metadata, **kwargs)

    return handler(
        current_file,
        extract_metadata=extract_metadata,
        file_type=ext,
        is_code=ext in self.CODE_TYPES,
        **kwargs
    )
1225
+
1226
+ # =========================================================================
1227
+ # Context Manager Support
1228
+ # =========================================================================
1229
+
1230
def __enter__(self) -> "DocumentProcessor":
    """Enter a ``with`` block; the processor itself is the managed object."""
    processor = self
    return processor
1233
+
1234
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
    """Leave a ``with`` block; nothing is released and exceptions propagate."""
    # No cleanup is performed today; add resource release here if needed.
    return None
1238
+
1239
+ # =========================================================================
1240
+ # String Representation
1241
+ # =========================================================================
1242
+
1243
def __repr__(self) -> str:
    """Debug representation showing how many extensions are supported."""
    extension_count = len(self.supported_extensions)
    return f"DocumentProcessor(supported_extensions={extension_count})"
1245
+
1246
def __str__(self) -> str:
    """Human-readable summary stating the number of supported formats."""
    format_count = len(self.supported_extensions)
    return f"xgen_doc2chunk DocumentProcessor ({format_count} supported formats)"
1248
+
1249
+
1250
+ # === Module-level Convenience Functions ===
1251
+
1252
def create_processor(
    config: Optional[Union[Dict[str, Any], Any]] = None,
    ocr_engine: Optional[Any] = None,
    *,
    image_directory: Optional[str] = None,
    image_tag_prefix: Optional[str] = None,
    image_tag_suffix: Optional[str] = None,
    **kwargs
) -> DocumentProcessor:
    """
    Construct a DocumentProcessor; every argument is passed straight through.

    Args:
        config: Configuration dictionary or ConfigComposer instance
        ocr_engine: OCR engine instance (BaseOCR subclass)
        image_directory: Directory path for saving extracted images
        image_tag_prefix: Prefix for image tags
        image_tag_suffix: Suffix for image tags
        **kwargs: Additional options forwarded to the DocumentProcessor
            constructor

    Returns:
        DocumentProcessor instance

    Example:
        >>> processor = create_processor()
        >>> processor = create_processor(config={"vision_model": "gpt-4-vision"})

        # With OCR engine
        >>> from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR
        >>> ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
        >>> processor = create_processor(ocr_engine=ocr)

        # With custom image tags (HTML format)
        >>> processor = create_processor(
        ...     image_directory="output/images",
        ...     image_tag_prefix="<img src='",
        ...     image_tag_suffix="'/>"
        ... )
    """
    image_options = {
        "image_directory": image_directory,
        "image_tag_prefix": image_tag_prefix,
        "image_tag_suffix": image_tag_suffix,
    }
    return DocumentProcessor(
        config=config,
        ocr_engine=ocr_engine,
        **image_options,
        **kwargs
    )
1299
+
1300
+
1301
+ __all__ = [
1302
+ "DocumentProcessor",
1303
+ "CurrentFile",
1304
+ "ChunkResult",
1305
+ "create_processor",
1306
+ ]
1307
+