xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
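
The bulk of this release is the new `DocumentProcessor` module shown in the diff below. As a quick orientation, here is a minimal usage sketch pieced together from the docstrings in that module; the document path, output path, and API key are placeholders, and the OCR engine is optional.

```python
from xgen_doc2chunk.core.document_processor import DocumentProcessor
from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR

# Optional OCR engine; omit it (and ocr_processing) to skip OCR entirely.
ocr_engine = OpenAIOCR(api_key="sk-...", model="gpt-4o")
processor = DocumentProcessor(ocr_engine=ocr_engine)

# One step: extract text from a file and split it into overlapping chunks.
result = processor.extract_chunks("document.pdf", chunk_size=1000, chunk_overlap=200)
print(len(result.chunks))

# Write all chunks to a single markdown file with separators.
result.save_to_md("output/chunks.md")
```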
@@ -0,0 +1,1307 @@
# xgen_doc2chunk/core/document_processor.py
"""DocumentProcessor - Document Processing Class

Main document processing class for the xgen_doc2chunk library.
Provides a unified interface for extracting text from various document formats
(PDF, DOCX, PPT, Excel, HWP, etc.) and performing text chunking.

This class is the recommended entry point when using the library.

Usage Example:
    from xgen_doc2chunk.core.document_processor import DocumentProcessor
    from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR

    # Create instance (with optional OCR engine)
    ocr_engine = OpenAIOCR(api_key="sk-...", model="gpt-4o")
    processor = DocumentProcessor(ocr_engine=ocr_engine)

    # Extract text from file
    text = processor.extract_text(file_path, file_extension)

    # Extract text with OCR processing
    text = processor.extract_text(file_path, file_extension, ocr_processing=True)

    # Chunk text
    chunks = processor.chunk_text(text, chunk_size=1000)
"""

import io
import logging
import os
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union, TypedDict

logger = logging.getLogger("xgen_doc2chunk")


class CurrentFile(TypedDict, total=False):
    """
    TypedDict containing file information.

    Standard structure for reading files at binary level and passing to handlers.
    Resolves file system issues such as non-ASCII (Korean, etc.) paths.

    Attributes:
        file_path: Absolute path of the original file
        file_name: File name (including extension)
        file_extension: File extension (lowercase, without dot)
        file_data: Binary data of the file
        file_stream: BytesIO stream (reusable)
        file_size: File size in bytes
    """
    file_path: str
    file_name: str
    file_extension: str
    file_data: bytes
    file_stream: io.BytesIO
    file_size: int

class ChunkResult:
    """
    Container class for extracted text chunks.

    Provides convenient access to chunks and utility methods for saving.
    Supports both simple text chunks and chunks with position metadata.

    Attributes:
        chunks: List of text chunks
        chunks_with_metadata: List of chunk dictionaries with position metadata
        source_file: Original source file path (if available)
        has_metadata: Whether position metadata is available

    Example:
        >>> result = processor.extract_chunks("document.pdf")
        >>> print(len(result.chunks))
        >>> result.save_to_md("output/chunks")
        >>>
        >>> # Access position metadata (if available)
        >>> if result.has_metadata:
        ...     for chunk_data in result.chunks_with_metadata:
        ...         print(f"Page {chunk_data['page_number']}: {chunk_data['text'][:50]}")
    """

    def __init__(
        self,
        chunks: Union[List[str], List[Dict[str, Any]]],
        source_file: Optional[str] = None
    ):
        """
        Initialize ChunkResult.

        Args:
            chunks: List of text chunks or list of chunk dictionaries with metadata
            source_file: Original source file path
        """
        self._source_file = source_file

        # Detect if chunks contain metadata (list of dicts with 'text' key)
        if chunks and isinstance(chunks[0], dict) and 'text' in chunks[0]:
            self._chunks_with_metadata = chunks
            self._chunks = [c['text'] for c in chunks]
            self._has_metadata = True
        else:
            self._chunks = chunks if chunks else []
            self._chunks_with_metadata = None
            self._has_metadata = False

    @property
    def chunks(self) -> List[str]:
        """Return list of text chunks."""
        return self._chunks

    @property
    def chunks_with_metadata(self) -> Optional[List[Dict[str, Any]]]:
        """
        Return list of chunks with position metadata.

        Each chunk dictionary contains:
        - text: Chunk text content
        - page_number: Page number where chunk starts
        - line_start: Starting line number
        - line_end: Ending line number
        - global_start: Global character start position
        - global_end: Global character end position
        - chunk_index: Index of this chunk

        Returns:
            List of chunk dictionaries if metadata available, None otherwise
        """
        return self._chunks_with_metadata

    @property
    def has_metadata(self) -> bool:
        """Return whether position metadata is available."""
        return self._has_metadata

    @property
    def source_file(self) -> Optional[str]:
        """Return original source file path."""
        return self._source_file

    def save_to_md(
        self,
        path: Optional[Union[str, Path]] = None,
        *,
        filename: str = "chunks.md",
        separator: str = "---",
        include_metadata: bool = True
    ) -> str:
        """
        Save all chunks to a single markdown file with separators.

        Args:
            path: File path or directory to save (default: current directory)
                - If path ends with .md, uses it as the file path
                - Otherwise, treats as directory and uses filename parameter
            filename: Filename to use when path is a directory (default: "chunks.md")
            separator: Separator string between chunks (default: "---")
            include_metadata: Whether to include metadata header

        Returns:
            Saved file path

        Example:
            >>> result = processor.extract_chunks("document.pdf")
            >>> saved_path = result.save_to_md()
            >>> # Creates: ./chunks.md

            >>> result.save_to_md("output/my_chunks.md")
            >>> # Creates: output/my_chunks.md

            >>> result.save_to_md("output/", filename="document_chunks.md")
            >>> # Creates: output/document_chunks.md
        """
        # Determine save path
        if path is None:
            file_path = Path.cwd() / filename
        else:
            path = Path(path)
            if path.suffix.lower() == ".md":
                file_path = path
            else:
                # Treat as directory
                path.mkdir(parents=True, exist_ok=True)
                file_path = path / filename

        # Ensure parent directory exists
        file_path.parent.mkdir(parents=True, exist_ok=True)

        # Handle duplicate filename
        if file_path.exists():
            base = file_path.stem
            suffix = file_path.suffix
            parent = file_path.parent
            counter = 1
            while file_path.exists():
                file_path = parent / f"{base}_{counter}{suffix}"
                counter += 1

        total_chunks = len(self._chunks)
        content_parts = []

        # Add metadata header
        if include_metadata:
            content_parts.append("---")
            content_parts.append(f"total_chunks: {total_chunks}")
            if self._source_file:
                content_parts.append(f"source_file: {self._source_file}")
            content_parts.append("---")
            content_parts.append("")

        # Add each chunk with separator
        for idx, chunk in enumerate(self._chunks, start=1):
            content_parts.append(f"## Chunk {idx}/{total_chunks}")
            content_parts.append("")
            content_parts.append(chunk)
            content_parts.append("")

            # Add separator between chunks (not after the last one)
            if idx < total_chunks:
                content_parts.append(separator)
                content_parts.append("")

        # Write file (handle surrogate characters)
        content = "\n".join(content_parts)
        # Remove surrogate characters that can't be encoded in UTF-8
        content = content.encode('utf-8', errors='surrogatepass').decode('utf-8', errors='replace')
        file_path.write_text(content, encoding="utf-8")

        logger.info(f"Saved {total_chunks} chunks to {file_path}")
        return str(file_path)

    def __len__(self) -> int:
        """Return number of chunks."""
        return len(self._chunks)

    def __iter__(self):
        """Iterate over chunks."""
        return iter(self._chunks)

    def __getitem__(self, index: int) -> str:
        """Get chunk by index."""
        return self._chunks[index]

    def __repr__(self) -> str:
        return f"ChunkResult(chunks={len(self._chunks)}, source_file={self._source_file!r})"

    def __str__(self) -> str:
        return f"ChunkResult with {len(self._chunks)} chunks"

class DocumentProcessor:
    """
    xgen_doc2chunk Main Document Processing Class

    A unified interface for processing various document formats and extracting text.

    Attributes:
        config: Configuration dictionary or ConfigComposer instance
        supported_extensions: List of supported file extensions

    Example:
        >>> processor = DocumentProcessor()
        >>> text = processor.extract_text("document.pdf", "pdf")
        >>> chunks = processor.chunk_text(text, chunk_size=1000)
    """

    # === Supported File Type Classifications ===
    DOCUMENT_TYPES = frozenset(['pdf', 'docx', 'doc', 'rtf', 'pptx', 'ppt', 'hwp', 'hwpx'])
    TEXT_TYPES = frozenset(['txt', 'md', 'markdown'])
    CODE_TYPES = frozenset([
        'py', 'js', 'ts', 'java', 'cpp', 'c', 'h', 'cs', 'go', 'rs',
        'php', 'rb', 'swift', 'kt', 'scala', 'dart', 'r', 'sql',
        'html', 'css', 'jsx', 'tsx', 'vue', 'svelte'
    ])
    CONFIG_TYPES = frozenset(['json', 'yaml', 'yml', 'xml', 'toml', 'ini', 'cfg', 'conf', 'properties', 'env'])
    DATA_TYPES = frozenset(['csv', 'tsv', 'xlsx', 'xls'])
    SCRIPT_TYPES = frozenset(['sh', 'bat', 'ps1', 'zsh', 'fish'])
    LOG_TYPES = frozenset(['log'])
    WEB_TYPES = frozenset(['htm', 'xhtml'])
    IMAGE_TYPES = frozenset(['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp'])

    def __init__(
        self,
        config: Optional[Union[Dict[str, Any], Any]] = None,
        ocr_engine: Optional[Any] = None,
        *,
        image_directory: Optional[str] = None,
        image_tag_prefix: Optional[str] = None,
        image_tag_suffix: Optional[str] = None,
        page_tag_prefix: Optional[str] = None,
        page_tag_suffix: Optional[str] = None,
        slide_tag_prefix: Optional[str] = None,
        slide_tag_suffix: Optional[str] = None,
        chart_tag_prefix: Optional[str] = None,
        chart_tag_suffix: Optional[str] = None,
        metadata_tag_prefix: Optional[str] = None,
        metadata_tag_suffix: Optional[str] = None,
        **kwargs
    ):
        """
        Initialize DocumentProcessor.

        Args:
            config: Configuration dictionary or ConfigComposer instance
                - Dict: Pass configuration dictionary directly
                - ConfigComposer: Existing config_composer instance
                - None: Use default settings
            ocr_engine: OCR engine instance (BaseOCR subclass)
                - If provided, OCR processing can be enabled in extract_text
                - Example: OpenAIOCR, AnthropicOCR, GeminiOCR, VllmOCR
            image_directory: Directory path for saving extracted images
                - Default: "temp/images"
            image_tag_prefix: Prefix for image tags in extracted text
                - Default: "[Image:"
                - Example: "<img src='" for HTML format
            image_tag_suffix: Suffix for image tags in extracted text
                - Default: "]"
                - Example: "'/>" for HTML format
            page_tag_prefix: Prefix for page number tags in extracted text
                - Default: "[Page Number: "
                - Example: "<page>" for XML format
            page_tag_suffix: Suffix for page number tags in extracted text
                - Default: "]"
                - Example: "</page>" for XML format
            slide_tag_prefix: Prefix for slide number tags (presentations)
                - Default: "[Slide Number: "
            slide_tag_suffix: Suffix for slide number tags
                - Default: "]"
            chart_tag_prefix: Prefix for chart tags in extracted text
                - Default: "[chart]"
                - Example: "<chart>" for XML format
            chart_tag_suffix: Suffix for chart tags in extracted text
                - Default: "[/chart]"
                - Example: "</chart>" for XML format
            metadata_tag_prefix: Opening tag for metadata section
                - Default: "<Document-Metadata>"
                - Example: "<metadata>" for custom format
            metadata_tag_suffix: Closing tag for metadata section
                - Default: "</Document-Metadata>"
                - Example: "</metadata>" for custom format
            **kwargs: Additional configuration options

        Example:
            >>> # Default tags: [Image:...], [Page Number: 1]
            >>> processor = DocumentProcessor()

            >>> # Custom HTML format
            >>> processor = DocumentProcessor(
            ...     image_directory="output/images",
            ...     image_tag_prefix="<img src='",
            ...     image_tag_suffix="'/>",
            ...     page_tag_prefix="<page>",
            ...     page_tag_suffix="</page>",
            ...     chart_tag_prefix="<chart>",
            ...     chart_tag_suffix="</chart>",
            ...     metadata_tag_prefix="<meta>",
            ...     metadata_tag_suffix="</meta>"
            ... )

            >>> # Markdown format
            >>> processor = DocumentProcessor(
            ...     image_tag_prefix="",
            ...     page_tag_prefix="<!-- Page ",
            ...     page_tag_suffix=" -->",
            ...     chart_tag_prefix="```chart",
            ...     chart_tag_suffix="```"
            ... )
        """
        self._config = config or {}
        self._ocr_engine = ocr_engine
        self._kwargs = kwargs
        self._supported_extensions: Optional[List[str]] = None

        # Store metadata tag settings
        self._metadata_tag_prefix = metadata_tag_prefix
        self._metadata_tag_suffix = metadata_tag_suffix

        # Logger setup
        self._logger = logging.getLogger("xgen_doc2chunk.processor")

        # Cache for library availability check results
        self._library_availability: Optional[Dict[str, bool]] = None

        # Handler registry
        self._handler_registry: Optional[Dict[str, Callable]] = None

        # Create instance-specific ImageProcessor
        self._image_processor = self._create_image_processor(
            directory=image_directory,
            tag_prefix=image_tag_prefix,
            tag_suffix=image_tag_suffix
        )

        # Create instance-specific PageTagProcessor
        self._page_tag_processor = self._create_page_tag_processor(
            page_tag_prefix=page_tag_prefix,
            page_tag_suffix=page_tag_suffix,
            slide_tag_prefix=slide_tag_prefix,
            slide_tag_suffix=slide_tag_suffix
        )

        # Create instance-specific ChartProcessor
        self._chart_processor = self._create_chart_processor(
            chart_tag_prefix=chart_tag_prefix,
            chart_tag_suffix=chart_tag_suffix
        )

        # Create instance-specific MetadataFormatter
        self._metadata_formatter = self._create_metadata_formatter(
            metadata_tag_prefix=metadata_tag_prefix,
            metadata_tag_suffix=metadata_tag_suffix
        )

        # Add processors to config for handlers to access
        if isinstance(self._config, dict):
            self._config["image_processor"] = self._image_processor
            self._config["page_tag_processor"] = self._page_tag_processor
            self._config["chart_processor"] = self._chart_processor
            self._config["metadata_formatter"] = self._metadata_formatter

    # =========================================================================
    # Public Properties
    # =========================================================================

    @property
    def supported_extensions(self) -> List[str]:
        """List of all supported file extensions."""
        if self._supported_extensions is None:
            self._supported_extensions = self._build_supported_extensions()
        return self._supported_extensions.copy()

    @property
    def config(self) -> Optional[Union[Dict[str, Any], Any]]:
        """Current configuration."""
        return self._config

    @property
    def image_config(self) -> Dict[str, Any]:
        """
        Current image processor configuration.

        Returns:
            Dictionary containing:
            - directory_path: Image save directory
            - tag_prefix: Image tag prefix
            - tag_suffix: Image tag suffix
            - naming_strategy: File naming strategy
        """
        return {
            "directory_path": self._image_processor.config.directory_path,
            "tag_prefix": self._image_processor.config.tag_prefix,
            "tag_suffix": self._image_processor.config.tag_suffix,
            "naming_strategy": self._image_processor.config.naming_strategy.value,
        }

    @property
    def image_processor(self) -> Any:
        """Current ImageProcessor instance for this DocumentProcessor."""
        return self._image_processor

    @property
    def page_tag_config(self) -> Dict[str, Any]:
        """
        Current page tag processor configuration.

        Returns:
            Dictionary containing:
            - tag_prefix: Page tag prefix
            - tag_suffix: Page tag suffix
            - slide_prefix: Slide tag prefix
            - slide_suffix: Slide tag suffix
            - sheet_prefix: Sheet tag prefix
            - sheet_suffix: Sheet tag suffix
        """
        return {
            "tag_prefix": self._page_tag_processor.config.tag_prefix,
            "tag_suffix": self._page_tag_processor.config.tag_suffix,
            "slide_prefix": self._page_tag_processor.config.slide_prefix,
            "slide_suffix": self._page_tag_processor.config.slide_suffix,
            "sheet_prefix": self._page_tag_processor.config.sheet_prefix,
            "sheet_suffix": self._page_tag_processor.config.sheet_suffix,
        }

    @property
    def page_tag_processor(self) -> Any:
        """Current PageTagProcessor instance for this DocumentProcessor."""
        return self._page_tag_processor

    @property
    def chart_tag_config(self) -> Dict[str, Any]:
        """
        Current chart processor configuration.

        Returns:
            Dictionary containing:
            - tag_prefix: Chart tag prefix
            - tag_suffix: Chart tag suffix
        """
        return {
            "tag_prefix": self._chart_processor.config.tag_prefix,
            "tag_suffix": self._chart_processor.config.tag_suffix,
        }

    @property
    def chart_processor(self) -> Any:
        """Current ChartProcessor instance for this DocumentProcessor."""
        return self._chart_processor

    @property
    def metadata_tag_config(self) -> Dict[str, Any]:
        """
        Current metadata formatter configuration.

        Returns:
            Dictionary containing:
            - metadata_tag_prefix: Opening tag for metadata section
            - metadata_tag_suffix: Closing tag for metadata section
        """
        return {
            "metadata_tag_prefix": self._metadata_formatter.metadata_tag_prefix,
            "metadata_tag_suffix": self._metadata_formatter.metadata_tag_suffix,
        }

    @property
    def metadata_formatter(self) -> Any:
        """Current MetadataFormatter instance for this DocumentProcessor."""
        return self._metadata_formatter

    @property
    def ocr_engine(self) -> Optional[Any]:
        """Current OCR engine instance."""
        return self._ocr_engine

    @ocr_engine.setter
    def ocr_engine(self, engine: Optional[Any]) -> None:
        """
        Set OCR engine instance.

        When OCR engine is changed, the handler registry is invalidated
        to ensure ImageFileHandler gets the updated OCR engine.
        """
        self._ocr_engine = engine
        # Invalidate handler registry so it gets rebuilt with new OCR engine
        self._handler_registry = None

    # =========================================================================
    # Public Methods - Text Extraction
    # =========================================================================

    def extract_text(
        self,
        file_path: Union[str, Path],
        file_extension: Optional[str] = None,
        *,
        extract_metadata: bool = True,
        ocr_processing: bool = False,
        **kwargs
    ) -> str:
        """
        Extract text from a file.

        Args:
            file_path: File path
            file_extension: File extension (if None, auto-extracted from file_path)
            extract_metadata: Whether to extract metadata
            ocr_processing: Whether to perform OCR on image tags in extracted text
                - If True and ocr_engine is set, processes [Image:...] tags
                - If True but ocr_engine is None, skips OCR processing
            **kwargs: Additional handler-specific options

        Returns:
            Extracted text string

        Raises:
            FileNotFoundError: If file cannot be found
            ValueError: If file format is not supported
        """
        # Convert to string path
        file_path_str = str(file_path)

        # Check file existence
        if not os.path.exists(file_path_str):
            raise FileNotFoundError(f"File not found: {file_path_str}")

        # Extract extension if not provided
        if file_extension is None:
            file_extension = os.path.splitext(file_path_str)[1].lstrip('.')

        ext = file_extension.lower().lstrip('.')

        # Check if extension is supported
        if not self.is_supported(ext):
            raise ValueError(f"Unsupported file format: {ext}")

        self._logger.info(f"Extracting text from: {file_path_str} (ext={ext})")

        # Create current_file dict with binary data
        current_file = self._create_current_file(file_path_str, ext)

        # Get handler and extract text
        handler = self._get_handler(ext)
        text = self._invoke_handler(handler, current_file, ext, extract_metadata, **kwargs)

        # Apply OCR processing if enabled and ocr_engine is available
        if ocr_processing and self._ocr_engine is not None:
            self._logger.info(f"Applying OCR processing with {self._ocr_engine}")
            # Get image pattern from ImageProcessor to pass to OCR engine
            import re
            image_pattern = re.compile(self._image_processor.get_pattern_string())
            text = self._ocr_engine.process_text(text, image_pattern=image_pattern)
        elif ocr_processing and self._ocr_engine is None:
            self._logger.warning("OCR processing requested but no ocr_engine is configured. Skipping OCR.")

        return text

    # =========================================================================
    # Public Methods - Text Chunking
    # =========================================================================

    def chunk_text(
        self,
        text: str,
        *,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        file_extension: Optional[str] = None,
        preserve_tables: bool = True,
        include_position_metadata: bool = False,
    ) -> Union[List[str], List[Dict[str, Any]]]:
        """
        Split text into chunks.

        Args:
            text: Text to split
            chunk_size: Chunk size (character count)
            chunk_overlap: Overlap size between chunks
            file_extension: File extension (used for table-based file processing)
            preserve_tables: Whether to preserve table structure
            include_position_metadata: Whether to include position metadata
                - True: Returns list of dicts with text, page_number, line_start, etc.
                - False: Returns list of text strings (default)

        Returns:
            List of chunk strings or list of chunk dictionaries with metadata
        """
        from xgen_doc2chunk.chunking.chunking import create_chunks

        if not text or not text.strip():
            return [""]

        # Use force_chunking to disable table protection if preserve_tables is False
        force_chunking = not preserve_tables

        result = create_chunks(
            text=text,
            file_extension=file_extension or "",
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            force_chunking=force_chunking,
            include_position_metadata=include_position_metadata,
            page_tag_processor=self._page_tag_processor,
            image_processor=self._image_processor,
            chart_processor=self._chart_processor,
            metadata_formatter=self._metadata_formatter
        )

        return result

    def extract_chunks(
        self,
        file_path: Union[str, Path],
        file_extension: Optional[str] = None,
        *,
        extract_metadata: bool = True,
        ocr_processing: bool = False,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        preserve_tables: bool = True,
        include_position_metadata: bool = False,
        **kwargs
    ) -> ChunkResult:
        """
        Extract text from a file and split into chunks in one step.

        This is a convenience method that combines extract_text() and chunk_text().
        Returns a ChunkResult object that provides convenient access to chunks
        and utility methods for saving.

        Args:
            file_path: File path
            file_extension: File extension (if None, auto-extracted from file_path)
            extract_metadata: Whether to extract metadata
            ocr_processing: Whether to perform OCR on image tags in extracted text
            chunk_size: Chunk size (character count)
            chunk_overlap: Overlap size between chunks
            preserve_tables: Whether to preserve table structure
            include_position_metadata: Whether to include position metadata
                - True: Each chunk includes page_number, line_start, line_end, etc.
                - False: Standard text chunks only (default)
            **kwargs: Additional handler-specific options

        Returns:
            ChunkResult object containing chunks with utility methods
            - .chunks: Access list of chunk strings
            - .chunks_with_metadata: Access chunks with position metadata (if enabled)
            - .has_metadata: Check if position metadata is available
            - .save_to_md(path): Save chunks as markdown files

        Raises:
            FileNotFoundError: If file cannot be found
            ValueError: If file format is not supported

        Example:
            >>> processor = DocumentProcessor()
            >>> result = processor.extract_chunks("document.pdf", chunk_size=1000)
            >>> for i, chunk in enumerate(result.chunks):
            ...     print(f"Chunk {i+1}: {len(chunk)} chars")
            >>> # Save chunks to markdown files
            >>> result.save_to_md("output/chunks")
            >>>
            >>> # With position metadata
            >>> result = processor.extract_chunks("doc.pdf", include_position_metadata=True)
            >>> if result.has_metadata:
            ...     for chunk_data in result.chunks_with_metadata:
            ...         print(f"Page {chunk_data['page_number']}: lines {chunk_data['line_start']}-{chunk_data['line_end']}")
        """
        # Extract text
        text = self.extract_text(
            file_path=file_path,
            file_extension=file_extension,
            extract_metadata=extract_metadata,
            ocr_processing=ocr_processing,
            **kwargs
        )

        # Determine file extension for chunking
        if file_extension is None:
            file_extension = os.path.splitext(str(file_path))[1].lstrip('.')

        # Chunk text
        chunks = self.chunk_text(
            text=text,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            file_extension=file_extension,
            preserve_tables=preserve_tables,
            include_position_metadata=include_position_metadata
        )

        # Return ChunkResult with source file info
        return ChunkResult(
            chunks=chunks,
            source_file=str(file_path)
        )

    # =========================================================================
    # Public Methods - Utilities
    # =========================================================================

    def get_file_category(self, file_extension: str) -> str:
        """
        Return the category of a file extension.

        Args:
            file_extension: File extension

        Returns:
            Category string ('document', 'text', 'code', 'data', etc.)
        """
        ext = file_extension.lower().lstrip('.')

        if ext in self.DOCUMENT_TYPES:
            return 'document'
        if ext in self.TEXT_TYPES:
            return 'text'
        if ext in self.CODE_TYPES:
            return 'code'
        if ext in self.CONFIG_TYPES:
            return 'config'
        if ext in self.DATA_TYPES:
            return 'data'
        if ext in self.SCRIPT_TYPES:
            return 'script'
        if ext in self.LOG_TYPES:
            return 'log'
        if ext in self.WEB_TYPES:
            return 'web'
        if ext in self.IMAGE_TYPES:
            return 'image'

        return 'unknown'

    def is_supported(self, file_extension: str) -> bool:
        """
        Check if a file extension is supported.

        Args:
            file_extension: File extension

        Returns:
            Whether supported
        """
        ext = file_extension.lower().lstrip('.')
        return ext in self.supported_extensions

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean text.

        Args:
            text: Text to clean

        Returns:
            Cleaned text
        """
        from xgen_doc2chunk.core.functions.utils import clean_text as _clean_text
        return _clean_text(text)

    @staticmethod
    def clean_code_text(text: str) -> str:
        """
        Clean code text.

        Args:
            text: Code text to clean

        Returns:
            Cleaned code text
        """
        from xgen_doc2chunk.core.functions.utils import clean_code_text as _clean_code_text
        return _clean_code_text(text)

    # =========================================================================
    # Private Methods
    # =========================================================================

    def _create_image_processor(
        self,
        directory: Optional[str] = None,
        tag_prefix: Optional[str] = None,
        tag_suffix: Optional[str] = None
    ) -> Any:
        """
        Create an ImageProcessor instance for this DocumentProcessor.

        This creates an instance-specific ImageProcessor that will be
        passed to handlers via config.

        Args:
            directory: Image save directory
            tag_prefix: Image tag prefix
            tag_suffix: Image tag suffix

        Returns:
            ImageProcessor instance
        """
        from xgen_doc2chunk.core.functions.img_processor import create_image_processor

        return create_image_processor(
            directory_path=directory,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix
        )

    def _create_page_tag_processor(
        self,
        page_tag_prefix: Optional[str] = None,
        page_tag_suffix: Optional[str] = None,
        slide_tag_prefix: Optional[str] = None,
        slide_tag_suffix: Optional[str] = None
    ) -> Any:
        """
        Create a PageTagProcessor instance for this DocumentProcessor.

        This creates an instance-specific PageTagProcessor that will be
        passed to handlers via config.

        Args:
            page_tag_prefix: Page tag prefix (default: "[Page Number: ")
            page_tag_suffix: Page tag suffix (default: "]")
            slide_tag_prefix: Slide tag prefix (default: "[Slide Number: ")
            slide_tag_suffix: Slide tag suffix (default: "]")

        Returns:
            PageTagProcessor instance
        """
        from xgen_doc2chunk.core.functions.page_tag_processor import PageTagProcessor

        return PageTagProcessor(
            tag_prefix=page_tag_prefix,
            tag_suffix=page_tag_suffix,
            slide_prefix=slide_tag_prefix,
            slide_suffix=slide_tag_suffix
        )

    def _create_chart_processor(
        self,
        chart_tag_prefix: Optional[str] = None,
        chart_tag_suffix: Optional[str] = None
    ) -> Any:
        """
        Create a ChartProcessor instance for this DocumentProcessor.

        This creates an instance-specific ChartProcessor that will be
        passed to handlers via config.

        Args:
            chart_tag_prefix: Chart tag prefix (default: "[chart]")
            chart_tag_suffix: Chart tag suffix (default: "[/chart]")

        Returns:
            ChartProcessor instance
        """
        from xgen_doc2chunk.core.functions.chart_processor import ChartProcessor

        return ChartProcessor(
            tag_prefix=chart_tag_prefix,
            tag_suffix=chart_tag_suffix
        )

    def _create_metadata_formatter(
        self,
        metadata_tag_prefix: Optional[str] = None,
        metadata_tag_suffix: Optional[str] = None
    ) -> Any:
        """
        Create a MetadataFormatter instance for this DocumentProcessor.

        This creates an instance-specific MetadataFormatter that will be
        passed to handlers via config.

        Args:
            metadata_tag_prefix: Opening tag (default: "<Document-Metadata>")
            metadata_tag_suffix: Closing tag (default: "</Document-Metadata>")

        Returns:
            MetadataFormatter instance
        """
        from xgen_doc2chunk.core.functions.metadata_extractor import MetadataFormatter

        kwargs = {}
        if metadata_tag_prefix is not None:
            kwargs["metadata_tag_prefix"] = metadata_tag_prefix
        if metadata_tag_suffix is not None:
            kwargs["metadata_tag_suffix"] = metadata_tag_suffix

        return MetadataFormatter(**kwargs)

    def _build_supported_extensions(self) -> List[str]:
        """Build list of supported extensions."""
        extensions = list(
            self.DOCUMENT_TYPES |
            self.TEXT_TYPES |
            self.CODE_TYPES |
            self.CONFIG_TYPES |
            self.DATA_TYPES |
            self.SCRIPT_TYPES |
            self.LOG_TYPES |
            self.WEB_TYPES |
            self.IMAGE_TYPES
        )

        return sorted(extensions)

def _get_handler_registry(self) -> Dict[str, Callable]:
|
|
969
|
+
"""Build and cache handler registry.
|
|
970
|
+
|
|
971
|
+
All handlers are class-based, inheriting from BaseHandler.
|
|
972
|
+
"""
|
|
973
|
+
if self._handler_registry is not None:
|
|
974
|
+
return self._handler_registry
|
|
975
|
+
|
|
976
|
+
self._handler_registry = {}
|
|
977
|
+
|
|
978
|
+
# PDF handler
|
|
979
|
+
try:
|
|
980
|
+
from xgen_doc2chunk.core.processor.pdf_handler import PDFHandler
|
|
981
|
+
pdf_handler = PDFHandler(
|
|
982
|
+
config=self._config,
|
|
983
|
+
image_processor=self._image_processor,
|
|
984
|
+
page_tag_processor=self._page_tag_processor,
|
|
985
|
+
chart_processor=self._chart_processor
|
|
986
|
+
)
|
|
987
|
+
self._handler_registry['pdf'] = pdf_handler.extract_text
|
|
988
|
+
except ImportError as e:
|
|
989
|
+
self._logger.warning(f"PDF handler not available: {e}")
|
|
990
|
+
|
|
991
|
+
# DOCX handler
|
|
992
|
+
try:
|
|
993
|
+
from xgen_doc2chunk.core.processor.docx_handler import DOCXHandler
|
|
994
|
+
docx_handler = DOCXHandler(
|
|
995
|
+
config=self._config,
|
|
996
|
+
image_processor=self._image_processor,
|
|
997
|
+
page_tag_processor=self._page_tag_processor,
|
|
998
|
+
chart_processor=self._chart_processor
|
|
999
|
+
)
|
|
1000
|
+
self._handler_registry['docx'] = docx_handler.extract_text
|
|
1001
|
+
except ImportError as e:
|
|
1002
|
+
self._logger.warning(f"DOCX handler not available: {e}")
|
|
1003
|
+
|
|
1004
|
+
# DOC handler
|
|
1005
|
+
try:
|
|
1006
|
+
from xgen_doc2chunk.core.processor.doc_handler import DOCHandler
|
|
1007
|
+
doc_handler = DOCHandler(
|
|
1008
|
+
config=self._config,
|
|
1009
|
+
image_processor=self._image_processor,
|
|
1010
|
+
page_tag_processor=self._page_tag_processor,
|
|
1011
|
+
chart_processor=self._chart_processor
|
|
1012
|
+
)
|
|
1013
|
+
self._handler_registry['doc'] = doc_handler.extract_text
|
|
1014
|
+
except ImportError as e:
|
|
1015
|
+
self._logger.warning(f"DOC handler not available: {e}")
|
|
1016
|
+
|
|
1017
|
+
# RTF handler
|
|
1018
|
+
try:
|
|
1019
|
+
from xgen_doc2chunk.core.processor.rtf_handler import RTFHandler
|
|
1020
|
+
rtf_handler = RTFHandler(
|
|
1021
|
+
config=self._config,
|
|
1022
|
+
image_processor=self._image_processor,
|
|
1023
|
+
page_tag_processor=self._page_tag_processor,
|
|
1024
|
+
chart_processor=self._chart_processor
|
|
1025
|
+
)
|
|
1026
|
+
self._handler_registry['rtf'] = rtf_handler.extract_text
|
|
1027
|
+
except ImportError as e:
|
|
1028
|
+
self._logger.warning(f"RTF handler not available: {e}")
|
|
1029
|
+
|
|
1030
|
+
# PPT/PPTX handler
|
|
1031
|
+
try:
|
|
1032
|
+
from xgen_doc2chunk.core.processor.ppt_handler import PPTHandler
|
|
1033
|
+
ppt_handler = PPTHandler(
|
|
1034
|
+
config=self._config,
|
|
1035
|
+
image_processor=self._image_processor,
|
|
1036
|
+
page_tag_processor=self._page_tag_processor,
|
|
1037
|
+
chart_processor=self._chart_processor
|
|
1038
|
+
)
|
|
1039
|
+
self._handler_registry['ppt'] = ppt_handler.extract_text
|
|
1040
|
+
self._handler_registry['pptx'] = ppt_handler.extract_text
|
|
1041
|
+
except ImportError as e:
|
|
1042
|
+
self._logger.warning(f"PPT handler not available: {e}")
|
|
1043
|
+
|
|
1044
|
+
        # Excel handler
        try:
            from xgen_doc2chunk.core.processor.excel_handler import ExcelHandler
            excel_handler = ExcelHandler(
                config=self._config,
                image_processor=self._image_processor,
                page_tag_processor=self._page_tag_processor,
                chart_processor=self._chart_processor
            )
            self._handler_registry['xlsx'] = excel_handler.extract_text
            self._handler_registry['xls'] = excel_handler.extract_text
        except ImportError as e:
            self._logger.warning(f"Excel handler not available: {e}")

        # CSV/TSV handler
        try:
            from xgen_doc2chunk.core.processor.csv_handler import CSVHandler
            csv_handler = CSVHandler(
                config=self._config,
                image_processor=self._image_processor,
                page_tag_processor=self._page_tag_processor,
                chart_processor=self._chart_processor
            )
            self._handler_registry['csv'] = csv_handler.extract_text
            self._handler_registry['tsv'] = csv_handler.extract_text
        except ImportError as e:
            self._logger.warning(f"CSV handler not available: {e}")

        # HWP handler
        try:
            from xgen_doc2chunk.core.processor.hwp_handler import HWPHandler
            hwp_handler = HWPHandler(
                config=self._config,
                image_processor=self._image_processor,
                page_tag_processor=self._page_tag_processor,
                chart_processor=self._chart_processor
            )
            self._handler_registry['hwp'] = hwp_handler.extract_text
        except ImportError as e:
            self._logger.warning(f"HWP handler not available: {e}")

        # HWPX handler
        try:
            from xgen_doc2chunk.core.processor.hwpx_handler import HWPXHandler
            hwpx_handler = HWPXHandler(
                config=self._config,
                image_processor=self._image_processor,
                page_tag_processor=self._page_tag_processor,
                chart_processor=self._chart_processor
            )
            self._handler_registry['hwpx'] = hwpx_handler.extract_text
        except ImportError as e:
            self._logger.warning(f"HWPX handler not available: {e}")

        # Text handler (for text, code, config, script, log, web types)
        try:
            from xgen_doc2chunk.core.processor.text_handler import TextHandler
            text_handler = TextHandler(
                config=self._config,
                image_processor=self._image_processor,
                page_tag_processor=self._page_tag_processor,
                chart_processor=self._chart_processor
            )
            text_extensions = (
                self.TEXT_TYPES |
                self.CODE_TYPES |
                self.CONFIG_TYPES |
                self.SCRIPT_TYPES |
                self.LOG_TYPES |
                self.WEB_TYPES
            )
            for ext in text_extensions:
                self._handler_registry[ext] = text_handler.extract_text
        except ImportError as e:
            self._logger.warning(f"Text handler not available: {e}")

        # Image file handler (for standalone image files)
        # Requires OCR engine for text extraction
        try:
            from xgen_doc2chunk.core.processor.image_file_handler import ImageFileHandler
            image_handler = ImageFileHandler(
                config=self._config,
                image_processor=self._image_processor,
                page_tag_processor=self._page_tag_processor,
                chart_processor=self._chart_processor,
                ocr_engine=self._ocr_engine
            )
            for ext in self.IMAGE_TYPES:
                self._handler_registry[ext] = image_handler.extract_text
        except ImportError as e:
            self._logger.warning(f"Image file handler not available: {e}")

        return self._handler_registry
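
    # Illustrative sketch (hedged): once the optional handlers above are
    # registered, dispatch is a plain dict lookup on the lowercase extension;
    # an entry is simply missing when the matching import failed. The instance
    # name `proc` is hypothetical.
    #   >>> proc = DocumentProcessor()
    #   >>> registry = proc._get_handler_registry()
    #   >>> handler = registry.get('xlsx')   # bound ExcelHandler.extract_text, or None
    #   >>> if handler is None:
    #   ...     print("xlsx support not available")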

    def _create_current_file(self, file_path: str, ext: str) -> CurrentFile:
        """
        Create a CurrentFile dict from a file path.

        Reads the file at binary level to avoid path encoding issues
        (e.g., Korean characters in Windows paths).

        Args:
            file_path: Absolute path to the file
            ext: File extension (lowercase, without dot)

        Returns:
            CurrentFile dict containing file info and binary data

        Raises:
            IOError: If file cannot be read
        """
        file_path = os.path.abspath(file_path)
        file_name = os.path.basename(file_path)

        # Read file as binary
        with open(file_path, 'rb') as f:
            file_data = f.read()

        # Create BytesIO stream for handlers that need seekable stream
        file_stream = io.BytesIO(file_data)

        # Return as plain dict (TypedDict is for type hints only)
        return {
            "file_path": file_path,
            "file_name": file_name,
            "file_extension": ext,
            "file_data": file_data,
            "file_stream": file_stream,
            "file_size": len(file_data)
        }
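
    # Illustrative sketch (hedged; the path is hypothetical): the returned dict
    # carries both the raw bytes and a seekable BytesIO stream, and file_size
    # is derived from the bytes read above.
    #   >>> proc = DocumentProcessor()
    #   >>> cf = proc._create_current_file("/tmp/report.xlsx", "xlsx")
    #   >>> cf["file_name"], cf["file_extension"]
    #   ('report.xlsx', 'xlsx')
    #   >>> cf["file_size"] == len(cf["file_data"])
    #   True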

    def _get_handler(self, ext: str) -> Optional[Callable]:
        """Get handler for file extension."""
        registry = self._get_handler_registry()
        return registry.get(ext)

    def _invoke_handler(
        self,
        handler: Optional[Callable],
        current_file: CurrentFile,
        ext: str,
        extract_metadata: bool,
        **kwargs
    ) -> str:
        """
        Invoke the appropriate handler based on extension.

        All handlers are class-based and use the same signature:
            handler(current_file, extract_metadata=..., **kwargs)

        Args:
            handler: Handler method (bound method from Handler class)
            current_file: CurrentFile dict containing file info and binary data
            ext: File extension
            extract_metadata: Whether to extract metadata
            **kwargs: Additional options

        Returns:
            Extracted text
        """
        if handler is None:
            raise ValueError(f"No handler available for extension: {ext}")

        # Determine if this is a code file
        is_code = ext in self.CODE_TYPES

        # Text-based files include file_type and is_code in kwargs
        text_extensions = (
            self.TEXT_TYPES |
            self.CODE_TYPES |
            self.CONFIG_TYPES |
            self.SCRIPT_TYPES |
            self.LOG_TYPES |
            self.WEB_TYPES
        )

        if ext in text_extensions:
            return handler(current_file, extract_metadata=extract_metadata, file_type=ext, is_code=is_code, **kwargs)

        # All other handlers use standard signature
        return handler(current_file, extract_metadata=extract_metadata, **kwargs)
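
    # Illustrative sketch (hedged; assumes 'txt' is among the text-type
    # extensions and that the text handler import succeeded): text-type files
    # get file_type/is_code forwarded, while all other extensions use the
    # plain handler signature.
    #   >>> proc = DocumentProcessor()
    #   >>> cf = proc._create_current_file("/tmp/notes.txt", "txt")
    #   >>> handler = proc._get_handler("txt")
    #   >>> if handler is not None:
    #   ...     text = proc._invoke_handler(handler, cf, "txt", extract_metadata=False)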

    # =========================================================================
    # Context Manager Support
    # =========================================================================

    def __enter__(self) -> "DocumentProcessor":
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        """Context manager exit."""
        # Perform resource cleanup here if needed
        pass
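
    # Illustrative sketch (hedged): __exit__ performs no cleanup yet, so the
    # `with` form currently behaves the same as using the instance directly.
    #   >>> with DocumentProcessor() as proc:
    #   ...     handler = proc._get_handler("csv")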

    # =========================================================================
    # String Representation
    # =========================================================================

    def __repr__(self) -> str:
        return f"DocumentProcessor(supported_extensions={len(self.supported_extensions)})"

    def __str__(self) -> str:
        return f"xgen_doc2chunk DocumentProcessor ({len(self.supported_extensions)} supported formats)"


# === Module-level Convenience Functions ===

def create_processor(
    config: Optional[Union[Dict[str, Any], Any]] = None,
    ocr_engine: Optional[Any] = None,
    *,
    image_directory: Optional[str] = None,
    image_tag_prefix: Optional[str] = None,
    image_tag_suffix: Optional[str] = None,
    **kwargs
) -> DocumentProcessor:
    """
    Create a DocumentProcessor instance.

    Args:
        config: Configuration dictionary or ConfigComposer instance
        ocr_engine: OCR engine instance (BaseOCR subclass)
        image_directory: Directory path for saving extracted images
        image_tag_prefix: Prefix for image tags (default: "[Image:")
        image_tag_suffix: Suffix for image tags (default: "]")
        **kwargs: Additional configuration options

    Returns:
        DocumentProcessor instance

    Example:
        >>> processor = create_processor()
        >>> processor = create_processor(config={"vision_model": "gpt-4-vision"})

        # With OCR engine
        >>> from xgen_doc2chunk.ocr.ocr_engine import OpenAIOCR
        >>> ocr = OpenAIOCR(api_key="sk-...", model="gpt-4o")
        >>> processor = create_processor(ocr_engine=ocr)

        # With custom image tags (HTML format)
        >>> processor = create_processor(
        ...     image_directory="output/images",
        ...     image_tag_prefix="<img src='",
        ...     image_tag_suffix="'/>"
        ... )
    """
    return DocumentProcessor(
        config=config,
        ocr_engine=ocr_engine,
        image_directory=image_directory,
        image_tag_prefix=image_tag_prefix,
        image_tag_suffix=image_tag_suffix,
        **kwargs
    )


__all__ = [
    "DocumentProcessor",
    "CurrentFile",
    "ChunkResult",
    "create_processor",
]
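

# Illustrative end-to-end sketch (hedged; the extension is hypothetical and the
# public extraction entry point is defined earlier in this module, so only the
# construction and handler-lookup steps are shown):
#   >>> proc = create_processor(image_directory="output/images")
#   >>> handler = proc._get_handler("hwp")   # None if the HWP handler is unavailable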