xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/storage_backend.py
|
|
2
|
+
"""
|
|
3
|
+
Storage Backend Module
|
|
4
|
+
|
|
5
|
+
Provides abstract base class and implementations for image storage backends.
|
|
6
|
+
ImageProcessor uses these backends to save images to different storage systems.
|
|
7
|
+
|
|
8
|
+
Storage Backends:
|
|
9
|
+
- LocalStorageBackend: Save to local file system
|
|
10
|
+
- MinIOStorageBackend: Save to MinIO object storage (stub)
|
|
11
|
+
- S3StorageBackend: Save to AWS S3 (stub)
|
|
12
|
+
|
|
13
|
+
Usage Example:
|
|
14
|
+
from xgen_doc2chunk.core.functions.storage_backend import (
|
|
15
|
+
LocalStorageBackend,
|
|
16
|
+
MinIOStorageBackend,
|
|
17
|
+
)
|
|
18
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
19
|
+
|
|
20
|
+
# Use local storage (default)
|
|
21
|
+
processor = ImageProcessor()
|
|
22
|
+
|
|
23
|
+
# Use MinIO storage
|
|
24
|
+
minio_backend = MinIOStorageBackend(
|
|
25
|
+
endpoint="localhost:9000",
|
|
26
|
+
bucket="images"
|
|
27
|
+
)
|
|
28
|
+
processor = ImageProcessor(storage_backend=minio_backend)
|
|
29
|
+
"""
|
|
30
|
+
import logging
|
|
31
|
+
import os
|
|
32
|
+
from abc import ABC, abstractmethod
|
|
33
|
+
from enum import Enum
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Any, Dict, Optional
|
|
36
|
+
|
|
37
|
+
# Module-level logger for the storage namespace; the per-backend loggers
# created in BaseStorageBackend.__init__ use names nested below this one.
logger = logging.getLogger(
    "xgen_doc2chunk.storage"
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class StorageType(Enum):
    """
    Identifiers for the storage systems a backend can target.

    Each member's value is the lowercase string name of the storage system.
    """

    # Local file system
    LOCAL = "local"
    # MinIO object storage
    MINIO = "minio"
    # AWS S3
    S3 = "s3"
    # Azure Blob storage
    AZURE_BLOB = "azure_blob"
    # Google Cloud Storage
    GCS = "gcs"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class BaseStorageBackend(ABC):
    """
    Common interface that every storage backend implements.

    A concrete backend supplies the storage-specific logic for four
    abstract operations:
        - save(): write binary data to the storage
        - delete(): remove a stored file
        - exists(): report whether a file is present
        - ensure_ready(): prepare the storage (create dirs, validate connection)

    build_url() has a default implementation that subclasses may override.
    """

    def __init__(self, storage_type: StorageType):
        # Each concrete backend gets its own logger, named after its class.
        logger_name = f"xgen_doc2chunk.storage.{self.__class__.__name__}"
        self._logger = logging.getLogger(logger_name)
        self._storage_type = storage_type

    @property
    def storage_type(self) -> StorageType:
        """Storage type this backend was constructed with."""
        return self._storage_type

    @property
    def logger(self) -> logging.Logger:
        """Logger dedicated to this backend class."""
        return self._logger

    @abstractmethod
    def save(self, data: bytes, file_path: str) -> bool:
        """
        Write data to the storage.

        Args:
            data: Binary payload to store
            file_path: Destination file path or key

        Returns:
            True on success, False otherwise
        """

    @abstractmethod
    def delete(self, file_path: str) -> bool:
        """
        Remove a file from the storage.

        Args:
            file_path: File path or key to remove

        Returns:
            True on success, False otherwise
        """

    @abstractmethod
    def exists(self, file_path: str) -> bool:
        """
        Report whether a file is present in the storage.

        Args:
            file_path: File path or key to look up

        Returns:
            True if the file exists
        """

    @abstractmethod
    def ensure_ready(self, directory_path: str) -> None:
        """
        Prepare the storage for use (create directory, validate connection, etc.).

        Args:
            directory_path: Base directory or bucket path
        """

    def build_url(self, file_path: str) -> str:
        """
        Produce the URL or path under which a saved file is referenced.

        The default turns every backslash into a forward slash; subclasses
        override this for storage-specific URL formats.

        Args:
            file_path: File path or key

        Returns:
            URL or path string
        """
        segments = file_path.split("\\")
        return "/".join(segments)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class LocalStorageBackend(BaseStorageBackend):
    """
    Local file system storage backend.

    Reads and writes files directly on the local file system. All
    operations report failure through their return value (and a log
    record) instead of raising.
    """

    def __init__(self):
        super().__init__(StorageType.LOCAL)

    def save(self, data: bytes, file_path: str) -> bool:
        """
        Write data to a local file, replacing any existing content.

        The parent directory is not created here; call ensure_ready() first.

        Args:
            data: Binary data to write
            file_path: Destination path on the local file system

        Returns:
            True if the file was written, False if any error occurred
            (the error is logged).
        """
        try:
            with open(file_path, 'wb') as f:
                f.write(data)
        except Exception as e:
            # Deliberately broad: saving is best-effort and must not raise.
            self._logger.error(f"Failed to save file {file_path}: {e}")
            return False
        return True

    def delete(self, file_path: str) -> bool:
        """
        Delete a local file.

        Args:
            file_path: Path of the file to delete

        Returns:
            True if the file was removed; False if it did not exist or
            could not be removed (the latter is logged as a warning).
        """
        try:
            # Attempt the removal directly instead of checking existence
            # first: a separate exists() check can race with another
            # process deleting the file in between.
            os.remove(file_path)
        except (FileNotFoundError, NotADirectoryError):
            # Nothing at that path: not an error, just nothing to delete.
            return False
        except Exception as e:
            self._logger.warning(f"Failed to delete file {file_path}: {e}")
            return False
        return True

    def exists(self, file_path: str) -> bool:
        """
        Check whether a path exists on the local file system.

        Args:
            file_path: Path to check

        Returns:
            True if the path exists
        """
        return os.path.exists(file_path)

    def ensure_ready(self, directory_path: str) -> None:
        """
        Create the directory (including parents) if it doesn't exist.

        Args:
            directory_path: Directory that files will be saved into
        """
        path = Path(directory_path)
        if path.exists():
            return
        # exist_ok=True tolerates another process creating it concurrently.
        path.mkdir(parents=True, exist_ok=True)
        self._logger.debug(f"Created directory: {path}")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class MinIOStorageBackend(BaseStorageBackend):
    """
    MinIO object storage backend (STUB - Not Implemented).

    This is a placeholder for MinIO integration.
    Requires minio package to be installed.

    Only the constructor, the read-only properties and build_url() are
    usable; save(), delete(), exists() and ensure_ready() all raise
    NotImplementedError.

    Args:
        endpoint: MinIO server endpoint
        access_key: MinIO access key
        secret_key: MinIO secret key
        bucket: Target bucket name
        secure: Use HTTPS (default: True)
    """

    def __init__(
        self,
        endpoint: str = "localhost:9000",
        access_key: str = "",
        secret_key: str = "",
        bucket: str = "images",
        secure: bool = True,
    ):
        super().__init__(StorageType.MINIO)
        self._endpoint = endpoint
        self._access_key = access_key
        self._secret_key = secret_key
        self._bucket = bucket
        self._secure = secure
        # No client is created yet; the stub never connects to a server.
        self._client = None

        self._logger.warning(
            "MinIOStorageBackend is a stub implementation. "
            "Full implementation is pending."
        )

    @property
    def bucket(self) -> str:
        """Get bucket name."""
        return self._bucket

    @property
    def endpoint(self) -> str:
        """Get endpoint."""
        return self._endpoint

    def save(self, data: bytes, file_path: str) -> bool:
        """
        Upload data to MinIO bucket.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.save() is not yet implemented. "
            "Use LocalStorageBackend for now."
        )

    def delete(self, file_path: str) -> bool:
        """
        Delete object from MinIO bucket.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.delete() is not yet implemented."
        )

    def exists(self, file_path: str) -> bool:
        """
        Check if object exists in MinIO bucket.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.exists() is not yet implemented."
        )

    def ensure_ready(self, directory_path: str) -> None:
        """
        Initialize MinIO client and ensure bucket exists.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.ensure_ready() is not yet implemented."
        )

    def build_url(self, file_path: str) -> str:
        """
        Build MinIO URL for the file.

        Args:
            file_path: Object key (backslashes are converted to forward
                slashes, matching BaseStorageBackend.build_url)

        Returns:
            "<protocol>://<endpoint>/<bucket>/<key>", where protocol is
            "https" when the backend was created with secure=True and
            "http" otherwise.
        """
        # Would return presigned URL or object path
        # Keep the separator normalization the base class applies, so a
        # Windows-style path does not leak backslashes into the URL.
        key = file_path.replace("\\", "/")
        protocol = "https" if self._secure else "http"
        return f"{protocol}://{self._endpoint}/{self._bucket}/{key}"
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
class S3StorageBackend(BaseStorageBackend):
    """
    AWS S3 storage backend (STUB - Not Implemented).

    This is a placeholder for AWS S3 integration.
    Requires boto3 package to be installed.

    Only the constructor, the read-only properties and build_url() are
    usable; save(), delete(), exists() and ensure_ready() all raise
    NotImplementedError.

    Args:
        bucket: S3 bucket name
        region: AWS region (default: "us-east-1")
        prefix: Key prefix for uploaded objects
    """

    def __init__(
        self,
        bucket: str = "",
        region: str = "us-east-1",
        prefix: str = "",
    ):
        super().__init__(StorageType.S3)
        self._bucket = bucket
        self._region = region
        # Stored only; no method of this class applies the prefix yet.
        self._prefix = prefix
        # No client is created yet; the stub never talks to AWS.
        self._client = None

        self._logger.warning(
            "S3StorageBackend is a stub implementation. "
            "Full implementation is pending."
        )

    @property
    def bucket(self) -> str:
        """Get bucket name."""
        return self._bucket

    @property
    def region(self) -> str:
        """Get region."""
        return self._region

    def save(self, data: bytes, file_path: str) -> bool:
        """
        Upload data to S3 bucket.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "S3StorageBackend.save() is not yet implemented. "
            "Use LocalStorageBackend for now."
        )

    def delete(self, file_path: str) -> bool:
        """
        Delete object from S3 bucket.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "S3StorageBackend.delete() is not yet implemented."
        )

    def exists(self, file_path: str) -> bool:
        """
        Check if object exists in S3 bucket.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "S3StorageBackend.exists() is not yet implemented."
        )

    def ensure_ready(self, directory_path: str) -> None:
        """
        Initialize S3 client and verify bucket access.

        Raises:
            NotImplementedError: Always; the backend is a stub.
        """
        raise NotImplementedError(
            "S3StorageBackend.ensure_ready() is not yet implemented."
        )

    def build_url(self, file_path: str) -> str:
        """
        Build S3 URL for the file.

        Args:
            file_path: Object key (backslashes are converted to forward
                slashes, matching BaseStorageBackend.build_url)

        Returns:
            URI of the form "s3://<bucket>/<key>"
        """
        # Would return S3 URI or presigned URL
        # Keep the separator normalization the base class applies, so a
        # Windows-style path does not leak backslashes into the URI.
        key = file_path.replace("\\", "/")
        return f"s3://{self._bucket}/{key}"
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# Shared default: a single LocalStorageBackend built once at import time.
_default_backend = LocalStorageBackend()


def get_default_backend() -> BaseStorageBackend:
    """
    Return the module's default storage backend.

    Returns:
        The module-level LocalStorageBackend instance; every call hands
        back the same object.
    """
    return _default_backend
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def create_storage_backend(
    storage_type: StorageType = StorageType.LOCAL,
    **kwargs
) -> BaseStorageBackend:
    """
    Build a storage backend for the requested storage type.

    Args:
        storage_type: Which kind of backend to create
        **kwargs: Options forwarded to the MinIO or S3 backend constructor;
            ignored for the local backend

    Returns:
        A new BaseStorageBackend instance

    Raises:
        ValueError: If no backend is available for storage_type
    """
    # The local backend takes no options, so kwargs are not forwarded.
    if storage_type == StorageType.LOCAL:
        return LocalStorageBackend()

    # Backends that accept constructor options, checked in order.
    configurable_backends = (
        (StorageType.MINIO, MinIOStorageBackend),
        (StorageType.S3, S3StorageBackend),
    )
    for candidate, backend_cls in configurable_backends:
        if storage_type == candidate:
            return backend_cls(**kwargs)

    raise ValueError(f"Unsupported storage type: {storage_type}")
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
# Names exported by `from ... import *`, grouped by role.
__all__ = [
    "StorageType",             # enum
    "BaseStorageBackend",      # base class
    "LocalStorageBackend",     # implementation
    "MinIOStorageBackend",     # implementation (stub)
    "S3StorageBackend",        # implementation (stub)
    "create_storage_backend",  # factory
    "get_default_backend",     # factory
]
|