xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,381 @@
1
+ # xgen_doc2chunk/core/functions/storage_backend.py
2
+ """
3
+ Storage Backend Module
4
+
5
+ Provides abstract base class and implementations for image storage backends.
6
+ ImageProcessor uses these backends to save images to different storage systems.
7
+
8
+ Storage Backends:
9
+ - LocalStorageBackend: Save to local file system
10
+ - MinIOStorageBackend: Save to MinIO object storage (stub)
11
+ - S3StorageBackend: Save to AWS S3 (stub)
12
+
13
+ Usage Example:
14
+ from xgen_doc2chunk.core.functions.storage_backend import (
15
+ LocalStorageBackend,
16
+ MinIOStorageBackend,
17
+ )
18
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
19
+
20
+ # Use local storage (default)
21
+ processor = ImageProcessor()
22
+
23
+ # Use MinIO storage (currently a stub: save/delete/exists/ensure_ready raise NotImplementedError)
24
+ minio_backend = MinIOStorageBackend(
25
+ endpoint="localhost:9000",
26
+ bucket="images"
27
+ )
28
+ processor = ImageProcessor(storage_backend=minio_backend)
29
+ """
30
+ import logging
31
+ import os
32
+ from abc import ABC, abstractmethod
33
+ from enum import Enum
34
+ from pathlib import Path
35
+ from typing import Any, Dict, Optional
36
+
37
+ logger = logging.getLogger("xgen_doc2chunk.storage")
38
+
39
+
40
class StorageType(Enum):
    """
    Identifiers for the kinds of storage a backend can target.

    Each member's value is the lowercase string name of the storage system.
    Only LOCAL, MINIO and S3 have a backend class in this module.
    """

    # Local file system
    LOCAL = "local"
    # MinIO object storage
    MINIO = "minio"
    # AWS S3
    S3 = "s3"
    # Azure Blob storage (no backend class is defined for it in this module)
    AZURE_BLOB = "azure_blob"
    # Google Cloud Storage (no backend class is defined for it in this module)
    GCS = "gcs"
47
+
48
+
49
class BaseStorageBackend(ABC):
    """
    Common interface that every storage backend implements.

    A concrete backend supplies the storage-specific behaviour by
    overriding the four abstract methods:
        - save(): write binary data to the storage
        - delete(): remove a stored file
        - exists(): report whether a file is present
        - ensure_ready(): prepare the storage before use

    build_url() has a default implementation that subclasses may override.
    """

    def __init__(self, storage_type: StorageType):
        # Each concrete backend logs under its own class name.
        self._logger = logging.getLogger(
            f"xgen_doc2chunk.storage.{self.__class__.__name__}"
        )
        self._storage_type = storage_type

    @property
    def storage_type(self) -> StorageType:
        """Storage type this backend was constructed with."""
        return self._storage_type

    @property
    def logger(self) -> logging.Logger:
        """Logger dedicated to this backend class."""
        return self._logger

    @abstractmethod
    def save(self, data: bytes, file_path: str) -> bool:
        """
        Store binary data.

        Args:
            data: Bytes to store
            file_path: Destination file path or object key

        Returns:
            True on success, False otherwise
        """

    @abstractmethod
    def delete(self, file_path: str) -> bool:
        """
        Remove a file from the storage.

        Args:
            file_path: File path or object key to remove

        Returns:
            True on success, False otherwise
        """

    @abstractmethod
    def exists(self, file_path: str) -> bool:
        """
        Report whether a file is present in the storage.

        Args:
            file_path: File path or object key to look up

        Returns:
            True if the file exists
        """

    @abstractmethod
    def ensure_ready(self, directory_path: str) -> None:
        """
        Prepare the storage for use (create directory, validate connection, etc.).

        Args:
            directory_path: Base directory or bucket path
        """

    def build_url(self, file_path: str) -> str:
        """
        Produce the URL or path under which a saved file can be referenced.

        The default returns the path with every backslash turned into a
        forward slash. Subclasses override this for storage-specific
        URL formats.

        Args:
            file_path: File path or object key

        Returns:
            URL or path string
        """
        normalized = file_path.replace("\\", "/")
        return normalized
142
+
143
+
144
class LocalStorageBackend(BaseStorageBackend):
    """
    Storage backend backed by the local file system.

    ``file_path`` arguments are ordinary file system paths. ``save`` and
    ``delete`` never raise: failures are logged and reported through the
    boolean return value.
    """

    def __init__(self):
        super().__init__(StorageType.LOCAL)

    def save(self, data: bytes, file_path: str) -> bool:
        """
        Write ``data`` to ``file_path``, replacing any existing file.

        The parent directory is not created here; use ``ensure_ready``
        beforehand to create it.

        Args:
            data: Binary data to write
            file_path: Destination path on the local file system

        Returns:
            True if the file was written, False if any exception occurred
            (the exception is logged at error level)
        """
        try:
            with open(file_path, "wb") as f:
                f.write(data)
        except Exception as e:
            # Best-effort contract: report failure via the return value.
            self._logger.error("Failed to save file %s: %s", file_path, e)
            return False
        return True

    def delete(self, file_path: str) -> bool:
        """
        Delete ``file_path`` from the local file system.

        The removal is attempted directly instead of checking for existence
        first, so a file that disappears concurrently is treated the same
        as one that was never there. Note that a dangling symbolic link is
        removed by ``os.remove`` and therefore reported as deleted.

        Args:
            file_path: Path of the file to delete

        Returns:
            True if the file was removed; False if it does not exist or the
            removal failed (failures are logged at warning level)
        """
        try:
            os.remove(file_path)
        except (FileNotFoundError, NotADirectoryError):
            # Nothing to delete: not an error, so no log entry.
            return False
        except Exception as e:
            self._logger.warning("Failed to delete file %s: %s", file_path, e)
            return False
        return True

    def exists(self, file_path: str) -> bool:
        """
        Check whether ``file_path`` exists on the local file system.

        Args:
            file_path: Path to check

        Returns:
            Result of ``os.path.exists`` for the path
        """
        return os.path.exists(file_path)

    def ensure_ready(self, directory_path: str) -> None:
        """
        Create ``directory_path`` (including parents) if it does not exist.

        Args:
            directory_path: Directory that files will be saved into
        """
        path = Path(directory_path)
        if not path.exists():
            # exist_ok=True tolerates another process creating it meanwhile.
            path.mkdir(parents=True, exist_ok=True)
            self._logger.debug("Created directory: %s", path)
185
+
186
+
187
class MinIOStorageBackend(BaseStorageBackend):
    """
    MinIO object storage backend (STUB - Not Implemented).

    Placeholder for MinIO integration: the constructor only records its
    arguments, and save(), delete(), exists() and ensure_ready() all raise
    NotImplementedError. Only build_url() returns a value.
    A full implementation is expected to require the minio package; this
    stub does not import it.

    Args:
        endpoint: MinIO server endpoint (host:port)
        access_key: MinIO access key
        secret_key: MinIO secret key
        bucket: Target bucket name
        secure: Use HTTPS (default: True)
    """

    def __init__(
        self,
        endpoint: str = "localhost:9000",
        access_key: str = "",
        secret_key: str = "",
        bucket: str = "images",
        secure: bool = True,
    ):
        super().__init__(StorageType.MINIO)
        self._endpoint = endpoint
        self._access_key = access_key
        self._secret_key = secret_key
        self._bucket = bucket
        self._secure = secure
        # No client is created by this stub.
        self._client = None

        # Make it visible at construction time that operations will fail.
        self._logger.warning(
            "MinIOStorageBackend is a stub implementation. "
            "Full implementation is pending."
        )

    @property
    def bucket(self) -> str:
        """Get bucket name."""
        return self._bucket

    @property
    def endpoint(self) -> str:
        """Get endpoint."""
        return self._endpoint

    def save(self, data: bytes, file_path: str) -> bool:
        """
        Upload data to MinIO bucket.

        Raises:
            NotImplementedError: Always; the stub has no upload logic.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.save() is not yet implemented. "
            "Use LocalStorageBackend for now."
        )

    def delete(self, file_path: str) -> bool:
        """
        Delete object from MinIO bucket.

        Raises:
            NotImplementedError: Always; the stub has no delete logic.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.delete() is not yet implemented."
        )

    def exists(self, file_path: str) -> bool:
        """
        Check if object exists in MinIO bucket.

        Raises:
            NotImplementedError: Always; the stub has no lookup logic.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.exists() is not yet implemented."
        )

    def ensure_ready(self, directory_path: str) -> None:
        """
        Initialize MinIO client and ensure bucket exists.

        Raises:
            NotImplementedError: Always; the stub has no setup logic.
        """
        raise NotImplementedError(
            "MinIOStorageBackend.ensure_ready() is not yet implemented."
        )

    def build_url(self, file_path: str) -> str:
        """
        Build the MinIO URL for the file.

        The URL has the form ``<protocol>://<endpoint>/<bucket>/<key>``,
        with ``https`` when the backend was created with ``secure=True``
        and ``http`` otherwise. Backslashes in ``file_path`` are converted
        to forward slashes, matching ``BaseStorageBackend.build_url``, so a
        Windows-style path does not leak ``\\`` into the URL.

        Args:
            file_path: Object key (path inside the bucket)

        Returns:
            URL string for the object
        """
        # Would return presigned URL or object path
        protocol = "https" if self._secure else "http"
        object_key = file_path.replace("\\", "/")
        return f"{protocol}://{self._endpoint}/{self._bucket}/{object_key}"
263
+
264
+
265
class S3StorageBackend(BaseStorageBackend):
    """
    AWS S3 storage backend (STUB - Not Implemented).

    Placeholder for AWS S3 integration: the constructor only records its
    arguments, and save(), delete(), exists() and ensure_ready() all raise
    NotImplementedError. Only build_url() returns a value.
    A full implementation is expected to require the boto3 package; this
    stub does not import it.

    Args:
        bucket: S3 bucket name
        region: AWS region (default: "us-east-1")
        prefix: Key prefix for uploaded objects
    """

    def __init__(
        self,
        bucket: str = "",
        region: str = "us-east-1",
        prefix: str = "",
    ):
        super().__init__(StorageType.S3)
        self._bucket = bucket
        self._region = region
        # NOTE(review): the prefix is stored but no method of this stub
        # reads it yet (build_url does not apply it) - confirm the intended
        # key layout when implementing uploads.
        self._prefix = prefix
        # No client is created by this stub.
        self._client = None

        # Make it visible at construction time that operations will fail.
        self._logger.warning(
            "S3StorageBackend is a stub implementation. "
            "Full implementation is pending."
        )

    @property
    def bucket(self) -> str:
        """Get bucket name."""
        return self._bucket

    @property
    def region(self) -> str:
        """Get region."""
        return self._region

    def save(self, data: bytes, file_path: str) -> bool:
        """
        Upload data to S3 bucket.

        Raises:
            NotImplementedError: Always; the stub has no upload logic.
        """
        raise NotImplementedError(
            "S3StorageBackend.save() is not yet implemented. "
            "Use LocalStorageBackend for now."
        )

    def delete(self, file_path: str) -> bool:
        """
        Delete object from S3 bucket.

        Raises:
            NotImplementedError: Always; the stub has no delete logic.
        """
        raise NotImplementedError(
            "S3StorageBackend.delete() is not yet implemented."
        )

    def exists(self, file_path: str) -> bool:
        """
        Check if object exists in S3 bucket.

        Raises:
            NotImplementedError: Always; the stub has no lookup logic.
        """
        raise NotImplementedError(
            "S3StorageBackend.exists() is not yet implemented."
        )

    def ensure_ready(self, directory_path: str) -> None:
        """
        Initialize S3 client and verify bucket access.

        Raises:
            NotImplementedError: Always; the stub has no setup logic.
        """
        raise NotImplementedError(
            "S3StorageBackend.ensure_ready() is not yet implemented."
        )

    def build_url(self, file_path: str) -> str:
        """
        Build the S3 URI for the file.

        The result has the form ``s3://<bucket>/<key>``. Backslashes in
        ``file_path`` are converted to forward slashes, matching
        ``BaseStorageBackend.build_url``, so a Windows-style path does not
        leak ``\\`` into the URI.

        Args:
            file_path: Object key (path inside the bucket)

        Returns:
            S3 URI string for the object
        """
        # Would return S3 URI or presigned URL
        object_key = file_path.replace("\\", "/")
        return f"s3://{self._bucket}/{object_key}"
334
+
335
+
336
# Shared LocalStorageBackend, created once when this module is imported.
_default_backend = LocalStorageBackend()


def get_default_backend() -> BaseStorageBackend:
    """
    Return the module-wide default storage backend.

    Returns:
        The LocalStorageBackend instance created at import time; every
        call returns that same object.
    """
    return _default_backend
343
+
344
+
345
def create_storage_backend(
    storage_type: StorageType = StorageType.LOCAL,
    **kwargs
) -> BaseStorageBackend:
    """
    Build the storage backend that matches ``storage_type``.

    Args:
        storage_type: Which backend to create (defaults to local storage)
        **kwargs: Constructor options passed to MinIOStorageBackend or
            S3StorageBackend; not used for local storage

    Returns:
        A new BaseStorageBackend instance of the requested kind

    Raises:
        ValueError: If ``storage_type`` is not LOCAL, MINIO or S3
    """
    if storage_type == StorageType.LOCAL:
        # LocalStorageBackend takes no options, so kwargs are not forwarded.
        return LocalStorageBackend()

    if storage_type == StorageType.MINIO:
        return MinIOStorageBackend(**kwargs)

    if storage_type == StorageType.S3:
        return S3StorageBackend(**kwargs)

    # Anything else (including AZURE_BLOB and GCS) has no backend class here.
    raise ValueError(f"Unsupported storage type: {storage_type}")
367
+
368
+
369
# Public API of this module.
__all__ = [
    # Storage type enum
    "StorageType",
    # Abstract backend interface
    "BaseStorageBackend",
    # Concrete backends
    "LocalStorageBackend",
    "MinIOStorageBackend",
    "S3StorageBackend",
    # Factory and default-instance accessor
    "create_storage_backend",
    "get_default_backend",
]