earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,548 @@
|
|
|
1
|
+
"""Storage backend abstractions for unified local and cloud storage operations.
|
|
2
|
+
|
|
3
|
+
This module provides a pluggable storage abstraction layer that enables EarthCatalog
|
|
4
|
+
to work seamlessly across different storage systems without code changes. The unified
|
|
5
|
+
interface abstracts away the complexity of different storage protocols while maintaining
|
|
6
|
+
high performance and reliability for large-scale data operations.
|
|
7
|
+
|
|
8
|
+
Async HTTP Integration:
|
|
9
|
+
Storage backends are optimized to work efficiently with EarthCatalog's async HTTP
|
|
10
|
+
processing, providing the storage layer that complements the 3-6x performance
|
|
11
|
+
improvements from concurrent HTTP requests. The storage systems handle the high
|
|
12
|
+
throughput data ingestion that async HTTP enables.
|
|
13
|
+
|
|
14
|
+
Supported Storage Systems:
|
|
15
|
+
LocalStorage: Local filesystem operations with high-performance file I/O
|
|
16
|
+
- Optimized for async HTTP development and testing workflows
|
|
17
|
+
- SSD-friendly I/O patterns for maximum throughput
|
|
18
|
+
- Efficient handling of concurrent worker outputs
|
|
19
|
+
|
|
20
|
+
S3Storage: AWS S3 and S3-compatible storage (MinIO, DigitalOcean Spaces, etc.)
|
|
21
|
+
- Cloud-scale storage for async HTTP production deployments
|
|
22
|
+
- Multipart upload optimization for high-throughput ingestion
|
|
23
|
+
- Connection pooling complements async HTTP connection management
|
|
24
|
+
- Automatic retry strategies aligned with HTTP retry logic
|
|
25
|
+
|
|
26
|
+
Future: Google Cloud Storage, Azure Blob Storage, HDFS via extensible design
|
|
27
|
+
|
|
28
|
+
Key Benefits:
|
|
29
|
+
- Unified API across all storage systems for seamless deployment portability
|
|
30
|
+
- Async HTTP-optimized implementations for maximum throughput
|
|
31
|
+
- Automatic protocol detection and appropriate backend selection
|
|
32
|
+
- Consistent error handling and retry strategies aligned with async HTTP
|
|
33
|
+
- Thread-safe operations for concurrent async worker access patterns
|
|
34
|
+
- Storage throughput scaling that matches async HTTP performance gains
|
|
35
|
+
|
|
36
|
+
Performance Optimizations with Async HTTP:
|
|
37
|
+
Local Storage + Async HTTP:
|
|
38
|
+
- Efficient system calls and memory mapping for high write throughput
|
|
39
|
+
- Optimized file I/O patterns for concurrent worker outputs
|
|
40
|
+
- SSD-friendly write strategies for maximum async HTTP benefit
|
|
41
|
+
|
|
42
|
+
S3 Storage + Async HTTP:
|
|
43
|
+
- Multipart uploads and connection pooling complement async HTTP
|
|
44
|
+
- Batch operations reduce API overhead for high-throughput ingestion
|
|
45
|
+
- Streaming I/O matches async HTTP memory efficiency patterns
|
|
46
|
+
- Cloud-scale storage bandwidth utilizes full async HTTP performance
|
|
47
|
+
|
|
48
|
+
Performance Characteristics:
|
|
49
|
+
- Local + Async: 3-6x faster ingestion with NVMe SSD storage
|
|
50
|
+
- S3 + Async: Linear throughput scaling with worker count
|
|
51
|
+
- Memory efficiency: Storage patterns optimized for async batch processing
|
|
52
|
+
|
|
53
|
+
Design Patterns:
|
|
54
|
+
The storage backends follow the Strategy pattern, allowing the async HTTP-enabled
|
|
55
|
+
ingestion pipeline to operate uniformly regardless of the underlying storage system.
|
|
56
|
+
This enables:
|
|
57
|
+
- Easy migration between storage systems without affecting async HTTP performance
|
|
58
|
+
- Development on local storage with async HTTP, production deployment on S3
|
|
59
|
+
- Multi-tier storage architectures (local cache + cloud persistence)
|
|
60
|
+
- Testing async HTTP performance with different storage backends
|
|
61
|
+
|
|
62
|
+
Configuration Examples:
|
|
63
|
+
|
|
64
|
+
Local Development with Async HTTP:
|
|
65
|
+
>>> config = ProcessingConfig(
|
|
66
|
+
... output_catalog='./catalog', # Local storage
|
|
67
|
+
... scratch_location='./scratch', # Local scratch space
|
|
68
|
+
... enable_concurrent_http=True, # Async HTTP enabled
|
|
69
|
+
... concurrent_requests=50 # Optimized for local SSD
|
|
70
|
+
... )
|
|
71
|
+
|
|
72
|
+
Production S3 with High-Performance Async:
|
|
73
|
+
>>> config = ProcessingConfig(
|
|
74
|
+
... output_catalog='s3://bucket/catalog', # S3 storage
|
|
75
|
+
... scratch_location='s3://bucket/scratch', # S3 scratch space
|
|
76
|
+
... enable_concurrent_http=True, # Async HTTP enabled
|
|
77
|
+
... concurrent_requests=100, # High concurrency for cloud
|
|
78
|
+
... batch_size=2000 # Large batches for S3 efficiency
|
|
79
|
+
... )
|
|
80
|
+
|
|
81
|
+
Hybrid Architecture (Local + S3):
|
|
82
|
+
>>> config = ProcessingConfig(
|
|
83
|
+
... output_catalog='s3://bucket/catalog', # Final storage in S3
|
|
84
|
+
... scratch_location='./scratch', # Local high-speed scratch
|
|
85
|
+
... enable_concurrent_http=True, # Async HTTP processing
|
|
86
|
+
... concurrent_requests=75 # Balanced for hybrid setup
|
|
87
|
+
... )
|
|
88
|
+
|
|
89
|
+
Storage Backend Selection:
|
|
90
|
+
>>> # Automatic backend selection based on path (async-optimized)
|
|
91
|
+
>>> if path.startswith('s3://'):
|
|
92
|
+
... storage = S3Storage(path) # S3 with multipart upload optimization
|
|
93
|
+
... else:
|
|
94
|
+
... storage = LocalStorage(path) # Local with async-friendly I/O patterns
|
|
95
|
+
>>>
|
|
96
|
+
>>> # Unified operations across all backends (async-compatible)
|
|
97
|
+
>>> if storage.exists('data.parquet'):
|
|
98
|
+
... with storage.open('data.parquet', 'rb') as f:
|
|
99
|
+
... data = f.read()
|
|
100
|
+
>>> storage.upload('local_file.txt', 'remote_file.txt')
|
|
101
|
+
|
|
102
|
+
Integration:
|
|
103
|
+
Storage backends integrate transparently with EarthCatalog's async HTTP-enabled
|
|
104
|
+
ingestion pipeline through automatic backend detection based on URL schemes.
|
|
105
|
+
The pipeline selects the appropriate backend without requiring explicit configuration,
|
|
106
|
+
with storage performance automatically optimized for the async HTTP processing patterns.
|
|
107
|
+
"""
|
|
108
|
+
|
|
109
|
+
import fnmatch
|
|
110
|
+
import hashlib
|
|
111
|
+
import shutil
|
|
112
|
+
from abc import ABC, abstractmethod
|
|
113
|
+
from pathlib import Path
|
|
114
|
+
from typing import BinaryIO, cast
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class StorageBackend(ABC):
|
|
118
|
+
"""Abstract base class defining the unified storage interface for all backend implementations.
|
|
119
|
+
|
|
120
|
+
This abstract base class establishes the contract that all storage backend implementations
|
|
121
|
+
must follow, ensuring consistent behavior across local filesystems, cloud storage, and
|
|
122
|
+
other storage systems. The interface is designed for high-performance I/O operations
|
|
123
|
+
while abstracting away storage-specific implementation details.
|
|
124
|
+
|
|
125
|
+
Interface Design Principles:
|
|
126
|
+
- Consistent method signatures across all storage systems
|
|
127
|
+
- Binary I/O focus for maximum performance and reliability
|
|
128
|
+
- Path-based operations using string paths for flexibility
|
|
129
|
+
- Exception handling that translates storage-specific errors to common patterns
|
|
130
|
+
- Thread-safe operations where supported by the underlying storage system
|
|
131
|
+
|
|
132
|
+
Implementation Requirements:
|
|
133
|
+
All concrete storage backends must implement all abstract methods with behavior
|
|
134
|
+
that matches the documented interface contract. Implementations should handle
|
|
135
|
+
storage-specific optimizations while maintaining interface consistency.
|
|
136
|
+
|
|
137
|
+
Error Handling:
|
|
138
|
+
Implementations should translate storage-specific exceptions into standard
|
|
139
|
+
Python exceptions (FileNotFoundError, PermissionError, etc.) for consistent
|
|
140
|
+
error handling across different storage backends.
|
|
141
|
+
|
|
142
|
+
Performance Considerations:
|
|
143
|
+
- Method implementations should be optimized for the specific storage system
|
|
144
|
+
- Batch operations should be preferred where the underlying storage supports them
|
|
145
|
+
- Connection pooling and resource reuse should be implemented where applicable
|
|
146
|
+
- Memory efficiency should be maintained for large file operations
|
|
147
|
+
"""
|
|
148
|
+
|
|
149
|
+
@abstractmethod
|
|
150
|
+
def exists(self, path: str) -> bool:
|
|
151
|
+
"""Check if path exists."""
|
|
152
|
+
pass
|
|
153
|
+
|
|
154
|
+
@abstractmethod
|
|
155
|
+
def open(self, path: str, mode: str) -> BinaryIO:
|
|
156
|
+
"""Open file for reading/writing."""
|
|
157
|
+
pass
|
|
158
|
+
|
|
159
|
+
@abstractmethod
|
|
160
|
+
def makedirs(self, path: Path):
|
|
161
|
+
"""Create directory and parents."""
|
|
162
|
+
pass
|
|
163
|
+
|
|
164
|
+
@abstractmethod
|
|
165
|
+
def remove(self, path: str):
|
|
166
|
+
"""Remove file."""
|
|
167
|
+
pass
|
|
168
|
+
|
|
169
|
+
@abstractmethod
|
|
170
|
+
def rename(self, src: str, dst: str):
|
|
171
|
+
"""Rename/move file."""
|
|
172
|
+
pass
|
|
173
|
+
|
|
174
|
+
@abstractmethod
|
|
175
|
+
def upload(self, local_path: str, remote_path: str):
|
|
176
|
+
"""Upload local file to storage."""
|
|
177
|
+
pass
|
|
178
|
+
|
|
179
|
+
@abstractmethod
|
|
180
|
+
def get_etag(self, path: str) -> str | None:
|
|
181
|
+
"""Get ETag/checksum for a file.
|
|
182
|
+
|
|
183
|
+
Returns a content-based hash for local files or the S3 ETag for cloud storage.
|
|
184
|
+
Useful for detecting file changes and implementing optimistic concurrency.
|
|
185
|
+
|
|
186
|
+
Args:
|
|
187
|
+
path: Path to the file.
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
ETag string if file exists, None if file doesn't exist.
|
|
191
|
+
"""
|
|
192
|
+
pass
|
|
193
|
+
|
|
194
|
+
@abstractmethod
|
|
195
|
+
def rmtree(self, path: str) -> None:
|
|
196
|
+
"""Recursively remove directory and all contents.
|
|
197
|
+
|
|
198
|
+
Similar to shutil.rmtree for local files or deleting all objects
|
|
199
|
+
with a given prefix for cloud storage.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
path: Directory path to remove.
|
|
203
|
+
|
|
204
|
+
Note:
|
|
205
|
+
Does not raise an error if the directory doesn't exist.
|
|
206
|
+
"""
|
|
207
|
+
pass
|
|
208
|
+
|
|
209
|
+
@abstractmethod
|
|
210
|
+
def list_files(self, path: str, pattern: str = "*") -> list[str]:
|
|
211
|
+
"""List files in directory matching a glob pattern.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
path: Directory path to list.
|
|
215
|
+
pattern: Glob pattern to match (default: "*" for all files).
|
|
216
|
+
Supports "**" for recursive matching.
|
|
217
|
+
|
|
218
|
+
Returns:
|
|
219
|
+
List of full file paths matching the pattern.
|
|
220
|
+
Returns empty list if directory doesn't exist.
|
|
221
|
+
"""
|
|
222
|
+
pass
|
|
223
|
+
|
|
224
|
+
@abstractmethod
|
|
225
|
+
def list_dirs(self, path: str) -> list[str]:
|
|
226
|
+
"""List immediate subdirectories in a directory.
|
|
227
|
+
|
|
228
|
+
Args:
|
|
229
|
+
path: Directory path to list.
|
|
230
|
+
|
|
231
|
+
Returns:
|
|
232
|
+
List of full directory paths (immediate children only).
|
|
233
|
+
Returns empty list if directory doesn't exist.
|
|
234
|
+
"""
|
|
235
|
+
pass
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class LocalStorage(StorageBackend):
|
|
239
|
+
"""High-performance local filesystem storage backend optimized for development and single-node deployments.
|
|
240
|
+
|
|
241
|
+
This storage backend provides optimized access to local filesystems with support for
|
|
242
|
+
all standard file operations. Designed for development environments, single-machine
|
|
243
|
+
deployments, and as a performance baseline for comparing cloud storage backends.
|
|
244
|
+
|
|
245
|
+
Key Features:
|
|
246
|
+
- Direct filesystem access with minimal overhead
|
|
247
|
+
- Memory-efficient streaming for large files
|
|
248
|
+
- Atomic operations where supported by the filesystem
|
|
249
|
+
- Cross-platform compatibility (Windows, macOS, Linux)
|
|
250
|
+
- Comprehensive error handling with informative messages
|
|
251
|
+
|
|
252
|
+
Performance Characteristics:
|
|
253
|
+
- Excellent for development and testing with immediate feedback
|
|
254
|
+
- High throughput for large file operations (limited by disk I/O)
|
|
255
|
+
- Low latency for small file operations
|
|
256
|
+
- Efficient memory usage with streaming operations
|
|
257
|
+
- Optimal for scenarios where data locality is guaranteed
|
|
258
|
+
|
|
259
|
+
Use Cases:
|
|
260
|
+
- Development and testing environments
|
|
261
|
+
- Single-machine production deployments
|
|
262
|
+
- Local caching layer in multi-tier architectures
|
|
263
|
+
- High-performance computing environments with shared filesystems
|
|
264
|
+
- Scenarios requiring guaranteed data locality
|
|
265
|
+
|
|
266
|
+
Thread Safety:
|
|
267
|
+
This backend is thread-safe for most operations, relying on the underlying
|
|
268
|
+
filesystem's thread safety guarantees. Concurrent reads are fully supported,
|
|
269
|
+
while concurrent writes should be coordinated at the application level.
|
|
270
|
+
|
|
271
|
+
Example:
|
|
272
|
+
>>> storage = LocalStorage('/path/to/catalog')
|
|
273
|
+
>>> if storage.exists('partition_001.parquet'):
|
|
274
|
+
... with storage.open('partition_001.parquet', 'rb') as f:
|
|
275
|
+
... data = f.read()
|
|
276
|
+
>>> storage.makedirs(Path('new_partition'))
|
|
277
|
+
"""
|
|
278
|
+
|
|
279
|
+
def __init__(self, base_path: str):
|
|
280
|
+
self.base_path = Path(base_path)
|
|
281
|
+
|
|
282
|
+
def exists(self, path: str) -> bool:
|
|
283
|
+
return Path(path).exists()
|
|
284
|
+
|
|
285
|
+
def open(self, path: str, mode: str) -> BinaryIO:
|
|
286
|
+
# Ensure binary mode for BinaryIO compatibility
|
|
287
|
+
if "b" not in mode:
|
|
288
|
+
mode += "b"
|
|
289
|
+
result = open(path, mode)
|
|
290
|
+
return cast(BinaryIO, result)
|
|
291
|
+
|
|
292
|
+
def makedirs(self, path: Path):
|
|
293
|
+
Path(path).mkdir(parents=True, exist_ok=True)
|
|
294
|
+
|
|
295
|
+
def remove(self, path: str):
|
|
296
|
+
Path(path).unlink(missing_ok=True)
|
|
297
|
+
|
|
298
|
+
def rename(self, src: str, dst: str):
|
|
299
|
+
Path(src).rename(dst)
|
|
300
|
+
|
|
301
|
+
def upload(self, local_path: str, remote_path: str):
|
|
302
|
+
# For local storage, just copy
|
|
303
|
+
self.makedirs(Path(remote_path).parent)
|
|
304
|
+
shutil.copy2(local_path, remote_path)
|
|
305
|
+
|
|
306
|
+
def get_etag(self, path: str) -> str | None:
|
|
307
|
+
"""Get MD5 hash of file contents as ETag."""
|
|
308
|
+
file_path = Path(path)
|
|
309
|
+
if not file_path.exists():
|
|
310
|
+
return None
|
|
311
|
+
|
|
312
|
+
# Use MD5 hash of file contents (not for security, just checksums)
|
|
313
|
+
hash_md5 = hashlib.md5(usedforsecurity=False)
|
|
314
|
+
with open(file_path, "rb") as f:
|
|
315
|
+
for chunk in iter(lambda: f.read(8192), b""):
|
|
316
|
+
hash_md5.update(chunk)
|
|
317
|
+
return hash_md5.hexdigest()
|
|
318
|
+
|
|
319
|
+
def rmtree(self, path: str) -> None:
|
|
320
|
+
"""Recursively remove directory and contents."""
|
|
321
|
+
dir_path = Path(path)
|
|
322
|
+
if dir_path.exists():
|
|
323
|
+
shutil.rmtree(dir_path)
|
|
324
|
+
|
|
325
|
+
def list_files(self, path: str, pattern: str = "*") -> list[str]:
|
|
326
|
+
"""List files matching glob pattern."""
|
|
327
|
+
dir_path = Path(path)
|
|
328
|
+
if not dir_path.exists():
|
|
329
|
+
return []
|
|
330
|
+
|
|
331
|
+
# Handle recursive pattern
|
|
332
|
+
if "**" in pattern:
|
|
333
|
+
return [str(p) for p in dir_path.glob(pattern) if p.is_file()]
|
|
334
|
+
else:
|
|
335
|
+
# Non-recursive - match in the directory
|
|
336
|
+
return [str(p) for p in dir_path.iterdir() if p.is_file() and fnmatch.fnmatch(p.name, pattern)]
|
|
337
|
+
|
|
338
|
+
def list_dirs(self, path: str) -> list[str]:
|
|
339
|
+
"""List immediate subdirectories."""
|
|
340
|
+
dir_path = Path(path)
|
|
341
|
+
if not dir_path.exists():
|
|
342
|
+
return []
|
|
343
|
+
|
|
344
|
+
return [str(p) for p in dir_path.iterdir() if p.is_dir()]
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
class S3Storage(StorageBackend):
|
|
348
|
+
"""AWS S3 and S3-compatible cloud storage backend with optimized performance for large-scale data operations.
|
|
349
|
+
|
|
350
|
+
This storage backend provides high-performance access to S3 and S3-compatible storage
|
|
351
|
+
systems (AWS S3, MinIO, DigitalOcean Spaces, etc.) with intelligent optimizations
|
|
352
|
+
for the unique characteristics of object storage systems.
|
|
353
|
+
|
|
354
|
+
Cloud Storage Optimizations:
|
|
355
|
+
- Connection pooling and persistent connections for reduced latency
|
|
356
|
+
- Multipart upload support for large files with automatic chunking
|
|
357
|
+
- Retry strategies with exponential backoff for resilient operations
|
|
358
|
+
- Efficient metadata operations minimizing API calls
|
|
359
|
+
- Batch operations where supported by the S3 API
|
|
360
|
+
|
|
361
|
+
Performance Features:
|
|
362
|
+
- Parallel uploads and downloads for maximum throughput
|
|
363
|
+
- Streaming operations for memory-efficient large file handling
|
|
364
|
+
- Intelligent part sizing based on file size and network conditions
|
|
365
|
+
- Connection reuse across operations for reduced overhead
|
|
366
|
+
- Asynchronous operations where supported
|
|
367
|
+
|
|
368
|
+
Compatibility:
|
|
369
|
+
- AWS S3: Full feature support including advanced S3 features
|
|
370
|
+
- MinIO: Complete compatibility for self-hosted object storage
|
|
371
|
+
- DigitalOcean Spaces: Full compatibility with Spaces API
|
|
372
|
+
- Google Cloud Storage: Compatible via S3 compatibility layer
|
|
373
|
+
- Other S3-compatible systems: Broad compatibility with standard S3 API
|
|
374
|
+
|
|
375
|
+
Configuration:
|
|
376
|
+
Authentication and configuration handled through standard AWS SDK methods:
|
|
377
|
+
- AWS credentials file (~/.aws/credentials)
|
|
378
|
+
- Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
|
|
379
|
+
- IAM roles for EC2/container deployments
|
|
380
|
+
- Custom endpoint URLs for S3-compatible services
|
|
381
|
+
|
|
382
|
+
Use Cases:
|
|
383
|
+
- Production cloud deployments requiring scalable storage
|
|
384
|
+
- Multi-region data distribution and disaster recovery
|
|
385
|
+
- Large-scale data processing with virtually unlimited storage
|
|
386
|
+
- Cost-effective long-term data archival
|
|
387
|
+
- Serverless and containerized application deployments
|
|
388
|
+
|
|
389
|
+
Performance Considerations:
|
|
390
|
+
- Latency higher than local storage but excellent throughput
|
|
391
|
+
- Optimized for large files (>1MB) with multipart operations
|
|
392
|
+
- Network bandwidth typically the limiting factor
|
|
393
|
+
- Cost-effective for infrequent access patterns with intelligent tiering
|
|
394
|
+
|
|
395
|
+
Example:
|
|
396
|
+
>>> storage = S3Storage('s3://my-bucket/catalog/')
|
|
397
|
+
>>> # Efficient large file upload with automatic multipart
|
|
398
|
+
>>> storage.upload('large_dataset.parquet', 's3://bucket/data.parquet')
|
|
399
|
+
>>> # Streaming read for memory efficiency
|
|
400
|
+
>>> with storage.open('s3://bucket/data.parquet', 'rb') as f:
|
|
401
|
+
... chunk = f.read(8192) # Streaming read
|
|
402
|
+
"""
|
|
403
|
+
|
|
404
|
+
def __init__(
|
|
405
|
+
self,
|
|
406
|
+
base_path: str,
|
|
407
|
+
connect_timeout: float = 30.0,
|
|
408
|
+
read_timeout: float = 60.0,
|
|
409
|
+
retries: int = 3,
|
|
410
|
+
):
|
|
411
|
+
"""Initialize S3 storage backend.
|
|
412
|
+
|
|
413
|
+
Args:
|
|
414
|
+
base_path: S3 path (e.g., 's3://bucket/prefix/')
|
|
415
|
+
connect_timeout: Connection timeout in seconds (default: 30.0)
|
|
416
|
+
read_timeout: Read timeout in seconds (default: 60.0)
|
|
417
|
+
retries: Number of retry attempts (default: 3)
|
|
418
|
+
"""
|
|
419
|
+
try:
|
|
420
|
+
import s3fs
|
|
421
|
+
from botocore.config import Config
|
|
422
|
+
|
|
423
|
+
# Configure boto3 with timeouts and retries
|
|
424
|
+
client_kwargs = {
|
|
425
|
+
"config": Config(
|
|
426
|
+
connect_timeout=connect_timeout,
|
|
427
|
+
read_timeout=read_timeout,
|
|
428
|
+
retries={"max_attempts": retries, "mode": "adaptive"},
|
|
429
|
+
)
|
|
430
|
+
}
|
|
431
|
+
self.fs = s3fs.S3FileSystem(config_kwargs=client_kwargs)
|
|
432
|
+
self.base_path = base_path
|
|
433
|
+
except ImportError:
|
|
434
|
+
raise ImportError("s3fs required for S3 storage: pip install s3fs") from None
|
|
435
|
+
|
|
436
|
+
def exists(self, path: str) -> bool:
|
|
437
|
+
result = self.fs.exists(path)
|
|
438
|
+
return cast(bool, result)
|
|
439
|
+
|
|
440
|
+
def open(self, path: str, mode: str) -> BinaryIO:
|
|
441
|
+
result = self.fs.open(path, mode)
|
|
442
|
+
return cast(BinaryIO, result)
|
|
443
|
+
|
|
444
|
+
def makedirs(self, path: Path):
|
|
445
|
+
# S3 doesn't require explicit directory creation
|
|
446
|
+
pass
|
|
447
|
+
|
|
448
|
+
def remove(self, path: str):
|
|
449
|
+
if self.fs.exists(path):
|
|
450
|
+
self.fs.rm(path)
|
|
451
|
+
|
|
452
|
+
def rename(self, src: str, dst: str):
|
|
453
|
+
self.fs.mv(src, dst)
|
|
454
|
+
|
|
455
|
+
def upload(self, local_path: str, remote_path: str):
|
|
456
|
+
self.fs.put(local_path, remote_path)
|
|
457
|
+
|
|
458
|
+
def get_etag(self, path: str) -> str | None:
|
|
459
|
+
"""Get S3 ETag for a file."""
|
|
460
|
+
if not self.fs.exists(path):
|
|
461
|
+
return None
|
|
462
|
+
|
|
463
|
+
try:
|
|
464
|
+
info = self.fs.info(path)
|
|
465
|
+
etag = info.get("ETag", "")
|
|
466
|
+
# S3 ETags are quoted, strip quotes
|
|
467
|
+
return etag.strip('"') if etag else None
|
|
468
|
+
except (OSError, ValueError, TypeError):
|
|
469
|
+
return None
|
|
470
|
+
|
|
471
|
+
def rmtree(self, path: str) -> None:
|
|
472
|
+
"""Recursively delete all objects under a prefix."""
|
|
473
|
+
if self.fs.exists(path):
|
|
474
|
+
# rm with recursive=True handles directories
|
|
475
|
+
self.fs.rm(path, recursive=True)
|
|
476
|
+
|
|
477
|
+
def list_files(self, path: str, pattern: str = "*") -> list[str]:
|
|
478
|
+
"""List files in S3 matching a glob pattern."""
|
|
479
|
+
# Normalize path - remove trailing slash for consistency
|
|
480
|
+
path = path.rstrip("/")
|
|
481
|
+
|
|
482
|
+
if not self.fs.exists(path):
|
|
483
|
+
return []
|
|
484
|
+
|
|
485
|
+
try:
|
|
486
|
+
# Use glob for pattern matching
|
|
487
|
+
if pattern == "*":
|
|
488
|
+
# List all files directly in path
|
|
489
|
+
all_files = self.fs.ls(path, detail=False)
|
|
490
|
+
# Filter to only files (not directories)
|
|
491
|
+
return [f"s3://{f}" for f in all_files if not self.fs.isdir(f"s3://{f}")]
|
|
492
|
+
else:
|
|
493
|
+
# Use glob for pattern matching
|
|
494
|
+
glob_pattern = f"{path}/{pattern}"
|
|
495
|
+
files = self.fs.glob(glob_pattern)
|
|
496
|
+
return [f"s3://{f}" for f in files]
|
|
497
|
+
except (OSError, ValueError, TypeError):
|
|
498
|
+
return []
|
|
499
|
+
|
|
500
|
+
def list_dirs(self, path: str) -> list[str]:
|
|
501
|
+
"""List immediate subdirectories in S3."""
|
|
502
|
+
# Normalize path - remove trailing slash for consistency
|
|
503
|
+
path = path.rstrip("/")
|
|
504
|
+
|
|
505
|
+
if not self.fs.exists(path):
|
|
506
|
+
return []
|
|
507
|
+
|
|
508
|
+
try:
|
|
509
|
+
# List all items in path
|
|
510
|
+
all_items = self.fs.ls(path, detail=False)
|
|
511
|
+
# Filter to only directories
|
|
512
|
+
return [f"s3://{d}" for d in all_items if self.fs.isdir(f"s3://{d}")]
|
|
513
|
+
except (OSError, ValueError, TypeError):
|
|
514
|
+
return []
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def get_storage_backend(path: str, **kwargs) -> StorageBackend:
|
|
518
|
+
"""Factory function to get the appropriate storage backend based on path.
|
|
519
|
+
|
|
520
|
+
Automatically detects the storage type from the path scheme and returns
|
|
521
|
+
the appropriate backend instance.
|
|
522
|
+
|
|
523
|
+
Args:
|
|
524
|
+
path: Storage path (local path or cloud URL like s3://).
|
|
525
|
+
**kwargs: Additional arguments passed to the backend constructor.
|
|
526
|
+
For S3Storage:
|
|
527
|
+
- connect_timeout: Connection timeout in seconds (default: 30.0)
|
|
528
|
+
- read_timeout: Read timeout in seconds (default: 60.0)
|
|
529
|
+
- retries: Number of retry attempts (default: 3)
|
|
530
|
+
|
|
531
|
+
Returns:
|
|
532
|
+
StorageBackend instance appropriate for the path.
|
|
533
|
+
|
|
534
|
+
Raises:
|
|
535
|
+
ValueError: If the path scheme is not supported.
|
|
536
|
+
|
|
537
|
+
Example:
|
|
538
|
+
>>> storage = get_storage_backend('./local/catalog')
|
|
539
|
+
>>> storage = get_storage_backend('s3://bucket/catalog')
|
|
540
|
+
"""
|
|
541
|
+
if path.startswith("s3://"):
|
|
542
|
+
return S3Storage(path, **kwargs)
|
|
543
|
+
elif path.startswith(("gs://", "az://", "abfs://")):
|
|
544
|
+
# Future: Add GCS and Azure support
|
|
545
|
+
raise ValueError(f"Storage scheme not yet supported: {path.split('://')[0]}")
|
|
546
|
+
else:
|
|
547
|
+
# Default to local storage
|
|
548
|
+
return LocalStorage(path)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Test package for STAC ingestion pipeline
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Pytest configuration and fixtures for EarthCatalog tests.
|
|
2
|
+
|
|
3
|
+
This module provides:
|
|
4
|
+
- Custom command-line options for e2e tests
|
|
5
|
+
- Shared fixtures across test modules
|
|
6
|
+
- Pytest hooks for test collection and configuration
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def pytest_addoption(parser):
|
|
11
|
+
"""Add custom command-line options for e2e tests."""
|
|
12
|
+
# Data generation options
|
|
13
|
+
parser.addoption(
|
|
14
|
+
"--e2e-items",
|
|
15
|
+
action="store",
|
|
16
|
+
default="100",
|
|
17
|
+
help="Number of synthetic STAC items to generate for e2e tests",
|
|
18
|
+
)
|
|
19
|
+
parser.addoption(
|
|
20
|
+
"--e2e-outlier-tiny",
|
|
21
|
+
action="store",
|
|
22
|
+
default="5",
|
|
23
|
+
help="Percentage of tiny geometry outliers (0-100)",
|
|
24
|
+
)
|
|
25
|
+
parser.addoption(
|
|
26
|
+
"--e2e-outlier-huge",
|
|
27
|
+
action="store",
|
|
28
|
+
default="5",
|
|
29
|
+
help="Percentage of huge geometry outliers (0-100)",
|
|
30
|
+
)
|
|
31
|
+
parser.addoption(
|
|
32
|
+
"--e2e-seed",
|
|
33
|
+
action="store",
|
|
34
|
+
default=None,
|
|
35
|
+
help="Random seed for reproducibility",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
# Grid configuration options
|
|
39
|
+
parser.addoption(
|
|
40
|
+
"--e2e-grid",
|
|
41
|
+
action="store",
|
|
42
|
+
default="h3",
|
|
43
|
+
help="Grid system to use: h3, s2, mgrs, latlon, geojson (default: h3)",
|
|
44
|
+
)
|
|
45
|
+
parser.addoption(
|
|
46
|
+
"--e2e-grid-level",
|
|
47
|
+
action="store",
|
|
48
|
+
default="2",
|
|
49
|
+
help="Grid resolution/level (default: 2 for H3)",
|
|
50
|
+
)
|
|
51
|
+
parser.addoption(
|
|
52
|
+
"--e2e-temporal",
|
|
53
|
+
action="store",
|
|
54
|
+
default="month",
|
|
55
|
+
help="Temporal binning: year, month, day (default: month)",
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Query performance profiling options
|
|
59
|
+
parser.addoption(
|
|
60
|
+
"--e2e-profile-queries",
|
|
61
|
+
action="store_true",
|
|
62
|
+
default=False,
|
|
63
|
+
help="Enable query performance profiling",
|
|
64
|
+
)
|
|
65
|
+
parser.addoption(
|
|
66
|
+
"--e2e-query-iterations",
|
|
67
|
+
action="store",
|
|
68
|
+
default="10",
|
|
69
|
+
help="Number of query iterations for profiling (default: 10)",
|
|
70
|
+
)
|
|
71
|
+
parser.addoption(
|
|
72
|
+
"--e2e-query-engines",
|
|
73
|
+
action="store",
|
|
74
|
+
default="duckdb,rustac",
|
|
75
|
+
help="Comma-separated query engines to profile: duckdb, rustac (default: duckdb,rustac)",
|
|
76
|
+
)
|