earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/__init__.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""EarthCatalog - High-Performance STAC Ingestion with Async HTTP Processing.
|
|
2
|
+
|
|
3
|
+
A scalable, production-ready library for processing STAC items into spatially partitioned
|
|
4
|
+
GeoParquet catalogs with built-in 3-6x performance improvements through async HTTP processing.
|
|
5
|
+
|
|
6
|
+
Key Features:
|
|
7
|
+
- **Async HTTP Processing**: 50-100+ concurrent requests per worker (3-6x speedup)
|
|
8
|
+
- **Spatial Partitioning**: H3, S2, MGRS, UTM, and custom grid systems
|
|
9
|
+
- **Distributed Processing**: Single-machine to cluster-scale processing
|
|
10
|
+
- **Cloud-Native**: Native S3, GCS, Azure integration
|
|
11
|
+
- **Memory Efficient**: Handles 100M+ URL datasets with memory usage independent of dataset size
|
|
12
|
+
- **Production Ready**: Comprehensive error handling, monitoring, and reliability
|
|
13
|
+
|
|
14
|
+
Performance:
|
|
15
|
+
- Sequential HTTP: ~16 requests/second
|
|
16
|
+
- Async HTTP: 50-100+ requests/second (3-6x improvement)
|
|
17
|
+
- Large datasets: 100M URLs processed in 7-14 hours versus 71 hours with sequential HTTP
|
|
18
|
+
- Memory usage: Linear with worker count, not dataset size
|
|
19
|
+
|
|
20
|
+
Quick Start:
|
|
21
|
+
>>> import earthcatalog
|
|
22
|
+
>>>
|
|
23
|
+
>>> # Basic configuration (async enabled by default)
|
|
24
|
+
>>> config = earthcatalog.ProcessingConfig(
|
|
25
|
+
... input_file='urls.parquet',
|
|
26
|
+
... output_catalog='./catalog',
|
|
27
|
+
... scratch_location='./scratch'
|
|
28
|
+
... )
|
|
29
|
+
>>>
|
|
30
|
+
>>> # Process with local workers
|
|
31
|
+
>>> processor = earthcatalog.LocalProcessor(n_workers=4)
|
|
32
|
+
>>> pipeline = earthcatalog.STACIngestionPipeline(config, processor)
|
|
33
|
+
>>> pipeline.run()
|
|
34
|
+
|
|
35
|
+
High-Performance Example:
|
|
36
|
+
>>> # Optimized for 100M+ URLs
|
|
37
|
+
>>> config = earthcatalog.ProcessingConfig(
|
|
38
|
+
... input_file='s3://bucket/urls.parquet',
|
|
39
|
+
... output_catalog='s3://bucket/catalog',
|
|
40
|
+
... scratch_location='s3://bucket/scratch',
|
|
41
|
+
... enable_concurrent_http=True, # Default: True
|
|
42
|
+
... concurrent_requests=100, # High concurrency
|
|
43
|
+
... batch_size=2000, # Large batches
|
|
44
|
+
... max_workers=32 # Multiple workers
|
|
45
|
+
... )
|
|
46
|
+
|
|
47
|
+
Async HTTP Configuration:
|
|
48
|
+
- enable_concurrent_http: Enable/disable async processing (default: True)
|
|
49
|
+
- concurrent_requests: Simultaneous requests per worker (default: 50)
|
|
50
|
+
- batch_size: URLs processed per batch (default: 1000)
|
|
51
|
+
- request_timeout: Request timeout in seconds (default: 30)
|
|
52
|
+
- retry_attempts: Maximum retry attempts (default: 3)
|
|
53
|
+
|
|
54
|
+
Grid Systems with Async Optimization:
|
|
55
|
+
- H3GridSystem: Recommended for global datasets (optimal async performance)
|
|
56
|
+
- S2GridSystem: Excellent for polar regions (adaptive async batching)
|
|
57
|
+
- UTMGridSystem: High precision for regional data (zone-optimized async)
|
|
58
|
+
- MGRSGridSystem: Government/defense standard (structured async processing)
|
|
59
|
+
|
|
60
|
+
Requirements:
|
|
61
|
+
- Python 3.11+
|
|
62
|
+
- aiohttp>=3.9.0 (for async HTTP - automatically installed)
|
|
63
|
+
- Optional: Dask for distributed processing
|
|
64
|
+
- Optional: S3FS for cloud storage
|
|
65
|
+
"""
|
|
66
|
+
|
|
67
|
+
__version__ = "0.2.0"
|
|
68
|
+
__author__ = "betolink"
|
|
69
|
+
__email__ = "betolin@gmail.com"
|
|
70
|
+
|
|
71
|
+
# Import main classes for convenient access
|
|
72
|
+
from .engines import (
|
|
73
|
+
EngineNotAvailableError,
|
|
74
|
+
STACEngine,
|
|
75
|
+
get_engine,
|
|
76
|
+
)
|
|
77
|
+
from .grid_systems import (
|
|
78
|
+
GridSystem,
|
|
79
|
+
H3GridSystem,
|
|
80
|
+
MGRSGridSystem,
|
|
81
|
+
S2GridSystem,
|
|
82
|
+
SimpleLatLonGrid,
|
|
83
|
+
UTMGridSystem,
|
|
84
|
+
)
|
|
85
|
+
from .ingestion_pipeline import (
|
|
86
|
+
DaskDistributedProcessor,
|
|
87
|
+
LocalProcessor,
|
|
88
|
+
ProcessingConfig,
|
|
89
|
+
STACIngestionPipeline,
|
|
90
|
+
)
|
|
91
|
+
from .input_readers import (
|
|
92
|
+
CSVReader,
|
|
93
|
+
InputReader,
|
|
94
|
+
ParquetReader,
|
|
95
|
+
)
|
|
96
|
+
from .schema_generator import (
|
|
97
|
+
SchemaGenerator,
|
|
98
|
+
)
|
|
99
|
+
from .spatial_resolver import (
|
|
100
|
+
SpatialPartitionResolver,
|
|
101
|
+
resolve_and_query,
|
|
102
|
+
spatial_resolver,
|
|
103
|
+
)
|
|
104
|
+
from .storage_backends import (
|
|
105
|
+
LocalStorage,
|
|
106
|
+
S3Storage,
|
|
107
|
+
StorageBackend,
|
|
108
|
+
get_storage_backend,
|
|
109
|
+
)
|
|
110
|
+
from .validation import (
|
|
111
|
+
CatalogValidationResult,
|
|
112
|
+
ValidationIssue,
|
|
113
|
+
ValidationResult,
|
|
114
|
+
validate_catalog,
|
|
115
|
+
validate_geoparquet_file,
|
|
116
|
+
validate_stac_item,
|
|
117
|
+
validate_stac_items_batch,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
__all__ = [
|
|
121
|
+
# Version info
|
|
122
|
+
"__version__",
|
|
123
|
+
"__author__",
|
|
124
|
+
"__email__",
|
|
125
|
+
# Core pipeline
|
|
126
|
+
"STACIngestionPipeline",
|
|
127
|
+
"ProcessingConfig",
|
|
128
|
+
"LocalProcessor",
|
|
129
|
+
"DaskDistributedProcessor",
|
|
130
|
+
# Storage backends
|
|
131
|
+
"StorageBackend",
|
|
132
|
+
"LocalStorage",
|
|
133
|
+
"S3Storage",
|
|
134
|
+
"get_storage_backend",
|
|
135
|
+
# STAC Engines
|
|
136
|
+
"STACEngine",
|
|
137
|
+
"EngineNotAvailableError",
|
|
138
|
+
"get_engine",
|
|
139
|
+
# Grid systems
|
|
140
|
+
"GridSystem",
|
|
141
|
+
"H3GridSystem",
|
|
142
|
+
"S2GridSystem",
|
|
143
|
+
"MGRSGridSystem",
|
|
144
|
+
"UTMGridSystem",
|
|
145
|
+
"SimpleLatLonGrid",
|
|
146
|
+
# Input readers
|
|
147
|
+
"InputReader",
|
|
148
|
+
"ParquetReader",
|
|
149
|
+
"CSVReader",
|
|
150
|
+
# Schema generation
|
|
151
|
+
"SchemaGenerator",
|
|
152
|
+
# Spatial resolution
|
|
153
|
+
"SpatialPartitionResolver",
|
|
154
|
+
"spatial_resolver",
|
|
155
|
+
"resolve_and_query",
|
|
156
|
+
# Validation
|
|
157
|
+
"ValidationResult",
|
|
158
|
+
"ValidationIssue",
|
|
159
|
+
"CatalogValidationResult",
|
|
160
|
+
"validate_stac_item",
|
|
161
|
+
"validate_stac_items_batch",
|
|
162
|
+
"validate_geoparquet_file",
|
|
163
|
+
"validate_catalog",
|
|
164
|
+
]
|