earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,164 @@
1
+ """EarthCatalog - High-Performance STAC Ingestion with Async HTTP Processing.
2
+
3
+ A scalable, production-ready library for processing STAC items into spatially partitioned
4
+ GeoParquet catalogs with built-in 3-6x performance improvements through async HTTP processing.
5
+
6
+ Key Features:
7
+ - **Async HTTP Processing**: 50-100+ concurrent requests per worker (3-6x speedup)
8
+ - **Spatial Partitioning**: H3, S2, MGRS, UTM, and custom grid systems
9
+ - **Distributed Processing**: Single-machine to cluster-scale processing
10
+ - **Cloud-Native**: Native S3, GCS, Azure integration
11
+ - **Memory Efficient**: Handles 100M+ URL datasets with memory usage independent of dataset size
12
+ - **Production Ready**: Comprehensive error handling, monitoring, and reliability
13
+
14
+ Performance:
15
+ - Sequential HTTP: ~16 requests/second
16
+ - Async HTTP: 50-100+ requests/second (3-6x improvement)
17
+ - Large datasets: 100M URLs processed in 7-14 hours vs 71 hours sequential
18
+ - Memory usage: Linear with worker count, not dataset size
19
+
20
+ Quick Start:
21
+ >>> import earthcatalog
22
+ >>>
23
+ >>> # Basic configuration (async enabled by default)
24
+ >>> config = earthcatalog.ProcessingConfig(
25
+ ... input_file='urls.parquet',
26
+ ... output_catalog='./catalog',
27
+ ... scratch_location='./scratch'
28
+ ... )
29
+ >>>
30
+ >>> # Process with local workers
31
+ >>> processor = earthcatalog.LocalProcessor(n_workers=4)
32
+ >>> pipeline = earthcatalog.STACIngestionPipeline(config, processor)
33
+ >>> pipeline.run()
34
+
35
+ High-Performance Example:
36
+ >>> # Optimized for 100M+ URLs
37
+ >>> config = earthcatalog.ProcessingConfig(
38
+ ... input_file='s3://bucket/urls.parquet',
39
+ ... output_catalog='s3://bucket/catalog',
40
+ ... scratch_location='s3://bucket/scratch',
41
+ ... enable_concurrent_http=True, # Default: True
42
+ ... concurrent_requests=100, # High concurrency
43
+ ... batch_size=2000, # Large batches
44
+ ... max_workers=32 # Multiple workers
45
+ ... )
46
+
47
+ Async HTTP Configuration:
48
+ - enable_concurrent_http: Enable/disable async processing (default: True)
49
+ - concurrent_requests: Simultaneous requests per worker (default: 50)
50
+ - batch_size: URLs processed per batch (default: 1000)
51
+ - request_timeout: Request timeout in seconds (default: 30)
52
+ - retry_attempts: Maximum retry attempts (default: 3)
53
+
54
+ Grid Systems with Async Optimization:
55
+ - H3GridSystem: Recommended for global datasets (optimal async performance)
56
+ - S2GridSystem: Excellent for polar regions (adaptive async batching)
57
+ - UTMGridSystem: High precision for regional data (zone-optimized async)
58
+ - MGRSGridSystem: Government/defense standard (structured async processing)
59
+
60
+ Requirements:
61
+ - Python 3.11+
62
+ - aiohttp>=3.9.0 (for async HTTP - automatically installed)
63
+ - Optional: Dask for distributed processing
64
+ - Optional: S3FS for cloud storage
65
+ """
66
+
67
+ __version__ = "0.2.0"
68
+ __author__ = "betolink"
69
+ __email__ = "betolin@gmail.com"
70
+
71
+ # Re-export the main classes and helper functions for convenient package-level access
72
+ from .engines import (
73
+ EngineNotAvailableError,
74
+ STACEngine,
75
+ get_engine,
76
+ )
77
+ from .grid_systems import (
78
+ GridSystem,
79
+ H3GridSystem,
80
+ MGRSGridSystem,
81
+ S2GridSystem,
82
+ SimpleLatLonGrid,
83
+ UTMGridSystem,
84
+ )
85
+ from .ingestion_pipeline import (
86
+ DaskDistributedProcessor,
87
+ LocalProcessor,
88
+ ProcessingConfig,
89
+ STACIngestionPipeline,
90
+ )
91
+ from .input_readers import (
92
+ CSVReader,
93
+ InputReader,
94
+ ParquetReader,
95
+ )
96
+ from .schema_generator import (
97
+ SchemaGenerator,
98
+ )
99
+ from .spatial_resolver import (
100
+ SpatialPartitionResolver,
101
+ resolve_and_query,
102
+ spatial_resolver,
103
+ )
104
+ from .storage_backends import (
105
+ LocalStorage,
106
+ S3Storage,
107
+ StorageBackend,
108
+ get_storage_backend,
109
+ )
110
+ from .validation import (
111
+ CatalogValidationResult,
112
+ ValidationIssue,
113
+ ValidationResult,
114
+ validate_catalog,
115
+ validate_geoparquet_file,
116
+ validate_stac_item,
117
+ validate_stac_items_batch,
118
+ )
119
+
120
+ __all__ = [
121
+ # Package metadata defined in this module (version, author, email)
122
+ "__version__",
123
+ "__author__",
124
+ "__email__",
125
+ # Core pipeline: pipeline, config, and processors (from .ingestion_pipeline)
126
+ "STACIngestionPipeline",
127
+ "ProcessingConfig",
128
+ "LocalProcessor",
129
+ "DaskDistributedProcessor",
130
+ # Storage backends and their factory (from .storage_backends)
131
+ "StorageBackend",
132
+ "LocalStorage",
133
+ "S3Storage",
134
+ "get_storage_backend",
135
+ # STAC engines, engine lookup, and its error type (from .engines)
136
+ "STACEngine",
137
+ "EngineNotAvailableError",
138
+ "get_engine",
139
+ # Grid systems (from .grid_systems)
140
+ "GridSystem",
141
+ "H3GridSystem",
142
+ "S2GridSystem",
143
+ "MGRSGridSystem",
144
+ "UTMGridSystem",
145
+ "SimpleLatLonGrid",
146
+ # Input readers (from .input_readers)
147
+ "InputReader",
148
+ "ParquetReader",
149
+ "CSVReader",
150
+ # Schema generation (from .schema_generator)
151
+ "SchemaGenerator",
152
+ # Spatial partition resolution (from .spatial_resolver)
153
+ "SpatialPartitionResolver",
154
+ "spatial_resolver",
155
+ "resolve_and_query",
156
+ # Validation result types and validators (from .validation)
157
+ "ValidationResult",
158
+ "ValidationIssue",
159
+ "CatalogValidationResult",
160
+ "validate_stac_item",
161
+ "validate_stac_items_batch",
162
+ "validate_geoparquet_file",
163
+ "validate_catalog",
164
+ ]