earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2281 @@
# ingestion_pipeline.py
"""High-performance distributed STAC ingestion pipeline for massive geospatial dataset processing.

This module provides EarthCatalog's core ingestion capabilities, transforming URL lists
into spatially-partitioned, query-optimized GeoParquet catalogs. Designed for processing
100M+ STAC items efficiently across distributed computing environments with intelligent
partitioning, concurrent HTTP processing, and adaptive resource management.

Architecture Overview:
    The pipeline follows a multi-stage processing architecture:
    1. URL Reading: Flexible input from Parquet, CSV, or TSV files
    2. Batch Processing: URLs chunked into worker-manageable batches
    3. Concurrent Download: High-performance HTTP with connection pooling
    4. Spatial Partitioning: Grid-based organization for efficient queries
    5. Temporal Binning: Time-based organization (year/month/day)
    6. Consolidated Output: Optimized GeoParquet files with spatial indexing

Key Components:
    ProcessingConfig: Comprehensive configuration with production defaults
    STACIngestionPipeline: Main pipeline orchestrating the complete workflow
    LocalProcessor: Single-machine processing with async HTTP capabilities
    DaskDistributedProcessor: Cluster-based processing for massive scale
    DistributedProcessor: Abstract base for pluggable processing backends

Performance Features:
    - Async HTTP processing: 3-6x faster than sequential downloads
    - Concurrent request processing: 50-100+ requests/second per worker
    - Memory-efficient batching: Handles unlimited URL counts
    - Adaptive resource management: Scales from laptops to clusters
    - Intelligent retry strategies: Robust error handling and recovery
    - Connection pooling: Optimized network resource usage

Scalability:
    Tested Performance Metrics:
        - 100M URLs: ~7-14 hours with 32 workers (vs 71 hours sequential)
        - Memory usage: <20% per worker with 16GB RAM
        - Throughput: 50-100 STAC items/second per worker
        - Storage: Linear scaling with intelligent partitioning

Grid System Integration:
    Supports all major spatial partitioning systems:
    - H3: Recommended for most global applications
    - S2: Optimal for polar regions and spherical accuracy
    - UTM: High precision for regional datasets
    - MGRS: Standard for government and defense applications
    - Custom GeoJSON: Maximum flexibility for special use cases

Storage Backend Support:
    - Local filesystem: Development and small-scale production
    - S3: AWS cloud storage with optimized multipart uploads
    - GCS: Google Cloud Storage via fsspec integration
    - Azure: Azure Blob Storage support
    - Custom backends: Extensible storage abstraction

Temporal Organization:
    Flexible time-based partitioning:
    - Year-level: Long-term climate datasets
    - Month-level: Recommended for most time-series data
    - Day-level: High-frequency monitoring applications
    - Custom binning: Configurable temporal windows

Processing Modes:
    LocalProcessor:
        - Single-machine processing with async HTTP
        - Ideal for development and small-to-medium datasets
        - Memory efficient with configurable batch sizes
        - Built-in progress tracking and error reporting

    DaskDistributedProcessor:
        - Cluster-based processing for massive scale
        - Automatic task distribution and load balancing
        - Fault tolerance and automatic recovery
        - Resource monitoring and adaptive scheduling

Example Usage:
    >>> # Basic local processing
    >>> config = ProcessingConfig(
    ...     input_file='urls.parquet',
    ...     output_catalog='./catalog',
    ...     scratch_location='./scratch'
    ... )
    >>> pipeline = STACIngestionPipeline(config, LocalProcessor())
    >>> pipeline.run()
    >>>
    >>> # High-performance distributed processing
    >>> config = ProcessingConfig(
    ...     input_file='s3://bucket/urls.parquet',
    ...     output_catalog='s3://bucket/catalog',
    ...     scratch_location='s3://bucket/scratch',
    ...     concurrent_requests=100,  # High concurrency
    ...     max_workers=32  # Multiple workers
    ... )
    >>> processor = DaskDistributedProcessor(scheduler_address='scheduler-address:8786')
    >>> pipeline = STACIngestionPipeline(config, processor)
    >>> pipeline.run()

Configuration Best Practices:
    - Use H3 grid system for global datasets
    - Set concurrent_requests=50-100 for cloud storage
    - Configure batch_size=1000-5000 based on available memory
    - Use month-level temporal binning for time-series data
    - Enable global partitioning for datasets with large geometries

Error Handling:
    - Comprehensive retry strategies with exponential backoff
    - Failed URL logging with detailed error categorization
    - Graceful degradation for network and server issues
    - Progress tracking with automatic resumption capabilities
    - Memory pressure monitoring and adaptive batch sizing
"""

import asyncio
import glob as glob_module
import json
import logging
import tempfile
import time
import uuid
from abc import ABC, abstractmethod
from collections import defaultdict
from collections.abc import Callable
from dataclasses import dataclass
from pathlib import Path
from typing import Any, cast

import fsspec
import geopandas as gpd
import pandas as pd
import requests
from tqdm import tqdm

# Import STAC engine abstraction
from .engines import STACEngine, get_engine

# Conditional async HTTP imports
try:
    from .async_http_client import HAS_ASYNC_HTTP, download_stac_items_async
except ImportError:
    HAS_ASYNC_HTTP = False

    # Create a dummy async function for type checking
    async def download_stac_items_async(*args, **kwargs):  # type: ignore
        """Dummy async function for type checking when async HTTP is not available."""
        raise ImportError("Async HTTP client not available")


# Import from package modules
from . import grid_systems, input_readers, schema_generator, storage_backends
from .job_tracking import JobLogger, JobManifest, JobStatus
from .statistics import IngestionStatistics

logger = logging.getLogger(__name__)
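
The module reports progress and warnings through the standard `logging` machinery rather than printing, so an application has to opt in to see this logger's output. A minimal sketch of that opt-in (not part of the package; the handler format is an arbitrary choice):

```python
import logging

# Enable log output for the whole process; the package never calls basicConfig itself.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)

# Or scope verbosity to just this module's logger.
logging.getLogger("earthcatalog.ingestion_pipeline").setLevel(logging.DEBUG)
```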


@dataclass
class ProcessingConfig:
    """Comprehensive configuration for EarthCatalog's STAC ingestion pipeline with production defaults.

    This dataclass provides all configuration options for controlling pipeline behavior,
    from basic input/output paths to advanced performance tuning. Designed with sensible
    defaults that work well for most use cases while allowing fine-grained control for
    specialized requirements and performance optimization.

    Configuration Categories:
        Input/Output: File paths and formats for data sources and destinations
        Spatial: Grid system selection and partitioning parameters
        Temporal: Time-based binning and organization
        Async HTTP: High-performance concurrent request processing (3-6x speedup)
        Performance: Concurrency, memory management, and optimization
        Processing: Worker configuration and distributed computing
        Storage: Backend selection and cloud storage settings

    Async HTTP Performance Configuration:
        EarthCatalog includes built-in async HTTP capabilities that provide 3-6x performance
        improvements over sequential processing. Key async parameters:

        - enable_concurrent_http (bool, default=True): Enable async HTTP processing
        - concurrent_requests (int, default=50): Simultaneous HTTP requests per worker
        - batch_size (int, default=1000): URLs processed in each batch
        - connection_pool_size (int, default=100): HTTP connection pool size
        - request_timeout (int, default=30): Individual request timeout in seconds
        - retry_attempts (int, default=3): Maximum retry attempts per request
        - retry_delay (float, default=1.0): Base delay between retries in seconds

    Performance Tuning Guidelines:
        Development/Testing:
        - concurrent_requests=10-25, batch_size=100-500
        - Lower resource usage, easier debugging

        Production/Fast Networks:
        - concurrent_requests=50-100, batch_size=1000-2000
        - Maximum throughput for well-provisioned systems

        Unreliable Networks:
        - concurrent_requests=10-25, request_timeout=60-120
        - Higher reliability with longer timeouts

        Memory Constrained:
        - batch_size=100-500, connection_pool_size=25-50
        - Reduced memory footprint

    Cloud Storage Optimization:
        Special considerations for cloud deployments:
        - Use s3_multipart_threshold_mb for large file handling
        - Configure temp_dir_location for local staging
        - Set appropriate timeouts for network latency
        - Enable streaming_merge for memory efficiency
        - Higher concurrent_requests work well with cloud storage

    Configuration Examples:

        Basic Configuration (3-6x speedup with defaults):
        >>> config = ProcessingConfig(
        ...     input_file='urls.parquet',
        ...     output_catalog='./catalog',
        ...     scratch_location='./scratch'
        ...     # async HTTP enabled by default
        ... )

        High-Performance Cloud Configuration:
        >>> config = ProcessingConfig(
        ...     input_file='s3://bucket/urls.parquet',
        ...     output_catalog='s3://bucket/catalog',
        ...     scratch_location='s3://bucket/scratch',
        ...     # Async HTTP tuning for cloud scale
        ...     enable_concurrent_http=True,
        ...     concurrent_requests=100,
        ...     batch_size=2000,
        ...     connection_pool_size=200,
        ...     request_timeout=30,
        ...     # Worker and processing settings
        ...     max_workers=32,
        ...     items_per_shard=20000
        ... )

        Conservative/Reliable Configuration:
        >>> config = ProcessingConfig(
        ...     input_file='urls.parquet',
        ...     output_catalog='./catalog',
        ...     scratch_location='./scratch',
        ...     # Conservative async settings
        ...     concurrent_requests=25,
        ...     request_timeout=60,
        ...     retry_attempts=5,
        ...     retry_delay=2.0,
        ...     batch_size=500
        ... )

        Development/Debug Configuration:
        >>> config = ProcessingConfig(
        ...     input_file='sample_urls.parquet',
        ...     output_catalog='./test_catalog',
        ...     scratch_location='./test_scratch',
        ...     # Disable async for easier debugging
        ...     enable_concurrent_http=False,
        ...     max_workers=1,
        ...     verbose=True
        ... )

    Performance Monitoring:
        Use these settings to monitor and optimize performance:
        >>> config = ProcessingConfig(
        ...     stats_file='processing_stats.json',  # Performance metrics
        ...     verbose=True,  # Detailed logging
        ...     progress_interval=100  # Progress updates
        ... )

    Validation:
        Configuration validation ensures proper setup:
        >>> config.validate()  # Raises ValueError or FileNotFoundError for invalid settings
        >>> # Automatically validates paths, dependencies, and parameter ranges
    """

    input_file: str
    output_catalog: str
    scratch_location: str
    input_format: str = "auto"  # auto, parquet, csv, tsv, ndjson, jsonl
    url_column: str = "url"  # Column name containing URLs
    # Multi-file input support (glob patterns)
    # If specified, input_file is treated as a base path and input_pattern is used for file discovery
    # Example: "s3://bucket/bulk/2020_*.ndjson" will discover all matching files
    input_pattern: str = ""  # Glob pattern for multi-file input
    grid_system: str = "h3"  # h3 only for simplicity
    grid_resolution: int = 2  # H3 resolution (default level 2)
    temporal_bin: str = "month"  # year, month, day

    # Output format options
    output_format: str = "geoparquet"  # "geoparquet" or "ndjson"
    mission_field: str = "dataset_id"  # Field to extract mission from
    sort_key: str = "datetime"
    sort_ascending: bool = True
    items_per_shard: int = 10000
    max_workers: int = 8
    # Global partitioning options
    enable_global_partitioning: bool = True  # Route multi-cell items to /global/
    global_partition_threshold: int = 50  # H3 resolution 6 threshold
    # Schema generation options
    generate_schema: bool = True  # Generate catalog schema metadata (default: enabled)
    schema_filename: str = "catalog_schema.json"  # Schema output filename
    geojson_path: str = ""  # Path to custom GeoJSON tiles (for geojson grid system)
    # Consolidation options
    max_memory_per_partition_mb: int = 1024  # Memory limit per partition
    enable_streaming_merge: bool = True  # Use streaming for large files
    s3_multipart_threshold_mb: int = 100  # When to use multipart upload
    temp_dir_location: str = tempfile.gettempdir()  # Local temp space for staging

    # Async HTTP Configuration
    enable_concurrent_http: bool = True  # Enable async HTTP processing
    concurrent_requests: int = 50  # Concurrent requests per worker
    connection_pool_size: int = 100  # HTTP connection pool size
    request_timeout: int = 30  # Request timeout in seconds
    retry_attempts: int = 3  # Max retry attempts
    retry_delay: float = 1.0  # Base retry delay in seconds
    batch_size: int = 1000  # URLs processed per async batch

    # Validation Configuration
    enable_validation: bool = True  # Enable STAC item validation on ingest
    fix_invalid_geometry: bool = True  # Attempt to fix invalid geometries
    fix_bbox_mismatch: bool = True  # Correct bbox when it doesn't match geometry
    bbox_tolerance: float = 1e-6  # Tolerance for bbox comparison (degrees)
    log_validation_warnings: bool = True  # Log validation warnings

    # STAC Engine Configuration
    stac_engine: str = "rustac"  # "rustac", "stac-geoparquet", or "auto"

    # STAC Fetch Hook Configuration
    # Allows custom STAC item generation when URLs don't point to STAC JSON
    # Supported formats:
    # - "default": Standard STAC JSON fetch (default behavior)
    # - "module:package.module:function": Python function import path
    # - "script:/path/to/script": External executable
    # - "script:python:/path/to/script.py": Script with interpreter
    stac_hook: str = "default"

    # Batch mode configuration
    # Controls when to use simple local processing vs full distributed processing
    batch_threshold: int = 10000  # Below this, use simple local processing
    distributed: bool | None = None  # True=force distributed, False=force local, None=auto
    large_batch_confirm_threshold: int = 20000  # Prompt user if local mode exceeds this

    # Dask distributed configuration
    dask_scheduler_address: str = (
        ""  # Dask scheduler address (e.g., 'tcp://localhost:8786'). Empty = create local cluster
    )

    # Checkpoint configuration for distributed processing
    # Use time-based checkpointing for better performance with many partitions
    checkpoint_interval_seconds: int = 30  # Save checkpoint every N seconds (0 to disable)
    checkpoint_partition_count: int = 0  # Save checkpoint every N partitions (0 to disable)
    # Note: If both are 0, defaults to time-based (30 seconds)

    def validate(self):
        """Validate configuration before processing."""
        # Validate input source - either input_file or input_pattern
        if not self.input_pattern:
            # Traditional single file mode - validate file exists
            if not Path(self.input_file).exists() and not self.input_file.startswith("s3://"):
                raise FileNotFoundError(f"Input file not found: {self.input_file}")
        # If input_pattern is provided, file existence will be validated during pattern resolution
        if self.grid_resolution < 0 or self.grid_resolution > 15:
            raise ValueError("H3 resolution must be 0-15")
        if self.temporal_bin not in ["year", "month", "day"]:
            raise ValueError("temporal_bin must be 'year', 'month', or 'day'")
        if self.output_format not in ["geoparquet", "ndjson"]:
            raise ValueError("output_format must be 'geoparquet' or 'ndjson'")
        if self.items_per_shard <= 0:
            raise ValueError("items_per_shard must be positive")
        if self.max_workers <= 0:
            raise ValueError("max_workers must be positive")
        if self.max_memory_per_partition_mb <= 0:
            raise ValueError("max_memory_per_partition_mb must be positive")
        if self.s3_multipart_threshold_mb <= 0:
            raise ValueError("s3_multipart_threshold_mb must be positive")

        # Async HTTP validation
        if self.concurrent_requests <= 0:
            raise ValueError("concurrent_requests must be positive")
        if self.connection_pool_size <= 0:
            raise ValueError("connection_pool_size must be positive")
        if self.request_timeout <= 0:
            raise ValueError("request_timeout must be positive")
        if self.retry_attempts < 0:
            raise ValueError("retry_attempts must be non-negative")
        if self.retry_delay < 0:
            raise ValueError("retry_delay must be non-negative")
        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive")

        # STAC engine validation
        valid_engines = ("rustac", "stac-geoparquet", "auto")
        if self.stac_engine not in valid_engines:
            raise ValueError(f"stac_engine must be one of {valid_engines}, got: {self.stac_engine}")

        # STAC hook validation
        valid_hook_prefixes = ("default", "passthrough", "module:", "script:")
        if not any(self.stac_hook.startswith(prefix) for prefix in valid_hook_prefixes):
            raise ValueError(
                f"stac_hook must be 'default', 'passthrough', 'module:path:func', or 'script:/path', got: {self.stac_hook}"
            )

        # Dask scheduler validation - warn if remote scheduler with local storage
        if self.dask_scheduler_address and not self.dask_scheduler_address.startswith("local"):
            # Remote scheduler detected
            local_paths = []
            if not self.scratch_location.startswith(("s3://", "gs://", "az://")):
                local_paths.append("scratch_location")
            if not self.output_catalog.startswith(("s3://", "gs://", "az://")):
                local_paths.append("output_catalog")

            if local_paths:
                logger.warning(
                    f"Using remote Dask scheduler ({self.dask_scheduler_address}) with local storage paths: "
                    f"{', '.join(local_paths)}. "
                    f"Remote workers may not have access to local paths. "
                    f"Consider using cloud storage (s3://, gs://) for {', '.join(local_paths)}."
                )

        # Batch mode validation
        if self.batch_threshold <= 0:
            raise ValueError("batch_threshold must be positive")
        if self.large_batch_confirm_threshold <= 0:
            raise ValueError("large_batch_confirm_threshold must be positive")

    def __repr__(self) -> str:
        """Return concise string representation with key configuration."""
        return (
            f"ProcessingConfig("
            f"input='{self.input_file}', "
            f"output='{self.output_catalog}', "
            f"grid={self.grid_system}@{self.grid_resolution}, "
            f"workers={self.max_workers})"
        )

    def __bool__(self) -> bool:
        """Return True if configuration is valid."""
        try:
            self.validate()
            return True
        except (ValueError, FileNotFoundError):
            return False

    def to_dict(self) -> dict[str, Any]:
        """Serialize configuration to dictionary for storage or transmission.

        All fields are included, enabling complete reconstruction via from_dict().
        Useful for:
        - Storing config in job manifests for recovery
        - Passing config to remote Dask workers
        - Caching/resumption of processing jobs

        Returns:
            Dictionary with all configuration fields.
        """
        return {
            "input_file": self.input_file,
            "output_catalog": self.output_catalog,
            "scratch_location": self.scratch_location,
            "input_format": self.input_format,
            "url_column": self.url_column,
            "input_pattern": self.input_pattern,
            "grid_system": self.grid_system,
            "grid_resolution": self.grid_resolution,
            "temporal_bin": self.temporal_bin,
            "output_format": self.output_format,
            "mission_field": self.mission_field,
            "sort_key": self.sort_key,
            "sort_ascending": self.sort_ascending,
            "items_per_shard": self.items_per_shard,
            "max_workers": self.max_workers,
            "enable_global_partitioning": self.enable_global_partitioning,
            "global_partition_threshold": self.global_partition_threshold,
            "generate_schema": self.generate_schema,
            "schema_filename": self.schema_filename,
            "geojson_path": self.geojson_path,
            "max_memory_per_partition_mb": self.max_memory_per_partition_mb,
            "enable_streaming_merge": self.enable_streaming_merge,
            "s3_multipart_threshold_mb": self.s3_multipart_threshold_mb,
            "temp_dir_location": self.temp_dir_location,
            "enable_concurrent_http": self.enable_concurrent_http,
            "concurrent_requests": self.concurrent_requests,
            "connection_pool_size": self.connection_pool_size,
            "request_timeout": self.request_timeout,
            "retry_attempts": self.retry_attempts,
            "retry_delay": self.retry_delay,
            "batch_size": self.batch_size,
            "enable_validation": self.enable_validation,
            "fix_invalid_geometry": self.fix_invalid_geometry,
            "fix_bbox_mismatch": self.fix_bbox_mismatch,
            "bbox_tolerance": self.bbox_tolerance,
            "log_validation_warnings": self.log_validation_warnings,
            "stac_engine": self.stac_engine,
            "stac_hook": self.stac_hook,
            "batch_threshold": self.batch_threshold,
            "distributed": self.distributed,
            "large_batch_confirm_threshold": self.large_batch_confirm_threshold,
            "dask_scheduler_address": self.dask_scheduler_address,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ProcessingConfig":
        """Reconstruct configuration from dictionary.

        Only includes fields that are present in the dictionary, allowing
        for backward compatibility when new fields are added.

        Args:
            data: Dictionary with configuration fields.

        Returns:
            ProcessingConfig instance.
        """
        # Required fields
        config = cls(
            input_file=data["input_file"],
            output_catalog=data["output_catalog"],
            scratch_location=data["scratch_location"],
        )

        # Optional fields - update only if present in data
        optional_fields = [
            "input_format",
            "url_column",
            "input_pattern",
            "grid_system",
            "grid_resolution",
            "temporal_bin",
            "output_format",
            "mission_field",
            "sort_key",
            "sort_ascending",
            "items_per_shard",
            "max_workers",
            "enable_global_partitioning",
            "global_partition_threshold",
            "generate_schema",
            "schema_filename",
            "geojson_path",
            "max_memory_per_partition_mb",
            "enable_streaming_merge",
            "s3_multipart_threshold_mb",
            "temp_dir_location",
            "enable_concurrent_http",
            "concurrent_requests",
            "connection_pool_size",
            "request_timeout",
            "retry_attempts",
            "retry_delay",
            "batch_size",
            "enable_validation",
            "fix_invalid_geometry",
            "fix_bbox_mismatch",
            "bbox_tolerance",
            "log_validation_warnings",
            "stac_engine",
            "stac_hook",
            "batch_threshold",
            "distributed",
            "large_batch_confirm_threshold",
            "dask_scheduler_address",
        ]

        for field in optional_fields:
            if field in data:
                setattr(config, field, data[field])

        return config

    def config_hash(self) -> str:
        """Generate a hash of configuration for idempotency checking.

        The hash covers settings that affect output content, excluding
        paths and runtime-only settings. Two configs with the same hash
        would produce identical output given the same input.

        Returns:
            Hex digest of the configuration hash.
        """
        import hashlib

        # Fields that affect output content (not paths or runtime settings)
        content_affecting = {
            "grid_system": self.grid_system,
            "grid_resolution": self.grid_resolution,
            "temporal_bin": self.temporal_bin,
            "output_format": self.output_format,
            "mission_field": self.mission_field,
            "sort_key": self.sort_key,
            "sort_ascending": self.sort_ascending,
            "items_per_shard": self.items_per_shard,
            "enable_global_partitioning": self.enable_global_partitioning,
            "global_partition_threshold": self.global_partition_threshold,
            "enable_validation": self.enable_validation,
            "fix_invalid_geometry": self.fix_invalid_geometry,
            "fix_bbox_mismatch": self.fix_bbox_mismatch,
        }

        content_str = json.dumps(content_affecting, sort_keys=True)
        return hashlib.sha256(content_str.encode()).hexdigest()[:16]
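
The `to_dict()`/`from_dict()` pair and `config_hash()` above are what make a config portable to remote Dask workers and comparable across runs. A minimal sketch of the round trip (the paths are placeholders, and constructing a `ProcessingConfig` does not validate them until `validate()` is called):

```python
from earthcatalog.ingestion_pipeline import ProcessingConfig

config = ProcessingConfig(
    input_file="urls.parquet",        # placeholder paths
    output_catalog="./catalog",
    scratch_location="./scratch",
    concurrent_requests=25,
)

# Serialize for a job manifest or a remote worker, then rebuild on the other side.
payload = config.to_dict()
restored = ProcessingConfig.from_dict(payload)

# config_hash() ignores paths and runtime settings, so the same
# content-affecting options yield the same 16-hex-digit hash.
assert restored.config_hash() == config.config_hash()
```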


class DistributedProcessor(ABC):
    """Abstract base for different parallel processing backends."""

    @abstractmethod
    def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
        """Process URL chunks in parallel."""
        pass

    @abstractmethod
    def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
        """Consolidate shards in parallel."""
        pass

    @abstractmethod
    def close(self):
        """Clean up resources."""
        pass
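
This ABC is the extension point for alternative backends: anything that can fan `process_fn` out over URL chunks and `consolidate_fn` out over partitions, then gather the results in order, satisfies the contract. A hypothetical sketch of a third backend built on a process pool (illustrative only, not shipped with the package):

```python
from collections.abc import Callable
from concurrent.futures import ProcessPoolExecutor
from typing import Any


class ProcessPoolProcessor(DistributedProcessor):
    """Illustrative backend using a local process pool.

    Unlike LocalProcessor's threads, process_fn and consolidate_fn must be
    picklable (module-level functions), similar to the Dask backend's constraint.
    """

    def __init__(self, n_workers: int = 4):
        self.executor = ProcessPoolExecutor(max_workers=n_workers)

    def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
        # One task per chunk; the worker index is passed as the second argument.
        futures = [self.executor.submit(process_fn, chunk, idx, **kwargs) for idx, chunk in enumerate(url_chunks)]
        return [f.result() for f in futures]

    def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
        futures = [
            self.executor.submit(consolidate_fn, key, paths, **kwargs) for key, paths in partition_items
        ]
        return [f.result() for f in futures]

    def close(self):
        self.executor.shutdown(wait=True)
```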


class DaskDistributedProcessor(DistributedProcessor):
    """Dask-based distributed processing using serializable worker functions.

    This processor uses the module-level functions from workers.py which can be
    pickled and sent to remote Dask workers. Unlike LocalProcessor, it cannot
    use closures that capture instance state.

    Supports two modes:
    1. Local cluster: Creates a local Dask cluster (default)
    2. Remote cluster: Connects to an existing Dask scheduler via scheduler_address
    """

    def __init__(self, n_workers: int = 8, threads_per_worker: int = 1, scheduler_address: str | None = None):
        """Initialize Dask distributed processor with specified worker configuration.

        Args:
            n_workers: Number of workers (only used when creating local cluster)
            threads_per_worker: Threads per worker (only used when creating local cluster)
            scheduler_address: Optional Dask scheduler address (e.g., 'tcp://localhost:8786').
                If provided, connects to existing cluster instead of creating local one.
        """
        self.n_workers = n_workers
        self.threads_per_worker = threads_per_worker
        self.scheduler_address = scheduler_address
        try:
            import dask.distributed as dd

            if scheduler_address:
                # Connect to existing Dask cluster
                self.client = dd.Client(scheduler_address)
                logger.info(f"Connected to Dask scheduler at {scheduler_address}")
                logger.info(f"Dask dashboard: {self.client.dashboard_link}")
            else:
                # Create local Dask cluster
                self.client = dd.Client(n_workers=n_workers, threads_per_worker=threads_per_worker, memory_limit="4GB")
                logger.info(f"Created local Dask cluster with {n_workers} workers")
                logger.info(f"Dask dashboard: {self.client.dashboard_link}")
        except ImportError:
            raise ImportError("Dask distributed required: pip install dask distributed") from None

    def __repr__(self) -> str:
        """Return string representation."""
        if self.scheduler_address:
            return f"DaskDistributedProcessor(scheduler_address='{self.scheduler_address}')"
        return f"DaskDistributedProcessor(n_workers={self.n_workers}, threads_per_worker={self.threads_per_worker})"

    def close(self):
        """Close the Dask client connection."""
        if hasattr(self, "client") and self.client:
            self.client.close()
            logger.info("Dask client closed")

    def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
        """Process URL chunks using Dask distributed workers.

        Note: For Dask, we use the serializable workers.process_url_batch function
        instead of the provided process_fn (which may capture non-serializable state).
        The process_fn is ignored and workers.process_url_batch is used instead.
        """
        # Import here to avoid circular imports
        from .workers import process_url_batch

        # Extract required kwargs
        config_dict = kwargs.get("config_dict")
        job_id = kwargs.get("job_id", "unknown")

        if config_dict is None:
            # Fall back to old behavior if config_dict not provided
            futures = [self.client.submit(process_fn, chunk, idx, **kwargs) for idx, chunk in enumerate(url_chunks)]
            return cast(list[Any], self.client.gather(futures))

        # Use serializable worker function
        futures = [
            self.client.submit(
                process_url_batch,
                urls=chunk,
                worker_id=idx,
                config_dict=config_dict,
                job_id=job_id,
                batch_idx=idx,
            )
            for idx, chunk in enumerate(url_chunks)
        ]
        return cast(list[Any], self.client.gather(futures))

    def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
        """Consolidate shards using Dask distributed workers.

        Note: For Dask, we use the serializable workers.consolidate_partition function
        instead of the provided consolidate_fn.
        """
        from .workers import consolidate_partition

        config_dict = kwargs.get("config_dict")

        if config_dict is None:
            # Fall back to old behavior
            futures = [
                self.client.submit(consolidate_fn, partition_key, shard_paths, **kwargs)
                for partition_key, shard_paths in partition_items
            ]
            return cast(list[Any], self.client.gather(futures))

        # Use serializable worker function
        futures = [
            self.client.submit(
                consolidate_partition,
                partition_key=partition_key,
                shard_paths=shard_paths,
                config_dict=config_dict,
            )
            for partition_key, shard_paths in partition_items
        ]
        return cast(list[Any], self.client.gather(futures))
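
Both Dask modes come down to how the `dask.distributed.Client` is constructed. A hedged usage sketch (scheduler address, bucket paths, and worker counts are placeholders):

```python
from earthcatalog.ingestion_pipeline import (
    DaskDistributedProcessor,
    ProcessingConfig,
    STACIngestionPipeline,
)

# Local cluster: the processor spins up n_workers single-threaded Dask workers.
processor = DaskDistributedProcessor(n_workers=8, threads_per_worker=1)

# Existing cluster: pass the scheduler address; n_workers/threads_per_worker are ignored.
# processor = DaskDistributedProcessor(scheduler_address="tcp://scheduler.internal:8786")

config = ProcessingConfig(
    input_file="s3://bucket/urls.parquet",   # placeholder paths
    output_catalog="s3://bucket/catalog",
    scratch_location="s3://bucket/scratch",
)

try:
    # The pipeline passes config.to_dict() to the picklable worker functions,
    # so remote workers can rebuild ProcessingConfig themselves.
    pipeline = STACIngestionPipeline(config, processor)
    pipeline.run()
finally:
    processor.close()  # closes the Dask client in either mode
```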


class LocalProcessor(DistributedProcessor):
    """Local multi-threading processor using concurrent.futures."""

    def __init__(self, n_workers: int = 8):
        from concurrent.futures import ThreadPoolExecutor

        self.n_workers = n_workers
        self.executor = ThreadPoolExecutor(max_workers=n_workers)
        logger.info(f"Local processor started with {n_workers} workers")

    def __repr__(self) -> str:
        """Return string representation."""
        return f"LocalProcessor(n_workers={self.n_workers})"

    def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
        futures = [self.executor.submit(process_fn, chunk, idx, **kwargs) for idx, chunk in enumerate(url_chunks)]
        return [f.result() for f in futures]

    def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
        futures = [
            self.executor.submit(consolidate_fn, partition_key, shard_paths, **kwargs)
            for partition_key, shard_paths in partition_items
        ]
        return [f.result() for f in futures]

    def close(self):
        """Shutdown the executor."""
        self.executor.shutdown(wait=True)
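
LocalProcessor keeps the same fan-out/gather contract as the Dask backend but runs `process_fn` directly in a thread pool, so closures over pipeline state are fine. A small self-contained sketch with a toy `process_fn` (the real pipeline passes a batch-download function here):

```python
processor = LocalProcessor(n_workers=4)

def toy_process_fn(urls: list[str], worker_id: int) -> dict:
    # Stand-in for the real batch worker, which downloads and shards STAC items.
    return {"worker_id": worker_id, "count": len(urls)}

chunks = [["u1", "u2"], ["u3"], ["u4", "u5", "u6"]]
results = processor.process_urls(chunks, toy_process_fn)
processor.close()

# One result per chunk, returned in submission order:
# [{'worker_id': 0, 'count': 2}, {'worker_id': 1, 'count': 1}, {'worker_id': 2, 'count': 3}]
```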


class STACIngestionPipeline:
    """Enterprise-grade STAC ingestion pipeline for massive geospatial dataset processing.

    This is the main orchestrator class that coordinates the complete STAC ingestion
    workflow, from URL reading through spatial partitioning to optimized catalog
    generation. Designed for production environments processing 100M+ STAC items
    with high performance, reliability, and scalability.

    Pipeline Architecture:
        The pipeline implements a sophisticated multi-stage processing architecture:
        1. Input Reading: Flexible support for Parquet, CSV, TSV with auto-detection
        2. URL Chunking: Intelligent batching based on memory and processing constraints
        3. Distributed Processing: Pluggable processors for local or cluster execution
        4. Concurrent Download: High-performance async HTTP with connection pooling
        5. Spatial Partitioning: Grid-based organization optimized for spatial queries
        6. Temporal Binning: Time-based organization for efficient time-series queries
        7. Consolidation: Memory-efficient merging with streaming for large datasets
        8. Output Generation: Optimized GeoParquet with spatial indexing

    Performance Features:
        - Async HTTP: 3-6x faster downloads with 50-100+ concurrent requests
        - Memory Management: Constant memory usage regardless of dataset size
        - Incremental Processing: Resume interrupted jobs and update existing catalogs
        - Error Recovery: Comprehensive retry strategies and graceful degradation
        - Progress Tracking: Real-time monitoring with detailed logging
        - Resource Optimization: Adaptive batching and memory pressure handling

    Storage Integration:
        - Local Filesystem: Development and small-scale production
        - S3 Compatible: AWS S3, MinIO, with optimized multipart uploads
        - Google Cloud Storage: Native integration via fsspec
        - Azure Blob Storage: Full support for Azure cloud deployments
        - Custom Storage: Extensible backend system for specialized requirements

    Incremental Updates:
        Advanced incremental processing capabilities:
        - Automatic detection of existing partitions
        - Efficient merging of new data with existing catalogs
        - Temporal range updates without full reprocessing
        - Safe atomic operations preventing catalog corruption
        - Rollback capabilities for failed updates

    Processing Modes:
        LocalProcessor:
            - Single-machine processing optimized for async HTTP
            - Memory-efficient for datasets up to tens of millions of items
            - Ideal for development, testing, and medium-scale production

        DaskDistributedProcessor:
            - Cluster-based processing for massive scale
            - Fault tolerance with automatic task recovery
            - Dynamic scaling based on cluster resources
            - Optimized for 100M+ item datasets

    Quality Assurance:
        - Comprehensive input validation and error checking
        - STAC specification compliance verification
        - Spatial geometry validation and repair
        - Temporal data consistency checking
        - Output format validation and optimization

    Example:
        >>> # Basic pipeline setup
        >>> config = ProcessingConfig(
        ...     input_file='stac_urls.parquet',
        ...     output_catalog='./catalog',
        ...     scratch_location='./scratch'
        ... )
        >>> pipeline = STACIngestionPipeline(config, LocalProcessor())
        >>>
        >>> # Execute complete pipeline
        >>> pipeline.run()  # Processes all URLs and creates catalog
        >>>
        >>> # Advanced configuration for large-scale processing
        >>> config = ProcessingConfig(
        ...     input_file='s3://data/urls.parquet',
        ...     output_catalog='s3://catalog/output',
        ...     scratch_location='s3://catalog/scratch',
        ...     grid_system='h3',
        ...     grid_resolution=6,
        ...     concurrent_requests=100,
        ...     batch_size=5000,
        ...     max_workers=32
        ... )
        >>> processor = DaskDistributedProcessor(scheduler_address='scheduler:8786')
        >>> pipeline = STACIngestionPipeline(config, processor)
        >>> pipeline.run()

    Thread Safety:
        The pipeline is designed for single-threaded execution within each worker
        process. Multiple pipeline instances can run concurrently in separate
        processes for distributed processing scenarios.

    Monitoring:
        Comprehensive monitoring and observability:
        - Progress bars with ETA and throughput metrics
        - Detailed logging of all processing stages
        - Error categorization and reporting
        - Performance metrics and bottleneck identification
        - Memory usage tracking and optimization recommendations
    """

    def __init__(self, config: ProcessingConfig, processor: DistributedProcessor):
        self.config = config
        self.processor = processor
        self.storage = self._init_storage(config.output_catalog)
        self.scratch_storage = self._init_storage(config.scratch_location)
        self.grid = grid_systems.get_grid_system(config.grid_system, config.grid_resolution)

        # Initialize STAC engine for item conversion
        self.engine: STACEngine = get_engine(config.stac_engine)  # type: ignore[arg-type]
        logger.info(f"Using STAC engine: {self.engine.name}")

        # Initialize statistics collector
        self.stats = IngestionStatistics()

    def __repr__(self) -> str:
        """Return string representation with key pipeline state."""
        return (
            f"STACIngestionPipeline("
            f"input='{self.config.input_file}', "
            f"output='{self.config.output_catalog}', "
            f"grid={self.config.grid_system}@{self.config.grid_resolution}, "
            f"processor={self.processor.__class__.__name__})"
        )

    def _init_storage(self, path: str) -> storage_backends.StorageBackend:
        """Initialize storage backend."""
        if path.startswith("s3://"):
            return storage_backends.S3Storage(path)
        else:
            return storage_backends.LocalStorage(path)

    def _should_use_distributed(self, url_count: int) -> bool:
        """Determine whether to use distributed processing based on URL count and config.

        Processing mode selection logic:
        1. If config.distributed is True: always use distributed
        2. If config.distributed is False: always use simple local
        3. If config.distributed is None (auto): use URL count threshold

        Args:
            url_count: Number of URLs to process.

        Returns:
            True if distributed processing should be used, False for simple local.
        """
        if self.config.distributed is True:
            logger.info("Using distributed processing (explicitly enabled)")
            return True
        if self.config.distributed is False:
            logger.info("Using simple local processing (explicitly disabled distributed)")
            return False
        # Auto mode: decide based on threshold
        use_distributed = url_count >= self.config.batch_threshold
        if use_distributed:
            logger.info(
                f"Using distributed processing: {url_count:,} URLs >= threshold {self.config.batch_threshold:,}"
            )
        else:
            logger.info(
                f"Using simple local processing: {url_count:,} URLs < threshold {self.config.batch_threshold:,}"
            )
        return use_distributed

    def run(self, job_id: str | None = None, resume: bool = False) -> dict[str, Any]:
        """Execute the complete ingestion pipeline with job tracking.

        Args:
            job_id: Optional job ID to use. If None, generates a new UUID.
            resume: If True, attempts to resume an incomplete job. Ignores job_id
                and searches for the most recent incomplete job.

        Returns:
            Dictionary mapping partition keys to consolidation stats.

        Raises:
            ValueError: If resume=True but no incomplete job found.
        """
        # Handle resume case
        if resume:
            manifest = JobManifest.find_incomplete(self.storage, self.config.output_catalog)
            if manifest is None:
                raise ValueError("No incomplete job found to resume")
            job_id = manifest.job_id
            logger.info(f"Resuming job {job_id} from {manifest.status.value} phase")
        elif job_id is None:
            job_id = str(uuid.uuid4())

        # Initialize job logger
        job_logger = JobLogger(self.storage, self.config.output_catalog, job_id)

        logger.info("=" * 80)
        logger.info("Starting STAC ingestion pipeline")
        logger.info(f"Job ID: {job_id}")
        logger.info(f"Input: {self.config.input_file}")
        logger.info(f"Output: {self.config.output_catalog}")
        logger.info(f"Grid: {self.config.grid_system} (resolution {self.config.grid_resolution})")
        logger.info(f"Temporal binning: {self.config.temporal_bin}")
        logger.info("=" * 80)

        # Start statistics tracking
        self.stats.start_processing()

        # Phase 1: Read input and distribute work
        urls = self._read_input_urls()
        logger.info(f"Loaded {len(urls):,} URLs from input file")

        # Check if we should use simple mode (for small batches)
        # Note: Resume always uses distributed mode
        if not resume and not self._should_use_distributed(len(urls)):
            # Check if local mode with large batch needs confirmation
            if len(urls) > self.config.large_batch_confirm_threshold:
                logger.warning(
                    f"Processing {len(urls):,} URLs in local mode. "
                    f"Consider using --distributed for batches > {self.config.large_batch_confirm_threshold:,}"
                )
            # job_id is guaranteed to be set at this point (either from param or uuid)
            assert job_id is not None
            return self._run_simple(urls, job_id)

        # Create or load job manifest
        if resume:
            manifest = JobManifest.load(self.storage, self.config.output_catalog, job_id)
        else:
            manifest = JobManifest(
                job_id=job_id,
                input_urls_count=len(urls),
                config_hash=self.config.config_hash(),
            )

        # Track shard info for consolidation
        shard_info: list[dict[str, Any]] = []

        try:
            # Phase 2: Parallel processing to scratch location
            with tqdm(total=4, desc="Pipeline Progress", unit="phase") as pipeline_pbar:
                # Skip download phase if already completed (resume case)
                if manifest.download_phase.completed:
                    logger.info("Download phase already completed, skipping")
                    shard_info = [
                        {"shard_path": path, "item_count": 0, "partition_key": ""}
                        for path in manifest.download_phase.shards_written
                    ]
                    pipeline_pbar.update(1)
                else:
                    pipeline_pbar.set_description("Processing URLs")
                    manifest.status = JobStatus.DOWNLOADING
                    manifest.save(self.storage, self.config.output_catalog)
                    job_logger.log_phase_start("download")

                    shard_info = self._process_urls_distributed(urls, job_id=job_id)
                    total_shards = len(shard_info)
                    total_items = sum(s["item_count"] for s in shard_info)
                    logger.info(f"Generated {total_shards:,} shards ({total_items:,} items) in scratch space")

                    # Update manifest with download phase completion
                    manifest.download_phase.completed = True
                    manifest.download_phase.shards_written = [s["shard_path"] for s in shard_info]
                    manifest.download_phase.urls_processed = len(urls)
                    manifest.save(self.storage, self.config.output_catalog)
                    job_logger.log_phase_complete("download", {"shards": total_shards, "items": total_items})
                    pipeline_pbar.update(1)

                # Phase 3: Consolidate shards into final catalog
                # Skip if already completed (resume case)
                if manifest.consolidation_phase.completed:
                    logger.info("Consolidation phase already completed, skipping")
                    pipeline_pbar.update(1)
                    final_stats = {}  # Will be re-computed from catalog
                else:
                    pipeline_pbar.set_description("Consolidating shards")
                    manifest.status = JobStatus.CONSOLIDATING
                    manifest.save(self.storage, self.config.output_catalog)
                    job_logger.log_phase_start("consolidation")

                    # Filter out already-completed partitions for resume
                    completed_partitions = set(manifest.consolidation_phase.completed_partitions)
                    final_stats = self._consolidate_shards(
                        shard_info,
                        skip_partitions=completed_partitions,
                        manifest=manifest,
                    )
                    logger.info(f"Consolidated into {len(final_stats):,} final partitions")

                    manifest.consolidation_phase.completed = True
                    manifest.save(self.storage, self.config.output_catalog)
                    job_logger.log_phase_complete("consolidation", {"partitions": len(final_stats)})
                    pipeline_pbar.update(1)

                # Phase 4: Cleanup scratch space
                pipeline_pbar.set_description("Cleaning up")
                self._cleanup_scratch(shard_info)
                logger.info("Scratch space cleanup completed")
                pipeline_pbar.update(1)

            # Mark job as completed
            manifest.status = JobStatus.COMPLETED
            manifest.save(self.storage, self.config.output_catalog)
            job_logger.log("INFO", "Job completed successfully")

        except Exception as e:
            # Mark job as failed
            manifest.status = JobStatus.FAILED
            manifest.error = str(e)
            manifest.save(self.storage, self.config.output_catalog)
            job_logger.log_error(f"Job failed: {e}")
            raise

        # Finish statistics tracking
        self.stats.finish_processing()

        # Update statistics with consolidation results
        for _partition_key, stats in final_stats.items():
            self.stats.record_consolidation(
                new_items=stats.get("new_items", 0),
                existing_items=stats.get("existing_items", 0),
                duplicates_removed=stats.get("duplicates_removed", 0),
            )

        # Generate schema if requested
        if self.config.generate_schema:
            logger.info("Generating catalog schema metadata...")
            schema_gen = schema_generator.SchemaGenerator(self.config, self.grid, self.storage, self.stats)
            schema_gen.generate_catalog_schema(final_stats, self.config.schema_filename)

        # Summary with enhanced statistics
        stats_summary = self.stats.get_summary()
        logger.info("=" * 80)
        logger.info("PIPELINE COMPLETED SUCCESSFULLY")
        logger.info(f"Job ID: {job_id}")
        logger.info(f"Total partitions: {len(final_stats)}")
        logger.info(f"Unique granules: {stats_summary['unique_granules']:,}")
        logger.info(f"Stored references: {stats_summary['stored_references']:,}")
        logger.info(
            f"Overhead: {stats_summary['overhead']['overhead_percentage']:.1f}% "
            f"({stats_summary['overhead']['spanning_items']:,} spanning items)"
        )
        total_new = sum(s["new_items"] for s in final_stats.values())
        logger.info(f"New items: {total_new:,}")
        if self.config.generate_schema:
            logger.info(f"Schema metadata: {self.config.output_catalog}/{self.config.schema_filename}")
        logger.info("=" * 80)

        return final_stats

    def _run_simple(self, urls: list[str], job_id: str) -> dict[str, Any]:
        """Execute simplified pipeline for small batches without full distributed overhead.

        This method provides a streamlined processing path for datasets below the
        batch_threshold. It still:
        - Uses async HTTP for performance
        - Creates a job manifest for observability
        - Uses the same partitioning and consolidation logic

        But it skips:
        - Resume/checkpoint capability (not needed for small batches)
        - Complex worker distribution logic
        - Multi-phase progress tracking

        Args:
            urls: List of URLs to process.
            job_id: Job identifier for logging.

        Returns:
            Dictionary mapping partition keys to consolidation stats.
        """
        logger.info("=" * 80)
        logger.info("Running simplified pipeline (small batch mode)")
        logger.info(f"Job ID: {job_id}")
        logger.info(f"URLs: {len(urls):,}")
        logger.info("=" * 80)

        # Start statistics tracking
        self.stats.start_processing()

        # Create minimal job manifest for observability
        manifest = JobManifest(
            job_id=job_id,
            input_urls_count=len(urls),
            config_hash=self.config.config_hash(),
        )
        manifest.status = JobStatus.DOWNLOADING
        manifest.save(self.storage, self.config.output_catalog)

        # Process all URLs in a single batch using async HTTP
        worker_tag = f"simple-{uuid.uuid4().hex[:8]}"
        all_items: list[dict[str, Any]] = []

        # Use async HTTP if available and enabled
        if self.config.enable_concurrent_http and HAS_ASYNC_HTTP:
            logger.info(f"Downloading {len(urls):,} items with async HTTP...")
            items = self._download_stac_items_batch_async(urls, worker_id=0)
            for item in items:
                if item:
                    all_items.append(item)
                    self.stats.record_url_processed(success=True)
                else:
                    self.stats.record_url_processed(success=False)
        else:
            # Fallback to sync processing
            logger.info(f"Downloading {len(urls):,} items with sync HTTP...")
            with tqdm(total=len(urls), desc="Downloading", unit="items") as pbar:
                for url in urls:
                    fetched_item = self._download_stac_item(url)
                    if fetched_item:
                        all_items.append(fetched_item)
                        self.stats.record_url_processed(success=True)
                    else:
                        self.stats.record_url_processed(success=False)
                    pbar.update(1)

        logger.info(f"Downloaded {len(all_items):,} items successfully")

        if not all_items:
            logger.warning("No items downloaded, nothing to process")
            manifest.status = JobStatus.COMPLETED
            manifest.save(self.storage, self.config.output_catalog)
            self.stats.finish_processing()
            return {}

        # Write partition shards
        logger.info("Writing partition shards...")
        shard_info = self._write_partition_shards(all_items, worker_tag, self.stats)

        # Update manifest
        manifest.download_phase.completed = True
        manifest.download_phase.shards_written = [s["shard_path"] for s in shard_info]
        manifest.download_phase.urls_processed = len(urls)
|
|
1196
|
+
manifest.status = JobStatus.CONSOLIDATING
|
|
1197
|
+
manifest.save(self.storage, self.config.output_catalog)
|
|
1198
|
+
|
|
1199
|
+
# Consolidate shards
|
|
1200
|
+
logger.info("Consolidating partitions...")
|
|
1201
|
+
final_stats = self._consolidate_shards(shard_info, manifest=manifest)
|
|
1202
|
+
|
|
1203
|
+
# Cleanup scratch
|
|
1204
|
+
self._cleanup_scratch(shard_info)
|
|
1205
|
+
|
|
1206
|
+
# Mark complete
|
|
1207
|
+
manifest.status = JobStatus.COMPLETED
|
|
1208
|
+
manifest.consolidation_phase.completed = True
|
|
1209
|
+
manifest.save(self.storage, self.config.output_catalog)
|
|
1210
|
+
|
|
1211
|
+
# Finish statistics
|
|
1212
|
+
self.stats.finish_processing()
|
|
1213
|
+
|
|
1214
|
+
# Update statistics with consolidation results
|
|
1215
|
+
for _partition_key, stats in final_stats.items():
|
|
1216
|
+
self.stats.record_consolidation(
|
|
1217
|
+
new_items=stats.get("new_items", 0),
|
|
1218
|
+
existing_items=stats.get("existing_items", 0),
|
|
1219
|
+
duplicates_removed=stats.get("duplicates_removed", 0),
|
|
1220
|
+
)
|
|
1221
|
+
|
|
1222
|
+
# Generate schema if requested
|
|
1223
|
+
if self.config.generate_schema:
|
|
1224
|
+
logger.info("Generating catalog schema metadata...")
|
|
1225
|
+
schema_gen = schema_generator.SchemaGenerator(self.config, self.grid, self.storage, self.stats)
|
|
1226
|
+
schema_gen.generate_catalog_schema(final_stats, self.config.schema_filename)
|
|
1227
|
+
|
|
1228
|
+
# Summary
|
|
1229
|
+
stats_summary = self.stats.get_summary()
|
|
1230
|
+
logger.info("=" * 80)
|
|
1231
|
+
logger.info("SIMPLE PIPELINE COMPLETED SUCCESSFULLY")
|
|
1232
|
+
logger.info(f"Job ID: {job_id}")
|
|
1233
|
+
logger.info(f"Total partitions: {len(final_stats)}")
|
|
1234
|
+
logger.info(f"Unique granules: {stats_summary['unique_granules']:,}")
|
|
1235
|
+
total_new = sum(s.get("new_items", 0) for s in final_stats.values())
|
|
1236
|
+
logger.info(f"New items: {total_new:,}")
|
|
1237
|
+
logger.info("=" * 80)
|
|
1238
|
+
|
|
1239
|
+
return final_stats
|
|
1240
|
+
|
|
1241
|
+
def _read_input_urls(self) -> list[str]:
|
|
1242
|
+
"""Read URLs from input file(s) using appropriate format reader.
|
|
1243
|
+
|
|
1244
|
+
If input_pattern is configured, discovers all matching files and reads URLs
|
|
1245
|
+
from each one. Otherwise, reads from the single input_file.
|
|
1246
|
+
|
|
1247
|
+
Returns:
|
|
1248
|
+
List of URLs from all discovered files.
|
|
1249
|
+
"""
|
|
1250
|
+
# Check if we should use pattern-based file discovery
|
|
1251
|
+
if self.config.input_pattern:
|
|
1252
|
+
return self._read_urls_from_pattern()
|
|
1253
|
+
|
|
1254
|
+
# Traditional single file mode
|
|
1255
|
+
return self._read_urls_from_file(self.config.input_file)
|
|
1256
|
+
|
|
1257
|
+
def _read_urls_from_file(self, file_path: str) -> list[str]:
|
|
1258
|
+
"""Read URLs from a single file using appropriate format reader.
|
|
1259
|
+
|
|
1260
|
+
Args:
|
|
1261
|
+
file_path: Path to the input file (local or S3).
|
|
1262
|
+
|
|
1263
|
+
Returns:
|
|
1264
|
+
List of URLs extracted from the file.
|
|
1265
|
+
"""
|
|
1266
|
+
ReaderFactory = input_readers.ReaderFactory
|
|
1267
|
+
|
|
1268
|
+
# Determine format
|
|
1269
|
+
if self.config.input_format == "auto":
|
|
1270
|
+
format_name = ReaderFactory.auto_detect_format(file_path)
|
|
1271
|
+
else:
|
|
1272
|
+
format_name = self.config.input_format
|
|
1273
|
+
|
|
1274
|
+
# Get appropriate reader
|
|
1275
|
+
reader = ReaderFactory.get_reader(format_name)
|
|
1276
|
+
|
|
1277
|
+
# Read URLs
|
|
1278
|
+
return reader.read_urls(file_path, self.config.url_column)
|
|
1279
|
+
|
|
1280
|
+
def _read_urls_from_pattern(self) -> list[str]:
|
|
1281
|
+
"""Read URLs from multiple files matching a glob pattern.
|
|
1282
|
+
|
|
1283
|
+
Uses StorageBackend.list_files() to discover files matching the pattern,
|
|
1284
|
+
then reads URLs from each discovered file.
|
|
1285
|
+
|
|
1286
|
+
Returns:
|
|
1287
|
+
Concatenated list of URLs from all matching files.
|
|
1288
|
+
|
|
1289
|
+
Raises:
|
|
1290
|
+
ValueError: If no files match the pattern.
|
|
1291
|
+
"""
|
|
1292
|
+
pattern = self.config.input_pattern
|
|
1293
|
+
all_urls: list[str] = []
|
|
1294
|
+
|
|
1295
|
+
logger.info(f"Discovering files matching pattern: {pattern}")
|
|
1296
|
+
|
|
1297
|
+
# Check if pattern is for S3 or local filesystem
|
|
1298
|
+
if pattern.startswith("s3://"):
|
|
1299
|
+
# Use fsspec for S3 pattern matching
|
|
1300
|
+
all_urls = self._read_s3_pattern(pattern)
|
|
1301
|
+
else:
|
|
1302
|
+
# Use glob for local filesystem
|
|
1303
|
+
matching_files = glob_module.glob(pattern, recursive=True)
|
|
1304
|
+
|
|
1305
|
+
if not matching_files:
|
|
1306
|
+
raise ValueError(f"No files found matching pattern: {pattern}")
|
|
1307
|
+
|
|
1308
|
+
logger.info(f"Found {len(matching_files)} files matching pattern")
|
|
1309
|
+
|
|
1310
|
+
# Read URLs from each file
|
|
1311
|
+
for file_path in matching_files:
|
|
1312
|
+
logger.info(f"Reading URLs from: {file_path}")
|
|
1313
|
+
urls = self._read_urls_from_file(file_path)
|
|
1314
|
+
all_urls.extend(urls)
|
|
1315
|
+
|
|
1316
|
+
logger.info(f"Total URLs read from all files: {len(all_urls)}")
|
|
1317
|
+
return all_urls
|
|
1318
|
+
|
|
1319
|
+
def _read_s3_pattern(self, pattern: str) -> list[str]:
|
|
1320
|
+
"""Read URLs from S3 files matching a glob pattern.
|
|
1321
|
+
|
|
1322
|
+
Args:
|
|
1323
|
+
pattern: S3 path with glob pattern (e.g., s3://bucket/bulk/2020_*.ndjson).
|
|
1324
|
+
|
|
1325
|
+
Returns:
|
|
1326
|
+
Concatenated list of URLs from all matching S3 files.
|
|
1327
|
+
|
|
1328
|
+
Raises:
|
|
1329
|
+
ValueError: If no files match the pattern or fsspec not available.
|
|
1330
|
+
"""
|
|
1331
|
+
if not pattern.startswith("s3://"):
|
|
1332
|
+
raise ValueError(f"S3 pattern must start with 's3://', got: {pattern}")
|
|
1333
|
+
|
|
1334
|
+
# Use fsspec.glob() directly for pattern matching
|
|
1335
|
+
matching_files = fsspec.glob(pattern)
|
|
1336
|
+
|
|
1337
|
+
if not matching_files:
|
|
1338
|
+
raise ValueError(f"No S3 files found matching pattern: {pattern}")
|
|
1339
|
+
|
|
1340
|
+
logger.info(f"Found {len(matching_files)} S3 files matching pattern")
|
|
1341
|
+
|
|
1342
|
+
# Read URLs from each file
|
|
1343
|
+
all_urls: list[str] = []
|
|
1344
|
+
for file_path in matching_files:
|
|
1345
|
+
logger.info(f"Reading URLs from S3: {file_path}")
|
|
1346
|
+
urls = self._read_urls_from_file(file_path)
|
|
1347
|
+
all_urls.extend(urls)
|
|
1348
|
+
|
|
1349
|
+
return all_urls
|
|
1350
|
+
|
|
1351
|
+
def _process_urls_distributed(self, urls: list[str], job_id: str | None = None) -> list[dict[str, Any]]:
|
|
1352
|
+
"""Process URLs in parallel and write to scratch location.
|
|
1353
|
+
|
|
1354
|
+
For distributed processing (Dask), uses serializable worker functions from
|
|
1355
|
+
workers.py. For local processing, uses inline closures that capture self.
|
|
1356
|
+
|
|
1357
|
+
Worker stats are returned alongside shard info and merged into the
|
|
1358
|
+
pipeline's main stats at the end.
|
|
1359
|
+
|
|
1360
|
+
Args:
|
|
1361
|
+
urls: List of STAC item URLs to process.
|
|
1362
|
+
job_id: Job ID for tracking. If None, generates a new UUID.
|
|
1363
|
+
|
|
1364
|
+
Returns:
|
|
1365
|
+
List of shard info dictionaries.
|
|
1366
|
+
"""
|
|
1367
|
+
# Use provided job_id or generate one
|
|
1368
|
+
if job_id is None:
|
|
1369
|
+
job_id = str(uuid.uuid4())
|
|
1370
|
+
logger.info(f"Starting job {job_id[:8]}...")
|
|
1371
|
+
|
|
1372
|
+
# Serialize config for distributed workers
|
|
1373
|
+
config_dict = self.config.to_dict()
|
|
1374
|
+
|
|
1375
|
+
def process_url_batch(
|
|
1376
|
+
url_batch: list[str], worker_id: int, **kwargs: Any
|
|
1377
|
+
) -> tuple[list[dict[str, Any]], IngestionStatistics]:
|
|
1378
|
+
"""Process a batch of URLs on a single worker with async HTTP support.
|
|
1379
|
+
|
|
1380
|
+
This closure captures self and is used for LocalProcessor.
|
|
1381
|
+
For DaskDistributedProcessor, the serializable workers.process_url_batch is used instead.
|
|
1382
|
+
|
|
1383
|
+
Returns:
|
|
1384
|
+
Tuple of (shard_info_list, worker_statistics)
|
|
1385
|
+
"""
|
|
1386
|
+
# Create worker-local statistics collector
|
|
1387
|
+
worker_stats = IngestionStatistics()
|
|
1388
|
+
|
|
1389
|
+
# Choose processing method based on configuration
|
|
1390
|
+
if self.config.enable_concurrent_http and HAS_ASYNC_HTTP and len(url_batch) >= self.config.batch_size:
|
|
1391
|
+
# Use async HTTP processing for better performance
|
|
1392
|
+
shards = self._process_batch_with_async_http(url_batch, worker_id, worker_stats)
|
|
1393
|
+
return (shards, worker_stats)
|
|
1394
|
+
|
|
1395
|
+
# Traditional synchronous processing (original implementation)
|
|
1396
|
+
batch_shards = []
|
|
1397
|
+
current_shard_items = []
|
|
1398
|
+
worker_tag = f"worker-{worker_id}-{uuid.uuid4().hex[:8]}"
|
|
1399
|
+
|
|
1400
|
+
# Add progress bar for this batch
|
|
1401
|
+
with tqdm(total=len(url_batch), desc=f"Worker {worker_id}", unit="urls", leave=False) as pbar:
|
|
1402
|
+
for _idx, url in enumerate(url_batch):
|
|
1403
|
+
try:
|
|
1404
|
+
# Download and parse STAC item
|
|
1405
|
+
item = self._download_stac_item(url)
|
|
1406
|
+
if not item:
|
|
1407
|
+
# Record failed URL
|
|
1408
|
+
worker_stats.record_url_processed(success=False)
|
|
1409
|
+
pbar.update(1)
|
|
1410
|
+
continue
|
|
1411
|
+
|
|
1412
|
+
# Record successful URL
|
|
1413
|
+
worker_stats.record_url_processed(success=True)
|
|
1414
|
+
|
|
1415
|
+
# Add to current shard
|
|
1416
|
+
current_shard_items.append(item)
|
|
1417
|
+
|
|
1418
|
+
# Write shard when it reaches target size
|
|
1419
|
+
if len(current_shard_items) >= self.config.items_per_shard:
|
|
1420
|
+
# Use partition-aware sharding
|
|
1421
|
+
partition_shards = self._write_partition_shards(
|
|
1422
|
+
current_shard_items, worker_tag, worker_stats
|
|
1423
|
+
)
|
|
1424
|
+
batch_shards.extend(partition_shards)
|
|
1425
|
+
current_shard_items = []
|
|
1426
|
+
|
|
1427
|
+
except (
|
|
1428
|
+
requests.exceptions.RequestException,
|
|
1429
|
+
json.JSONDecodeError,
|
|
1430
|
+
OSError,
|
|
1431
|
+
ValueError,
|
|
1432
|
+
) as e:
|
|
1433
|
+
logger.error(f"Error processing URL {url}: {e}")
|
|
1434
|
+
worker_stats.record_url_processed(success=False)
|
|
1435
|
+
pbar.update(1)
|
|
1436
|
+
continue
|
|
1437
|
+
|
|
1438
|
+
pbar.update(1)
|
|
1439
|
+
|
|
1440
|
+
# Write final shard for this batch
|
|
1441
|
+
if current_shard_items:
|
|
1442
|
+
# Use partition-aware sharding
|
|
1443
|
+
partition_shards = self._write_partition_shards(current_shard_items, worker_tag, worker_stats)
|
|
1444
|
+
batch_shards.extend(partition_shards)
|
|
1445
|
+
|
|
1446
|
+
total_items = sum(s["item_count"] for s in batch_shards)
|
|
1447
|
+
logger.info(f"Worker {worker_id} completed: {len(batch_shards)} shards, {total_items} items")
|
|
1448
|
+
return (batch_shards, worker_stats)
|
|
1449
|
+
|
|
1450
|
+
# Distribute URLs to workers
|
|
1451
|
+
url_chunks = self._chunk_urls(urls, self.config.max_workers)
|
|
1452
|
+
|
|
1453
|
+
# Process in parallel - pass config_dict and job_id for Dask
|
|
1454
|
+
all_results = self.processor.process_urls(
|
|
1455
|
+
url_chunks,
|
|
1456
|
+
process_url_batch,
|
|
1457
|
+
config_dict=config_dict,
|
|
1458
|
+
job_id=job_id,
|
|
1459
|
+
)
|
|
1460
|
+
|
|
1461
|
+
# Merge worker statistics into main stats and flatten shard results
|
|
1462
|
+
all_shards = []
|
|
1463
|
+
for result in all_results:
|
|
1464
|
+
# Handle both tuple format (LocalProcessor) and dict format (DaskDistributedProcessor)
|
|
1465
|
+
if isinstance(result, tuple):
|
|
1466
|
+
shards, worker_stats = result
|
|
1467
|
+
self.stats.merge(worker_stats)
|
|
1468
|
+
all_shards.extend(shards)
|
|
1469
|
+
elif isinstance(result, dict):
|
|
1470
|
+
# Dict format from workers.process_url_batch
|
|
1471
|
+
shards = result.get("shards", [])
|
|
1472
|
+
stats_dict = result.get("stats", {})
|
|
1473
|
+
failed_urls = result.get("failed_urls", [])
|
|
1474
|
+
|
|
1475
|
+
# Reconstruct stats from dict
|
|
1476
|
+
worker_stats = IngestionStatistics()
|
|
1477
|
+
for _ in range(stats_dict.get("urls_processed", 0)):
|
|
1478
|
+
worker_stats.record_url_processed(success=True)
|
|
1479
|
+
for _ in range(stats_dict.get("urls_failed", 0)):
|
|
1480
|
+
worker_stats.record_url_processed(success=False)
|
|
1481
|
+
|
|
1482
|
+
self.stats.merge(worker_stats)
|
|
1483
|
+
all_shards.extend(shards)
|
|
1484
|
+
|
|
1485
|
+
if failed_urls:
|
|
1486
|
+
logger.warning(f"Failed URLs: {len(failed_urls)}")
|
|
1487
|
+
else:
|
|
1488
|
+
logger.error(f"Unexpected result type: {type(result)}")
|
|
1489
|
+
|
|
1490
|
+
return all_shards
|
|
1491
|
+
|
|
1492
|
+
def _download_stac_items_batch_async(self, urls: list[str], worker_id: int) -> list[dict[str, Any]]:
|
|
1493
|
+
"""Download STAC items using async HTTP client for improved performance.
|
|
1494
|
+
|
|
1495
|
+
This method provides a high-performance alternative to sequential downloads
|
|
1496
|
+
by utilizing concurrent HTTP requests with connection pooling and rate limiting.
|
|
1497
|
+
|
|
1498
|
+
Args:
|
|
1499
|
+
urls: List of URLs to download
|
|
1500
|
+
worker_id: Worker identifier for logging and progress tracking
|
|
1501
|
+
|
|
1502
|
+
Returns:
|
|
1503
|
+
List of successfully downloaded STAC item dictionaries
|
|
1504
|
+
"""
|
|
1505
|
+
if not HAS_ASYNC_HTTP:
|
|
1506
|
+
# Fallback to synchronous processing
|
|
1507
|
+
logger.warning("Async HTTP not available, falling back to synchronous processing")
|
|
1508
|
+
items = []
|
|
1509
|
+
for url in urls:
|
|
1510
|
+
item = self._download_stac_item(url)
|
|
1511
|
+
if item is not None:
|
|
1512
|
+
items.append(item)
|
|
1513
|
+
return items
|
|
1514
|
+
|
|
1515
|
+
def run_async_download() -> list[dict[str, Any]]:
|
|
1516
|
+
"""Run async download in a new event loop with proper context handling.
|
|
1517
|
+
|
|
1518
|
+
This nested function handles the complexities of running async code from
|
|
1519
|
+
a synchronous context, particularly dealing with existing event loops
|
|
1520
|
+
in environments like Jupyter notebooks or async web frameworks.
|
|
1521
|
+
|
|
1522
|
+
Returns:
|
|
1523
|
+
List of successfully downloaded STAC item dictionaries.
|
|
1524
|
+
|
|
1525
|
+
Note:
|
|
1526
|
+
Automatically detects running event loops and uses thread pool
|
|
1527
|
+
execution when necessary to avoid "RuntimeError: cannot be called
|
|
1528
|
+
from a running event loop" issues.
|
|
1529
|
+
"""
|
|
1530
|
+
try:
|
|
1531
|
+
# Check if we're already in an async context
|
|
1532
|
+
asyncio.current_task()
|
|
1533
|
+
# If we reach here, there's a running event loop
|
|
1534
|
+
# We need to run in a thread to avoid issues
|
|
1535
|
+
import concurrent.futures
|
|
1536
|
+
|
|
1537
|
+
with concurrent.futures.ThreadPoolExecutor() as executor:
|
|
1538
|
+
future = executor.submit(asyncio.run, self._async_download_worker(urls))
|
|
1539
|
+
return future.result()
|
|
1540
|
+
except RuntimeError:
|
|
1541
|
+
# No event loop exists or not in async context, create a new one
|
|
1542
|
+
return asyncio.run(self._async_download_worker(urls))
|
|
1543
|
+
|
|
1544
|
+
try:
|
|
1545
|
+
return run_async_download()
|
|
1546
|
+
except (RuntimeError, asyncio.CancelledError, OSError) as e:
|
|
1547
|
+
logger.error(f"Async download failed for worker {worker_id}, falling back to sync: {e}")
|
|
1548
|
+
# Fallback to synchronous processing
|
|
1549
|
+
items = []
|
|
1550
|
+
for url in urls:
|
|
1551
|
+
item = self._download_stac_item(url)
|
|
1552
|
+
if item is not None:
|
|
1553
|
+
items.append(item)
|
|
1554
|
+
return items
|
|
1555
|
+
|
|
1556
|
+
async def _async_download_worker(self, urls: list[str]) -> list[dict[str, Any]]:
|
|
1557
|
+
"""Internal async worker for downloading STAC items with high concurrency.
|
|
1558
|
+
|
|
1559
|
+
This method serves as the async wrapper that calls the async HTTP client
|
|
1560
|
+
with the current pipeline configuration. It handles the async/await mechanics
|
|
1561
|
+
while maintaining error handling compatibility.
|
|
1562
|
+
|
|
1563
|
+
Args:
|
|
1564
|
+
urls: List of URLs to download concurrently.
|
|
1565
|
+
|
|
1566
|
+
Returns:
|
|
1567
|
+
List of successfully downloaded and parsed STAC item dictionaries.
|
|
1568
|
+
Failed downloads are filtered out and logged separately.
|
|
1569
|
+
|
|
1570
|
+
Raises:
|
|
1571
|
+
RuntimeError: If async HTTP client is not available when called.
|
|
1572
|
+
|
|
1573
|
+
Example:
|
|
1574
|
+
>>> urls = ["https://example.com/item1.json", "https://example.com/item2.json"]
|
|
1575
|
+
>>> items = await self._async_download_worker(urls)
|
|
1576
|
+
>>> print(f"Downloaded {len(items)} items")
|
|
1577
|
+
"""
|
|
1578
|
+
if not HAS_ASYNC_HTTP or download_stac_items_async is None:
|
|
1579
|
+
raise RuntimeError("Async HTTP client not available")
|
|
1580
|
+
|
|
1581
|
+
return await download_stac_items_async(
|
|
1582
|
+
urls=urls,
|
|
1583
|
+
concurrent_requests=self.config.concurrent_requests,
|
|
1584
|
+
connection_pool_size=self.config.connection_pool_size,
|
|
1585
|
+
request_timeout=self.config.request_timeout,
|
|
1586
|
+
retry_attempts=self.config.retry_attempts,
|
|
1587
|
+
retry_delay=self.config.retry_delay,
|
|
1588
|
+
batch_size=self.config.batch_size,
|
|
1589
|
+
)
|
|
1590
|
+
|
|
1591
|
+
def _process_batch_with_async_http(
|
|
1592
|
+
self, url_batch: list[str], worker_id: int, stats: IngestionStatistics
|
|
1593
|
+
) -> list[dict[str, Any]]:
|
|
1594
|
+
"""Process URL batch using async HTTP with adaptive batching and memory management.
|
|
1595
|
+
|
|
1596
|
+
This method processes URLs in smaller concurrent batches while maintaining
|
|
1597
|
+
the same shard writing patterns as the original synchronous version. It provides
|
|
1598
|
+
the bridge between sync pipeline processing and async HTTP downloading.
|
|
1599
|
+
|
|
1600
|
+
The method adapts batch sizes based on configuration and processes URLs in
|
|
1601
|
+
sub-batches to manage memory usage while maximizing concurrency within each batch.
|
|
1602
|
+
|
|
1603
|
+
Args:
|
|
1604
|
+
url_batch: List of URLs to process in this batch.
|
|
1605
|
+
worker_id: Unique identifier for this worker process, used for logging
|
|
1606
|
+
and temporary file naming to avoid conflicts.
stats: IngestionStatistics collector that receives per-URL success/failure counts
    for this batch.
|
|
1607
|
+
|
|
1608
|
+
Returns:
|
|
1609
|
+
List of shard-info dictionaries for the partition shards written from this batch;
|
|
1610
|
+
each entry records shard_path, partition_key, item_count, and worker_id.
|
|
1611
|
+
|
|
1612
|
+
Note:
|
|
1613
|
+
This method maintains the same error handling semantics as the original
|
|
1614
|
+
synchronous version - failed downloads return empty lists and errors are logged.
|
|
1615
|
+
|
|
1616
|
+
Example:
|
|
1617
|
+
>>> batch = ["https://example.com/item1.json", "https://example.com/item2.json"]
|
|
1618
|
+
>>> shards = pipeline._process_batch_with_async_http(batch, worker_id=1, stats=IngestionStatistics())
|
|
1619
|
+
>>> print(f"Processed {len(items)} items from batch")
|
|
1620
|
+
"""
|
|
1621
|
+
batch_shards = []
|
|
1622
|
+
worker_tag = f"worker-{worker_id}-{uuid.uuid4().hex[:8]}"
|
|
1623
|
+
|
|
1624
|
+
# Process URLs in smaller async batches for memory management
|
|
1625
|
+
async_batch_size = min(self.config.batch_size, len(url_batch))
|
|
1626
|
+
|
|
1627
|
+
with tqdm(total=len(url_batch), desc=f"Worker {worker_id} (Async)", unit="urls", leave=False) as pbar:
|
|
1628
|
+
for i in range(0, len(url_batch), async_batch_size):
|
|
1629
|
+
batch_urls = url_batch[i : i + async_batch_size]
|
|
1630
|
+
|
|
1631
|
+
# Download batch concurrently
|
|
1632
|
+
items = self._download_stac_items_batch_async(batch_urls, worker_id)
|
|
1633
|
+
|
|
1634
|
+
# Record URL processing stats (async batch) - use passed stats
|
|
1635
|
+
success_count = len(items)
|
|
1636
|
+
failed_count = len(batch_urls) - success_count
|
|
1637
|
+
for _ in range(success_count):
|
|
1638
|
+
stats.record_url_processed(success=True)
|
|
1639
|
+
for _ in range(failed_count):
|
|
1640
|
+
stats.record_url_processed(success=False)
|
|
1641
|
+
|
|
1642
|
+
# Process items into shards
|
|
1643
|
+
if items:
|
|
1644
|
+
partition_shards = self._write_partition_shards(items, worker_tag, stats)
|
|
1645
|
+
batch_shards.extend(partition_shards)
|
|
1646
|
+
|
|
1647
|
+
pbar.update(len(batch_urls))
|
|
1648
|
+
|
|
1649
|
+
# Log completion stats
|
|
1650
|
+
total_items = sum(shard.get("item_count", 0) for shard in batch_shards)
|
|
1651
|
+
logger.info(f"Worker {worker_id} completed (async): {len(batch_shards)} shards, {total_items} items")
|
|
1652
|
+
|
|
1653
|
+
return batch_shards
|
|
1654
|
+
|
|
1655
|
+
def _validate_and_fix_items(self, items: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
1656
|
+
"""Validate STAC items and optionally fix geometry/bbox issues.
|
|
1657
|
+
|
|
1658
|
+
Performs validation on each item including:
|
|
1659
|
+
- Geometry validity check (self-intersection, ring orientation)
|
|
1660
|
+
- Bbox-geometry consistency check
|
|
1661
|
+
- Optionally fixes invalid geometries and mismatched bboxes
|
|
1662
|
+
|
|
1663
|
+
Args:
|
|
1664
|
+
items: List of STAC item dictionaries
|
|
1665
|
+
|
|
1666
|
+
Returns:
|
|
1667
|
+
List of validated (and optionally corrected) STAC items
|
|
1668
|
+
"""
|
|
1669
|
+
from .validation import validate_stac_item
|
|
1670
|
+
|
|
1671
|
+
validated_items = []
|
|
1672
|
+
validation_stats = {"total": len(items), "warnings": 0, "fixed_geometry": 0, "fixed_bbox": 0}
|
|
1673
|
+
|
|
1674
|
+
for item in items:
|
|
1675
|
+
result, corrected_item = validate_stac_item(
|
|
1676
|
+
item,
|
|
1677
|
+
fix_geometry=self.config.fix_invalid_geometry,
|
|
1678
|
+
bbox_tolerance=self.config.bbox_tolerance,
|
|
1679
|
+
)
|
|
1680
|
+
|
|
1681
|
+
# Track statistics
|
|
1682
|
+
if result.warnings:
|
|
1683
|
+
validation_stats["warnings"] += len(result.warnings)
|
|
1684
|
+
if self.config.log_validation_warnings:
|
|
1685
|
+
for warning in result.warnings:
|
|
1686
|
+
logger.debug(f"Validation warning for item {item.get('id')}: {warning}")
|
|
1687
|
+
|
|
1688
|
+
if result.metadata.get("geometry_fixed"):
|
|
1689
|
+
validation_stats["fixed_geometry"] += 1
|
|
1690
|
+
if result.metadata.get("bbox_corrected"):
|
|
1691
|
+
validation_stats["fixed_bbox"] += 1
|
|
1692
|
+
|
|
1693
|
+
# Use corrected item if available, otherwise use original
|
|
1694
|
+
if corrected_item is not None:
|
|
1695
|
+
validated_items.append(corrected_item)
|
|
1696
|
+
else:
|
|
1697
|
+
validated_items.append(item)
|
|
1698
|
+
|
|
1699
|
+
# Log summary
|
|
1700
|
+
if validation_stats["warnings"] > 0 or validation_stats["fixed_geometry"] > 0:
|
|
1701
|
+
logger.debug(
|
|
1702
|
+
f"Validation: {validation_stats['total']} items, "
|
|
1703
|
+
f"{validation_stats['warnings']} warnings, "
|
|
1704
|
+
f"{validation_stats['fixed_geometry']} geometries fixed, "
|
|
1705
|
+
f"{validation_stats['fixed_bbox']} bboxes corrected"
|
|
1706
|
+
)
|
|
1707
|
+
|
|
1708
|
+
return validated_items
|
|
1709
|
+
|
|
1710
|
+
def _write_partition_shards(
|
|
1711
|
+
self,
|
|
1712
|
+
items: list[dict[str, Any]],
|
|
1713
|
+
worker_id: str,
|
|
1714
|
+
stats: IngestionStatistics | None = None,
|
|
1715
|
+
) -> list[dict[str, Any]]:
|
|
1716
|
+
"""Group items by partition and write partition-aware shards.
|
|
1717
|
+
|
|
1718
|
+
Also records comprehensive statistics for each item including:
|
|
1719
|
+
- Unique item tracking via HyperLogLog
|
|
1720
|
+
- Spanning item detection and tile counts
|
|
1721
|
+
- Spatial/temporal distribution
|
|
1722
|
+
- Data quality metrics
|
|
1723
|
+
|
|
1724
|
+
Args:
|
|
1725
|
+
items: List of STAC items to process
|
|
1726
|
+
worker_id: Unique identifier for this worker
|
|
1727
|
+
stats: Optional IngestionStatistics to record to. If None, uses self.stats.
|
|
1728
|
+
"""
|
|
1729
|
+
# Use provided stats or fall back to pipeline's stats
|
|
1730
|
+
target_stats = stats if stats is not None else self.stats
|
|
1731
|
+
|
|
1732
|
+
# Validate and optionally fix items if validation is enabled
|
|
1733
|
+
if self.config.enable_validation:
|
|
1734
|
+
items = self._validate_and_fix_items(items)
|
|
1735
|
+
|
|
1736
|
+
# Group items by partition key while recording statistics
|
|
1737
|
+
partition_groups: dict[str, list[dict[str, Any]]] = {}
|
|
1738
|
+
|
|
1739
|
+
for item in items:
|
|
1740
|
+
# Get geometry and compute tiles with spanning detection
|
|
1741
|
+
geom = item.get("geometry")
|
|
1742
|
+
mission = self._extract_mission(item)
|
|
1743
|
+
|
|
1744
|
+
if not geom:
|
|
1745
|
+
tiles: list[str] = []
|
|
1746
|
+
is_spanning = False
|
|
1747
|
+
routed_to_global = False
|
|
1748
|
+
else:
|
|
1749
|
+
tiles, is_spanning = self.grid.tiles_for_geometry_with_spanning_detection(geom)
|
|
1750
|
+
threshold = self.config.global_partition_threshold
|
|
1751
|
+
routed_to_global = self.config.enable_global_partitioning and len(tiles) > threshold
|
|
1752
|
+
|
|
1753
|
+
# Record item statistics
|
|
1754
|
+
target_stats.record_item(
|
|
1755
|
+
item=item,
|
|
1756
|
+
tiles=tiles,
|
|
1757
|
+
is_spanning=is_spanning,
|
|
1758
|
+
routed_to_global=routed_to_global,
|
|
1759
|
+
mission=mission,
|
|
1760
|
+
)
|
|
1761
|
+
|
|
1762
|
+
# Compute partition key and group
|
|
1763
|
+
partition_key = self._compute_partition_key(item)
|
|
1764
|
+
if partition_key not in partition_groups:
|
|
1765
|
+
partition_groups[partition_key] = []
|
|
1766
|
+
partition_groups[partition_key].append(item)
|
|
1767
|
+
|
|
1768
|
+
# Write shard for each partition
|
|
1769
|
+
shard_info = []
|
|
1770
|
+
for partition_key, partition_items in partition_groups.items():
|
|
1771
|
+
if not partition_items:
|
|
1772
|
+
continue
|
|
1773
|
+
|
|
1774
|
+
# Convert to GeoDataFrame using engine
|
|
1775
|
+
gdf = self.engine.items_to_geodataframe(partition_items)
|
|
1776
|
+
|
|
1777
|
+
# Sort items within shard
|
|
1778
|
+
if self.config.sort_key in gdf.columns:
|
|
1779
|
+
gdf = gdf.sort_values(self.config.sort_key, ascending=self.config.sort_ascending)
|
|
1780
|
+
|
|
1781
|
+
# Determine file extension based on output format
|
|
1782
|
+
file_ext = ".ndjson" if self.config.output_format == "ndjson" else ".parquet"
|
|
1783
|
+
shard_path = f"{self.config.scratch_location}/shards/{partition_key}/{worker_id}{file_ext}"
|
|
1784
|
+
|
|
1785
|
+
self.scratch_storage.makedirs(Path(shard_path).parent)
|
|
1786
|
+
self._write_partition_shard(gdf, shard_path)
|
|
1787
|
+
|
|
1788
|
+
shard_info.append(
|
|
1789
|
+
{
|
|
1790
|
+
"shard_path": shard_path,
|
|
1791
|
+
"partition_key": partition_key,
|
|
1792
|
+
"item_count": len(partition_items),
|
|
1793
|
+
"worker_id": worker_id,
|
|
1794
|
+
}
|
|
1795
|
+
)
|
|
1796
|
+
|
|
1797
|
+
return shard_info
|
|
1798
|
+
|
|
1799
|
+
def _write_partition_shard(self, gdf, shard_path: str):
|
|
1800
|
+
"""Write partition shard in configured format."""
|
|
1801
|
+
|
|
1802
|
+
if self.config.output_format == "ndjson":
|
|
1803
|
+
# Convert to NDJSON format
|
|
1804
|
+
self._write_ndjson_shard(gdf, shard_path)
|
|
1805
|
+
else:
|
|
1806
|
+
# Default GeoParquet format
|
|
1807
|
+
with self.scratch_storage.open(shard_path, "wb") as f:
|
|
1808
|
+
gdf.to_parquet(f, index=False, compression="snappy")
|
|
1809
|
+
|
|
1810
|
+
def _write_ndjson_shard(self, gdf, shard_path: str):
|
|
1811
|
+
"""Write GeoDataFrame as NDJSON format."""
|
|
1812
|
+
import orjson
|
|
1813
|
+
|
|
1814
|
+
try:
|
|
1815
|
+
# Use engine to convert GeoDataFrame back to STAC items
|
|
1816
|
+
features = self.engine.geodataframe_to_items(gdf)
|
|
1817
|
+
|
|
1818
|
+
with self.scratch_storage.open(shard_path, "wb") as f:
|
|
1819
|
+
for item in features:
|
|
1820
|
+
f.write(orjson.dumps(item) + b"\n")
|
|
1821
|
+
|
|
1822
|
+
except (TypeError, ValueError, OSError) as e:
|
|
1823
|
+
logger.warning(f"Failed to convert to STAC format, using raw GeoJSON: {e}")
|
|
1824
|
+
|
|
1825
|
+
# Fallback: write as GeoJSON features
|
|
1826
|
+
with self.scratch_storage.open(shard_path, "wb") as f:
|
|
1827
|
+
for _, row in gdf.iterrows():
|
|
1828
|
+
# Convert each row to a GeoJSON-like feature
|
|
1829
|
+
geom = row.get("geometry")
|
|
1830
|
+
if geom is not None and hasattr(geom, "__geo_interface__"):
|
|
1831
|
+
geometry_dict = geom.__geo_interface__
|
|
1832
|
+
else:
|
|
1833
|
+
geometry_dict = None
|
|
1834
|
+
|
|
1835
|
+
feature = {
|
|
1836
|
+
"type": "Feature",
|
|
1837
|
+
"geometry": geometry_dict,
|
|
1838
|
+
"properties": {k: v for k, v in row.items() if k != "geometry"},
|
|
1839
|
+
}
|
|
1840
|
+
f.write(orjson.dumps(feature) + b"\n")
|
|
1841
|
+
|
|
1842
|
+
def _consolidate_shards(
|
|
1843
|
+
self,
|
|
1844
|
+
shard_info: list[dict[str, Any]],
|
|
1845
|
+
skip_partitions: set[str] | None = None,
|
|
1846
|
+
manifest: JobManifest | None = None,
|
|
1847
|
+
) -> dict[str, dict[str, int]]:
|
|
1848
|
+
"""Consolidate worker shards into final partitioned catalog.
|
|
1849
|
+
|
|
1850
|
+
Args:
|
|
1851
|
+
shard_info: List of shard info dictionaries.
|
|
1852
|
+
skip_partitions: Set of partition keys to skip (for resume).
|
|
1853
|
+
manifest: Job manifest to update as partitions complete.
|
|
1854
|
+
|
|
1855
|
+
Returns:
|
|
1856
|
+
Dictionary mapping partition keys to consolidation stats.
|
|
1857
|
+
"""
|
|
1858
|
+
if skip_partitions is None:
|
|
1859
|
+
skip_partitions = set()
|
|
1860
|
+
|
|
1861
|
+
def consolidate_partition_shards(partition_key: str, shard_paths: list[str], **kwargs: Any) -> dict[str, Any]:
|
|
1862
|
+
"""Consolidate all shards for a single partition, merging with existing data."""
|
|
1863
|
+
return _consolidate_partition(partition_key, shard_paths)
|
|
1864
|
+
|
|
1865
|
+
def _consolidate_partition(partition_key: str, shard_paths: list[str]) -> dict[str, Any]:
|
|
1866
|
+
"""Efficient consolidation approach using local staging for atomic S3 operations."""
|
|
1867
|
+
import io
|
|
1868
|
+
import tempfile
|
|
1869
|
+
from pathlib import Path
|
|
1870
|
+
|
|
1871
|
+
import pyarrow.parquet as pq
|
|
1872
|
+
|
|
1873
|
+
final_path = self._get_final_partition_path(partition_key)
|
|
1874
|
+
existing_count = 0
|
|
1875
|
+
new_count = 0
|
|
1876
|
+
|
|
1877
|
+
# Use temporary directory for local staging
|
|
1878
|
+
with tempfile.TemporaryDirectory(dir=self.config.temp_dir_location) as temp_dir:
|
|
1879
|
+
temp_existing_path = Path(temp_dir) / "existing.parquet"
|
|
1880
|
+
temp_merged_path = Path(temp_dir) / "merged.parquet"
|
|
1881
|
+
|
|
1882
|
+
all_items = []
|
|
1883
|
+
|
|
1884
|
+
# Step 1: Download existing partition to local temp (if exists)
|
|
1885
|
+
if self.storage.exists(final_path):
|
|
1886
|
+
try:
|
|
1887
|
+
logger.info(f"Partition {partition_key}: downloading existing data to local staging")
|
|
1888
|
+
|
|
1889
|
+
# Download to temporary file
|
|
1890
|
+
with self.storage.open(final_path, "rb") as remote_f:
|
|
1891
|
+
with open(temp_existing_path, "wb") as local_f:
|
|
1892
|
+
# Stream in chunks to avoid memory issues
|
|
1893
|
+
chunk_size = self.config.max_memory_per_partition_mb * 1024 * 1024 // 10
|
|
1894
|
+
while True:
|
|
1895
|
+
chunk = remote_f.read(chunk_size)
|
|
1896
|
+
if not chunk:
|
|
1897
|
+
break
|
|
1898
|
+
local_f.write(chunk)
|
|
1899
|
+
|
|
1900
|
+
# Read existing data from temp file
|
|
1901
|
+
table = pq.read_table(temp_existing_path)
|
|
1902
|
+
df = table.to_pandas()
|
|
1903
|
+
existing_gdf = gpd.GeoDataFrame(df)
|
|
1904
|
+
existing_count = len(existing_gdf)
|
|
1905
|
+
all_items.append(existing_gdf)
|
|
1906
|
+
|
|
1907
|
+
except (OSError, ValueError, RuntimeError) as e:
|
|
1908
|
+
logger.error(f"Error downloading existing partition {final_path}: {e}")
|
|
1909
|
+
|
|
1910
|
+
# Step 2: Read all new shards for this partition (using streaming)
|
|
1911
|
+
for shard_path in shard_paths:
|
|
1912
|
+
try:
|
|
1913
|
+
with self.scratch_storage.open(shard_path, "rb") as f:
|
|
1914
|
+
binary_data = f.read()
|
|
1915
|
+
table = pq.read_table(io.BytesIO(binary_data))
|
|
1916
|
+
df = table.to_pandas()
|
|
1917
|
+
gdf = gpd.GeoDataFrame(df)
|
|
1918
|
+
all_items.append(gdf)
|
|
1919
|
+
|
|
1920
|
+
except (OSError, ValueError, RuntimeError) as e:
|
|
1921
|
+
logger.error(f"Error reading shard {shard_path}: {e}")
|
|
1922
|
+
continue
|
|
1923
|
+
|
|
1924
|
+
if not all_items:
|
|
1925
|
+
return {"partition": partition_key, "item_count": 0, "existing_count": 0, "new_count": 0}
|
|
1926
|
+
|
|
1927
|
+
# Step 3: Merge data using chunked processing if enabled
|
|
1928
|
+
if self.config.enable_streaming_merge and len(all_items) > 1:
|
|
1929
|
+
# Process in batches to manage memory
|
|
1930
|
+
batch_size = max(1, self.config.max_memory_per_partition_mb // 100) # Conservative estimate
|
|
1931
|
+
merged = pd.DataFrame()
|
|
1932
|
+
|
|
1933
|
+
for i in range(0, len(all_items), batch_size):
|
|
1934
|
+
batch = all_items[i : i + batch_size]
|
|
1935
|
+
batch_merged = pd.concat(batch, ignore_index=True)
|
|
1936
|
+
|
|
1937
|
+
if len(merged) == 0:
|
|
1938
|
+
merged = batch_merged
|
|
1939
|
+
else:
|
|
1940
|
+
# Merge with existing and deduplicate incrementally
|
|
1941
|
+
merged = pd.concat([merged, batch_merged], ignore_index=True)
|
|
1942
|
+
merged = merged.drop_duplicates(subset=["id"], keep="last")
|
|
1943
|
+
else:
|
|
1944
|
+
# Standard merge for smaller datasets
|
|
1945
|
+
merged = pd.concat(all_items, ignore_index=True)
|
|
1946
|
+
|
|
1947
|
+
# Step 4: Deduplicate (keep="last" to prefer new data over existing)
|
|
1948
|
+
original_count = len(merged)
|
|
1949
|
+
merged = merged.drop_duplicates(subset=["id"], keep="last")
|
|
1950
|
+
duplicates_removed = original_count - len(merged)
|
|
1951
|
+
|
|
1952
|
+
# Step 5: Final sort
|
|
1953
|
+
if self.config.sort_key in merged.columns:
|
|
1954
|
+
merged = merged.sort_values(self.config.sort_key, ascending=self.config.sort_ascending)
|
|
1955
|
+
|
|
1956
|
+
# Step 6: Write merged data to local temp file
|
|
1957
|
+
merged_gdf = gpd.GeoDataFrame(merged)
|
|
1958
|
+
merged_gdf.to_parquet(temp_merged_path, index=False, compression="snappy")
|
|
1959
|
+
|
|
1960
|
+
# Step 7: Atomic upload to final location
|
|
1961
|
+
if final_path.startswith("s3://"):
|
|
1962
|
+
# For S3, use storage.upload which should handle large files appropriately
|
|
1963
|
+
self.storage.upload(str(temp_merged_path), final_path)
|
|
1964
|
+
else:
|
|
1965
|
+
# Standard atomic write
|
|
1966
|
+
self._write_final_partition(merged_gdf, final_path)
|
|
1967
|
+
|
|
1968
|
+
new_count = len(merged) - existing_count
|
|
1969
|
+
logger.info(
|
|
1970
|
+
f"Partition {partition_key}: {existing_count} existing + "
|
|
1971
|
+
f"{new_count} new = {len(merged)} total "
|
|
1972
|
+
f"({duplicates_removed} duplicates removed) [efficient]"
|
|
1973
|
+
)
|
|
1974
|
+
|
|
1975
|
+
return {
|
|
1976
|
+
"partition": partition_key,
|
|
1977
|
+
"item_count": len(merged),
|
|
1978
|
+
"existing_count": existing_count,
|
|
1979
|
+
"new_count": new_count,
|
|
1980
|
+
"duplicates_removed": duplicates_removed,
|
|
1981
|
+
"final_path": final_path,
|
|
1982
|
+
}
|
|
1983
|
+
|
|
1984
|
+
# Group shards by their target partition
|
|
1985
|
+
partition_shards = self._group_shards_by_partition(shard_info)
|
|
1986
|
+
logger.info(f"Grouped shards into {len(partition_shards)} partitions")
|
|
1987
|
+
|
|
1988
|
+
# Filter out already-completed partitions (for resume)
|
|
1989
|
+
if skip_partitions:
|
|
1990
|
+
original_count = len(partition_shards)
|
|
1991
|
+
partition_shards = {k: v for k, v in partition_shards.items() if k not in skip_partitions}
|
|
1992
|
+
skipped_count = original_count - len(partition_shards)
|
|
1993
|
+
if skipped_count > 0:
|
|
1994
|
+
logger.info(f"Skipping {skipped_count} already-completed partitions")
|
|
1995
|
+
|
|
1996
|
+
# Consolidate partitions in parallel - pass config_dict for Dask
|
|
1997
|
+
config_dict = self.config.to_dict()
|
|
1998
|
+
consolidation_results = self.processor.consolidate_shards(
|
|
1999
|
+
list(partition_shards.items()),
|
|
2000
|
+
consolidate_partition_shards,
|
|
2001
|
+
config_dict=config_dict,
|
|
2002
|
+
)
|
|
2003
|
+
|
|
2004
|
+
# Determine checkpoint strategy
|
|
2005
|
+
use_time_checkpoint = self.config.checkpoint_interval_seconds > 0
|
|
2006
|
+
use_count_checkpoint = self.config.checkpoint_partition_count > 0
|
|
2007
|
+
# Default to time-based (30s) if neither is set
|
|
2008
|
+
if not use_time_checkpoint and not use_count_checkpoint:
|
|
2009
|
+
use_time_checkpoint = True
|
|
2010
|
+
checkpoint_interval = 30
|
|
2011
|
+
else:
|
|
2012
|
+
checkpoint_interval = self.config.checkpoint_interval_seconds
|
|
2013
|
+
|
|
2014
|
+
last_checkpoint_time = time.time()
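# Checkpoint behaviour implied by the settings above (illustrative values):
#   checkpoint_interval_seconds=60, checkpoint_partition_count=0  -> save manifest ~every 60s
#   checkpoint_interval_seconds=0,  checkpoint_partition_count=25 -> save every 25 partitions
#   both left at 0                                                -> time-based, every 30s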
|
|
2015
|
+
|
|
2016
|
+
# Convert to stats - handle both dict formats
|
|
2017
|
+
final_stats = {}
|
|
2018
|
+
for result in consolidation_results:
|
|
2019
|
+
if result.get("item_count", 0) > 0:
|
|
2020
|
+
# Handle both "partition" key (from local) and "partition_key" (from workers.py)
|
|
2021
|
+
partition = result.get("partition") or result.get("partition_key", "unknown")
|
|
2022
|
+
final_stats[partition] = {
|
|
2023
|
+
"total_items": result.get("item_count", 0),
|
|
2024
|
+
"existing_items": result.get("existing_count", 0),
|
|
2025
|
+
"new_items": result.get("new_count", 0),
|
|
2026
|
+
"duplicates_removed": result.get("duplicates_removed", 0),
|
|
2027
|
+
}
|
|
2028
|
+
|
|
2029
|
+
# Update manifest if provided (for checkpointing)
|
|
2030
|
+
if manifest is not None:
|
|
2031
|
+
manifest.consolidation_phase.completed_partitions.append(partition)
|
|
2032
|
+
manifest.consolidation_phase.partitions_completed += 1
|
|
2033
|
+
|
|
2034
|
+
# Determine if we should checkpoint
|
|
2035
|
+
should_checkpoint = False
|
|
2036
|
+
current_time = time.time()
|
|
2037
|
+
|
|
2038
|
+
if use_time_checkpoint and (current_time - last_checkpoint_time) >= checkpoint_interval:
|
|
2039
|
+
should_checkpoint = True
|
|
2040
|
+
last_checkpoint_time = current_time
|
|
2041
|
+
elif (
|
|
2042
|
+
use_count_checkpoint
|
|
2043
|
+
and manifest.consolidation_phase.partitions_completed % self.config.checkpoint_partition_count
|
|
2044
|
+
== 0
|
|
2045
|
+
):
|
|
2046
|
+
should_checkpoint = True
|
|
2047
|
+
|
|
2048
|
+
if should_checkpoint:
|
|
2049
|
+
manifest.save(self.storage, self.config.output_catalog)
|
|
2050
|
+
|
|
2051
|
+
return final_stats
|
|
2052
|
+
|
|
2053
|
+
def _group_shards_by_partition(self, shard_info: list[dict[str, Any]]) -> dict[str, list[str]]:
|
|
2054
|
+
"""Group shard paths by their target partition."""
|
|
2055
|
+
partition_shards: dict[str, list[str]] = defaultdict(list)
|
|
2056
|
+
|
|
2057
|
+
for info in shard_info:
|
|
2058
|
+
shard_path = info["shard_path"]
|
|
2059
|
+
|
|
2060
|
+
if "partition_key" in info:
|
|
2061
|
+
# Use partition key from shard info
|
|
2062
|
+
partition_key = info["partition_key"]
|
|
2063
|
+
partition_shards[partition_key].append(shard_path)
|
|
2064
|
+
else:
|
|
2065
|
+
# Fallback: read shard to determine partitions (for backward compatibility)
|
|
2066
|
+
try:
|
|
2067
|
+
with self.scratch_storage.open(shard_path, "rb") as f:
|
|
2068
|
+
import io
|
|
2069
|
+
|
|
2070
|
+
import pyarrow.parquet as pq
|
|
2071
|
+
|
|
2072
|
+
binary_data = f.read()
|
|
2073
|
+
table = pq.read_table(io.BytesIO(binary_data))
|
|
2074
|
+
df = table.to_pandas()
|
|
2075
|
+
gdf = gpd.GeoDataFrame(df)
|
|
2076
|
+
|
|
2077
|
+
if len(gdf) == 0:
|
|
2078
|
+
continue
|
|
2079
|
+
|
|
2080
|
+
# Group items within shard by partition
|
|
2081
|
+
for _idx, row in gdf.iterrows():
|
|
2082
|
+
item = row.to_dict()
|
|
2083
|
+
partition_key = self._compute_partition_key(item)
|
|
2084
|
+
partition_shards[partition_key].append(shard_path)
|
|
2085
|
+
|
|
2086
|
+
# Remove duplicates (same shard might have items for same partition)
|
|
2087
|
+
# Dedupe in place, keeping the defaultdict type so later shards can still append
for k in list(partition_shards):
    partition_shards[k] = list(set(partition_shards[k]))
|
|
2090
|
+
|
|
2091
|
+
except (OSError, ValueError, RuntimeError, AttributeError, TypeError) as e:
|
|
2092
|
+
logger.error(f"Error processing shard {shard_path}: {e}")
|
|
2093
|
+
continue
|
|
2094
|
+
|
|
2095
|
+
return dict(partition_shards)
|
|
2096
|
+
|
|
2097
|
+
def _extract_mission(self, item: dict[str, Any]) -> str:
|
|
2098
|
+
"""Extract mission/dataset identifier from STAC item."""
|
|
2099
|
+
props = item.get("properties", {})
|
|
2100
|
+
|
|
2101
|
+
# Primary: Extract from dataset_id field
|
|
2102
|
+
dataset_id = props.get(self.config.mission_field, "")
|
|
2103
|
+
if dataset_id:
|
|
2104
|
+
return self._sanitize_mission_name(str(dataset_id))
|
|
2105
|
+
|
|
2106
|
+
# Fallback to collection if dataset_id not found
|
|
2107
|
+
collection = item.get("collection", "") or props.get("collection", "")
|
|
2108
|
+
if collection:
|
|
2109
|
+
return self._sanitize_mission_name(str(collection))
|
|
2110
|
+
|
|
2111
|
+
# Final fallback
|
|
2112
|
+
return "unknown_mission"
|
|
2113
|
+
|
|
2114
|
+
def _sanitize_mission_name(self, name: str) -> str:
|
|
2115
|
+
"""Sanitize mission name for filesystem compatibility."""
|
|
2116
|
+
import re
|
|
2117
|
+
|
|
2118
|
+
# Replace invalid filesystem chars with underscores, convert to lowercase
|
|
2119
|
+
sanitized = re.sub(r"[^\w]", "_", name.lower().strip())
|
|
2120
|
+
# Remove consecutive underscores
|
|
2121
|
+
sanitized = re.sub(r"_+", "_", sanitized)
|
|
2122
|
+
# Remove leading/trailing underscores
|
|
2123
|
+
return sanitized.strip("_") or "unnamed"
|
|
2124
|
+
|
|
2125
|
+
def _compute_partition_key(self, item: dict[str, Any]) -> str:
|
|
2126
|
+
"""Compute partition key with Hive-style temporal partitioning.
|
|
2127
|
+
|
|
2128
|
+
Format: {dataset_id}/partition=h3/level={resolution}/{h3_cell_id}/year=YYYY/month=MM[/day=DD]
|
|
2129
|
+
|
|
2130
|
+
The temporal partition depth is controlled by config.temporal_bin:
|
|
2131
|
+
- year: year=YYYY
|
|
2132
|
+
- month: year=YYYY/month=MM
|
|
2133
|
+
- day: year=YYYY/month=MM/day=DD
|
|
2134
|
+
|
|
2135
|
+
This Hive-style naming enables directory-level pruning in DuckDB, Athena,
|
|
2136
|
+
Spark, and other query engines for optimal query performance.
|
|
2137
|
+
"""
|
|
2138
|
+
# Extract mission (dataset identifier)
|
|
2139
|
+
mission = self._extract_mission(item)
|
|
2140
|
+
|
|
2141
|
+
# Extract spatial information
|
|
2142
|
+
geom = item.get("geometry")
|
|
2143
|
+
if not geom:
|
|
2144
|
+
h3_cell = "unknown"
|
|
2145
|
+
else:
|
|
2146
|
+
# Use the new method with spanning detection
|
|
2147
|
+
tiles, is_spanning = self.grid.tiles_for_geometry_with_spanning_detection(geom)
|
|
2148
|
+
|
|
2149
|
+
# Get resolution-specific threshold
|
|
2150
|
+
threshold = self.config.global_partition_threshold
|
|
2151
|
+
|
|
2152
|
+
# Apply global partitioning logic with resolution-specific threshold
|
|
2153
|
+
if self.config.enable_global_partitioning and len(tiles) > threshold:
|
|
2154
|
+
# Multi-cell geometry - route to global partition
|
|
2155
|
+
h3_cell = "global"
|
|
2156
|
+
else:
|
|
2157
|
+
# Single-cell geometry - use specific tile
|
|
2158
|
+
h3_cell = tiles[0] if tiles else "unknown"
|
|
2159
|
+
|
|
2160
|
+
# Extract temporal information using Hive-style partitioning
|
|
2161
|
+
temporal_parts = self._extract_temporal_hive_parts(item)
|
|
2162
|
+
|
|
2163
|
+
# Build partition path
|
|
2164
|
+
grid_type = self.config.grid_system # Should be 'h3'
|
|
2165
|
+
resolution = self.config.grid_resolution
|
|
2166
|
+
|
|
2167
|
+
return f"{mission}/partition={grid_type}/level={resolution}/{h3_cell}/{temporal_parts}"
|
|
2168
|
+
|
|
2169
|
+
def _extract_temporal_hive_parts(self, item: dict[str, Any]) -> str:
|
|
2170
|
+
"""Extract Hive-style temporal partition parts from STAC item.
|
|
2171
|
+
|
|
2172
|
+
Generates directory path fragments using Hive partition naming convention
|
|
2173
|
+
for optimal query pruning in DuckDB, Athena, Spark, and other query engines.
|
|
2174
|
+
|
|
2175
|
+
Returns:
|
|
2176
|
+
Path fragment based on temporal_bin configuration:
|
|
2177
|
+
- year: "year=2024"
|
|
2178
|
+
- month: "year=2024/month=01"
|
|
2179
|
+
- day: "year=2024/month=01/day=15"
|
|
2180
|
+
- unknown: "unknown" (for missing/invalid datetimes)
|
|
2181
|
+
"""
|
|
2182
|
+
props = item.get("properties", {})
|
|
2183
|
+
dt_str = props.get("datetime")
|
|
2184
|
+
|
|
2185
|
+
if not dt_str:
|
|
2186
|
+
return "unknown"
|
|
2187
|
+
|
|
2188
|
+
try:
|
|
2189
|
+
dt = pd.to_datetime(dt_str)
|
|
2190
|
+
if self.config.temporal_bin == "year":
|
|
2191
|
+
return f"year={dt.year}"
|
|
2192
|
+
elif self.config.temporal_bin == "month":
|
|
2193
|
+
return f"year={dt.year}/month={dt.month:02d}"
|
|
2194
|
+
elif self.config.temporal_bin == "day":
|
|
2195
|
+
return f"year={dt.year}/month={dt.month:02d}/day={dt.day:02d}"
|
|
2196
|
+
else:
|
|
2197
|
+
return "unknown"
|
|
2198
|
+
except (ValueError, TypeError):
|
|
2199
|
+
return "unknown"
|
|
2200
|
+
|
|
2201
|
+
def _get_final_partition_path(self, partition_key: str) -> str:
|
|
2202
|
+
"""Get final path for partition structure with Hive-style naming.
|
|
2203
|
+
|
|
2204
|
+
The partition_key already contains Hive-style temporal directories
|
|
2205
|
+
(e.g., mission/partition=h3/level=2/cell/year=2024/month=01).
|
|
2206
|
+
This method appends the final filename (items.parquet or items.ndjson).
|
|
2207
|
+
"""
|
|
2208
|
+
# Determine file extension based on output format
|
|
2209
|
+
if self.config.output_format == "ndjson":
|
|
2210
|
+
file_extension = ".ndjson"
|
|
2211
|
+
else:
|
|
2212
|
+
file_extension = ".parquet" # Default to .parquet for geoparquet
|
|
2213
|
+
|
|
2214
|
+
return f"{self.config.output_catalog}/{partition_key}/items{file_extension}"
|
|
2215
|
+
|
|
2216
|
+
def _write_final_partition(self, gdf: gpd.GeoDataFrame, path: str):
|
|
2217
|
+
"""Write final partition with atomic safety."""
|
|
2218
|
+
import tempfile
|
|
2219
|
+
|
|
2220
|
+
if path.startswith("s3://"):
|
|
2221
|
+
# For S3, write to temporary local file then upload
|
|
2222
|
+
with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
|
2223
|
+
tmp_path = tmp.name
|
|
2224
|
+
gdf.to_parquet(tmp_path, index=False, compression="snappy")
|
|
2225
|
+
self.storage.upload(tmp_path, path)
|
|
2226
|
+
Path(tmp_path).unlink()
|
|
2227
|
+
else:
|
|
2228
|
+
# Local filesystem - atomic write
|
|
2229
|
+
tmp_path = f"{path}.tmp-{uuid.uuid4().hex}"
|
|
2230
|
+
self.storage.makedirs(Path(path).parent)
|
|
2231
|
+
|
|
2232
|
+
with self.storage.open(tmp_path, "wb") as f:
|
|
2233
|
+
gdf.to_parquet(f, index=False, compression="snappy")
|
|
2234
|
+
|
|
2235
|
+
self.storage.rename(tmp_path, path)
|
|
2236
|
+
|
|
2237
|
+
def _download_stac_item(self, url: str) -> dict[str, Any] | None:
|
|
2238
|
+
"""Download and parse a STAC item from URL."""
|
|
2239
|
+
try:
|
|
2240
|
+
if url.startswith("s3://"):
|
|
2241
|
+
fs = fsspec.filesystem("s3")
|
|
2242
|
+
with fs.open(url, "r") as f:
|
|
2243
|
+
return cast(dict[str, Any], json.load(f))
|
|
2244
|
+
else:
|
|
2245
|
+
import requests
|
|
2246
|
+
|
|
2247
|
+
response = requests.get(url, timeout=30)
|
|
2248
|
+
response.raise_for_status()
|
|
2249
|
+
return cast(dict[str, Any], response.json())
|
|
2250
|
+
except (
|
|
2251
|
+
requests.exceptions.RequestException,
|
|
2252
|
+
json.JSONDecodeError,
|
|
2253
|
+
OSError,
|
|
2254
|
+
ValueError,
|
|
2255
|
+
) as e:
|
|
2256
|
+
logger.error(f"Failed to download STAC item from {url}: {e}")
|
|
2257
|
+
return None
|
|
2258
|
+
|
|
2259
|
+
def _chunk_urls(self, urls: list[str], n_chunks: int) -> list[list[str]]:
|
|
2260
|
+
"""Split URLs into chunks for parallel processing."""
|
|
2261
|
+
chunk_size = max(1, len(urls) // n_chunks)
|
|
2262
|
+
chunks = [urls[i : i + chunk_size] for i in range(0, len(urls), chunk_size)]
|
|
2263
|
+
# Ensure we have exactly n_chunks by combining last small chunk if needed
|
|
2264
|
+
if len(chunks) > n_chunks:
|
|
2265
|
+
chunks[-2].extend(chunks[-1])
|
|
2266
|
+
chunks = chunks[:-1]
|
|
2267
|
+
return chunks
|
|
2268
|
+
|
|
2269
|
+
def _cleanup_scratch(self, shard_info: list[dict[str, Any]]):
|
|
2270
|
+
"""Clean up scratch space after successful processing."""
|
|
2271
|
+
logger.info("Cleaning up scratch space...")
|
|
2272
|
+
failed = 0
|
|
2273
|
+
for info in shard_info:
|
|
2274
|
+
try:
|
|
2275
|
+
self.scratch_storage.remove(info["shard_path"])
|
|
2276
|
+
except OSError as e:
|
|
2277
|
+
failed += 1
|
|
2278
|
+
logger.debug(f"Failed to cleanup shard {info['shard_path']}: {e}")
|
|
2279
|
+
|
|
2280
|
+
if failed > 0:
|
|
2281
|
+
logger.warning(f"Failed to cleanup {failed} shards")
|