earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2281 @@
1
+ # ingestion_pipeline.py
2
+ """High-performance distributed STAC ingestion pipeline for massive geospatial dataset processing.
3
+
4
+ This module provides EarthCatalog's core ingestion capabilities, transforming URL lists
5
+ into spatially-partitioned, query-optimized GeoParquet catalogs. Designed for processing
6
+ 100M+ STAC items efficiently across distributed computing environments with intelligent
7
+ partitioning, concurrent HTTP processing, and adaptive resource management.
8
+
9
+ Architecture Overview:
10
+ The pipeline follows a multi-stage processing architecture:
11
+ 1. URL Reading: Flexible input from Parquet, CSV, or TSV files
12
+ 2. Batch Processing: URLs chunked into worker-manageable batches
13
+ 3. Concurrent Download: High-performance HTTP with connection pooling
14
+ 4. Spatial Partitioning: Grid-based organization for efficient queries
15
+ 5. Temporal Binning: Time-based organization (year/month/day)
16
+ 6. Consolidated Output: Optimized GeoParquet files with spatial indexing
17
+
18
+ Key Components:
19
+ ProcessingConfig: Comprehensive configuration with production defaults
20
+ STACIngestionPipeline: Main pipeline orchestrating the complete workflow
21
+ LocalProcessor: Single-machine processing with async HTTP capabilities
22
+ DaskDistributedProcessor: Cluster-based processing for massive scale
23
+ DistributedProcessor: Abstract base for pluggable processing backends
24
+
25
+ Performance Features:
26
+ - Async HTTP processing: 3-6x faster than sequential downloads
27
+ - Concurrent request processing: 50-100+ requests/second per worker
28
+ - Memory-efficient batching: Handles unlimited URL counts
29
+ - Adaptive resource management: Scales from laptops to clusters
30
+ - Intelligent retry strategies: Robust error handling and recovery
31
+ - Connection pooling: Optimized network resource usage
32
+
33
+ Scalability:
34
+ Tested Performance Metrics:
35
+ - 100M URLs: ~7-14 hours with 32 workers (vs 71 hours sequential)
36
+ - Memory usage: <20% of available RAM per worker (16 GB workers)
37
+ - Throughput: 50-100 STAC items/second per worker
38
+ - Storage: Linear scaling with intelligent partitioning
39
+
40
+ Grid System Integration:
41
+ Supports all major spatial partitioning systems:
42
+ - H3: Recommended for most global applications
43
+ - S2: Optimal for polar regions and spherical accuracy
44
+ - UTM: High precision for regional datasets
45
+ - MGRS: Standard for government and defense applications
46
+ - Custom GeoJSON: Maximum flexibility for special use cases
47
+
48
+ Storage Backend Support:
49
+ - Local filesystem: Development and small-scale production
50
+ - S3: AWS cloud storage with optimized multipart uploads
51
+ - GCS: Google Cloud Storage via fsspec integration
52
+ - Azure: Azure Blob Storage support
53
+ - Custom backends: Extensible storage abstraction
54
+
55
+ Temporal Organization:
56
+ Flexible time-based partitioning:
57
+ - Year-level: Long-term climate datasets
58
+ - Month-level: Recommended for most time-series data
59
+ - Day-level: High-frequency monitoring applications
60
+ - Custom binning: Configurable temporal windows
61
+
62
+ Processing Modes:
63
+ LocalProcessor:
64
+ - Single machine processing with async HTTP
65
+ - Ideal for development and small-to-medium datasets
66
+ - Memory efficient with configurable batch sizes
67
+ - Built-in progress tracking and error reporting
68
+
69
+ DaskDistributedProcessor:
70
+ - Cluster-based processing for massive scale
71
+ - Automatic task distribution and load balancing
72
+ - Fault tolerance and automatic recovery
73
+ - Resource monitoring and adaptive scheduling
74
+
75
+ Example Usage:
76
+ >>> # Basic local processing
77
+ >>> config = ProcessingConfig(
78
+ ... input_file='urls.parquet',
79
+ ... output_catalog='./catalog',
80
+ ... scratch_location='./scratch'
81
+ ... )
82
+ >>> pipeline = STACIngestionPipeline(config, LocalProcessor())
83
+ >>> pipeline.run()
84
+ >>>
85
+ >>> # High-performance distributed processing
86
+ >>> config = ProcessingConfig(
87
+ ... input_file='s3://bucket/urls.parquet',
88
+ ... output_catalog='s3://bucket/catalog',
89
+ ... scratch_location='s3://bucket/scratch',
90
+ ... concurrent_requests=100, # High concurrency
91
+ ... max_workers=32 # Multiple workers
92
+ ... )
93
+ >>> processor = DaskDistributedProcessor('scheduler-address:8786')
94
+ >>> pipeline = STACIngestionPipeline(config, processor)
95
+ >>> pipeline.run()
96
+
97
+ Configuration Best Practices:
98
+ - Use H3 grid system for global datasets
99
+ - Set concurrent_requests=50-100 for cloud storage
100
+ - Configure batch_size=1000-5000 based on available memory
101
+ - Use month-level temporal binning for time-series data
102
+ - Enable global partitioning for datasets with large geometries
103
+
104
+ Error Handling:
105
+ - Comprehensive retry strategies with exponential backoff
106
+ - Failed URL logging with detailed error categorization
107
+ - Graceful degradation for network and server issues
108
+ - Progress tracking with automatic resumption capabilities
109
+ - Memory pressure monitoring and adaptive batch sizing
110
+ """
111
+
112
+ import asyncio
113
+ import glob as glob_module
114
+ import json
115
+ import logging
116
+ import tempfile
117
+ import time
118
+ import uuid
119
+ from abc import ABC, abstractmethod
120
+ from collections import defaultdict
121
+ from collections.abc import Callable
122
+ from dataclasses import dataclass
123
+ from pathlib import Path
124
+ from typing import Any, cast
125
+
126
+ import fsspec
127
+ import geopandas as gpd
128
+ import pandas as pd
129
+ import requests
130
+ from tqdm import tqdm
131
+
132
+ # Import STAC engine abstraction
133
+ from .engines import STACEngine, get_engine
134
+
135
+ # Conditional async HTTP imports
136
+ try:
137
+ from .async_http_client import HAS_ASYNC_HTTP, download_stac_items_async
138
+ except ImportError:
139
+ HAS_ASYNC_HTTP = False
140
+
141
+ # Create a dummy async function for type checking
142
+ async def download_stac_items_async(*args, **kwargs): # type: ignore
143
+ """Dummy async function for type checking when async HTTP is not available."""
144
+ raise ImportError("Async HTTP client not available")
145
+
146
+
147
+ # Import from package modules
148
+ from . import grid_systems, input_readers, schema_generator, storage_backends
149
+ from .job_tracking import JobLogger, JobManifest, JobStatus
150
+ from .statistics import IngestionStatistics
151
+
152
+ logger = logging.getLogger(__name__)
153
+
154
+
155
+ @dataclass
156
+ class ProcessingConfig:
157
+ """Comprehensive configuration for EarthCatalog's STAC ingestion pipeline with production defaults.
158
+
159
+ This dataclass provides all configuration options for controlling pipeline behavior,
160
+ from basic input/output paths to advanced performance tuning. Designed with sensible
161
+ defaults that work well for most use cases while allowing fine-grained control for
162
+ specialized requirements and performance optimization.
163
+
164
+ Configuration Categories:
165
+ Input/Output: File paths and formats for data sources and destinations
166
+ Spatial: Grid system selection and partitioning parameters
167
+ Temporal: Time-based binning and organization
168
+ Async HTTP: High-performance concurrent request processing (3-6x speedup)
169
+ Performance: Concurrency, memory management, and optimization
170
+ Processing: Worker configuration and distributed computing
171
+ Storage: Backend selection and cloud storage settings
172
+
173
+ Async HTTP Performance Configuration:
174
+ EarthCatalog includes built-in async HTTP capabilities that provide 3-6x performance
175
+ improvements over sequential processing. Key async parameters:
176
+
177
+ - enable_concurrent_http (bool, default=True): Enable async HTTP processing
178
+ - concurrent_requests (int, default=50): Simultaneous HTTP requests per worker
179
+ - batch_size (int, default=1000): URLs processed in each batch
180
+ - connection_pool_size (int, default=100): HTTP connection pool size
181
+ - request_timeout (int, default=30): Individual request timeout in seconds
182
+ - retry_attempts (int, default=3): Maximum retry attempts per request
183
+ - retry_delay (float, default=1.0): Base delay between retries in seconds
184
+
185
+ Performance Tuning Guidelines:
186
+ Development/Testing:
187
+ - concurrent_requests=10-25, batch_size=100-500
188
+ - Lower resource usage, easier debugging
189
+
190
+ Production/Fast Networks:
191
+ - concurrent_requests=50-100, batch_size=1000-2000
192
+ - Maximum throughput for well-provisioned systems
193
+
194
+ Unreliable Networks:
195
+ - concurrent_requests=10-25, request_timeout=60-120
196
+ - Higher reliability with longer timeouts
197
+
198
+ Memory Constrained:
199
+ - batch_size=100-500, connection_pool_size=25-50
200
+ - Reduced memory footprint
201
+
202
+ Cloud Storage Optimization:
203
+ Special considerations for cloud deployments:
204
+ - Use s3_multipart_threshold_mb for large file handling
205
+ - Configure temp_dir_location for local staging
206
+ - Set appropriate timeouts for network latency
207
+ - Enable streaming_merge for memory efficiency
208
+ - Higher concurrent_requests work well with cloud storage
209
+
210
+ Configuration Examples:
211
+
212
+ Basic Configuration (3-6x speedup with defaults):
213
+ >>> config = ProcessingConfig(
214
+ ... input_file='urls.parquet',
215
+ ... output_catalog='./catalog',
216
+ ... scratch_location='./scratch'
217
+ ... # async HTTP enabled by default
218
+ ... )
219
+
220
+ High-Performance Cloud Configuration:
221
+ >>> config = ProcessingConfig(
222
+ ... input_file='s3://bucket/urls.parquet',
223
+ ... output_catalog='s3://bucket/catalog',
224
+ ... scratch_location='s3://bucket/scratch',
225
+ ... # Async HTTP tuning for cloud scale
226
+ ... enable_concurrent_http=True,
227
+ ... concurrent_requests=100,
228
+ ... batch_size=2000,
229
+ ... connection_pool_size=200,
230
+ ... request_timeout=30,
231
+ ... # Worker and processing settings
232
+ ... max_workers=32,
233
+ ... items_per_shard=20000
234
+ ... )
235
+
236
+ Conservative/Reliable Configuration:
237
+ >>> config = ProcessingConfig(
238
+ ... input_file='urls.parquet',
239
+ ... output_catalog='./catalog',
240
+ ... scratch_location='./scratch',
241
+ ... # Conservative async settings
242
+ ... concurrent_requests=25,
243
+ ... request_timeout=60,
244
+ ... retry_attempts=5,
245
+ ... retry_delay=2.0,
246
+ ... batch_size=500
247
+ ... )
248
+
249
+ Development/Debug Configuration:
250
+ >>> config = ProcessingConfig(
251
+ ... input_file='sample_urls.parquet',
252
+ ... output_catalog='./test_catalog',
253
+ ... scratch_location='./test_scratch',
254
+ ... # Disable async for easier debugging
255
+ ... enable_concurrent_http=False,
256
+ ... max_workers=1
258
+ ... )
259
+
260
+ Performance Monitoring:
262
+ Ingestion statistics are collected automatically and summarized in the logs.
263
+ Catalog-level metadata (including those statistics) can be captured via schema generation:
264
+ >>> config = ProcessingConfig(
265
+ ... input_file='urls.parquet', output_catalog='./catalog', scratch_location='./scratch',
266
+ ... generate_schema=True # Writes catalog_schema.json alongside the catalog
267
+ ... )
267
+
268
+ Validation:
269
+ Configuration validation ensures proper setup:
270
+ >>> config.validate() # Raises ValueError or FileNotFoundError for invalid settings
271
+ >>> # Automatically validates paths, dependencies, and parameter ranges
273
+ """
274
+
275
+ input_file: str
276
+ output_catalog: str
277
+ scratch_location: str
278
+ input_format: str = "auto" # auto, parquet, csv, tsv, ndjson, jsonl
279
+ url_column: str = "url" # Column name containing URLs
280
+ # Multi-file input support (glob patterns)
281
+ # If specified, input_file is treated as a base path and input_pattern is used for file discovery
282
+ # Example: "s3://bucket/bulk/2020_*.ndjson" will discover all matching files
283
+ input_pattern: str = "" # Glob pattern for multi-file input
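An illustrative sketch of pattern-based input (the bucket and key names are hypothetical; the glob string follows the example in the comment above):

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    config = ProcessingConfig(
        input_file="s3://bucket/bulk/",                    # base path
        input_pattern="s3://bucket/bulk/2020_*.ndjson",    # every matching file is read
        output_catalog="s3://bucket/catalog",
        scratch_location="s3://bucket/scratch",
        input_format="ndjson",
    )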
284
+ grid_system: str = "h3" # h3 only for simplicity
285
+ grid_resolution: int = 2 # H3 resolution (default level 2)
286
+ temporal_bin: str = "month" # year, month, day
287
+
288
+ # Output format options
289
+ output_format: str = "geoparquet" # "geoparquet" or "ndjson"
290
+ mission_field: str = "dataset_id" # Field to extract mission from
291
+ sort_key: str = "datetime"
292
+ sort_ascending: bool = True
293
+ items_per_shard: int = 10000
294
+ max_workers: int = 8
295
+ # Global partitioning options
296
+ enable_global_partitioning: bool = True # Route multi-cell items to /global/
297
+ global_partition_threshold: int = 50 # H3 resolution 6 threshold
298
+ # Schema generation options
299
+ generate_schema: bool = True # Generate catalog schema metadata (default: enabled)
300
+ schema_filename: str = "catalog_schema.json" # Schema output filename
301
+ geojson_path: str = "" # Path to custom GeoJSON tiles (for geojson grid system)
302
+ # Consolidation options
303
+ max_memory_per_partition_mb: int = 1024 # Memory limit per partition
304
+ enable_streaming_merge: bool = True # Use streaming for large files
305
+ s3_multipart_threshold_mb: int = 100 # When to use multipart upload
306
+ temp_dir_location: str = tempfile.gettempdir() # Local temp space for staging
307
+
308
+ # Async HTTP Configuration
309
+ enable_concurrent_http: bool = True # Enable async HTTP processing
310
+ concurrent_requests: int = 50 # Concurrent requests per worker
311
+ connection_pool_size: int = 100 # HTTP connection pool size
312
+ request_timeout: int = 30 # Request timeout in seconds
313
+ retry_attempts: int = 3 # Max retry attempts
314
+ retry_delay: float = 1.0 # Base retry delay in seconds
315
+ batch_size: int = 1000 # URLs processed per async batch
316
+
317
+ # Validation Configuration
318
+ enable_validation: bool = True # Enable STAC item validation on ingest
319
+ fix_invalid_geometry: bool = True # Attempt to fix invalid geometries
320
+ fix_bbox_mismatch: bool = True # Correct bbox when it doesn't match geometry
321
+ bbox_tolerance: float = 1e-6 # Tolerance for bbox comparison (degrees)
322
+ log_validation_warnings: bool = True # Log validation warnings
323
+
324
+ # STAC Engine Configuration
325
+ stac_engine: str = "rustac" # "rustac", "stac-geoparquet", or "auto"
326
+
327
+ # STAC Fetch Hook Configuration
328
+ # Allows custom STAC item generation when URLs don't point to STAC JSON
329
+ # Supported formats:
330
+ # - "default": Standard STAC JSON fetch (default behavior)
331
+ # - "module:package.module:function": Python function import path
332
+ # - "script:/path/to/script": External executable
333
+ # - "script:python:/path/to/script.py": Script with interpreter
334
+ stac_hook: str = "default"
335
+
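A sketch of how the hook string selects each behaviour (the module and script paths are hypothetical placeholders, not shipped with the package):

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    paths = dict(
        input_file="s3://bucket/urls.parquet",     # hypothetical locations
        output_catalog="s3://bucket/catalog",
        scratch_location="s3://bucket/scratch",
    )
    default_cfg = ProcessingConfig(**paths, stac_hook="default")
    module_cfg = ProcessingConfig(**paths, stac_hook="module:my_hooks.generators:build_item")
    script_cfg = ProcessingConfig(**paths, stac_hook="script:python:/opt/hooks/build_item.py")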
336
+ # Batch mode configuration
337
+ # Controls when to use simple local processing vs full distributed processing
338
+ batch_threshold: int = 10000 # Below this, use simple local processing
339
+ distributed: bool | None = None # True=force distributed, False=force local, None=auto
340
+ large_batch_confirm_threshold: int = 20000 # Prompt user if local mode exceeds this
341
+
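A sketch of the three processing-mode selections (paths hypothetical):

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    paths = dict(
        input_file="s3://bucket/urls.parquet",
        output_catalog="s3://bucket/catalog",
        scratch_location="s3://bucket/scratch",
    )
    auto = ProcessingConfig(**paths)                           # distributed iff URL count >= batch_threshold
    local_only = ProcessingConfig(**paths, distributed=False)  # always the simple local path
    dask_only = ProcessingConfig(**paths, distributed=True)    # always the full distributed path
    high_cutoff = ProcessingConfig(**paths, batch_threshold=50_000)  # raise the auto-mode cutoff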
342
+ # Dask distributed configuration
343
+ dask_scheduler_address: str = (
344
+ "" # Dask scheduler address (e.g., 'tcp://localhost:8786'). Empty = create local cluster
345
+ )
346
+
347
+ # Checkpoint configuration for distributed processing
348
+ # Use time-based checkpointing for better performance with many partitions
349
+ checkpoint_interval_seconds: int = 30 # Save checkpoint every N seconds (0 to disable)
350
+ checkpoint_partition_count: int = 0 # Save checkpoint every N partitions (0 to disable)
351
+ # Note: If both are 0, defaults to time-based (30 seconds)
352
+
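For example, to checkpoint by partition count instead of by time (a sketch; paths hypothetical):

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    cfg = ProcessingConfig(
        input_file="s3://bucket/urls.parquet",
        output_catalog="s3://bucket/catalog",
        scratch_location="s3://bucket/scratch",
        checkpoint_interval_seconds=0,    # disable time-based checkpointing
        checkpoint_partition_count=100,   # checkpoint every 100 consolidated partitions
    )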
353
+ def validate(self):
354
+ """Validate configuration before processing."""
355
+ # Validate input source - either input_file or input_pattern
356
+ if not self.input_pattern:
357
+ # Traditional single file mode - validate file exists
358
+ if not Path(self.input_file).exists() and not self.input_file.startswith("s3://"):
359
+ raise FileNotFoundError(f"Input file not found: {self.input_file}")
360
+ # If input_pattern is provided, file existence will be validated during pattern resolution
361
+ if self.grid_resolution < 0 or self.grid_resolution > 15:
362
+ raise ValueError("H3 resolution must be 0-15")
363
+ if self.temporal_bin not in ["year", "month", "day"]:
364
+ raise ValueError("temporal_bin must be 'year', 'month', or 'day'")
365
+ if self.output_format not in ["geoparquet", "ndjson"]:
366
+ raise ValueError("output_format must be 'geoparquet' or 'ndjson'")
367
+ if self.items_per_shard <= 0:
368
+ raise ValueError("items_per_shard must be positive")
369
+ if self.max_workers <= 0:
370
+ raise ValueError("max_workers must be positive")
371
+ if self.max_memory_per_partition_mb <= 0:
372
+ raise ValueError("max_memory_per_partition_mb must be positive")
373
+ if self.s3_multipart_threshold_mb <= 0:
374
+ raise ValueError("s3_multipart_threshold_mb must be positive")
375
+
376
+ # Async HTTP validation
377
+ if self.concurrent_requests <= 0:
378
+ raise ValueError("concurrent_requests must be positive")
379
+ if self.connection_pool_size <= 0:
380
+ raise ValueError("connection_pool_size must be positive")
381
+ if self.request_timeout <= 0:
382
+ raise ValueError("request_timeout must be positive")
383
+ if self.retry_attempts < 0:
384
+ raise ValueError("retry_attempts must be non-negative")
385
+ if self.retry_delay < 0:
386
+ raise ValueError("retry_delay must be non-negative")
387
+ if self.batch_size <= 0:
388
+ raise ValueError("batch_size must be positive")
389
+
390
+ # STAC engine validation
391
+ valid_engines = ("rustac", "stac-geoparquet", "auto")
392
+ if self.stac_engine not in valid_engines:
393
+ raise ValueError(f"stac_engine must be one of {valid_engines}, got: {self.stac_engine}")
394
+
395
+ # STAC hook validation
396
+ valid_hook_prefixes = ("default", "passthrough", "module:", "script:")
397
+ if not any(self.stac_hook.startswith(prefix) for prefix in valid_hook_prefixes):
398
+ raise ValueError(
399
+ f"stac_hook must be 'default', 'passthrough', 'module:path:func', or 'script:/path', got: {self.stac_hook}"
400
+ )
401
+
402
+ # Dask scheduler validation - warn if remote scheduler with local storage
403
+ if self.dask_scheduler_address and not self.dask_scheduler_address.startswith("local"):
404
+ # Remote scheduler detected
405
+ local_paths = []
406
+ if not self.scratch_location.startswith(("s3://", "gs://", "az://")):
407
+ local_paths.append("scratch_location")
408
+ if not self.output_catalog.startswith(("s3://", "gs://", "az://")):
409
+ local_paths.append("output_catalog")
410
+
411
+ if local_paths:
412
+ logger.warning(
413
+ f"Using remote Dask scheduler ({self.dask_scheduler_address}) with local storage paths: "
414
+ f"{', '.join(local_paths)}. "
415
+ f"Remote workers may not have access to local paths. "
416
+ f"Consider using cloud storage (s3://, gs://) for {', '.join(local_paths)}."
417
+ )
418
+
419
+ # Batch mode validation
420
+ if self.batch_threshold <= 0:
421
+ raise ValueError("batch_threshold must be positive")
422
+ if self.large_batch_confirm_threshold <= 0:
423
+ raise ValueError("large_batch_confirm_threshold must be positive")
424
+
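A usage sketch of the validation rules above (paths hypothetical; an s3:// input skips the local-file existence check, so the invalid temporal_bin is what triggers the error):

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    cfg = ProcessingConfig(
        input_file="s3://bucket/urls.parquet",
        output_catalog="s3://bucket/catalog",
        scratch_location="s3://bucket/scratch",
        temporal_bin="quarter",            # not an allowed value
    )
    try:
        cfg.validate()
    except ValueError as exc:
        print(exc)                         # temporal_bin must be 'year', 'month', or 'day'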
425
+ def __repr__(self) -> str:
426
+ """Return concise string representation with key configuration."""
427
+ return (
428
+ f"ProcessingConfig("
429
+ f"input='{self.input_file}', "
430
+ f"output='{self.output_catalog}', "
431
+ f"grid={self.grid_system}@{self.grid_resolution}, "
432
+ f"workers={self.max_workers})"
433
+ )
434
+
435
+ def __bool__(self) -> bool:
436
+ """Return True if configuration is valid."""
437
+ try:
438
+ self.validate()
439
+ return True
440
+ except (ValueError, FileNotFoundError):
441
+ return False
442
+
443
+ def to_dict(self) -> dict[str, Any]:
444
+ """Serialize configuration to dictionary for storage or transmission.
445
+
446
+ All fields are included, enabling complete reconstruction via from_dict().
447
+ Useful for:
448
+ - Storing config in job manifests for recovery
449
+ - Passing config to remote Dask workers
450
+ - Caching/resumption of processing jobs
451
+
452
+ Returns:
453
+ Dictionary with all configuration fields.
454
+ """
455
+ return {
456
+ "input_file": self.input_file,
457
+ "output_catalog": self.output_catalog,
458
+ "scratch_location": self.scratch_location,
459
+ "input_format": self.input_format,
460
+ "url_column": self.url_column,
461
+ "input_pattern": self.input_pattern,
462
+ "grid_system": self.grid_system,
463
+ "grid_resolution": self.grid_resolution,
464
+ "temporal_bin": self.temporal_bin,
465
+ "output_format": self.output_format,
466
+ "mission_field": self.mission_field,
467
+ "sort_key": self.sort_key,
468
+ "sort_ascending": self.sort_ascending,
469
+ "items_per_shard": self.items_per_shard,
470
+ "max_workers": self.max_workers,
471
+ "enable_global_partitioning": self.enable_global_partitioning,
472
+ "global_partition_threshold": self.global_partition_threshold,
473
+ "generate_schema": self.generate_schema,
474
+ "schema_filename": self.schema_filename,
475
+ "geojson_path": self.geojson_path,
476
+ "max_memory_per_partition_mb": self.max_memory_per_partition_mb,
477
+ "enable_streaming_merge": self.enable_streaming_merge,
478
+ "s3_multipart_threshold_mb": self.s3_multipart_threshold_mb,
479
+ "temp_dir_location": self.temp_dir_location,
480
+ "enable_concurrent_http": self.enable_concurrent_http,
481
+ "concurrent_requests": self.concurrent_requests,
482
+ "connection_pool_size": self.connection_pool_size,
483
+ "request_timeout": self.request_timeout,
484
+ "retry_attempts": self.retry_attempts,
485
+ "retry_delay": self.retry_delay,
486
+ "batch_size": self.batch_size,
487
+ "enable_validation": self.enable_validation,
488
+ "fix_invalid_geometry": self.fix_invalid_geometry,
489
+ "fix_bbox_mismatch": self.fix_bbox_mismatch,
490
+ "bbox_tolerance": self.bbox_tolerance,
491
+ "log_validation_warnings": self.log_validation_warnings,
492
+ "stac_engine": self.stac_engine,
493
+ "stac_hook": self.stac_hook,
494
+ "batch_threshold": self.batch_threshold,
495
+ "distributed": self.distributed,
496
+ "large_batch_confirm_threshold": self.large_batch_confirm_threshold,
497
+ "dask_scheduler_address": self.dask_scheduler_address,
498
+ }
499
+
500
+ @classmethod
501
+ def from_dict(cls, data: dict[str, Any]) -> "ProcessingConfig":
502
+ """Reconstruct configuration from dictionary.
503
+
504
+ Only fields present in the dictionary are applied, allowing
505
+ backward compatibility when new fields are added.
506
+
507
+ Args:
508
+ data: Dictionary with configuration fields.
509
+
510
+ Returns:
511
+ ProcessingConfig instance.
512
+ """
513
+ # Required fields
514
+ config = cls(
515
+ input_file=data["input_file"],
516
+ output_catalog=data["output_catalog"],
517
+ scratch_location=data["scratch_location"],
518
+ )
519
+
520
+ # Optional fields - update only if present in data
521
+ optional_fields = [
522
+ "input_format",
523
+ "url_column",
524
+ "input_pattern",
525
+ "grid_system",
526
+ "grid_resolution",
527
+ "temporal_bin",
528
+ "output_format",
529
+ "mission_field",
530
+ "sort_key",
531
+ "sort_ascending",
532
+ "items_per_shard",
533
+ "max_workers",
534
+ "enable_global_partitioning",
535
+ "global_partition_threshold",
536
+ "generate_schema",
537
+ "schema_filename",
538
+ "geojson_path",
539
+ "max_memory_per_partition_mb",
540
+ "enable_streaming_merge",
541
+ "s3_multipart_threshold_mb",
542
+ "temp_dir_location",
543
+ "enable_concurrent_http",
544
+ "concurrent_requests",
545
+ "connection_pool_size",
546
+ "request_timeout",
547
+ "retry_attempts",
548
+ "retry_delay",
549
+ "batch_size",
550
+ "enable_validation",
551
+ "fix_invalid_geometry",
552
+ "fix_bbox_mismatch",
553
+ "bbox_tolerance",
554
+ "log_validation_warnings",
555
+ "stac_engine",
556
+ "stac_hook",
557
+ "batch_threshold",
558
+ "distributed",
559
+ "large_batch_confirm_threshold",
560
+ "dask_scheduler_address",
561
+ ]
562
+
563
+ for field in optional_fields:
564
+ if field in data:
565
+ setattr(config, field, data[field])
566
+
567
+ return config
568
+
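A round-trip sketch of the serialization helpers (paths hypothetical); this mirrors how a config travels inside job manifests or to remote Dask workers:

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    original = ProcessingConfig(
        input_file="s3://bucket/urls.parquet",
        output_catalog="s3://bucket/catalog",
        scratch_location="s3://bucket/scratch",
        concurrent_requests=100,
    )
    payload = original.to_dict()                    # plain, JSON-serializable dict
    restored = ProcessingConfig.from_dict(payload)  # missing optional keys fall back to defaults
    assert restored.concurrent_requests == 100
    assert restored.to_dict() == payload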
569
+ def config_hash(self) -> str:
570
+ """Generate a hash of configuration for idempotency checking.
571
+
572
+ The hash covers settings that affect output content, excluding
573
+ paths and runtime-only settings. Two configs with the same hash
574
+ would produce identical output given the same input.
575
+
576
+ Returns:
577
+ Hex digest of the configuration hash.
578
+ """
579
+ import hashlib
580
+
581
+ # Fields that affect output content (not paths or runtime settings)
582
+ content_affecting = {
583
+ "grid_system": self.grid_system,
584
+ "grid_resolution": self.grid_resolution,
585
+ "temporal_bin": self.temporal_bin,
586
+ "output_format": self.output_format,
587
+ "mission_field": self.mission_field,
588
+ "sort_key": self.sort_key,
589
+ "sort_ascending": self.sort_ascending,
590
+ "items_per_shard": self.items_per_shard,
591
+ "enable_global_partitioning": self.enable_global_partitioning,
592
+ "global_partition_threshold": self.global_partition_threshold,
593
+ "enable_validation": self.enable_validation,
594
+ "fix_invalid_geometry": self.fix_invalid_geometry,
595
+ "fix_bbox_mismatch": self.fix_bbox_mismatch,
596
+ }
597
+
598
+ content_str = json.dumps(content_affecting, sort_keys=True)
599
+ return hashlib.sha256(content_str.encode()).hexdigest()[:16]
600
+
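A short sketch of the idempotency property (paths hypothetical): configs differing only in paths or runtime tuning hash identically, while content-affecting settings change the hash.

    from earthcatalog.ingestion_pipeline import ProcessingConfig

    a = ProcessingConfig(input_file="s3://bucket/urls.parquet",
                         output_catalog="s3://bucket/catalog-a",
                         scratch_location="s3://bucket/scratch-a",
                         concurrent_requests=25)
    b = ProcessingConfig(input_file="s3://bucket/urls.parquet",
                         output_catalog="s3://bucket/catalog-b",
                         scratch_location="s3://bucket/scratch-b",
                         concurrent_requests=100)
    c = ProcessingConfig(input_file="s3://bucket/urls.parquet",
                         output_catalog="s3://bucket/catalog-a",
                         scratch_location="s3://bucket/scratch-a",
                         grid_resolution=6)
    assert a.config_hash() == b.config_hash()   # paths and HTTP tuning are excluded
    assert a.config_hash() != c.config_hash()   # grid resolution changes output content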
601
+
602
+ class DistributedProcessor(ABC):
603
+ """Abstract base for different parallel processing backends."""
604
+
605
+ @abstractmethod
606
+ def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
607
+ """Process URL chunks in parallel."""
608
+ pass
609
+
610
+ @abstractmethod
611
+ def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
612
+ """Consolidate shards in parallel."""
613
+ pass
614
+
615
+ @abstractmethod
616
+ def close(self):
617
+ """Clean up resources."""
618
+ pass
619
+
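A minimal sketch of a custom backend satisfying this interface (purely illustrative, not part of the package): it runs every chunk sequentially in the calling thread, which can be handy for debugging.

    from earthcatalog.ingestion_pipeline import DistributedProcessor

    class SequentialProcessor(DistributedProcessor):
        """Toy backend: no parallelism."""

        def process_urls(self, url_chunks, process_fn, **kwargs):
            # Invoke the worker function for each chunk, one after another
            return [process_fn(chunk, idx, **kwargs) for idx, chunk in enumerate(url_chunks)]

        def consolidate_shards(self, partition_items, consolidate_fn, **kwargs):
            return [consolidate_fn(key, paths, **kwargs) for key, paths in partition_items]

        def close(self):
            pass  # nothing to release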
620
+
621
+ class DaskDistributedProcessor(DistributedProcessor):
622
+ """Dask-based distributed processing using serializable worker functions.
623
+
624
+ This processor uses the module-level functions from workers.py which can be
625
+ pickled and sent to remote Dask workers. Unlike LocalProcessor, it cannot
626
+ use closures that capture instance state.
627
+
628
+ Supports two modes:
629
+ 1. Local cluster: Creates a local Dask cluster (default)
630
+ 2. Remote cluster: Connects to an existing Dask scheduler via scheduler_address
631
+ """
632
+
633
+ def __init__(self, n_workers: int = 8, threads_per_worker: int = 1, scheduler_address: str | None = None):
634
+ """Initialize Dask distributed processor with specified worker configuration.
635
+
636
+ Args:
637
+ n_workers: Number of workers (only used when creating local cluster)
638
+ threads_per_worker: Threads per worker (only used when creating local cluster)
639
+ scheduler_address: Optional Dask scheduler address (e.g., 'tcp://localhost:8786').
640
+ If provided, connects to existing cluster instead of creating local one.
641
+ """
642
+ self.n_workers = n_workers
643
+ self.threads_per_worker = threads_per_worker
644
+ self.scheduler_address = scheduler_address
645
+ try:
646
+ import dask.distributed as dd
647
+
648
+ if scheduler_address:
649
+ # Connect to existing Dask cluster
650
+ self.client = dd.Client(scheduler_address)
651
+ logger.info(f"Connected to Dask scheduler at {scheduler_address}")
652
+ logger.info(f"Dask dashboard: {self.client.dashboard_link}")
653
+ else:
654
+ # Create local Dask cluster
655
+ self.client = dd.Client(n_workers=n_workers, threads_per_worker=threads_per_worker, memory_limit="4GB")
656
+ logger.info(f"Created local Dask cluster with {n_workers} workers")
657
+ logger.info(f"Dask dashboard: {self.client.dashboard_link}")
658
+ except ImportError:
659
+ raise ImportError("Dask distributed required: pip install dask distributed") from None
660
+
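A usage sketch for the two modes (the scheduler address is hypothetical; config is a ProcessingConfig as shown earlier):

    from earthcatalog.ingestion_pipeline import DaskDistributedProcessor, STACIngestionPipeline

    # Attach to an existing cluster; omit scheduler_address to create a
    # local cluster sized by n_workers/threads_per_worker instead.
    processor = DaskDistributedProcessor(scheduler_address="tcp://dask-scheduler:8786")
    try:
        STACIngestionPipeline(config, processor).run()
    finally:
        processor.close()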
661
+ def __repr__(self) -> str:
662
+ """Return string representation."""
663
+ if self.scheduler_address:
664
+ return f"DaskDistributedProcessor(scheduler_address='{self.scheduler_address}')"
665
+ return f"DaskDistributedProcessor(n_workers={self.n_workers}, threads_per_worker={self.threads_per_worker})"
666
+
667
+ def close(self):
668
+ """Close the Dask client connection."""
669
+ if hasattr(self, "client") and self.client:
670
+ self.client.close()
671
+ logger.info("Dask client closed")
672
+
673
+ def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
674
+ """Process URL chunks using Dask distributed workers.
675
+
676
+ Note: For Dask, we use the serializable workers.process_url_batch function
677
+ instead of the provided process_fn (which may capture non-serializable state).
678
+ When config_dict is provided, process_fn is ignored; otherwise it is used as a fallback.
679
+ """
680
+ # Import here to avoid circular imports
681
+ from .workers import process_url_batch
682
+
683
+ # Extract required kwargs
684
+ config_dict = kwargs.get("config_dict")
685
+ job_id = kwargs.get("job_id", "unknown")
686
+
687
+ if config_dict is None:
688
+ # Fall back to old behavior if config_dict not provided
689
+ futures = [self.client.submit(process_fn, chunk, idx, **kwargs) for idx, chunk in enumerate(url_chunks)]
690
+ return cast(list[Any], self.client.gather(futures))
691
+
692
+ # Use serializable worker function
693
+ futures = [
694
+ self.client.submit(
695
+ process_url_batch,
696
+ urls=chunk,
697
+ worker_id=idx,
698
+ config_dict=config_dict,
699
+ job_id=job_id,
700
+ batch_idx=idx,
701
+ )
702
+ for idx, chunk in enumerate(url_chunks)
703
+ ]
704
+ return cast(list[Any], self.client.gather(futures))
705
+
706
+ def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
707
+ """Consolidate shards using Dask distributed workers.
708
+
709
+ Note: For Dask, we use the serializable workers.consolidate_partition function
710
+ instead of the provided consolidate_fn.
711
+ """
712
+ from .workers import consolidate_partition
713
+
714
+ config_dict = kwargs.get("config_dict")
715
+
716
+ if config_dict is None:
717
+ # Fall back to old behavior
718
+ futures = [
719
+ self.client.submit(consolidate_fn, partition_key, shard_paths, **kwargs)
720
+ for partition_key, shard_paths in partition_items
721
+ ]
722
+ return cast(list[Any], self.client.gather(futures))
723
+
724
+ # Use serializable worker function
725
+ futures = [
726
+ self.client.submit(
727
+ consolidate_partition,
728
+ partition_key=partition_key,
729
+ shard_paths=shard_paths,
730
+ config_dict=config_dict,
731
+ )
732
+ for partition_key, shard_paths in partition_items
733
+ ]
734
+ return cast(list[Any], self.client.gather(futures))
735
+
736
+
737
+ class LocalProcessor(DistributedProcessor):
738
+ """Local multi-threading processor using concurrent.futures."""
739
+
740
+ def __init__(self, n_workers: int = 8):
741
+ from concurrent.futures import ThreadPoolExecutor
742
+
743
+ self.n_workers = n_workers
744
+ self.executor = ThreadPoolExecutor(max_workers=n_workers)
745
+ logger.info(f"Local processor started with {n_workers} workers")
746
+
747
+ def __repr__(self) -> str:
748
+ """Return string representation."""
749
+ return f"LocalProcessor(n_workers={self.n_workers})"
750
+
751
+ def process_urls(self, url_chunks: list[list[str]], process_fn: Callable, **kwargs) -> list[Any]:
752
+ futures = [self.executor.submit(process_fn, chunk, idx, **kwargs) for idx, chunk in enumerate(url_chunks)]
753
+ return [f.result() for f in futures]
754
+
755
+ def consolidate_shards(self, partition_items: list[tuple], consolidate_fn: Callable, **kwargs) -> list[Any]:
756
+ futures = [
757
+ self.executor.submit(consolidate_fn, partition_key, shard_paths, **kwargs)
758
+ for partition_key, shard_paths in partition_items
759
+ ]
760
+ return [f.result() for f in futures]
761
+
762
+ def close(self):
763
+ """Shutdown the executor."""
764
+ self.executor.shutdown(wait=True)
765
+
766
+
767
+ class STACIngestionPipeline:
768
+ """Enterprise-grade STAC ingestion pipeline for massive geospatial dataset processing.
769
+
770
+ This is the main orchestrator class that coordinates the complete STAC ingestion
771
+ workflow, from URL reading through spatial partitioning to optimized catalog
772
+ generation. Designed for production environments processing 100M+ STAC items
773
+ with high performance, reliability, and scalability.
774
+
775
+ Pipeline Architecture:
776
+ The pipeline implements a sophisticated multi-stage processing architecture:
777
+ 1. Input Reading: Flexible support for Parquet, CSV, TSV with auto-detection
778
+ 2. URL Chunking: Intelligent batching based on memory and processing constraints
779
+ 3. Distributed Processing: Pluggable processors for local or cluster execution
780
+ 4. Concurrent Download: High-performance async HTTP with connection pooling
781
+ 5. Spatial Partitioning: Grid-based organization optimized for spatial queries
782
+ 6. Temporal Binning: Time-based organization for efficient time-series queries
783
+ 7. Consolidation: Memory-efficient merging with streaming for large datasets
784
+ 8. Output Generation: Optimized GeoParquet with spatial indexing
785
+
786
+ Performance Features:
787
+ - Async HTTP: 3-6x faster downloads with 50-100+ concurrent requests
788
+ - Memory Management: Constant memory usage regardless of dataset size
789
+ - Incremental Processing: Resume interrupted jobs and update existing catalogs
790
+ - Error Recovery: Comprehensive retry strategies and graceful degradation
791
+ - Progress Tracking: Real-time monitoring with detailed logging
792
+ - Resource Optimization: Adaptive batching and memory pressure handling
793
+
794
+ Storage Integration:
795
+ - Local Filesystem: Development and small-scale production
796
+ - S3 Compatible: AWS S3, MinIO, with optimized multipart uploads
797
+ - Google Cloud Storage: Native integration via fsspec
798
+ - Azure Blob Storage: Full support for Azure cloud deployments
799
+ - Custom Storage: Extensible backend system for specialized requirements
800
+
801
+ Incremental Updates:
802
+ Advanced incremental processing capabilities:
803
+ - Automatic detection of existing partitions
804
+ - Efficient merging of new data with existing catalogs
805
+ - Temporal range updates without full reprocessing
806
+ - Safe atomic operations preventing catalog corruption
807
+ - Rollback capabilities for failed updates
808
+
809
+ Processing Modes:
810
+ LocalProcessor:
811
+ - Single-machine processing optimized for async HTTP
812
+ - Memory-efficient for datasets up to tens of millions of items
813
+ - Ideal for development, testing, and medium-scale production
814
+
815
+ DaskDistributedProcessor:
816
+ - Cluster-based processing for massive scale
817
+ - Fault tolerance with automatic task recovery
818
+ - Dynamic scaling based on cluster resources
819
+ - Optimized for 100M+ item datasets
820
+
821
+ Quality Assurance:
822
+ - Comprehensive input validation and error checking
823
+ - STAC specification compliance verification
824
+ - Spatial geometry validation and repair
825
+ - Temporal data consistency checking
826
+ - Output format validation and optimization
827
+
828
+ Example:
829
+ >>> # Basic pipeline setup
830
+ >>> config = ProcessingConfig(
831
+ ... input_file='stac_urls.parquet',
832
+ ... output_catalog='./catalog',
833
+ ... scratch_location='./scratch'
834
+ ... )
835
+ >>> pipeline = STACIngestionPipeline(config, LocalProcessor())
836
+ >>>
837
+ >>> # Execute complete pipeline
838
+ >>> pipeline.run() # Processes all URLs and creates catalog
839
+ >>>
840
+ >>> # Advanced configuration for large-scale processing
841
+ >>> config = ProcessingConfig(
842
+ ... input_file='s3://data/urls.parquet',
843
+ ... output_catalog='s3://catalog/output',
844
+ ... scratch_location='s3://catalog/scratch',
845
+ ... grid_system='h3',
846
+ ... grid_resolution=6,
847
+ ... concurrent_requests=100,
848
+ ... batch_size=5000,
849
+ ... max_workers=32
850
+ ... )
851
+ >>> processor = DaskDistributedProcessor(scheduler_address='tcp://scheduler:8786')
852
+ >>> pipeline = STACIngestionPipeline(config, processor)
853
+ >>> pipeline.run()
854
+
855
+ Thread Safety:
856
+ The pipeline is designed for single-threaded execution within each worker
857
+ process. Multiple pipeline instances can run concurrently in separate
858
+ processes for distributed processing scenarios.
859
+
860
+ Monitoring:
861
+ Comprehensive monitoring and observability:
862
+ - Progress bars with ETA and throughput metrics
863
+ - Detailed logging of all processing stages
864
+ - Error categorization and reporting
865
+ - Performance metrics and bottleneck identification
866
+ - Memory usage tracking and optimization recommendations
867
+ """
868
+
869
+ def __init__(self, config: ProcessingConfig, processor: DistributedProcessor):
870
+ self.config = config
871
+ self.processor = processor
872
+ self.storage = self._init_storage(config.output_catalog)
873
+ self.scratch_storage = self._init_storage(config.scratch_location)
874
+ self.grid = grid_systems.get_grid_system(config.grid_system, config.grid_resolution)
875
+
876
+ # Initialize STAC engine for item conversion
877
+ self.engine: STACEngine = get_engine(config.stac_engine) # type: ignore[arg-type]
878
+ logger.info(f"Using STAC engine: {self.engine.name}")
879
+
880
+ # Initialize statistics collector
881
+ self.stats = IngestionStatistics()
882
+
883
+ def __repr__(self) -> str:
884
+ """Return string representation with key pipeline state."""
885
+ return (
886
+ f"STACIngestionPipeline("
887
+ f"input='{self.config.input_file}', "
888
+ f"output='{self.config.output_catalog}', "
889
+ f"grid={self.config.grid_system}@{self.config.grid_resolution}, "
890
+ f"processor={self.processor.__class__.__name__})"
891
+ )
892
+
893
+ def _init_storage(self, path: str) -> storage_backends.StorageBackend:
894
+ """Initialize storage backend."""
895
+ if path.startswith("s3://"):
896
+ return storage_backends.S3Storage(path)
897
+ else:
898
+ return storage_backends.LocalStorage(path)
899
+
900
+ def _should_use_distributed(self, url_count: int) -> bool:
901
+ """Determine whether to use distributed processing based on URL count and config.
902
+
903
+ Processing mode selection logic:
904
+ 1. If config.distributed is True: always use distributed
905
+ 2. If config.distributed is False: always use simple local
906
+ 3. If config.distributed is None (auto): use URL count threshold
907
+
908
+ Args:
909
+ url_count: Number of URLs to process.
910
+
911
+ Returns:
912
+ True if distributed processing should be used, False for simple local.
913
+ """
914
+ if self.config.distributed is True:
915
+ logger.info("Using distributed processing (explicitly enabled)")
916
+ return True
917
+ if self.config.distributed is False:
918
+ logger.info("Using simple local processing (explicitly disabled distributed)")
919
+ return False
920
+ # Auto mode: decide based on threshold
921
+ use_distributed = url_count >= self.config.batch_threshold
922
+ if use_distributed:
923
+ logger.info(
924
+ f"Using distributed processing: {url_count:,} URLs >= threshold {self.config.batch_threshold:,}"
925
+ )
926
+ else:
927
+ logger.info(
928
+ f"Using simple local processing: {url_count:,} URLs < threshold {self.config.batch_threshold:,}"
929
+ )
930
+ return use_distributed
931
+
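The decision rule can be summarized by this standalone mirror of the method (a sketch, not a helper shipped with the package):

    def should_use_distributed(url_count: int, distributed: bool | None, batch_threshold: int = 10_000) -> bool:
        if distributed is not None:       # explicit override always wins
            return distributed
        return url_count >= batch_threshold

    assert should_use_distributed(8_000, None) is False       # below threshold: simple local
    assert should_use_distributed(12_000, None) is True       # at/above threshold: distributed
    assert should_use_distributed(1_000_000, False) is False  # forced local regardless of size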
932
+ def run(self, job_id: str | None = None, resume: bool = False) -> dict[str, Any]:
933
+ """Execute the complete ingestion pipeline with job tracking.
934
+
935
+ Args:
936
+ job_id: Optional job ID to use. If None, generates a new UUID.
937
+ resume: If True, attempts to resume an incomplete job. Ignores job_id
938
+ and searches for the most recent incomplete job.
939
+
940
+ Returns:
941
+ Dictionary mapping partition keys to consolidation stats.
942
+
943
+ Raises:
944
+ ValueError: If resume=True but no incomplete job found.
945
+ """
946
+ # Handle resume case
947
+ if resume:
948
+ manifest = JobManifest.find_incomplete(self.storage, self.config.output_catalog)
949
+ if manifest is None:
950
+ raise ValueError("No incomplete job found to resume")
951
+ job_id = manifest.job_id
952
+ logger.info(f"Resuming job {job_id} from {manifest.status.value} phase")
953
+ elif job_id is None:
954
+ job_id = str(uuid.uuid4())
955
+
956
+ # Initialize job logger
957
+ job_logger = JobLogger(self.storage, self.config.output_catalog, job_id)
958
+
959
+ logger.info("=" * 80)
960
+ logger.info("Starting STAC ingestion pipeline")
961
+ logger.info(f"Job ID: {job_id}")
962
+ logger.info(f"Input: {self.config.input_file}")
963
+ logger.info(f"Output: {self.config.output_catalog}")
964
+ logger.info(f"Grid: {self.config.grid_system} (resolution {self.config.grid_resolution})")
965
+ logger.info(f"Temporal binning: {self.config.temporal_bin}")
966
+ logger.info("=" * 80)
967
+
968
+ # Start statistics tracking
969
+ self.stats.start_processing()
970
+
971
+ # Phase 1: Read input and distribute work
972
+ urls = self._read_input_urls()
973
+ logger.info(f"Loaded {len(urls):,} URLs from input file")
974
+
975
+ # Check if we should use simple mode (for small batches)
976
+ # Note: Resume always uses distributed mode
977
+ if not resume and not self._should_use_distributed(len(urls)):
978
+ # Check if local mode with large batch needs confirmation
979
+ if len(urls) > self.config.large_batch_confirm_threshold:
980
+ logger.warning(
981
+ f"Processing {len(urls):,} URLs in local mode. "
982
+ f"Consider using --distributed for batches > {self.config.large_batch_confirm_threshold:,}"
983
+ )
984
+ # job_id is guaranteed to be set at this point (either from param or uuid)
985
+ assert job_id is not None
986
+ return self._run_simple(urls, job_id)
987
+
988
+ # Create or load job manifest
989
+ if resume:
990
+ manifest = JobManifest.load(self.storage, self.config.output_catalog, job_id)
991
+ else:
992
+ manifest = JobManifest(
993
+ job_id=job_id,
994
+ input_urls_count=len(urls),
995
+ config_hash=self.config.config_hash(),
996
+ )
997
+
998
+ # Track shard info for consolidation
999
+ shard_info: list[dict[str, Any]] = []
1000
+
1001
+ try:
1002
+ # Phase 2: Parallel processing to scratch location
1003
+ with tqdm(total=4, desc="Pipeline Progress", unit="phase") as pipeline_pbar:
1004
+ # Skip download phase if already completed (resume case)
1005
+ if manifest.download_phase.completed:
1006
+ logger.info("Download phase already completed, skipping")
1007
+ shard_info = [
1008
+ {"shard_path": path, "item_count": 0, "partition_key": ""}
1009
+ for path in manifest.download_phase.shards_written
1010
+ ]
1011
+ pipeline_pbar.update(1)
1012
+ else:
1013
+ pipeline_pbar.set_description("Processing URLs")
1014
+ manifest.status = JobStatus.DOWNLOADING
1015
+ manifest.save(self.storage, self.config.output_catalog)
1016
+ job_logger.log_phase_start("download")
1017
+
1018
+ shard_info = self._process_urls_distributed(urls, job_id=job_id)
1019
+ total_shards = len(shard_info)
1020
+ total_items = sum(s["item_count"] for s in shard_info)
1021
+ logger.info(f"Generated {total_shards:,} shards ({total_items:,} items) in scratch space")
1022
+
1023
+ # Update manifest with download phase completion
1024
+ manifest.download_phase.completed = True
1025
+ manifest.download_phase.shards_written = [s["shard_path"] for s in shard_info]
1026
+ manifest.download_phase.urls_processed = len(urls)
1027
+ manifest.save(self.storage, self.config.output_catalog)
1028
+ job_logger.log_phase_complete("download", {"shards": total_shards, "items": total_items})
1029
+ pipeline_pbar.update(1)
1030
+
1031
+ # Phase 3: Consolidate shards into final catalog
1032
+ # Skip if already completed (resume case)
1033
+ if manifest.consolidation_phase.completed:
1034
+ logger.info("Consolidation phase already completed, skipping")
1035
+ pipeline_pbar.update(1)
1036
+ final_stats = {} # Will be re-computed from catalog
1037
+ else:
1038
+ pipeline_pbar.set_description("Consolidating shards")
1039
+ manifest.status = JobStatus.CONSOLIDATING
1040
+ manifest.save(self.storage, self.config.output_catalog)
1041
+ job_logger.log_phase_start("consolidation")
1042
+
1043
+ # Filter out already-completed partitions for resume
1044
+ completed_partitions = set(manifest.consolidation_phase.completed_partitions)
1045
+ final_stats = self._consolidate_shards(
1046
+ shard_info,
1047
+ skip_partitions=completed_partitions,
1048
+ manifest=manifest,
1049
+ )
1050
+ logger.info(f"Consolidated into {len(final_stats):,} final partitions")
1051
+
1052
+ manifest.consolidation_phase.completed = True
1053
+ manifest.save(self.storage, self.config.output_catalog)
1054
+ job_logger.log_phase_complete("consolidation", {"partitions": len(final_stats)})
1055
+ pipeline_pbar.update(1)
1056
+
1057
+ # Phase 4: Cleanup scratch space
1058
+ pipeline_pbar.set_description("Cleaning up")
1059
+ self._cleanup_scratch(shard_info)
1060
+ logger.info("Scratch space cleanup completed")
1061
+ pipeline_pbar.update(1)
1062
+
1063
+ # Mark job as completed
1064
+ manifest.status = JobStatus.COMPLETED
1065
+ manifest.save(self.storage, self.config.output_catalog)
1066
+ job_logger.log("INFO", "Job completed successfully")
1067
+
1068
+ except Exception as e:
1069
+ # Mark job as failed
1070
+ manifest.status = JobStatus.FAILED
1071
+ manifest.error = str(e)
1072
+ manifest.save(self.storage, self.config.output_catalog)
1073
+ job_logger.log_error(f"Job failed: {e}")
1074
+ raise
1075
+
1076
+ # Finish statistics tracking
1077
+ self.stats.finish_processing()
1078
+
1079
+ # Update statistics with consolidation results
1080
+ for _partition_key, stats in final_stats.items():
1081
+ self.stats.record_consolidation(
1082
+ new_items=stats.get("new_items", 0),
1083
+ existing_items=stats.get("existing_items", 0),
1084
+ duplicates_removed=stats.get("duplicates_removed", 0),
1085
+ )
1086
+
1087
+ # Generate schema if requested
1088
+ if self.config.generate_schema:
1089
+ logger.info("Generating catalog schema metadata...")
1090
+ schema_gen = schema_generator.SchemaGenerator(self.config, self.grid, self.storage, self.stats)
1091
+ schema_gen.generate_catalog_schema(final_stats, self.config.schema_filename)
1092
+
1093
+ # Summary with enhanced statistics
1094
+ stats_summary = self.stats.get_summary()
1095
+ logger.info("=" * 80)
1096
+ logger.info("PIPELINE COMPLETED SUCCESSFULLY")
1097
+ logger.info(f"Job ID: {job_id}")
1098
+ logger.info(f"Total partitions: {len(final_stats)}")
1099
+ logger.info(f"Unique granules: {stats_summary['unique_granules']:,}")
1100
+ logger.info(f"Stored references: {stats_summary['stored_references']:,}")
1101
+ logger.info(
1102
+ f"Overhead: {stats_summary['overhead']['overhead_percentage']:.1f}% "
1103
+ f"({stats_summary['overhead']['spanning_items']:,} spanning items)"
1104
+ )
1105
+ total_new = sum(s["new_items"] for s in final_stats.values())
1106
+ logger.info(f"New items: {total_new:,}")
1107
+ if self.config.generate_schema:
1108
+ logger.info(f"Schema metadata: {self.config.output_catalog}/{self.config.schema_filename}")
1109
+ logger.info("=" * 80)
1110
+
1111
+ return final_stats
1112
+
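A usage sketch of the job-tracking behaviour above (the job ID is arbitrary; config is a ProcessingConfig as shown earlier):

    from earthcatalog.ingestion_pipeline import LocalProcessor, STACIngestionPipeline

    pipeline = STACIngestionPipeline(config, LocalProcessor())

    # First run under an explicit job ID (a UUID is generated if omitted)
    stats = pipeline.run(job_id="ingest-2020-backfill")

    # If that run was interrupted, resume the most recent incomplete job;
    # phases already marked complete in the manifest are skipped.
    stats = pipeline.run(resume=True)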
1113
+ def _run_simple(self, urls: list[str], job_id: str) -> dict[str, Any]:
1114
+ """Execute simplified pipeline for small batches without full distributed overhead.
1115
+
1116
+ This method provides a streamlined processing path for datasets below the
1117
+ batch_threshold. It still:
1118
+ - Uses async HTTP for performance
1119
+ - Creates a job manifest for observability
1120
+ - Uses the same partitioning and consolidation logic
1121
+
1122
+ But it skips:
1123
+ - Resume/checkpoint capability (not needed for small batches)
1124
+ - Complex worker distribution logic
1125
+ - Multi-phase progress tracking
1126
+
1127
+ Args:
1128
+ urls: List of URLs to process.
1129
+ job_id: Job identifier for logging.
1130
+
1131
+ Returns:
1132
+ Dictionary mapping partition keys to consolidation stats.
1133
+ """
1134
+ logger.info("=" * 80)
1135
+ logger.info("Running simplified pipeline (small batch mode)")
1136
+ logger.info(f"Job ID: {job_id}")
1137
+ logger.info(f"URLs: {len(urls):,}")
1138
+ logger.info("=" * 80)
1139
+
1140
+ # Start statistics tracking
1141
+ self.stats.start_processing()
1142
+
1143
+ # Create minimal job manifest for observability
1144
+ manifest = JobManifest(
1145
+ job_id=job_id,
1146
+ input_urls_count=len(urls),
1147
+ config_hash=self.config.config_hash(),
1148
+ )
1149
+ manifest.status = JobStatus.DOWNLOADING
1150
+ manifest.save(self.storage, self.config.output_catalog)
1151
+
1152
+ # Process all URLs in a single batch using async HTTP
1153
+ worker_tag = f"simple-{uuid.uuid4().hex[:8]}"
1154
+ all_items: list[dict[str, Any]] = []
1155
+
1156
+ # Use async HTTP if available and enabled
1157
+ if self.config.enable_concurrent_http and HAS_ASYNC_HTTP:
1158
+ logger.info(f"Downloading {len(urls):,} items with async HTTP...")
1159
+ items = self._download_stac_items_batch_async(urls, worker_id=0)
1160
+ for item in items:
1161
+ if item:
1162
+ all_items.append(item)
1163
+ self.stats.record_url_processed(success=True)
1164
+ else:
1165
+ self.stats.record_url_processed(success=False)
1166
+ else:
1167
+ # Fallback to sync processing
1168
+ logger.info(f"Downloading {len(urls):,} items with sync HTTP...")
1169
+ with tqdm(total=len(urls), desc="Downloading", unit="items") as pbar:
1170
+ for url in urls:
1171
+ fetched_item = self._download_stac_item(url)
1172
+ if fetched_item:
1173
+ all_items.append(fetched_item)
1174
+ self.stats.record_url_processed(success=True)
1175
+ else:
1176
+ self.stats.record_url_processed(success=False)
1177
+ pbar.update(1)
1178
+
1179
+ logger.info(f"Downloaded {len(all_items):,} items successfully")
1180
+
1181
+ if not all_items:
1182
+ logger.warning("No items downloaded, nothing to process")
1183
+ manifest.status = JobStatus.COMPLETED
1184
+ manifest.save(self.storage, self.config.output_catalog)
1185
+ self.stats.finish_processing()
1186
+ return {}
1187
+
1188
+ # Write partition shards
1189
+ logger.info("Writing partition shards...")
1190
+ shard_info = self._write_partition_shards(all_items, worker_tag, self.stats)
1191
+
1192
+ # Update manifest
1193
+ manifest.download_phase.completed = True
1194
+ manifest.download_phase.shards_written = [s["shard_path"] for s in shard_info]
1195
+ manifest.download_phase.urls_processed = len(urls)
1196
+ manifest.status = JobStatus.CONSOLIDATING
1197
+ manifest.save(self.storage, self.config.output_catalog)
1198
+
1199
+ # Consolidate shards
1200
+ logger.info("Consolidating partitions...")
1201
+ final_stats = self._consolidate_shards(shard_info, manifest=manifest)
1202
+
1203
+ # Cleanup scratch
1204
+ self._cleanup_scratch(shard_info)
1205
+
1206
+ # Mark complete
1207
+ manifest.status = JobStatus.COMPLETED
1208
+ manifest.consolidation_phase.completed = True
1209
+ manifest.save(self.storage, self.config.output_catalog)
1210
+
1211
+ # Finish statistics
1212
+ self.stats.finish_processing()
1213
+
1214
+ # Update statistics with consolidation results
1215
+ for _partition_key, stats in final_stats.items():
1216
+ self.stats.record_consolidation(
1217
+ new_items=stats.get("new_items", 0),
1218
+ existing_items=stats.get("existing_items", 0),
1219
+ duplicates_removed=stats.get("duplicates_removed", 0),
1220
+ )
1221
+
1222
+ # Generate schema if requested
1223
+ if self.config.generate_schema:
1224
+ logger.info("Generating catalog schema metadata...")
1225
+ schema_gen = schema_generator.SchemaGenerator(self.config, self.grid, self.storage, self.stats)
1226
+ schema_gen.generate_catalog_schema(final_stats, self.config.schema_filename)
1227
+
1228
+ # Summary
1229
+ stats_summary = self.stats.get_summary()
1230
+ logger.info("=" * 80)
1231
+ logger.info("SIMPLE PIPELINE COMPLETED SUCCESSFULLY")
1232
+ logger.info(f"Job ID: {job_id}")
1233
+ logger.info(f"Total partitions: {len(final_stats)}")
1234
+ logger.info(f"Unique granules: {stats_summary['unique_granules']:,}")
1235
+ total_new = sum(s.get("new_items", 0) for s in final_stats.values())
1236
+ logger.info(f"New items: {total_new:,}")
1237
+ logger.info("=" * 80)
1238
+
1239
+ return final_stats
1240
+
1241
+ def _read_input_urls(self) -> list[str]:
1242
+ """Read URLs from input file(s) using appropriate format reader.
1243
+
1244
+ If input_pattern is configured, discovers all matching files and reads URLs
1245
+ from each one. Otherwise, reads from the single input_file.
1246
+
1247
+ Returns:
1248
+ List of URLs from all discovered files.
1249
+ """
1250
+ # Check if we should use pattern-based file discovery
1251
+ if self.config.input_pattern:
1252
+ return self._read_urls_from_pattern()
1253
+
1254
+ # Traditional single file mode
1255
+ return self._read_urls_from_file(self.config.input_file)
1256
+
1257
+ def _read_urls_from_file(self, file_path: str) -> list[str]:
1258
+ """Read URLs from a single file using appropriate format reader.
1259
+
1260
+ Args:
1261
+ file_path: Path to the input file (local or S3).
1262
+
1263
+ Returns:
1264
+ List of URLs extracted from the file.
1265
+ """
1266
+ ReaderFactory = input_readers.ReaderFactory
1267
+
1268
+ # Determine format
1269
+ if self.config.input_format == "auto":
1270
+ format_name = ReaderFactory.auto_detect_format(file_path)
1271
+ else:
1272
+ format_name = self.config.input_format
1273
+
1274
+ # Get appropriate reader
1275
+ reader = ReaderFactory.get_reader(format_name)
1276
+
1277
+ # Read URLs
1278
+ return reader.read_urls(file_path, self.config.url_column)
1279
+
1280
+ def _read_urls_from_pattern(self) -> list[str]:
1281
+ """Read URLs from multiple files matching a glob pattern.
1282
+
1283
+ Uses glob (local paths) or fsspec (s3:// paths) to discover files matching the pattern,
1284
+ then reads URLs from each discovered file.
1285
+
1286
+ Returns:
1287
+ Concatenated list of URLs from all matching files.
1288
+
1289
+ Raises:
1290
+ ValueError: If no files match the pattern.
1291
+ """
1292
+ pattern = self.config.input_pattern
1293
+ all_urls: list[str] = []
1294
+
1295
+ logger.info(f"Discovering files matching pattern: {pattern}")
1296
+
1297
+ # Check if pattern is for S3 or local filesystem
1298
+ if pattern.startswith("s3://"):
1299
+ # Use fsspec for S3 pattern matching
1300
+ all_urls = self._read_s3_pattern(pattern)
1301
+ else:
1302
+ # Use glob for local filesystem
1303
+ matching_files = glob_module.glob(pattern, recursive=True)
1304
+
1305
+ if not matching_files:
1306
+ raise ValueError(f"No files found matching pattern: {pattern}")
1307
+
1308
+ logger.info(f"Found {len(matching_files)} files matching pattern")
1309
+
1310
+ # Read URLs from each file
1311
+ for file_path in matching_files:
1312
+ logger.info(f"Reading URLs from: {file_path}")
1313
+ urls = self._read_urls_from_file(file_path)
1314
+ all_urls.extend(urls)
1315
+
1316
+ logger.info(f"Total URLs read from all files: {len(all_urls)}")
1317
+ return all_urls
1318
+
1319
+ def _read_s3_pattern(self, pattern: str) -> list[str]:
1320
+ """Read URLs from S3 files matching a glob pattern.
1321
+
1322
+ Args:
1323
+ pattern: S3 path with glob pattern (e.g., s3://bucket/bulk/2020_*.ndjson).
1324
+
1325
+ Returns:
1326
+ Concatenated list of URLs from all matching S3 files.
1327
+
1328
+ Raises:
1329
+ ValueError: If no files match the pattern or fsspec not available.
1330
+ """
1331
+ if not pattern.startswith("s3://"):
1332
+ raise ValueError(f"S3 pattern must start with 's3://', got: {pattern}")
1333
+
1334
+ # Pattern matching via an fsspec S3 filesystem (fsspec exposes glob() on the filesystem
+ # object, and the returned keys come back without the "s3://" scheme)
1335
+ matching_files = [f"s3://{path}" for path in fsspec.filesystem("s3").glob(pattern)]
1336
+
1337
+ if not matching_files:
1338
+ raise ValueError(f"No S3 files found matching pattern: {pattern}")
1339
+
1340
+ logger.info(f"Found {len(matching_files)} S3 files matching pattern")
1341
+
1342
+ # Read URLs from each file
1343
+ all_urls: list[str] = []
1344
+ for file_path in matching_files:
1345
+ logger.info(f"Reading URLs from S3: {file_path}")
1346
+ urls = self._read_urls_from_file(file_path)
1347
+ all_urls.extend(urls)
1348
+
1349
+ return all_urls
1350
+
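+ # NOTE (illustrative sketch, not part of the original module): the S3 discovery above
+ # relies on filesystem-level globbing, which returns bucket/key paths without the
+ # "s3://" scheme. A minimal standalone equivalent, assuming the s3fs backend is
+ # installed and the bucket/prefix below are hypothetical:
+ #
+ #     import fsspec
+ #
+ #     fs = fsspec.filesystem("s3")
+ #     keys = fs.glob("s3://my-bucket/bulk/2020_*.ndjson")   # -> ["my-bucket/bulk/2020_01.ndjson", ...]
+ #     files = [f"s3://{k}" for k in keys]                   # re-attach the scheme for downstream readers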
1351
+ def _process_urls_distributed(self, urls: list[str], job_id: str | None = None) -> list[dict[str, Any]]:
1352
+ """Process URLs in parallel and write to scratch location.
1353
+
1354
+ For distributed processing (Dask), uses serializable worker functions from
1355
+ workers.py. For local processing, uses inline closures that capture self.
1356
+
1357
+ Worker stats are returned alongside shard info and merged into the
1358
+ pipeline's main stats at the end.
1359
+
1360
+ Args:
1361
+ urls: List of STAC item URLs to process.
1362
+ job_id: Job ID for tracking. If None, generates a new UUID.
1363
+
1364
+ Returns:
1365
+ List of shard info dictionaries.
1366
+ """
1367
+ # Use provided job_id or generate one
1368
+ if job_id is None:
1369
+ job_id = str(uuid.uuid4())
1370
+ logger.info(f"Starting job {job_id[:8]}...")
1371
+
1372
+ # Serialize config for distributed workers
1373
+ config_dict = self.config.to_dict()
1374
+
1375
+ def process_url_batch(
1376
+ url_batch: list[str], worker_id: int, **kwargs: Any
1377
+ ) -> tuple[list[dict[str, Any]], IngestionStatistics]:
1378
+ """Process a batch of URLs on a single worker with async HTTP support.
1379
+
1380
+ This closure captures self and is used for LocalProcessor.
1381
+ For DaskDistributedProcessor, the serializable workers.process_url_batch is used instead.
1382
+
1383
+ Returns:
1384
+ Tuple of (shard_info_list, worker_statistics)
1385
+ """
1386
+ # Create worker-local statistics collector
1387
+ worker_stats = IngestionStatistics()
1388
+
1389
+ # Choose processing method based on configuration
1390
+ if self.config.enable_concurrent_http and HAS_ASYNC_HTTP and len(url_batch) >= self.config.batch_size:
1391
+ # Use async HTTP processing for better performance
1392
+ shards = self._process_batch_with_async_http(url_batch, worker_id, worker_stats)
1393
+ return (shards, worker_stats)
1394
+
1395
+ # Traditional synchronous processing (original implementation)
1396
+ batch_shards = []
1397
+ current_shard_items = []
1398
+ worker_tag = f"worker-{worker_id}-{uuid.uuid4().hex[:8]}"
1399
+
1400
+ # Add progress bar for this batch
1401
+ with tqdm(total=len(url_batch), desc=f"Worker {worker_id}", unit="urls", leave=False) as pbar:
1402
+ for _idx, url in enumerate(url_batch):
1403
+ try:
1404
+ # Download and parse STAC item
1405
+ item = self._download_stac_item(url)
1406
+ if not item:
1407
+ # Record failed URL
1408
+ worker_stats.record_url_processed(success=False)
1409
+ pbar.update(1)
1410
+ continue
1411
+
1412
+ # Record successful URL
1413
+ worker_stats.record_url_processed(success=True)
1414
+
1415
+ # Add to current shard
1416
+ current_shard_items.append(item)
1417
+
1418
+ # Write shard when it reaches target size
1419
+ if len(current_shard_items) >= self.config.items_per_shard:
1420
+ # Use partition-aware sharding
1421
+ partition_shards = self._write_partition_shards(
1422
+ current_shard_items, worker_tag, worker_stats
1423
+ )
1424
+ batch_shards.extend(partition_shards)
1425
+ current_shard_items = []
1426
+
1427
+ except (
1428
+ requests.exceptions.RequestException,
1429
+ json.JSONDecodeError,
1430
+ OSError,
1431
+ ValueError,
1432
+ ) as e:
1433
+ logger.error(f"Error processing URL {url}: {e}")
1434
+ worker_stats.record_url_processed(success=False)
1435
+ pbar.update(1)
1436
+ continue
1437
+
1438
+ pbar.update(1)
1439
+
1440
+ # Write final shard for this batch
1441
+ if current_shard_items:
1442
+ # Use partition-aware sharding
1443
+ partition_shards = self._write_partition_shards(current_shard_items, worker_tag, worker_stats)
1444
+ batch_shards.extend(partition_shards)
1445
+
1446
+ total_items = sum(s["item_count"] for s in batch_shards)
1447
+ logger.info(f"Worker {worker_id} completed: {len(batch_shards)} shards, {total_items} items")
1448
+ return (batch_shards, worker_stats)
1449
+
1450
+ # Distribute URLs to workers
1451
+ url_chunks = self._chunk_urls(urls, self.config.max_workers)
1452
+
1453
+ # Process in parallel - pass config_dict and job_id for Dask
1454
+ all_results = self.processor.process_urls(
1455
+ url_chunks,
1456
+ process_url_batch,
1457
+ config_dict=config_dict,
1458
+ job_id=job_id,
1459
+ )
1460
+
1461
+ # Merge worker statistics into main stats and flatten shard results
1462
+ all_shards = []
1463
+ for result in all_results:
1464
+ # Handle both tuple format (LocalProcessor) and dict format (DaskDistributedProcessor)
1465
+ if isinstance(result, tuple):
1466
+ shards, worker_stats = result
1467
+ self.stats.merge(worker_stats)
1468
+ all_shards.extend(shards)
1469
+ elif isinstance(result, dict):
1470
+ # Dict format from workers.process_url_batch
1471
+ shards = result.get("shards", [])
1472
+ stats_dict = result.get("stats", {})
1473
+ failed_urls = result.get("failed_urls", [])
1474
+
1475
+ # Reconstruct stats from dict
1476
+ worker_stats = IngestionStatistics()
1477
+ for _ in range(stats_dict.get("urls_processed", 0)):
1478
+ worker_stats.record_url_processed(success=True)
1479
+ for _ in range(stats_dict.get("urls_failed", 0)):
1480
+ worker_stats.record_url_processed(success=False)
1481
+
1482
+ self.stats.merge(worker_stats)
1483
+ all_shards.extend(shards)
1484
+
1485
+ if failed_urls:
1486
+ logger.warning(f"Failed URLs: {len(failed_urls)}")
1487
+ else:
1488
+ logger.error(f"Unexpected result type: {type(result)}")
1489
+
1490
+ return all_shards
1491
+
1492
+ def _download_stac_items_batch_async(self, urls: list[str], worker_id: int) -> list[dict[str, Any]]:
1493
+ """Download STAC items using async HTTP client for improved performance.
1494
+
1495
+ This method provides a high-performance alternative to sequential downloads
1496
+ by utilizing concurrent HTTP requests with connection pooling and rate limiting.
1497
+
1498
+ Args:
1499
+ urls: List of URLs to download
1500
+ worker_id: Worker identifier for logging and progress tracking
1501
+
1502
+ Returns:
1503
+ List of successfully downloaded STAC item dictionaries
1504
+ """
1505
+ if not HAS_ASYNC_HTTP:
1506
+ # Fallback to synchronous processing
1507
+ logger.warning("Async HTTP not available, falling back to synchronous processing")
1508
+ items = []
1509
+ for url in urls:
1510
+ item = self._download_stac_item(url)
1511
+ if item is not None:
1512
+ items.append(item)
1513
+ return items
1514
+
1515
+ def run_async_download() -> list[dict[str, Any]]:
1516
+ """Run async download in a new event loop with proper context handling.
1517
+
1518
+ This nested function handles the complexities of running async code from
1519
+ a synchronous context, particularly dealing with existing event loops
1520
+ in environments like Jupyter notebooks or async web frameworks.
1521
+
1522
+ Returns:
1523
+ List of successfully downloaded STAC item dictionaries.
1524
+
1525
+ Note:
1526
+ Automatically detects running event loops and uses thread pool
1527
+ execution when necessary to avoid "RuntimeError: cannot be called
1528
+ from a running event loop" issues.
1529
+ """
1530
+ try:
1531
+ # Check if we're already in an async context
1532
+ asyncio.current_task()
1533
+ # If we reach here, there's a running event loop
1534
+ # We need to run in a thread to avoid issues
1535
+ import concurrent.futures
1536
+
1537
+ with concurrent.futures.ThreadPoolExecutor() as executor:
1538
+ future = executor.submit(asyncio.run, self._async_download_worker(urls))
1539
+ return future.result()
1540
+ except RuntimeError:
1541
+ # No event loop exists or not in async context, create a new one
1542
+ return asyncio.run(self._async_download_worker(urls))
1543
+
1544
+ try:
1545
+ return run_async_download()
1546
+ except (RuntimeError, asyncio.CancelledError, OSError) as e:
1547
+ logger.error(f"Async download failed for worker {worker_id}, falling back to sync: {e}")
1548
+ # Fallback to synchronous processing
1549
+ items = []
1550
+ for url in urls:
1551
+ item = self._download_stac_item(url)
1552
+ if item is not None:
1553
+ items.append(item)
1554
+ return items
1555
+
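+ # NOTE (illustrative sketch, not part of the original module): the sync-to-async bridge
+ # used above reduces to this general pattern (the module checks asyncio.current_task(),
+ # which performs the same running-loop detection); coro() is a hypothetical coroutine.
+ #
+ #     import asyncio
+ #     import concurrent.futures
+ #
+ #     async def coro() -> str:
+ #         return "done"
+ #
+ #     def run_from_sync() -> str:
+ #         try:
+ #             asyncio.get_running_loop()      # raises RuntimeError when no loop is running
+ #         except RuntimeError:
+ #             return asyncio.run(coro())      # plain sync context: create a fresh loop
+ #         # a loop is already running (e.g. Jupyter): run in a worker thread instead
+ #         with concurrent.futures.ThreadPoolExecutor() as pool:
+ #             return pool.submit(asyncio.run, coro()).result()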
1556
+ async def _async_download_worker(self, urls: list[str]) -> list[dict[str, Any]]:
1557
+ """Internal async worker for downloading STAC items with high concurrency.
1558
+
1559
+ This method serves as the async wrapper that calls the async HTTP client
1560
+ with the current pipeline configuration. It handles the async/await mechanics
1561
+ while maintaining error handling compatibility.
1562
+
1563
+ Args:
1564
+ urls: List of URLs to download concurrently.
1565
+
1566
+ Returns:
1567
+ List of successfully downloaded and parsed STAC item dictionaries.
1568
+ Failed downloads are filtered out and logged separately.
1569
+
1570
+ Raises:
1571
+ RuntimeError: If async HTTP client is not available when called.
1572
+
1573
+ Example:
1574
+ >>> urls = ["https://example.com/item1.json", "https://example.com/item2.json"]
1575
+ >>> items = await self._async_download_worker(urls)
1576
+ >>> print(f"Downloaded {len(items)} items")
1577
+ """
1578
+ if not HAS_ASYNC_HTTP or download_stac_items_async is None:
1579
+ raise RuntimeError("Async HTTP client not available")
1580
+
1581
+ return await download_stac_items_async(
1582
+ urls=urls,
1583
+ concurrent_requests=self.config.concurrent_requests,
1584
+ connection_pool_size=self.config.connection_pool_size,
1585
+ request_timeout=self.config.request_timeout,
1586
+ retry_attempts=self.config.retry_attempts,
1587
+ retry_delay=self.config.retry_delay,
1588
+ batch_size=self.config.batch_size,
1589
+ )
1590
+
1591
+ def _process_batch_with_async_http(
1592
+ self, url_batch: list[str], worker_id: int, stats: IngestionStatistics
1593
+ ) -> list[dict[str, Any]]:
1594
+ """Process URL batch using async HTTP with adaptive batching and memory management.
1595
+
1596
+ This method processes URLs in smaller concurrent batches while maintaining
1597
+ the same shard writing patterns as the original synchronous version. It provides
1598
+ the bridge between sync pipeline processing and async HTTP downloading.
1599
+
1600
+ The method adapts batch sizes based on configuration and processes URLs in
1601
+ sub-batches to manage memory usage while maximizing concurrency within each batch.
1602
+
1603
+ Args:
1604
+ url_batch: List of URLs to process in this batch.
1605
+ worker_id: Unique identifier for this worker process, used for logging
1606
+ and temporary file naming to avoid conflicts.
+ stats: Worker-local IngestionStatistics collector that URL success/failure
+ counts and shard statistics are recorded into.
1607
+
1608
+ Returns:
1609
+ List of shard info dictionaries describing the partition shards written for
1610
+ this batch (shard path, partition key, item count, worker id).
1611
+
1612
+ Note:
1613
+ This method maintains the same error handling semantics as the original
1614
+ synchronous version - failed downloads return empty lists and errors are logged.
1615
+
1616
+ Example:
1617
+ >>> batch = ["https://example.com/item1.json", "https://example.com/item2.json"]
1618
+ >>> shards = pipeline._process_batch_with_async_http(batch, worker_id=1, stats=IngestionStatistics())
1619
+ >>> print(f"Wrote {len(shards)} shards from this batch")
1620
+ """
1621
+ batch_shards = []
1622
+ worker_tag = f"worker-{worker_id}-{uuid.uuid4().hex[:8]}"
1623
+
1624
+ # Process URLs in smaller async batches for memory management
1625
+ async_batch_size = min(self.config.batch_size, len(url_batch))
1626
+
1627
+ with tqdm(total=len(url_batch), desc=f"Worker {worker_id} (Async)", unit="urls", leave=False) as pbar:
1628
+ for i in range(0, len(url_batch), async_batch_size):
1629
+ batch_urls = url_batch[i : i + async_batch_size]
1630
+
1631
+ # Download batch concurrently
1632
+ items = self._download_stac_items_batch_async(batch_urls, worker_id)
1633
+
1634
+ # Record URL processing stats (async batch) - use passed stats
1635
+ success_count = len(items)
1636
+ failed_count = len(batch_urls) - success_count
1637
+ for _ in range(success_count):
1638
+ stats.record_url_processed(success=True)
1639
+ for _ in range(failed_count):
1640
+ stats.record_url_processed(success=False)
1641
+
1642
+ # Process items into shards
1643
+ if items:
1644
+ partition_shards = self._write_partition_shards(items, worker_tag, stats)
1645
+ batch_shards.extend(partition_shards)
1646
+
1647
+ pbar.update(len(batch_urls))
1648
+
1649
+ # Log completion stats
1650
+ total_items = sum(shard.get("item_count", 0) for shard in batch_shards)
1651
+ logger.info(f"Worker {worker_id} completed (async): {len(batch_shards)} shards, {total_items} items")
1652
+
1653
+ return batch_shards
1654
+
1655
+ def _validate_and_fix_items(self, items: list[dict[str, Any]]) -> list[dict[str, Any]]:
1656
+ """Validate STAC items and optionally fix geometry/bbox issues.
1657
+
1658
+ Performs validation on each item including:
1659
+ - Geometry validity check (self-intersection, ring orientation)
1660
+ - Bbox-geometry consistency check
1661
+ - Optionally fixes invalid geometries and mismatched bboxes
1662
+
1663
+ Args:
1664
+ items: List of STAC item dictionaries
1665
+
1666
+ Returns:
1667
+ List of validated (and optionally corrected) STAC items
1668
+ """
1669
+ from .validation import validate_stac_item
1670
+
1671
+ validated_items = []
1672
+ validation_stats = {"total": len(items), "warnings": 0, "fixed_geometry": 0, "fixed_bbox": 0}
1673
+
1674
+ for item in items:
1675
+ result, corrected_item = validate_stac_item(
1676
+ item,
1677
+ fix_geometry=self.config.fix_invalid_geometry,
1678
+ bbox_tolerance=self.config.bbox_tolerance,
1679
+ )
1680
+
1681
+ # Track statistics
1682
+ if result.warnings:
1683
+ validation_stats["warnings"] += len(result.warnings)
1684
+ if self.config.log_validation_warnings:
1685
+ for warning in result.warnings:
1686
+ logger.debug(f"Validation warning for item {item.get('id')}: {warning}")
1687
+
1688
+ if result.metadata.get("geometry_fixed"):
1689
+ validation_stats["fixed_geometry"] += 1
1690
+ if result.metadata.get("bbox_corrected"):
1691
+ validation_stats["fixed_bbox"] += 1
1692
+
1693
+ # Use corrected item if available, otherwise use original
1694
+ if corrected_item is not None:
1695
+ validated_items.append(corrected_item)
1696
+ else:
1697
+ validated_items.append(item)
1698
+
1699
+ # Log summary
1700
+ if validation_stats["warnings"] > 0 or validation_stats["fixed_geometry"] > 0 or validation_stats["fixed_bbox"] > 0:
1701
+ logger.debug(
1702
+ f"Validation: {validation_stats['total']} items, "
1703
+ f"{validation_stats['warnings']} warnings, "
1704
+ f"{validation_stats['fixed_geometry']} geometries fixed, "
1705
+ f"{validation_stats['fixed_bbox']} bboxes corrected"
1706
+ )
1707
+
1708
+ return validated_items
1709
+
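+ # NOTE (illustrative sketch, not part of the original module): the per-item call shape
+ # used above; the item dict is hypothetical and whether a corrected copy is produced
+ # depends on the validator's findings.
+ #
+ #     from earthcatalog.validation import validate_stac_item
+ #
+ #     item = {"id": "demo", "geometry": {"type": "Point", "coordinates": [10.0, 20.0]},
+ #             "bbox": [0.0, 0.0, 1.0, 1.0], "properties": {"datetime": "2024-01-15T00:00:00Z"}}
+ #     result, corrected = validate_stac_item(item, fix_geometry=True, bbox_tolerance=1e-6)
+ #     print(result.warnings, result.metadata.get("bbox_corrected"), corrected is not None)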
1710
+ def _write_partition_shards(
1711
+ self,
1712
+ items: list[dict[str, Any]],
1713
+ worker_id: str,
1714
+ stats: IngestionStatistics | None = None,
1715
+ ) -> list[dict[str, Any]]:
1716
+ """Group items by partition and write partition-aware shards.
1717
+
1718
+ Also records comprehensive statistics for each item including:
1719
+ - Unique item tracking via HyperLogLog
1720
+ - Spanning item detection and tile counts
1721
+ - Spatial/temporal distribution
1722
+ - Data quality metrics
1723
+
1724
+ Args:
1725
+ items: List of STAC items to process
1726
+ worker_id: Unique identifier for this worker
1727
+ stats: Optional IngestionStatistics to record to. If None, uses self.stats.
+
+ Returns:
+ List of shard info dictionaries (shard path, partition key, item count, worker id).
1728
+ """
1729
+ # Use provided stats or fall back to pipeline's stats
1730
+ target_stats = stats if stats is not None else self.stats
1731
+
1732
+ # Validate and optionally fix items if validation is enabled
1733
+ if self.config.enable_validation:
1734
+ items = self._validate_and_fix_items(items)
1735
+
1736
+ # Group items by partition key while recording statistics
1737
+ partition_groups: dict[str, list[dict[str, Any]]] = {}
1738
+
1739
+ for item in items:
1740
+ # Get geometry and compute tiles with spanning detection
1741
+ geom = item.get("geometry")
1742
+ mission = self._extract_mission(item)
1743
+
1744
+ if not geom:
1745
+ tiles: list[str] = []
1746
+ is_spanning = False
1747
+ routed_to_global = False
1748
+ else:
1749
+ tiles, is_spanning = self.grid.tiles_for_geometry_with_spanning_detection(geom)
1750
+ threshold = self.config.global_partition_threshold
1751
+ routed_to_global = self.config.enable_global_partitioning and len(tiles) > threshold
1752
+
1753
+ # Record item statistics
1754
+ target_stats.record_item(
1755
+ item=item,
1756
+ tiles=tiles,
1757
+ is_spanning=is_spanning,
1758
+ routed_to_global=routed_to_global,
1759
+ mission=mission,
1760
+ )
1761
+
1762
+ # Compute partition key and group
1763
+ partition_key = self._compute_partition_key(item)
1764
+ if partition_key not in partition_groups:
1765
+ partition_groups[partition_key] = []
1766
+ partition_groups[partition_key].append(item)
1767
+
1768
+ # Write shard for each partition
1769
+ shard_info = []
1770
+ for partition_key, partition_items in partition_groups.items():
1771
+ if not partition_items:
1772
+ continue
1773
+
1774
+ # Convert to GeoDataFrame using engine
1775
+ gdf = self.engine.items_to_geodataframe(partition_items)
1776
+
1777
+ # Sort items within shard
1778
+ if self.config.sort_key in gdf.columns:
1779
+ gdf = gdf.sort_values(self.config.sort_key, ascending=self.config.sort_ascending)
1780
+
1781
+ # Determine file extension based on output format
1782
+ file_ext = ".ndjson" if self.config.output_format == "ndjson" else ".parquet"
1783
+ shard_path = f"{self.config.scratch_location}/shards/{partition_key}/{worker_id}{file_ext}"
1784
+
1785
+ self.scratch_storage.makedirs(Path(shard_path).parent)
1786
+ self._write_partition_shard(gdf, shard_path)
1787
+
1788
+ shard_info.append(
1789
+ {
1790
+ "shard_path": shard_path,
1791
+ "partition_key": partition_key,
1792
+ "item_count": len(partition_items),
1793
+ "worker_id": worker_id,
1794
+ }
1795
+ )
1796
+
1797
+ return shard_info
1798
+
1799
+ def _write_partition_shard(self, gdf, shard_path: str):
1800
+ """Write partition shard in configured format."""
1801
+
1802
+ if self.config.output_format == "ndjson":
1803
+ # Convert to NDJSON format
1804
+ self._write_ndjson_shard(gdf, shard_path)
1805
+ else:
1806
+ # Default GeoParquet format
1807
+ with self.scratch_storage.open(shard_path, "wb") as f:
1808
+ gdf.to_parquet(f, index=False, compression="snappy")
1809
+
1810
+ def _write_ndjson_shard(self, gdf, shard_path: str):
1811
+ """Write GeoDataFrame as NDJSON format."""
1812
+ import orjson
1813
+
1814
+ try:
1815
+ # Use engine to convert GeoDataFrame back to STAC items
1816
+ features = self.engine.geodataframe_to_items(gdf)
1817
+
1818
+ with self.scratch_storage.open(shard_path, "wb") as f:
1819
+ for item in features:
1820
+ f.write(orjson.dumps(item) + b"\n")
1821
+
1822
+ except (TypeError, ValueError, OSError) as e:
1823
+ logger.warning(f"Failed to convert to STAC format, using raw GeoJSON: {e}")
1824
+
1825
+ # Fallback: write as GeoJSON features
1826
+ with self.scratch_storage.open(shard_path, "wb") as f:
1827
+ for _, row in gdf.iterrows():
1828
+ # Convert each row to a GeoJSON-like feature
1829
+ geom = row.get("geometry")
1830
+ if geom is not None and hasattr(geom, "__geo_interface__"):
1831
+ geometry_dict = geom.__geo_interface__
1832
+ else:
1833
+ geometry_dict = None
1834
+
1835
+ feature = {
1836
+ "type": "Feature",
1837
+ "geometry": geometry_dict,
1838
+ "properties": {k: v for k, v in row.items() if k != "geometry"},
1839
+ }
1840
+ f.write(orjson.dumps(feature) + b"\n")
1841
+
1842
+ def _consolidate_shards(
1843
+ self,
1844
+ shard_info: list[dict[str, Any]],
1845
+ skip_partitions: set[str] | None = None,
1846
+ manifest: JobManifest | None = None,
1847
+ ) -> dict[str, dict[str, int]]:
1848
+ """Consolidate worker shards into final partitioned catalog.
1849
+
1850
+ Args:
1851
+ shard_info: List of shard info dictionaries.
1852
+ skip_partitions: Set of partition keys to skip (for resume).
1853
+ manifest: Job manifest to update as partitions complete.
1854
+
1855
+ Returns:
1856
+ Dictionary mapping partition keys to consolidation stats.
1857
+ """
1858
+ if skip_partitions is None:
1859
+ skip_partitions = set()
1860
+
1861
+ def consolidate_partition_shards(partition_key: str, shard_paths: list[str], **kwargs: Any) -> dict[str, Any]:
1862
+ """Consolidate all shards for a single partition, merging with existing data."""
1863
+ return _consolidate_partition(partition_key, shard_paths)
1864
+
1865
+ def _consolidate_partition(partition_key: str, shard_paths: list[str]) -> dict[str, Any]:
1866
+ """Efficient consolidation approach using local staging for atomic S3 operations."""
1867
+ import io
1868
+ import tempfile
1869
+ from pathlib import Path
1870
+
1871
+ import pyarrow.parquet as pq
1872
+
1873
+ final_path = self._get_final_partition_path(partition_key)
1874
+ existing_count = 0
1875
+ new_count = 0
1876
+
1877
+ # Use temporary directory for local staging
1878
+ with tempfile.TemporaryDirectory(dir=self.config.temp_dir_location) as temp_dir:
1879
+ temp_existing_path = Path(temp_dir) / "existing.parquet"
1880
+ temp_merged_path = Path(temp_dir) / "merged.parquet"
1881
+
1882
+ all_items = []
1883
+
1884
+ # Step 1: Download existing partition to local temp (if exists)
1885
+ if self.storage.exists(final_path):
1886
+ try:
1887
+ logger.info(f"Partition {partition_key}: downloading existing data to local staging")
1888
+
1889
+ # Download to temporary file
1890
+ with self.storage.open(final_path, "rb") as remote_f:
1891
+ with open(temp_existing_path, "wb") as local_f:
1892
+ # Stream in chunks to avoid memory issues
1893
+ chunk_size = self.config.max_memory_per_partition_mb * 1024 * 1024 // 10
1894
+ while True:
1895
+ chunk = remote_f.read(chunk_size)
1896
+ if not chunk:
1897
+ break
1898
+ local_f.write(chunk)
1899
+
1900
+ # Read existing data from temp file
1901
+ table = pq.read_table(temp_existing_path)
1902
+ df = table.to_pandas()
1903
+ existing_gdf = gpd.GeoDataFrame(df)
1904
+ existing_count = len(existing_gdf)
1905
+ all_items.append(existing_gdf)
1906
+
1907
+ except (OSError, ValueError, RuntimeError) as e:
1908
+ logger.error(f"Error downloading existing partition {final_path}: {e}")
1909
+
1910
+ # Step 2: Read all new shards for this partition (using streaming)
1911
+ for shard_path in shard_paths:
1912
+ try:
1913
+ with self.scratch_storage.open(shard_path, "rb") as f:
1914
+ binary_data = f.read()
1915
+ table = pq.read_table(io.BytesIO(binary_data))
1916
+ df = table.to_pandas()
1917
+ gdf = gpd.GeoDataFrame(df)
1918
+ all_items.append(gdf)
1919
+
1920
+ except (OSError, ValueError, RuntimeError) as e:
1921
+ logger.error(f"Error reading shard {shard_path}: {e}")
1922
+ continue
1923
+
1924
+ if not all_items:
1925
+ return {"partition": partition_key, "item_count": 0, "existing_count": 0, "new_count": 0}
1926
+
1927
+ # Step 3: Merge data using chunked processing if enabled
1928
+ if self.config.enable_streaming_merge and len(all_items) > 1:
1929
+ # Process in batches to manage memory
1930
+ batch_size = max(1, self.config.max_memory_per_partition_mb // 100) # Conservative estimate
1931
+ merged = pd.DataFrame()
1932
+
1933
+ for i in range(0, len(all_items), batch_size):
1934
+ batch = all_items[i : i + batch_size]
1935
+ batch_merged = pd.concat(batch, ignore_index=True)
1936
+
1937
+ if len(merged) == 0:
1938
+ merged = batch_merged
1939
+ else:
1940
+ # Merge with existing and deduplicate incrementally
1941
+ merged = pd.concat([merged, batch_merged], ignore_index=True)
1942
+ merged = merged.drop_duplicates(subset=["id"], keep="last")
1943
+ else:
1944
+ # Standard merge for smaller datasets
1945
+ merged = pd.concat(all_items, ignore_index=True)
1946
+
1947
+ # Step 4: Deduplicate (keep="last" to prefer new data over existing)
1948
+ original_count = len(merged)
1949
+ merged = merged.drop_duplicates(subset=["id"], keep="last")
1950
+ duplicates_removed = original_count - len(merged)
1951
+
1952
+ # Step 5: Final sort
1953
+ if self.config.sort_key in merged.columns:
1954
+ merged = merged.sort_values(self.config.sort_key, ascending=self.config.sort_ascending)
1955
+
1956
+ # Step 6: Write merged data to local temp file
1957
+ merged_gdf = gpd.GeoDataFrame(merged)
1958
+ merged_gdf.to_parquet(temp_merged_path, index=False, compression="snappy")
1959
+
1960
+ # Step 7: Atomic upload to final location
1961
+ if final_path.startswith("s3://"):
1962
+ # For S3, use storage.upload which should handle large files appropriately
1963
+ self.storage.upload(str(temp_merged_path), final_path)
1964
+ else:
1965
+ # Standard atomic write
1966
+ self._write_final_partition(merged_gdf, final_path)
1967
+
1968
+ new_count = len(merged) - existing_count
1969
+ logger.info(
1970
+ f"Partition {partition_key}: {existing_count} existing + "
1971
+ f"{new_count} new = {len(merged)} total "
1972
+ f"({duplicates_removed} duplicates removed) [efficient]"
1973
+ )
1974
+
1975
+ return {
1976
+ "partition": partition_key,
1977
+ "item_count": len(merged),
1978
+ "existing_count": existing_count,
1979
+ "new_count": new_count,
1980
+ "duplicates_removed": duplicates_removed,
1981
+ "final_path": final_path,
1982
+ }
1983
+
1984
+ # Group shards by their target partition
1985
+ partition_shards = self._group_shards_by_partition(shard_info)
1986
+ logger.info(f"Grouped shards into {len(partition_shards)} partitions")
1987
+
1988
+ # Filter out already-completed partitions (for resume)
1989
+ if skip_partitions:
1990
+ original_count = len(partition_shards)
1991
+ partition_shards = {k: v for k, v in partition_shards.items() if k not in skip_partitions}
1992
+ skipped_count = original_count - len(partition_shards)
1993
+ if skipped_count > 0:
1994
+ logger.info(f"Skipping {skipped_count} already-completed partitions")
1995
+
1996
+ # Consolidate partitions in parallel - pass config_dict for Dask
1997
+ config_dict = self.config.to_dict()
1998
+ consolidation_results = self.processor.consolidate_shards(
1999
+ list(partition_shards.items()),
2000
+ consolidate_partition_shards,
2001
+ config_dict=config_dict,
2002
+ )
2003
+
2004
+ # Determine checkpoint strategy
2005
+ use_time_checkpoint = self.config.checkpoint_interval_seconds > 0
2006
+ use_count_checkpoint = self.config.checkpoint_partition_count > 0
2007
+ # Default to time-based (30s) if neither is set
2008
+ if not use_time_checkpoint and not use_count_checkpoint:
2009
+ use_time_checkpoint = True
2010
+ checkpoint_interval = 30
2011
+ else:
2012
+ checkpoint_interval = self.config.checkpoint_interval_seconds
2013
+
2014
+ last_checkpoint_time = time.time()
2015
+
2016
+ # Convert to stats - handle both dict formats
2017
+ final_stats = {}
2018
+ for result in consolidation_results:
2019
+ if result.get("item_count", 0) > 0:
2020
+ # Handle both "partition" key (from local) and "partition_key" (from workers.py)
2021
+ partition = result.get("partition") or result.get("partition_key", "unknown")
2022
+ final_stats[partition] = {
2023
+ "total_items": result.get("item_count", 0),
2024
+ "existing_items": result.get("existing_count", 0),
2025
+ "new_items": result.get("new_count", 0),
2026
+ "duplicates_removed": result.get("duplicates_removed", 0),
2027
+ }
2028
+
2029
+ # Update manifest if provided (for checkpointing)
2030
+ if manifest is not None:
2031
+ manifest.consolidation_phase.completed_partitions.append(partition)
2032
+ manifest.consolidation_phase.partitions_completed += 1
2033
+
2034
+ # Determine if we should checkpoint
2035
+ should_checkpoint = False
2036
+ current_time = time.time()
2037
+
2038
+ if use_time_checkpoint and (current_time - last_checkpoint_time) >= checkpoint_interval:
2039
+ should_checkpoint = True
2040
+ last_checkpoint_time = current_time
2041
+ elif (
2042
+ use_count_checkpoint
2043
+ and manifest.consolidation_phase.partitions_completed % self.config.checkpoint_partition_count
2044
+ == 0
2045
+ ):
2046
+ should_checkpoint = True
2047
+
2048
+ if should_checkpoint:
2049
+ manifest.save(self.storage, self.config.output_catalog)
2050
+
2051
+ return final_stats
2052
+
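+ # NOTE (illustrative sketch, not part of the original module): the merge-and-dedup step
+ # above prefers newly ingested rows over existing ones because keep="last" retains the
+ # most recently concatenated copy of each id. Toy example with plain pandas:
+ #
+ #     import pandas as pd
+ #
+ #     existing = pd.DataFrame({"id": ["a", "b"], "rev": [1, 1]})
+ #     incoming = pd.DataFrame({"id": ["b", "c"], "rev": [2, 2]})
+ #     merged = pd.concat([existing, incoming], ignore_index=True)
+ #     merged = merged.drop_duplicates(subset=["id"], keep="last")
+ #     # -> ids a (rev 1), b (rev 2, incoming wins), c (rev 2)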
2053
+ def _group_shards_by_partition(self, shard_info: list[dict[str, Any]]) -> dict[str, list[str]]:
2054
+ """Group shard paths by their target partition."""
2055
+ partition_shards: dict[str, list[str]] = defaultdict(list)
2056
+
2057
+ for info in shard_info:
2058
+ shard_path = info["shard_path"]
2059
+
2060
+ if "partition_key" in info:
2061
+ # Use partition key from shard info
2062
+ partition_key = info["partition_key"]
2063
+ partition_shards[partition_key].append(shard_path)
2064
+ else:
2065
+ # Fallback: read shard to determine partitions (for backward compatibility)
2066
+ try:
2067
+ with self.scratch_storage.open(shard_path, "rb") as f:
2068
+ import io
2069
+
2070
+ import pyarrow.parquet as pq
2071
+
2072
+ binary_data = f.read()
2073
+ table = pq.read_table(io.BytesIO(binary_data))
2074
+ df = table.to_pandas()
2075
+ gdf = gpd.GeoDataFrame(df)
2076
+
2077
+ if len(gdf) == 0:
2078
+ continue
2079
+
2080
+ # Group items within shard by partition
2081
+ for _idx, row in gdf.iterrows():
2082
+ item = row.to_dict()
2083
+ partition_key = self._compute_partition_key(item)
2084
+ partition_shards[partition_key].append(shard_path)
2085
+
2086
+ # Remove duplicates (same shard might have items for same partition)
2087
+ partition_shards = cast(
2088
+ dict[str, list[str]], {k: list(set(v)) for k, v in partition_shards.items()}
2089
+ )
2090
+
2091
+ except (OSError, ValueError, RuntimeError, AttributeError, TypeError) as e:
2092
+ logger.error(f"Error processing shard {shard_path}: {e}")
2093
+ continue
2094
+
2095
+ return dict(partition_shards)
2096
+
2097
+ def _extract_mission(self, item: dict[str, Any]) -> str:
2098
+ """Extract mission/dataset identifier from STAC item."""
2099
+ props = item.get("properties", {})
2100
+
2101
+ # Primary: Extract from dataset_id field
2102
+ dataset_id = props.get(self.config.mission_field, "")
2103
+ if dataset_id:
2104
+ return self._sanitize_mission_name(str(dataset_id))
2105
+
2106
+ # Fallback: STAC items carry "collection" at the top level (some producers also
+ # duplicate it into properties)
2107
+ collection = item.get("collection") or props.get("collection", "")
2108
+ if collection:
2109
+ return self._sanitize_mission_name(str(collection))
2110
+
2111
+ # Final fallback
2112
+ return "unknown_mission"
2113
+
2114
+ def _sanitize_mission_name(self, name: str) -> str:
2115
+ """Sanitize mission name for filesystem compatibility."""
2116
+ import re
2117
+
2118
+ # Replace invalid filesystem chars with underscores, convert to lowercase
2119
+ sanitized = re.sub(r"[^\w]", "_", name.lower().strip())
2120
+ # Remove consecutive underscores
2121
+ sanitized = re.sub(r"_+", "_", sanitized)
2122
+ # Remove leading/trailing underscores
2123
+ return sanitized.strip("_") or "unnamed"
2124
+
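+ # Worked example (derived from the method above, not part of the original module):
+ #   "Sentinel-2 L2A"  -> lower/strip           -> "sentinel-2 l2a"
+ #                     -> non-word chars to "_" -> "sentinel_2_l2a"
+ #   "???"             -> "___" -> collapse/strip "_" -> "" -> fallback "unnamed"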
2125
+ def _compute_partition_key(self, item: dict[str, Any]) -> str:
2126
+ """Compute partition key with Hive-style temporal partitioning.
2127
+
2128
+ Format: {dataset_id}/partition=h3/level={resolution}/{h3_cell_id}/year=YYYY/month=MM[/day=DD]
2129
+
2130
+ The temporal partition depth is controlled by config.temporal_bin:
2131
+ - year: year=YYYY
2132
+ - month: year=YYYY/month=MM
2133
+ - day: year=YYYY/month=MM/day=DD
2134
+
2135
+ This Hive-style naming enables directory-level pruning in DuckDB, Athena,
2136
+ Spark, and other query engines for optimal query performance.
2137
+ """
2138
+ # Extract mission (dataset identifier)
2139
+ mission = self._extract_mission(item)
2140
+
2141
+ # Extract spatial information
2142
+ geom = item.get("geometry")
2143
+ if not geom:
2144
+ h3_cell = "unknown"
2145
+ else:
2146
+ # Use the new method with spanning detection
2147
+ tiles, is_spanning = self.grid.tiles_for_geometry_with_spanning_detection(geom)
2148
+
2149
+ # Get resolution-specific threshold
2150
+ threshold = self.config.global_partition_threshold
2151
+
2152
+ # Apply global partitioning logic with resolution-specific threshold
2153
+ if self.config.enable_global_partitioning and len(tiles) > threshold:
2154
+ # Multi-cell geometry - route to global partition
2155
+ h3_cell = "global"
2156
+ else:
2157
+ # Single-cell geometry - use specific tile
2158
+ h3_cell = tiles[0] if tiles else "unknown"
2159
+
2160
+ # Extract temporal information using Hive-style partitioning
2161
+ temporal_parts = self._extract_temporal_hive_parts(item)
2162
+
2163
+ # Build partition path
2164
+ grid_type = self.config.grid_system # Should be 'h3'
2165
+ resolution = self.config.grid_resolution
2166
+
2167
+ return f"{mission}/partition={grid_type}/level={resolution}/{h3_cell}/{temporal_parts}"
2168
+
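+ # Worked example (illustrative, hypothetical values, not part of the original module):
+ # with grid_system="h3", grid_resolution=2, temporal_bin="month", an item whose
+ # dataset_id sanitizes to "sentinel_2_l2a", that maps to a single H3 cell
+ # "824597fffffffff", and whose datetime is 2024-01-15 gets the partition key
+ #
+ #     sentinel_2_l2a/partition=h3/level=2/824597fffffffff/year=2024/month=01
+ #
+ # Consolidation later appends the filename under the output catalog root,
+ # e.g. ".../month=01/items.parquet" (or items.ndjson).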
2169
+ def _extract_temporal_hive_parts(self, item: dict[str, Any]) -> str:
2170
+ """Extract Hive-style temporal partition parts from STAC item.
2171
+
2172
+ Generates directory path fragments using Hive partition naming convention
2173
+ for optimal query pruning in DuckDB, Athena, Spark, and other query engines.
2174
+
2175
+ Returns:
2176
+ Path fragment based on temporal_bin configuration:
2177
+ - year: "year=2024"
2178
+ - month: "year=2024/month=01"
2179
+ - day: "year=2024/month=01/day=15"
2180
+ - unknown: "unknown" (for missing/invalid datetimes)
2181
+ """
2182
+ props = item.get("properties", {})
2183
+ dt_str = props.get("datetime")
2184
+
2185
+ if not dt_str:
2186
+ return "unknown"
2187
+
2188
+ try:
2189
+ dt = pd.to_datetime(dt_str)
2190
+ if self.config.temporal_bin == "year":
2191
+ return f"year={dt.year}"
2192
+ elif self.config.temporal_bin == "month":
2193
+ return f"year={dt.year}/month={dt.month:02d}"
2194
+ elif self.config.temporal_bin == "day":
2195
+ return f"year={dt.year}/month={dt.month:02d}/day={dt.day:02d}"
2196
+ else:
2197
+ return "unknown"
2198
+ except (ValueError, TypeError):
2199
+ return "unknown"
2200
+
2201
+ def _get_final_partition_path(self, partition_key: str) -> str:
2202
+ """Get final path for partition structure with Hive-style naming.
2203
+
2204
+ The partition_key already contains Hive-style temporal directories
2205
+ (e.g., mission/partition=h3/level=2/cell/year=2024/month=01).
2206
+ This method appends the final filename (items.parquet or items.ndjson).
2207
+ """
2208
+ # Determine file extension based on output format
2209
+ if self.config.output_format == "ndjson":
2210
+ file_extension = ".ndjson"
2211
+ else:
2212
+ file_extension = ".parquet" # Default to .parquet for geoparquet
2213
+
2214
+ return f"{self.config.output_catalog}/{partition_key}/items{file_extension}"
2215
+
2216
+ def _write_final_partition(self, gdf: gpd.GeoDataFrame, path: str):
2217
+ """Write final partition with atomic safety."""
2218
+ import tempfile
2219
+
2220
+ if path.startswith("s3://"):
2221
+ # For S3, write to temporary local file then upload
2222
+ with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
2223
+ tmp_path = tmp.name
2224
+ gdf.to_parquet(tmp_path, index=False, compression="snappy")
2225
+ self.storage.upload(tmp_path, path)
2226
+ Path(tmp_path).unlink()
2227
+ else:
2228
+ # Local filesystem - atomic write
2229
+ tmp_path = f"{path}.tmp-{uuid.uuid4().hex}"
2230
+ self.storage.makedirs(Path(path).parent)
2231
+
2232
+ with self.storage.open(tmp_path, "wb") as f:
2233
+ gdf.to_parquet(f, index=False, compression="snappy")
2234
+
2235
+ self.storage.rename(tmp_path, path)
2236
+
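+ # NOTE (illustrative sketch, not part of the original module): the local branch above is
+ # the classic write-to-temp-then-rename pattern; the module routes it through its storage
+ # backend, but with the standard library alone the same idea looks like:
+ #
+ #     import os
+ #     import uuid
+ #
+ #     def atomic_write_bytes(path: str, data: bytes) -> None:
+ #         tmp = f"{path}.tmp-{uuid.uuid4().hex}"
+ #         with open(tmp, "wb") as f:
+ #             f.write(data)
+ #         os.replace(tmp, path)   # atomic on POSIX when tmp and path share a filesystem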
2237
+ def _download_stac_item(self, url: str) -> dict[str, Any] | None:
2238
+ """Download and parse a STAC item from URL."""
2239
+ try:
2240
+ if url.startswith("s3://"):
2241
+ fs = fsspec.filesystem("s3")
2242
+ with fs.open(url, "r") as f:
2243
+ return cast(dict[str, Any], json.load(f))
2244
+ else:
2245
+ import requests
2246
+
2247
+ response = requests.get(url, timeout=30)
2248
+ response.raise_for_status()
2249
+ return cast(dict[str, Any], response.json())
2250
+ except (
2251
+ requests.exceptions.RequestException,
2252
+ json.JSONDecodeError,
2253
+ OSError,
2254
+ ValueError,
2255
+ ) as e:
2256
+ logger.error(f"Failed to download STAC item from {url}: {e}")
2257
+ return None
2258
+
2259
+ def _chunk_urls(self, urls: list[str], n_chunks: int) -> list[list[str]]:
2260
+ """Split URLs into chunks for parallel processing."""
2261
+ chunk_size = max(1, len(urls) // n_chunks)
2262
+ chunks = [urls[i : i + chunk_size] for i in range(0, len(urls), chunk_size)]
2263
+ # Fold a trailing remainder chunk into the previous one so we end up with at most n_chunks
2264
+ if len(chunks) > n_chunks:
2265
+ chunks[-2].extend(chunks[-1])
2266
+ chunks = chunks[:-1]
2267
+ return chunks
2268
+
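+ # Worked example (derived from the method above, not part of the original module):
+ # 10 URLs split for n_chunks=3 -> chunk_size = 10 // 3 = 3 -> chunks of 3, 3, 3, 1;
+ # the trailing remainder chunk is folded into the previous one, giving 3, 3, 4.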
2269
+ def _cleanup_scratch(self, shard_info: list[dict[str, Any]]):
2270
+ """Clean up scratch space after successful processing."""
2271
+ logger.info("Cleaning up scratch space...")
2272
+ failed = 0
2273
+ for info in shard_info:
2274
+ try:
2275
+ self.scratch_storage.remove(info["shard_path"])
2276
+ except OSError as e:
2277
+ failed += 1
2278
+ logger.debug(f"Failed to cleanup shard {info['shard_path']}: {e}")
2279
+
2280
+ if failed > 0:
2281
+ logger.warning(f"Failed to cleanup {failed} shards")