earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/workers.py
ADDED
@@ -0,0 +1,682 @@
# workers.py
"""Serializable worker functions for distributed STAC ingestion.

This module provides **module-level functions** that can be pickled and sent
to remote Dask workers. Unlike the methods in `ingestion_pipeline.py`, these
functions do not capture `self` and receive all dependencies as explicit parameters.

Architecture:
    These functions are designed to be called from both:
    1. ThreadPoolExecutor (local processing)
    2. Dask distributed workers (cluster processing)

    Each function receives a serialized config dict and storage paths,
    avoiding closure captures that would prevent pickling.

Key Functions:
    process_url_batch: Download STAC items from URLs and write shards
    consolidate_partition: Merge shards for a partition with existing data

Usage:
    >>> from earthcatalog.workers import process_url_batch
    >>> from earthcatalog.ingestion_pipeline import ProcessingConfig
    >>>
    >>> config = ProcessingConfig(...)
    >>> config_dict = config.to_dict()
    >>>
    >>> # Can be called directly or via Dask
    >>> result = process_url_batch(
    ...     urls=["http://example.com/item1.json"],
    ...     worker_id=0,
    ...     config_dict=config_dict,
    ...     job_id="abc-123",
    ... )
"""

from __future__ import annotations

import asyncio
import logging
import tempfile
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, Any

import geopandas as gpd
import pandas as pd
from tqdm import tqdm

# Conditional async HTTP imports
try:
    from .async_http_client import HAS_ASYNC_HTTP, download_stac_items_async
except ImportError:
    HAS_ASYNC_HTTP = False

    async def download_stac_items_async(*args, **kwargs) -> list[dict[str, Any]]:  # type: ignore
        """Dummy async function when async HTTP is not available."""
        raise ImportError("Async HTTP client not available")


from .engines import get_engine
from .grid_systems import get_grid_system
from .statistics import IngestionStatistics
from .storage_backends import get_storage_backend

if TYPE_CHECKING:
    from .ingestion_pipeline import ProcessingConfig

logger = logging.getLogger(__name__)


# =============================================================================
# Worker Result Types
# =============================================================================


class DownloadResult:
    """Result of processing a batch of URLs.

    Attributes:
        shards: List of shard info dictionaries with paths and counts.
        stats: Statistics from this worker's processing.
        failed_urls: URLs that failed after all retries.
    """

    def __init__(
        self,
        shards: list[dict[str, Any]],
        stats: IngestionStatistics,
        failed_urls: list[str],
    ):
        self.shards = shards
        self.stats = stats
        self.failed_urls = failed_urls

    def to_dict(self) -> dict[str, Any]:
        """Serialize for transmission."""
        return {
            "shards": self.shards,
            "stats": {
                "urls_processed": self.stats.urls_processed,
                "urls_failed": self.stats.urls_failed,
            },
            "failed_urls": self.failed_urls,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> DownloadResult:
        """Deserialize from dictionary."""
        stats = IngestionStatistics()
        stats_data = data.get("stats", {})
        # Reconstruct stats from dict
        if "urls_processed" in stats_data:
            stats.urls_processed = stats_data["urls_processed"]
        if "urls_failed" in stats_data:
            stats.urls_failed = stats_data["urls_failed"]
        return cls(
            shards=data.get("shards", []),
            stats=stats,
            failed_urls=data.get("failed_urls", []),
        )


class ConsolidationResult:
    """Result of consolidating a partition.

    Attributes:
        partition_key: The partition that was consolidated.
        item_count: Total items in the final partition.
        existing_count: Items that existed before.
        new_count: New items added.
        duplicates_removed: Duplicates that were removed.
        final_path: Path to the final partition file.
    """

    def __init__(
        self,
        partition_key: str,
        item_count: int,
        existing_count: int,
        new_count: int,
        duplicates_removed: int = 0,
        final_path: str = "",
    ):
        self.partition_key = partition_key
        self.item_count = item_count
        self.existing_count = existing_count
        self.new_count = new_count
        self.duplicates_removed = duplicates_removed
        self.final_path = final_path

    def to_dict(self) -> dict[str, Any]:
        """Serialize for transmission."""
        return {
            "partition_key": self.partition_key,
            "item_count": self.item_count,
            "existing_count": self.existing_count,
            "new_count": self.new_count,
            "duplicates_removed": self.duplicates_removed,
            "final_path": self.final_path,
        }

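Note: a minimal usage sketch (illustrative, not part of the wheel) of how a scheduler might round-trip a DownloadResult after collecting worker output. It relies only on the IngestionStatistics counters (urls_processed, urls_failed) that to_dict/from_dict read above; the shard-info values are made up.

    from earthcatalog.statistics import IngestionStatistics
    from earthcatalog.workers import DownloadResult

    stats = IngestionStatistics()
    stats.urls_processed = 2
    stats.urls_failed = 1

    result = DownloadResult(
        shards=[{"shard_path": "scratch/shards/h3_82_2024_01_w0-abc12345-0_0.parquet", "item_count": 2}],
        stats=stats,
        failed_urls=["http://example.com/bad.json"],
    )

    payload = result.to_dict()              # plain dict, safe to send between processes
    restored = DownloadResult.from_dict(payload)
    assert restored.stats.urls_processed == 2
    assert restored.failed_urls == ["http://example.com/bad.json"]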

# =============================================================================
# Helper Functions
# =============================================================================


def _get_stac_hook(hook_config: str) -> Any:
    """Get a STAC hook instance from configuration string.

    This is called on workers to instantiate the hook from the serialized config.

    Args:
        hook_config: Hook configuration string (e.g., "default", "module:pkg:func").

    Returns:
        Configured hook instance (BaseSTACHook).
    """
    from .stac_hooks import get_hook

    return get_hook(hook_config)


def _download_stac_item(
    url: str,
    timeout: int = 30,
    retry_attempts: int = 3,
    hook_config: str = "default",
) -> dict[str, Any] | None:
    """Download/generate a single STAC item from URL using configured hook.

    Args:
        url: URL to the STAC item JSON or data file.
        timeout: Request timeout in seconds.
        retry_attempts: Number of retry attempts.
        hook_config: STAC hook configuration string.

    Returns:
        Parsed STAC item dictionary, or None if download failed.
    """
    hook = _get_stac_hook(hook_config)
    return hook.fetch(url, timeout=timeout, retry_attempts=retry_attempts)


def _download_stac_items_batch(
    urls: list[str],
    timeout: int = 30,
    retry_attempts: int = 3,
    hook_config: str = "default",
) -> list[dict[str, Any] | None]:
    """Download/generate multiple STAC items using configured hook.

    This function leverages the hook's batch processing capability if available,
    falling back to sequential processing otherwise.

    Args:
        urls: List of URLs to process.
        timeout: Request timeout in seconds.
        retry_attempts: Number of retry attempts.
        hook_config: STAC hook configuration string.

    Returns:
        List of STAC items (or None for failures), same order as input.
    """
    hook = _get_stac_hook(hook_config)
    return hook.fetch_batch(urls, timeout=timeout, retry_attempts=retry_attempts)


def _get_partition_key(
    item: dict[str, Any],
    grid_resolver: Any,
    temporal_bin: str,
    enable_global: bool,
    global_threshold: int,
) -> str:
    """Compute the partition key for a STAC item.

    Args:
        item: STAC item dictionary.
        grid_resolver: Grid resolver instance.
        temporal_bin: Temporal binning level ("year", "month", "day").
        enable_global: Whether to enable global partitioning.
        global_threshold: Threshold for global partition routing.

    Returns:
        Partition key string like "h3_82/2024/01" or "global/2024/01".
    """
    # Get geometry for grid cell resolution
    geometry = item.get("geometry")

    # Get grid cells for this item
    if geometry:
        tiles, is_spanning = grid_resolver.tiles_for_geometry_with_spanning_detection(geometry)
        if not tiles:
            # Fallback to bbox center
            bbox = item.get("bbox", [0, 0, 0, 0])
            center_geom = {"type": "Point", "coordinates": [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]}
            tiles = grid_resolver.tiles_for_geometry(center_geom)
        grid_cell = tiles[0] if tiles else "unknown"

        # Check for global partition routing
        if enable_global and len(tiles) > global_threshold:
            grid_cell = "global"
    else:
        # Use bbox center if no geometry
        bbox = item.get("bbox", [0, 0, 0, 0])
        center_geom = {"type": "Point", "coordinates": [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]}
        tiles = grid_resolver.tiles_for_geometry(center_geom)
        grid_cell = tiles[0] if tiles else "unknown"

    # Get temporal component
    datetime_str = item.get("properties", {}).get("datetime", "")
    if datetime_str:
        try:
            from datetime import datetime

            dt = datetime.fromisoformat(datetime_str.replace("Z", "+00:00"))
            if temporal_bin == "year":
                temporal = str(dt.year)
            elif temporal_bin == "month":
                temporal = f"{dt.year}/{dt.month:02d}"
            else:  # day
                temporal = f"{dt.year}/{dt.month:02d}/{dt.day:02d}"
        except (ValueError, AttributeError):
            temporal = "unknown"
    else:
        temporal = "unknown"

    return f"{grid_cell}/{temporal}"

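Worked example (standalone sketch; the "h3_82" grid-cell prefix is an assumed H3 cell, but the temporal suffix is computed exactly as in _get_partition_key above):

    from datetime import datetime

    dt = datetime.fromisoformat("2024-01-15T10:30:00Z".replace("Z", "+00:00"))
    print(str(dt.year))                              # "2024"        with temporal_bin="year"
    print(f"{dt.year}/{dt.month:02d}")               # "2024/01"     with temporal_bin="month"
    print(f"{dt.year}/{dt.month:02d}/{dt.day:02d}")  # "2024/01/15"  with temporal_bin="day"
    # Joined with the resolved grid cell, the partition key becomes e.g. "h3_82/2024/01".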

def _write_shard_to_scratch(
    items: list[dict[str, Any]],
    partition_key: str,
    scratch_path: str,
    worker_tag: str,
    shard_id: int,
    engine_type: str = "rustac",
) -> dict[str, Any]:
    """Write items to a shard file in scratch storage.

    Args:
        items: STAC items to write.
        partition_key: Partition key for directory organization.
        scratch_path: Base scratch directory path.
        worker_tag: Unique worker identifier.
        shard_id: Shard number for this worker.
        engine_type: STAC engine to use.

    Returns:
        Shard info dictionary with path and metadata.
    """
    storage = get_storage_backend(scratch_path)

    # Create shard path
    safe_partition = partition_key.replace("/", "_")
    shard_filename = f"{safe_partition}_{worker_tag}_{shard_id}.parquet"
    shard_path = f"{scratch_path}/shards/{shard_filename}"

    # Ensure directory exists
    storage.makedirs(Path(shard_path).parent)

    # Convert items to GeoDataFrame using engine
    engine = get_engine(engine_type)  # type: ignore
    gdf = engine.items_to_geodataframe(items)

    # Write to scratch
    with storage.open(shard_path, "wb") as f:
        gdf.to_parquet(f, index=False, compression="snappy")

    return {
        "shard_path": shard_path,
        "partition_key": partition_key,
        "item_count": len(items),
        "worker_tag": worker_tag,
        "shard_id": shard_id,
    }

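Shard naming, for reference (illustrative scratch path and worker tag; only the string composition from the function above is involved):

    partition_key = "h3_82/2024/01"
    safe_partition = partition_key.replace("/", "_")
    print(f"/tmp/scratch/shards/{safe_partition}_w0-abc12345-0_3.parquet")
    # -> /tmp/scratch/shards/h3_82_2024_01_w0-abc12345-0_3.parquet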

# =============================================================================
# Main Worker Functions (Module-Level, Serializable)
# =============================================================================


def process_url_batch(
    urls: list[str],
    worker_id: int,
    config_dict: dict[str, Any],
    job_id: str,
    batch_idx: int = 0,
) -> dict[str, Any]:
    """Process a batch of URLs, downloading STAC items and writing shards.

    This is a **module-level function** that can be pickled and sent to remote
    Dask workers. All dependencies are passed as parameters.

    Args:
        urls: List of STAC item URLs to download.
        worker_id: Unique worker identifier.
        config_dict: Serialized ProcessingConfig dictionary.
        job_id: Job ID for tracking.
        batch_idx: Batch index for this worker.

    Returns:
        Dictionary with:
            - shards: List of shard info dictionaries
            - stats: Worker statistics as dict
            - failed_urls: List of URLs that failed
    """
    # Reconstruct config from dict
    from .ingestion_pipeline import ProcessingConfig

    config = ProcessingConfig.from_dict(config_dict)

    # Create worker-local stats
    worker_stats = IngestionStatistics()
    failed_urls: list[str] = []

    # Create grid resolver
    grid_resolver = get_grid_system(
        config.grid_system,
        resolution=config.grid_resolution,
        geojson_path=config.geojson_path if config.grid_system == "geojson" else None,
    )

    # Unique tag for this worker's shards
    worker_tag = f"w{worker_id}-{job_id[:8]}-{batch_idx}"

    # Group items by partition as we process
    partition_items: dict[str, list[dict[str, Any]]] = {}
    shards: list[dict[str, Any]] = []

    # Choose processing method
    use_async = config.enable_concurrent_http and HAS_ASYNC_HTTP and len(urls) >= config.batch_size

    if use_async:
        # Async HTTP processing
        items = _download_batch_async(urls, config)
        for item in items:
            if item:
                worker_stats.record_url_processed(success=True)
                partition_key = _get_partition_key(
                    item,
                    grid_resolver,
                    config.temporal_bin,
                    config.enable_global_partitioning,
                    config.global_partition_threshold,
                )
                if partition_key not in partition_items:
                    partition_items[partition_key] = []
                partition_items[partition_key].append(item)
    else:
        # Synchronous processing with progress bar
        # Get hook config from ProcessingConfig
        hook_config = config_dict.get("stac_hook", "default")
        with tqdm(total=len(urls), desc=f"Worker {worker_id}", unit="urls", leave=False) as pbar:
            for url in urls:
                fetched_item = _download_stac_item(
                    url,
                    timeout=config.request_timeout,
                    retry_attempts=config.retry_attempts,
                    hook_config=hook_config,
                )
                if fetched_item:
                    worker_stats.record_url_processed(success=True)
                    partition_key = _get_partition_key(
                        fetched_item,
                        grid_resolver,
                        config.temporal_bin,
                        config.enable_global_partitioning,
                        config.global_partition_threshold,
                    )
                    if partition_key not in partition_items:
                        partition_items[partition_key] = []
                    partition_items[partition_key].append(fetched_item)
                else:
                    worker_stats.record_url_processed(success=False)
                    failed_urls.append(url)
                pbar.update(1)

    # Write shards for each partition
    shard_counter = 0
    for partition_key, items in partition_items.items():
        # Split into shard-sized chunks
        for i in range(0, len(items), config.items_per_shard):
            chunk = items[i : i + config.items_per_shard]
            shard_info = _write_shard_to_scratch(
                items=chunk,
                partition_key=partition_key,
                scratch_path=config.scratch_location,
                worker_tag=worker_tag,
                shard_id=shard_counter,
                engine_type=config.stac_engine,
            )
            shards.append(shard_info)
            shard_counter += 1

    logger.info(
        f"Worker {worker_id}: processed {len(urls)} URLs, wrote {len(shards)} shards, {len(failed_urls)} failures"
    )

    # Serialize stats as simple dict with basic counters
    stats_dict = {
        "urls_processed": worker_stats.urls_processed,
        "urls_failed": worker_stats.urls_failed,
    }

    return {
        "shards": shards,
        "stats": stats_dict,
        "failed_urls": failed_urls,
    }

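Because process_url_batch takes only picklable arguments, the same call works through a local ThreadPoolExecutor or a Dask cluster, as the module docstring describes. A dispatch sketch (assumptions: a reachable dask.distributed scheduler, a config_dict/job_id produced elsewhere by ProcessingConfig and the job tracker, and url_batches as any list of URL lists):

    from concurrent.futures import ThreadPoolExecutor

    from dask.distributed import Client

    from earthcatalog.workers import process_url_batch


    def run_local(url_batches, config_dict, job_id, max_workers=4):
        # Local path: threads in the driver process.
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            futures = [
                pool.submit(process_url_batch, urls, i, config_dict, job_id, batch_idx=i)
                for i, urls in enumerate(url_batches)
            ]
            return [f.result() for f in futures]


    def run_dask(url_batches, config_dict, job_id, scheduler_address):
        # Cluster path: the function and its arguments are pickled to remote workers.
        client = Client(scheduler_address)
        futures = [
            client.submit(process_url_batch, urls, i, config_dict, job_id, batch_idx=i)
            for i, urls in enumerate(url_batches)
        ]
        return client.gather(futures)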

def _download_batch_async(urls: list[str], config: ProcessingConfig) -> list[dict[str, Any]]:
    """Download URLs using async HTTP."""
    try:
        # Try to get or create event loop
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                # Running in async context (Jupyter, etc.)
                with ThreadPoolExecutor(max_workers=1) as _:
                    try:
                        import nest_asyncio

                        nest_asyncio.apply()
                    except ImportError:
                        pass  # nest_asyncio not available
                return loop.run_until_complete(
                    download_stac_items_async(
                        urls,
                        concurrent_requests=config.concurrent_requests,
                        connection_pool_size=config.connection_pool_size,
                        request_timeout=config.request_timeout,
                        retry_attempts=config.retry_attempts,
                        retry_delay=config.retry_delay,
                        batch_size=config.batch_size,
                    )
                )
        except RuntimeError:
            pass

        # Create new event loop
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(
                download_stac_items_async(
                    urls,
                    concurrent_requests=config.concurrent_requests,
                    connection_pool_size=config.connection_pool_size,
                    request_timeout=config.request_timeout,
                    retry_attempts=config.retry_attempts,
                    retry_delay=config.retry_delay,
                    batch_size=config.batch_size,
                )
            )
        finally:
            loop.close()
    except (ValueError, TypeError, RuntimeError, ConnectionError) as e:
        logger.error(f"Async download failed: {e}, falling back to sync")
        # Fallback to sync
        items = []
        for url in urls:
            item = _download_stac_item(url, config.request_timeout, config.retry_attempts)
            if item:
                items.append(item)
        return items


def consolidate_partition(
    partition_key: str,
    shard_paths: list[str],
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Consolidate shards for a partition, merging with existing data.

    This is a **module-level function** that can be pickled and sent to remote
    Dask workers. All dependencies are passed as parameters.

    Deduplication uses a "newer datetime wins" strategy: when items have the same ID,
    the one with the more recent datetime property is kept.

    Args:
        partition_key: Partition key (e.g., "h3_82/2024/01").
        shard_paths: List of shard file paths to merge.
        config_dict: Serialized ProcessingConfig dictionary.

    Returns:
        Dictionary with consolidation results:
            - partition_key: The partition that was consolidated
            - item_count: Total items in final partition
            - existing_count: Items that existed before
            - new_count: New items added
            - duplicates_removed: Number of duplicates removed
            - final_path: Path to final partition file
    """
    from .ingestion_pipeline import ProcessingConfig

    config = ProcessingConfig.from_dict(config_dict)

    # Get storage backends
    catalog_storage = get_storage_backend(config.output_catalog)
    scratch_storage = get_storage_backend(config.scratch_location)

    # Compute final path
    final_path = f"{config.output_catalog}/{partition_key}/data.parquet"

    existing_count = 0
    all_gdfs: list[gpd.GeoDataFrame] = []

    # Use temp directory for staging
    with tempfile.TemporaryDirectory(dir=config.temp_dir_location) as temp_dir:
        temp_existing_path = Path(temp_dir) / "existing.parquet"
        temp_merged_path = Path(temp_dir) / "merged.parquet"

        # Step 1: Download existing partition if it exists
        if catalog_storage.exists(final_path):
            try:
                logger.debug(f"Partition {partition_key}: downloading existing data")
                with catalog_storage.open(final_path, "rb") as f:
                    binary_data = f.read()
                # Write to temp file so we can use gpd.read_parquet
                with open(temp_existing_path, "wb") as temp_f:
                    temp_f.write(binary_data)
                existing_gdf = gpd.read_parquet(temp_existing_path)
                existing_count = len(existing_gdf)
                all_gdfs.append(existing_gdf)
            except (OSError, ValueError, TypeError) as e:
                logger.error(f"Error reading existing partition {final_path}: {e}")

        # Step 2: Read all new shards
        for idx, shard_path in enumerate(shard_paths):
            try:
                with scratch_storage.open(shard_path, "rb") as f:
                    binary_data = f.read()
                # Write to temp file so we can use gpd.read_parquet
                temp_shard_path = Path(temp_dir) / f"shard_{idx}.parquet"
                with open(temp_shard_path, "wb") as temp_f:
                    temp_f.write(binary_data)
                gdf = gpd.read_parquet(temp_shard_path)
                all_gdfs.append(gdf)
            except (OSError, ValueError, TypeError) as e:
                logger.error(f"Error reading shard {shard_path}: {e}")
                continue

        if not all_gdfs:
            return {
                "partition_key": partition_key,
                "item_count": 0,
                "existing_count": 0,
                "new_count": 0,
                "duplicates_removed": 0,
                "final_path": final_path,
            }

        # Step 3: Merge all data
        merged = pd.concat(all_gdfs, ignore_index=True)
        original_count = len(merged)

        # Step 4: Deduplicate - newer datetime wins
        if "datetime" in merged.columns and "id" in merged.columns:
            # Sort by datetime descending so newer items come first
            try:
                merged["_dt_sort"] = pd.to_datetime(merged["datetime"], errors="coerce")
                merged = merged.sort_values("_dt_sort", ascending=False, na_position="last")
                merged = merged.drop_duplicates(subset=["id"], keep="first")
                merged = merged.drop(columns=["_dt_sort"])
            except (ValueError, TypeError):
                # Fallback to simple dedup
                merged = merged.drop_duplicates(subset=["id"], keep="last")
        elif "id" in merged.columns:
            merged = merged.drop_duplicates(subset=["id"], keep="last")

        duplicates_removed = original_count - len(merged)

        # Step 5: Sort by configured key
        if config.sort_key in merged.columns:
            merged = merged.sort_values(config.sort_key, ascending=config.sort_ascending)

        # Step 6: Write to temp file
        # Ensure geometry column is properly set
        if "geometry" in merged.columns:
            merged_gdf = gpd.GeoDataFrame(merged, geometry="geometry")
        else:
            merged_gdf = gpd.GeoDataFrame(merged)
        merged_gdf.to_parquet(temp_merged_path, index=False, compression="snappy")

        # Step 7: Upload to final location
        catalog_storage.makedirs(Path(final_path).parent)
        if final_path.startswith("s3://"):
            catalog_storage.upload(str(temp_merged_path), final_path)
        else:
            # Local storage - atomic move
            import shutil

            shutil.copy2(temp_merged_path, final_path)

    new_count = len(merged) - existing_count

    logger.info(
        f"Partition {partition_key}: {existing_count} existing + {new_count} new = "
        f"{len(merged)} total ({duplicates_removed} duplicates removed)"
    )

    return {
        "partition_key": partition_key,
        "item_count": len(merged),
        "existing_count": existing_count,
        "new_count": new_count,
        "duplicates_removed": duplicates_removed,
        "final_path": final_path,
    }

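The Step 4 deduplication above can be reproduced in isolation with plain pandas (toy rows, not package data): for a given id, the row with the most recent datetime survives, matching the "newer datetime wins" rule.

    import pandas as pd

    merged = pd.DataFrame(
        {
            "id": ["item-1", "item-1", "item-2"],
            "datetime": ["2024-01-01T00:00:00Z", "2024-03-01T00:00:00Z", "2024-02-01T00:00:00Z"],
        }
    )
    merged["_dt_sort"] = pd.to_datetime(merged["datetime"], errors="coerce")
    merged = merged.sort_values("_dt_sort", ascending=False, na_position="last")
    merged = merged.drop_duplicates(subset=["id"], keep="first").drop(columns=["_dt_sort"])
    print(merged)  # keeps item-1 @ 2024-03-01 and item-2 @ 2024-02-01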

__all__ = [
    "DownloadResult",
    "ConsolidationResult",
    "process_url_batch",
    "consolidate_partition",
]