earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,682 @@
1
+ # workers.py
2
+ """Serializable worker functions for distributed STAC ingestion.
3
+
4
+ This module provides **module-level functions** that can be pickled and sent
5
+ to remote Dask workers. Unlike the methods in `ingestion_pipeline.py`, these
6
+ functions do not capture `self` and receive all dependencies as explicit parameters.
7
+
8
+ Architecture:
9
+ These functions are designed to be called from both:
10
+ 1. ThreadPoolExecutor (local processing)
11
+ 2. Dask distributed workers (cluster processing)
12
+
13
+ Each function receives a serialized config dict and storage paths,
14
+ avoiding closure captures that would prevent pickling.
15
+
16
+ Key Functions:
17
+ process_url_batch: Download STAC items from URLs and write shards
18
+ consolidate_partition: Merge shards for a partition with existing data
19
+
20
+ Usage:
21
+ >>> from earthcatalog.workers import process_url_batch
22
+ >>> from earthcatalog.ingestion_pipeline import ProcessingConfig
23
+ >>>
24
+ >>> config = ProcessingConfig(...)
25
+ >>> config_dict = config.to_dict()
26
+ >>>
27
+ >>> # Can be called directly or via Dask
28
+ >>> result = process_url_batch(
29
+ ... urls=["http://example.com/item1.json"],
30
+ ... worker_id=0,
31
+ ... config_dict=config_dict,
32
+ ... job_id="abc-123",
33
+ ... )
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import asyncio
39
+ import logging
40
+ import tempfile
41
+ from concurrent.futures import ThreadPoolExecutor
42
+ from pathlib import Path
43
+ from typing import TYPE_CHECKING, Any
44
+
45
+ import geopandas as gpd
46
+ import pandas as pd
47
+ from tqdm import tqdm
48
+
49
+ # Conditional async HTTP imports
50
+ try:
51
+ from .async_http_client import HAS_ASYNC_HTTP, download_stac_items_async
52
+ except ImportError:
53
+ HAS_ASYNC_HTTP = False
54
+
55
+ async def download_stac_items_async(*args, **kwargs) -> list[dict[str, Any]]: # type: ignore
56
+ """Dummy async function when async HTTP is not available."""
57
+ raise ImportError("Async HTTP client not available")
58
+
59
+
60
+ from .engines import get_engine
61
+ from .grid_systems import get_grid_system
62
+ from .statistics import IngestionStatistics
63
+ from .storage_backends import get_storage_backend
64
+
65
+ if TYPE_CHECKING:
66
+ from .ingestion_pipeline import ProcessingConfig
67
+
68
+ logger = logging.getLogger(__name__)
69
+
70
+
71
+ # =============================================================================
72
+ # Worker Result Types
73
+ # =============================================================================
74
+
75
+
76
class DownloadResult:
    """Result of processing a batch of URLs.

    Attributes:
        shards: List of shard info dictionaries with paths and counts.
        stats: Statistics from this worker's processing.
        failed_urls: URLs that failed after all retries.
    """

    def __init__(
        self,
        shards: list[dict[str, Any]],
        stats: IngestionStatistics,
        failed_urls: list[str],
    ):
        self.shards = shards
        self.stats = stats
        self.failed_urls = failed_urls

    def to_dict(self) -> dict[str, Any]:
        """Serialize for transmission."""
        stats_payload = {
            "urls_processed": self.stats.urls_processed,
            "urls_failed": self.stats.urls_failed,
        }
        return {
            "shards": self.shards,
            "stats": stats_payload,
            "failed_urls": self.failed_urls,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> DownloadResult:
        """Deserialize from dictionary."""
        # Rebuild a fresh stats object, copying over only the counters
        # that were serialized by to_dict().
        restored_stats = IngestionStatistics()
        stats_data = data.get("stats", {})
        for counter in ("urls_processed", "urls_failed"):
            if counter in stats_data:
                setattr(restored_stats, counter, stats_data[counter])
        return cls(
            shards=data.get("shards", []),
            stats=restored_stats,
            failed_urls=data.get("failed_urls", []),
        )
121
+
122
+
123
class ConsolidationResult:
    """Result of consolidating a partition.

    Attributes:
        partition_key: The partition that was consolidated.
        item_count: Total items in the final partition.
        existing_count: Items that existed before.
        new_count: New items added.
        duplicates_removed: Duplicates that were removed.
        final_path: Path to the final partition file.
    """

    # Serialized field order; also the attribute names set in __init__.
    _FIELDS = (
        "partition_key",
        "item_count",
        "existing_count",
        "new_count",
        "duplicates_removed",
        "final_path",
    )

    def __init__(
        self,
        partition_key: str,
        item_count: int,
        existing_count: int,
        new_count: int,
        duplicates_removed: int = 0,
        final_path: str = "",
    ):
        self.partition_key = partition_key
        self.item_count = item_count
        self.existing_count = existing_count
        self.new_count = new_count
        self.duplicates_removed = duplicates_removed
        self.final_path = final_path

    def to_dict(self) -> dict[str, Any]:
        """Serialize for transmission."""
        return {name: getattr(self, name) for name in self._FIELDS}
161
+
162
+
163
+ # =============================================================================
164
+ # Helper Functions
165
+ # =============================================================================
166
+
167
+
168
def _get_stac_hook(hook_config: str) -> Any:
    """Instantiate a STAC hook from its configuration string.

    Runs on workers so the hook is rebuilt locally from the serialized
    configuration rather than shipped across the wire.

    Args:
        hook_config: Hook configuration string (e.g., "default", "module:pkg:func").

    Returns:
        Configured hook instance (BaseSTACHook).
    """
    # Imported lazily to keep this module importable without the hook machinery.
    from .stac_hooks import get_hook

    hook = get_hook(hook_config)
    return hook
182
+
183
+
184
def _download_stac_item(
    url: str,
    timeout: int = 30,
    retry_attempts: int = 3,
    hook_config: str = "default",
) -> dict[str, Any] | None:
    """Download/generate a single STAC item from a URL via the configured hook.

    Args:
        url: URL to the STAC item JSON or data file.
        timeout: Request timeout in seconds.
        retry_attempts: Number of retry attempts.
        hook_config: STAC hook configuration string.

    Returns:
        Parsed STAC item dictionary, or None if download failed.
    """
    return _get_stac_hook(hook_config).fetch(
        url, timeout=timeout, retry_attempts=retry_attempts
    )
203
+
204
+
205
def _download_stac_items_batch(
    urls: list[str],
    timeout: int = 30,
    retry_attempts: int = 3,
    hook_config: str = "default",
) -> list[dict[str, Any] | None]:
    """Download/generate multiple STAC items via the configured hook.

    Delegates to the hook's ``fetch_batch``, which may use true batch
    processing or fall back to sequential fetches, depending on the hook.

    Args:
        urls: List of URLs to process.
        timeout: Request timeout in seconds.
        retry_attempts: Number of retry attempts.
        hook_config: STAC hook configuration string.

    Returns:
        List of STAC items (or None for failures), same order as input.
    """
    hook = _get_stac_hook(hook_config)
    results = hook.fetch_batch(urls, timeout=timeout, retry_attempts=retry_attempts)
    return results
227
+
228
+
229
+ def _get_partition_key(
230
+ item: dict[str, Any],
231
+ grid_resolver: Any,
232
+ temporal_bin: str,
233
+ enable_global: bool,
234
+ global_threshold: int,
235
+ ) -> str:
236
+ """Compute the partition key for a STAC item.
237
+
238
+ Args:
239
+ item: STAC item dictionary.
240
+ grid_resolver: Grid resolver instance.
241
+ temporal_bin: Temporal binning level ("year", "month", "day").
242
+ enable_global: Whether to enable global partitioning.
243
+ global_threshold: Threshold for global partition routing.
244
+
245
+ Returns:
246
+ Partition key string like "h3_82/2024/01" or "global/2024/01".
247
+ """
248
+ # Get geometry for grid cell resolution
249
+ geometry = item.get("geometry")
250
+
251
+ # Get grid cells for this item
252
+ if geometry:
253
+ tiles, is_spanning = grid_resolver.tiles_for_geometry_with_spanning_detection(geometry)
254
+ if not tiles:
255
+ # Fallback to bbox center
256
+ bbox = item.get("bbox", [0, 0, 0, 0])
257
+ center_geom = {"type": "Point", "coordinates": [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]}
258
+ tiles = grid_resolver.tiles_for_geometry(center_geom)
259
+ grid_cell = tiles[0] if tiles else "unknown"
260
+
261
+ # Check for global partition routing
262
+ if enable_global and len(tiles) > global_threshold:
263
+ grid_cell = "global"
264
+ else:
265
+ # Use bbox center if no geometry
266
+ bbox = item.get("bbox", [0, 0, 0, 0])
267
+ center_geom = {"type": "Point", "coordinates": [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]}
268
+ tiles = grid_resolver.tiles_for_geometry(center_geom)
269
+ grid_cell = tiles[0] if tiles else "unknown"
270
+
271
+ # Get temporal component
272
+ datetime_str = item.get("properties", {}).get("datetime", "")
273
+ if datetime_str:
274
+ try:
275
+ from datetime import datetime
276
+
277
+ dt = datetime.fromisoformat(datetime_str.replace("Z", "+00:00"))
278
+ if temporal_bin == "year":
279
+ temporal = str(dt.year)
280
+ elif temporal_bin == "month":
281
+ temporal = f"{dt.year}/{dt.month:02d}"
282
+ else: # day
283
+ temporal = f"{dt.year}/{dt.month:02d}/{dt.day:02d}"
284
+ except (ValueError, AttributeError):
285
+ temporal = "unknown"
286
+ else:
287
+ temporal = "unknown"
288
+
289
+ return f"{grid_cell}/{temporal}"
290
+
291
+
292
def _write_shard_to_scratch(
    items: list[dict[str, Any]],
    partition_key: str,
    scratch_path: str,
    worker_tag: str,
    shard_id: int,
    engine_type: str = "rustac",
) -> dict[str, Any]:
    """Write items to a shard file in scratch storage.

    Args:
        items: STAC items to write.
        partition_key: Partition key for directory organization.
        scratch_path: Base scratch directory path.
        worker_tag: Unique worker identifier.
        shard_id: Shard number for this worker.
        engine_type: STAC engine to use.

    Returns:
        Shard info dictionary with path and metadata.
    """
    backend = get_storage_backend(scratch_path)

    # The shard name encodes partition, worker tag, and shard number so it
    # is unique across concurrent workers writing into the same directory.
    flat_key = partition_key.replace("/", "_")
    shard_path = f"{scratch_path}/shards/{flat_key}_{worker_tag}_{shard_id}.parquet"

    # Make sure the shards directory exists before opening for write.
    backend.makedirs(Path(shard_path).parent)

    # Convert to a GeoDataFrame via the configured STAC engine and persist.
    frame = get_engine(engine_type).items_to_geodataframe(items)  # type: ignore
    with backend.open(shard_path, "wb") as sink:
        frame.to_parquet(sink, index=False, compression="snappy")

    return {
        "shard_path": shard_path,
        "partition_key": partition_key,
        "item_count": len(items),
        "worker_tag": worker_tag,
        "shard_id": shard_id,
    }
338
+
339
+
340
+ # =============================================================================
341
+ # Main Worker Functions (Module-Level, Serializable)
342
+ # =============================================================================
343
+
344
+
345
def process_url_batch(
    urls: list[str],
    worker_id: int,
    config_dict: dict[str, Any],
    job_id: str,
    batch_idx: int = 0,
) -> dict[str, Any]:
    """Process a batch of URLs, downloading STAC items and writing shards.

    This is a **module-level function** that can be pickled and sent to remote
    Dask workers. All dependencies are passed as parameters.

    Args:
        urls: List of STAC item URLs to download.
        worker_id: Unique worker identifier.
        config_dict: Serialized ProcessingConfig dictionary.
        job_id: Job ID for tracking.
        batch_idx: Batch index for this worker.

    Returns:
        Dictionary with:
            - shards: List of shard info dictionaries
            - stats: Worker statistics as dict
            - failed_urls: List of URLs that failed
    """
    # Reconstruct config from dict (imported here to avoid a module-level
    # circular import with ingestion_pipeline).
    from .ingestion_pipeline import ProcessingConfig

    config = ProcessingConfig.from_dict(config_dict)

    # Create worker-local stats; merged by the caller after the batch completes.
    worker_stats = IngestionStatistics()
    failed_urls: list[str] = []

    # Create grid resolver (geojson_path is only meaningful for the geojson grid).
    grid_resolver = get_grid_system(
        config.grid_system,
        resolution=config.grid_resolution,
        geojson_path=config.geojson_path if config.grid_system == "geojson" else None,
    )

    # Unique tag for this worker's shards so concurrent workers never collide.
    worker_tag = f"w{worker_id}-{job_id[:8]}-{batch_idx}"

    # Group items by partition as we process
    partition_items: dict[str, list[dict[str, Any]]] = {}
    shards: list[dict[str, Any]] = []

    # Choose processing method: async only pays off for batch-sized workloads
    # and requires the optional async HTTP client to be importable.
    use_async = config.enable_concurrent_http and HAS_ASYNC_HTTP and len(urls) >= config.batch_size

    if use_async:
        # Async HTTP processing.
        # NOTE(review): this branch only counts successes; URLs that fail in
        # the async client are not appended to failed_urls here — confirm
        # whether that is intentional.
        items = _download_batch_async(urls, config)
        for item in items:
            if item:
                worker_stats.record_url_processed(success=True)
                partition_key = _get_partition_key(
                    item,
                    grid_resolver,
                    config.temporal_bin,
                    config.enable_global_partitioning,
                    config.global_partition_threshold,
                )
                if partition_key not in partition_items:
                    partition_items[partition_key] = []
                partition_items[partition_key].append(item)
    else:
        # Synchronous processing with progress bar
        # Hook config is read from the raw dict (not the ProcessingConfig
        # object), defaulting to "default" when absent.
        hook_config = config_dict.get("stac_hook", "default")
        with tqdm(total=len(urls), desc=f"Worker {worker_id}", unit="urls", leave=False) as pbar:
            for url in urls:
                fetched_item = _download_stac_item(
                    url,
                    timeout=config.request_timeout,
                    retry_attempts=config.retry_attempts,
                    hook_config=hook_config,
                )
                if fetched_item:
                    worker_stats.record_url_processed(success=True)
                    partition_key = _get_partition_key(
                        fetched_item,
                        grid_resolver,
                        config.temporal_bin,
                        config.enable_global_partitioning,
                        config.global_partition_threshold,
                    )
                    if partition_key not in partition_items:
                        partition_items[partition_key] = []
                    partition_items[partition_key].append(fetched_item)
                else:
                    worker_stats.record_url_processed(success=False)
                    failed_urls.append(url)
                pbar.update(1)

    # Write shards for each partition; shard_counter is global across
    # partitions so every shard file name for this worker is unique.
    shard_counter = 0
    for partition_key, items in partition_items.items():
        # Split into shard-sized chunks
        for i in range(0, len(items), config.items_per_shard):
            chunk = items[i : i + config.items_per_shard]
            shard_info = _write_shard_to_scratch(
                items=chunk,
                partition_key=partition_key,
                scratch_path=config.scratch_location,
                worker_tag=worker_tag,
                shard_id=shard_counter,
                engine_type=config.stac_engine,
            )
            shards.append(shard_info)
            shard_counter += 1

    logger.info(
        f"Worker {worker_id}: processed {len(urls)} URLs, wrote {len(shards)} shards, {len(failed_urls)} failures"
    )

    # Serialize stats as a simple dict with basic counters so the result is
    # picklable/JSON-friendly when returned over Dask.
    stats_dict = {
        "urls_processed": worker_stats.urls_processed,
        "urls_failed": worker_stats.urls_failed,
    }

    return {
        "shards": shards,
        "stats": stats_dict,
        "failed_urls": failed_urls,
    }
473
+
474
+
475
def _download_batch_async(urls: list[str], config: ProcessingConfig) -> list[dict[str, Any]]:
    """Download URLs using async HTTP, with a synchronous fallback.

    Tries to reuse the current event loop (applying ``nest_asyncio`` when the
    loop is already running, e.g. inside Jupyter); otherwise runs the async
    client on a fresh, private loop. If the async path raises, falls back to
    sequential synchronous downloads.

    Args:
        urls: STAC item URLs to download.
        config: Processing configuration supplying HTTP tuning parameters.

    Returns:
        List of successfully downloaded STAC item dictionaries (failures are
        omitted from the result).
    """

    def _run_on(loop: asyncio.AbstractEventLoop) -> list[dict[str, Any]]:
        # Single place that invokes the async client, so the long argument
        # list is not duplicated for the running-loop and new-loop paths.
        return loop.run_until_complete(
            download_stac_items_async(
                urls,
                concurrent_requests=config.concurrent_requests,
                connection_pool_size=config.connection_pool_size,
                request_timeout=config.request_timeout,
                retry_attempts=config.retry_attempts,
                retry_delay=config.retry_delay,
                batch_size=config.batch_size,
            )
        )

    try:
        # Try to get or create an event loop.
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                # Already inside an async context (Jupyter, etc.): allow a
                # re-entrant run_until_complete via nest_asyncio if installed.
                # (A previous version created an unused ThreadPoolExecutor
                # here; it had no effect and has been removed.)
                try:
                    import nest_asyncio

                    nest_asyncio.apply()
                except ImportError:
                    pass  # nest_asyncio not available
                return _run_on(loop)
        except RuntimeError:
            pass

        # No usable loop: create a private one and always close it afterwards.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return _run_on(loop)
        finally:
            loop.close()
    except (ValueError, TypeError, RuntimeError, ConnectionError) as e:
        logger.error(f"Async download failed: {e}, falling back to sync")
        # Fallback to sync. Preserve the configured STAC hook if the config
        # exposes one (the original fallback silently used "default").
        hook_config = getattr(config, "stac_hook", "default")
        items = []
        for url in urls:
            item = _download_stac_item(
                url,
                timeout=config.request_timeout,
                retry_attempts=config.retry_attempts,
                hook_config=hook_config,
            )
            if item:
                items.append(item)
        return items
530
+
531
+
532
def consolidate_partition(
    partition_key: str,
    shard_paths: list[str],
    config_dict: dict[str, Any],
) -> dict[str, Any]:
    """Consolidate shards for a partition, merging with existing data.

    This is a **module-level function** that can be pickled and sent to remote
    Dask workers. All dependencies are passed as parameters.

    Deduplication uses newer datetime wins strategy: when items have the same ID,
    the one with the more recent datetime property is kept.

    Args:
        partition_key: Partition key (e.g., "h3_82/2024/01").
        shard_paths: List of shard file paths to merge.
        config_dict: Serialized ProcessingConfig dictionary.

    Returns:
        Dictionary with consolidation results:
            - partition_key: The partition that was consolidated
            - item_count: Total items in final partition
            - existing_count: Items that existed before
            - new_count: New items added
            - duplicates_removed: Number of duplicates removed
            - final_path: Path to final partition file
    """
    # Imported here to avoid a module-level circular import.
    from .ingestion_pipeline import ProcessingConfig

    config = ProcessingConfig.from_dict(config_dict)

    # Get storage backends (catalog and scratch may live on different stores).
    catalog_storage = get_storage_backend(config.output_catalog)
    scratch_storage = get_storage_backend(config.scratch_location)

    # Compute final path
    final_path = f"{config.output_catalog}/{partition_key}/data.parquet"

    existing_count = 0
    all_gdfs: list[gpd.GeoDataFrame] = []

    # Use a temp directory for staging; gpd.read_parquet wants a real file,
    # so remote bytes are always spooled to disk first.
    with tempfile.TemporaryDirectory(dir=config.temp_dir_location) as temp_dir:
        temp_existing_path = Path(temp_dir) / "existing.parquet"
        temp_merged_path = Path(temp_dir) / "merged.parquet"

        # Step 1: Download existing partition if it exists. Read errors are
        # logged and swallowed so a corrupt existing file does not block the
        # new shards from being written.
        if catalog_storage.exists(final_path):
            try:
                logger.debug(f"Partition {partition_key}: downloading existing data")
                with catalog_storage.open(final_path, "rb") as f:
                    binary_data = f.read()
                # Write to temp file so we can use gpd.read_parquet
                with open(temp_existing_path, "wb") as temp_f:
                    temp_f.write(binary_data)
                existing_gdf = gpd.read_parquet(temp_existing_path)
                existing_count = len(existing_gdf)
                all_gdfs.append(existing_gdf)
            except (OSError, ValueError, TypeError) as e:
                logger.error(f"Error reading existing partition {final_path}: {e}")

        # Step 2: Read all new shards; an unreadable shard is skipped, not fatal.
        for idx, shard_path in enumerate(shard_paths):
            try:
                with scratch_storage.open(shard_path, "rb") as f:
                    binary_data = f.read()
                # Write to temp file so we can use gpd.read_parquet
                temp_shard_path = Path(temp_dir) / f"shard_{idx}.parquet"
                with open(temp_shard_path, "wb") as temp_f:
                    temp_f.write(binary_data)
                gdf = gpd.read_parquet(temp_shard_path)
                all_gdfs.append(gdf)
            except (OSError, ValueError, TypeError) as e:
                logger.error(f"Error reading shard {shard_path}: {e}")
                continue

        # Nothing readable at all: report an empty consolidation.
        if not all_gdfs:
            return {
                "partition_key": partition_key,
                "item_count": 0,
                "existing_count": 0,
                "new_count": 0,
                "duplicates_removed": 0,
                "final_path": final_path,
            }

        # Step 3: Merge all data (existing first, then shards in input order).
        merged = pd.concat(all_gdfs, ignore_index=True)
        original_count = len(merged)

        # Step 4: Deduplicate - newer datetime wins. Sort newest-first, keep
        # the first occurrence of each id; unparseable datetimes (NaT) sort
        # last and so lose to any parseable one.
        if "datetime" in merged.columns and "id" in merged.columns:
            try:
                merged["_dt_sort"] = pd.to_datetime(merged["datetime"], errors="coerce")
                merged = merged.sort_values("_dt_sort", ascending=False, na_position="last")
                merged = merged.drop_duplicates(subset=["id"], keep="first")
                merged = merged.drop(columns=["_dt_sort"])
            except (ValueError, TypeError):
                # Fallback to simple dedup; keep="last" prefers shard rows,
                # which were concatenated after the existing data.
                merged = merged.drop_duplicates(subset=["id"], keep="last")
        elif "id" in merged.columns:
            merged = merged.drop_duplicates(subset=["id"], keep="last")

        duplicates_removed = original_count - len(merged)

        # Step 5: Sort by configured key (skipped if the column is absent).
        if config.sort_key in merged.columns:
            merged = merged.sort_values(config.sort_key, ascending=config.sort_ascending)

        # Step 6: Write to temp file.
        # Ensure geometry column is properly set before writing GeoParquet.
        if "geometry" in merged.columns:
            merged_gdf = gpd.GeoDataFrame(merged, geometry="geometry")
        else:
            merged_gdf = gpd.GeoDataFrame(merged)
        merged_gdf.to_parquet(temp_merged_path, index=False, compression="snappy")

        # Step 7: Upload to final location.
        catalog_storage.makedirs(Path(final_path).parent)
        if final_path.startswith("s3://"):
            catalog_storage.upload(str(temp_merged_path), final_path)
        else:
            # Local storage: copy from the staging directory.
            # NOTE(review): copy2 is not atomic; a reader could observe a
            # partially written file. An os.replace within the same
            # filesystem would be atomic — confirm before relying on this.
            import shutil

            shutil.copy2(temp_merged_path, final_path)

        # NOTE: can be negative if dedup removed more rows than the shards
        # added (i.e. duplicates within the pre-existing data).
        new_count = len(merged) - existing_count

        logger.info(
            f"Partition {partition_key}: {existing_count} existing + {new_count} new = "
            f"{len(merged)} total ({duplicates_removed} duplicates removed)"
        )

        return {
            "partition_key": partition_key,
            "item_count": len(merged),
            "existing_count": existing_count,
            "new_count": new_count,
            "duplicates_removed": duplicates_removed,
            "final_path": final_path,
        }
675
+
676
+
677
# Public API: serializable result containers plus the two module-level
# worker entry points used by both local and Dask execution paths.
__all__ = [
    "DownloadResult",
    "ConsolidationResult",
    "process_url_batch",
    "consolidate_partition",
]