earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,606 @@
+ # pipeline.py
+ """Pipeline components for batch processing and shard consolidation.
+
+ This module provides reusable components for the STAC ingestion pipeline,
+ including configuration dataclasses and helper utilities for batch processing
+ and shard consolidation operations.
+
+ Components:
+     BatchConfig: Configuration for URL batch processing
+     ConsolidationConfig: Configuration for shard consolidation
+     ShardInfo: Metadata about a written shard
+     PartitionResult: Result of consolidating a partition
+     BatchResult: Result of processing a URL batch
+
+ These components are designed to work with the main STACIngestionPipeline
+ while providing clear, typed interfaces for configuration and results.
+ """
+
+ from __future__ import annotations
+
+ import io
+ import logging
+ import tempfile
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any, Literal
+
+ import geopandas as gpd
+ import pandas as pd
+
+ if TYPE_CHECKING:
+     from .storage_backends import StorageBackend
+
+ # Type alias for duplicate handling
+ DropKeep = Literal["first", "last"]
+ CompressionType = Literal["snappy", "gzip", "brotli"]
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class BatchConfig:
+     """Configuration for URL batch processing.
+
+     Controls how URLs are chunked and processed across workers,
+     including async HTTP settings and memory management.
+
+     Attributes:
+         batch_size: Number of URLs to process in each async batch.
+         items_per_shard: Target number of items per shard file.
+         enable_concurrent_http: Whether to use async HTTP processing.
+         concurrent_requests: Number of concurrent HTTP requests.
+         connection_pool_size: Size of the HTTP connection pool.
+         request_timeout: Timeout for each HTTP request in seconds.
+         retry_attempts: Number of retry attempts for failed requests.
+         retry_delay: Base delay between retries in seconds.
+
+     Example:
+         >>> config = BatchConfig(
+         ...     batch_size=1000,
+         ...     concurrent_requests=50,
+         ...     enable_concurrent_http=True
+         ... )
+         >>> print(f"Processing {config.batch_size} URLs per batch")
+     """
+
+     batch_size: int = 1000
+     items_per_shard: int = 10000
+     enable_concurrent_http: bool = True
+     concurrent_requests: int = 50
+     connection_pool_size: int = 100
+     request_timeout: int = 30
+     retry_attempts: int = 3
+     retry_delay: float = 1.0
+
+     def __post_init__(self) -> None:
+         """Validate configuration after initialization."""
+         self._validate()
+
+     def _validate(self) -> None:
+         """Validate configuration values."""
+         if self.batch_size <= 0:
+             raise ValueError("batch_size must be positive")
+         if self.items_per_shard <= 0:
+             raise ValueError("items_per_shard must be positive")
+         if self.concurrent_requests <= 0:
+             raise ValueError("concurrent_requests must be positive")
+         if self.connection_pool_size <= 0:
+             raise ValueError("connection_pool_size must be positive")
+         if self.request_timeout <= 0:
+             raise ValueError("request_timeout must be positive")
+         if self.retry_attempts < 0:
+             raise ValueError("retry_attempts must be non-negative")
+         if self.retry_delay < 0:
+             raise ValueError("retry_delay must be non-negative")
+
+     def __repr__(self) -> str:
+         """Return detailed string representation."""
+         return (
+             f"BatchConfig(batch_size={self.batch_size}, "
+             f"items_per_shard={self.items_per_shard}, "
+             f"enable_concurrent_http={self.enable_concurrent_http}, "
+             f"concurrent_requests={self.concurrent_requests})"
+         )
+
+     def __bool__(self) -> bool:
+         """Return True if configuration is valid."""
+         try:
+             self._validate()
+             return True
+         except ValueError:
+             return False
+
+
+ @dataclass
+ class ConsolidationConfig:
+     """Configuration for shard consolidation operations.
+
+     Controls memory management, merge strategies, and output settings
+     for consolidating worker shards into final partitioned catalogs.
+
+     Attributes:
+         strategy: Consolidation strategy - "efficient" or "legacy".
+         max_memory_per_partition_mb: Memory limit per partition in MB.
+         enable_streaming_merge: Use streaming for large file merges.
+         s3_multipart_threshold_mb: Threshold for S3 multipart uploads.
+         temp_dir_location: Location for temporary staging files.
+         sort_key: Column to sort consolidated data by.
+         sort_ascending: Sort order (True for ascending).
+         deduplicate_key: Column to use for deduplication.
+         keep_duplicates: Which duplicate to keep ("first" or "last").
+
+     Example:
+         >>> config = ConsolidationConfig(
+         ...     strategy="efficient",
+         ...     max_memory_per_partition_mb=1024,
+         ...     enable_streaming_merge=True
+         ... )
+         >>> print(f"Using {config.strategy} consolidation")
+     """
+
+     strategy: str = "efficient"
+     max_memory_per_partition_mb: int = 1024
+     enable_streaming_merge: bool = True
+     s3_multipart_threshold_mb: int = 100
+     temp_dir_location: str = field(default_factory=tempfile.gettempdir)
+     sort_key: str = "datetime"
+     sort_ascending: bool = True
+     deduplicate_key: str = "id"
+     keep_duplicates: str = "last"
+
+     def __post_init__(self) -> None:
+         """Validate configuration after initialization."""
+         self._validate()
+
+     def _validate(self) -> None:
+         """Validate configuration values."""
+         if self.strategy not in ("efficient", "legacy"):
+             raise ValueError("strategy must be 'efficient' or 'legacy'")
+         if self.max_memory_per_partition_mb <= 0:
+             raise ValueError("max_memory_per_partition_mb must be positive")
+         if self.s3_multipart_threshold_mb <= 0:
+             raise ValueError("s3_multipart_threshold_mb must be positive")
+         if self.keep_duplicates not in ("first", "last"):
+             raise ValueError("keep_duplicates must be 'first' or 'last'")
+
+     def __repr__(self) -> str:
+         """Return detailed string representation."""
+         return (
+             f"ConsolidationConfig(strategy='{self.strategy}', "
+             f"max_memory_mb={self.max_memory_per_partition_mb}, "
+             f"streaming={self.enable_streaming_merge})"
+         )
+
+     def __bool__(self) -> bool:
+         """Return True if configuration is valid."""
+         try:
+             self._validate()
+             return True
+         except ValueError:
+             return False
+
+
+ @dataclass
+ class ShardInfo:
+     """Metadata about a written shard file.
+
+     Captures information about a shard written during batch processing,
+     including its location, size, and partition assignment.
+
+     Attributes:
+         shard_path: Full path to the shard file.
+         partition_key: Partition key this shard belongs to.
+         item_count: Number of items in the shard.
+         worker_id: ID of the worker that created this shard.
+         shard_id: Sequential ID within the worker's shards.
+
+     Example:
+         >>> shard = ShardInfo(
+         ...     shard_path="/scratch/shards/h3_82/worker-0.parquet",
+         ...     partition_key="dataset/partition=h3/level=2/82/year=2023/month=01",
+         ...     item_count=500,
+         ...     worker_id="worker-0-abc123"
+         ... )
+         >>> print(f"Shard has {shard.item_count} items")
+     """
+
+     shard_path: str
+     partition_key: str
+     item_count: int
+     worker_id: str
+     shard_id: int = 0
+
+     def __repr__(self) -> str:
+         """Return detailed string representation."""
+         return f"ShardInfo(path='{self.shard_path}', partition='{self.partition_key}', items={self.item_count})"
+
+     def __bool__(self) -> bool:
+         """Return True if shard has items."""
+         return self.item_count > 0
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "shard_path": self.shard_path,
+             "partition_key": self.partition_key,
+             "item_count": self.item_count,
+             "worker_id": self.worker_id,
+             "shard_id": self.shard_id,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ShardInfo:
+         """Create ShardInfo from dictionary."""
+         return cls(
+             shard_path=data["shard_path"],
+             partition_key=data.get("partition_key", ""),
+             item_count=data["item_count"],
+             worker_id=data["worker_id"],
+             shard_id=data.get("shard_id", 0),
+         )
+
+
+ @dataclass
+ class PartitionResult:
+     """Result of consolidating a single partition.
+
+     Captures the outcome of merging shards into a final partition file,
+     including counts of existing, new, and deduplicated items.
+
+     Attributes:
+         partition_key: The partition that was consolidated.
+         item_count: Total items in the final partition.
+         existing_count: Items that existed before consolidation.
+         new_count: New items added during consolidation.
+         duplicates_removed: Number of duplicate items removed.
+         final_path: Path to the consolidated partition file.
+         success: Whether consolidation succeeded.
+         error: Error message if consolidation failed.
+
+     Example:
+         >>> result = PartitionResult(
+         ...     partition_key="dataset/partition=h3/level=2/82/year=2023/month=01",
+         ...     item_count=1500,
+         ...     existing_count=1000,
+         ...     new_count=550,
+         ...     duplicates_removed=50
+         ... )
+         >>> print(f"Partition has {result.item_count} total items")
+     """
+
+     partition_key: str
+     item_count: int = 0
+     existing_count: int = 0
+     new_count: int = 0
+     duplicates_removed: int = 0
+     final_path: str = ""
+     success: bool = True
+     error: str = ""
+
+     def __repr__(self) -> str:
+         """Return detailed string representation."""
+         status = "OK" if self.success else f"FAILED: {self.error}"
+         return (
+             f"PartitionResult(partition='{self.partition_key}', "
+             f"items={self.item_count}, new={self.new_count}, "
+             f"deduped={self.duplicates_removed}, status={status})"
+         )
+
+     def __bool__(self) -> bool:
+         """Return True if consolidation was successful with items."""
+         return self.success and self.item_count > 0
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "partition": self.partition_key,
+             "item_count": self.item_count,
+             "existing_count": self.existing_count,
+             "new_count": self.new_count,
+             "duplicates_removed": self.duplicates_removed,
+             "final_path": self.final_path,
+             "success": self.success,
+             "error": self.error,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> PartitionResult:
+         """Create PartitionResult from dictionary."""
+         return cls(
+             partition_key=data.get("partition", ""),
+             item_count=data.get("item_count", 0),
+             existing_count=data.get("existing_count", 0),
+             new_count=data.get("new_count", 0),
+             duplicates_removed=data.get("duplicates_removed", 0),
+             final_path=data.get("final_path", ""),
+             success=data.get("success", True),
+             error=data.get("error", ""),
+         )
+
+     @classmethod
+     def empty(cls, partition_key: str) -> PartitionResult:
+         """Create an empty result for a partition with no items."""
+         return cls(partition_key=partition_key, item_count=0)
+
+     @classmethod
+     def failed(cls, partition_key: str, error: str) -> PartitionResult:
+         """Create a failed result for a partition."""
+         return cls(partition_key=partition_key, success=False, error=error)
+
+
+ @dataclass
+ class BatchResult:
+     """Result of processing a batch of URLs.
+
+     Captures the outcome of processing a URL batch, including
+     the shards created and statistics collected.
+
+     Attributes:
+         worker_id: ID of the worker that processed this batch.
+         shards: List of ShardInfo for shards created.
+         urls_processed: Total URLs attempted.
+         urls_succeeded: URLs successfully processed.
+         urls_failed: URLs that failed processing.
+         stats: Statistics collected during processing.
+
+     Example:
+         >>> result = BatchResult(
+         ...     worker_id="worker-0",
+         ...     shards=[shard1, shard2],
+         ...     urls_processed=1000,
+         ...     urls_succeeded=995,
+         ...     urls_failed=5
+         ... )
+         >>> print(f"Success rate: {result.success_rate:.1%}")
+     """
+
+     worker_id: str
+     shards: list[ShardInfo] = field(default_factory=list)
+     urls_processed: int = 0
+     urls_succeeded: int = 0
+     urls_failed: int = 0
+     stats: Any = None  # IngestionStatistics when available
+
+     @property
+     def success_rate(self) -> float:
+         """Calculate the success rate for this batch."""
+         if self.urls_processed == 0:
+             return 0.0
+         return self.urls_succeeded / self.urls_processed
+
+     @property
+     def total_items(self) -> int:
+         """Total items across all shards."""
+         return sum(shard.item_count for shard in self.shards)
+
+     def __repr__(self) -> str:
+         """Return detailed string representation."""
+         return (
+             f"BatchResult(worker='{self.worker_id}', "
+             f"shards={len(self.shards)}, items={self.total_items}, "
+             f"success_rate={self.success_rate:.1%})"
+         )
+
+     def __bool__(self) -> bool:
+         """Return True if batch produced any items."""
+         return self.total_items > 0
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for serialization."""
+         return {
+             "worker_id": self.worker_id,
+             "shards": [s.to_dict() for s in self.shards],
+             "urls_processed": self.urls_processed,
+             "urls_succeeded": self.urls_succeeded,
+             "urls_failed": self.urls_failed,
+             "total_items": self.total_items,
+             "success_rate": self.success_rate,
+         }
+
+
+ def merge_geodataframes(
+     dataframes: list[gpd.GeoDataFrame],
+     deduplicate_key: str = "id",
+     keep: DropKeep = "last",
+     sort_key: str | None = None,
+     sort_ascending: bool = True,
+ ) -> gpd.GeoDataFrame:
+     """Merge multiple GeoDataFrames with deduplication and sorting.
+
+     Utility function for consolidation operations that combines multiple
+     GeoDataFrames, removes duplicates, and optionally sorts the result.
+
+     Args:
+         dataframes: List of GeoDataFrames to merge.
+         deduplicate_key: Column to use for deduplication.
+         keep: Which duplicate to keep - "first" or "last".
+         sort_key: Optional column to sort by after merging.
+         sort_ascending: Sort order if sort_key is specified.
+
+     Returns:
+         Merged GeoDataFrame with duplicates removed.
+
+     Example:
+         >>> merged = merge_geodataframes(
+         ...     [existing_gdf, new_gdf],
+         ...     deduplicate_key="id",
+         ...     keep="last",
+         ...     sort_key="datetime"
+         ... )
+     """
+     if not dataframes:
+         return gpd.GeoDataFrame()
+
+     if len(dataframes) == 1:
+         merged = dataframes[0].copy()
+     else:
+         merged = pd.concat(dataframes, ignore_index=True)
+
+     # Deduplicate
+     if deduplicate_key in merged.columns:
+         original_count = len(merged)
+         merged = merged.drop_duplicates(subset=[deduplicate_key], keep=keep)
+         duplicates_removed = original_count - len(merged)
+         if duplicates_removed > 0:
+             logger.debug(f"Removed {duplicates_removed} duplicate items")
+
+     # Sort if requested
+     if sort_key and sort_key in merged.columns:
+         merged = merged.sort_values(sort_key, ascending=sort_ascending)
+
+     return gpd.GeoDataFrame(merged)
+
+
+ def read_parquet_from_storage(
+     storage: StorageBackend,
+     path: str,
+ ) -> gpd.GeoDataFrame:
+     """Read a GeoParquet file from storage backend.
+
+     Handles the complexity of reading binary data from various storage
+     backends and converting to GeoDataFrame.
+
+     Args:
+         storage: Storage backend to read from.
+         path: Path to the parquet file.
+
+     Returns:
+         GeoDataFrame with the file contents.
+
+     Raises:
+         IOError: If file cannot be read.
+
+     Example:
+         >>> from earthcatalog.storage_backends import get_storage_backend
+         >>> storage = get_storage_backend("s3://bucket/catalog")
+         >>> gdf = read_parquet_from_storage(storage, "s3://bucket/catalog/data.parquet")
+     """
+     import pyarrow.parquet as pq
+
+     with storage.open(path, "rb") as f:
+         binary_data = f.read()
+     table = pq.read_table(io.BytesIO(binary_data))
+     df = table.to_pandas()
+     return gpd.GeoDataFrame(df)
+
+
+ def write_parquet_to_storage(
+     gdf: gpd.GeoDataFrame,
+     storage: StorageBackend,
+     path: str,
+     compression: CompressionType = "snappy",
+ ) -> None:
+     """Write a GeoDataFrame to storage backend as GeoParquet.
+
+     Handles the complexity of writing binary data to various storage
+     backends with proper compression settings.
+
+     Args:
+         gdf: GeoDataFrame to write.
+         storage: Storage backend to write to.
+         path: Destination path for the parquet file.
+         compression: Compression codec to use (default: snappy).
+
+     Raises:
+         IOError: If file cannot be written.
+
+     Example:
+         >>> from earthcatalog.storage_backends import get_storage_backend
+         >>> storage = get_storage_backend("s3://bucket/catalog")
+         >>> write_parquet_to_storage(gdf, storage, "s3://bucket/catalog/data.parquet")
+     """
+     storage.makedirs(Path(path).parent)
+     with storage.open(path, "wb") as f:
+         gdf.to_parquet(f, index=False, compression=compression)
+
+
+ def group_shards_by_partition(
+     shards: list[ShardInfo | dict[str, Any]],
+ ) -> dict[str, list[str]]:
+     """Group shard paths by their partition key.
+
+     Utility function that organizes shards by their target partition
+     for efficient consolidation processing.
+
+     Args:
+         shards: List of ShardInfo objects or dictionaries with shard metadata.
+
+     Returns:
+         Dictionary mapping partition keys to lists of shard paths.
+
+     Example:
+         >>> shards = [
+         ...     ShardInfo(shard_path="/a.parquet", partition_key="p1", item_count=10, worker_id="w1"),
+         ...     ShardInfo(shard_path="/b.parquet", partition_key="p1", item_count=20, worker_id="w2"),
+         ...     ShardInfo(shard_path="/c.parquet", partition_key="p2", item_count=15, worker_id="w1"),
+         ... ]
+         >>> groups = group_shards_by_partition(shards)
+         >>> print(groups)
+         {'p1': ['/a.parquet', '/b.parquet'], 'p2': ['/c.parquet']}
+     """
+     partition_shards: dict[str, list[str]] = {}
+
+     for shard in shards:
+         if isinstance(shard, ShardInfo):
+             partition_key = shard.partition_key
+             shard_path = shard.shard_path
+         else:
+             partition_key = shard.get("partition_key", "")
+             shard_path = shard.get("shard_path", "")
+
+         if not partition_key:
+             continue
+
+         if partition_key not in partition_shards:
+             partition_shards[partition_key] = []
+         partition_shards[partition_key].append(shard_path)
+
+     return partition_shards
+
+
+ def chunk_urls(urls: list[str], num_chunks: int) -> list[list[str]]:
+     """Split URLs into approximately equal chunks for parallel processing.
+
+     Args:
+         urls: List of URLs to chunk.
+         num_chunks: Number of chunks to create.
+
+     Returns:
+         List of URL lists, one per chunk.
+
+     Example:
+         >>> urls = ["url1", "url2", "url3", "url4", "url5"]
+         >>> chunks = chunk_urls(urls, 2)
+         >>> print(chunks)
+         [['url1', 'url2', 'url3'], ['url4', 'url5']]
+     """
+     if num_chunks <= 0:
+         raise ValueError("num_chunks must be positive")
+
+     if not urls:
+         return []
+
+     chunk_size = max(1, -(-len(urls) // num_chunks))  # ceiling division: at most num_chunks chunks
+     chunks = []
+
+     for i in range(0, len(urls), chunk_size):
+         chunk = urls[i : i + chunk_size]
+         if chunk:
+             chunks.append(chunk)
+
+     return chunks
+
+
+ __all__ = [
+     "BatchConfig",
+     "ConsolidationConfig",
+     "ShardInfo",
+     "PartitionResult",
+     "BatchResult",
+     "merge_geodataframes",
+     "read_parquet_from_storage",
+     "write_parquet_to_storage",
+     "group_shards_by_partition",
+     "chunk_urls",
+ ]
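
For orientation, the following is a minimal usage sketch of the module's exported helpers; it is not part of the wheel contents shown above. The URLs, paths, and IDs are invented, and the data stays in memory so the snippet runs without any storage backend or HTTP access.

    import geopandas as gpd
    from shapely.geometry import Point

    from earthcatalog.pipeline import (
        ShardInfo,
        chunk_urls,
        group_shards_by_partition,
        merge_geodataframes,
    )

    # Split a URL list into two chunks for two hypothetical workers.
    urls = [
        "https://example.com/item1.json",
        "https://example.com/item2.json",
        "https://example.com/item3.json",
        "https://example.com/item4.json",
    ]
    print(chunk_urls(urls, num_chunks=2))  # two chunks of two URLs each

    # Group shard metadata (as carried in BatchResult.shards) by partition key.
    shards = [
        ShardInfo(shard_path="/tmp/a.parquet", partition_key="p1", item_count=2, worker_id="w0"),
        ShardInfo(shard_path="/tmp/b.parquet", partition_key="p1", item_count=1, worker_id="w1"),
    ]
    print(group_shards_by_partition(shards))  # {'p1': ['/tmp/a.parquet', '/tmp/b.parquet']}

    # Merge two GeoDataFrames, keeping the last copy of each duplicate id.
    gdf_a = gpd.GeoDataFrame({"id": ["x", "y"]}, geometry=[Point(0, 0), Point(1, 1)])
    gdf_b = gpd.GeoDataFrame({"id": ["y", "z"]}, geometry=[Point(1, 1), Point(2, 2)])
    merged = merge_geodataframes([gdf_a, gdf_b], deduplicate_key="id", keep="last")
    print(len(merged))  # 3 rows after deduplication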