earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/pipeline.py
ADDED
@@ -0,0 +1,606 @@
# pipeline.py
"""Pipeline components for batch processing and shard consolidation.

This module provides reusable components for the STAC ingestion pipeline,
including configuration dataclasses and helper utilities for batch processing
and shard consolidation operations.

Components:
    BatchConfig: Configuration for URL batch processing
    ConsolidationConfig: Configuration for shard consolidation
    ShardInfo: Metadata about a written shard
    PartitionResult: Result of consolidating a partition
    BatchResult: Result of processing a URL batch

These components are designed to work with the main STACIngestionPipeline
while providing clear, typed interfaces for configuration and results.
"""

from __future__ import annotations

import io
import logging
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal

import geopandas as gpd
import pandas as pd

if TYPE_CHECKING:
    from .storage_backends import StorageBackend

# Type alias for duplicate handling
DropKeep = Literal["first", "last"]
CompressionType = Literal["snappy", "gzip", "brotli"]

logger = logging.getLogger(__name__)


@dataclass
class BatchConfig:
    """Configuration for URL batch processing.

    Controls how URLs are chunked and processed across workers,
    including async HTTP settings and memory management.

    Attributes:
        batch_size: Number of URLs to process in each async batch.
        items_per_shard: Target number of items per shard file.
        enable_concurrent_http: Whether to use async HTTP processing.
        concurrent_requests: Number of concurrent HTTP requests.
        connection_pool_size: Size of the HTTP connection pool.
        request_timeout: Timeout for each HTTP request in seconds.
        retry_attempts: Number of retry attempts for failed requests.
        retry_delay: Base delay between retries in seconds.

    Example:
        >>> config = BatchConfig(
        ...     batch_size=1000,
        ...     concurrent_requests=50,
        ...     enable_concurrent_http=True
        ... )
        >>> print(f"Processing {config.batch_size} URLs per batch")
    """

    batch_size: int = 1000
    items_per_shard: int = 10000
    enable_concurrent_http: bool = True
    concurrent_requests: int = 50
    connection_pool_size: int = 100
    request_timeout: int = 30
    retry_attempts: int = 3
    retry_delay: float = 1.0

    def __post_init__(self) -> None:
        """Validate configuration after initialization."""
        self._validate()

    def _validate(self) -> None:
        """Validate configuration values."""
        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive")
        if self.items_per_shard <= 0:
            raise ValueError("items_per_shard must be positive")
        if self.concurrent_requests <= 0:
            raise ValueError("concurrent_requests must be positive")
        if self.connection_pool_size <= 0:
            raise ValueError("connection_pool_size must be positive")
        if self.request_timeout <= 0:
            raise ValueError("request_timeout must be positive")
        if self.retry_attempts < 0:
            raise ValueError("retry_attempts must be non-negative")
        if self.retry_delay < 0:
            raise ValueError("retry_delay must be non-negative")

    def __repr__(self) -> str:
        """Return detailed string representation."""
        return (
            f"BatchConfig(batch_size={self.batch_size}, "
            f"items_per_shard={self.items_per_shard}, "
            f"enable_concurrent_http={self.enable_concurrent_http}, "
            f"concurrent_requests={self.concurrent_requests})"
        )

    def __bool__(self) -> bool:
        """Return True if configuration is valid."""
        try:
            self._validate()
            return True
        except ValueError:
            return False


@dataclass
class ConsolidationConfig:
    """Configuration for shard consolidation operations.

    Controls memory management, merge strategies, and output settings
    for consolidating worker shards into final partitioned catalogs.

    Attributes:
        strategy: Consolidation strategy - "efficient" or "legacy".
        max_memory_per_partition_mb: Memory limit per partition in MB.
        enable_streaming_merge: Use streaming for large file merges.
        s3_multipart_threshold_mb: Threshold for S3 multipart uploads.
        temp_dir_location: Location for temporary staging files.
        sort_key: Column to sort consolidated data by.
        sort_ascending: Sort order (True for ascending).
        deduplicate_key: Column to use for deduplication.
        keep_duplicates: Which duplicate to keep ("first" or "last").

    Example:
        >>> config = ConsolidationConfig(
        ...     strategy="efficient",
        ...     max_memory_per_partition_mb=1024,
        ...     enable_streaming_merge=True
        ... )
        >>> print(f"Using {config.strategy} consolidation")
    """

    strategy: str = "efficient"
    max_memory_per_partition_mb: int = 1024
    enable_streaming_merge: bool = True
    s3_multipart_threshold_mb: int = 100
    temp_dir_location: str = field(default_factory=tempfile.gettempdir)
    sort_key: str = "datetime"
    sort_ascending: bool = True
    deduplicate_key: str = "id"
    keep_duplicates: str = "last"

    def __post_init__(self) -> None:
        """Validate configuration after initialization."""
        self._validate()

    def _validate(self) -> None:
        """Validate configuration values."""
        if self.strategy not in ("efficient", "legacy"):
            raise ValueError("strategy must be 'efficient' or 'legacy'")
        if self.max_memory_per_partition_mb <= 0:
            raise ValueError("max_memory_per_partition_mb must be positive")
        if self.s3_multipart_threshold_mb <= 0:
            raise ValueError("s3_multipart_threshold_mb must be positive")
        if self.keep_duplicates not in ("first", "last"):
            raise ValueError("keep_duplicates must be 'first' or 'last'")

    def __repr__(self) -> str:
        """Return detailed string representation."""
        return (
            f"ConsolidationConfig(strategy='{self.strategy}', "
            f"max_memory_mb={self.max_memory_per_partition_mb}, "
            f"streaming={self.enable_streaming_merge})"
        )

    def __bool__(self) -> bool:
        """Return True if configuration is valid."""
        try:
            self._validate()
            return True
        except ValueError:
            return False


@dataclass
class ShardInfo:
    """Metadata about a written shard file.

    Captures information about a shard written during batch processing,
    including its location, size, and partition assignment.

    Attributes:
        shard_path: Full path to the shard file.
        partition_key: Partition key this shard belongs to.
        item_count: Number of items in the shard.
        worker_id: ID of the worker that created this shard.
        shard_id: Sequential ID within the worker's shards.

    Example:
        >>> shard = ShardInfo(
        ...     shard_path="/scratch/shards/h3_82/worker-0.parquet",
        ...     partition_key="dataset/partition=h3/level=2/82/year=2023/month=01",
        ...     item_count=500,
        ...     worker_id="worker-0-abc123"
        ... )
        >>> print(f"Shard has {shard.item_count} items")
    """

    shard_path: str
    partition_key: str
    item_count: int
    worker_id: str
    shard_id: int = 0

    def __repr__(self) -> str:
        """Return detailed string representation."""
        return f"ShardInfo(path='{self.shard_path}', partition='{self.partition_key}', items={self.item_count})"

    def __bool__(self) -> bool:
        """Return True if shard has items."""
        return self.item_count > 0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "shard_path": self.shard_path,
            "partition_key": self.partition_key,
            "item_count": self.item_count,
            "worker_id": self.worker_id,
            "shard_id": self.shard_id,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> ShardInfo:
        """Create ShardInfo from dictionary."""
        return cls(
            shard_path=data["shard_path"],
            partition_key=data.get("partition_key", ""),
            item_count=data["item_count"],
            worker_id=data["worker_id"],
            shard_id=data.get("shard_id", 0),
        )


@dataclass
class PartitionResult:
    """Result of consolidating a single partition.

    Captures the outcome of merging shards into a final partition file,
    including counts of existing, new, and deduplicated items.

    Attributes:
        partition_key: The partition that was consolidated.
        item_count: Total items in the final partition.
        existing_count: Items that existed before consolidation.
        new_count: New items added during consolidation.
        duplicates_removed: Number of duplicate items removed.
        final_path: Path to the consolidated partition file.
        success: Whether consolidation succeeded.
        error: Error message if consolidation failed.

    Example:
        >>> result = PartitionResult(
        ...     partition_key="dataset/partition=h3/level=2/82/year=2023/month=01",
        ...     item_count=1500,
        ...     existing_count=1000,
        ...     new_count=550,
        ...     duplicates_removed=50
        ... )
        >>> print(f"Partition has {result.item_count} total items")
    """

    partition_key: str
    item_count: int = 0
    existing_count: int = 0
    new_count: int = 0
    duplicates_removed: int = 0
    final_path: str = ""
    success: bool = True
    error: str = ""

    def __repr__(self) -> str:
        """Return detailed string representation."""
        status = "OK" if self.success else f"FAILED: {self.error}"
        return (
            f"PartitionResult(partition='{self.partition_key}', "
            f"items={self.item_count}, new={self.new_count}, "
            f"deduped={self.duplicates_removed}, status={status})"
        )

    def __bool__(self) -> bool:
        """Return True if consolidation was successful with items."""
        return self.success and self.item_count > 0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "partition": self.partition_key,
            "item_count": self.item_count,
            "existing_count": self.existing_count,
            "new_count": self.new_count,
            "duplicates_removed": self.duplicates_removed,
            "final_path": self.final_path,
            "success": self.success,
            "error": self.error,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> PartitionResult:
        """Create PartitionResult from dictionary."""
        return cls(
            partition_key=data.get("partition", ""),
            item_count=data.get("item_count", 0),
            existing_count=data.get("existing_count", 0),
            new_count=data.get("new_count", 0),
            duplicates_removed=data.get("duplicates_removed", 0),
            final_path=data.get("final_path", ""),
            success=data.get("success", True),
            error=data.get("error", ""),
        )

    @classmethod
    def empty(cls, partition_key: str) -> PartitionResult:
        """Create an empty result for a partition with no items."""
        return cls(partition_key=partition_key, item_count=0)

    @classmethod
    def failed(cls, partition_key: str, error: str) -> PartitionResult:
        """Create a failed result for a partition."""
        return cls(partition_key=partition_key, success=False, error=error)


@dataclass
class BatchResult:
    """Result of processing a batch of URLs.

    Captures the outcome of processing a URL batch, including
    the shards created and statistics collected.

    Attributes:
        worker_id: ID of the worker that processed this batch.
        shards: List of ShardInfo for shards created.
        urls_processed: Total URLs attempted.
        urls_succeeded: URLs successfully processed.
        urls_failed: URLs that failed processing.
        stats: Statistics collected during processing.

    Example:
        >>> result = BatchResult(
        ...     worker_id="worker-0",
        ...     shards=[shard1, shard2],
        ...     urls_processed=1000,
        ...     urls_succeeded=995,
        ...     urls_failed=5
        ... )
        >>> print(f"Success rate: {result.success_rate:.1%}")
    """

    worker_id: str
    shards: list[ShardInfo] = field(default_factory=list)
    urls_processed: int = 0
    urls_succeeded: int = 0
    urls_failed: int = 0
    stats: Any = None  # IngestionStatistics when available

    @property
    def success_rate(self) -> float:
        """Calculate the success rate for this batch."""
        if self.urls_processed == 0:
            return 0.0
        return self.urls_succeeded / self.urls_processed

    @property
    def total_items(self) -> int:
        """Total items across all shards."""
        return sum(shard.item_count for shard in self.shards)

    def __repr__(self) -> str:
        """Return detailed string representation."""
        return (
            f"BatchResult(worker='{self.worker_id}', "
            f"shards={len(self.shards)}, items={self.total_items}, "
            f"success_rate={self.success_rate:.1%})"
        )

    def __bool__(self) -> bool:
        """Return True if batch produced any items."""
        return self.total_items > 0

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "worker_id": self.worker_id,
            "shards": [s.to_dict() for s in self.shards],
            "urls_processed": self.urls_processed,
            "urls_succeeded": self.urls_succeeded,
            "urls_failed": self.urls_failed,
            "total_items": self.total_items,
            "success_rate": self.success_rate,
        }


def merge_geodataframes(
    dataframes: list[gpd.GeoDataFrame],
    deduplicate_key: str = "id",
    keep: DropKeep = "last",
    sort_key: str | None = None,
    sort_ascending: bool = True,
) -> gpd.GeoDataFrame:
    """Merge multiple GeoDataFrames with deduplication and sorting.

    Utility function for consolidation operations that combines multiple
    GeoDataFrames, removes duplicates, and optionally sorts the result.

    Args:
        dataframes: List of GeoDataFrames to merge.
        deduplicate_key: Column to use for deduplication.
        keep: Which duplicate to keep - "first" or "last".
        sort_key: Optional column to sort by after merging.
        sort_ascending: Sort order if sort_key is specified.

    Returns:
        Merged GeoDataFrame with duplicates removed.

    Example:
        >>> merged = merge_geodataframes(
        ...     [existing_gdf, new_gdf],
        ...     deduplicate_key="id",
        ...     keep="last",
        ...     sort_key="datetime"
        ... )
    """
    if not dataframes:
        return gpd.GeoDataFrame()

    if len(dataframes) == 1:
        merged = dataframes[0].copy()
    else:
        merged = pd.concat(dataframes, ignore_index=True)

    # Deduplicate
    if deduplicate_key in merged.columns:
        original_count = len(merged)
        merged = merged.drop_duplicates(subset=[deduplicate_key], keep=keep)
        duplicates_removed = original_count - len(merged)
        if duplicates_removed > 0:
            logger.debug(f"Removed {duplicates_removed} duplicate items")

    # Sort if requested
    if sort_key and sort_key in merged.columns:
        merged = merged.sort_values(sort_key, ascending=sort_ascending)

    return gpd.GeoDataFrame(merged)


def read_parquet_from_storage(
    storage: StorageBackend,
    path: str,
) -> gpd.GeoDataFrame:
    """Read a GeoParquet file from storage backend.

    Handles the complexity of reading binary data from various storage
    backends and converting to GeoDataFrame.

    Args:
        storage: Storage backend to read from.
        path: Path to the parquet file.

    Returns:
        GeoDataFrame with the file contents.

    Raises:
        IOError: If file cannot be read.

    Example:
        >>> from earthcatalog.storage_backends import get_storage_backend
        >>> storage = get_storage_backend("s3://bucket/catalog")
        >>> gdf = read_parquet_from_storage(storage, "s3://bucket/catalog/data.parquet")
    """
    import pyarrow.parquet as pq

    with storage.open(path, "rb") as f:
        binary_data = f.read()
    table = pq.read_table(io.BytesIO(binary_data))
    df = table.to_pandas()
    return gpd.GeoDataFrame(df)


def write_parquet_to_storage(
    gdf: gpd.GeoDataFrame,
    storage: StorageBackend,
    path: str,
    compression: CompressionType = "snappy",
) -> None:
    """Write a GeoDataFrame to storage backend as GeoParquet.

    Handles the complexity of writing binary data to various storage
    backends with proper compression settings.

    Args:
        gdf: GeoDataFrame to write.
        storage: Storage backend to write to.
        path: Destination path for the parquet file.
        compression: Compression codec to use (default: snappy).

    Raises:
        IOError: If file cannot be written.

    Example:
        >>> from earthcatalog.storage_backends import get_storage_backend
        >>> storage = get_storage_backend("s3://bucket/catalog")
        >>> write_parquet_to_storage(gdf, storage, "s3://bucket/catalog/data.parquet")
    """
    storage.makedirs(Path(path).parent)
    with storage.open(path, "wb") as f:
        gdf.to_parquet(f, index=False, compression=compression)


def group_shards_by_partition(
    shards: list[ShardInfo | dict[str, Any]],
) -> dict[str, list[str]]:
    """Group shard paths by their partition key.

    Utility function that organizes shards by their target partition
    for efficient consolidation processing.

    Args:
        shards: List of ShardInfo objects or dictionaries with shard metadata.

    Returns:
        Dictionary mapping partition keys to lists of shard paths.

    Example:
        >>> shards = [
        ...     ShardInfo(shard_path="/a.parquet", partition_key="p1", item_count=10, worker_id="w1"),
        ...     ShardInfo(shard_path="/b.parquet", partition_key="p1", item_count=20, worker_id="w2"),
        ...     ShardInfo(shard_path="/c.parquet", partition_key="p2", item_count=15, worker_id="w1"),
        ... ]
        >>> groups = group_shards_by_partition(shards)
        >>> print(groups)
        {'p1': ['/a.parquet', '/b.parquet'], 'p2': ['/c.parquet']}
    """
    partition_shards: dict[str, list[str]] = {}

    for shard in shards:
        if isinstance(shard, ShardInfo):
            partition_key = shard.partition_key
            shard_path = shard.shard_path
        else:
            partition_key = shard.get("partition_key", "")
            shard_path = shard.get("shard_path", "")

        if not partition_key:
            continue

        if partition_key not in partition_shards:
            partition_shards[partition_key] = []
        partition_shards[partition_key].append(shard_path)

    return partition_shards


def chunk_urls(urls: list[str], num_chunks: int) -> list[list[str]]:
    """Split URLs into approximately equal chunks for parallel processing.

    Args:
        urls: List of URLs to chunk.
        num_chunks: Number of chunks to create.

    Returns:
        List of URL lists, one per chunk.

    Example:
        >>> urls = ["url1", "url2", "url3", "url4", "url5"]
        >>> chunks = chunk_urls(urls, 2)
        >>> print(chunks)
        [['url1', 'url2', 'url3'], ['url4', 'url5']]
    """
    if num_chunks <= 0:
        raise ValueError("num_chunks must be positive")

    if not urls:
        return []

    chunk_size = max(1, len(urls) // num_chunks)
    chunks = []

    for i in range(0, len(urls), chunk_size):
        chunk = urls[i : i + chunk_size]
        if chunk:
            chunks.append(chunk)

    return chunks


__all__ = [
    "BatchConfig",
    "ConsolidationConfig",
    "ShardInfo",
    "PartitionResult",
    "BatchResult",
    "merge_geodataframes",
    "read_parquet_from_storage",
    "write_parquet_to_storage",
    "group_shards_by_partition",
    "chunk_urls",
]
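For orientation, a minimal sketch of how these helpers might compose outside the full STACIngestionPipeline. The URLs, shard paths, and GeoDataFrames below are hypothetical placeholders rather than real pipeline output, and a real consolidation run would read and write shards through a StorageBackend instead of building frames in memory:

# Hypothetical sketch: stand-alone use of the pipeline.py helpers.
# All URLs, paths, and data frames below are illustrative placeholders.
import geopandas as gpd
from shapely.geometry import Point

from earthcatalog.pipeline import (
    BatchResult,
    ShardInfo,
    chunk_urls,
    group_shards_by_partition,
    merge_geodataframes,
)

# Split a placeholder URL list across two workers.
urls = [f"https://example.com/item-{i}.json" for i in range(10)]
chunks = chunk_urls(urls, num_chunks=2)

# A worker would report the shards it wrote; these paths are made up.
result = BatchResult(
    worker_id="worker-0",
    shards=[
        ShardInfo(shard_path="/scratch/a.parquet", partition_key="p1", item_count=3, worker_id="worker-0"),
        ShardInfo(shard_path="/scratch/b.parquet", partition_key="p1", item_count=2, worker_id="worker-0"),
    ],
    urls_processed=len(chunks[0]),
    urls_succeeded=len(chunks[0]),
)
print(result)  # BatchResult(worker='worker-0', shards=2, items=5, success_rate=100.0%)

# Consolidation groups shard paths per partition, then merges their contents.
by_partition = group_shards_by_partition(result.shards)
print(by_partition)  # {'p1': ['/scratch/a.parquet', '/scratch/b.parquet']}

# Toy GeoDataFrames standing in for two shards of the same partition.
gdf_a = gpd.GeoDataFrame(
    {"id": [1, 2], "datetime": ["2023-01-02", "2023-01-01"]},
    geometry=[Point(0, 0), Point(1, 1)],
)
gdf_b = gpd.GeoDataFrame(
    {"id": [2, 3], "datetime": ["2023-01-03", "2023-01-04"]},
    geometry=[Point(1, 1), Point(2, 2)],
)
merged = merge_geodataframes([gdf_a, gdf_b], deduplicate_key="id", keep="last", sort_key="datetime")
print(list(merged["id"]))  # [1, 2, 3] -- duplicate id=2 kept from the later shard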