earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/config.py
ADDED
@@ -0,0 +1,97 @@
"""Configuration file loader for EarthCatalog.

Supports YAML configuration files with CLI override capability.
The config file allows users to set defaults for all ProcessingConfig options.
"""

from pathlib import Path
from typing import Any

import yaml


def load_config(config_path: Path | str | None = None) -> dict[str, Any]:
    """Load configuration from YAML file.

    Search order:
    1. Explicit path if provided
    2. ./earthcatalog.yaml in current directory
    3. Empty dict (use defaults)

    Args:
        config_path: Optional explicit path to config file.

    Returns:
        Dictionary with configuration values.

    Raises:
        FileNotFoundError: If explicit config_path is provided but doesn't exist.
        yaml.YAMLError: If config file is invalid YAML.
    """
    if config_path is not None:
        path = Path(config_path)
        if not path.exists():
            raise FileNotFoundError(f"Config file not found: {config_path}")
        return _load_yaml_file(path)

    # Check for default config file in current directory
    default_path = Path("earthcatalog.yaml")
    if default_path.exists():
        return _load_yaml_file(default_path)

    # No config file found, return empty dict
    return {}


def _load_yaml_file(path: Path) -> dict[str, Any]:
    """Load and parse a YAML file.

    Args:
        path: Path to YAML file.

    Returns:
        Parsed YAML content as dictionary.
    """
    with open(path) as f:
        content = yaml.safe_load(f)
    # Handle empty files
    if content is None:
        return {}
    if not isinstance(content, dict):
        raise ValueError(f"Config file must contain a YAML mapping, got: {type(content).__name__}")
    return content


def merge_cli_overrides(config: dict[str, Any], cli_args: dict[str, Any]) -> dict[str, Any]:
    """Merge CLI arguments over config file values.

    CLI args with None values are ignored (use config/default).
    This allows CLI to override config file values while preserving
    unspecified defaults.

    Args:
        config: Base configuration from file.
        cli_args: CLI arguments (may contain None values).

    Returns:
        Merged configuration dictionary.
    """
    result = config.copy()

    for key, value in cli_args.items():
        if value is not None:
            result[key] = value

    return result


def save_config(config: dict[str, Any], path: Path | str) -> None:
    """Save configuration to YAML file.

    Args:
        config: Configuration dictionary to save.
        path: Path to write YAML file.
    """
    path = Path(path)
    with open(path, "w") as f:
        yaml.safe_dump(config, f, default_flow_style=False, sort_keys=False)
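Taken together, these helpers give a simple precedence chain: CLI flags override the config file, which overrides built-in defaults. A minimal sketch of the round trip (the output_dir and max_workers keys are illustrative, not a documented schema):

from earthcatalog.config import load_config, merge_cli_overrides, save_config

# Write a config file; keys here are illustrative, not a documented schema.
save_config({"output_dir": "s3://bucket/catalog", "max_workers": 8}, "earthcatalog.yaml")

# Load it back and layer CLI arguments on top; None means "not given on the
# CLI", so the config-file value survives.
config = load_config("earthcatalog.yaml")
merged = merge_cli_overrides(config, {"max_workers": 16, "output_dir": None})
assert merged == {"output_dir": "s3://bucket/catalog", "max_workers": 16}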
earthcatalog/engines/__init__.py
ADDED
@@ -0,0 +1,308 @@
"""STAC I/O engine abstraction layer.

This module provides pluggable backends for STAC item conversion and GeoParquet I/O,
enabling EarthCatalog to use different underlying implementations without code changes.

Supported Engines:
    rustac: High-performance Rust-based engine using the rustac library.
        - Zero Python dependencies for core operations
        - Native async support for I/O operations
        - Built-in object store integration (S3, GCS, Azure)
        - Recommended for production use

    stac-geoparquet: Legacy Python-based engine using the stac-geoparquet library.
        - Pure Python implementation
        - Well-tested and stable
        - Fallback option for compatibility

Engine Selection:
    The engine can be selected via the ProcessingConfig.stac_engine parameter:
    - "rustac" (default): Use the rustac engine
    - "stac-geoparquet": Use the legacy stac-geoparquet engine
    - "auto": Auto-detect based on available libraries (prefers rustac)

Example:
    >>> from earthcatalog.engines import get_engine
    >>> engine = get_engine("rustac")
    >>> gdf = engine.items_to_geodataframe(items)
    >>> items = engine.geodataframe_to_items(gdf)

Performance:
    The rustac engine provides significant performance improvements:
    - Faster Arrow conversion through the Rust implementation
    - Native async I/O for cloud storage operations
    - Reduced memory overhead for large datasets
"""

import io
import logging
from abc import ABC, abstractmethod
from typing import Any, Literal, cast

import geopandas as gpd
import pyarrow.parquet as pq

logger = logging.getLogger(__name__)

EngineType = Literal["rustac", "stac-geoparquet", "auto"]
CompressionType = Literal["snappy", "gzip", "brotli"]


class GeoParquetIOMixin:
    """Mixin providing shared GeoParquet I/O operations for STAC engines.

    This mixin contains the common implementation for reading and writing
    GeoParquet files that is shared between all engine implementations.
    Extracting this logic ensures bug fixes and improvements are applied
    consistently across all engines.
    """

    def write_geoparquet_sync(
        self,
        gdf: gpd.GeoDataFrame,
        path: str,
        storage: Any,
        compression: str = "snappy",
    ) -> None:
        """Write GeoDataFrame to GeoParquet synchronously.

        Uses GeoPandas' native to_parquet() method with storage backend
        support for cloud paths.

        Args:
            gdf: GeoDataFrame to write.
            path: Output path (local or s3://).
            storage: Storage backend for cloud operations.
            compression: Parquet compression (default: snappy).

        Raises:
            OSError: If write operation fails.
        """
        if gdf.empty:
            logger.warning(f"Skipping write of empty GeoDataFrame to {path}")
            return

        try:
            # Cast compression to the expected literal type
            comp = cast(CompressionType, compression) if compression in ("snappy", "gzip", "brotli") else "snappy"

            if path.startswith("s3://"):
                # Use storage backend for S3
                with storage.open(path, "wb") as f:
                    gdf.to_parquet(f, index=False, compression=comp)
            else:
                # Local filesystem - direct write
                gdf.to_parquet(path, index=False, compression=comp)

        except Exception as e:
            logger.error(f"Error writing GeoParquet to {path}: {e}")
            raise OSError(f"Failed to write GeoParquet to {path}: {e}") from e

    def read_geoparquet_sync(self, path: str, storage: Any) -> gpd.GeoDataFrame:
        """Read GeoParquet file to GeoDataFrame synchronously.

        Uses PyArrow for reading with storage backend support for cloud paths.

        Args:
            path: Input path (local or s3://).
            storage: Storage backend for cloud operations.

        Returns:
            GeoDataFrame containing the data.

        Raises:
            FileNotFoundError: If file does not exist.
            OSError: If read fails.
        """
        try:
            if path.startswith("s3://"):
                # Use storage backend for S3
                with storage.open(path, "rb") as f:
                    binary_data = f.read()
                table = pq.read_table(io.BytesIO(binary_data))
            else:
                # Local filesystem
                table = pq.read_table(path)

            # Convert to GeoDataFrame (preserve geometry column type)
            gdf = gpd.GeoDataFrame.from_arrow(table)

            return gdf

        except FileNotFoundError:
            raise
        except Exception as e:
            logger.error(f"Error reading GeoParquet from {path}: {e}")
            raise OSError(f"Failed to read GeoParquet from {path}: {e}") from e


class STACEngine(ABC):
    """Abstract base class for STAC I/O operations.

    This class defines the interface that all STAC engines must implement.
    Engines handle the conversion between STAC items and GeoDataFrames,
    as well as reading/writing GeoParquet files.

    The interface is designed to be synchronous for GeoDataFrame operations
    (which require in-memory processing) and provides both sync and async
    options for I/O operations.

    Thread Safety:
        Engine instances should be thread-safe for read operations.
        Write operations may require external synchronization.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the engine name identifier."""
        pass

    @abstractmethod
    def items_to_geodataframe(self, items: list[dict[str, Any]]) -> gpd.GeoDataFrame:
        """Convert STAC items to a GeoDataFrame for sorting and processing.

        This is a synchronous operation as it requires in-memory processing
        of the entire item set for operations like sorting and deduplication.

        Args:
            items: List of STAC item dictionaries.

        Returns:
            GeoDataFrame with STAC item data, geometry column, and all properties.

        Raises:
            ValueError: If items cannot be converted to GeoDataFrame.
        """
        pass

    @abstractmethod
    def geodataframe_to_items(self, gdf: gpd.GeoDataFrame) -> list[dict[str, Any]]:
        """Convert a GeoDataFrame back to STAC item dictionaries.

        Args:
            gdf: GeoDataFrame containing STAC item data.

        Returns:
            List of STAC item dictionaries.

        Raises:
            ValueError: If GeoDataFrame cannot be converted to STAC items.
        """
        pass

    @abstractmethod
    def write_geoparquet_sync(
        self,
        gdf: gpd.GeoDataFrame,
        path: str,
        storage: Any,
        compression: str = "snappy",
    ) -> None:
        """Write a GeoDataFrame to GeoParquet file synchronously.

        This method handles both local and cloud storage paths using the
        provided storage backend for cloud operations.

        Args:
            gdf: GeoDataFrame to write.
            path: Output path (local or cloud URL like s3://).
            storage: Storage backend instance for cloud operations.
            compression: Parquet compression algorithm (default: snappy).

        Raises:
            IOError: If write operation fails.
        """
        pass

    @abstractmethod
    def read_geoparquet_sync(self, path: str, storage: Any) -> gpd.GeoDataFrame:
        """Read a GeoParquet file to GeoDataFrame synchronously.

        Args:
            path: Input path (local or cloud URL).
            storage: Storage backend instance for cloud operations.

        Returns:
            GeoDataFrame containing the data.

        Raises:
            FileNotFoundError: If the file does not exist.
            IOError: If read operation fails.
        """
        pass


class EngineNotAvailableError(Exception):
    """Raised when the requested engine is not available."""

    pass


def get_engine(engine_type: EngineType = "rustac") -> STACEngine:
    """Factory function to get the appropriate STAC engine.

    Args:
        engine_type: Engine to use. Options:
            - "rustac": Use rustac engine (default, recommended)
            - "stac-geoparquet": Use legacy stac-geoparquet engine
            - "auto": Auto-detect best available engine (prefers rustac)

    Returns:
        STACEngine instance configured for the requested backend.

    Raises:
        EngineNotAvailableError: If the requested engine is not available.
        ValueError: If an invalid engine type is specified.

    Example:
        >>> engine = get_engine("rustac")
        >>> gdf = engine.items_to_geodataframe(items)
    """
    if engine_type == "auto":
        # Try rustac first, fall back to stac-geoparquet
        try:
            from .rustac_engine import RustacEngine

            return RustacEngine()
        except ImportError:
            try:
                from .stac_geoparquet_engine import StacGeoparquetEngine

                return StacGeoparquetEngine()
            except ImportError:
                raise EngineNotAvailableError(
                    "No STAC engine available. Install rustac[arrow] or stac-geoparquet."
                ) from None

    elif engine_type == "rustac":
        try:
            from .rustac_engine import RustacEngine

            return RustacEngine()
        except ImportError as e:
            raise EngineNotAvailableError(
                f"rustac engine not available: {e}. Install with: pip install 'rustac[arrow]'"
            ) from e

    elif engine_type == "stac-geoparquet":
        try:
            from .stac_geoparquet_engine import StacGeoparquetEngine

            return StacGeoparquetEngine()
        except ImportError as e:
            raise EngineNotAvailableError(
                f"stac-geoparquet engine not available: {e}. Install with: pip install stac-geoparquet"
            ) from e

    else:
        raise ValueError(f"Unknown engine type: {engine_type}. Valid options: rustac, stac-geoparquet, auto")


__all__ = [
    "STACEngine",
    "GeoParquetIOMixin",
    "EngineType",
    "EngineNotAvailableError",
    "get_engine",
]
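Because GeoParquetIOMixin supplies the file I/O, a new backend only has to implement the name property and the two conversion methods. A hypothetical minimal engine, shown purely to illustrate how the mixin and the ABC compose (not part of the package):

from typing import Any

import geopandas as gpd
from shapely.geometry import mapping, shape

from earthcatalog.engines import GeoParquetIOMixin, STACEngine


class ShapelyEngine(GeoParquetIOMixin, STACEngine):
    """Hypothetical demo engine that converts items with shapely directly."""

    @property
    def name(self) -> str:
        return "shapely-demo"

    def items_to_geodataframe(self, items: list[dict[str, Any]]) -> gpd.GeoDataFrame:
        if not items:
            return gpd.GeoDataFrame()
        # Build the geometry column from each item's GeoJSON geometry.
        geometry = [shape(item["geometry"]) for item in items]
        return gpd.GeoDataFrame(
            {"id": [item["id"] for item in items]}, geometry=geometry, crs="EPSG:4326"
        )

    def geodataframe_to_items(self, gdf: gpd.GeoDataFrame) -> list[dict[str, Any]]:
        # Demo only: real engines also round-trip properties, assets, and links.
        return [
            {"type": "Feature", "id": row["id"], "geometry": mapping(row.geometry)}
            for _, row in gdf.iterrows()
        ]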
earthcatalog/engines/rustac_engine.py
ADDED
@@ -0,0 +1,142 @@
"""Rustac-based STAC I/O engine for high-performance operations.

This module provides the rustac engine implementation using the Rust-based
rustac library for STAC item conversion and GeoParquet I/O. It offers significant
performance improvements over pure Python implementations.

Key Features:
    - High-performance Arrow conversion through Rust
    - Native object store support for S3, GCS, Azure
    - Memory-efficient processing for large datasets
    - Zero Python dependencies for core operations

Requirements:
    - rustac[arrow]>=0.9.0

Example:
    >>> from earthcatalog.engines.rustac_engine import RustacEngine
    >>> engine = RustacEngine()
    >>> gdf = engine.items_to_geodataframe(items)
    >>> engine.write_geoparquet_sync(gdf, "output.parquet", storage)
"""

import logging
from typing import Any

import geopandas as gpd

from . import GeoParquetIOMixin, STACEngine

logger = logging.getLogger(__name__)

# Import rustac - will raise ImportError if not available
try:
    import rustac

    HAS_RUSTAC = True
except ImportError:
    HAS_RUSTAC = False
    rustac = None  # type: ignore


class RustacEngine(GeoParquetIOMixin, STACEngine):
    """High-performance STAC engine using the rustac library.

    This engine leverages rustac's Rust implementation for fast Arrow conversion
    and provides native support for cloud object stores.

    The engine uses synchronous wrappers around rustac's async functions for
    compatibility with the existing pipeline architecture. For direct async usage,
    use the rustac library directly.

    GeoParquet I/O operations are inherited from GeoParquetIOMixin to ensure
    consistent behavior across all engine implementations.

    Attributes:
        name: Engine identifier string ("rustac").

    Example:
        >>> engine = RustacEngine()
        >>> items = [{"type": "Feature", "id": "item1", ...}]
        >>> gdf = engine.items_to_geodataframe(items)
        >>> # Process GeoDataFrame (sort, filter, etc.)
        >>> items_out = engine.geodataframe_to_items(gdf)
    """

    def __init__(self) -> None:
        """Initialize the rustac engine.

        Raises:
            ImportError: If rustac library is not installed.
        """
        if not HAS_RUSTAC:
            raise ImportError("rustac library not available. Install with: pip install 'rustac[arrow]'")

    @property
    def name(self) -> str:
        """Return the engine name."""
        return "rustac"

    def items_to_geodataframe(self, items: list[dict[str, Any]]) -> gpd.GeoDataFrame:
        """Convert STAC items to GeoDataFrame using rustac.to_arrow().

        Uses rustac's Rust-based Arrow conversion for better performance
        compared to pure Python implementations.

        Args:
            items: List of STAC item dictionaries.

        Returns:
            GeoDataFrame with geometry and all STAC properties.

        Raises:
            ValueError: If items cannot be converted.
        """
        if not items:
            return gpd.GeoDataFrame()

        try:
            # rustac.to_arrow() accepts a list of items or an item collection
            table = rustac.to_arrow(items)  # type: ignore[union-attr]

            # Convert Arrow table to GeoDataFrame
            # GeoDataFrame.from_arrow() handles the geometry column correctly
            gdf = gpd.GeoDataFrame.from_arrow(table)

            return gdf

        except Exception as e:
            logger.error(f"Error converting items to GeoDataFrame with rustac: {e}")
            raise ValueError(f"Failed to convert STAC items to GeoDataFrame: {e}") from e

    def geodataframe_to_items(self, gdf: gpd.GeoDataFrame) -> list[dict[str, Any]]:
        """Convert GeoDataFrame back to STAC items using rustac.from_arrow().

        Args:
            gdf: GeoDataFrame containing STAC item data.

        Returns:
            List of STAC item dictionaries.

        Raises:
            ValueError: If GeoDataFrame cannot be converted.
        """
        if gdf.empty:
            return []

        try:
            # Convert GeoDataFrame to Arrow table
            table = gdf.to_arrow()

            # Use rustac to convert Arrow table to STAC item collection
            item_collection = rustac.from_arrow(table)  # type: ignore[union-attr, arg-type]

            # Extract features from the item collection
            if isinstance(item_collection, dict):
                return item_collection.get("features", [])
            else:
                return []

        except Exception as e:
            logger.error(f"Error converting GeoDataFrame to items with rustac: {e}")
            raise ValueError(f"Failed to convert GeoDataFrame to STAC items: {e}") from e
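Assuming rustac[arrow] is installed, a round trip through the engine looks like the sketch below; the item is a minimal illustrative STAC record, not a package fixture, and which optional fields survive the round trip depends on rustac's Arrow schema:

from earthcatalog.engines.rustac_engine import RustacEngine

engine = RustacEngine()  # raises ImportError if rustac is not installed

items = [
    {
        "type": "Feature",
        "stac_version": "1.0.0",
        "id": "item-1",
        "geometry": {"type": "Point", "coordinates": [10.0, 50.0]},
        "bbox": [10.0, 50.0, 10.0, 50.0],
        "properties": {"datetime": "2024-01-01T00:00:00Z"},
        "links": [],
        "assets": {},
    }
]

gdf = engine.items_to_geodataframe(items)          # rustac.to_arrow under the hood
round_tripped = engine.geodataframe_to_items(gdf)  # rustac.from_arrow under the hood
assert round_tripped[0]["id"] == "item-1"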
earthcatalog/engines/stac_geoparquet_engine.py
ADDED
@@ -0,0 +1,126 @@
"""Legacy STAC engine using the stac-geoparquet library.

This module provides the stac-geoparquet engine implementation using the
pure Python stac-geoparquet library for STAC item conversion and GeoParquet I/O.

This engine is provided for backward compatibility and as a fallback when
rustac is not available.

Requirements:
    - stac-geoparquet>=0.2.0

Example:
    >>> from earthcatalog.engines.stac_geoparquet_engine import StacGeoparquetEngine
    >>> engine = StacGeoparquetEngine()
    >>> gdf = engine.items_to_geodataframe(items)
    >>> engine.write_geoparquet_sync(gdf, "output.parquet", storage)
"""

import logging
from typing import Any

import geopandas as gpd

from . import GeoParquetIOMixin, STACEngine

logger = logging.getLogger(__name__)

# Import stac_geoparquet - will raise ImportError if not available
try:
    from stac_geoparquet import to_geodataframe, to_item_collection

    HAS_STAC_GEOPARQUET = True
except ImportError:
    HAS_STAC_GEOPARQUET = False
    to_geodataframe = None  # type: ignore
    to_item_collection = None  # type: ignore


class StacGeoparquetEngine(GeoParquetIOMixin, STACEngine):
    """Legacy STAC engine using the stac-geoparquet library.

    This engine uses the stac-geoparquet library for STAC item conversion.
    It provides a stable fallback option when rustac is not available.

    GeoParquet I/O operations are inherited from GeoParquetIOMixin to ensure
    consistent behavior across all engine implementations.

    Attributes:
        name: Engine identifier string ("stac-geoparquet").

    Example:
        >>> engine = StacGeoparquetEngine()
        >>> items = [{"type": "Feature", "id": "item1", ...}]
        >>> gdf = engine.items_to_geodataframe(items)
        >>> items_out = engine.geodataframe_to_items(gdf)
    """

    def __init__(self) -> None:
        """Initialize the stac-geoparquet engine.

        Raises:
            ImportError: If stac-geoparquet library is not installed.
        """
        if not HAS_STAC_GEOPARQUET:
            raise ImportError("stac-geoparquet library not available. Install with: pip install stac-geoparquet")

    @property
    def name(self) -> str:
        """Return the engine name."""
        return "stac-geoparquet"

    def items_to_geodataframe(self, items: list[dict[str, Any]]) -> gpd.GeoDataFrame:
        """Convert STAC items to GeoDataFrame using stac_geoparquet.to_geodataframe().

        Args:
            items: List of STAC item dictionaries.

        Returns:
            GeoDataFrame with geometry and all STAC properties.

        Raises:
            ValueError: If items cannot be converted.
        """
        if not items:
            return gpd.GeoDataFrame()

        try:
            # Use numpy_nullable to maintain current behavior and avoid FutureWarning
            gdf = to_geodataframe(items, dtype_backend="numpy_nullable")  # type: ignore[misc]
            return gdf

        except Exception as e:
            logger.error(f"Error converting items to GeoDataFrame with stac-geoparquet: {e}")
            raise ValueError(f"Failed to convert STAC items to GeoDataFrame: {e}") from e

    def geodataframe_to_items(self, gdf: gpd.GeoDataFrame) -> list[dict[str, Any]]:
        """Convert GeoDataFrame back to STAC items using stac_geoparquet.to_item_collection().

        Args:
            gdf: GeoDataFrame containing STAC item data.

        Returns:
            List of STAC item dictionaries.

        Raises:
            ValueError: If GeoDataFrame cannot be converted.
        """
        if gdf.empty:
            return []

        try:
            item_collection = to_item_collection(gdf)  # type: ignore[misc]

            # Handle both dict and FeatureCollection objects
            if hasattr(item_collection, "to_dict"):
                features = item_collection.to_dict().get("features", [])
            elif isinstance(item_collection, dict):
                features = item_collection.get("features", [])
            else:
                features = []

            return features

        except Exception as e:
            logger.error(f"Error converting GeoDataFrame to items with stac-geoparquet: {e}")
            raise ValueError(f"Failed to convert GeoDataFrame to STAC items: {e}") from e
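Since both engines expose the same STACEngine interface, calling code can stay backend-agnostic and let the factory pick whichever library is installed. A minimal defensive-startup sketch using the factory from earthcatalog.engines:

from earthcatalog.engines import EngineNotAvailableError, get_engine

try:
    # "auto" prefers rustac and falls back to stac-geoparquet.
    engine = get_engine("auto")
except EngineNotAvailableError as exc:
    raise SystemExit(f"No STAC engine installed: {exc}")

print(f"Using STAC engine: {engine.name}")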