earthcatalog-0.2.0-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
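
The two hunks below are the full contents of earthcatalog/tests/test_engines.py and earthcatalog/tests/test_exceptions.py. As a quick orientation, the sketch below condenses the engine workflow those tests exercise; it assumes only the calls visible in the test code (get_engine, items_to_geodataframe, geodataframe_to_items, the *_geoparquet_sync methods, and LocalStorage) and is not an authoritative usage guide for the package.

# Minimal sketch of the engine workflow exercised by test_engines.py.
# Assumes the API behaves exactly as the tests below assert; not an official example.
import tempfile

from earthcatalog.engines import get_engine
from earthcatalog.storage_backends import LocalStorage

items = [{
    "type": "Feature",
    "id": "example-item",
    "stac_version": "1.0.0",
    "geometry": {"type": "Point", "coordinates": [-122.4, 37.8]},
    "bbox": [-122.5, 37.7, -122.3, 37.9],
    "properties": {"datetime": "2024-01-01T00:00:00Z"},
    "links": [],
    "assets": {},
}]

engine = get_engine("auto")  # resolves to either rustac or stac-geoparquet

with tempfile.TemporaryDirectory() as tmpdir:
    storage = LocalStorage(tmpdir)
    path = f"{tmpdir}/items.parquet"
    gdf = engine.items_to_geodataframe(items)        # STAC items -> GeoDataFrame
    engine.write_geoparquet_sync(gdf, path, storage)  # persist as GeoParquet
    back = engine.read_geoparquet_sync(path, storage)
    assert engine.geodataframe_to_items(back)[0]["id"] == "example-item"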

earthcatalog/tests/test_engines.py
@@ -0,0 +1,272 @@

"""Tests for STAC engine abstraction layer.

Tests both rustac and stac-geoparquet engines to ensure consistent behavior.
"""

import tempfile
from pathlib import Path

import geopandas as gpd
import pytest

from earthcatalog.engines import (
    STACEngine,
    get_engine,
)
from earthcatalog.storage_backends import LocalStorage

# Sample STAC items for testing
SAMPLE_ITEMS = [
    {
        "type": "Feature",
        "id": "test-item-1",
        "stac_version": "1.0.0",
        "geometry": {"type": "Point", "coordinates": [-122.4, 37.8]},
        "bbox": [-122.5, 37.7, -122.3, 37.9],
        "properties": {"datetime": "2024-01-01T00:00:00Z"},
        "links": [{"rel": "self", "href": "https://example.com/item1.json"}],
        "assets": {"data": {"href": "https://example.com/data1.tif"}},
    },
    {
        "type": "Feature",
        "id": "test-item-2",
        "stac_version": "1.0.0",
        "geometry": {
            "type": "Polygon",
            "coordinates": [[[-122.0, 37.0], [-121.0, 37.0], [-121.0, 38.0], [-122.0, 38.0], [-122.0, 37.0]]],
        },
        "bbox": [-122.0, 37.0, -121.0, 38.0],
        "properties": {"datetime": "2024-06-15T12:00:00Z"},
        "links": [{"rel": "self", "href": "https://example.com/item2.json"}],
        "assets": {"data": {"href": "https://example.com/data2.tif"}},
    },
]


@pytest.fixture(params=["rustac", "stac-geoparquet"])
def engine(request) -> STACEngine:
    """Parameterized fixture that yields both engine types."""
    return get_engine(request.param)


class TestEngineFactory:
    """Tests for the engine factory function."""

    def test_get_rustac_engine(self):
        """Test getting rustac engine explicitly."""
        engine = get_engine("rustac")
        assert engine.name == "rustac"

    def test_get_stac_geoparquet_engine(self):
        """Test getting stac-geoparquet engine explicitly."""
        engine = get_engine("stac-geoparquet")
        assert engine.name == "stac-geoparquet"

    def test_get_auto_engine(self):
        """Test auto engine selection (should prefer rustac)."""
        engine = get_engine("auto")
        # Should return one of the available engines
        assert engine.name in ("rustac", "stac-geoparquet")

    def test_invalid_engine_type(self):
        """Test that invalid engine type raises ValueError."""
        with pytest.raises(ValueError, match="Unknown engine type"):
            get_engine("invalid-engine")  # type: ignore

    def test_engine_is_stac_engine(self):
        """Test that returned engines implement STACEngine."""
        engine = get_engine("rustac")
        assert isinstance(engine, STACEngine)


class TestItemsToGeoDataFrame:
    """Tests for converting STAC items to GeoDataFrame."""

    def test_single_item_conversion(self, engine: STACEngine):
        """Test converting a single STAC item."""
        gdf = engine.items_to_geodataframe([SAMPLE_ITEMS[0]])

        assert isinstance(gdf, gpd.GeoDataFrame)
        assert len(gdf) == 1
        assert "geometry" in gdf.columns
        assert gdf.iloc[0]["id"] == "test-item-1"

    def test_multiple_items_conversion(self, engine: STACEngine):
        """Test converting multiple STAC items."""
        gdf = engine.items_to_geodataframe(SAMPLE_ITEMS)

        assert isinstance(gdf, gpd.GeoDataFrame)
        assert len(gdf) == 2
        assert set(gdf["id"].tolist()) == {"test-item-1", "test-item-2"}

    def test_empty_items_list(self, engine: STACEngine):
        """Test converting empty list returns empty GeoDataFrame."""
        gdf = engine.items_to_geodataframe([])

        assert isinstance(gdf, gpd.GeoDataFrame)
        assert len(gdf) == 0

    def test_geometry_preserved(self, engine: STACEngine):
        """Test that geometry is correctly preserved."""
        gdf = engine.items_to_geodataframe([SAMPLE_ITEMS[0]])

        geom = gdf.iloc[0].geometry
        assert geom is not None
        assert geom.geom_type == "Point"


class TestGeoDataFrameToItems:
    """Tests for converting GeoDataFrame back to STAC items."""

    def test_roundtrip_conversion(self, engine: STACEngine):
        """Test that items survive a roundtrip conversion."""
        # Convert to GeoDataFrame
        gdf = engine.items_to_geodataframe(SAMPLE_ITEMS)

        # Convert back to items
        items = engine.geodataframe_to_items(gdf)

        assert len(items) == 2
        item_ids = {item["id"] for item in items}
        assert item_ids == {"test-item-1", "test-item-2"}

    def test_empty_geodataframe(self, engine: STACEngine):
        """Test converting empty GeoDataFrame returns empty list."""
        gdf = gpd.GeoDataFrame()
        items = engine.geodataframe_to_items(gdf)

        assert items == []

    def test_geometry_preserved_in_roundtrip(self, engine: STACEngine):
        """Test that geometry is preserved in roundtrip."""
        original = [SAMPLE_ITEMS[0]]
        gdf = engine.items_to_geodataframe(original)
        items = engine.geodataframe_to_items(gdf)

        assert len(items) == 1
        item = items[0]
        assert "geometry" in item
        assert item["geometry"]["type"] == "Point"


class TestGeoParquetIO:
    """Tests for GeoParquet read/write operations."""

    def test_write_and_read_geoparquet(self, engine: STACEngine):
        """Test writing and reading GeoParquet files."""
        with tempfile.TemporaryDirectory() as tmpdir:
            path = f"{tmpdir}/test.parquet"
            storage = LocalStorage(tmpdir)

            # Convert items to GeoDataFrame
            gdf = engine.items_to_geodataframe(SAMPLE_ITEMS)

            # Write to GeoParquet
            engine.write_geoparquet_sync(gdf, path, storage)

            # Verify file was created
            assert Path(path).exists()

            # Read back
            gdf_read = engine.read_geoparquet_sync(path, storage)

            # Verify data
            assert isinstance(gdf_read, gpd.GeoDataFrame)
            assert len(gdf_read) == 2

    def test_write_empty_geodataframe(self, engine: STACEngine):
        """Test that writing empty GeoDataFrame doesn't create file."""
        with tempfile.TemporaryDirectory() as tmpdir:
            path = f"{tmpdir}/empty.parquet"
            storage = LocalStorage(tmpdir)

            gdf = gpd.GeoDataFrame()
            engine.write_geoparquet_sync(gdf, path, storage)

            # Empty GeoDataFrame should not create a file
            assert not Path(path).exists()

    def test_read_nonexistent_file(self, engine: STACEngine):
        """Test reading nonexistent file raises appropriate error."""
        with tempfile.TemporaryDirectory() as tmpdir:
            path = f"{tmpdir}/nonexistent.parquet"
            storage = LocalStorage(tmpdir)

            with pytest.raises((FileNotFoundError, IOError)):
                engine.read_geoparquet_sync(path, storage)


class TestEngineConsistency:
    """Tests to verify both engines produce consistent results."""

    def test_geodataframe_columns_present(self):
        """Test that both engines include essential columns."""
        rustac_engine = get_engine("rustac")
        sgp_engine = get_engine("stac-geoparquet")

        gdf_rustac = rustac_engine.items_to_geodataframe(SAMPLE_ITEMS)
        gdf_sgp = sgp_engine.items_to_geodataframe(SAMPLE_ITEMS)

        # Both should have geometry column
        assert "geometry" in gdf_rustac.columns
        assert "geometry" in gdf_sgp.columns

        # Both should have id column
        assert "id" in gdf_rustac.columns
        assert "id" in gdf_sgp.columns

    def test_item_ids_preserved(self):
        """Test that both engines preserve item IDs."""
        rustac_engine = get_engine("rustac")
        sgp_engine = get_engine("stac-geoparquet")

        gdf_rustac = rustac_engine.items_to_geodataframe(SAMPLE_ITEMS)
        gdf_sgp = sgp_engine.items_to_geodataframe(SAMPLE_ITEMS)

        ids_rustac = set(gdf_rustac["id"].tolist())
        ids_sgp = set(gdf_sgp["id"].tolist())

        expected_ids = {"test-item-1", "test-item-2"}
        assert ids_rustac == expected_ids
        assert ids_sgp == expected_ids

    def test_roundtrip_preserves_ids(self):
        """Test that roundtrip conversion preserves IDs in both engines."""
        rustac_engine = get_engine("rustac")
        sgp_engine = get_engine("stac-geoparquet")

        for engine in [rustac_engine, sgp_engine]:
            gdf = engine.items_to_geodataframe(SAMPLE_ITEMS)
            items = engine.geodataframe_to_items(gdf)

            item_ids = {item["id"] for item in items}
            assert item_ids == {"test-item-1", "test-item-2"}, f"Failed for {engine.name}"


class TestEnginePerformance:
    """Basic performance sanity tests."""

    def test_handles_larger_datasets(self, engine: STACEngine):
        """Test that engines can handle larger datasets."""
        # Create 100 items
        items = []
        for i in range(100):
            item = {
                "type": "Feature",
                "id": f"test-item-{i}",
                "stac_version": "1.0.0",
                "geometry": {"type": "Point", "coordinates": [-122.0 + (i * 0.01), 37.0 + (i * 0.01)]},
                "bbox": [-122.1, 36.9, -121.9, 37.1],
                "properties": {"datetime": "2024-01-01T00:00:00Z"},
                "links": [],
                "assets": {},
            }
            items.append(item)

        # Convert to GeoDataFrame
        gdf = engine.items_to_geodataframe(items)
        assert len(gdf) == 100

        # Convert back
        converted = engine.geodataframe_to_items(gdf)
        assert len(converted) == 100

earthcatalog/tests/test_exceptions.py
@@ -0,0 +1,346 @@

"""Tests for the exceptions module."""

import pytest

from earthcatalog.exceptions import (
    ConfigurationError,
    ConsolidationError,
    DownloadError,
    EarthCatalogError,
    IngestionError,
    InvalidGridConfigError,
    InvalidStorageConfigError,
    ItemValidationError,
    QueryError,
    SpatialResolverError,
    StorageConnectionError,
    StorageError,
    StorageReadError,
    StorageWriteError,
)


class TestEarthCatalogError:
    """Tests for base EarthCatalogError class."""

    def test_basic_initialization(self):
        """Test basic exception initialization."""
        exc = EarthCatalogError("Something went wrong")
        assert str(exc) == "Something went wrong"
        assert exc.message == "Something went wrong"
        assert exc.details == {}

    def test_initialization_with_details(self):
        """Test exception with details dict."""
        details = {"key": "value", "count": 42}
        exc = EarthCatalogError("Error occurred", details=details)
        assert exc.details == details
        assert exc.details["key"] == "value"

    def test_repr(self):
        """Test __repr__ method."""
        exc = EarthCatalogError("Test error")
        assert repr(exc) == "EarthCatalogError('Test error')"

    def test_bool(self):
        """Test __bool__ method - exceptions are always truthy."""
        exc = EarthCatalogError("Error")
        assert bool(exc) is True

    def test_is_exception(self):
        """Test that it can be raised and caught."""
        with pytest.raises(EarthCatalogError):
            raise EarthCatalogError("Test")

    def test_catch_as_base_exception(self):
        """Test that exception inherits from Exception."""
        exc = EarthCatalogError("Test")
        assert isinstance(exc, Exception)


class TestConfigurationError:
    """Tests for ConfigurationError and subclasses."""

    def test_configuration_error_inheritance(self):
        """Test ConfigurationError inherits from EarthCatalogError."""
        exc = ConfigurationError("Invalid config")
        assert isinstance(exc, EarthCatalogError)

    def test_invalid_grid_config_error(self):
        """Test InvalidGridConfigError with all attributes."""
        exc = InvalidGridConfigError(
            "H3 resolution must be 0-15",
            grid_system="h3",
            resolution=20,
        )
        assert exc.grid_system == "h3"
        assert exc.resolution == 20
        assert exc.details["grid_system"] == "h3"
        assert exc.details["resolution"] == 20
        assert isinstance(exc, ConfigurationError)

    def test_invalid_grid_config_error_with_extra_kwargs(self):
        """Test InvalidGridConfigError with additional kwargs."""
        exc = InvalidGridConfigError(
            "Error",
            grid_system="h3",
            resolution=6,
            extra_info="some value",
        )
        assert exc.details["extra_info"] == "some value"

    def test_invalid_storage_config_error(self):
        """Test InvalidStorageConfigError with all attributes."""
        exc = InvalidStorageConfigError(
            "Bucket does not exist",
            backend="s3",
            path="s3://nonexistent/bucket",
        )
        assert exc.backend == "s3"
        assert exc.path == "s3://nonexistent/bucket"
        assert isinstance(exc, ConfigurationError)


class TestIngestionError:
    """Tests for IngestionError and subclasses."""

    def test_ingestion_error_inheritance(self):
        """Test IngestionError inherits from EarthCatalogError."""
        exc = IngestionError("Ingestion failed")
        assert isinstance(exc, EarthCatalogError)

    def test_download_error_basic(self):
        """Test DownloadError with minimal attributes."""
        exc = DownloadError("Connection failed")
        assert exc.url is None
        assert exc.status_code is None
        assert exc.retry_count == 0
        assert isinstance(exc, IngestionError)

    def test_download_error_full(self):
        """Test DownloadError with all attributes."""
        exc = DownloadError(
            "Request timed out",
            url="https://api.example.com/item.json",
            status_code=504,
            retry_count=3,
            error_type="timeout",
        )
        assert exc.url == "https://api.example.com/item.json"
        assert exc.status_code == 504
        assert exc.retry_count == 3
        assert exc.error_type == "timeout"
        assert exc.details["url"] == "https://api.example.com/item.json"

    def test_item_validation_error(self):
        """Test ItemValidationError with all attributes."""
        exc = ItemValidationError(
            "Geometry is self-intersecting",
            item_id="ITEM_123",
            issue_code="INVALID_GEOMETRY",
        )
        assert exc.item_id == "ITEM_123"
        assert exc.issue_code == "INVALID_GEOMETRY"
        assert isinstance(exc, IngestionError)

    def test_consolidation_error(self):
        """Test ConsolidationError with all attributes."""
        exc = ConsolidationError(
            "Memory limit exceeded",
            partition_key="h3=abc123/year=2024/month=01",
            shard_count=50,
        )
        assert exc.partition_key == "h3=abc123/year=2024/month=01"
        assert exc.shard_count == 50
        assert isinstance(exc, IngestionError)


class TestStorageError:
    """Tests for StorageError and subclasses."""

    def test_storage_error_inheritance(self):
        """Test StorageError inherits from EarthCatalogError."""
        exc = StorageError("Storage failed")
        assert isinstance(exc, EarthCatalogError)

    def test_storage_error_with_path(self):
        """Test StorageError with path attribute."""
        exc = StorageError("Cannot access path", path="/data/catalog")
        assert exc.path == "/data/catalog"
        assert exc.details["path"] == "/data/catalog"

    def test_storage_connection_error(self):
        """Test StorageConnectionError."""
        exc = StorageConnectionError(
            "Access denied",
            path="s3://bucket/catalog",
        )
        assert exc.path == "s3://bucket/catalog"
        assert isinstance(exc, StorageError)

    def test_storage_write_error(self):
        """Test StorageWriteError with bytes_written."""
        exc = StorageWriteError(
            "Disk full",
            path="/catalog/partition.parquet",
            bytes_written=1024000,
        )
        assert exc.path == "/catalog/partition.parquet"
        assert exc.bytes_written == 1024000
        assert exc.details["bytes_written"] == 1024000
        assert isinstance(exc, StorageError)

    def test_storage_read_error(self):
        """Test StorageReadError."""
        exc = StorageReadError(
            "File not found",
            path="s3://bucket/missing.parquet",
        )
        assert exc.path == "s3://bucket/missing.parquet"
        assert isinstance(exc, StorageError)


class TestQueryError:
    """Tests for QueryError and subclasses."""

    def test_query_error_inheritance(self):
        """Test QueryError inherits from EarthCatalogError."""
        exc = QueryError("Query failed")
        assert isinstance(exc, EarthCatalogError)

    def test_spatial_resolver_error(self):
        """Test SpatialResolverError with geometry_type."""
        exc = SpatialResolverError(
            "Cannot resolve partitions",
            geometry_type="Polygon",
        )
        assert exc.geometry_type == "Polygon"
        assert exc.details["geometry_type"] == "Polygon"
        assert isinstance(exc, QueryError)


class TestExceptionHierarchy:
    """Tests for the exception hierarchy."""

    def test_all_inherit_from_base(self):
        """Test all exceptions inherit from EarthCatalogError."""
        exceptions = [
            ConfigurationError("test"),
            InvalidGridConfigError("test"),
            InvalidStorageConfigError("test"),
            IngestionError("test"),
            DownloadError("test"),
            ItemValidationError("test"),
            ConsolidationError("test"),
            StorageError("test"),
            StorageConnectionError("test"),
            StorageWriteError("test"),
            StorageReadError("test"),
            QueryError("test"),
            SpatialResolverError("test"),
        ]
        for exc in exceptions:
            assert isinstance(exc, EarthCatalogError), f"{type(exc).__name__} should inherit from EarthCatalogError"

    def test_catch_all_with_base(self):
        """Test that all exceptions can be caught with base class."""
        exceptions_to_test = [
            DownloadError("test"),
            ConsolidationError("test"),
            StorageWriteError("test"),
            SpatialResolverError("test"),
        ]

        for exc in exceptions_to_test:
            try:
                raise exc
            except EarthCatalogError as caught:
                assert caught is exc

    @pytest.mark.parametrize(
        "exception_class,parent_class",
        [
            # Configuration errors
            (InvalidGridConfigError, ConfigurationError),
            (InvalidStorageConfigError, ConfigurationError),
            # Ingestion errors
            (DownloadError, IngestionError),
            (ItemValidationError, IngestionError),
            (ConsolidationError, IngestionError),
            # Storage errors
            (StorageConnectionError, StorageError),
            (StorageWriteError, StorageError),
            (StorageReadError, StorageError),
            # Query errors
            (SpatialResolverError, QueryError),
        ],
        ids=[
            "InvalidGridConfigError->ConfigurationError",
            "InvalidStorageConfigError->ConfigurationError",
            "DownloadError->IngestionError",
            "ItemValidationError->IngestionError",
            "ConsolidationError->IngestionError",
            "StorageConnectionError->StorageError",
            "StorageWriteError->StorageError",
            "StorageReadError->StorageError",
            "SpatialResolverError->QueryError",
        ],
    )
    def test_inheritance_chains(self, exception_class, parent_class):
        """Test specific inheritance chains."""
        exc = exception_class("test")
        assert isinstance(exc, parent_class), f"{exception_class.__name__} should inherit from {parent_class.__name__}"


class TestExceptionUsage:
    """Test realistic usage patterns."""

    def test_exception_in_try_except(self):
        """Test exception usage in try/except."""

        def download_item(url: str) -> dict:
            if "invalid" in url:
                raise DownloadError(
                    f"Failed to download {url}",
                    url=url,
                    status_code=404,
                    error_type="not_found",
                )
            return {"id": "item"}

        # Should not raise
        result = download_item("https://valid.com/item.json")
        assert result == {"id": "item"}

        # Should raise
        with pytest.raises(DownloadError) as exc_info:
            download_item("https://invalid.com/item.json")

        assert exc_info.value.status_code == 404
        assert exc_info.value.error_type == "not_found"

    def test_exception_chaining(self):
        """Test exception chaining with __cause__."""
        try:
            try:
                raise ValueError("Original error")
            except ValueError as e:
                raise DownloadError("Download failed") from e
        except DownloadError as exc:
            assert exc.__cause__ is not None
            assert isinstance(exc.__cause__, ValueError)

    def test_exception_details_for_logging(self):
        """Test that exception details are useful for logging."""
        exc = ConsolidationError(
            "Failed to merge shards",
            partition_key="h3=abc/year=2024",
            shard_count=10,
            memory_used_mb=1024,
        )

        # Details should be suitable for structured logging
        log_context = exc.details
        assert "partition_key" in log_context
        assert "shard_count" in log_context
        assert "memory_used_mb" in log_context