earthcatalog-0.2.0-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package files as they appear in the public registry.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_integration.py
@@ -0,0 +1,200 @@
+"""Integration tests for end-to-end catalog generation and spatial resolution."""
+
+import json
+import tempfile
+from pathlib import Path
+
+import pytest
+from shapely.geometry import box
+
+from earthcatalog.spatial_resolver import spatial_resolver
+
+
+class TestIntegration:
+    """Integration tests for schema generation and spatial resolution workflow."""
+
+    def setup_method(self):
+        """Set up test fixtures."""
+        self.temp_dir = tempfile.mkdtemp()
+        self.catalog_path = Path(self.temp_dir) / "test_catalog"
+        self.catalog_path.mkdir(exist_ok=True)
+
+    def test_schema_generation_and_spatial_resolution(self):
+        """Test schema generation functionality and spatial resolution."""
+
+        # Create a mock schema file (as would be generated by the pipeline)
+        mock_schema = {
+            "spatial_partitioning": {
+                "grid_system": "h3",
+                "resolution": 6,
+                "coordinate_system": "WGS84 (EPSG:4326)",
+                "cell_area_km2_avg": 36.13,
+                "cell_edge_length_km_avg": 3.23,
+            },
+            "global_partitioning": {
+                "enabled": True,
+                "threshold": 2,
+                "description": "Items that span more than 2 spatial cells are stored in global partition",
+            },
+            "temporal_partitioning": {"scheme": "year-month", "levels": ["year", "month"]},
+            "partition_structure": {
+                "spatial_partitions": ["861d907dfffffff", "861d90757ffffff"],
+                "temporal_partitions": ["2024", "2024-01", "2024-02"],
+                "global_partition": True,
+                "total_partitions": 12,
+            },
+            "usage_examples": {
+                "query_pattern": "SELECT * FROM read_parquet('{catalog_path}/spatial_partition={{partition_id}}/temporal_partition=2024-*/data.parquet')",
+                "description": "Replace {{partition_id}} with resolved spatial partition IDs for efficient querying",
+            },
+        }
+
+        # Save mock schema file
+        schema_file = self.catalog_path / "catalog_schema.json"
+        with open(schema_file, "w") as f:
+            json.dump(mock_schema, f, indent=2)
+
+        # Test spatial resolution with mock schema
+        resolver = spatial_resolver(str(schema_file), str(self.catalog_path))
+
+        # Verify resolver initialization
+        assert resolver.grid_system == "h3"
+        assert resolver.spatial_config["resolution"] == 6
+        assert resolver.global_enabled is True
+        assert resolver.global_threshold == 2
+
+        # Test resolution for San Francisco area
+        sf_area = box(-122.5, 37.7, -122.0, 38.0)
+        sf_partitions = resolver.resolve_partitions(sf_area, overlap=True)
+
+        assert len(sf_partitions) > 0, "Should resolve at least one partition for SF area"
+
+        # Test query path generation (creates mock partition directories for testing)
+        for partition_id in sf_partitions[:2]:  # Only test a couple to avoid creating too many dirs
+            if partition_id != "global":  # Skip global partition for this test
+                partition_dir = self.catalog_path / partition_id
+                partition_dir.mkdir(exist_ok=True)
+
+        query_paths = resolver.generate_query_paths(sf_partitions, "2024-*")
+        assert isinstance(query_paths, list), "Should return query path list"
+        if len(query_paths) > 0:  # Only check content if paths exist
+            assert any("2024-*" in path for path in query_paths), "Should contain temporal filter"
+
+        # Test resolution with manual global override
+        small_area = box(-122.1, 37.8, -122.05, 37.85)  # Very small area in SF
+        small_partitions_no_global = resolver.resolve_partitions(small_area, include_global=False)
+        small_partitions_with_global = resolver.resolve_partitions(small_area, include_global=True)
+
+        assert "global" not in small_partitions_no_global, "Manual override should exclude global"
+        assert "global" in small_partitions_with_global, "Manual override should include global"
+
+    def test_geojson_schema_and_resolution(self):
+        """Test schema generation and resolution with custom GeoJSON grid system."""
+
+        # Create custom GeoJSON tiles structure (as would be in a schema)
+        geojson_tiles = {
+            "type": "FeatureCollection",
+            "features": [
+                {
+                    "type": "Feature",
+                    "id": "tile_001",
+                    "properties": {"id": "tile_001"},  # ID should be in properties for resolver
+                    "geometry": {
+                        "type": "Polygon",
+                        "coordinates": [
+                            [[-122.5, 37.7], [-122.0, 37.7], [-122.0, 38.0], [-122.5, 38.0], [-122.5, 37.7]]
+                        ],
+                    },
+                },
+                {
+                    "type": "Feature",
+                    "id": "tile_002",
+                    "properties": {"id": "tile_002"},  # ID should be in properties for resolver
+                    "geometry": {
+                        "type": "Polygon",
+                        "coordinates": [
+                            [[-121.0, 37.0], [-120.5, 37.0], [-120.5, 37.5], [-121.0, 37.5], [-121.0, 37.0]]
+                        ],
+                    },
+                },
+            ],
+        }
+
+        # Save GeoJSON tiles to a file (as would happen in real usage)
+        geojson_file = self.catalog_path / "custom_tiles.geojson"
+        with open(geojson_file, "w") as f:
+            json.dump(geojson_tiles, f)
+
+        # Create mock schema with GeoJSON grid (matching schema_generator format)
+        geojson_schema = {
+            "spatial_partitioning": {
+                "grid_system": "geojson",
+                "geojson_source": str(geojson_file),
+                "custom_tiles": {
+                    "total_tiles": 2,
+                    "tile_ids": ["tile_001", "tile_002"],
+                    "source_file": str(geojson_file),
+                },
+                "coordinate_system": "WGS84 (EPSG:4326)",
+            },
+            "global_partitioning": {"enabled": False},
+            "temporal_partitioning": {"scheme": "year-month", "levels": ["year", "month"]},
+            "partition_structure": {
+                "spatial_partitions": ["tile_001", "tile_002"],
+                "temporal_partitions": ["2024", "2024-01"],
+                "global_partition": False,
+                "total_partitions": 4,
+            },
+        }
+
+        # Save schema file
+        schema_file = self.catalog_path / "geojson_schema.json"
+        with open(schema_file, "w") as f:
+            json.dump(geojson_schema, f, indent=2)
+
+        # Test spatial resolution
+        resolver = spatial_resolver(str(schema_file), str(self.catalog_path))
+
+        # Verify GeoJSON resolver initialization
+        assert resolver.grid_system == "geojson"
+        assert resolver.global_enabled is False
+
+        # Query area that overlaps with tile_001
+        query_area = box(-122.4, 37.75, -122.1, 37.95)
+        partitions = resolver.resolve_partitions(query_area)
+
+        assert "tile_001" in partitions, "Should resolve tile_001 for the query area"
+        assert "global" not in partitions, "Global partition should be disabled"
+
+        # Test query path generation for GeoJSON tiles (create mock directory)
+        tile_dir = self.catalog_path / "tile_001"
+        tile_dir.mkdir(exist_ok=True)
+
+        query_paths = resolver.generate_query_paths(["tile_001"], "2024-*")
+        assert isinstance(query_paths, list), "Should return list of query paths"
+        if len(query_paths) > 0:
+            assert any("tile_001" in path for path in query_paths), "Query path should include resolved tile"
+
+    def test_schema_validation_and_error_handling(self):
+        """Test schema validation and error handling in integration scenarios."""
+
+        # Test with invalid schema
+        invalid_schema = {"spatial_partitioning": {"grid_system": "invalid_grid"}}  # Invalid grid system
+
+        schema_file = self.catalog_path / "invalid_schema.json"
+        with open(schema_file, "w") as f:
+            json.dump(invalid_schema, f)
+
+        # Should raise error for invalid grid system
+        with pytest.raises(ValueError, match="Unsupported grid system"):
+            resolver = spatial_resolver(str(schema_file), str(self.catalog_path))
+            resolver.resolve_partitions(box(-122.5, 37.7, -122.0, 38.0))
+
+        # Test with missing schema file
+        with pytest.raises(FileNotFoundError):
+            spatial_resolver("/nonexistent/schema.json", str(self.catalog_path))
+
+
+if __name__ == "__main__":
+    # Allow running tests directly
+    pytest.main([__file__])
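The tests above document a schema-driven query workflow: load a generated catalog schema, resolve a query geometry to spatial partition IDs, then expand those IDs into partition read paths. Below is a minimal sketch of that workflow outside pytest, using only the `spatial_resolver` calls exercised in the tests; the catalog location is illustrative, and `catalog_schema.json` is assumed to have been written by the ingestion pipeline.

```python
from pathlib import Path

from shapely.geometry import box

from earthcatalog.spatial_resolver import spatial_resolver

# Illustrative catalog location; catalog_schema.json is assumed to have been
# written by the pipeline's schema generator, as mocked in the test above.
catalog_path = Path("./my_catalog")
resolver = spatial_resolver(str(catalog_path / "catalog_schema.json"), str(catalog_path))

# Resolve all spatial partitions overlapping a San Francisco bounding box.
partitions = resolver.resolve_partitions(box(-122.5, 37.7, -122.0, 38.0), overlap=True)

# Expand the partition IDs into read paths filtered to 2024 data; the
# resulting globs can feed a read_parquet() call, per the schema's
# usage_examples entry.
for path in resolver.generate_query_paths(partitions, "2024-*"):
    print(path)
```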
@@ -0,0 +1,283 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Integration test for EarthCatalog async HTTP with mocked STAC catalog endpoints.
|
|
3
|
+
|
|
4
|
+
This test validates that the async HTTP implementation works correctly with
|
|
5
|
+
simulated STAC catalog services, ensuring compatibility with real-world data patterns.
|
|
6
|
+
Optimized for fast execution while maintaining comprehensive coverage.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
# mypy: ignore-errors
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import tempfile
|
|
13
|
+
import time
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import pytest
|
|
18
|
+
|
|
19
|
+
# Import async HTTP testing utilities
|
|
20
|
+
try:
|
|
21
|
+
import aiohttp
|
|
22
|
+
from aioresponses import aioresponses
|
|
23
|
+
|
|
24
|
+
HAS_ASYNC_TEST_SUPPORT = True
|
|
25
|
+
except ImportError:
|
|
26
|
+
HAS_ASYNC_TEST_SUPPORT = False
|
|
27
|
+
aioresponses = None # type: ignore
|
|
28
|
+
aiohttp = None # type: ignore
|
|
29
|
+
|
|
30
|
+
# Configure test logging
|
|
31
|
+
logging.basicConfig(level=logging.INFO)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@pytest.fixture
|
|
35
|
+
def mock_stac_urls() -> list[str]:
|
|
36
|
+
"""Provide a set of STAC catalog URLs for integration testing."""
|
|
37
|
+
return [
|
|
38
|
+
# Microsoft Planetary Computer - Sentinel-2 L2A
|
|
39
|
+
"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items?limit=5&bbox=-122.5,37.7,-122.3,37.9",
|
|
40
|
+
# AWS Earth Search - Landsat Collection 2 L2
|
|
41
|
+
"https://earth-search.aws.element84.com/v1/collections/landsat-c2l2-sr/items?limit=5&bbox=-122.5,37.7,-122.3,37.9",
|
|
42
|
+
# ESA Copernicus Data Space - Sentinel-1
|
|
43
|
+
"https://catalogue.dataspace.copernicus.eu/stac/collections/SENTINEL-1/items?limit=5&bbox=-122.5,37.7,-122.3,37.9",
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@pytest.fixture
|
|
48
|
+
def temp_test_dir():
|
|
49
|
+
"""Create temporary directory for test files."""
|
|
50
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
51
|
+
yield Path(temp_dir)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@pytest.fixture
|
|
55
|
+
def sample_stac_response():
|
|
56
|
+
"""Sample STAC FeatureCollection response for mocking."""
|
|
57
|
+
return {
|
|
58
|
+
"type": "FeatureCollection",
|
|
59
|
+
"links": [],
|
|
60
|
+
"features": [
|
|
61
|
+
{
|
|
62
|
+
"id": "S2A_MSIL2A_20241201T123456_N0500_R001_T10SDF_20241201T154321",
|
|
63
|
+
"type": "Feature",
|
|
64
|
+
"geometry": {"type": "Point", "coordinates": [-122.4, 37.8]},
|
|
65
|
+
"properties": {"datetime": "2024-12-01T12:34:56Z", "collection": "sentinel-2-l2a"},
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"id": "LC08_L2SP_123456_20241201_20241201_02_T1",
|
|
69
|
+
"type": "Feature",
|
|
70
|
+
"geometry": {"type": "Point", "coordinates": [-122.35, 37.85]},
|
|
71
|
+
"properties": {"datetime": "2024-12-01T12:34:56Z", "collection": "landsat-c2l2-sr"},
|
|
72
|
+
},
|
|
73
|
+
],
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def create_test_input_file(urls: list[str], output_path: Path) -> None:
|
|
78
|
+
"""Create a parquet file with STAC URLs for testing."""
|
|
79
|
+
df = pd.DataFrame(
|
|
80
|
+
{
|
|
81
|
+
"url": urls,
|
|
82
|
+
"collection": [f"test_collection_{i}" for i in range(len(urls))],
|
|
83
|
+
"grid_id": [f"h{i:02d}v{i:02d}" for i in range(len(urls))],
|
|
84
|
+
}
|
|
85
|
+
)
|
|
86
|
+
df.to_parquet(output_path, index=False)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
|
|
90
|
+
@pytest.mark.asyncio
|
|
91
|
+
async def test_async_http_client_with_mock_stac(sample_stac_response):
|
|
92
|
+
"""Test AsyncHTTPClient with mocked STAC catalog endpoints."""
|
|
93
|
+
pytest.importorskip("aiohttp", reason="aiohttp required for async HTTP client")
|
|
94
|
+
|
|
95
|
+
from earthcatalog.async_http_client import AsyncHTTPClient
|
|
96
|
+
|
|
97
|
+
# Test URLs (mocked for fast, reliable testing)
|
|
98
|
+
test_urls = [
|
|
99
|
+
"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items?limit=2&bbox=-122.5,37.7,-122.3,37.9",
|
|
100
|
+
"https://earth-search.aws.element84.com/v1/collections/landsat-c2l2-sr/items?limit=2&bbox=-122.5,37.7,-122.3,37.9",
|
|
101
|
+
]
|
|
102
|
+
|
|
103
|
+
with aioresponses() as mock_responses:
|
|
104
|
+
# Mock all URLs with successful STAC responses
|
|
105
|
+
for url in test_urls:
|
|
106
|
+
mock_responses.get(url, payload=sample_stac_response)
|
|
107
|
+
|
|
108
|
+
async with AsyncHTTPClient(concurrent_requests=2, request_timeout=5) as client:
|
|
109
|
+
# Use download_batch method which returns RequestResult objects
|
|
110
|
+
results = await client.download_batch(test_urls)
|
|
111
|
+
|
|
112
|
+
# Validate results
|
|
113
|
+
assert len(results) == len(test_urls), "Should receive result for each URL"
|
|
114
|
+
|
|
115
|
+
successful_results = [r for r in results if r.success and r.data]
|
|
116
|
+
assert len(successful_results) == len(test_urls), "All requests should succeed with mocked responses"
|
|
117
|
+
|
|
118
|
+
for i, result in enumerate(successful_results):
|
|
119
|
+
assert result.data is not None, f"Result {i} should have data"
|
|
120
|
+
|
|
121
|
+
# Validate STAC response structure (data is already parsed)
|
|
122
|
+
data = result.data
|
|
123
|
+
assert "type" in data, "STAC response should have 'type' field"
|
|
124
|
+
assert "features" in data, "STAC response should have 'features' field"
|
|
125
|
+
assert isinstance(data["features"], list), "Features should be a list"
|
|
126
|
+
assert len(data["features"]) == 2, "Should have 2 STAC items as mocked"
|
|
127
|
+
|
|
128
|
+
logging.info(f"URL {i}: Retrieved {len(data['features'])} STAC items")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
|
|
132
|
+
@pytest.mark.asyncio
|
|
133
|
+
async def test_batch_downloader_with_mock_stac(sample_stac_response):
|
|
134
|
+
"""Test BatchDownloader with mocked STAC catalog endpoints."""
|
|
135
|
+
pytest.importorskip("aiohttp", reason="aiohttp required for async HTTP client")
|
|
136
|
+
|
|
137
|
+
from earthcatalog.async_http_client import BatchDownloader
|
|
138
|
+
|
|
139
|
+
# Use mocked endpoints for fast testing
|
|
140
|
+
test_urls = [
|
|
141
|
+
"https://planetarycomputer.microsoft.com/api/stac/v1/collections/sentinel-2-l2a/items?limit=1&bbox=-122.4,37.8,-122.35,37.85",
|
|
142
|
+
]
|
|
143
|
+
|
|
144
|
+
with aioresponses() as mock_responses:
|
|
145
|
+
# Mock with STAC response
|
|
146
|
+
mock_responses.get(test_urls[0], payload=sample_stac_response)
|
|
147
|
+
|
|
148
|
+
downloader = BatchDownloader(concurrent_requests=1, batch_size=1, request_timeout=5, retry_attempts=2)
|
|
149
|
+
|
|
150
|
+
try:
|
|
151
|
+
results = await downloader.download_all(test_urls)
|
|
152
|
+
|
|
153
|
+
# Validate results (download_all returns list of responses, not individual items)
|
|
154
|
+
assert len(results) == 1, "Should receive 1 response from mocked endpoint"
|
|
155
|
+
|
|
156
|
+
# The response is a FeatureCollection
|
|
157
|
+
response = results[0]
|
|
158
|
+
assert response is not None, "Response should not be None"
|
|
159
|
+
assert isinstance(response, dict), "Response should be dict"
|
|
160
|
+
|
|
161
|
+
# Validate STAC FeatureCollection structure
|
|
162
|
+
assert "type" in response, "Should have STAC 'type' field"
|
|
163
|
+
assert response["type"] == "FeatureCollection", "Should be a FeatureCollection"
|
|
164
|
+
assert "features" in response, "Should have 'features' field"
|
|
165
|
+
assert len(response["features"]) == 2, "Should have 2 features in the collection"
|
|
166
|
+
|
|
167
|
+
logging.info(f"Batch download: Retrieved FeatureCollection with {len(response['features'])} features")
|
|
168
|
+
|
|
169
|
+
finally:
|
|
170
|
+
await downloader.close()
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def test_processing_config_async_defaults():
|
|
174
|
+
"""Test that ProcessingConfig has correct async defaults."""
|
|
175
|
+
from earthcatalog.ingestion_pipeline import ProcessingConfig
|
|
176
|
+
|
|
177
|
+
config = ProcessingConfig(input_file="test.parquet", output_catalog="./output", scratch_location="./scratch")
|
|
178
|
+
|
|
179
|
+
# Validate async HTTP is enabled by default
|
|
180
|
+
assert config.enable_concurrent_http is True, "Async HTTP should be enabled by default"
|
|
181
|
+
assert config.concurrent_requests == 50, "Default concurrent requests should be 50"
|
|
182
|
+
assert config.batch_size == 1000, "Default batch size should be 1000"
|
|
183
|
+
assert config.request_timeout == 30, "Default timeout should be 30 seconds"
|
|
184
|
+
assert config.retry_attempts == 3, "Default retry attempts should be 3"
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@pytest.mark.asyncio
|
|
188
|
+
async def test_end_to_end_async_integration(mock_stac_urls, temp_test_dir):
|
|
189
|
+
"""End-to-end integration test with mocked STAC URLs."""
|
|
190
|
+
pytest.importorskip("aiohttp", reason="aiohttp required for async processing")
|
|
191
|
+
|
|
192
|
+
from earthcatalog.ingestion_pipeline import LocalProcessor, ProcessingConfig, STACIngestionPipeline
|
|
193
|
+
|
|
194
|
+
# Setup test files
|
|
195
|
+
input_file = temp_test_dir / "test_stac_urls.parquet"
|
|
196
|
+
output_catalog = temp_test_dir / "output_catalog"
|
|
197
|
+
scratch_location = temp_test_dir / "scratch"
|
|
198
|
+
|
|
199
|
+
# Use subset of URLs for faster testing
|
|
200
|
+
test_urls = mock_stac_urls[:2] # Only test first 2 URLs
|
|
201
|
+
create_test_input_file(test_urls, input_file)
|
|
202
|
+
|
|
203
|
+
# Configure for async processing
|
|
204
|
+
config = ProcessingConfig(
|
|
205
|
+
input_file=str(input_file),
|
|
206
|
+
output_catalog=str(output_catalog),
|
|
207
|
+
scratch_location=str(scratch_location),
|
|
208
|
+
enable_concurrent_http=True,
|
|
209
|
+
concurrent_requests=2, # Conservative for testing
|
|
210
|
+
batch_size=10,
|
|
211
|
+
request_timeout=5, # Short timeout for fast testing
|
|
212
|
+
retry_attempts=1, # Minimal retries for speed
|
|
213
|
+
grid_system="h3",
|
|
214
|
+
grid_resolution=2,
|
|
215
|
+
temporal_bin="year",
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# Create processor and pipeline
|
|
219
|
+
processor = LocalProcessor(n_workers=1) # Single worker for testing
|
|
220
|
+
|
|
221
|
+
try:
|
|
222
|
+
pipeline = STACIngestionPipeline(config, processor)
|
|
223
|
+
|
|
224
|
+
# Just test that the pipeline initializes correctly with async config
|
|
225
|
+
# Running the full pipeline with STAC catalog URLs is complex for unit testing
|
|
226
|
+
# since they return FeatureCollections, not individual STAC items
|
|
227
|
+
assert pipeline.config.enable_concurrent_http is True
|
|
228
|
+
assert pipeline.config.concurrent_requests == 2
|
|
229
|
+
assert pipeline.config.request_timeout == 5
|
|
230
|
+
|
|
231
|
+
# Test that the async HTTP client is detected
|
|
232
|
+
from earthcatalog.async_http_client import HAS_ASYNC_HTTP
|
|
233
|
+
|
|
234
|
+
if HAS_ASYNC_HTTP:
|
|
235
|
+
logging.info("End-to-end async integration: Pipeline configured correctly with async HTTP")
|
|
236
|
+
else:
|
|
237
|
+
logging.info("End-to-end async integration: Pipeline configured for sync fallback")
|
|
238
|
+
|
|
239
|
+
except (ValueError, TypeError, AttributeError, ImportError, RuntimeError) as e:
|
|
240
|
+
pytest.skip(f"Pipeline initialization failed: {e}")
|
|
241
|
+
finally:
|
|
242
|
+
processor.close()
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
@pytest.mark.parametrize("concurrent_requests", [1, 5, 10])
|
|
246
|
+
@pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
|
|
247
|
+
@pytest.mark.asyncio
|
|
248
|
+
async def test_concurrent_requests_scaling(concurrent_requests, sample_stac_response):
|
|
249
|
+
"""Test async performance scales with concurrent requests using mocked responses."""
|
|
250
|
+
pytest.importorskip("aiohttp", reason="aiohttp required for async HTTP client")
|
|
251
|
+
|
|
252
|
+
from earthcatalog.async_http_client import AsyncHTTPClient
|
|
253
|
+
|
|
254
|
+
# Use mocked responses for fast, reliable testing
|
|
255
|
+
test_urls = [f"https://example.com/stac/item_{i}.json" for i in range(5)]
|
|
256
|
+
|
|
257
|
+
start_time = time.time()
|
|
258
|
+
|
|
259
|
+
with aioresponses() as mock_responses:
|
|
260
|
+
# Mock all URLs with successful responses
|
|
261
|
+
for url in test_urls:
|
|
262
|
+
mock_responses.get(url, payload=sample_stac_response)
|
|
263
|
+
|
|
264
|
+
async with AsyncHTTPClient(concurrent_requests=concurrent_requests, request_timeout=5) as client:
|
|
265
|
+
results = await client.download_batch(test_urls)
|
|
266
|
+
|
|
267
|
+
total_time = time.time() - start_time
|
|
268
|
+
success_count = sum(1 for r in results if r.success)
|
|
269
|
+
|
|
270
|
+
# Validate performance characteristics
|
|
271
|
+
assert success_count == len(test_urls), "All mocked requests should succeed"
|
|
272
|
+
|
|
273
|
+
# With mocked responses, this should be very fast (under 1 second)
|
|
274
|
+
assert total_time < 2.0, f"Mocked requests should be fast: {total_time:.1f}s"
|
|
275
|
+
|
|
276
|
+
logging.info(
|
|
277
|
+
f"Concurrent requests {concurrent_requests}: {success_count}/{len(test_urls)} success in {total_time:.2f}s"
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
if __name__ == "__main__":
|
|
282
|
+
# Run integration tests directly
|
|
283
|
+
pytest.main([__file__, "-v", "-s"])
|
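For orientation, here is a minimal standalone sketch of the `AsyncHTTPClient` pattern these tests mock, relying only on behavior asserted above: the async context manager, `download_batch`, and `RequestResult` objects exposing `success` and already-parsed `data`. The URL is illustrative.

```python
import asyncio

from earthcatalog.async_http_client import AsyncHTTPClient


async def fetch_stac_pages(urls: list[str]) -> list[dict]:
    """Fetch a batch of STAC endpoints and return the parsed JSON payloads."""
    async with AsyncHTTPClient(concurrent_requests=5, request_timeout=10) as client:
        results = await client.download_batch(urls)
    # Each RequestResult carries a success flag and the already-parsed payload.
    return [r.data for r in results if r.success and r.data]


if __name__ == "__main__":
    pages = asyncio.run(
        fetch_stac_pages(
            ["https://earth-search.aws.element84.com/v1/collections/landsat-c2l2-sr/items?limit=1"]
        )
    )
    print(f"Fetched {len(pages)} FeatureCollection(s)")
```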