earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
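The new test module in the diff below documents two ways to run its Dask suite: an ephemeral LocalCluster created when DASK_TESTING=TRUE is set, or an existing cluster addressed via DASK_SCHEDULER_ADDRESS. The snippet here is only an illustrative convenience runner for the first mode, not part of the wheel; the test path and the "dask" marker are taken from the diff, and pytest plus dask[distributed] are assumed to be installed.

import os
import sys

import pytest

# Hypothetical runner, not shipped in the wheel: mirrors the DASK_TESTING=TRUE
# workflow described in the test docstrings below. The fixture reads this
# variable at runtime and creates/destroys the ephemeral LocalCluster itself.
os.environ["DASK_TESTING"] = "TRUE"

# "-m dask" selects the @pytest.mark.dask tests; adjust the path to wherever
# the installed test module lives in your environment.
sys.exit(pytest.main(["-v", "-m", "dask", "earthcatalog/tests/test_dask_integration.py"]))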
@@ -0,0 +1,580 @@
1
+ """Dask integration tests for STAC ingestion pipeline with multi-file input and passthrough hook."""
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from earthcatalog.ingestion_pipeline import ProcessingConfig
10
+ from earthcatalog.stac_hooks import PassthroughSTACHook
11
+
12
+ # =============================================================================
13
+ # Module-level fixtures (shared across all test classes)
14
+ # =============================================================================
15
+
16
+
17
+ @pytest.fixture
18
+ def dask_scheduler_address():
19
+ """Get Dask scheduler address from environment or create ephemeral cluster.
20
+
21
+ Priority:
22
+ 1. If DASK_TESTING=TRUE: Create ephemeral local cluster (auto-destroyed after tests)
23
+ 2. If DASK_SCHEDULER_ADDRESS set: Use that address
24
+ 3. Otherwise: Skip tests
25
+
26
+ To enable auto-created ephemeral cluster (recommended for development):
27
+ export DASK_TESTING=TRUE
28
+ pytest earthcatalog/tests/test_dask_integration.py -v
29
+
30
+ To use existing cluster:
31
+ export DASK_SCHEDULER_ADDRESS=localhost:8786
32
+ pytest earthcatalog/tests/test_dask_integration.py -v
33
+ """
34
+ # Check for auto-testing mode first
35
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
36
+ try:
37
+ from dask.distributed import LocalCluster
38
+ except ImportError:
39
+ pytest.skip("Dask distributed not installed: pip install 'dask[distributed]'")
40
+
41
+ # Create ephemeral cluster with minimal workers for testing
42
+ cluster = LocalCluster(
43
+ n_workers=2,
44
+ threads_per_worker=1,
45
+ processes=True,
46
+ silence_logs=False,
47
+ dashboard_address=None, # Disable dashboard for tests
48
+ )
49
+
50
+ # Get the scheduler address
51
+ scheduler_address = cluster.scheduler_address
52
+
53
+ print(f"\n[Ephemeral Dask cluster created at {scheduler_address}]")
54
+
55
+ # Yield the address and cleanup after tests
56
+ yield scheduler_address
57
+
58
+ # Cleanup: close cluster
59
+ print("\n[Destroying ephemeral Dask cluster...]")
60
+ cluster.close()
61
+ return
62
+
63
+ # Fall back to manual scheduler address configuration
64
+ scheduler = os.environ.get("DASK_SCHEDULER_ADDRESS")
65
+ if not scheduler:
66
+ pytest.skip(
67
+ "DASK_SCHEDULER_ADDRESS not set. "
68
+ "Set DASK_TESTING=TRUE to auto-create ephemeral cluster, "
69
+ "or set DASK_SCHEDULER_ADDRESS to connect to existing cluster."
70
+ )
71
+ yield scheduler
72
+
73
+
74
+ @pytest.fixture
75
+ def synthetic_bulk_data(tmp_path: Path):
76
+ """Create synthetic bulk data for Dask testing.
77
+
78
+ Creates 10 NDJSON files with 20 STAC items each (200 total items).
79
+ Files follow the ITS_LIVE pattern: {year}_{chunk_no}.ndjson
80
+ """
81
+ bulk_dir = tmp_path / "dask_bulk"
82
+ bulk_dir.mkdir()
83
+
84
+ # Create 10 files with 20 items each
85
+ for chunk in range(1, 11):
86
+ items = [
87
+ {
88
+ "type": "Feature",
89
+ "id": f"item_{chunk}_{i}",
90
+ "geometry": {
91
+ "type": "Point",
92
+ "coordinates": [i * 0.1, i * 0.1],
93
+ },
94
+ "properties": {
95
+ "datetime": f"2020-01-01T{i:02d}:00:00Z",
96
+ "dataset_id": f"test_dataset_{chunk}",
97
+ },
98
+ }
99
+ for i in range(20)
100
+ ]
101
+
102
+ filename = bulk_dir / f"2020_{chunk}.ndjson"
103
+ with filename.open("w") as f:
104
+ for item in items:
105
+ f.write(json.dumps(item) + "\n")
106
+
107
+ return bulk_dir
108
+
109
+
110
+ # =============================================================================
111
+ # Test Classes
112
+ # =============================================================================
113
+
114
+
115
+ @pytest.mark.dask
116
+ class TestDaskIntegration:
117
+ """Integration tests for Dask distributed processing.
118
+
119
+ These tests require Dask to be installed and can run in three modes:
120
+ 1. Auto-created ephemeral cluster: Set DASK_TESTING=TRUE (recommended for development)
121
+ 2. With an existing Dask cluster: Set DASK_SCHEDULER_ADDRESS environment variable
122
+ 3. Skip: If neither is set, tests are skipped
123
+
124
+ To run with auto-created ephemeral cluster (recommended):
125
+ export DASK_TESTING=TRUE
126
+ pytest earthcatalog/tests/test_dask_integration.py -v
127
+
128
+ To run with an existing cluster:
129
+ export DASK_SCHEDULER_ADDRESS=localhost:8786
130
+ pytest earthcatalog/tests/test_dask_integration.py -v
131
+ """
132
+
133
+ def test_dask_scheduler_address_env(self, dask_scheduler_address):
134
+ """Test that Dask scheduler address is configured."""
135
+ # Address can be:
136
+ # - "localhost:8786" (manual config)
137
+ # - "local" (manual config, now deprecated)
138
+ # - "tcp://127.0.0.1:XXXXX" (ephemeral cluster from DASK_TESTING=TRUE)
139
+ assert dask_scheduler_address in ["localhost:8786", "local"] or ":" in dask_scheduler_address
140
+
141
+ def test_processing_config_for_dask(self, synthetic_bulk_data: Path, dask_scheduler_address, tmp_path: Path):
142
+ """Test ProcessingConfig configuration for Dask with passthrough hook."""
143
+ config = ProcessingConfig(
144
+ input_file=str(synthetic_bulk_data),
145
+ output_catalog=str(tmp_path / "output"),
146
+ scratch_location=str(tmp_path / "scratch"),
147
+ input_pattern=str(synthetic_bulk_data / "*.ndjson"),
148
+ input_format="ndjson",
149
+ stac_hook="passthrough", # URLs are pre-fetched STAC JSON
150
+ grid_system="h3",
151
+ grid_resolution=2,
152
+ max_workers=4,
153
+ )
154
+
155
+ # Verify configuration
156
+ assert config.stac_hook == "passthrough"
157
+ assert config.input_pattern.endswith("*.ndjson")
158
+
159
+ # Validation should pass
160
+ try:
161
+ config.validate()
162
+ except (ValueError, FileNotFoundError) as e:
163
+ pytest.fail(f"Config validation failed: {e}")
164
+
165
+ def test_dask_processor_creation(self, dask_scheduler_address):
166
+ """Test DaskDistributedProcessor creation.
167
+
168
+ This test creates a Dask processor but doesn't run a full pipeline.
169
+ """
170
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
171
+
172
+ # For ephemeral clusters (DASK_TESTING mode), just verify connection
173
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
174
+ # Use the ephemeral cluster
175
+ processor = DaskDistributedProcessor(n_workers=2, scheduler_address=dask_scheduler_address)
176
+
177
+ # Verify processor was created
178
+ assert processor.n_workers == 2
179
+ assert processor.scheduler_address == dask_scheduler_address
180
+
181
+ # Close the processor
182
+ processor.close()
183
+ elif dask_scheduler_address == "local":
184
+ # Legacy local mode - create a local Dask cluster for testing
185
+ from dask.distributed import Client
186
+
187
+ with Client(n_workers=2, threads_per_worker=1, processes=True) as client:
188
+ scheduler_address = client.scheduler_address
189
+ processor = DaskDistributedProcessor(n_workers=2, scheduler_address=scheduler_address)
190
+
191
+ # Verify processor was created
192
+ assert processor.n_workers == 2
193
+ assert processor.scheduler_address == scheduler_address
194
+
195
+ # Close the processor
196
+ processor.close()
197
+ else:
198
+ # Connect to existing cluster
199
+ processor = DaskDistributedProcessor(n_workers=4, scheduler_address=dask_scheduler_address)
200
+
201
+ assert processor.n_workers == 4
202
+ assert processor.scheduler_address == dask_scheduler_address
203
+
204
+ processor.close()
205
+
206
+ def test_passthrough_hook_dask_compatibility(self):
207
+ """Test that PassthroughSTACHook is Dask-compatible (serializable)."""
208
+ from earthcatalog.stac_hooks import serialize_hook
209
+
210
+ hook = PassthroughSTACHook()
211
+
212
+ # Should serialize to a simple string
213
+ config_str = serialize_hook(hook)
214
+ assert config_str == "passthrough"
215
+
216
+ # Should be deserializable
217
+ from earthcatalog.stac_hooks import parse_hook_config
218
+
219
+ restored_hook = parse_hook_config(config_str)
220
+
221
+ assert isinstance(restored_hook, PassthroughSTACHook)
222
+
223
+ def test_passthrough_with_batch_processing(self):
224
+ """Test that passthrough hook works with batch processing."""
225
+ hook = PassthroughSTACHook()
226
+
227
+ # Create batch of STAC items
228
+ batch = [
229
+ json.dumps(
230
+ {
231
+ "type": "Feature",
232
+ "id": f"batch_item_{i}",
233
+ "geometry": {"type": "Point", "coordinates": [0, 0]},
234
+ "properties": {"datetime": "2024-01-01T00:00:00Z"},
235
+ }
236
+ )
237
+ for i in range(100)
238
+ ]
239
+
240
+ results = hook.fetch_batch(batch)
241
+
242
+ # All items should be processed
243
+ assert len(results) == 100
244
+ assert all(r["id"] == f"batch_item_{i}" for i, r in enumerate(results) if r)
245
+
246
+ @pytest.mark.skipif(
247
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
248
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
249
+ )
250
+ def test_full_pipeline_config_dask(self, synthetic_bulk_data: Path, dask_scheduler_address, tmp_path: Path):
251
+ """Test full pipeline configuration for Dask processing.
252
+
253
+ This test verifies the configuration is ready for Dask processing
254
+ but doesn't run the actual pipeline (which would require a full cluster).
255
+ """
256
+ config = ProcessingConfig(
257
+ input_file=str(synthetic_bulk_data),
258
+ output_catalog=str(tmp_path / "catalog"),
259
+ scratch_location=str(tmp_path / "scratch"),
260
+ input_pattern=str(synthetic_bulk_data / "2020_*.ndjson"),
261
+ input_format="ndjson",
262
+ url_column="url", # Will be ignored by passthrough hook
263
+ stac_hook="passthrough",
264
+ grid_system="h3",
265
+ grid_resolution=2,
266
+ temporal_bin="month",
267
+ enable_concurrent_http=False, # Passthrough doesn't need HTTP
268
+ max_workers=4,
269
+ )
270
+
271
+ # Verify all key settings
272
+ assert config.stac_hook == "passthrough"
273
+ assert config.input_format == "ndjson"
274
+ assert not config.enable_concurrent_http # Should be disabled
275
+ assert config.grid_system == "h3"
276
+ assert config.grid_resolution == 2
277
+
278
+
279
+ class TestDaskClusterInfo:
280
+ """Information about Dask cluster integration."""
281
+
282
+ @pytest.mark.skipif(
283
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
284
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
285
+ )
286
+ def test_dask_cluster_info(self, dask_scheduler_address):
287
+ """Display information about Dask cluster connection.
288
+
289
+ This test provides helpful information about connecting to Dask clusters.
290
+ """
291
+ # Skip for ephemeral clusters (DASK_TESTING mode)
292
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
293
+ pytest.skip("Ephemeral cluster - no external cluster info needed")
294
+
295
+ print("\n=== Dask Cluster Information ===")
296
+ print(f"Scheduler Address: {dask_scheduler_address}")
297
+ print("\nTo start a Dask cluster:")
298
+ print(" dask scheduler")
299
+ print(" # Or with specific options:")
300
+ print(" dask scheduler --port 8786")
301
+ print("\nTo verify cluster is running:")
302
+ print(" dask info")
303
+ print("\nTo check workers:")
304
+ print(" dask worker tcp://<scheduler-ip>:8786")
305
+ print("====================================\n")
306
+
307
+ @pytest.mark.skipif(True, reason="Replaced by DASK_TESTING mode")
308
+ def test_local_cluster_info(self):
309
+ """Display information about local Dask cluster setup.
310
+
311
+ This test is deprecated - use DASK_TESTING=TRUE instead.
312
+ """
313
+ print("\n=== Local Dask Cluster Information ===")
314
+ print("To run Dask integration tests with auto-created cluster:")
315
+ print(" export DASK_TESTING=TRUE")
316
+ print(" pytest earthcatalog/tests/test_dask_integration.py -v")
317
+ print("\nThe test will automatically create and destroy a local Dask cluster.")
318
+ print("=============================================\n")
319
+
320
+
321
+ class TestDaskCompatibility:
322
+ """Tests for Dask compatibility of components."""
323
+
324
+ def test_passthrough_hook_is_dask_compatible(self):
325
+ """Verify PassthroughSTACHook is serializable for Dask."""
326
+ from earthcatalog.stac_hooks import parse_hook_config, serialize_hook
327
+
328
+ hook = PassthroughSTACHook()
329
+
330
+ # Must serialize to a string for Dask transmission
331
+ config_str = serialize_hook(hook)
332
+ assert isinstance(config_str, str)
333
+
334
+ # Must be deserializable on workers
335
+ restored = parse_hook_config(config_str)
336
+ assert isinstance(restored, PassthroughSTACHook)
337
+
338
+ def test_processing_config_serialization_for_dask(self):
339
+ """Test that ProcessingConfig serializes correctly for Dask."""
340
+ config = ProcessingConfig(
341
+ input_file="./data",
342
+ output_catalog="./catalog",
343
+ scratch_location="./scratch",
344
+ input_pattern="./data/*.ndjson",
345
+ stac_hook="passthrough",
346
+ )
347
+
348
+ # Serialize config for Dask worker transmission
349
+ config_dict = config.to_dict()
350
+
351
+ # Verify passthrough is in the dict
352
+ assert "input_pattern" in config_dict
353
+ assert "stac_hook" in config_dict
354
+ assert config_dict["stac_hook"] == "passthrough"
355
+
356
+ # Verify deserialization works
357
+ restored = ProcessingConfig.from_dict(config_dict)
358
+ assert restored.stac_hook == "passthrough"
359
+ assert restored.input_pattern == "./data/*.ndjson"
360
+
361
+
362
+ @pytest.mark.dask
363
+ @pytest.mark.skipif(
364
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
365
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
366
+ )
367
+ class TestDaskPipelineIntegration:
368
+ """End-to-end Dask pipeline integration tests.
369
+
370
+ These tests require a running Dask cluster and synthetic data.
371
+ """
372
+
373
+ def test_dask_scheduler_connection(self, dask_scheduler_address):
374
+ """Test connection to Dask scheduler."""
375
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
376
+
377
+ # Skip for ephemeral clusters (they're created/destroyed per test class)
378
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
379
+ pytest.skip("Ephemeral cluster - connection test not applicable")
380
+
381
+ try:
382
+ processor = DaskDistributedProcessor(
383
+ n_workers=2,
384
+ scheduler_address=dask_scheduler_address,
385
+ )
386
+
387
+ # If we get here, connection succeeded
388
+ assert processor.scheduler_address == dask_scheduler_address
389
+
390
+ processor.close()
391
+ except (OSError, ConnectionError, ValueError) as e:
392
+ pytest.fail(f"Failed to connect to Dask scheduler: {e}")
393
+
394
+ def test_synthetic_data_for_dask(self, synthetic_bulk_data: Path):
395
+ """Verify synthetic data was created correctly."""
396
+ import glob as glob_module
397
+
398
+ pattern = str(synthetic_bulk_data / "*.ndjson")
399
+ files = glob_module.glob(pattern)
400
+
401
+ assert len(files) == 10
402
+
403
+ # Verify each file has 20 lines
404
+ for file_path in files:
405
+ with open(file_path) as f:
406
+ lines = f.readlines()
407
+ assert len(lines) == 20
408
+ # Each line should be valid JSON
409
+ for line in lines:
410
+ item = json.loads(line)
411
+ assert item["type"] == "Feature"
412
+
413
+
414
+ class TestDaskSchedulerAddress:
415
+ """Tests for Dask scheduler address configuration."""
416
+
417
+ def test_config_with_scheduler_address(self):
418
+ """Test ProcessingConfig with dask_scheduler_address."""
419
+ config = ProcessingConfig(
420
+ input_file="./test_input.parquet",
421
+ output_catalog="./test_output",
422
+ scratch_location="./test_scratch",
423
+ dask_scheduler_address="tcp://localhost:8786",
424
+ )
425
+
426
+ assert config.dask_scheduler_address == "tcp://localhost:8786"
427
+
428
+ def test_config_without_scheduler_address(self):
429
+ """Test ProcessingConfig without dask_scheduler_address defaults to empty string."""
430
+ config = ProcessingConfig(
431
+ input_file="./test_input.parquet",
432
+ output_catalog="./test_output",
433
+ scratch_location="./test_scratch",
434
+ )
435
+
436
+ assert config.dask_scheduler_address == ""
437
+
438
+ def test_config_to_dict_includes_scheduler_address(self):
439
+ """Test that dask_scheduler_address is included in to_dict()."""
440
+ config = ProcessingConfig(
441
+ input_file="./test_input.parquet",
442
+ output_catalog="./test_output",
443
+ scratch_location="./test_scratch",
444
+ dask_scheduler_address="tcp://scheduler:8786",
445
+ )
446
+
447
+ config_dict = config.to_dict()
448
+
449
+ assert "dask_scheduler_address" in config_dict
450
+ assert config_dict["dask_scheduler_address"] == "tcp://scheduler:8786"
451
+
452
+ def test_config_from_dict_with_scheduler_address(self):
453
+ """Test that from_dict() restores dask_scheduler_address."""
454
+ config_dict = {
455
+ "input_file": "./test_input.parquet",
456
+ "output_catalog": "./test_output",
457
+ "scratch_location": "./test_scratch",
458
+ "dask_scheduler_address": "tcp://remote:8786",
459
+ }
460
+
461
+ config = ProcessingConfig.from_dict(config_dict)
462
+
463
+ assert config.dask_scheduler_address == "tcp://remote:8786"
464
+
465
+ def test_config_from_dict_without_scheduler_address(self):
466
+ """Test backward compatibility when dask_scheduler_address is missing."""
467
+ config_dict = {
468
+ "input_file": "./test_input.parquet",
469
+ "output_catalog": "./test_output",
470
+ "scratch_location": "./test_scratch",
471
+ }
472
+
473
+ config = ProcessingConfig.from_dict(config_dict)
474
+
475
+ assert config.dask_scheduler_address == ""
476
+
477
+ def test_validation_warns_remote_scheduler_local_scratch(self, caplog):
478
+ """Test that validation warns when using remote scheduler with local scratch."""
479
+ import logging
480
+
481
+ config = ProcessingConfig(
482
+ input_file="s3://bucket/input.parquet",
483
+ output_catalog="s3://bucket/output",
484
+ scratch_location="./local_scratch", # Local path with remote scheduler
485
+ dask_scheduler_address="tcp://remote:8786",
486
+ )
487
+
488
+ with caplog.at_level(logging.WARNING):
489
+ config.validate()
490
+
491
+ # Should warn about local scratch with remote scheduler
492
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
493
+ assert any("local storage paths" in msg and "scratch_location" in msg for msg in warning_messages)
494
+
495
+ def test_validation_warns_remote_scheduler_local_output(self, caplog):
496
+ """Test that validation warns when using remote scheduler with local output."""
497
+ import logging
498
+
499
+ config = ProcessingConfig(
500
+ input_file="s3://bucket/input.parquet",
501
+ output_catalog="./local_output", # Local path with remote scheduler
502
+ scratch_location="s3://bucket/scratch",
503
+ dask_scheduler_address="tcp://remote:8786",
504
+ )
505
+
506
+ with caplog.at_level(logging.WARNING):
507
+ config.validate()
508
+
509
+ # Should warn about local output with remote scheduler
510
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
511
+ assert any("local storage paths" in msg and "output_catalog" in msg for msg in warning_messages)
512
+
513
+ def test_validation_no_warning_cloud_storage(self, caplog):
514
+ """Test that no warning is issued when using cloud storage with remote scheduler."""
515
+ import logging
516
+
517
+ config = ProcessingConfig(
518
+ input_file="s3://bucket/input.parquet",
519
+ output_catalog="s3://bucket/output",
520
+ scratch_location="s3://bucket/scratch",
521
+ dask_scheduler_address="tcp://remote:8786",
522
+ )
523
+
524
+ with caplog.at_level(logging.WARNING):
525
+ config.validate()
526
+
527
+ # Should not warn about cloud storage
528
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
529
+ assert not any("local storage paths" in msg for msg in warning_messages)
530
+
531
+ def test_validation_no_warning_local_scheduler(self, caplog):
532
+ """Test that no warning is issued when using local scheduler (empty string)."""
533
+ import logging
534
+
535
+ config = ProcessingConfig(
536
+ input_file="./input.parquet",
537
+ input_pattern="*.ndjson", # Use pattern to bypass file existence check
538
+ output_catalog="./output",
539
+ scratch_location="./scratch",
540
+ dask_scheduler_address="", # Empty = local cluster
541
+ )
542
+
543
+ with caplog.at_level(logging.WARNING):
544
+ config.validate()
545
+
546
+ # Should not warn when scheduler_address is empty (local cluster)
547
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
548
+ assert not any("local storage paths" in msg for msg in warning_messages)
549
+
550
+ @pytest.mark.skipif(
551
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
552
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
553
+ )
554
+ def test_processor_repr_with_scheduler_address(self, dask_scheduler_address):
555
+ """Test DaskDistributedProcessor repr with scheduler address."""
556
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
557
+
558
+ processor = DaskDistributedProcessor(scheduler_address=dask_scheduler_address)
559
+
560
+ repr_str = repr(processor)
561
+ assert "scheduler_address" in repr_str
562
+ assert dask_scheduler_address in repr_str
563
+
564
+ processor.close()
565
+
566
+ @pytest.mark.skipif(
567
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
568
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
569
+ )
570
+ def test_processor_repr_without_scheduler_address(self):
571
+ """Test DaskDistributedProcessor repr without scheduler address."""
572
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
573
+
574
+ processor = DaskDistributedProcessor(n_workers=4)
575
+
576
+ repr_str = repr(processor)
577
+ assert "n_workers" in repr_str
578
+ assert "4" in repr_str
579
+
580
+ processor.close()
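The serialization round trips asserted above are the core of the Dask compatibility story: hooks travel to workers as plain strings and ProcessingConfig travels as a plain dict. Below is a minimal sketch of that round trip outside pytest, using only names that appear in the diff (PassthroughSTACHook, serialize_hook, parse_hook_config, ProcessingConfig) and assuming earthcatalog 0.2.0 is importable; it is illustrative only, not part of the package.

from earthcatalog.ingestion_pipeline import ProcessingConfig
from earthcatalog.stac_hooks import PassthroughSTACHook, parse_hook_config, serialize_hook

# Hooks must reduce to a plain string before Dask ships them to workers.
hook = PassthroughSTACHook()
hook_config = serialize_hook(hook)            # expected to be "passthrough"
restored_hook = parse_hook_config(hook_config)
assert isinstance(restored_hook, PassthroughSTACHook)

# ProcessingConfig round-trips through a plain dict for the same reason.
config = ProcessingConfig(
    input_file="./data",
    output_catalog="./catalog",
    scratch_location="./scratch",
    input_pattern="./data/*.ndjson",
    stac_hook="passthrough",
)
restored_config = ProcessingConfig.from_dict(config.to_dict())
assert restored_config.stac_hook == "passthrough"
assert restored_config.input_pattern == "./data/*.ndjson"

If either round trip fails, the pipeline cannot hand work to Dask workers, which is what the TestDaskCompatibility tests above guard against.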