earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
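The first diff below (earthcatalog/tests/test_cli_and_storage.py) exercises a CLI --dry-run mode and configurable S3 timeouts on the storage backends. For orientation, here is a minimal caller-side sketch of the backend constructors as those tests exercise them; the keyword names (base_path, connect_timeout, read_timeout, retries) are taken from the tests themselves, while the bucket, prefix, and local path are illustrative placeholders, not values from the package:

    from earthcatalog.storage_backends import LocalStorage, S3Storage

    # Local backend: a base path is all the tests require (interface checked in test_local_storage_interface).
    local = LocalStorage("/tmp/earthcatalog_scratch")

    # S3 backend with explicit timeouts, mirroring test_s3_storage_accepts_timeout_params.
    # The defaults used when these keywords are omitted are not shown here;
    # test_s3_storage_default_timeouts only asserts that some botocore Config is passed through.
    s3 = S3Storage(
        base_path="s3://my-bucket/catalog/",  # placeholder bucket/prefix
        connect_timeout=15.0,  # seconds to establish a connection
        read_timeout=30.0,     # seconds to wait for a response
        retries=2,             # per the tests, mapped to {"max_attempts": 2, "mode": "adaptive"}
    )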
earthcatalog/tests/test_cli_and_storage.py
@@ -0,0 +1,230 @@
+ """
+ Tests for CLI features and storage backend enhancements.
+
+ These tests validate:
+ - CLI --dry-run option (f1)
+ - S3 timeout configuration (h2)
+ """
+
+ import importlib.util
+ import subprocess
+ import sys
+ import tempfile
+ from pathlib import Path
+ from unittest.mock import patch
+
+ import pandas as pd
+ import pytest
+
+
+ class TestCLIDryRun:
+     """Test CLI --dry-run feature (f1)."""
+
+     def setup_method(self):
+         """Create temporary input file for testing."""
+         self.temp_dir = tempfile.mkdtemp()
+         self.input_file = Path(self.temp_dir) / "test_input.parquet"
+
+         # Create a simple test parquet file
+         df = pd.DataFrame({"url": ["https://example.com/item1.json", "https://example.com/item2.json"]})
+         df.to_parquet(self.input_file)
+
+     def teardown_method(self):
+         """Clean up temporary files."""
+         import shutil
+
+         shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+     def test_dry_run_exits_with_zero(self):
+         """Test that --dry-run exits with code 0."""
+         result = subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 "/tmp/test_output",
+                 "--scratch",
+                 "/tmp/test_scratch",
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         assert result.returncode == 0, f"Expected exit code 0, got {result.returncode}. stderr: {result.stderr}"
+
+     def test_dry_run_shows_configuration(self):
+         """Test that --dry-run displays configuration info."""
+         result = subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 "/tmp/test_output",
+                 "--scratch",
+                 "/tmp/test_scratch",
+                 "--grid",
+                 "h3",
+                 "--grid-resolution",
+                 "4",
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         # Check that configuration is displayed
+         output = result.stdout + result.stderr
+         assert "DRY RUN" in output
+         assert "h3" in output.lower() or "H3" in output
+         assert "validated" in output.lower() or "ready" in output.lower()
+
+     def test_dry_run_does_not_create_output(self):
+         """Test that --dry-run doesn't create any output files."""
+         output_dir = Path(self.temp_dir) / "output"
+         scratch_dir = Path(self.temp_dir) / "scratch"
+
+         subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 str(output_dir),
+                 "--scratch",
+                 str(scratch_dir),
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         # Output directories should not be created
+         assert not output_dir.exists()
+         assert not scratch_dir.exists()
+
+     def test_dry_run_validates_invalid_config(self):
+         """Test that --dry-run still validates configuration."""
+         # Test with invalid grid type
+         result = subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 "/tmp/test_output",
+                 "--scratch",
+                 "/tmp/test_scratch",
+                 "--grid",
+                 "invalid_grid",
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         # Should fail due to invalid grid
+         assert result.returncode != 0
+
+
+ class TestS3TimeoutConfiguration:
+     """Test S3 storage backend timeout configuration (h2)."""
+
+     @pytest.mark.skipif(importlib.util.find_spec("s3fs") is None, reason="s3fs library required")
+     def test_s3_storage_accepts_timeout_params(self):
+         """Test that S3Storage accepts custom timeout parameters."""
+         with patch("s3fs.S3FileSystem") as mock_fs:
+             from earthcatalog.storage_backends import S3Storage
+
+             S3Storage(
+                 base_path="s3://test-bucket/prefix/",
+                 connect_timeout=45.0,
+                 read_timeout=90.0,
+                 retries=5,
+             )
+
+             # Verify S3FileSystem was called with config
+             mock_fs.assert_called_once()
+             call_kwargs = mock_fs.call_args[1]
+             assert "config_kwargs" in call_kwargs
+
+     @pytest.mark.skipif(importlib.util.find_spec("s3fs") is None, reason="s3fs library required")
+     def test_s3_storage_default_timeouts(self):
+         """Test that S3Storage has sensible default timeout values."""
+         with patch("s3fs.S3FileSystem") as mock_fs:
+             from earthcatalog.storage_backends import S3Storage
+
+             # Create with defaults
+             S3Storage(base_path="s3://test-bucket/prefix/")
+
+             # Verify defaults were applied
+             mock_fs.assert_called_once()
+             call_kwargs = mock_fs.call_args[1]
+             assert "config_kwargs" in call_kwargs
+
+             # Extract config from call
+             config_kwargs = call_kwargs["config_kwargs"]
+             assert "config" in config_kwargs
+
+     def test_storage_backend_direct_instantiation(self):
+         """Test direct instantiation of storage backends."""
+         from earthcatalog.storage_backends import LocalStorage
+
+         # Test local storage can be instantiated
+         local_storage = LocalStorage("/tmp/test")
+         assert local_storage is not None
+         assert hasattr(local_storage, "exists")
+         assert hasattr(local_storage, "open")
+
+     def test_local_storage_interface(self):
+         """Test LocalStorage interface for comparison."""
+         from earthcatalog.storage_backends import LocalStorage
+
+         storage = LocalStorage("/tmp/test_base")
+         assert hasattr(storage, "exists")
+         assert hasattr(storage, "open")
+         assert hasattr(storage, "makedirs")
+         assert hasattr(storage, "remove")
+         assert hasattr(storage, "rename")
+
+
+ class TestS3StorageWithMockedBotocore:
+     """Test S3 storage with mocked botocore Config."""
+
+     @pytest.mark.skipif(importlib.util.find_spec("s3fs") is None, reason="s3fs library required")
+     def test_botocore_config_applied(self):
+         """Test that botocore Config is properly created with timeouts."""
+         with patch("s3fs.S3FileSystem"):
+             with patch("botocore.config.Config") as mock_config:
+                 from earthcatalog.storage_backends import S3Storage
+
+                 S3Storage(
+                     base_path="s3://bucket/",
+                     connect_timeout=15.0,
+                     read_timeout=30.0,
+                     retries=2,
+                 )
+
+                 # Verify Config was called with correct parameters
+                 mock_config.assert_called_once_with(
+                     connect_timeout=15.0,
+                     read_timeout=30.0,
+                     retries={"max_attempts": 2, "mode": "adaptive"},
+                 )
+
+
+ if __name__ == "__main__":
+     pytest.main([__file__, "-v"])
earthcatalog/tests/test_config.py
@@ -0,0 +1,245 @@
+ # test_config.py
+ """Tests for configuration file loader module."""
+
+ import os
+ import tempfile
+ from pathlib import Path
+
+ import pytest
+ import yaml
+
+ from earthcatalog.config import load_config, merge_cli_overrides, save_config
+
+
+ class TestLoadConfig:
+     """Tests for load_config function."""
+
+     def test_load_explicit_path(self):
+         """Test loading config from explicit path."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             yaml.dump({"grid_resolution": 5, "batch_threshold": 5000}, f)
+             temp_path = f.name
+
+         try:
+             config = load_config(temp_path)
+             assert config["grid_resolution"] == 5
+             assert config["batch_threshold"] == 5000
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_missing_explicit_path_raises(self):
+         """Test that missing explicit path raises FileNotFoundError."""
+         with pytest.raises(FileNotFoundError):
+             load_config("/nonexistent/path/config.yaml")
+
+     def test_load_default_path_if_exists(self, tmp_path, monkeypatch):
+         """Test loading from ./earthcatalog.yaml if it exists."""
+         config_content = {"grid_system": "s2", "concurrent_requests": 100}
+
+         # Create config file in temp directory
+         config_file = tmp_path / "earthcatalog.yaml"
+         with open(config_file, "w") as f:
+             yaml.dump(config_content, f)
+
+         # Change to temp directory
+         monkeypatch.chdir(tmp_path)
+
+         config = load_config()
+         assert config["grid_system"] == "s2"
+         assert config["concurrent_requests"] == 100
+
+     def test_load_returns_empty_when_no_config(self, tmp_path, monkeypatch):
+         """Test that empty dict is returned when no config file exists."""
+         # Change to empty temp directory
+         monkeypatch.chdir(tmp_path)
+
+         config = load_config()
+         assert config == {}
+
+     def test_load_empty_yaml_file(self):
+         """Test loading an empty YAML file returns empty dict."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             f.write("")  # Empty file
+             temp_path = f.name
+
+         try:
+             config = load_config(temp_path)
+             assert config == {}
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_invalid_yaml_raises(self):
+         """Test that invalid YAML raises an error."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             f.write("invalid: yaml: content: [")
+             temp_path = f.name
+
+         try:
+             with pytest.raises(yaml.YAMLError):
+                 load_config(temp_path)
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_non_mapping_yaml_raises(self):
+         """Test that non-mapping YAML content raises ValueError."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             f.write("- item1\n- item2\n")  # List instead of mapping
+             temp_path = f.name
+
+         try:
+             with pytest.raises(ValueError, match="YAML mapping"):
+                 load_config(temp_path)
+         finally:
+             os.unlink(temp_path)
+
+
+ class TestMergeCliOverrides:
+     """Tests for merge_cli_overrides function."""
+
+     def test_cli_overrides_config(self):
+         """Test that CLI args override config file values."""
+         file_config = {"grid_resolution": 2, "batch_threshold": 10000}
+         cli_args = {"grid_resolution": 5}  # Override
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         assert result["grid_resolution"] == 5  # Overridden
+         assert result["batch_threshold"] == 10000  # Preserved
+
+     def test_none_cli_args_ignored(self):
+         """Test that None CLI args don't override config values."""
+         file_config = {"grid_resolution": 2, "batch_threshold": 10000}
+         cli_args = {"grid_resolution": None, "batch_threshold": None}
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         # Original values preserved because CLI args are None
+         assert result["grid_resolution"] == 2
+         assert result["batch_threshold"] == 10000
+
+     def test_cli_adds_new_keys(self):
+         """Test that CLI can add keys not in config file."""
+         file_config = {"grid_resolution": 2}
+         cli_args = {"distributed": True, "stac_hook": "module:pkg:func"}
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         assert result["grid_resolution"] == 2
+         assert result["distributed"] is True
+         assert result["stac_hook"] == "module:pkg:func"
+
+     def test_empty_config_uses_cli(self):
+         """Test that empty config uses all CLI values."""
+         file_config = {}
+         cli_args = {
+             "input_file": "data.parquet",
+             "grid_resolution": 5,
+             "batch_threshold": 5000,
+         }
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         assert result["input_file"] == "data.parquet"
+         assert result["grid_resolution"] == 5
+         assert result["batch_threshold"] == 5000
+
+
+ class TestSaveConfig:
+     """Tests for save_config function."""
+
+     def test_save_config_creates_file(self):
+         """Test that save_config creates a YAML file."""
+         config = {"grid_resolution": 5, "batch_threshold": 7500}
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             config_path = Path(tmp_dir) / "test_config.yaml"
+             save_config(config, config_path)
+
+             assert config_path.exists()
+
+             # Verify content
+             with open(config_path) as f:
+                 loaded = yaml.safe_load(f)
+
+             assert loaded["grid_resolution"] == 5
+             assert loaded["batch_threshold"] == 7500
+
+     def test_save_config_overwrites_existing(self):
+         """Test that save_config overwrites existing file."""
+         original = {"value": "original"}
+         updated = {"value": "updated"}
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             config_path = Path(tmp_dir) / "config.yaml"
+
+             # Save original
+             save_config(original, config_path)
+
+             # Overwrite with updated
+             save_config(updated, config_path)
+
+             with open(config_path) as f:
+                 loaded = yaml.safe_load(f)
+
+             assert loaded["value"] == "updated"
+
+
+ class TestConfigIntegration:
+     """Integration tests for config loading with ProcessingConfig."""
+
+     def test_full_config_workflow(self):
+         """Test complete workflow: load config, merge CLI, create ProcessingConfig."""
+         from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+         # Create a config file
+         file_config = {
+             "grid_resolution": 4,
+             "batch_threshold": 8000,
+             "concurrent_requests": 75,
+         }
+
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             yaml.dump(file_config, f)
+             config_path = f.name
+
+         # Create a dummy input file
+         with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as pq:
+             import pandas as pd
+
+             pd.DataFrame({"url": ["url1"]}).to_parquet(pq.name)
+             input_path = pq.name
+
+         try:
+             # Load config file
+             loaded = load_config(config_path)
+
+             # Simulate CLI args (some override, some None)
+             cli_args = {
+                 "input_file": input_path,
+                 "output_catalog": "/tmp/catalog",
+                 "scratch_location": "/tmp/scratch",
+                 "grid_resolution": None,  # Use config value
+                 "batch_threshold": 5000,  # Override
+             }
+
+             # Merge
+             merged = merge_cli_overrides(loaded, cli_args)
+
+             # Create ProcessingConfig
+             config = ProcessingConfig(
+                 input_file=merged["input_file"],
+                 output_catalog=merged["output_catalog"],
+                 scratch_location=merged["scratch_location"],
+                 grid_resolution=merged.get("grid_resolution", 2),
+                 batch_threshold=merged.get("batch_threshold", 10000),
+                 concurrent_requests=merged.get("concurrent_requests", 50),
+             )
+
+             # Verify merged values
+             assert config.grid_resolution == 4  # From file
+             assert config.batch_threshold == 5000  # From CLI (override)
+             assert config.concurrent_requests == 75  # From file
+
+         finally:
+             os.unlink(config_path)
+             os.unlink(input_path)
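For reference, the configuration round trip that TestConfigIntegration drives can be sketched directly with the three functions imported above. The semantics shown here (CLI values of None fall back to the file, non-None CLI values win) are exactly what the merge tests assert; the file name and numeric values are illustrative, not package defaults:

    from earthcatalog.config import load_config, merge_cli_overrides, save_config

    # Write a config file with the keys exercised by the tests above.
    save_config(
        {"grid_resolution": 4, "batch_threshold": 8000, "concurrent_requests": 75},
        "earthcatalog.yaml",
    )

    file_config = load_config("earthcatalog.yaml")                 # dict of file values
    cli_args = {"grid_resolution": None, "batch_threshold": 5000}  # None means "not given on the CLI"
    merged = merge_cli_overrides(file_config, cli_args)

    assert merged["grid_resolution"] == 4       # kept from the file (CLI value was None)
    assert merged["batch_threshold"] == 5000    # overridden by the CLI
    assert merged["concurrent_requests"] == 75  # untouched file value carried through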