PyPI - earthcatalog - Versions diffs - 0.2.0__py3-none-any.whl - Mend

earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

earthcatalog/__init__.py +164 -0
earthcatalog/async_http_client.py +1006 -0
earthcatalog/config.py +97 -0
earthcatalog/engines/__init__.py +308 -0
earthcatalog/engines/rustac_engine.py +142 -0
earthcatalog/engines/stac_geoparquet_engine.py +126 -0
earthcatalog/exceptions.py +471 -0
earthcatalog/grid_systems.py +1114 -0
earthcatalog/ingestion_pipeline.py +2281 -0
earthcatalog/input_readers.py +603 -0
earthcatalog/job_tracking.py +485 -0
earthcatalog/pipeline.py +606 -0
earthcatalog/schema_generator.py +911 -0
earthcatalog/spatial_resolver.py +1207 -0
earthcatalog/stac_hooks.py +754 -0
earthcatalog/statistics.py +677 -0
earthcatalog/storage_backends.py +548 -0
earthcatalog/tests/__init__.py +1 -0
earthcatalog/tests/conftest.py +76 -0
earthcatalog/tests/test_all_grids.py +793 -0
earthcatalog/tests/test_async_http.py +700 -0
earthcatalog/tests/test_cli_and_storage.py +230 -0
earthcatalog/tests/test_config.py +245 -0
earthcatalog/tests/test_dask_integration.py +580 -0
earthcatalog/tests/test_e2e_synthetic.py +1624 -0
earthcatalog/tests/test_engines.py +272 -0
earthcatalog/tests/test_exceptions.py +346 -0
earthcatalog/tests/test_file_structure.py +245 -0
earthcatalog/tests/test_input_readers.py +666 -0
earthcatalog/tests/test_integration.py +200 -0
earthcatalog/tests/test_integration_async.py +283 -0
earthcatalog/tests/test_job_tracking.py +603 -0
earthcatalog/tests/test_multi_file_input.py +336 -0
earthcatalog/tests/test_passthrough_hook.py +196 -0
earthcatalog/tests/test_pipeline.py +684 -0
earthcatalog/tests/test_pipeline_components.py +665 -0
earthcatalog/tests/test_schema_generator.py +506 -0
earthcatalog/tests/test_spatial_resolver.py +413 -0
earthcatalog/tests/test_stac_hooks.py +776 -0
earthcatalog/tests/test_statistics.py +477 -0
earthcatalog/tests/test_storage_backends.py +236 -0
earthcatalog/tests/test_validation.py +435 -0
earthcatalog/tests/test_workers.py +653 -0
earthcatalog/validation.py +921 -0
earthcatalog/workers.py +682 -0
earthcatalog-0.2.0.dist-info/METADATA +333 -0
earthcatalog-0.2.0.dist-info/RECORD +50 -0
earthcatalog-0.2.0.dist-info/WHEEL +5 -0
earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
earthcatalog-0.2.0.dist-info/top_level.txt +1 -0

earthcatalog/tests/test_schema_generator.py ADDED Viewed

@@ -0,0 +1,506 @@
+"""Tests for the schema_generator module."""
+import json
+from datetime import datetime
+from typing import Any
+from unittest.mock import MagicMock, Mock, create_autospec
+import pytest
+from earthcatalog import grid_systems
+from earthcatalog.schema_generator import SchemaGenerator
+from earthcatalog.statistics import IngestionStatistics
+class MockProcessingConfig:
+    """Mock ProcessingConfig for testing."""
+    def __init__(
+        self,
+        grid_system: str = "h3",
+        grid_resolution: int = 6,
+        temporal_bin: str = "month",
+        enable_global_partitioning: bool = True,
+        global_partition_threshold: int = 100,
+        output_catalog: str = "./test_catalog",
+        input_file: str = "./test_input.parquet",
+        sort_key: str = "datetime",
+        sort_ascending: bool = True,
+        items_per_shard: int = 10000,
+        max_workers: int = 4,
+        output_format: str = "geoparquet",
+        mission_field: str = "dataset_id",
+        geojson_path: str | None = None,
+    ):
+        self.grid_system = grid_system
+        self.grid_resolution = grid_resolution
+        self.temporal_bin = temporal_bin
+        self.enable_global_partitioning = enable_global_partitioning
+        self.global_partition_threshold = global_partition_threshold
+        self.output_catalog = output_catalog
+        self.input_file = input_file
+        self.sort_key = sort_key
+        self.sort_ascending = sort_ascending
+        self.items_per_shard = items_per_shard
+        self.max_workers = max_workers
+        self.output_format = output_format
+        self.mission_field = mission_field
+        self.geojson_path = geojson_path
+def create_mock_grid(grid_type: str = "h3", resolution: int = 6) -> Any:
+    """Create a mock GridSystem that passes type checking."""
+    mock = create_autospec(grid_systems.GridSystem, instance=True)
+    mock.grid_type = grid_type
+    mock.resolution = resolution
+    mock.tiles_for_geometry.return_value = ["tile_001", "tile_002"]
+    return mock
+class MockStorage:
+    """Mock StorageBackend for testing."""
+    def __init__(self):
+        self.written_files = {}
+    def makedirs(self, path):
+        pass
+    def open(self, path, mode):
+        mock_file = MagicMock()
+        mock_file.__enter__ = Mock(return_value=mock_file)
+        mock_file.__exit__ = Mock(return_value=False)
+        mock_file.write = Mock(side_effect=lambda data: self.written_files.update({path: data}))
+        return mock_file
+    def exists(self, path):
+        return path in self.written_files
+class TestSchemaGenerator:
+    """Tests for SchemaGenerator class initialization and basic operations."""
+    @pytest.fixture
+    def mock_config(self):
+        """Create a mock ProcessingConfig."""
+        return MockProcessingConfig()
+    @pytest.fixture
+    def mock_grid(self):
+        """Create a mock grid system."""
+        return create_mock_grid()
+    @pytest.fixture
+    def mock_storage(self):
+        """Create a mock storage backend."""
+        return MockStorage()
+    @pytest.fixture
+    def mock_stats(self):
+        """Create mock IngestionStatistics with sample data."""
+        stats = IngestionStatistics()
+        stats.stored_references = 1000
+        stats.spanning_items_count = 50
+        stats.items_routed_to_global = 10
+        for i in range(100):
+            stats.unique_ids.add(f"item_{i}")
+        return stats
+    @pytest.fixture
+    def generator(self, mock_config, mock_grid, mock_storage):
+        """Create a SchemaGenerator instance."""
+        return SchemaGenerator(mock_config, mock_grid, mock_storage)
+    @pytest.fixture
+    def generator_with_stats(self, mock_config, mock_grid, mock_storage, mock_stats):
+        """Create a SchemaGenerator instance with stats."""
+        return SchemaGenerator(mock_config, mock_grid, mock_storage, mock_stats)
+    def test_initialization(self, mock_config, mock_grid, mock_storage):
+        """Test SchemaGenerator initializes correctly."""
+        generator = SchemaGenerator(mock_config, mock_grid, mock_storage)
+        assert generator.config == mock_config
+        assert generator.grid == mock_grid
+        assert generator.storage == mock_storage
+        assert generator.stats is None
+    def test_initialization_with_stats(self, mock_config, mock_grid, mock_storage, mock_stats):
+        """Test SchemaGenerator initializes with stats."""
+        generator = SchemaGenerator(mock_config, mock_grid, mock_storage, mock_stats)
+        assert generator.stats == mock_stats
+class TestSchemaGeneratorBasicSchema:
+    """Tests for basic schema generation."""
+    @pytest.fixture
+    def generator(self):
+        """Create a basic generator for testing."""
+        config = MockProcessingConfig()
+        grid = create_mock_grid()
+        storage = MockStorage()
+        return SchemaGenerator(config, grid, storage)
+    def test_generate_catalog_schema_returns_dict(self, generator):
+        """Test schema generation returns a dictionary."""
+        partition_stats = {"partition_1": {"total_items": 100, "new_items": 100, "existing_items": 0}}
+        schema = generator.generate_catalog_schema(partition_stats)
+        assert isinstance(schema, dict)
+    def test_schema_has_required_top_level_keys(self, generator):
+        """Test schema includes all expected top-level keys."""
+        partition_stats = {"partition_1": {"total_items": 100}}
+        schema = generator.generate_catalog_schema(partition_stats)
+        required_keys = [
+            "earthcatalog_version",
+            "schema_version",
+            "generated_at",
+            "catalog_info",
+            "spatial_partitioning",
+            "temporal_partitioning",
+            "partition_structure",
+            "global_partitioning",
+            "statistics",
+            "usage",
+        ]
+        for key in required_keys:
+            assert key in schema, f"Missing required key: {key}"
+    def test_schema_version_present(self, generator):
+        """Test earthcatalog_version is included."""
+        partition_stats = {}
+        schema = generator.generate_catalog_schema(partition_stats)
+        assert "earthcatalog_version" in schema
+        assert schema["earthcatalog_version"] == "1.0.0"
+        assert "schema_version" in schema
+        assert schema["schema_version"] == "1.0.0"
+    def test_generated_at_timestamp_format(self, generator):
+        """Test generated_at timestamp is ISO format with Z suffix."""
+        partition_stats = {}
+        schema = generator.generate_catalog_schema(partition_stats)
+        assert "generated_at" in schema
+        timestamp = schema["generated_at"]
+        assert timestamp.endswith("Z")
+        # Should be parseable as ISO format
+        datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
+    def test_schema_json_serializable(self, generator):
+        """Test generated schema can be serialized to JSON."""
+        partition_stats = {"partition_1": {"total_items": 100}}
+        schema = generator.generate_catalog_schema(partition_stats)
+        # Should not raise
+        json_str = json.dumps(schema)
+        assert isinstance(json_str, str)
+        # Should be valid JSON
+        parsed = json.loads(json_str)
+        assert parsed == schema
+class TestSchemaGeneratorSpatialPartitioning:
+    """Tests for spatial partitioning metadata."""
+    @pytest.mark.parametrize(
+        "grid_system,resolution,expected_keys,expected_values",
+        [
+            (
+                "h3",
+                6,
+                ["grid_system", "resolution", "cell_area_km2", "cell_edge_length_km", "coordinate_system"],
+                {"grid_system": "h3", "resolution": 6, "coordinate_system": "EPSG:4326"},
+            ),
+            ("s2", 10, ["grid_system", "level", "average_cell_area_km2"], {"grid_system": "s2", "level": 10}),
+            ("mgrs", 2, ["grid_system", "precision", "precision_description"], {"grid_system": "mgrs", "precision": 2}),
+            ("utm", 3, ["grid_system", "precision"], {"grid_system": "utm", "precision": 3}),
+            ("latlon", 1, ["grid_system", "cell_size_degrees"], {"grid_system": "latlon", "cell_size_degrees": 1}),
+            (
+                "itslive",
+                10,
+                ["grid_system", "cell_size_degrees", "naming_convention"],
+                {"grid_system": "itslive", "cell_size_degrees": 10},
+            ),
+        ],
+        ids=["h3", "s2", "mgrs", "utm", "latlon", "itslive"],
+    )
+    def test_spatial_partitioning_metadata(self, grid_system, resolution, expected_keys, expected_values):
+        """Test spatial partitioning metadata for various grid systems."""
+        config = MockProcessingConfig(grid_system=grid_system, grid_resolution=resolution)
+        generator = SchemaGenerator(config, create_mock_grid(grid_system, resolution), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        spatial = schema["spatial_partitioning"]
+        for key in expected_keys:
+            assert key in spatial, f"Missing key '{key}' for {grid_system}"
+        for key, value in expected_values.items():
+            assert spatial[key] == value, f"Wrong value for '{key}' in {grid_system}"
+    def test_geojson_spatial_partitioning(self):
+        """Test GeoJSON grid metadata is correct."""
+        config = MockProcessingConfig(grid_system="geojson", grid_resolution=0, geojson_path="custom.geojson")
+        generator = SchemaGenerator(config, create_mock_grid("geojson", 0), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        spatial = schema["spatial_partitioning"]
+        assert spatial["grid_system"] == "geojson"
+        assert spatial["custom_grid"] is True
+class TestSchemaGeneratorTemporalPartitioning:
+    """Tests for temporal partitioning metadata."""
+    @pytest.mark.parametrize(
+        "temporal_bin,expected_pattern",
+        [
+            ("year", "year=2024/items.parquet"),
+            ("month", "year=2024/month=01/items.parquet"),
+            ("day", "year=2024/month=01/day=15/items.parquet"),
+        ],
+    )
+    def test_temporal_bin_examples(self, temporal_bin, expected_pattern):
+        """Test temporal binning produces correct path examples."""
+        config = MockProcessingConfig(temporal_bin=temporal_bin)
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        temporal = schema["temporal_partitioning"]
+        assert temporal["temporal_bin"] == temporal_bin
+        assert temporal["hive_path_examples"] == expected_pattern
+    def test_temporal_partitioning_fields(self):
+        """Test temporal partitioning includes all expected fields."""
+        config = MockProcessingConfig(temporal_bin="month")
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        temporal = schema["temporal_partitioning"]
+        assert "temporal_bin" in temporal
+        assert "temporal_bin_description" in temporal
+        assert "datetime_field" in temporal
+        assert "pruning_benefit" in temporal
+class TestSchemaGeneratorGlobalPartitioning:
+    """Tests for global partitioning metadata."""
+    def test_global_partitioning_enabled(self):
+        """Test global partitioning info when enabled."""
+        config = MockProcessingConfig(enable_global_partitioning=True, global_partition_threshold=100)
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        global_part = schema["global_partitioning"]
+        assert global_part["enabled"] is True
+        assert global_part["threshold"] == 100
+        assert "description" in global_part
+    def test_global_partitioning_disabled(self):
+        """Test global partitioning info when disabled."""
+        config = MockProcessingConfig(enable_global_partitioning=False)
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        global_part = schema["global_partitioning"]
+        assert global_part["enabled"] is False
+class TestSchemaGeneratorStatistics:
+    """Tests for statistics integration in schema."""
+    def test_statistics_from_ingestion_stats(self):
+        """Test statistics are taken from IngestionStatistics when provided."""
+        config = MockProcessingConfig()
+        stats = IngestionStatistics()
+        stats.stored_references = 5000
+        stats.spanning_items_count = 250
+        for i in range(1000):
+            stats.unique_ids.add(f"item_{i}")
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage(), stats)
+        schema = generator.generate_catalog_schema({})
+        # Should use stats.get_summary()
+        statistics = schema["statistics"]
+        assert "stored_references" in statistics
+        assert statistics["stored_references"] == 5000
+    def test_statistics_fallback_without_ingestion_stats(self):
+        """Test statistics fallback when IngestionStatistics not provided."""
+        config = MockProcessingConfig()
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        partition_stats = {
+            "partition_1": {"total_items": 100, "new_items": 80, "existing_items": 20},
+            "partition_2": {"total_items": 200, "new_items": 150, "existing_items": 50},
+        }
+        schema = generator.generate_catalog_schema(partition_stats)
+        statistics = schema["statistics"]
+        assert statistics["stored_references"] == 300  # 100 + 200
+        assert statistics["unique_granules"] == 300
+class TestSchemaGeneratorPartitionStructure:
+    """Tests for partition structure metadata."""
+    def test_partition_structure_counts(self):
+        """Test partition structure includes correct counts."""
+        config = MockProcessingConfig()
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        partition_stats = {
+            "sentinel2/partition=h3/level=6/abc123/year=2024/month=01": {"total_items": 100},
+            "sentinel2/partition=h3/level=6/def456/year=2024/month=02": {"total_items": 200},
+            "landsat8/partition=h3/level=6/abc123/year=2024/month=01": {"total_items": 150},
+        }
+        schema = generator.generate_catalog_schema(partition_stats)
+        structure = schema["partition_structure"]
+        assert structure["total_partitions"] == 3
+        assert "spatial_partitions_count" in structure
+        assert "temporal_partitions_count" in structure
+        assert "missions_count" in structure
+class TestSchemaGeneratorUsage:
+    """Tests for usage examples and recommendations."""
+    def test_usage_section_exists(self):
+        """Test usage section includes expected subsections."""
+        config = MockProcessingConfig()
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        usage = schema["usage"]
+        assert "file_structure" in usage
+        assert "spatial_partition_resolution" in usage
+        assert "partition_pruning" in usage
+        assert "recommended_tools" in usage
+    def test_usage_includes_python_example(self):
+        """Test usage includes Python example code."""
+        config = MockProcessingConfig()
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        spatial_resolution = schema["usage"]["spatial_partition_resolution"]
+        assert "python_example" in spatial_resolution
+        assert "spatial_resolver" in spatial_resolution["python_example"]
+class TestSchemaGeneratorHelperMethods:
+    """Tests for helper methods."""
+    @pytest.mark.parametrize(
+        "hive_parts,expected_result",
+        [
+            (["year=2024"], "2024"),
+            (["year=2024", "month=06"], "2024-06"),
+            (["year=2024", "month=06", "day=15"], "2024-06-15"),
+            ([], "unknown"),
+        ],
+        ids=["year", "month", "day", "empty"],
+    )
+    def test_hive_parts_to_temporal_bin(self, hive_parts, expected_result):
+        """Test converting Hive parts to temporal bin format."""
+        generator = SchemaGenerator(MockProcessingConfig(), create_mock_grid(), MockStorage())
+        result = generator._hive_parts_to_temporal_bin(hive_parts)
+        assert result == expected_result
+    def test_get_h3_average_area(self):
+        """Test H3 area lookup."""
+        generator = SchemaGenerator(MockProcessingConfig(), create_mock_grid(), MockStorage())
+        area = generator._get_h3_average_area(6)
+        assert area is not None
+        assert area == pytest.approx(36.129, rel=0.01)
+    def test_get_h3_average_edge_length(self):
+        """Test H3 edge length lookup."""
+        generator = SchemaGenerator(MockProcessingConfig(), create_mock_grid(), MockStorage())
+        edge = generator._get_h3_average_edge_length(6)
+        assert edge is not None
+        assert edge == pytest.approx(3.23, rel=0.01)
+    def test_get_s2_average_area(self):
+        """Test S2 area calculation."""
+        generator = SchemaGenerator(MockProcessingConfig(), create_mock_grid(), MockStorage())
+        area = generator._get_s2_average_area(10)
+        assert area is not None
+        assert area > 0
+    def test_get_grid_description(self):
+        """Test grid description for all systems."""
+        for grid_system in ["h3", "s2", "mgrs", "utm", "latlon", "itslive", "geojson"]:
+            config = MockProcessingConfig(grid_system=grid_system)
+            generator = SchemaGenerator(config, create_mock_grid(grid_system), MockStorage())
+            description = generator._get_grid_description()
+            assert isinstance(description, str)
+            assert len(description) > 0
+class TestSchemaGeneratorWriting:
+    """Tests for schema file writing."""
+    def test_schema_written_to_storage(self):
+        """Test schema is written to storage backend."""
+        storage = MockStorage()
+        config = MockProcessingConfig(output_catalog="./test_catalog")
+        generator = SchemaGenerator(config, create_mock_grid(), storage)
+        generator.generate_catalog_schema({})
+        # Check that something was written
+        assert len(storage.written_files) > 0
+    def test_schema_custom_filename(self):
+        """Test schema can be written with custom filename."""
+        storage = MockStorage()
+        config = MockProcessingConfig(output_catalog="./test_catalog")
+        generator = SchemaGenerator(config, create_mock_grid(), storage)
+        generator.generate_catalog_schema({}, output_filename="custom_schema.json")
+        # Verify custom filename was used
+        written_paths = list(storage.written_files.keys())
+        assert any("custom_schema.json" in path for path in written_paths)
+class TestSchemaGeneratorEdgeCases:
+    """Edge case tests for schema generation."""
+    def test_empty_partition_stats(self):
+        """Test schema generation with empty partition stats."""
+        config = MockProcessingConfig()
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        assert schema["partition_structure"]["total_partitions"] == 0
+    def test_none_temporal_binning_fallback(self):
+        """Test schema handles unusual temporal bin gracefully."""
+        # This tests the temporal_bin_description dict lookup
+        config = MockProcessingConfig(temporal_bin="month")
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        schema = generator.generate_catalog_schema({})
+        assert schema["temporal_partitioning"]["temporal_bin"] == "month"
+    def test_unknown_grid_system_description(self):
+        """Test description for unknown grid system."""
+        config = MockProcessingConfig(grid_system="unknown_grid")
+        generator = SchemaGenerator(config, create_mock_grid("unknown_grid"), MockStorage())
+        description = generator._get_grid_description()
+        assert "Unknown grid system" in description
+    def test_large_partition_stats(self):
+        """Test schema with many partitions."""
+        config = MockProcessingConfig()
+        generator = SchemaGenerator(config, create_mock_grid(), MockStorage())
+        # Create 100 partitions
+        partition_stats = {
+            f"mission/partition=h3/level=6/cell_{i}/year=2024/month=01": {"total_items": i * 10} for i in range(100)
+        }
+        schema = generator.generate_catalog_schema(partition_stats)
+        assert schema["partition_structure"]["total_partitions"] == 100