earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,477 @@
1
+ """Tests for the statistics module."""
2
+
3
+ import time
4
+
5
+ import pytest
6
+
7
+ from earthcatalog.statistics import HyperLogLog, IngestionStatistics
8
+
9
+
10
class TestHyperLogLog:
    """Behavioral tests for the HyperLogLog approximate-cardinality sketch."""

    def test_empty_hll(self):
        """A sketch with no additions reports zero cardinality."""
        sketch = HyperLogLog(precision=14)
        assert sketch.count() == 0

    def test_single_element(self):
        """A single insertion yields a count of at least one."""
        sketch = HyperLogLog(precision=14)
        sketch.add("item_001")
        assert sketch.count() >= 1

    def test_duplicate_elements(self):
        """Re-adding the same value repeatedly must not inflate the estimate."""
        sketch = HyperLogLog(precision=14)
        for _ in range(100):
            sketch.add("same_item")
        # Estimate stays ~1 no matter how many duplicates went in.
        assert sketch.count() <= 2

    def test_many_unique_elements(self):
        """An estimate over 10k distinct keys stays within 5% relative error."""
        sketch = HyperLogLog(precision=14)
        total = 10000
        for idx in range(total):
            sketch.add(f"item_{idx:08d}")

        # precision=14 gives ~0.8% standard error; 5% leaves ample slack.
        rel_error = abs(sketch.count() - total) / total
        assert rel_error < 0.05, f"Error {rel_error:.2%} exceeds 5% threshold"

    def test_precision_affects_accuracy(self):
        """Low- and high-precision sketches each stay within their error band."""
        total = 5000

        coarse = HyperLogLog(precision=10)  # fewer registers, less accurate
        fine = HyperLogLog(precision=14)    # more registers, more accurate
        for idx in range(total):
            key = f"item_{idx}"
            coarse.add(key)
            fine.add(key)

        # A tighter bound for higher precision is not guaranteed on any single
        # run, so each sketch is only checked against its own band.
        assert abs(coarse.count() - total) / total < 0.15  # within 15%
        assert abs(fine.count() - total) / total < 0.10    # within 10%

    def test_merge_hlls(self):
        """Merging two disjoint sketches estimates the union cardinality."""
        left = HyperLogLog(precision=14)
        right = HyperLogLog(precision=14)

        # Each sketch receives 1000 keys drawn from disjoint namespaces.
        for idx in range(1000):
            left.add(f"item_a_{idx}")
            right.add(f"item_b_{idx}")

        left.merge(right)

        # The union holds ~2000 distinct keys.
        assert abs(left.count() - 2000) / 2000 < 0.10

    def test_merge_with_overlap(self):
        """Merging sketches with shared keys counts each key only once."""
        left = HyperLogLog(precision=14)
        right = HyperLogLog(precision=14)

        for idx in range(1000):
            left.add(f"item_{idx}")
        # 500 of these keys overlap with `left`; 500 are new.
        for idx in range(500, 1500):
            right.add(f"item_{idx}")

        left.merge(right)

        # The union holds ~1500 distinct keys.
        assert abs(left.count() - 1500) / 1500 < 0.10

    def test_merge_different_precision_fails(self):
        """Sketches with mismatched precision refuse to merge."""
        coarse = HyperLogLog(precision=10)
        fine = HyperLogLog(precision=14)

        with pytest.raises(ValueError, match="different precision"):
            coarse.merge(fine)

    def test_invalid_precision(self):
        """Precision values outside the supported range are rejected."""
        for bad_precision in (3, 17):  # one below, one above the valid window
            with pytest.raises(ValueError):
                HyperLogLog(precision=bad_precision)
120
+
121
+
122
class TestIngestionStatistics:
    """Tests for the IngestionStatistics collector.

    Covers counting (unique granules vs. stored references), spanning/global
    routing overhead, temporal and spatial distributions, quality metrics,
    URL/timing bookkeeping, worker-merge semantics, and datetime parsing.
    """

    @pytest.fixture
    def sample_item(self):
        """Create a sample STAC item (Polygon over the SF Bay area) for testing."""
        return {
            "id": "test_item_001",
            "type": "Feature",
            "geometry": {
                "type": "Polygon",
                "coordinates": [
                    [
                        [-122.5, 37.5],
                        [-122.0, 37.5],
                        [-122.0, 38.0],
                        [-122.5, 38.0],
                        [-122.5, 37.5],
                    ]
                ],
            },
            "properties": {
                "datetime": "2024-06-15T10:30:00Z",
                "collection": "sentinel-2",
            },
        }

    @pytest.fixture
    def stats(self):
        """Create a fresh statistics instance."""
        return IngestionStatistics()

    def test_empty_stats(self, stats):
        """Test empty statistics summary: all counters start at zero."""
        summary = stats.get_summary()

        assert summary["unique_granules"] == 0
        assert summary["stored_references"] == 0
        assert summary["overhead"]["spanning_items"] == 0

    def test_record_single_item(self, stats, sample_item):
        """Test recording a single item lands in every relevant counter."""
        stats.record_item(
            item=sample_item,
            tiles=["abc123"],
            is_spanning=False,
            routed_to_global=False,
            mission="sentinel2",
        )

        summary = stats.get_summary()

        # One item in one tile: granules == references.
        assert summary["unique_granules"] == 1
        assert summary["stored_references"] == 1
        assert summary["missions"]["sentinel2"] == 1
        assert summary["quality"]["geometry_types"]["Polygon"] == 1

    def test_record_spanning_item(self, stats, sample_item):
        """Test recording a spanning item (multiple tiles)."""
        stats.record_item(
            item=sample_item,
            tiles=["tile1", "tile2", "tile3"],
            is_spanning=True,
            routed_to_global=False,
            mission="landsat",
        )

        summary = stats.get_summary()

        assert summary["unique_granules"] == 1
        assert summary["stored_references"] == 3  # Stored in 3 tiles
        assert summary["overhead"]["spanning_items"] == 1
        assert summary["overhead"]["max_tiles_per_item"] == 3
        # duplication_ratio = stored_references / unique_granules
        assert summary["overhead"]["duplication_ratio"] == 3.0

    def test_record_global_routed_item(self, stats, sample_item):
        """Test recording an item routed to the global partition."""
        stats.record_item(
            item=sample_item,
            tiles=["t1", "t2", "t3", "t4", "t5"],  # Would span 5 tiles
            is_spanning=True,
            routed_to_global=True,
            mission="modis",
        )

        summary = stats.get_summary()

        # Routed to global = only 1 stored reference
        assert summary["stored_references"] == 1
        assert summary["global_partition"]["items_routed_to_global"] == 1

    def test_temporal_tracking(self, stats):
        """Test temporal distribution tracking (per-year/per-month buckets)."""
        # Six items spread over months 01 (x2), 02 (x1), 03 (x3) of 2024.
        items = [
            {"id": f"item_{i}", "geometry": None, "properties": {"datetime": f"2024-0{m}-15T00:00:00Z"}}
            for i, m in enumerate([1, 1, 2, 3, 3, 3], 1)
        ]

        for item in items:
            stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()

        assert "2024" in summary["temporal"]["years_with_data"]
        assert summary["temporal"]["distribution"]["2024"]["months"]["01"] == 2
        assert summary["temporal"]["distribution"]["2024"]["months"]["02"] == 1
        assert summary["temporal"]["distribution"]["2024"]["months"]["03"] == 3

    def test_spatial_bbox_tracking(self, stats):
        """Test spatial bounding-box tracking across multiple point geometries."""
        items = [
            {
                "id": "item1",
                "geometry": {"type": "Point", "coordinates": [-100, 30]},
                "properties": {},
            },
            {
                "id": "item2",
                "geometry": {"type": "Point", "coordinates": [-80, 45]},
                "properties": {},
            },
        ]

        for item in items:
            stats.record_item(item, tiles=["t1"], is_spanning=False, mission="test")

        summary = stats.get_summary()
        # bbox is [min_lon, min_lat, max_lon, max_lat].
        bbox = summary["spatial"]["bbox"]

        assert bbox[0] == -100  # min lon
        assert bbox[1] == 30  # min lat
        assert bbox[2] == -80  # max lon
        assert bbox[3] == 45  # max lat

    def test_quality_metrics(self, stats):
        """Test data-quality metric tracking (null geometry, missing datetime)."""
        # Item with null geometry
        stats.record_item(
            {"id": "null_geom", "geometry": None, "properties": {}}, tiles=[], is_spanning=False, mission="test"
        )

        # Item with missing datetime
        stats.record_item(
            {"id": "no_dt", "geometry": {"type": "Point", "coordinates": [0, 0]}, "properties": {}},
            tiles=["t1"],
            is_spanning=False,
            mission="test",
        )

        summary = stats.get_summary()

        assert summary["quality"]["null_geometries"] == 1
        assert summary["quality"]["missing_datetime"] == 2  # Both items missing datetime

    def test_url_processing_stats(self, stats):
        """Test URL processing statistics (totals, failures, success rate)."""
        stats.record_url_processed(success=True)
        stats.record_url_processed(success=True)
        stats.record_url_processed(success=False)

        summary = stats.get_summary()

        assert summary["processing"]["urls_processed"] == 3
        assert summary["processing"]["urls_failed"] == 1
        # success_rate is expressed as a percentage (2/3 -> ~66.67).
        assert summary["processing"]["success_rate"] == pytest.approx(66.67, rel=0.01)

    def test_timing_metrics(self, stats):
        """Test processing timing metrics via start/finish bracketing."""
        stats.start_processing()
        time.sleep(0.1)  # Brief delay
        stats.finish_processing()

        summary = stats.get_summary()

        assert summary["processing"]["duration_seconds"] >= 0.1

    def test_hotspot_detection(self, stats):
        """Test hotspot cell detection on a deliberately uneven distribution."""
        # 100 items into one cell vs. 10 into another.
        for i in range(100):
            stats.record_item(
                {"id": f"item_{i}", "geometry": None, "properties": {}},
                tiles=["hot_cell"],
                is_spanning=False,
                mission="test",
            )
        for i in range(10):
            stats.record_item(
                {"id": f"cold_item_{i}", "geometry": None, "properties": {}},
                tiles=["cold_cell"],
                is_spanning=False,
                mission="test",
            )

        summary = stats.get_summary()
        hotspots = summary["spatial"]["hotspot_cells"]

        # Hot cell should be first (hotspots appear sorted by count, descending).
        assert hotspots[0]["cell"] == "hot_cell"
        assert hotspots[0]["count"] == 100

    def test_merge_statistics(self, stats, sample_item):
        """Test merging statistics from multiple workers."""
        stats2 = IngestionStatistics()

        # Record items in first stats.
        # NOTE: .copy() is shallow, which is safe here — only the top-level
        # "id" key is reassigned per item.
        for i in range(5):
            item = sample_item.copy()
            item["id"] = f"item_a_{i}"
            stats.record_item(item, tiles=["cell_a"], is_spanning=False, mission="mission_a")

        # Record items in second stats
        for i in range(3):
            item = sample_item.copy()
            item["id"] = f"item_b_{i}"
            stats2.record_item(item, tiles=["cell_b"], is_spanning=False, mission="mission_b")

        # Merge
        stats.merge(stats2)

        summary = stats.get_summary()

        # Counters are additive across the merge; per-mission tallies survive.
        assert summary["unique_granules"] == 8
        assert summary["stored_references"] == 8
        assert summary["missions"]["mission_a"] == 5
        assert summary["missions"]["mission_b"] == 3

    def test_consolidation_stats(self, stats):
        """Test consolidation statistics recording accumulates across calls."""
        stats.record_consolidation(new_items=100, existing_items=50, duplicates_removed=10)
        stats.record_consolidation(new_items=200, existing_items=100, duplicates_removed=5)

        summary = stats.get_summary()

        assert summary["processing"]["new_items"] == 300
        assert summary["processing"]["existing_items"] == 150
        assert summary["processing"]["duplicates_removed"] == 15

    def test_overhead_percentage(self, stats, sample_item):
        """Test overhead percentage calculation."""
        # Add 10 unique items, 3 of which span 2 tiles each
        for i in range(7):
            item = sample_item.copy()
            item["id"] = f"single_{i}"
            stats.record_item(item, tiles=[f"t{i}"], is_spanning=False, mission="test")

        for i in range(3):
            item = sample_item.copy()
            item["id"] = f"spanning_{i}"
            stats.record_item(item, tiles=["tA", "tB"], is_spanning=True, mission="test")

        summary = stats.get_summary()

        # 10 unique items, 7 + 6 = 13 stored references
        assert summary["unique_granules"] == 10
        assert summary["stored_references"] == 13
        assert summary["overhead"]["overhead_percentage"] == 30.0  # (13-10)/10 * 100

    def test_tiles_distribution_histogram(self, stats, sample_item):
        """Test the tiles-per-spanning-item histogram."""
        # Add spanning items with different tile counts
        for i in range(5):
            item = sample_item.copy()
            item["id"] = f"span2_{i}"
            stats.record_item(item, tiles=["a", "b"], is_spanning=True, mission="test")

        for i in range(3):
            item = sample_item.copy()
            item["id"] = f"span5_{i}"
            stats.record_item(item, tiles=["a", "b", "c", "d", "e"], is_spanning=True, mission="test")

        summary = stats.get_summary()

        # Check histogram buckets
        # NOTE(review): bucket 5 appears to cover items spanning 3-5 tiles,
        # per the original comment — confirm against the histogram's bucketing.
        assert summary["overhead"]["tiles_distribution"][2] == 5  # 5 items with 2 tiles
        assert summary["overhead"]["tiles_distribution"][5] == 3  # 3 items with 3-5 tiles

    def test_datetime_parsing_z_suffix(self, stats):
        """Test datetime parsing with Z suffix (common in STAC)."""
        item = {
            "id": "z_suffix_item",
            "geometry": None,
            "properties": {"datetime": "2024-06-15T10:30:00Z"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["06"] == 1
        assert summary["quality"]["missing_datetime"] == 0

    def test_datetime_parsing_with_timezone(self, stats):
        """Test datetime parsing with explicit timezone offset."""
        item = {
            "id": "tz_item",
            "geometry": None,
            "properties": {"datetime": "2024-03-20T15:45:30+05:30"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["03"] == 1

    def test_datetime_parsing_no_timezone(self, stats):
        """Test datetime parsing without timezone (naive datetime)."""
        item = {
            "id": "naive_item",
            "geometry": None,
            "properties": {"datetime": "2024-12-25T00:00:00"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["12"] == 1

    def test_datetime_parsing_with_microseconds(self, stats):
        """Test datetime parsing with microseconds."""
        item = {
            "id": "micro_item",
            "geometry": None,
            "properties": {"datetime": "2024-01-01T12:00:00.123456Z"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["01"] == 1

    def test_datetime_parsing_invalid_format(self, stats):
        """Test that invalid datetime formats are counted as missing."""
        item = {
            "id": "invalid_dt",
            "geometry": None,
            "properties": {"datetime": "not-a-date"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["quality"]["missing_datetime"] == 1
459
+
460
+
461
class TestHyperLogLogLargeScale:
    """Slow, large-cardinality checks for HyperLogLog (opt-in via the `slow` mark)."""

    @pytest.mark.slow
    def test_million_elements(self):
        """One million distinct keys are estimated within 2% at precision=14."""
        sketch = HyperLogLog(precision=14)
        total = 1_000_000

        for idx in range(total):
            sketch.add(f"item_{idx:08d}")

        # precision=14 gives roughly 0.8% standard error, so 2% is comfortable.
        rel_error = abs(sketch.count() - total) / total
        assert rel_error < 0.02, f"Error {rel_error:.2%} exceeds 2% threshold"