earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,245 @@
1
+ """Tests for file structure features."""
2
+
3
import shutil
import tempfile
import unittest
from pathlib import Path

from earthcatalog.ingestion_pipeline import LocalProcessor, ProcessingConfig, STACIngestionPipeline
from earthcatalog.spatial_resolver import SpatialPartitionResolver


11
class TestFileStructure(unittest.TestCase):
    """Test file structure features.

    Exercises mission extraction and sanitization, Hive-style partition
    key generation, output-format defaults/validation, and spatial
    resolver structure detection in the ingestion pipeline.
    """

    def setUp(self):
        """Set up test environment: scratch dir plus a minimal STAC item fixture."""
        self.temp_dir = Path(tempfile.mkdtemp())
        self.sample_stac_item = {
            "type": "Feature",
            "id": "test_001",
            "geometry": {"type": "Point", "coordinates": [-105.0, 40.0]},
            "properties": {
                "datetime": "2024-01-15T10:30:00Z",
                "dataset_id": "landsat8_test",
                "collection": "test-collection",
            },
        }

    def tearDown(self):
        """Clean up test environment (scratch dir from setUp)."""
        # ignore_errors so a partially-deleted tree doesn't fail the test run
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_mission_extraction_from_dataset_id(self):
        """Test mission extraction from dataset_id field."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            mission_field="dataset_id",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Test mission extraction
        mission = pipeline._extract_mission(self.sample_stac_item)
        self.assertEqual(mission, "landsat8_test")

    def test_mission_extraction_fallback_to_collection(self):
        """Test mission extraction falls back to collection field."""
        item_without_dataset_id = {
            "type": "Feature",
            "properties": {"datetime": "2024-01-15T10:30:00Z", "collection": "sentinel2-collection"},
        }

        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            mission_field="dataset_id",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Hyphen in "sentinel2-collection" is expected to be sanitized to "_"
        mission = pipeline._extract_mission(item_without_dataset_id)
        self.assertEqual(mission, "sentinel2_collection")

    def test_mission_sanitization(self):
        """Test mission name sanitization for filesystem compatibility."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Test various mission names that need sanitization
        # (input, expected sanitized output)
        test_cases = [
            ("Landsat-8 Collection", "landsat_8_collection"),
            ("MODIS/Terra", "modis_terra"),
            ("Sentinel-2A/B", "sentinel_2a_b"),
            ("Test@Collection#1", "test_collection_1"),
            ("___test___", "test"),
            ("", "unnamed"),
        ]

        for input_name, expected in test_cases:
            sanitized = pipeline._sanitize_mission_name(input_name)
            self.assertEqual(sanitized, expected)

    def test_partition_key_format(self):
        """Test new partition key format with Hive-style temporal partitioning."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            grid_resolution=2,
            temporal_bin="month",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        partition_key = pipeline._compute_partition_key(self.sample_stac_item)

        # Should follow Hive-style format: mission/partition=h3/level=2/cell_id/year=2024/month=01
        parts = partition_key.split("/")
        self.assertEqual(len(parts), 6)
        self.assertEqual(parts[0], "landsat8_test")  # mission
        self.assertEqual(parts[1], "partition=h3")  # partition type
        self.assertEqual(parts[2], "level=2")  # resolution level
        self.assertTrue(parts[3].startswith("8"))  # H3 cell (starts with 8)
        self.assertEqual(parts[4], "year=2024")  # Hive-style year
        self.assertEqual(parts[5], "month=01")  # Hive-style month

    def test_h3_level_2_default(self):
        """Test H3 level 2 as new default."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
        )

        # Should default to level 2
        self.assertEqual(config.grid_resolution, 2)

    def test_output_format_geoparquet_default(self):
        """Test that GeoParquet is the default output format."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
        )

        self.assertEqual(config.output_format, "geoparquet")

    def test_output_format_ndjson_option(self):
        """Test NDJSON as an output format option."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="ndjson",
        )

        self.assertEqual(config.output_format, "ndjson")

    def test_final_partition_path_geoparquet(self):
        """Test final partition path generation with GeoParquet using Hive-style."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="geoparquet",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Hive-style partition key
        partition_key = "landsat8_test/partition=h3/level=2/821f7ffffffffff/year=2024/month=01"
        expected_path = f"{config.output_catalog}/{partition_key}/items.parquet"

        result_path = pipeline._get_final_partition_path(partition_key)
        self.assertEqual(result_path, expected_path)

    def test_final_partition_path_ndjson(self):
        """Test final partition path generation with NDJSON using Hive-style."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="ndjson",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Hive-style partition key
        partition_key = "landsat8_test/partition=h3/level=2/821f7ffffffffff/year=2024/month=01"
        expected_path = f"{config.output_catalog}/{partition_key}/items.ndjson"

        result_path = pipeline._get_final_partition_path(partition_key)
        self.assertEqual(result_path, expected_path)

    def test_spatial_resolver_structure_detection(self):
        """Test spatial resolver detects file structure."""
        # Create a schema describing the catalog layout; the resolver is
        # expected to infer the mission names from example_paths.
        schema = {
            "spatial_partitioning": {
                "grid_system": "h3",
                "resolution": 2,
                "partitioning_scheme": "default",
                "structure": "/{mission}/partition={grid}/level={resolution}/{spatial_id}/{temporal}.parquet",
                "example_paths": [
                    "landsat8/partition=h3/level=2/821f7ffffffffff/2024-01.parquet",
                    "sentinel2/partition=h3/level=2/821f7ffffffffff/2024-01.parquet",
                ],
            },
            "global_partitioning": {"enabled": True, "threshold": 1},
        }

        resolver = SpatialPartitionResolver(schema, str(self.temp_dir))

        self.assertIn("landsat8", resolver.missions)
        self.assertIn("sentinel2", resolver.missions)

    def test_config_validation_output_format(self):
        """Test configuration validation for output format."""
        # Create test file
        test_file = self.temp_dir / "test.parquet"
        test_file.write_text("")  # Create empty file for validation

        # Valid format should not raise
        config = ProcessingConfig(
            input_file=str(test_file),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="geoparquet",
        )
        # This should not raise an exception
        try:
            config.validate()
        except ValueError:
            self.fail("Valid configuration should not raise ValueError")

        # Invalid format should raise
        with self.assertRaises(ValueError) as context:
            config = ProcessingConfig(
                input_file=str(self.temp_dir / "test.parquet"),
                output_catalog=str(self.temp_dir / "catalog"),
                scratch_location=str(self.temp_dir / "scratch"),
                output_format="invalid_format",
            )
            config.validate()

        self.assertIn("output_format must be 'geoparquet' or 'ndjson'", str(context.exception))
244
# Allow running this test module directly (e.g. `python test_file_structure.py`).
if __name__ == "__main__":
    unittest.main()