earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry; every file in this version is shown as newly added (+N -0).
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_multi_file_input.py
@@ -0,0 +1,336 @@
"""Tests for multi-file input pattern support."""

import json
from pathlib import Path

import pytest

from earthcatalog.ingestion_pipeline import ProcessingConfig
from earthcatalog.input_readers import ReaderFactory


class TestMultiFileInput:
    """Tests for glob pattern-based multi-file input."""

    @pytest.fixture
    def synthetic_bulk_dir(self, tmp_path: Path) -> Path:
        """Create a temporary directory with synthetic bulk data files.

        Creates files following the ITS_LIVE bulk data pattern:
        - {year}_{chunk_no}.ndjson (e.g., 2020_1.ndjson, 2020_2.ndjson)
        - Each file contains 5-10 STAC items with url field

        Args:
            tmp_path: Pytest temporary directory fixture.

        Returns:
            Path to the temporary directory containing synthetic data.
        """
        bulk_dir = tmp_path / "bulk_data"
        bulk_dir.mkdir()

        # Create synthetic data files for different years and chunks
        synthetic_data = {
            "2020_1.ndjson": [
                {"url": f"https://example.com/item_{i}.json", "id": f"item_{i}", "year": 2020} for i in range(5)
            ],
            "2020_2.ndjson": [
                {"url": f"https://example.com/item_{i}.json", "id": f"item_{i}", "year": 2020} for i in range(5, 10)
            ],
            "2021_1.ndjson": [
                {"url": f"https://example.com/item_{i}.json", "id": f"item_{i}", "year": 2021} for i in range(10, 15)
            ],
            "2021_2.ndjson": [
                {"url": f"https://example.com/item_{i}.json", "id": f"item_{i}", "year": 2021} for i in range(15, 20)
            ],
            "2022_1.ndjson": [
                {"url": f"https://example.com/item_{i}.json", "id": f"item_{i}", "year": 2022} for i in range(20, 25)
            ],
        }

        # Write each file
        for filename, items in synthetic_data.items():
            file_path = bulk_dir / filename
            with file_path.open("w") as f:
                for item in items:
                    f.write(json.dumps(item) + "\n")

        return bulk_dir

    def test_reader_factory_ndjson_support(self):
        """Test that ReaderFactory supports ndjson and jsonl formats."""
        formats = ReaderFactory.get_supported_formats()
        assert "ndjson" in formats
        assert "jsonl" in formats

    def test_auto_detect_ndjson_format(self, tmp_path: Path):
        """Test auto-detection of .ndjson files."""
        test_file = tmp_path / "test.ndjson"
        test_file.write_text('{"url": "https://example.com/item.json"}\n')

        format_detected = ReaderFactory.auto_detect_format(str(test_file))
        assert format_detected == "ndjson"

    def test_auto_detect_jsonl_format(self, tmp_path: Path):
        """Test auto-detection of .jsonl files."""
        test_file = tmp_path / "test.jsonl"
        test_file.write_text('{"url": "https://example.com/item.json"}\n')

        format_detected = ReaderFactory.auto_detect_format(str(test_file))
        assert format_detected == "jsonl"

    def test_read_single_ndjson_file(self, synthetic_bulk_dir: Path):
        """Test reading URLs from a single NDJSON file."""
        test_file = synthetic_bulk_dir / "2020_1.ndjson"

        reader = ReaderFactory.get_reader("ndjson")
        urls = reader.read_urls(str(test_file), "url")

        assert len(urls) == 5
        assert all(url.startswith("https://example.com/item_") for url in urls)

    def test_processing_config_with_pattern(self):
        """Test ProcessingConfig accepts input_pattern field."""
        config = ProcessingConfig(
            input_file="./data",
            output_catalog="./catalog",
            scratch_location="./scratch",
            input_pattern="./data/2020_*.ndjson",
        )

        assert config.input_pattern == "./data/2020_*.ndjson"
        # validate() should not raise when pattern is provided
        # (skips file existence check for input_file)
        try:
            config.validate()
        except (ValueError, TypeError, OSError, RuntimeError) as e:
            pytest.fail(f"validate() raised unexpected exception: {e}")

    def test_processing_config_pattern_validation(self):
        """Test that validation passes when input_pattern is provided."""
        config = ProcessingConfig(
            input_file="./nonexistent",  # Can be non-existent when pattern is provided
            output_catalog="./catalog",
            scratch_location="./scratch",
            input_pattern="./real_data/*.ndjson",
        )

        # Should not raise FileNotFoundError since pattern is provided
        # Note: Actual file discovery happens during processing, not validation
        assert config.input_pattern == "./real_data/*.ndjson"

    def test_glob_pattern_local_filesystem(self, synthetic_bulk_dir: Path):
        """Test glob pattern matching on local filesystem."""
        import glob as glob_module

        pattern = str(synthetic_bulk_dir / "2020_*.ndjson")
        matching_files = glob_module.glob(pattern)

        assert len(matching_files) == 2
        assert all("2020_" in f for f in matching_files)
        assert all(f.endswith(".ndjson") for f in matching_files)

    def test_glob_pattern_all_years(self, synthetic_bulk_dir: Path):
        """Test glob pattern matching across all years."""
        import glob as glob_module

        pattern = str(synthetic_bulk_dir / "*.ndjson")
        matching_files = sorted(glob_module.glob(pattern))

        assert len(matching_files) == 5
        # Check we get all expected files
        expected_files = [
            "2020_1.ndjson",
            "2020_2.ndjson",
            "2021_1.ndjson",
            "2021_2.ndjson",
            "2022_1.ndjson",
        ]
        actual_files = [Path(f).name for f in matching_files]
        assert actual_files == expected_files

    def test_read_urls_from_multiple_files(self, synthetic_bulk_dir: Path):
        """Test reading URLs from multiple files using a pattern."""
        import glob as glob_module

        pattern = str(synthetic_bulk_dir / "2020_*.ndjson")
        matching_files = glob_module.glob(pattern)

        all_urls = []
        reader = ReaderFactory.get_reader("ndjson")

        for file_path in matching_files:
            urls = reader.read_urls(file_path, "url")
            all_urls.extend(urls)

        assert len(all_urls) == 10  # 5 from each of 2 files
        # Check that URLs from different files are all included
        assert "https://example.com/item_0.json" in all_urls
        assert "https://example.com/item_9.json" in all_urls

    def test_pattern_year_specific(self, synthetic_bulk_dir: Path):
        """Test pattern matching a specific year."""
        import glob as glob_module

        pattern = str(synthetic_bulk_dir / "2021_*.ndjson")
        matching_files = glob_module.glob(pattern)

        assert len(matching_files) == 2
        for f in matching_files:
            assert "2021_" in Path(f).name

    def test_pattern_chunk_specific(self, synthetic_bulk_dir: Path):
        """Test pattern matching a specific chunk number across years."""
        import glob as glob_module

        pattern = str(synthetic_bulk_dir / "*_1.ndjson")
        matching_files = sorted(glob_module.glob(pattern))

        assert len(matching_files) == 3
        # Should get 2020_1.ndjson, 2021_1.ndjson, 2022_1.ndjson
        expected_names = ["2020_1.ndjson", "2021_1.ndjson", "2022_1.ndjson"]
        actual_names = [Path(f).name for f in matching_files]
        assert actual_names == expected_names

    def test_synthetic_data_cleanup(self, synthetic_bulk_dir: Path):
        """Test that synthetic data is in a temporary directory and will be cleaned up.

        This test verifies that the synthetic_bulk_dir fixture creates files
        within pytest's tmp_path, which will be automatically cleaned up.
        """
        # Verify files exist
        assert synthetic_bulk_dir.exists()
        assert (synthetic_bulk_dir / "2020_1.ndjson").exists()
        assert (synthetic_bulk_dir / "2021_1.ndjson").exists()

        # Verify files are in pytest's temporary directory
        # The tmp_path fixture is managed by pytest and will be cleaned up
        # Parent directory should be part of pytest's temp directory structure
        assert "pytest-of-" in str(synthetic_bulk_dir) or "tmp" in str(synthetic_bulk_dir).lower()

    def test_pattern_no_matches(self, tmp_path: Path):
        """Test behavior when a pattern matches no files."""
        import glob as glob_module

        pattern = str(tmp_path / "nonexistent_*.ndjson")
        matching_files = glob_module.glob(pattern)

        assert matching_files == []

    def test_empty_directory_handling(self, tmp_path: Path):
        """Test handling of an empty directory with a pattern."""
        empty_dir = tmp_path / "empty"
        empty_dir.mkdir()

        import glob as glob_module

        pattern = str(empty_dir / "*.ndjson")
        matching_files = glob_module.glob(pattern)

        assert matching_files == []

    def test_mixed_file_extensions(self, synthetic_bulk_dir: Path):
        """Test that the pattern only matches .ndjson files."""
        # Create a non-ndjson file
        (synthetic_bulk_dir / "readme.txt").write_text("This is a readme")

        import glob as glob_module

        pattern = str(synthetic_bulk_dir / "*.ndjson")
        matching_files = glob_module.glob(pattern)

        # Should only match .ndjson files, not .txt
        assert len(matching_files) == 5
        assert all(f.endswith(".ndjson") for f in matching_files)

    def test_pattern_recursive_directory(self, tmp_path: Path):
        """Test recursive pattern matching with subdirectories."""
        # Create nested directory structure
        nested_dir = tmp_path / "level1" / "level2"
        nested_dir.mkdir(parents=True)

        # Create file in nested directory
        nested_file = nested_dir / "nested_1.ndjson"
        nested_file.write_text('{"url": "https://example.com/nested.json"}\n')

        # Create file in top level
        top_file = tmp_path / "top_1.ndjson"
        top_file.write_text('{"url": "https://example.com/top.json"}\n')

        import glob as glob_module

        # Non-recursive should only find top level
        pattern = str(tmp_path / "*_1.ndjson")
        matching_files = sorted(glob_module.glob(pattern))

        assert len(matching_files) == 1
        assert "top_1.ndjson" in matching_files[0]

        # Recursive should find both
        pattern_recursive = str(tmp_path / "**" / "*_1.ndjson")
        matching_files_recursive = sorted(glob_module.glob(pattern_recursive, recursive=True))

        assert len(matching_files_recursive) == 2


class TestPatternValidation:
    """Tests for pattern validation and edge cases."""

    def test_config_with_empty_pattern(self):
        """Test ProcessingConfig with an empty pattern string."""
        config = ProcessingConfig(
            input_file="./data.parquet",
            output_catalog="./catalog",
            scratch_location="./scratch",
            input_pattern="",  # Empty pattern
        )

        assert config.input_pattern == ""
        # Empty pattern should be treated as no pattern (single file mode)

    def test_config_pattern_with_s3_wildcard(self):
        """Test S3 pattern configuration."""
        config = ProcessingConfig(
            input_file="s3://bucket/bulk",
            output_catalog="s3://bucket/catalog",
            scratch_location="s3://bucket/scratch",
            input_pattern="s3://bucket/bulk/2020_*.ndjson",
        )

        assert config.input_pattern == "s3://bucket/bulk/2020_*.ndjson"

    def test_config_to_dict_includes_pattern(self):
        """Test that input_pattern is included in to_dict()."""
        config = ProcessingConfig(
            input_file="./data",
            output_catalog="./catalog",
            scratch_location="./scratch",
            input_pattern="./data/*.ndjson",
        )

        config_dict = config.to_dict()
        assert "input_pattern" in config_dict
        assert config_dict["input_pattern"] == "./data/*.ndjson"

    def test_config_from_dict_with_pattern(self):
        """Test creating a config from a dict with input_pattern."""
        config_data = {
            "input_file": "./data",
            "output_catalog": "./catalog",
            "scratch_location": "./scratch",
            "input_pattern": "./data/2020_*.ndjson",
        }

        config = ProcessingConfig.from_dict(config_data)
        assert config.input_pattern == "./data/2020_*.ndjson"

    def test_config_from_dict_without_pattern(self):
        """Test creating a config from a dict without input_pattern (backward compatibility)."""
        config_data = {
            "input_file": "./data.parquet",
            "output_catalog": "./catalog",
            "scratch_location": "./scratch",
        }

        config = ProcessingConfig.from_dict(config_data)
        assert config.input_pattern == ""  # Default value
earthcatalog/tests/test_passthrough_hook.py
@@ -0,0 +1,196 @@
"""Tests for PassthroughSTACHook."""

import json

from earthcatalog.stac_hooks import (
    PassthroughSTACHook,
    parse_hook_config,
    serialize_hook,
)


class TestPassthroughSTACHook:
    """Tests for the passthrough STAC hook."""

    def test_fetch_valid_stac_item(self):
        """Test fetching a valid STAC item from a JSON string."""
        hook = PassthroughSTACHook()
        stac_json = json.dumps(
            {
                "type": "Feature",
                "id": "test_item",
                "geometry": {"type": "Point", "coordinates": [0, 0]},
                "properties": {"datetime": "2024-01-01T00:00:00Z"},
            }
        )

        result = hook.fetch(stac_json)

        assert result is not None
        assert result["id"] == "test_item"
        assert result["type"] == "Feature"

    def test_fetch_missing_type_field(self):
        """Test that a missing 'type' field returns None."""
        hook = PassthroughSTACHook()
        invalid_json = json.dumps({"id": "test", "geometry": {}, "properties": {}})

        result = hook.fetch(invalid_json)

        assert result is None

    def test_fetch_missing_geometry(self):
        """Test that a missing 'geometry' field returns None."""
        hook = PassthroughSTACHook()
        invalid_json = json.dumps({"type": "Feature", "id": "test", "properties": {}})

        result = hook.fetch(invalid_json)

        assert result is None

    def test_fetch_missing_properties(self):
        """Test that a missing 'properties' field returns None."""
        hook = PassthroughSTACHook()
        invalid_json = json.dumps({"type": "Feature", "id": "test", "geometry": {}})

        result = hook.fetch(invalid_json)

        assert result is None

    def test_fetch_invalid_json(self):
        """Test that an invalid JSON string returns None."""
        hook = PassthroughSTACHook()
        invalid_json = "not valid json"

        result = hook.fetch(invalid_json)

        assert result is None

    def test_fetch_non_dict_json(self):
        """Test that non-dict JSON returns None."""
        hook = PassthroughSTACHook()
        # JSON array instead of object
        invalid_json = json.dumps([{"id": "test"}])

        result = hook.fetch(invalid_json)

        assert result is None

    def test_fetch_batch(self):
        """Test batch fetching multiple STAC items."""
        hook = PassthroughSTACHook()

        stac_items = [
            json.dumps({"type": "Feature", "id": "item1", "geometry": {}, "properties": {}}),
            json.dumps({"type": "Feature", "id": "item2", "geometry": {}, "properties": {}}),
            json.dumps({"type": "Feature", "id": "item3", "geometry": {}, "properties": {}}),
        ]

        results = hook.fetch_batch(stac_items)

        assert len(results) == 3
        assert results[0]["id"] == "item1"
        assert results[1]["id"] == "item2"
        assert results[2]["id"] == "item3"

    def test_fetch_batch_with_invalid_items(self):
        """Test batch fetching with some invalid items."""
        hook = PassthroughSTACHook()

        items = [
            json.dumps({"type": "Feature", "id": "valid1", "geometry": {}, "properties": {}}),
            "invalid json",
            json.dumps({"type": "Feature", "id": "valid2", "geometry": {}, "properties": {}}),
            '{"type": "NotFeature", "geometry": {}, "properties": {}}',  # Wrong type
        ]

        results = hook.fetch_batch(items)

        assert len(results) == 4
        assert results[0]["id"] == "valid1"
        assert results[1] is None  # invalid json
        assert results[2]["id"] == "valid2"
        assert results[3] is None  # wrong type

    def test_to_config(self):
        """Test serialization to a config string."""
        hook = PassthroughSTACHook()

        config_str = hook.to_config()

        assert config_str == "passthrough"

    def test_parse_hook_config_passthrough(self):
        """Test that parsing 'passthrough' returns PassthroughSTACHook."""
        hook = parse_hook_config("passthrough")

        assert isinstance(hook, PassthroughSTACHook)

    def test_parse_hook_config_default(self):
        """Test that parsing 'default' returns DefaultSTACHook."""
        from earthcatalog.stac_hooks import DefaultSTACHook

        hook = parse_hook_config("default")

        assert isinstance(hook, DefaultSTACHook)
        assert not isinstance(hook, PassthroughSTACHook)

    def test_serialize_passthrough_hook(self):
        """Test serializing PassthroughSTACHook."""
        hook = PassthroughSTACHook()

        config_str = serialize_hook(hook)

        assert config_str == "passthrough"

    def test_serialize_and_parse_passthrough(self):
        """Test round-trip serialization for the passthrough hook."""
        original = PassthroughSTACHook()

        config_str = serialize_hook(original)
        restored = parse_hook_config(config_str)

        assert isinstance(restored, PassthroughSTACHook)


class TestPassthroughIntegration:
    """Integration tests for passthrough hook usage."""

    def test_passthrough_with_its_live_like_data(self):
        """Test the passthrough hook with ITS_LIVE-style bulk data."""
        hook = PassthroughSTACHook()

        # Simulate an ITS_LIVE NDJSON line containing a STAC item
        its_live_item = {
            "type": "Feature",
            "id": "ITS_LIVE_test_item",
"geometry": {
|
|
168
|
+
"type": "Polygon",
|
|
169
|
+
"coordinates": [[[[-180, -90], [180, -90], [180, 90], [-180, 90], [-180, -90]]]],
|
|
170
|
+
},
            "properties": {
                "datetime": "2020-01-01T00:00:00Z",
                "dataset_id": "TEST_DATASET",
            },
        }

        url = json.dumps(its_live_item)
        result = hook.fetch(url)

        assert result is not None
        assert result["id"] == "ITS_LIVE_test_item"
        assert result["properties"]["dataset_id"] == "TEST_DATASET"

    def test_passthrough_performance_skip_http(self):
        """Test that passthrough doesn't make HTTP requests."""
        hook = PassthroughSTACHook()

        # Even with timeout/retry params, they're ignored
        result = hook.fetch(
            '{"type": "Feature", "id": "test", "geometry": {}, "properties": {}}',
            timeout=999,  # Should be ignored
            retry_attempts=999,  # Should be ignored
        )

        assert result is not None
        assert result["id"] == "test"