earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,603 @@
1
+ # test_job_tracking.py
2
+ """Tests for job tracking module.
3
+
4
+ This module tests:
5
+ - JobManifest: Job state persistence and recovery detection
6
+ - JobLogger: Structured logging for ingestion jobs
7
+ """
8
+
9
+ import json
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+ import pytest
14
+
15
+ from earthcatalog.job_tracking import (
16
+ ConsolidationPhaseState,
17
+ DownloadPhaseState,
18
+ JobLogger,
19
+ JobManifest,
20
+ JobStatus,
21
+ )
22
+ from earthcatalog.storage_backends import LocalStorage
23
+
24
+
25
class TestJobStatus:
    """Test JobStatus enum."""

    def test_status_values(self):
        """JobStatus should have expected values."""
        # Table-driven check: each enum member must compare equal to its
        # string value (JobStatus presumably subclasses str — the original
        # test relies on the same comparison).
        expected = {
            JobStatus.PENDING: "pending",
            JobStatus.DOWNLOADING: "downloading",
            JobStatus.CONSOLIDATING: "consolidating",
            JobStatus.COMPLETED: "completed",
            JobStatus.FAILED: "failed",
        }
        for member, value in expected.items():
            assert member == value
37
class TestDownloadPhaseState:
    """Test DownloadPhaseState dataclass."""

    def test_default_values(self):
        """A freshly constructed state should carry sensible defaults."""
        fresh = DownloadPhaseState()
        assert fresh.completed is False
        assert fresh.shards_written == []
        # All counters start at zero.
        assert (
            fresh.batches_total,
            fresh.batches_completed,
            fresh.urls_processed,
            fresh.urls_failed,
        ) == (0, 0, 0, 0)

    def test_to_dict_from_dict_roundtrip(self):
        """Serializing then deserializing should preserve every field."""
        source = DownloadPhaseState(
            completed=True,
            batches_total=10,
            batches_completed=8,
            urls_processed=1000,
            urls_failed=5,
            shards_written=["shard1.parquet", "shard2.parquet"],
        )

        recovered = DownloadPhaseState.from_dict(source.to_dict())

        # Field-by-field comparison against the original instance.
        for name in (
            "completed",
            "batches_total",
            "batches_completed",
            "urls_processed",
            "urls_failed",
            "shards_written",
        ):
            assert getattr(recovered, name) == getattr(source, name)
72
class TestConsolidationPhaseState:
    """Test ConsolidationPhaseState dataclass."""

    def test_default_values(self):
        """A freshly constructed state should carry sensible defaults."""
        fresh = ConsolidationPhaseState()
        assert fresh.completed is False
        assert fresh.completed_partitions == []
        # Both partition counters start at zero.
        assert (fresh.partitions_total, fresh.partitions_completed) == (0, 0)

    def test_to_dict_from_dict_roundtrip(self):
        """Serializing then deserializing should preserve every field."""
        source = ConsolidationPhaseState(
            completed=True,
            partitions_total=5,
            partitions_completed=5,
            completed_partitions=["p1", "p2", "p3", "p4", "p5"],
        )

        recovered = ConsolidationPhaseState.from_dict(source.to_dict())

        # Field-by-field comparison against the original instance.
        for name in (
            "completed",
            "partitions_total",
            "partitions_completed",
            "completed_partitions",
        ):
            assert getattr(recovered, name) == getattr(source, name)
101
class TestJobManifest:
    """Test JobManifest dataclass."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a scratch directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield scratch

    @pytest.fixture
    def storage(self, temp_dir):
        """Provide a LocalStorage rooted at the scratch directory."""
        return LocalStorage(temp_dir)

    def test_create_new_manifest(self):
        """A freshly created manifest should carry correct defaults."""
        fresh = JobManifest(job_id="test-job-123", input_urls_count=1000)

        assert fresh.job_id == "test-job-123"
        assert fresh.status == JobStatus.PENDING
        assert fresh.input_urls_count == 1000
        assert fresh.created_at is not None
        # Both phase-state sub-objects must be initialized eagerly.
        assert fresh.download_phase is not None
        assert fresh.consolidation_phase is not None

    def test_to_dict_from_dict_roundtrip(self):
        """Serialization followed by deserialization should preserve state."""
        source = JobManifest(
            job_id="test-job-456",
            input_urls_count=5000,
            config_hash="abc123",
        )
        source.status = JobStatus.DOWNLOADING
        source.download_phase.batches_total = 10
        source.download_phase.batches_completed = 5

        clone = JobManifest.from_dict(source.to_dict())

        assert clone.job_id == source.job_id
        assert clone.status == source.status
        assert clone.input_urls_count == source.input_urls_count
        assert clone.config_hash == source.config_hash
        # Nested phase state must survive the roundtrip too.
        assert clone.download_phase.batches_total == 10
        assert clone.download_phase.batches_completed == 5

    def test_save_creates_manifest_file(self, storage, temp_dir):
        """save() should write manifest.json under jobs/<job_id>/."""
        JobManifest(job_id="save-test-job", input_urls_count=100).save(
            storage, temp_dir
        )

        written = Path(temp_dir) / "jobs" / "save-test-job" / "manifest.json"
        assert written.exists()

        # The persisted JSON must identify the job it belongs to.
        payload = json.loads(written.read_text())
        assert payload["job_id"] == "save-test-job"

    def test_load_reads_manifest_file(self, storage, temp_dir):
        """load() should round-trip a previously saved manifest."""
        saved = JobManifest(job_id="load-test-job", input_urls_count=200)
        saved.status = JobStatus.CONSOLIDATING
        saved.save(storage, temp_dir)

        fetched = JobManifest.load(storage, temp_dir, "load-test-job")

        assert fetched.job_id == "load-test-job"
        assert fetched.status == JobStatus.CONSOLIDATING
        assert fetched.input_urls_count == 200

    def test_load_raises_for_missing_manifest(self, storage, temp_dir):
        """load() should raise FileNotFoundError for missing manifests."""
        with pytest.raises(FileNotFoundError):
            JobManifest.load(storage, temp_dir, "nonexistent-job")

    def test_find_incomplete_returns_none_when_no_jobs(self, storage, temp_dir):
        """find_incomplete() should return None when no jobs directory exists."""
        assert JobManifest.find_incomplete(storage, temp_dir) is None

    def test_find_incomplete_returns_none_when_all_complete(self, storage, temp_dir):
        """find_incomplete() should return None when every job finished."""
        done = JobManifest(job_id="completed-job", input_urls_count=100)
        done.status = JobStatus.COMPLETED
        done.save(storage, temp_dir)

        assert JobManifest.find_incomplete(storage, temp_dir) is None

    def test_find_incomplete_finds_downloading_job(self, storage, temp_dir):
        """find_incomplete() should surface jobs stuck in DOWNLOADING."""
        stalled = JobManifest(job_id="incomplete-job", input_urls_count=100)
        stalled.status = JobStatus.DOWNLOADING
        stalled.save(storage, temp_dir)

        found = JobManifest.find_incomplete(storage, temp_dir)
        assert found is not None
        assert found.job_id == "incomplete-job"

    def test_find_incomplete_finds_consolidating_job(self, storage, temp_dir):
        """find_incomplete() should surface jobs stuck in CONSOLIDATING."""
        stalled = JobManifest(job_id="consolidating-job", input_urls_count=100)
        stalled.status = JobStatus.CONSOLIDATING
        stalled.save(storage, temp_dir)

        found = JobManifest.find_incomplete(storage, temp_dir)
        assert found is not None
        assert found.job_id == "consolidating-job"

    def test_manifest_path_property(self, temp_dir):
        """manifest_path() should compose jobs/<job_id>/manifest.json."""
        subject = JobManifest(job_id="path-test", input_urls_count=0)
        assert subject.manifest_path(temp_dir) == (
            f"{temp_dir}/jobs/path-test/manifest.json"
        )
240
class TestJobLogger:
    """Test JobLogger class."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a scratch directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield scratch

    @pytest.fixture
    def storage(self, temp_dir):
        """Provide a LocalStorage rooted at the scratch directory."""
        return LocalStorage(temp_dir)

    @staticmethod
    def _read_single_log(temp_dir):
        """Return the text of the single *.txt log file under jobs/logs/."""
        logs_dir = Path(temp_dir) / "jobs" / "logs"
        return next(logs_dir.glob("*.txt")).read_text()

    def test_logger_creates_log_file(self, storage, temp_dir):
        """JobLogger should create log file on first log."""
        JobLogger(storage, temp_dir, "test-job").log("INFO", "Test message")

        logs_dir = Path(temp_dir) / "jobs" / "logs"
        assert logs_dir.exists()
        # Exactly one log file should exist after the first log call.
        assert len(list(logs_dir.glob("*.txt"))) == 1

    def test_log_writes_message(self, storage, temp_dir):
        """log() should write level, message, and job id to the file."""
        JobLogger(storage, temp_dir, "test-job").log("INFO", "Hello world")

        text = self._read_single_log(temp_dir)
        for fragment in ("Hello world", "INFO", "test-job"):
            assert fragment in text

    def test_log_with_context(self, storage, temp_dir):
        """log() should include keyword context in the written message."""
        JobLogger(storage, temp_dir, "test-job").log(
            "WARNING", "Something happened", url="http://example.com", count=42
        )

        text = self._read_single_log(temp_dir)
        assert "Something happened" in text
        assert "url" in text
        assert "http://example.com" in text

    def test_log_phase_start(self, storage, temp_dir):
        """log_phase_start() should log phase beginning."""
        JobLogger(storage, temp_dir, "test-job").log_phase_start("download")

        text = self._read_single_log(temp_dir).lower()
        assert "download" in text
        # Wording is not pinned down; accept either verb.
        assert "start" in text or "begin" in text

    def test_log_phase_complete(self, storage, temp_dir):
        """log_phase_complete() should log phase completion with stats."""
        JobLogger(storage, temp_dir, "test-job").log_phase_complete(
            "download", {"urls_processed": 1000, "shards": 10}
        )

        text = self._read_single_log(temp_dir)
        assert "download" in text.lower()
        # Stats may be rendered as values or as key names.
        assert "1000" in text or "urls_processed" in text

    def test_log_error(self, storage, temp_dir):
        """log_error() should log error with context."""
        JobLogger(storage, temp_dir, "test-job").log_error(
            "Something failed", url="http://bad.com"
        )

        text = self._read_single_log(temp_dir)
        assert "ERROR" in text
        assert "Something failed" in text

    def test_log_appends_to_existing_file(self, storage, temp_dir):
        """Multiple log() calls should append to same file."""
        job_logger = JobLogger(storage, temp_dir, "test-job")
        job_logger.log("INFO", "First message")
        job_logger.log("INFO", "Second message")

        text = self._read_single_log(temp_dir)
        assert "First message" in text
        assert "Second message" in text
344
class TestPipelineResumeIntegration:
    """Test pipeline resume functionality with job tracking."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a scratch directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield scratch

    @pytest.fixture
    def storage(self, temp_dir):
        """Provide a LocalStorage rooted at the scratch directory."""
        return LocalStorage(temp_dir)

    def test_run_creates_job_manifest(self, temp_dir):
        """run() should create a job manifest."""
        from unittest.mock import patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
        )

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Stub out the network fetch so the run processes nothing.
        with patch.object(pipe, "_download_stac_item", return_value=None):
            pipe.run(job_id="test-manifest-job")

        # The manifest must exist on disk ...
        assert (catalog_dir / "jobs" / "test-manifest-job" / "manifest.json").exists()

        # ... and record the job as finished.
        store = LocalStorage(str(catalog_dir))
        record = JobManifest.load(store, str(catalog_dir), "test-manifest-job")
        assert record.status == JobStatus.COMPLETED

        proc.close()

    def test_run_marks_job_failed_on_exception(self, temp_dir):
        """run() should mark job as FAILED when exception occurs."""
        from unittest.mock import patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
            distributed=True,  # Force distributed mode to test _process_urls_distributed
        )

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Make the distributed download step blow up.
        boom = RuntimeError("Test error")
        with patch.object(pipe, "_process_urls_distributed", side_effect=boom):
            with pytest.raises(RuntimeError, match="Test error"):
                pipe.run(job_id="failed-job")

        # The manifest must record the failure and its message.
        store = LocalStorage(str(catalog_dir))
        record = JobManifest.load(store, str(catalog_dir), "failed-job")
        assert record.status == JobStatus.FAILED
        assert "Test error" in record.error

        proc.close()

    def test_resume_raises_when_no_incomplete_job(self, temp_dir):
        """run(resume=True) should raise ValueError if no incomplete job exists."""
        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"

        pd.DataFrame({"url": ["http://example.com/item.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(base / "catalog"),
            scratch_location=str(base / "scratch"),
        )

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        with pytest.raises(ValueError, match="No incomplete job found"):
            pipe.run(resume=True)

        proc.close()

    def test_resume_finds_and_uses_incomplete_job(self, temp_dir):
        """run(resume=True) should find and continue an incomplete job."""
        from unittest.mock import patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
        )

        # Seed the catalog with a half-finished job for resume to pick up.
        store = LocalStorage(str(catalog_dir))
        stalled = JobManifest(job_id="incomplete-job-to-resume", input_urls_count=1)
        stalled.status = JobStatus.DOWNLOADING
        stalled.download_phase.completed = False
        stalled.save(store, str(catalog_dir))

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Stub out the network fetch so the resumed run processes nothing.
        with patch.object(pipe, "_download_stac_item", return_value=None):
            pipe.run(resume=True)

        # The seeded job (not a new one) must have been driven to completion.
        resumed = JobManifest.load(store, str(catalog_dir), "incomplete-job-to-resume")
        assert resumed.status == JobStatus.COMPLETED

        proc.close()

    def test_resume_skips_completed_download_phase(self, temp_dir):
        """run(resume=True) should skip download phase if already completed."""
        from unittest.mock import MagicMock, patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
        )

        # Seed a job whose download phase already finished, with no shards.
        store = LocalStorage(str(catalog_dir))
        seeded = JobManifest(job_id="download-complete-job", input_urls_count=1)
        seeded.status = JobStatus.CONSOLIDATING
        seeded.download_phase.completed = True
        seeded.download_phase.shards_written = []  # No shards
        seeded.save(store, str(catalog_dir))

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Spy on the download entry point; resume must never invoke it.
        download_spy = MagicMock()
        with patch.object(pipe, "_process_urls_distributed", download_spy):
            pipe.run(resume=True)

        download_spy.assert_not_called()

        # The job should still be driven to completion.
        finished = JobManifest.load(store, str(catalog_dir), "download-complete-job")
        assert finished.status == JobStatus.COMPLETED

        proc.close()

    def test_consolidation_checkpoints_progress(self, storage, temp_dir):
        """Consolidation should checkpoint progress in manifest."""
        # Tested indirectly: mimic what _consolidate_shards records as it goes.
        tracker = JobManifest(job_id="checkpoint-test", input_urls_count=100)
        tracker.status = JobStatus.CONSOLIDATING
        tracker.consolidation_phase.partitions_total = 20

        # Pretend 15 of the 20 partitions have been consolidated so far.
        for idx in range(15):
            tracker.consolidation_phase.completed_partitions.append(f"partition_{idx}")
            tracker.consolidation_phase.partitions_completed += 1

        tracker.save(storage, temp_dir)

        # Reloading must see the checkpointed progress.
        reloaded = JobManifest.load(storage, temp_dir, "checkpoint-test")
        assert len(reloaded.consolidation_phase.completed_partitions) == 15
        assert reloaded.consolidation_phase.partitions_completed == 15