earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,485 @@
+ # job_tracking.py
+ """Job tracking and recovery for STAC ingestion pipelines.
+
+ This module provides job state persistence and structured logging for the
+ ingestion pipeline, enabling:
+
+ 1. **Recovery from failures**: Resume interrupted jobs from the last checkpoint
+ 2. **Audit trail**: Structured logs for debugging and monitoring
+ 3. **Idempotency**: Detect and skip already-completed work
+
+ Architecture:
+     Jobs are tracked via manifest files stored in the catalog:
+     - {catalog}/jobs/{job_id}/manifest.json - Job state and progress
+     - {catalog}/jobs/logs/{date}-ingest.txt - Structured log entries
+
+ Example:
+     >>> from earthcatalog.job_tracking import JobManifest, JobLogger
+     >>> from earthcatalog.storage_backends import get_storage_backend
+     >>>
+     >>> storage = get_storage_backend("s3://bucket/catalog")
+     >>> manifest = JobManifest(job_id="job-123", input_urls_count=10000)
+     >>> manifest.status = JobStatus.DOWNLOADING
+     >>> manifest.save(storage, "s3://bucket/catalog")
+     >>>
+     >>> logger = JobLogger(storage, "s3://bucket/catalog", "job-123")
+     >>> logger.log("INFO", "Processing started", urls=10000)
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ from dataclasses import dataclass, field
+ from datetime import UTC, date, datetime
+ from enum import StrEnum
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from .storage_backends import StorageBackend
+
+ logger = logging.getLogger(__name__)
+
+
+ class JobStatus(StrEnum):
+     """Status of an ingestion job."""
+
+     PENDING = "pending"
+     DOWNLOADING = "downloading"
+     CONSOLIDATING = "consolidating"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+
+ @dataclass
+ class DownloadPhaseState:
+     """State of the download phase for recovery.
+
+     Tracks which URL batches have been processed so we can resume
+     from the last completed batch on failure.
+
+     Attributes:
+         completed: Whether the entire download phase is complete.
+         batches_total: Total number of URL batches to process.
+         batches_completed: Number of batches successfully completed.
+         urls_processed: Total URLs attempted (success + failure).
+         urls_failed: URLs that failed after all retries.
+         shards_written: List of shard file paths created.
+     """
+
+     completed: bool = False
+     batches_total: int = 0
+     batches_completed: int = 0
+     urls_processed: int = 0
+     urls_failed: int = 0
+     shards_written: list[str] = field(default_factory=list)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dictionary for JSON storage."""
+         return {
+             "completed": self.completed,
+             "batches_total": self.batches_total,
+             "batches_completed": self.batches_completed,
+             "urls_processed": self.urls_processed,
+             "urls_failed": self.urls_failed,
+             "shards_written": self.shards_written,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> DownloadPhaseState:
+         """Deserialize from dictionary."""
+         return cls(
+             completed=data.get("completed", False),
+             batches_total=data.get("batches_total", 0),
+             batches_completed=data.get("batches_completed", 0),
+             urls_processed=data.get("urls_processed", 0),
+             urls_failed=data.get("urls_failed", 0),
+             shards_written=data.get("shards_written", []),
+         )
+
+
+ @dataclass
+ class ConsolidationPhaseState:
+     """State of the consolidation phase for recovery.
+
+     Tracks which partitions have been consolidated so we can resume
+     from the last completed partition on failure.
+
+     Attributes:
+         completed: Whether the entire consolidation phase is complete.
+         partitions_total: Total number of partitions to consolidate.
+         partitions_completed: Number of partitions successfully consolidated.
+         completed_partitions: List of partition keys that are complete.
+     """
+
+     completed: bool = False
+     partitions_total: int = 0
+     partitions_completed: int = 0
+     completed_partitions: list[str] = field(default_factory=list)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dictionary for JSON storage."""
+         return {
+             "completed": self.completed,
+             "partitions_total": self.partitions_total,
+             "partitions_completed": self.partitions_completed,
+             "completed_partitions": self.completed_partitions,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ConsolidationPhaseState:
+         """Deserialize from dictionary."""
+         return cls(
+             completed=data.get("completed", False),
+             partitions_total=data.get("partitions_total", 0),
+             partitions_completed=data.get("partitions_completed", 0),
+             completed_partitions=data.get("completed_partitions", []),
+         )
+
+
+ @dataclass
+ class JobManifest:
+     """Persistent job state for recovery and tracking.
+
+     The manifest tracks the complete state of an ingestion job, enabling:
+     - Resume from the exact point of failure
+     - Skip already-completed work on re-run
+     - Audit trail of job progress
+
+     Location: {catalog}/jobs/{job_id}/manifest.json
+
+     Attributes:
+         job_id: Unique identifier for this job (UUID).
+         input_urls_count: Total number of URLs to process.
+         status: Current job status.
+         config_hash: Hash of configuration for idempotency checking.
+         created_at: When the job was created.
+         updated_at: When the manifest was last updated.
+         error: Error message if job failed.
+         download_phase: State of download phase.
+         consolidation_phase: State of consolidation phase.
+
+     Example:
+         >>> manifest = JobManifest(job_id="abc-123", input_urls_count=10000)
+         >>> manifest.status = JobStatus.DOWNLOADING
+         >>> manifest.download_phase.batches_completed = 5
+         >>> manifest.save(storage, catalog_path)
+     """
+
+     job_id: str
+     input_urls_count: int
+     status: JobStatus = JobStatus.PENDING
+     config_hash: str = ""
+     created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+     updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+     error: str = ""
+     download_phase: DownloadPhaseState = field(default_factory=DownloadPhaseState)
+     consolidation_phase: ConsolidationPhaseState = field(default_factory=ConsolidationPhaseState)
+
+     def manifest_path(self, catalog_path: str) -> str:
+         """Get the path to this job's manifest file.
+
+         Args:
+             catalog_path: Base catalog path.
+
+         Returns:
+             Full path to manifest.json file.
+         """
+         return f"{catalog_path}/jobs/{self.job_id}/manifest.json"
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dictionary for JSON storage."""
+         return {
+             "job_id": self.job_id,
+             "input_urls_count": self.input_urls_count,
+             "status": self.status.value,
+             "config_hash": self.config_hash,
+             "created_at": self.created_at.isoformat(),
+             "updated_at": datetime.now(UTC).isoformat(),
+             "error": self.error,
+             "download_phase": self.download_phase.to_dict(),
+             "consolidation_phase": self.consolidation_phase.to_dict(),
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> JobManifest:
+         """Deserialize from dictionary."""
+         return cls(
+             job_id=data["job_id"],
+             input_urls_count=data.get("input_urls_count", 0),
+             status=JobStatus(data.get("status", "pending")),
+             config_hash=data.get("config_hash", ""),
+             created_at=datetime.fromisoformat(data["created_at"]),
+             updated_at=datetime.fromisoformat(data.get("updated_at", data["created_at"])),
+             error=data.get("error", ""),
+             download_phase=DownloadPhaseState.from_dict(data.get("download_phase", {})),
+             consolidation_phase=ConsolidationPhaseState.from_dict(data.get("consolidation_phase", {})),
+         )
+
+     def save(self, storage: StorageBackend, catalog_path: str) -> None:
+         """Save manifest to storage.
+
+         Creates the jobs directory if it doesn't exist.
+
+         Args:
+             storage: Storage backend to write to.
+             catalog_path: Base catalog path.
+         """
+         path = self.manifest_path(catalog_path)
+
+         # Ensure directory exists
+         storage.makedirs(Path(path).parent)
+
+         # Write manifest as JSON
+         content = json.dumps(self.to_dict(), indent=2).encode("utf-8")
+         with storage.open(path, "wb") as f:
+             f.write(content)
+
+         logger.debug(f"Saved manifest to {path}")
+
+     @classmethod
+     def load(cls, storage: StorageBackend, catalog_path: str, job_id: str) -> JobManifest:
+         """Load manifest from storage.
+
+         Args:
+             storage: Storage backend to read from.
+             catalog_path: Base catalog path.
+             job_id: Job ID to load.
+
+         Returns:
+             Loaded JobManifest.
+
+         Raises:
+             FileNotFoundError: If manifest doesn't exist.
+         """
+         path = f"{catalog_path}/jobs/{job_id}/manifest.json"
+
+         if not storage.exists(path):
+             raise FileNotFoundError(f"Manifest not found: {path}")
+
+         with storage.open(path, "rb") as f:
+             data = json.load(f)
+
+         return cls.from_dict(data)
+
+     @classmethod
+     def find_incomplete(cls, storage: StorageBackend, catalog_path: str) -> JobManifest | None:
+         """Find the most recent incomplete job for this catalog.
+
+         Scans the jobs directory for manifests with status other than
+         COMPLETED or FAILED, returning the most recently updated one.
+
+         Args:
+             storage: Storage backend to search.
+             catalog_path: Base catalog path.
+
+         Returns:
+             Most recent incomplete JobManifest, or None if no incomplete jobs.
+         """
+         jobs_dir = f"{catalog_path}/jobs"
+
+         # Check if jobs directory exists
+         if not storage.exists(jobs_dir):
+             return None
+
+         # List job directories (not files)
+         try:
+             job_dirs = storage.list_dirs(jobs_dir)
+         except OSError:
+             return None
+
+         incomplete_jobs: list[JobManifest] = []
+
+         for job_path in job_dirs:
+             # Extract job_id from path
+             job_id = Path(job_path).name
+
+             # Skip logs directory and other non-job items
+             if job_id == "logs" or not job_id:
+                 continue
+
+             try:
+                 manifest = cls.load(storage, catalog_path, job_id)
+                 if manifest.status in (JobStatus.DOWNLOADING, JobStatus.CONSOLIDATING):
+                     incomplete_jobs.append(manifest)
+             except (FileNotFoundError, json.JSONDecodeError, KeyError):
+                 continue
+
+         if not incomplete_jobs:
+             return None
+
+         # Return most recently updated
+         return max(incomplete_jobs, key=lambda m: m.updated_at)
+
+
+ class JobLogger:
+     """Structured logger for ingestion jobs.
+
+     Writes structured log entries to a date-based log file in the catalog.
+     Each entry includes timestamp, job ID, level, message, and optional context.
+
+     Location: {catalog}/jobs/logs/{date}-ingest.txt (local)
+               {catalog}/jobs/logs/{date}-{job_id}-{worker_id}.txt (S3, per-worker)
+
+     Log Format:
+         {timestamp} [{level}] job={job_id} - {message} {context}
+
+     Note:
+         For S3 storage, each worker writes to its own log file to avoid race
+         conditions from concurrent read-modify-write operations. Log files can
+         be merged post-processing if needed.
+
+     Example:
+         >>> logger = JobLogger(storage, catalog_path, "job-123")
+         >>> logger.log("INFO", "Processing started", urls=10000)
+         >>> logger.log_phase_start("download")
+         >>> logger.log_phase_complete("download", {"urls": 10000, "shards": 50})
+         >>> logger.log_error("Failed to fetch URL", url="http://example.com")
+     """
+
+     def __init__(
+         self,
+         storage: StorageBackend,
+         catalog_path: str,
+         job_id: str,
+         worker_id: str | None = None,
+     ) -> None:
+         """Initialize the logger.
+
+         Args:
+             storage: Storage backend for writing logs.
+             catalog_path: Base catalog path.
+             job_id: Job ID to include in log entries.
+             worker_id: Optional worker ID for per-worker S3 log files.
+                 If not provided, uses process ID for S3 logs.
+         """
+         self.storage = storage
+         self.catalog_path = catalog_path
+         self.job_id = job_id
+         self.worker_id = worker_id or str(os.getpid())
+         self._log_path: str | None = None
+         self._is_s3 = catalog_path.startswith("s3://")
+
+     @property
+     def log_path(self) -> str:
+         """Get the path to today's log file.
+
+         For S3 storage, returns a per-worker log file to avoid race conditions.
+         For local storage, returns a shared log file.
+         """
+         if self._log_path is None:
+             today = date.today().isoformat()
+             if self._is_s3:
+                 # Per-worker log file to avoid race conditions
+                 self._log_path = f"{self.catalog_path}/jobs/logs/{today}-{self.job_id}-{self.worker_id}.txt"
+             else:
+                 # Shared log file for local storage (append is atomic)
+                 self._log_path = f"{self.catalog_path}/jobs/logs/{today}-ingest.txt"
+         return self._log_path
+
+     def _ensure_log_dir(self) -> None:
+         """Ensure the logs directory exists."""
+         self.storage.makedirs(Path(self.log_path).parent)
+
+     def _format_entry(self, level: str, message: str, **context: Any) -> str:
+         """Format a log entry.
+
+         Args:
+             level: Log level (INFO, WARNING, ERROR, etc.).
+             message: Log message.
+             **context: Additional context key-value pairs.
+
+         Returns:
+             Formatted log line with newline.
+         """
+         timestamp = datetime.now(UTC).isoformat()
+         entry = f"{timestamp} [{level}] job={self.job_id} - {message}"
+
+         if context:
+             ctx_str = " ".join(f"{k}={v}" for k, v in context.items())
+             entry = f"{entry} | {ctx_str}"
+
+         return entry + "\n"
+
+     def log(self, level: str, message: str, **context: Any) -> None:
+         """Write a log entry.
+
+         Args:
+             level: Log level (INFO, WARNING, ERROR, etc.).
+             message: Log message.
+             **context: Additional context to include.
+         """
+         self._ensure_log_dir()
+
+         entry = self._format_entry(level, message, **context)
+
+         # Append to log file
+         # For S3, we need to read-append-write since S3 doesn't support append
+         if self.log_path.startswith("s3://"):
+             self._append_s3(entry)
+         else:
+             self._append_local(entry)
+
+     def _append_local(self, entry: str) -> None:
+         """Append entry to local file."""
+         with open(self.log_path, "a") as f:
+             f.write(entry)
+
+     def _append_s3(self, entry: str) -> None:
+         """Append entry to S3 file (read-modify-write).
+
+         Note: Each worker uses its own log file (per worker_id) to avoid race
+         conditions from concurrent read-modify-write operations. Log files can
+         be merged post-processing using: cat logs/{date}-{job_id}-*.txt > combined.txt
+         """
+         try:
+             if self.storage.exists(self.log_path):
+                 with self.storage.open(self.log_path, "rb") as f:
+                     existing = f.read().decode("utf-8")
+             else:
+                 existing = ""
+
+             content = existing + entry
+             with self.storage.open(self.log_path, "wb") as f:
+                 f.write(content.encode("utf-8"))
+         except OSError as e:
+             # Log to Python logger if storage logging fails
+             logger.error(f"Failed to write to job log: {e}")
+
+     def log_phase_start(self, phase: str) -> None:
+         """Log the start of a processing phase.
+
+         Args:
+             phase: Phase name (e.g., "download", "consolidation").
+         """
+         self.log("INFO", f"Phase '{phase}' started")
+
+     def log_phase_complete(self, phase: str, stats: dict[str, Any]) -> None:
+         """Log the completion of a processing phase.
+
+         Args:
+             phase: Phase name.
+             stats: Statistics dictionary to include.
+         """
+         stats_str = ", ".join(f"{k}={v}" for k, v in stats.items())
+         self.log("INFO", f"Phase '{phase}' completed: {stats_str}")
+
+     def log_error(self, message: str, **context: Any) -> None:
+         """Log an error.
+
+         Args:
+             message: Error message.
+             **context: Additional context.
+         """
+         self.log("ERROR", message, **context)
+
+
+ __all__ = [
+     "JobStatus",
+     "DownloadPhaseState",
+     "ConsolidationPhaseState",
+     "JobManifest",
+     "JobLogger",
+ ]
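
Taken together, the classes in this file describe a checkpoint/resume loop: a pipeline looks for an interrupted manifest, resumes or creates a job, and logs phase transitions as it goes. The following is a minimal sketch of that flow, not code from the package; it uses only the APIs shown in this diff plus the get_storage_backend helper referenced in the module docstring, and the catalog location and batch-processing step are placeholders.

import uuid

from earthcatalog.job_tracking import JobLogger, JobManifest, JobStatus
from earthcatalog.storage_backends import get_storage_backend

catalog = "s3://bucket/catalog"  # placeholder catalog location
storage = get_storage_backend(catalog)

# Resume the most recent interrupted job for this catalog, or start a fresh one.
manifest = JobManifest.find_incomplete(storage, catalog)
if manifest is None:
    manifest = JobManifest(job_id=str(uuid.uuid4()), input_urls_count=10_000)

job_log = JobLogger(storage, catalog, manifest.job_id)

if not manifest.download_phase.completed:
    manifest.status = JobStatus.DOWNLOADING
    manifest.save(storage, catalog)
    job_log.log_phase_start("download")
    # ... process URL batches here, updating manifest.download_phase and
    # calling manifest.save() after each batch so a crash can resume ...
    manifest.download_phase.completed = True
    manifest.save(storage, catalog)
    job_log.log_phase_complete(
        "download", {"urls": manifest.download_phase.urls_processed}
    )

manifest.status = JobStatus.COMPLETED
manifest.save(storage, catalog)
job_log.log("INFO", "Job finished")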