earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,485 @@
+ # job_tracking.py
+ """Job tracking and recovery for STAC ingestion pipelines.
+
+ This module provides job state persistence and structured logging for the
+ ingestion pipeline, enabling:
+
+ 1. **Recovery from failures**: Resume interrupted jobs from the last checkpoint
+ 2. **Audit trail**: Structured logs for debugging and monitoring
+ 3. **Idempotency**: Detect and skip already-completed work
+
+ Architecture:
+     Jobs are tracked via manifest files stored in the catalog:
+     - {catalog}/jobs/{job_id}/manifest.json - Job state and progress
+     - {catalog}/jobs/logs/{date}-ingest.txt - Structured log entries
+
+ Example:
+     >>> from earthcatalog.job_tracking import JobManifest, JobLogger
+     >>> from earthcatalog.storage_backends import get_storage_backend
+     >>>
+     >>> storage = get_storage_backend("s3://bucket/catalog")
+     >>> manifest = JobManifest(job_id="job-123", input_urls_count=10000)
+     >>> manifest.status = JobStatus.DOWNLOADING
+     >>> manifest.save(storage, "s3://bucket/catalog")
+     >>>
+     >>> logger = JobLogger(storage, "s3://bucket/catalog", "job-123")
+     >>> logger.log("INFO", "Processing started", urls=10000)
+ """
+
+ from __future__ import annotations
+
+ import json
+ import logging
+ import os
+ from dataclasses import dataclass, field
+ from datetime import UTC, date, datetime
+ from enum import StrEnum
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from .storage_backends import StorageBackend
+
+ logger = logging.getLogger(__name__)
+
+
+ class JobStatus(StrEnum):
+     """Status of an ingestion job."""
+
+     PENDING = "pending"
+     DOWNLOADING = "downloading"
+     CONSOLIDATING = "consolidating"
+     COMPLETED = "completed"
+     FAILED = "failed"
+
+
+ @dataclass
+ class DownloadPhaseState:
+     """State of the download phase for recovery.
+
+     Tracks which URL batches have been processed so we can resume
+     from the last completed batch on failure.
+
+     Attributes:
+         completed: Whether the entire download phase is complete.
+         batches_total: Total number of URL batches to process.
+         batches_completed: Number of batches successfully completed.
+         urls_processed: Total URLs attempted (success + failure).
+         urls_failed: URLs that failed after all retries.
+         shards_written: List of shard file paths created.
+     """
+
+     completed: bool = False
+     batches_total: int = 0
+     batches_completed: int = 0
+     urls_processed: int = 0
+     urls_failed: int = 0
+     shards_written: list[str] = field(default_factory=list)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dictionary for JSON storage."""
+         return {
+             "completed": self.completed,
+             "batches_total": self.batches_total,
+             "batches_completed": self.batches_completed,
+             "urls_processed": self.urls_processed,
+             "urls_failed": self.urls_failed,
+             "shards_written": self.shards_written,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> DownloadPhaseState:
+         """Deserialize from dictionary."""
+         return cls(
+             completed=data.get("completed", False),
+             batches_total=data.get("batches_total", 0),
+             batches_completed=data.get("batches_completed", 0),
+             urls_processed=data.get("urls_processed", 0),
+             urls_failed=data.get("urls_failed", 0),
+             shards_written=data.get("shards_written", []),
+         )
+
+
+ @dataclass
+ class ConsolidationPhaseState:
+     """State of the consolidation phase for recovery.
+
+     Tracks which partitions have been consolidated so we can resume
+     from the last completed partition on failure.
+
+     Attributes:
+         completed: Whether the entire consolidation phase is complete.
+         partitions_total: Total number of partitions to consolidate.
+         partitions_completed: Number of partitions successfully consolidated.
+         completed_partitions: List of partition keys that are complete.
+     """
+
+     completed: bool = False
+     partitions_total: int = 0
+     partitions_completed: int = 0
+     completed_partitions: list[str] = field(default_factory=list)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dictionary for JSON storage."""
+         return {
+             "completed": self.completed,
+             "partitions_total": self.partitions_total,
+             "partitions_completed": self.partitions_completed,
+             "completed_partitions": self.completed_partitions,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> ConsolidationPhaseState:
+         """Deserialize from dictionary."""
+         return cls(
+             completed=data.get("completed", False),
+             partitions_total=data.get("partitions_total", 0),
+             partitions_completed=data.get("partitions_completed", 0),
+             completed_partitions=data.get("completed_partitions", []),
+         )
+
+
+ @dataclass
+ class JobManifest:
+     """Persistent job state for recovery and tracking.
+
+     The manifest tracks the complete state of an ingestion job, enabling:
+     - Resume from the exact point of failure
+     - Skip already-completed work on re-run
+     - Audit trail of job progress
+
+     Location: {catalog}/jobs/{job_id}/manifest.json
+
+     Attributes:
+         job_id: Unique identifier for this job (UUID).
+         input_urls_count: Total number of URLs to process.
+         status: Current job status.
+         config_hash: Hash of configuration for idempotency checking.
+         created_at: When the job was created.
+         updated_at: When the manifest was last updated.
+         error: Error message if job failed.
+         download_phase: State of download phase.
+         consolidation_phase: State of consolidation phase.
+
+     Example:
+         >>> manifest = JobManifest(job_id="abc-123", input_urls_count=10000)
+         >>> manifest.status = JobStatus.DOWNLOADING
+         >>> manifest.download_phase.batches_completed = 5
+         >>> manifest.save(storage, catalog_path)
+     """
+
+     job_id: str
+     input_urls_count: int
+     status: JobStatus = JobStatus.PENDING
+     config_hash: str = ""
+     created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+     updated_at: datetime = field(default_factory=lambda: datetime.now(UTC))
+     error: str = ""
+     download_phase: DownloadPhaseState = field(default_factory=DownloadPhaseState)
+     consolidation_phase: ConsolidationPhaseState = field(default_factory=ConsolidationPhaseState)
+
+     def manifest_path(self, catalog_path: str) -> str:
+         """Get the path to this job's manifest file.
+
+         Args:
+             catalog_path: Base catalog path.
+
+         Returns:
+             Full path to manifest.json file.
+         """
+         return f"{catalog_path}/jobs/{self.job_id}/manifest.json"
+
+     def to_dict(self) -> dict[str, Any]:
+         """Serialize to dictionary for JSON storage."""
+         return {
+             "job_id": self.job_id,
+             "input_urls_count": self.input_urls_count,
+             "status": self.status.value,
+             "config_hash": self.config_hash,
+             "created_at": self.created_at.isoformat(),
+             "updated_at": datetime.now(UTC).isoformat(),
+             "error": self.error,
+             "download_phase": self.download_phase.to_dict(),
+             "consolidation_phase": self.consolidation_phase.to_dict(),
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> JobManifest:
+         """Deserialize from dictionary."""
+         return cls(
+             job_id=data["job_id"],
+             input_urls_count=data.get("input_urls_count", 0),
+             status=JobStatus(data.get("status", "pending")),
+             config_hash=data.get("config_hash", ""),
+             created_at=datetime.fromisoformat(data["created_at"]),
+             updated_at=datetime.fromisoformat(data.get("updated_at", data["created_at"])),
+             error=data.get("error", ""),
+             download_phase=DownloadPhaseState.from_dict(data.get("download_phase", {})),
+             consolidation_phase=ConsolidationPhaseState.from_dict(data.get("consolidation_phase", {})),
+         )
+
+     def save(self, storage: StorageBackend, catalog_path: str) -> None:
+         """Save manifest to storage.
+
+         Creates the jobs directory if it doesn't exist.
+
+         Args:
+             storage: Storage backend to write to.
+             catalog_path: Base catalog path.
+         """
+         path = self.manifest_path(catalog_path)
+
+         # Ensure directory exists
+         storage.makedirs(Path(path).parent)
+
+         # Write manifest as JSON
+         content = json.dumps(self.to_dict(), indent=2).encode("utf-8")
+         with storage.open(path, "wb") as f:
+             f.write(content)
+
+         logger.debug(f"Saved manifest to {path}")
+
+     @classmethod
+     def load(cls, storage: StorageBackend, catalog_path: str, job_id: str) -> JobManifest:
+         """Load manifest from storage.
+
+         Args:
+             storage: Storage backend to read from.
+             catalog_path: Base catalog path.
+             job_id: Job ID to load.
+
+         Returns:
+             Loaded JobManifest.
+
+         Raises:
+             FileNotFoundError: If manifest doesn't exist.
+         """
+         path = f"{catalog_path}/jobs/{job_id}/manifest.json"
+
+         if not storage.exists(path):
+             raise FileNotFoundError(f"Manifest not found: {path}")
+
+         with storage.open(path, "rb") as f:
+             data = json.load(f)
+
+         return cls.from_dict(data)
+
+     @classmethod
+     def find_incomplete(cls, storage: StorageBackend, catalog_path: str) -> JobManifest | None:
+         """Find the most recent incomplete job for this catalog.
+
+         Scans the jobs directory for manifests with status other than
+         COMPLETED or FAILED, returning the most recently updated one.
+
+         Args:
+             storage: Storage backend to search.
+             catalog_path: Base catalog path.
+
+         Returns:
+             Most recent incomplete JobManifest, or None if no incomplete jobs.
+         """
+         jobs_dir = f"{catalog_path}/jobs"
+
+         # Check if jobs directory exists
+         if not storage.exists(jobs_dir):
+             return None
+
+         # List job directories (not files)
+         try:
+             job_dirs = storage.list_dirs(jobs_dir)
+         except OSError:
+             return None
+
+         incomplete_jobs: list[JobManifest] = []
+
+         for job_path in job_dirs:
+             # Extract job_id from path
+             job_id = Path(job_path).name
+
+             # Skip logs directory and other non-job items
+             if job_id == "logs" or not job_id:
+                 continue
+
+             try:
+                 manifest = cls.load(storage, catalog_path, job_id)
+                 if manifest.status in (JobStatus.DOWNLOADING, JobStatus.CONSOLIDATING):
+                     incomplete_jobs.append(manifest)
+             except (FileNotFoundError, json.JSONDecodeError, KeyError):
+                 continue
+
+         if not incomplete_jobs:
+             return None
+
+         # Return most recently updated
+         return max(incomplete_jobs, key=lambda m: m.updated_at)
+
+
+ class JobLogger:
+     """Structured logger for ingestion jobs.
+
+     Writes structured log entries to a date-based log file in the catalog.
+     Each entry includes timestamp, job ID, level, message, and optional context.
+
+     Location: {catalog}/jobs/logs/{date}-ingest.txt (local)
+               {catalog}/jobs/logs/{date}-{job_id}-{worker_id}.txt (S3, per-worker)
+
+     Log Format:
+         {timestamp} [{level}] job={job_id} - {message} {context}
+
+     Note:
+         For S3 storage, each worker writes to its own log file to avoid race
+         conditions from concurrent read-modify-write operations. Log files can
+         be merged post-processing if needed.
+
+     Example:
+         >>> logger = JobLogger(storage, catalog_path, "job-123")
+         >>> logger.log("INFO", "Processing started", urls=10000)
+         >>> logger.log_phase_start("download")
+         >>> logger.log_phase_complete("download", {"urls": 10000, "shards": 50})
+         >>> logger.log_error("Failed to fetch URL", url="http://example.com")
+     """
+
+     def __init__(
+         self,
+         storage: StorageBackend,
+         catalog_path: str,
+         job_id: str,
+         worker_id: str | None = None,
+     ) -> None:
+         """Initialize the logger.
+
+         Args:
+             storage: Storage backend for writing logs.
+             catalog_path: Base catalog path.
+             job_id: Job ID to include in log entries.
+             worker_id: Optional worker ID for per-worker S3 log files.
+                 If not provided, uses process ID for S3 logs.
+         """
+         self.storage = storage
+         self.catalog_path = catalog_path
+         self.job_id = job_id
+         self.worker_id = worker_id or str(os.getpid())
+         self._log_path: str | None = None
+         self._is_s3 = catalog_path.startswith("s3://")
+
+     @property
+     def log_path(self) -> str:
+         """Get the path to today's log file.
+
+         For S3 storage, returns a per-worker log file to avoid race conditions.
+         For local storage, returns a shared log file.
+         """
+         if self._log_path is None:
+             today = date.today().isoformat()
+             if self._is_s3:
+                 # Per-worker log file to avoid race conditions
+                 self._log_path = f"{self.catalog_path}/jobs/logs/{today}-{self.job_id}-{self.worker_id}.txt"
+             else:
+                 # Shared log file for local storage (append is atomic)
+                 self._log_path = f"{self.catalog_path}/jobs/logs/{today}-ingest.txt"
+         return self._log_path
+
+     def _ensure_log_dir(self) -> None:
+         """Ensure the logs directory exists."""
+         self.storage.makedirs(Path(self.log_path).parent)
+
+     def _format_entry(self, level: str, message: str, **context: Any) -> str:
+         """Format a log entry.
+
+         Args:
+             level: Log level (INFO, WARNING, ERROR, etc.).
+             message: Log message.
+             **context: Additional context key-value pairs.
+
+         Returns:
+             Formatted log line with newline.
+         """
+         timestamp = datetime.now(UTC).isoformat()
+         entry = f"{timestamp} [{level}] job={self.job_id} - {message}"
+
+         if context:
+             ctx_str = " ".join(f"{k}={v}" for k, v in context.items())
+             entry = f"{entry} | {ctx_str}"
+
+         return entry + "\n"
+
+     def log(self, level: str, message: str, **context: Any) -> None:
+         """Write a log entry.
+
+         Args:
+             level: Log level (INFO, WARNING, ERROR, etc.).
+             message: Log message.
+             **context: Additional context to include.
+         """
+         self._ensure_log_dir()
+
+         entry = self._format_entry(level, message, **context)
+
+         # Append to log file
+         # For S3, we need to read-append-write since S3 doesn't support append
+         if self.log_path.startswith("s3://"):
+             self._append_s3(entry)
+         else:
+             self._append_local(entry)
+
+     def _append_local(self, entry: str) -> None:
+         """Append entry to local file."""
+         with open(self.log_path, "a") as f:
+             f.write(entry)
+
+     def _append_s3(self, entry: str) -> None:
+         """Append entry to S3 file (read-modify-write).
+
+         Note: Each worker uses its own log file (per worker_id) to avoid race
+         conditions from concurrent read-modify-write operations. Log files can
+         be merged post-processing using: cat logs/{date}-{job_id}-*.txt > combined.txt
+         """
+         try:
+             if self.storage.exists(self.log_path):
+                 with self.storage.open(self.log_path, "rb") as f:
+                     existing = f.read().decode("utf-8")
+             else:
+                 existing = ""
+
+             content = existing + entry
+             with self.storage.open(self.log_path, "wb") as f:
+                 f.write(content.encode("utf-8"))
+         except OSError as e:
+             # Log to Python logger if storage logging fails
+             logger.error(f"Failed to write to job log: {e}")
+
+     def log_phase_start(self, phase: str) -> None:
+         """Log the start of a processing phase.
+
+         Args:
+             phase: Phase name (e.g., "download", "consolidation").
+         """
+         self.log("INFO", f"Phase '{phase}' started")
+
+     def log_phase_complete(self, phase: str, stats: dict[str, Any]) -> None:
+         """Log the completion of a processing phase.
+
+         Args:
+             phase: Phase name.
+             stats: Statistics dictionary to include.
+         """
+         stats_str = ", ".join(f"{k}={v}" for k, v in stats.items())
+         self.log("INFO", f"Phase '{phase}' completed: {stats_str}")
+
+     def log_error(self, message: str, **context: Any) -> None:
+         """Log an error.
+
+         Args:
+             message: Error message.
+             **context: Additional context.
+         """
+         self.log("ERROR", message, **context)
+
+
+ __all__ = [
+     "JobStatus",
+     "DownloadPhaseState",
+     "ConsolidationPhaseState",
+     "JobManifest",
+     "JobLogger",
+ ]
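
Taken together, the classes in this file describe a checkpoint/resume loop: a pipeline looks for an interrupted manifest, resumes or creates a job, and logs phase transitions as it goes. The following is a minimal sketch of that flow, not code from the package; it uses only the APIs shown in this diff plus the get_storage_backend helper referenced in the module docstring, and the catalog location and batch-processing step are placeholders.

import uuid

from earthcatalog.job_tracking import JobLogger, JobManifest, JobStatus
from earthcatalog.storage_backends import get_storage_backend

catalog = "s3://bucket/catalog"  # placeholder catalog location
storage = get_storage_backend(catalog)

# Resume the most recent interrupted job for this catalog, or start a fresh one.
manifest = JobManifest.find_incomplete(storage, catalog)
if manifest is None:
    manifest = JobManifest(job_id=str(uuid.uuid4()), input_urls_count=10_000)

job_log = JobLogger(storage, catalog, manifest.job_id)

if not manifest.download_phase.completed:
    manifest.status = JobStatus.DOWNLOADING
    manifest.save(storage, catalog)
    job_log.log_phase_start("download")
    # ... process URL batches here, updating manifest.download_phase and
    # calling manifest.save() after each batch so a crash can resume ...
    manifest.download_phase.completed = True
    manifest.save(storage, catalog)
    job_log.log_phase_complete(
        "download", {"urls": manifest.download_phase.urls_processed}
    )

manifest.status = JobStatus.COMPLETED
manifest.save(storage, catalog)
job_log.log("INFO", "Job finished")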