earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,603 @@
1
+ # test_job_tracking.py
2
+ """Tests for job tracking module.
3
+
4
+ This module tests:
5
+ - JobManifest: Job state persistence and recovery detection
6
+ - JobLogger: Structured logging for ingestion jobs
7
+ """
8
+
9
+ import json
10
+ import tempfile
11
+ from pathlib import Path
12
+
13
+ import pytest
14
+
15
+ from earthcatalog.job_tracking import (
16
+ ConsolidationPhaseState,
17
+ DownloadPhaseState,
18
+ JobLogger,
19
+ JobManifest,
20
+ JobStatus,
21
+ )
22
+ from earthcatalog.storage_backends import LocalStorage
23
+
24
+
25
class TestJobStatus:
    """Test JobStatus enum."""

    def test_status_values(self):
        """JobStatus should have expected values."""
        # Table-driven check: each enum member must compare equal to its
        # string value (JobStatus presumably subclasses str — the original
        # test relies on the same comparison).
        expected = {
            JobStatus.PENDING: "pending",
            JobStatus.DOWNLOADING: "downloading",
            JobStatus.CONSOLIDATING: "consolidating",
            JobStatus.COMPLETED: "completed",
            JobStatus.FAILED: "failed",
        }
        for member, value in expected.items():
            assert member == value
37
class TestDownloadPhaseState:
    """Test DownloadPhaseState dataclass."""

    def test_default_values(self):
        """A freshly constructed state should carry sensible defaults."""
        fresh = DownloadPhaseState()
        assert fresh.completed is False
        assert fresh.shards_written == []
        # All counters start at zero.
        assert (
            fresh.batches_total,
            fresh.batches_completed,
            fresh.urls_processed,
            fresh.urls_failed,
        ) == (0, 0, 0, 0)

    def test_to_dict_from_dict_roundtrip(self):
        """Serializing then deserializing should preserve every field."""
        source = DownloadPhaseState(
            completed=True,
            batches_total=10,
            batches_completed=8,
            urls_processed=1000,
            urls_failed=5,
            shards_written=["shard1.parquet", "shard2.parquet"],
        )

        recovered = DownloadPhaseState.from_dict(source.to_dict())

        # Field-by-field comparison against the original instance.
        for name in (
            "completed",
            "batches_total",
            "batches_completed",
            "urls_processed",
            "urls_failed",
            "shards_written",
        ):
            assert getattr(recovered, name) == getattr(source, name)
72
class TestConsolidationPhaseState:
    """Test ConsolidationPhaseState dataclass."""

    def test_default_values(self):
        """A freshly constructed state should carry sensible defaults."""
        fresh = ConsolidationPhaseState()
        assert fresh.completed is False
        assert fresh.completed_partitions == []
        # Both partition counters start at zero.
        assert (fresh.partitions_total, fresh.partitions_completed) == (0, 0)

    def test_to_dict_from_dict_roundtrip(self):
        """Serializing then deserializing should preserve every field."""
        source = ConsolidationPhaseState(
            completed=True,
            partitions_total=5,
            partitions_completed=5,
            completed_partitions=["p1", "p2", "p3", "p4", "p5"],
        )

        recovered = ConsolidationPhaseState.from_dict(source.to_dict())

        # Field-by-field comparison against the original instance.
        for name in (
            "completed",
            "partitions_total",
            "partitions_completed",
            "completed_partitions",
        ):
            assert getattr(recovered, name) == getattr(source, name)
101
class TestJobManifest:
    """Test JobManifest dataclass."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a scratch directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield scratch

    @pytest.fixture
    def storage(self, temp_dir):
        """Provide a LocalStorage rooted at the scratch directory."""
        return LocalStorage(temp_dir)

    def test_create_new_manifest(self):
        """A freshly created manifest should carry correct defaults."""
        fresh = JobManifest(job_id="test-job-123", input_urls_count=1000)

        assert fresh.job_id == "test-job-123"
        assert fresh.status == JobStatus.PENDING
        assert fresh.input_urls_count == 1000
        assert fresh.created_at is not None
        # Both phase-state sub-objects must be initialized eagerly.
        assert fresh.download_phase is not None
        assert fresh.consolidation_phase is not None

    def test_to_dict_from_dict_roundtrip(self):
        """Serialization followed by deserialization should preserve state."""
        source = JobManifest(
            job_id="test-job-456",
            input_urls_count=5000,
            config_hash="abc123",
        )
        source.status = JobStatus.DOWNLOADING
        source.download_phase.batches_total = 10
        source.download_phase.batches_completed = 5

        clone = JobManifest.from_dict(source.to_dict())

        assert clone.job_id == source.job_id
        assert clone.status == source.status
        assert clone.input_urls_count == source.input_urls_count
        assert clone.config_hash == source.config_hash
        # Nested phase state must survive the roundtrip too.
        assert clone.download_phase.batches_total == 10
        assert clone.download_phase.batches_completed == 5

    def test_save_creates_manifest_file(self, storage, temp_dir):
        """save() should write manifest.json under jobs/<job_id>/."""
        JobManifest(job_id="save-test-job", input_urls_count=100).save(
            storage, temp_dir
        )

        written = Path(temp_dir) / "jobs" / "save-test-job" / "manifest.json"
        assert written.exists()

        # The persisted JSON must identify the job it belongs to.
        payload = json.loads(written.read_text())
        assert payload["job_id"] == "save-test-job"

    def test_load_reads_manifest_file(self, storage, temp_dir):
        """load() should round-trip a previously saved manifest."""
        saved = JobManifest(job_id="load-test-job", input_urls_count=200)
        saved.status = JobStatus.CONSOLIDATING
        saved.save(storage, temp_dir)

        fetched = JobManifest.load(storage, temp_dir, "load-test-job")

        assert fetched.job_id == "load-test-job"
        assert fetched.status == JobStatus.CONSOLIDATING
        assert fetched.input_urls_count == 200

    def test_load_raises_for_missing_manifest(self, storage, temp_dir):
        """load() should raise FileNotFoundError for missing manifests."""
        with pytest.raises(FileNotFoundError):
            JobManifest.load(storage, temp_dir, "nonexistent-job")

    def test_find_incomplete_returns_none_when_no_jobs(self, storage, temp_dir):
        """find_incomplete() should return None when no jobs directory exists."""
        assert JobManifest.find_incomplete(storage, temp_dir) is None

    def test_find_incomplete_returns_none_when_all_complete(self, storage, temp_dir):
        """find_incomplete() should return None when every job finished."""
        done = JobManifest(job_id="completed-job", input_urls_count=100)
        done.status = JobStatus.COMPLETED
        done.save(storage, temp_dir)

        assert JobManifest.find_incomplete(storage, temp_dir) is None

    def test_find_incomplete_finds_downloading_job(self, storage, temp_dir):
        """find_incomplete() should surface jobs stuck in DOWNLOADING."""
        stalled = JobManifest(job_id="incomplete-job", input_urls_count=100)
        stalled.status = JobStatus.DOWNLOADING
        stalled.save(storage, temp_dir)

        found = JobManifest.find_incomplete(storage, temp_dir)
        assert found is not None
        assert found.job_id == "incomplete-job"

    def test_find_incomplete_finds_consolidating_job(self, storage, temp_dir):
        """find_incomplete() should surface jobs stuck in CONSOLIDATING."""
        stalled = JobManifest(job_id="consolidating-job", input_urls_count=100)
        stalled.status = JobStatus.CONSOLIDATING
        stalled.save(storage, temp_dir)

        found = JobManifest.find_incomplete(storage, temp_dir)
        assert found is not None
        assert found.job_id == "consolidating-job"

    def test_manifest_path_property(self, temp_dir):
        """manifest_path() should compose jobs/<job_id>/manifest.json."""
        subject = JobManifest(job_id="path-test", input_urls_count=0)
        assert subject.manifest_path(temp_dir) == (
            f"{temp_dir}/jobs/path-test/manifest.json"
        )
240
class TestJobLogger:
    """Test JobLogger class."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a scratch directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield scratch

    @pytest.fixture
    def storage(self, temp_dir):
        """Provide a LocalStorage rooted at the scratch directory."""
        return LocalStorage(temp_dir)

    @staticmethod
    def _read_single_log(temp_dir):
        """Return the text of the single *.txt log file under jobs/logs/."""
        logs_dir = Path(temp_dir) / "jobs" / "logs"
        return next(logs_dir.glob("*.txt")).read_text()

    def test_logger_creates_log_file(self, storage, temp_dir):
        """JobLogger should create log file on first log."""
        JobLogger(storage, temp_dir, "test-job").log("INFO", "Test message")

        logs_dir = Path(temp_dir) / "jobs" / "logs"
        assert logs_dir.exists()
        # Exactly one log file should exist after the first log call.
        assert len(list(logs_dir.glob("*.txt"))) == 1

    def test_log_writes_message(self, storage, temp_dir):
        """log() should write level, message, and job id to the file."""
        JobLogger(storage, temp_dir, "test-job").log("INFO", "Hello world")

        text = self._read_single_log(temp_dir)
        for fragment in ("Hello world", "INFO", "test-job"):
            assert fragment in text

    def test_log_with_context(self, storage, temp_dir):
        """log() should include keyword context in the written message."""
        JobLogger(storage, temp_dir, "test-job").log(
            "WARNING", "Something happened", url="http://example.com", count=42
        )

        text = self._read_single_log(temp_dir)
        assert "Something happened" in text
        assert "url" in text
        assert "http://example.com" in text

    def test_log_phase_start(self, storage, temp_dir):
        """log_phase_start() should log phase beginning."""
        JobLogger(storage, temp_dir, "test-job").log_phase_start("download")

        text = self._read_single_log(temp_dir).lower()
        assert "download" in text
        # Wording is not pinned down; accept either verb.
        assert "start" in text or "begin" in text

    def test_log_phase_complete(self, storage, temp_dir):
        """log_phase_complete() should log phase completion with stats."""
        JobLogger(storage, temp_dir, "test-job").log_phase_complete(
            "download", {"urls_processed": 1000, "shards": 10}
        )

        text = self._read_single_log(temp_dir)
        assert "download" in text.lower()
        # Stats may be rendered as values or as key names.
        assert "1000" in text or "urls_processed" in text

    def test_log_error(self, storage, temp_dir):
        """log_error() should log error with context."""
        JobLogger(storage, temp_dir, "test-job").log_error(
            "Something failed", url="http://bad.com"
        )

        text = self._read_single_log(temp_dir)
        assert "ERROR" in text
        assert "Something failed" in text

    def test_log_appends_to_existing_file(self, storage, temp_dir):
        """Multiple log() calls should append to same file."""
        job_logger = JobLogger(storage, temp_dir, "test-job")
        job_logger.log("INFO", "First message")
        job_logger.log("INFO", "Second message")

        text = self._read_single_log(temp_dir)
        assert "First message" in text
        assert "Second message" in text
344
class TestPipelineResumeIntegration:
    """Test pipeline resume functionality with job tracking."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a scratch directory that is removed after the test."""
        with tempfile.TemporaryDirectory() as scratch:
            yield scratch

    @pytest.fixture
    def storage(self, temp_dir):
        """Provide a LocalStorage rooted at the scratch directory."""
        return LocalStorage(temp_dir)

    def test_run_creates_job_manifest(self, temp_dir):
        """run() should create a job manifest."""
        from unittest.mock import patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
        )

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Stub out the network fetch so the run processes nothing.
        with patch.object(pipe, "_download_stac_item", return_value=None):
            pipe.run(job_id="test-manifest-job")

        # The manifest must exist on disk ...
        assert (catalog_dir / "jobs" / "test-manifest-job" / "manifest.json").exists()

        # ... and record the job as finished.
        store = LocalStorage(str(catalog_dir))
        record = JobManifest.load(store, str(catalog_dir), "test-manifest-job")
        assert record.status == JobStatus.COMPLETED

        proc.close()

    def test_run_marks_job_failed_on_exception(self, temp_dir):
        """run() should mark job as FAILED when exception occurs."""
        from unittest.mock import patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
            distributed=True,  # Force distributed mode to test _process_urls_distributed
        )

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Make the distributed download step blow up.
        boom = RuntimeError("Test error")
        with patch.object(pipe, "_process_urls_distributed", side_effect=boom):
            with pytest.raises(RuntimeError, match="Test error"):
                pipe.run(job_id="failed-job")

        # The manifest must record the failure and its message.
        store = LocalStorage(str(catalog_dir))
        record = JobManifest.load(store, str(catalog_dir), "failed-job")
        assert record.status == JobStatus.FAILED
        assert "Test error" in record.error

        proc.close()

    def test_resume_raises_when_no_incomplete_job(self, temp_dir):
        """run(resume=True) should raise ValueError if no incomplete job exists."""
        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"

        pd.DataFrame({"url": ["http://example.com/item.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(base / "catalog"),
            scratch_location=str(base / "scratch"),
        )

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        with pytest.raises(ValueError, match="No incomplete job found"):
            pipe.run(resume=True)

        proc.close()

    def test_resume_finds_and_uses_incomplete_job(self, temp_dir):
        """run(resume=True) should find and continue an incomplete job."""
        from unittest.mock import patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
        )

        # Seed the catalog with a half-finished job for resume to pick up.
        store = LocalStorage(str(catalog_dir))
        stalled = JobManifest(job_id="incomplete-job-to-resume", input_urls_count=1)
        stalled.status = JobStatus.DOWNLOADING
        stalled.download_phase.completed = False
        stalled.save(store, str(catalog_dir))

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Stub out the network fetch so the resumed run processes nothing.
        with patch.object(pipe, "_download_stac_item", return_value=None):
            pipe.run(resume=True)

        # The seeded job (not a new one) must have been driven to completion.
        resumed = JobManifest.load(store, str(catalog_dir), "incomplete-job-to-resume")
        assert resumed.status == JobStatus.COMPLETED

        proc.close()

    def test_resume_skips_completed_download_phase(self, temp_dir):
        """run(resume=True) should skip download phase if already completed."""
        from unittest.mock import MagicMock, patch

        import pandas as pd

        from earthcatalog.ingestion_pipeline import (
            LocalProcessor,
            ProcessingConfig,
            STACIngestionPipeline,
        )

        base = Path(temp_dir)
        parquet_in = base / "input.parquet"
        catalog_dir = base / "catalog"

        pd.DataFrame({"url": ["http://example.com/item1.json"]}).to_parquet(
            parquet_in, index=False
        )

        cfg = ProcessingConfig(
            input_file=str(parquet_in),
            output_catalog=str(catalog_dir),
            scratch_location=str(base / "scratch"),
            max_workers=1,
        )

        # Seed a job whose download phase already finished, with no shards.
        store = LocalStorage(str(catalog_dir))
        seeded = JobManifest(job_id="download-complete-job", input_urls_count=1)
        seeded.status = JobStatus.CONSOLIDATING
        seeded.download_phase.completed = True
        seeded.download_phase.shards_written = []  # No shards
        seeded.save(store, str(catalog_dir))

        proc = LocalProcessor(n_workers=1)
        pipe = STACIngestionPipeline(cfg, proc)

        # Spy on the download entry point; resume must never invoke it.
        download_spy = MagicMock()
        with patch.object(pipe, "_process_urls_distributed", download_spy):
            pipe.run(resume=True)

        download_spy.assert_not_called()

        # The job should still be driven to completion.
        finished = JobManifest.load(store, str(catalog_dir), "download-complete-job")
        assert finished.status == JobStatus.COMPLETED

        proc.close()

    def test_consolidation_checkpoints_progress(self, storage, temp_dir):
        """Consolidation should checkpoint progress in manifest."""
        # Tested indirectly: mimic what _consolidate_shards records as it goes.
        tracker = JobManifest(job_id="checkpoint-test", input_urls_count=100)
        tracker.status = JobStatus.CONSOLIDATING
        tracker.consolidation_phase.partitions_total = 20

        # Pretend 15 of the 20 partitions have been consolidated so far.
        for idx in range(15):
            tracker.consolidation_phase.completed_partitions.append(f"partition_{idx}")
            tracker.consolidation_phase.partitions_completed += 1

        tracker.save(storage, temp_dir)

        # Reloading must see the checkpointed progress.
        reloaded = JobManifest.load(storage, temp_dir, "checkpoint-test")
        assert len(reloaded.consolidation_phase.completed_partitions) == 15
        assert reloaded.consolidation_phase.partitions_completed == 15