earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
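The new test module in the diff below documents two ways to run its Dask suite: an ephemeral LocalCluster created when DASK_TESTING=TRUE is set, or an existing cluster addressed via DASK_SCHEDULER_ADDRESS. The snippet here is only an illustrative convenience runner for the first mode, not part of the wheel; the test path and the "dask" marker are taken from the diff, and pytest plus dask[distributed] are assumed to be installed.

import os
import sys

import pytest

# Hypothetical runner, not shipped in the wheel: mirrors the DASK_TESTING=TRUE
# workflow described in the test docstrings below. The fixture reads this
# variable at runtime and creates/destroys the ephemeral LocalCluster itself.
os.environ["DASK_TESTING"] = "TRUE"

# "-m dask" selects the @pytest.mark.dask tests; adjust the path to wherever
# the installed test module lives in your environment.
sys.exit(pytest.main(["-v", "-m", "dask", "earthcatalog/tests/test_dask_integration.py"]))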
@@ -0,0 +1,580 @@
1
+ """Dask integration tests for STAC ingestion pipeline with multi-file input and passthrough hook."""
2
+
3
+ import json
4
+ import os
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from earthcatalog.ingestion_pipeline import ProcessingConfig
10
+ from earthcatalog.stac_hooks import PassthroughSTACHook
11
+
12
+ # =============================================================================
13
+ # Module-level fixtures (shared across all test classes)
14
+ # =============================================================================
15
+
16
+
17
+ @pytest.fixture
18
+ def dask_scheduler_address():
19
+ """Get Dask scheduler address from environment or create ephemeral cluster.
20
+
21
+ Priority:
22
+ 1. If DASK_TESTING=TRUE: Create ephemeral local cluster (auto-destroyed after tests)
23
+ 2. If DASK_SCHEDULER_ADDRESS set: Use that address
24
+ 3. Otherwise: Skip tests
25
+
26
+ To enable auto-created ephemeral cluster (recommended for development):
27
+ export DASK_TESTING=TRUE
28
+ pytest earthcatalog/tests/test_dask_integration.py -v
29
+
30
+ To use existing cluster:
31
+ export DASK_SCHEDULER_ADDRESS=localhost:8786
32
+ pytest earthcatalog/tests/test_dask_integration.py -v
33
+ """
34
+ # Check for auto-testing mode first
35
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
36
+ try:
37
+ from dask.distributed import LocalCluster
38
+ except ImportError:
39
+ pytest.skip("Dask distributed not installed: pip install 'dask[distributed]'")
40
+
41
+ # Create ephemeral cluster with minimal workers for testing
42
+ cluster = LocalCluster(
43
+ n_workers=2,
44
+ threads_per_worker=1,
45
+ processes=True,
46
+ silence_logs=False,
47
+ dashboard_address=None, # Disable dashboard for tests
48
+ )
49
+
50
+ # Get the scheduler address
51
+ scheduler_address = cluster.scheduler_address
52
+
53
+ print(f"\n[Ephemeral Dask cluster created at {scheduler_address}]")
54
+
55
+ # Yield the address and cleanup after tests
56
+ yield scheduler_address
57
+
58
+ # Cleanup: close cluster
59
+ print("\n[Destroying ephemeral Dask cluster...]")
60
+ cluster.close()
61
+ return
62
+
63
+ # Fall back to manual scheduler address configuration
64
+ scheduler = os.environ.get("DASK_SCHEDULER_ADDRESS")
65
+ if not scheduler:
66
+ pytest.skip(
67
+ "DASK_SCHEDULER_ADDRESS not set. "
68
+ "Set DASK_TESTING=TRUE to auto-create ephemeral cluster, "
69
+ "or set DASK_SCHEDULER_ADDRESS to connect to existing cluster."
70
+ )
71
+ yield scheduler
72
+
73
+
74
+ @pytest.fixture
75
+ def synthetic_bulk_data(tmp_path: Path):
76
+ """Create synthetic bulk data for Dask testing.
77
+
78
+ Creates 10 NDJSON files with 20 STAC items each (200 total items).
79
+ Files follow the ITS_LIVE pattern: {year}_{chunk_no}.ndjson
80
+ """
81
+ bulk_dir = tmp_path / "dask_bulk"
82
+ bulk_dir.mkdir()
83
+
84
+ # Create 10 files with 20 items each
85
+ for chunk in range(1, 11):
86
+ items = [
87
+ {
88
+ "type": "Feature",
89
+ "id": f"item_{chunk}_{i}",
90
+ "geometry": {
91
+ "type": "Point",
92
+ "coordinates": [i * 0.1, i * 0.1],
93
+ },
94
+ "properties": {
95
+ "datetime": f"2020-01-01T{i:02d}:00:00Z",
96
+ "dataset_id": f"test_dataset_{chunk}",
97
+ },
98
+ }
99
+ for i in range(20)
100
+ ]
101
+
102
+ filename = bulk_dir / f"2020_{chunk}.ndjson"
103
+ with filename.open("w") as f:
104
+ for item in items:
105
+ f.write(json.dumps(item) + "\n")
106
+
107
+ return bulk_dir
108
+
109
+
110
+ # =============================================================================
111
+ # Test Classes
112
+ # =============================================================================
113
+
114
+
115
+ @pytest.mark.dask
116
+ class TestDaskIntegration:
117
+ """Integration tests for Dask distributed processing.
118
+
119
+ These tests require Dask to be installed and can run in three modes:
120
+ 1. Auto-created ephemeral cluster: Set DASK_TESTING=TRUE (recommended for development)
121
+ 2. With an existing Dask cluster: Set DASK_SCHEDULER_ADDRESS environment variable
122
+ 3. Skip: If neither is set, tests are skipped
123
+
124
+ To run with auto-created ephemeral cluster (recommended):
125
+ export DASK_TESTING=TRUE
126
+ pytest earthcatalog/tests/test_dask_integration.py -v
127
+
128
+ To run with an existing cluster:
129
+ export DASK_SCHEDULER_ADDRESS=localhost:8786
130
+ pytest earthcatalog/tests/test_dask_integration.py -v
131
+ """
132
+
133
+ def test_dask_scheduler_address_env(self, dask_scheduler_address):
134
+ """Test that Dask scheduler address is configured."""
135
+ # Address can be:
136
+ # - "localhost:8786" (manual config)
137
+ # - "local" (manual config, now deprecated)
138
+ # - "tcp://127.0.0.1:XXXXX" (ephemeral cluster from DASK_TESTING=TRUE)
139
+ assert dask_scheduler_address in ["localhost:8786", "local"] or ":" in dask_scheduler_address
140
+
141
+ def test_processing_config_for_dask(self, synthetic_bulk_data: Path, dask_scheduler_address, tmp_path: Path):
142
+ """Test ProcessingConfig configuration for Dask with passthrough hook."""
143
+ config = ProcessingConfig(
144
+ input_file=str(synthetic_bulk_data),
145
+ output_catalog=str(tmp_path / "output"),
146
+ scratch_location=str(tmp_path / "scratch"),
147
+ input_pattern=str(synthetic_bulk_data / "*.ndjson"),
148
+ input_format="ndjson",
149
+ stac_hook="passthrough", # URLs are pre-fetched STAC JSON
150
+ grid_system="h3",
151
+ grid_resolution=2,
152
+ max_workers=4,
153
+ )
154
+
155
+ # Verify configuration
156
+ assert config.stac_hook == "passthrough"
157
+ assert config.input_pattern.endswith("*.ndjson")
158
+
159
+ # Validation should pass
160
+ try:
161
+ config.validate()
162
+ except (ValueError, FileNotFoundError) as e:
163
+ pytest.fail(f"Config validation failed: {e}")
164
+
165
+ def test_dask_processor_creation(self, dask_scheduler_address):
166
+ """Test DaskDistributedProcessor creation.
167
+
168
+ This test creates a Dask processor but doesn't run a full pipeline.
169
+ """
170
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
171
+
172
+ # For ephemeral clusters (DASK_TESTING mode), just verify connection
173
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
174
+ # Use the ephemeral cluster
175
+ processor = DaskDistributedProcessor(n_workers=2, scheduler_address=dask_scheduler_address)
176
+
177
+ # Verify processor was created
178
+ assert processor.n_workers == 2
179
+ assert processor.scheduler_address == dask_scheduler_address
180
+
181
+ # Close the processor
182
+ processor.close()
183
+ elif dask_scheduler_address == "local":
184
+ # Legacy local mode - create a local Dask cluster for testing
185
+ from dask.distributed import Client
186
+
187
+ with Client(n_workers=2, threads_per_worker=1, processes=True) as client:
188
+ scheduler_address = client.scheduler_address
189
+ processor = DaskDistributedProcessor(n_workers=2, scheduler_address=scheduler_address)
190
+
191
+ # Verify processor was created
192
+ assert processor.n_workers == 2
193
+ assert processor.scheduler_address == scheduler_address
194
+
195
+ # Close the processor
196
+ processor.close()
197
+ else:
198
+ # Connect to existing cluster
199
+ processor = DaskDistributedProcessor(n_workers=4, scheduler_address=dask_scheduler_address)
200
+
201
+ assert processor.n_workers == 4
202
+ assert processor.scheduler_address == dask_scheduler_address
203
+
204
+ processor.close()
205
+
206
+ def test_passthrough_hook_dask_compatibility(self):
207
+ """Test that PassthroughSTACHook is Dask-compatible (serializable)."""
208
+ from earthcatalog.stac_hooks import serialize_hook
209
+
210
+ hook = PassthroughSTACHook()
211
+
212
+ # Should serialize to a simple string
213
+ config_str = serialize_hook(hook)
214
+ assert config_str == "passthrough"
215
+
216
+ # Should be deserializable
217
+ from earthcatalog.stac_hooks import parse_hook_config
218
+
219
+ restored_hook = parse_hook_config(config_str)
220
+
221
+ assert isinstance(restored_hook, PassthroughSTACHook)
222
+
223
+ def test_passthrough_with_batch_processing(self):
224
+ """Test that passthrough hook works with batch processing."""
225
+ hook = PassthroughSTACHook()
226
+
227
+ # Create batch of STAC items
228
+ batch = [
229
+ json.dumps(
230
+ {
231
+ "type": "Feature",
232
+ "id": f"batch_item_{i}",
233
+ "geometry": {"type": "Point", "coordinates": [0, 0]},
234
+ "properties": {"datetime": "2024-01-01T00:00:00Z"},
235
+ }
236
+ )
237
+ for i in range(100)
238
+ ]
239
+
240
+ results = hook.fetch_batch(batch)
241
+
242
+ # All items should be processed
243
+ assert len(results) == 100
244
+ assert all(r["id"] == f"batch_item_{i}" for i, r in enumerate(results) if r)
245
+
246
+ @pytest.mark.skipif(
247
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
248
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
249
+ )
250
+ def test_full_pipeline_config_dask(self, synthetic_bulk_data: Path, dask_scheduler_address, tmp_path: Path):
251
+ """Test full pipeline configuration for Dask processing.
252
+
253
+ This test verifies the configuration is ready for Dask processing
254
+ but doesn't run the actual pipeline (which would require a full cluster).
255
+ """
256
+ config = ProcessingConfig(
257
+ input_file=str(synthetic_bulk_data),
258
+ output_catalog=str(tmp_path / "catalog"),
259
+ scratch_location=str(tmp_path / "scratch"),
260
+ input_pattern=str(synthetic_bulk_data / "2020_*.ndjson"),
261
+ input_format="ndjson",
262
+ url_column="url", # Will be ignored by passthrough hook
263
+ stac_hook="passthrough",
264
+ grid_system="h3",
265
+ grid_resolution=2,
266
+ temporal_bin="month",
267
+ enable_concurrent_http=False, # Passthrough doesn't need HTTP
268
+ max_workers=4,
269
+ )
270
+
271
+ # Verify all key settings
272
+ assert config.stac_hook == "passthrough"
273
+ assert config.input_format == "ndjson"
274
+ assert not config.enable_concurrent_http # Should be disabled
275
+ assert config.grid_system == "h3"
276
+ assert config.grid_resolution == 2
277
+
278
+
279
+ class TestDaskClusterInfo:
280
+ """Information about Dask cluster integration."""
281
+
282
+ @pytest.mark.skipif(
283
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
284
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
285
+ )
286
+ def test_dask_cluster_info(self, dask_scheduler_address):
287
+ """Display information about Dask cluster connection.
288
+
289
+ This test provides helpful information about connecting to Dask clusters.
290
+ """
291
+ # Skip for ephemeral clusters (DASK_TESTING mode)
292
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
293
+ pytest.skip("Ephemeral cluster - no external cluster info needed")
294
+
295
+ print("\n=== Dask Cluster Information ===")
296
+ print(f"Scheduler Address: {dask_scheduler_address}")
297
+ print("\nTo start a Dask cluster:")
298
+ print(" dask scheduler")
299
+ print(" # Or with specific options:")
300
+ print(" dask scheduler --port 8786")
301
+ print("\nTo verify cluster is running:")
302
+ print(" dask info")
303
+ print("\nTo check workers:")
304
+ print(" dask worker tcp://<scheduler-ip>:8786")
305
+ print("====================================\n")
306
+
307
+ @pytest.mark.skipif(True, reason="Replaced by DASK_TESTING mode")
308
+ def test_local_cluster_info(self):
309
+ """Display information about local Dask cluster setup.
310
+
311
+ This test is deprecated - use DASK_TESTING=TRUE instead.
312
+ """
313
+ print("\n=== Local Dask Cluster Information ===")
314
+ print("To run Dask integration tests with auto-created cluster:")
315
+ print(" export DASK_TESTING=TRUE")
316
+ print(" pytest earthcatalog/tests/test_dask_integration.py -v")
317
+ print("\nThe test will automatically create and destroy a local Dask cluster.")
318
+ print("=============================================\n")
319
+
320
+
321
+ class TestDaskCompatibility:
322
+ """Tests for Dask compatibility of components."""
323
+
324
+ def test_passthrough_hook_is_dask_compatible(self):
325
+ """Verify PassthroughSTACHook is serializable for Dask."""
326
+ from earthcatalog.stac_hooks import parse_hook_config, serialize_hook
327
+
328
+ hook = PassthroughSTACHook()
329
+
330
+ # Must serialize to a string for Dask transmission
331
+ config_str = serialize_hook(hook)
332
+ assert isinstance(config_str, str)
333
+
334
+ # Must be deserializable on workers
335
+ restored = parse_hook_config(config_str)
336
+ assert isinstance(restored, PassthroughSTACHook)
337
+
338
+ def test_processing_config_serialization_for_dask(self):
339
+ """Test that ProcessingConfig serializes correctly for Dask."""
340
+ config = ProcessingConfig(
341
+ input_file="./data",
342
+ output_catalog="./catalog",
343
+ scratch_location="./scratch",
344
+ input_pattern="./data/*.ndjson",
345
+ stac_hook="passthrough",
346
+ )
347
+
348
+ # Serialize config for Dask worker transmission
349
+ config_dict = config.to_dict()
350
+
351
+ # Verify passthrough is in the dict
352
+ assert "input_pattern" in config_dict
353
+ assert "stac_hook" in config_dict
354
+ assert config_dict["stac_hook"] == "passthrough"
355
+
356
+ # Verify deserialization works
357
+ restored = ProcessingConfig.from_dict(config_dict)
358
+ assert restored.stac_hook == "passthrough"
359
+ assert restored.input_pattern == "./data/*.ndjson"
360
+
361
+
362
+ @pytest.mark.dask
363
+ @pytest.mark.skipif(
364
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
365
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
366
+ )
367
+ class TestDaskPipelineIntegration:
368
+ """End-to-end Dask pipeline integration tests.
369
+
370
+ These tests require a running Dask cluster and synthetic data.
371
+ """
372
+
373
+ def test_dask_scheduler_connection(self, dask_scheduler_address):
374
+ """Test connection to Dask scheduler."""
375
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
376
+
377
+ # Skip for ephemeral clusters (they're created/destroyed per test class)
378
+ if os.environ.get("DASK_TESTING", "").upper() == "TRUE":
379
+ pytest.skip("Ephemeral cluster - connection test not applicable")
380
+
381
+ try:
382
+ processor = DaskDistributedProcessor(
383
+ n_workers=2,
384
+ scheduler_address=dask_scheduler_address,
385
+ )
386
+
387
+ # If we get here, connection succeeded
388
+ assert processor.scheduler_address == dask_scheduler_address
389
+
390
+ processor.close()
391
+ except (OSError, ConnectionError, ValueError) as e:
392
+ pytest.fail(f"Failed to connect to Dask scheduler: {e}")
393
+
394
+ def test_synthetic_data_for_dask(self, synthetic_bulk_data: Path):
395
+ """Verify synthetic data was created correctly."""
396
+ import glob as glob_module
397
+
398
+ pattern = str(synthetic_bulk_data / "*.ndjson")
399
+ files = glob_module.glob(pattern)
400
+
401
+ assert len(files) == 10
402
+
403
+ # Verify each file has 20 lines
404
+ for file_path in files:
405
+ with open(file_path) as f:
406
+ lines = f.readlines()
407
+ assert len(lines) == 20
408
+ # Each line should be valid JSON
409
+ for line in lines:
410
+ item = json.loads(line)
411
+ assert item["type"] == "Feature"
412
+
413
+
414
+ class TestDaskSchedulerAddress:
415
+ """Tests for Dask scheduler address configuration."""
416
+
417
+ def test_config_with_scheduler_address(self):
418
+ """Test ProcessingConfig with dask_scheduler_address."""
419
+ config = ProcessingConfig(
420
+ input_file="./test_input.parquet",
421
+ output_catalog="./test_output",
422
+ scratch_location="./test_scratch",
423
+ dask_scheduler_address="tcp://localhost:8786",
424
+ )
425
+
426
+ assert config.dask_scheduler_address == "tcp://localhost:8786"
427
+
428
+ def test_config_without_scheduler_address(self):
429
+ """Test ProcessingConfig without dask_scheduler_address defaults to empty string."""
430
+ config = ProcessingConfig(
431
+ input_file="./test_input.parquet",
432
+ output_catalog="./test_output",
433
+ scratch_location="./test_scratch",
434
+ )
435
+
436
+ assert config.dask_scheduler_address == ""
437
+
438
+ def test_config_to_dict_includes_scheduler_address(self):
439
+ """Test that dask_scheduler_address is included in to_dict()."""
440
+ config = ProcessingConfig(
441
+ input_file="./test_input.parquet",
442
+ output_catalog="./test_output",
443
+ scratch_location="./test_scratch",
444
+ dask_scheduler_address="tcp://scheduler:8786",
445
+ )
446
+
447
+ config_dict = config.to_dict()
448
+
449
+ assert "dask_scheduler_address" in config_dict
450
+ assert config_dict["dask_scheduler_address"] == "tcp://scheduler:8786"
451
+
452
+ def test_config_from_dict_with_scheduler_address(self):
453
+ """Test that from_dict() restores dask_scheduler_address."""
454
+ config_dict = {
455
+ "input_file": "./test_input.parquet",
456
+ "output_catalog": "./test_output",
457
+ "scratch_location": "./test_scratch",
458
+ "dask_scheduler_address": "tcp://remote:8786",
459
+ }
460
+
461
+ config = ProcessingConfig.from_dict(config_dict)
462
+
463
+ assert config.dask_scheduler_address == "tcp://remote:8786"
464
+
465
+ def test_config_from_dict_without_scheduler_address(self):
466
+ """Test backward compatibility when dask_scheduler_address is missing."""
467
+ config_dict = {
468
+ "input_file": "./test_input.parquet",
469
+ "output_catalog": "./test_output",
470
+ "scratch_location": "./test_scratch",
471
+ }
472
+
473
+ config = ProcessingConfig.from_dict(config_dict)
474
+
475
+ assert config.dask_scheduler_address == ""
476
+
477
+ def test_validation_warns_remote_scheduler_local_scratch(self, caplog):
478
+ """Test that validation warns when using remote scheduler with local scratch."""
479
+ import logging
480
+
481
+ config = ProcessingConfig(
482
+ input_file="s3://bucket/input.parquet",
483
+ output_catalog="s3://bucket/output",
484
+ scratch_location="./local_scratch", # Local path with remote scheduler
485
+ dask_scheduler_address="tcp://remote:8786",
486
+ )
487
+
488
+ with caplog.at_level(logging.WARNING):
489
+ config.validate()
490
+
491
+ # Should warn about local scratch with remote scheduler
492
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
493
+ assert any("local storage paths" in msg and "scratch_location" in msg for msg in warning_messages)
494
+
495
+ def test_validation_warns_remote_scheduler_local_output(self, caplog):
496
+ """Test that validation warns when using remote scheduler with local output."""
497
+ import logging
498
+
499
+ config = ProcessingConfig(
500
+ input_file="s3://bucket/input.parquet",
501
+ output_catalog="./local_output", # Local path with remote scheduler
502
+ scratch_location="s3://bucket/scratch",
503
+ dask_scheduler_address="tcp://remote:8786",
504
+ )
505
+
506
+ with caplog.at_level(logging.WARNING):
507
+ config.validate()
508
+
509
+ # Should warn about local output with remote scheduler
510
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
511
+ assert any("local storage paths" in msg and "output_catalog" in msg for msg in warning_messages)
512
+
513
+ def test_validation_no_warning_cloud_storage(self, caplog):
514
+ """Test that no warning is issued when using cloud storage with remote scheduler."""
515
+ import logging
516
+
517
+ config = ProcessingConfig(
518
+ input_file="s3://bucket/input.parquet",
519
+ output_catalog="s3://bucket/output",
520
+ scratch_location="s3://bucket/scratch",
521
+ dask_scheduler_address="tcp://remote:8786",
522
+ )
523
+
524
+ with caplog.at_level(logging.WARNING):
525
+ config.validate()
526
+
527
+ # Should not warn about cloud storage
528
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
529
+ assert not any("local storage paths" in msg for msg in warning_messages)
530
+
531
+ def test_validation_no_warning_local_scheduler(self, caplog):
532
+ """Test that no warning is issued when using local scheduler (empty string)."""
533
+ import logging
534
+
535
+ config = ProcessingConfig(
536
+ input_file="./input.parquet",
537
+ input_pattern="*.ndjson", # Use pattern to bypass file existence check
538
+ output_catalog="./output",
539
+ scratch_location="./scratch",
540
+ dask_scheduler_address="", # Empty = local cluster
541
+ )
542
+
543
+ with caplog.at_level(logging.WARNING):
544
+ config.validate()
545
+
546
+ # Should not warn when scheduler_address is empty (local cluster)
547
+ warning_messages = [record.message for record in caplog.records if record.levelno == logging.WARNING]
548
+ assert not any("local storage paths" in msg for msg in warning_messages)
549
+
550
+ @pytest.mark.skipif(
551
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
552
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
553
+ )
554
+ def test_processor_repr_with_scheduler_address(self, dask_scheduler_address):
555
+ """Test DaskDistributedProcessor repr with scheduler address."""
556
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
557
+
558
+ processor = DaskDistributedProcessor(scheduler_address=dask_scheduler_address)
559
+
560
+ repr_str = repr(processor)
561
+ assert "scheduler_address" in repr_str
562
+ assert dask_scheduler_address in repr_str
563
+
564
+ processor.close()
565
+
566
+ @pytest.mark.skipif(
567
+ os.environ.get("DASK_TESTING", "").upper() != "TRUE" and os.environ.get("DASK_SCHEDULER_ADDRESS") is None,
568
+ reason="DASK_TESTING=TRUE or DASK_SCHEDULER_ADDRESS not set",
569
+ )
570
+ def test_processor_repr_without_scheduler_address(self):
571
+ """Test DaskDistributedProcessor repr without scheduler address."""
572
+ from earthcatalog.ingestion_pipeline import DaskDistributedProcessor
573
+
574
+ processor = DaskDistributedProcessor(n_workers=4)
575
+
576
+ repr_str = repr(processor)
577
+ assert "n_workers" in repr_str
578
+ assert "4" in repr_str
579
+
580
+ processor.close()
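The serialization round trips asserted above are the core of the Dask compatibility story: hooks travel to workers as plain strings and ProcessingConfig travels as a plain dict. Below is a minimal sketch of that round trip outside pytest, using only names that appear in the diff (PassthroughSTACHook, serialize_hook, parse_hook_config, ProcessingConfig) and assuming earthcatalog 0.2.0 is importable; it is illustrative only, not part of the package.

from earthcatalog.ingestion_pipeline import ProcessingConfig
from earthcatalog.stac_hooks import PassthroughSTACHook, parse_hook_config, serialize_hook

# Hooks must reduce to a plain string before Dask ships them to workers.
hook = PassthroughSTACHook()
hook_config = serialize_hook(hook)            # expected to be "passthrough"
restored_hook = parse_hook_config(hook_config)
assert isinstance(restored_hook, PassthroughSTACHook)

# ProcessingConfig round-trips through a plain dict for the same reason.
config = ProcessingConfig(
    input_file="./data",
    output_catalog="./catalog",
    scratch_location="./scratch",
    input_pattern="./data/*.ndjson",
    stac_hook="passthrough",
)
restored_config = ProcessingConfig.from_dict(config.to_dict())
assert restored_config.stac_hook == "passthrough"
assert restored_config.input_pattern == "./data/*.ndjson"

If either round trip fails, the pipeline cannot hand work to Dask workers, which is what the TestDaskCompatibility tests above guard against.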