earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
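The first diff below (earthcatalog/tests/test_cli_and_storage.py) exercises a CLI --dry-run mode and configurable S3 timeouts on the storage backends. For orientation, here is a minimal caller-side sketch of the backend constructors as those tests exercise them; the keyword names (base_path, connect_timeout, read_timeout, retries) are taken from the tests themselves, while the bucket, prefix, and local path are illustrative placeholders, not values from the package:

    from earthcatalog.storage_backends import LocalStorage, S3Storage

    # Local backend: a base path is all the tests require (interface checked in test_local_storage_interface).
    local = LocalStorage("/tmp/earthcatalog_scratch")

    # S3 backend with explicit timeouts, mirroring test_s3_storage_accepts_timeout_params.
    # The defaults used when these keywords are omitted are not shown here;
    # test_s3_storage_default_timeouts only asserts that some botocore Config is passed through.
    s3 = S3Storage(
        base_path="s3://my-bucket/catalog/",  # placeholder bucket/prefix
        connect_timeout=15.0,  # seconds to establish a connection
        read_timeout=30.0,     # seconds to wait for a response
        retries=2,             # per the tests, mapped to {"max_attempts": 2, "mode": "adaptive"}
    )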
earthcatalog/tests/test_cli_and_storage.py
@@ -0,0 +1,230 @@
+ """
+ Tests for CLI features and storage backend enhancements.
+
+ These tests validate:
+ - CLI --dry-run option (f1)
+ - S3 timeout configuration (h2)
+ """
+
+ import importlib.util
+ import subprocess
+ import sys
+ import tempfile
+ from pathlib import Path
+ from unittest.mock import patch
+
+ import pandas as pd
+ import pytest
+
+
+ class TestCLIDryRun:
+     """Test CLI --dry-run feature (f1)."""
+
+     def setup_method(self):
+         """Create temporary input file for testing."""
+         self.temp_dir = tempfile.mkdtemp()
+         self.input_file = Path(self.temp_dir) / "test_input.parquet"
+
+         # Create a simple test parquet file
+         df = pd.DataFrame({"url": ["https://example.com/item1.json", "https://example.com/item2.json"]})
+         df.to_parquet(self.input_file)
+
+     def teardown_method(self):
+         """Clean up temporary files."""
+         import shutil
+
+         shutil.rmtree(self.temp_dir, ignore_errors=True)
+
+     def test_dry_run_exits_with_zero(self):
+         """Test that --dry-run exits with code 0."""
+         result = subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 "/tmp/test_output",
+                 "--scratch",
+                 "/tmp/test_scratch",
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         assert result.returncode == 0, f"Expected exit code 0, got {result.returncode}. stderr: {result.stderr}"
+
+     def test_dry_run_shows_configuration(self):
+         """Test that --dry-run displays configuration info."""
+         result = subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 "/tmp/test_output",
+                 "--scratch",
+                 "/tmp/test_scratch",
+                 "--grid",
+                 "h3",
+                 "--grid-resolution",
+                 "4",
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         # Check that configuration is displayed
+         output = result.stdout + result.stderr
+         assert "DRY RUN" in output
+         assert "h3" in output.lower() or "H3" in output
+         assert "validated" in output.lower() or "ready" in output.lower()
+
+     def test_dry_run_does_not_create_output(self):
+         """Test that --dry-run doesn't create any output files."""
+         output_dir = Path(self.temp_dir) / "output"
+         scratch_dir = Path(self.temp_dir) / "scratch"
+
+         subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 str(output_dir),
+                 "--scratch",
+                 str(scratch_dir),
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         # Output directories should not be created
+         assert not output_dir.exists()
+         assert not scratch_dir.exists()
+
+     def test_dry_run_validates_invalid_config(self):
+         """Test that --dry-run still validates configuration."""
+         # Test with invalid grid type
+         result = subprocess.run(
+             [
+                 sys.executable,
+                 "-m",
+                 "cli",
+                 "--input",
+                 str(self.input_file),
+                 "--output",
+                 "/tmp/test_output",
+                 "--scratch",
+                 "/tmp/test_scratch",
+                 "--grid",
+                 "invalid_grid",
+                 "--dry-run",
+             ],
+             capture_output=True,
+             text=True,
+             cwd=str(Path(__file__).parent.parent.parent),
+         )
+
+         # Should fail due to invalid grid
+         assert result.returncode != 0
+
+
+ class TestS3TimeoutConfiguration:
+     """Test S3 storage backend timeout configuration (h2)."""
+
+     @pytest.mark.skipif(importlib.util.find_spec("s3fs") is None, reason="s3fs library required")
+     def test_s3_storage_accepts_timeout_params(self):
+         """Test that S3Storage accepts custom timeout parameters."""
+         with patch("s3fs.S3FileSystem") as mock_fs:
+             from earthcatalog.storage_backends import S3Storage
+
+             S3Storage(
+                 base_path="s3://test-bucket/prefix/",
+                 connect_timeout=45.0,
+                 read_timeout=90.0,
+                 retries=5,
+             )
+
+             # Verify S3FileSystem was called with config
+             mock_fs.assert_called_once()
+             call_kwargs = mock_fs.call_args[1]
+             assert "config_kwargs" in call_kwargs
+
+     @pytest.mark.skipif(importlib.util.find_spec("s3fs") is None, reason="s3fs library required")
+     def test_s3_storage_default_timeouts(self):
+         """Test that S3Storage has sensible default timeout values."""
+         with patch("s3fs.S3FileSystem") as mock_fs:
+             from earthcatalog.storage_backends import S3Storage
+
+             # Create with defaults
+             S3Storage(base_path="s3://test-bucket/prefix/")
+
+             # Verify defaults were applied
+             mock_fs.assert_called_once()
+             call_kwargs = mock_fs.call_args[1]
+             assert "config_kwargs" in call_kwargs
+
+             # Extract config from call
+             config_kwargs = call_kwargs["config_kwargs"]
+             assert "config" in config_kwargs
+
+     def test_storage_backend_direct_instantiation(self):
+         """Test direct instantiation of storage backends."""
+         from earthcatalog.storage_backends import LocalStorage
+
+         # Test local storage can be instantiated
+         local_storage = LocalStorage("/tmp/test")
+         assert local_storage is not None
+         assert hasattr(local_storage, "exists")
+         assert hasattr(local_storage, "open")
+
+     def test_local_storage_interface(self):
+         """Test LocalStorage interface for comparison."""
+         from earthcatalog.storage_backends import LocalStorage
+
+         storage = LocalStorage("/tmp/test_base")
+         assert hasattr(storage, "exists")
+         assert hasattr(storage, "open")
+         assert hasattr(storage, "makedirs")
+         assert hasattr(storage, "remove")
+         assert hasattr(storage, "rename")
+
+
+ class TestS3StorageWithMockedBotocore:
+     """Test S3 storage with mocked botocore Config."""
+
+     @pytest.mark.skipif(importlib.util.find_spec("s3fs") is None, reason="s3fs library required")
+     def test_botocore_config_applied(self):
+         """Test that botocore Config is properly created with timeouts."""
+         with patch("s3fs.S3FileSystem"):
+             with patch("botocore.config.Config") as mock_config:
+                 from earthcatalog.storage_backends import S3Storage
+
+                 S3Storage(
+                     base_path="s3://bucket/",
+                     connect_timeout=15.0,
+                     read_timeout=30.0,
+                     retries=2,
+                 )
+
+                 # Verify Config was called with correct parameters
+                 mock_config.assert_called_once_with(
+                     connect_timeout=15.0,
+                     read_timeout=30.0,
+                     retries={"max_attempts": 2, "mode": "adaptive"},
+                 )
+
+
+ if __name__ == "__main__":
+     pytest.main([__file__, "-v"])
earthcatalog/tests/test_config.py
@@ -0,0 +1,245 @@
+ # test_config.py
+ """Tests for configuration file loader module."""
+
+ import os
+ import tempfile
+ from pathlib import Path
+
+ import pytest
+ import yaml
+
+ from earthcatalog.config import load_config, merge_cli_overrides, save_config
+
+
+ class TestLoadConfig:
+     """Tests for load_config function."""
+
+     def test_load_explicit_path(self):
+         """Test loading config from explicit path."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             yaml.dump({"grid_resolution": 5, "batch_threshold": 5000}, f)
+             temp_path = f.name
+
+         try:
+             config = load_config(temp_path)
+             assert config["grid_resolution"] == 5
+             assert config["batch_threshold"] == 5000
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_missing_explicit_path_raises(self):
+         """Test that missing explicit path raises FileNotFoundError."""
+         with pytest.raises(FileNotFoundError):
+             load_config("/nonexistent/path/config.yaml")
+
+     def test_load_default_path_if_exists(self, tmp_path, monkeypatch):
+         """Test loading from ./earthcatalog.yaml if it exists."""
+         config_content = {"grid_system": "s2", "concurrent_requests": 100}
+
+         # Create config file in temp directory
+         config_file = tmp_path / "earthcatalog.yaml"
+         with open(config_file, "w") as f:
+             yaml.dump(config_content, f)
+
+         # Change to temp directory
+         monkeypatch.chdir(tmp_path)
+
+         config = load_config()
+         assert config["grid_system"] == "s2"
+         assert config["concurrent_requests"] == 100
+
+     def test_load_returns_empty_when_no_config(self, tmp_path, monkeypatch):
+         """Test that empty dict is returned when no config file exists."""
+         # Change to empty temp directory
+         monkeypatch.chdir(tmp_path)
+
+         config = load_config()
+         assert config == {}
+
+     def test_load_empty_yaml_file(self):
+         """Test loading an empty YAML file returns empty dict."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             f.write("")  # Empty file
+             temp_path = f.name
+
+         try:
+             config = load_config(temp_path)
+             assert config == {}
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_invalid_yaml_raises(self):
+         """Test that invalid YAML raises an error."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             f.write("invalid: yaml: content: [")
+             temp_path = f.name
+
+         try:
+             with pytest.raises(yaml.YAMLError):
+                 load_config(temp_path)
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_non_mapping_yaml_raises(self):
+         """Test that non-mapping YAML content raises ValueError."""
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             f.write("- item1\n- item2\n")  # List instead of mapping
+             temp_path = f.name
+
+         try:
+             with pytest.raises(ValueError, match="YAML mapping"):
+                 load_config(temp_path)
+         finally:
+             os.unlink(temp_path)
+
+
+ class TestMergeCliOverrides:
+     """Tests for merge_cli_overrides function."""
+
+     def test_cli_overrides_config(self):
+         """Test that CLI args override config file values."""
+         file_config = {"grid_resolution": 2, "batch_threshold": 10000}
+         cli_args = {"grid_resolution": 5}  # Override
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         assert result["grid_resolution"] == 5  # Overridden
+         assert result["batch_threshold"] == 10000  # Preserved
+
+     def test_none_cli_args_ignored(self):
+         """Test that None CLI args don't override config values."""
+         file_config = {"grid_resolution": 2, "batch_threshold": 10000}
+         cli_args = {"grid_resolution": None, "batch_threshold": None}
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         # Original values preserved because CLI args are None
+         assert result["grid_resolution"] == 2
+         assert result["batch_threshold"] == 10000
+
+     def test_cli_adds_new_keys(self):
+         """Test that CLI can add keys not in config file."""
+         file_config = {"grid_resolution": 2}
+         cli_args = {"distributed": True, "stac_hook": "module:pkg:func"}
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         assert result["grid_resolution"] == 2
+         assert result["distributed"] is True
+         assert result["stac_hook"] == "module:pkg:func"
+
+     def test_empty_config_uses_cli(self):
+         """Test that empty config uses all CLI values."""
+         file_config = {}
+         cli_args = {
+             "input_file": "data.parquet",
+             "grid_resolution": 5,
+             "batch_threshold": 5000,
+         }
+
+         result = merge_cli_overrides(file_config, cli_args)
+
+         assert result["input_file"] == "data.parquet"
+         assert result["grid_resolution"] == 5
+         assert result["batch_threshold"] == 5000
+
+
+ class TestSaveConfig:
+     """Tests for save_config function."""
+
+     def test_save_config_creates_file(self):
+         """Test that save_config creates a YAML file."""
+         config = {"grid_resolution": 5, "batch_threshold": 7500}
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             config_path = Path(tmp_dir) / "test_config.yaml"
+             save_config(config, config_path)
+
+             assert config_path.exists()
+
+             # Verify content
+             with open(config_path) as f:
+                 loaded = yaml.safe_load(f)
+
+             assert loaded["grid_resolution"] == 5
+             assert loaded["batch_threshold"] == 7500
+
+     def test_save_config_overwrites_existing(self):
+         """Test that save_config overwrites existing file."""
+         original = {"value": "original"}
+         updated = {"value": "updated"}
+
+         with tempfile.TemporaryDirectory() as tmp_dir:
+             config_path = Path(tmp_dir) / "config.yaml"
+
+             # Save original
+             save_config(original, config_path)
+
+             # Overwrite with updated
+             save_config(updated, config_path)
+
+             with open(config_path) as f:
+                 loaded = yaml.safe_load(f)
+
+             assert loaded["value"] == "updated"
+
+
+ class TestConfigIntegration:
+     """Integration tests for config loading with ProcessingConfig."""
+
+     def test_full_config_workflow(self):
+         """Test complete workflow: load config, merge CLI, create ProcessingConfig."""
+         from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+         # Create a config file
+         file_config = {
+             "grid_resolution": 4,
+             "batch_threshold": 8000,
+             "concurrent_requests": 75,
+         }
+
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
+             yaml.dump(file_config, f)
+             config_path = f.name
+
+         # Create a dummy input file
+         with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as pq:
+             import pandas as pd
+
+             pd.DataFrame({"url": ["url1"]}).to_parquet(pq.name)
+             input_path = pq.name
+
+         try:
+             # Load config file
+             loaded = load_config(config_path)
+
+             # Simulate CLI args (some override, some None)
+             cli_args = {
+                 "input_file": input_path,
+                 "output_catalog": "/tmp/catalog",
+                 "scratch_location": "/tmp/scratch",
+                 "grid_resolution": None,  # Use config value
+                 "batch_threshold": 5000,  # Override
+             }
+
+             # Merge
+             merged = merge_cli_overrides(loaded, cli_args)
+
+             # Create ProcessingConfig
+             config = ProcessingConfig(
+                 input_file=merged["input_file"],
+                 output_catalog=merged["output_catalog"],
+                 scratch_location=merged["scratch_location"],
+                 grid_resolution=merged.get("grid_resolution", 2),
+                 batch_threshold=merged.get("batch_threshold", 10000),
+                 concurrent_requests=merged.get("concurrent_requests", 50),
+             )
+
+             # Verify merged values
+             assert config.grid_resolution == 4  # From file
+             assert config.batch_threshold == 5000  # From CLI (override)
+             assert config.concurrent_requests == 75  # From file
+
+         finally:
+             os.unlink(config_path)
+             os.unlink(input_path)
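For reference, the configuration round trip that TestConfigIntegration drives can be sketched directly with the three functions imported above. The semantics shown here (CLI values of None fall back to the file, non-None CLI values win) are exactly what the merge tests assert; the file name and numeric values are illustrative, not package defaults:

    from earthcatalog.config import load_config, merge_cli_overrides, save_config

    # Write a config file with the keys exercised by the tests above.
    save_config(
        {"grid_resolution": 4, "batch_threshold": 8000, "concurrent_requests": 75},
        "earthcatalog.yaml",
    )

    file_config = load_config("earthcatalog.yaml")                 # dict of file values
    cli_args = {"grid_resolution": None, "batch_threshold": 5000}  # None means "not given on the CLI"
    merged = merge_cli_overrides(file_config, cli_args)

    assert merged["grid_resolution"] == 4       # kept from the file (CLI value was None)
    assert merged["batch_threshold"] == 5000    # overridden by the CLI
    assert merged["concurrent_requests"] == 75  # untouched file value carried through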