earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_storage_backends.py
@@ -0,0 +1,236 @@
+# test_storage_backends.py
+"""Tests for storage backend enhancements.
+
+This module tests the new storage backend methods:
+- get_etag: Get file checksum/ETag for change detection
+- rmtree: Recursive directory deletion
+- list_files: List files in directory with pattern matching
+"""
+
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from earthcatalog.storage_backends import LocalStorage, get_storage_backend
+
+
+class TestLocalStorageEnhancements:
+    """Test LocalStorage new methods."""
+
+    @pytest.fixture
+    def temp_dir(self):
+        """Create a temporary directory for testing."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            yield tmpdir
+
+    @pytest.fixture
+    def local_storage(self, temp_dir):
+        """Create a LocalStorage instance."""
+        return LocalStorage(temp_dir)
+
+    def test_get_etag_returns_hash_for_existing_file(self, local_storage, temp_dir):
+        """get_etag should return a hash string for existing files."""
+        # Create a test file
+        test_file = Path(temp_dir) / "test.txt"
+        test_file.write_text("hello world")
+
+        etag = local_storage.get_etag(str(test_file))
+
+        assert etag is not None
+        assert isinstance(etag, str)
+        assert len(etag) > 0
+
+    def test_get_etag_returns_none_for_missing_file(self, local_storage, temp_dir):
+        """get_etag should return None for non-existent files."""
+        missing_file = Path(temp_dir) / "nonexistent.txt"
+
+        etag = local_storage.get_etag(str(missing_file))
+
+        assert etag is None
+
+    def test_get_etag_different_content_different_hash(self, local_storage, temp_dir):
+        """Different file contents should produce different ETags."""
+        file1 = Path(temp_dir) / "file1.txt"
+        file2 = Path(temp_dir) / "file2.txt"
+        file1.write_text("content one")
+        file2.write_text("content two")
+
+        etag1 = local_storage.get_etag(str(file1))
+        etag2 = local_storage.get_etag(str(file2))
+
+        assert etag1 != etag2
+
+    def test_get_etag_same_content_same_hash(self, local_storage, temp_dir):
+        """Same file contents should produce same ETags."""
+        file1 = Path(temp_dir) / "file1.txt"
+        file2 = Path(temp_dir) / "file2.txt"
+        file1.write_text("identical content")
+        file2.write_text("identical content")
+
+        etag1 = local_storage.get_etag(str(file1))
+        etag2 = local_storage.get_etag(str(file2))
+
+        assert etag1 == etag2
+
+    def test_rmtree_removes_directory_recursively(self, local_storage, temp_dir):
+        """rmtree should remove directory and all contents."""
+        # Create nested directory structure
+        nested_dir = Path(temp_dir) / "parent" / "child"
+        nested_dir.mkdir(parents=True)
+        (nested_dir / "file.txt").write_text("content")
+        (Path(temp_dir) / "parent" / "sibling.txt").write_text("other")
+
+        parent_path = str(Path(temp_dir) / "parent")
+        local_storage.rmtree(parent_path)
+
+        assert not Path(parent_path).exists()
+
+    def test_rmtree_handles_missing_directory(self, local_storage, temp_dir):
+        """rmtree should not raise error for non-existent directory."""
+        missing_dir = Path(temp_dir) / "does_not_exist"
+
+        # Should not raise
+        local_storage.rmtree(str(missing_dir))
+
+    def test_rmtree_handles_empty_directory(self, local_storage, temp_dir):
+        """rmtree should handle empty directories."""
+        empty_dir = Path(temp_dir) / "empty"
+        empty_dir.mkdir()
+
+        local_storage.rmtree(str(empty_dir))
+
+        assert not empty_dir.exists()
+
+    def test_list_files_returns_matching_files(self, local_storage, temp_dir):
+        """list_files should return files matching the pattern."""
+        # Create test files
+        (Path(temp_dir) / "data1.parquet").write_text("data")
+        (Path(temp_dir) / "data2.parquet").write_text("data")
+        (Path(temp_dir) / "readme.txt").write_text("text")
+
+        files = local_storage.list_files(temp_dir, "*.parquet")
+
+        assert len(files) == 2
+        assert all(".parquet" in f for f in files)
+
+    def test_list_files_returns_empty_for_no_matches(self, local_storage, temp_dir):
+        """list_files should return empty list when no files match."""
+        (Path(temp_dir) / "data.txt").write_text("data")
+
+        files = local_storage.list_files(temp_dir, "*.parquet")
+
+        assert files == []
+
+    def test_list_files_handles_missing_directory(self, local_storage, temp_dir):
+        """list_files should return empty list for non-existent directory."""
+        missing_dir = Path(temp_dir) / "nonexistent"
+
+        files = local_storage.list_files(str(missing_dir), "*")
+
+        assert files == []
+
+    def test_list_files_recursive_with_glob_pattern(self, local_storage, temp_dir):
+        """list_files with ** should match files recursively."""
+        # Create nested structure
+        subdir = Path(temp_dir) / "subdir"
+        subdir.mkdir()
+        (Path(temp_dir) / "root.parquet").write_text("data")
+        (subdir / "nested.parquet").write_text("data")
+
+        files = local_storage.list_files(temp_dir, "**/*.parquet")
+
+        # Should find both files
+        assert len(files) >= 1  # At least the nested one with **
+
+    def test_list_files_default_pattern_matches_all(self, local_storage, temp_dir):
+        """list_files with default pattern should match all files."""
+        (Path(temp_dir) / "file1.txt").write_text("data")
+        (Path(temp_dir) / "file2.json").write_text("data")
+
+        files = local_storage.list_files(temp_dir)
+
+        assert len(files) == 2
+
+    def test_list_dirs_returns_subdirectories(self, local_storage, temp_dir):
+        """list_dirs should return immediate subdirectories."""
+        # Create subdirectories
+        (Path(temp_dir) / "subdir1").mkdir()
+        (Path(temp_dir) / "subdir2").mkdir()
+        # Create a file (should not be included)
+        (Path(temp_dir) / "file.txt").write_text("data")
+
+        dirs = local_storage.list_dirs(temp_dir)
+
+        assert len(dirs) == 2
+        dir_names = [Path(d).name for d in dirs]
+        assert "subdir1" in dir_names
+        assert "subdir2" in dir_names
+
+    def test_list_dirs_returns_empty_for_no_subdirs(self, local_storage, temp_dir):
+        """list_dirs should return empty list when no subdirectories exist."""
+        # Create only files
+        (Path(temp_dir) / "file1.txt").write_text("data")
+        (Path(temp_dir) / "file2.txt").write_text("data")
+
+        dirs = local_storage.list_dirs(temp_dir)
+
+        assert dirs == []
+
+    def test_list_dirs_handles_missing_directory(self, local_storage, temp_dir):
+        """list_dirs should return empty list for non-existent directory."""
+        missing_dir = Path(temp_dir) / "nonexistent"
+
+        dirs = local_storage.list_dirs(str(missing_dir))
+
+        assert dirs == []
+
+    def test_list_dirs_does_not_recurse(self, local_storage, temp_dir):
+        """list_dirs should only return immediate children, not nested."""
+        # Create nested structure
+        (Path(temp_dir) / "parent").mkdir()
+        (Path(temp_dir) / "parent" / "child").mkdir()
+
+        dirs = local_storage.list_dirs(temp_dir)
+
+        assert len(dirs) == 1
+        assert Path(dirs[0]).name == "parent"
+
+
+class TestGetStorageBackend:
+    """Test the storage backend factory function."""
+
+    def test_local_path_returns_local_storage(self):
+        """Local paths should return LocalStorage."""
+        storage = get_storage_backend("/tmp/test")
+        assert isinstance(storage, LocalStorage)
+
+    def test_relative_path_returns_local_storage(self):
+        """Relative paths should return LocalStorage."""
+        storage = get_storage_backend("./catalog")
+        assert isinstance(storage, LocalStorage)
+
+
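The tests above pin down the observable contract of the new methods rather than any particular implementation: get_etag returns a content-derived hash or None for a missing file, rmtree tolerates missing and empty directories, and list_files/list_dirs return [] rather than raising for nonexistent paths. A minimal sketch that would satisfy them — the class name, the MD5 choice, and the ignore_errors strategy are illustrative assumptions, not the package's actual code:

```python
import hashlib
import shutil
from pathlib import Path


class LocalStorageSketch:
    """Hypothetical stand-in for earthcatalog's LocalStorage, for illustration only."""

    def __init__(self, base_path: str):
        self.base_path = Path(base_path)

    def get_etag(self, path: str) -> str | None:
        # Content-derived hash: identical bytes yield identical ETags,
        # and a missing file yields None instead of raising.
        p = Path(path)
        if not p.is_file():
            return None
        return hashlib.md5(p.read_bytes()).hexdigest()

    def rmtree(self, path: str) -> None:
        # ignore_errors makes missing and empty directories no-ops,
        # matching the rmtree tests above.
        shutil.rmtree(path, ignore_errors=True)

    def list_files(self, directory: str, pattern: str = "*") -> list[str]:
        # Glob within the directory; "**/..." patterns recurse.
        d = Path(directory)
        if not d.is_dir():
            return []
        return [str(p) for p in d.glob(pattern) if p.is_file()]

    def list_dirs(self, directory: str) -> list[str]:
        # Immediate children only — iterdir does not recurse.
        d = Path(directory)
        if not d.is_dir():
            return []
        return [str(p) for p in d.iterdir() if p.is_dir()]
```

Hashing whole file contents keeps local ETags stable across runs, which is all the change-detection tests require.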
+# S3 tests would require moto or similar mocking
+class TestS3StorageEnhancements:
+    """Test S3Storage new methods (requires mocking or live S3)."""
+
+    @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+    def test_get_etag_returns_s3_etag(self):
+        """S3 get_etag should return the S3 ETag header value."""
+        pass
+
+    @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+    def test_rmtree_deletes_s3_prefix(self):
+        """S3 rmtree should delete all objects with the given prefix."""
+        pass
+
+    @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+    def test_list_files_lists_s3_objects(self):
+        """S3 list_files should list objects matching pattern."""
+        pass
+
+    @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+    def test_list_dirs_lists_s3_prefixes(self):
+        """S3 list_dirs should list directory-like prefixes."""
+        pass
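The comment above points at moto for the skipped S3 tests. A sketch of how one of them could be unskipped, exercising raw boto3 against moto's in-memory S3 rather than earthcatalog's own S3 backend (whose constructor this diff doesn't show); moto is not a declared dependency of the package, so treat this as illustrative:

```python
import hashlib

import boto3
import pytest

moto = pytest.importorskip("moto")  # not a declared test dependency; illustrative


@moto.mock_aws  # moto >= 5.0; earlier releases expose mock_s3 instead
def test_get_etag_returns_s3_etag():
    """Sketch: for single-part uploads, the S3 ETag is the MD5 hex digest."""
    s3 = boto3.client("s3", region_name="us-east-1")
    s3.create_bucket(Bucket="test-bucket")
    s3.put_object(Bucket="test-bucket", Key="data/test.txt", Body=b"hello world")

    etag = s3.head_object(Bucket="test-bucket", Key="data/test.txt")["ETag"].strip('"')
    assert etag == hashlib.md5(b"hello world").hexdigest()
```

An S3Storage.get_etag built on head_object could be tested the same way, substituting the backend call for the direct head_object here.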
earthcatalog/tests/test_validation.py
@@ -0,0 +1,435 @@
+"""Tests for STAC GeoParquet validation module."""
+
+import geopandas as gpd
+import pytest
+from shapely.geometry import Point
+
+from earthcatalog.validation import (
+    CatalogValidationResult,
+    ValidationIssue,
+    ValidationResult,
+    get_geoparquet_metadata,
+    validate_catalog,
+    validate_geoparquet_file,
+    validate_stac_item,
+    validate_stac_items_batch,
+)
+
+
+class TestValidationResult:
+    """Tests for ValidationResult data class."""
+
+    def test_validation_result_defaults(self):
+        """Test that ValidationResult has correct defaults."""
+        result = ValidationResult(is_valid=True)
+        assert result.is_valid is True
+        assert result.issues == []
+        assert result.metadata == {}
+        assert result.warnings == []
+        assert result.errors == []
+
+    def test_add_warning(self):
+        """Test adding warnings to result."""
+        result = ValidationResult(is_valid=True)
+        result.add_warning("TEST_WARNING", "Test message", key="value")
+
+        assert len(result.warnings) == 1
+        assert result.warnings[0].code == "TEST_WARNING"
+        assert result.warnings[0].message == "Test message"
+        assert result.warnings[0].context == {"key": "value"}
+        assert result.is_valid is True  # Warnings don't invalidate
+
+    def test_add_error(self):
+        """Test adding errors to result."""
+        result = ValidationResult(is_valid=True)
+        result.add_error("TEST_ERROR", "Test error message")
+
+        assert len(result.errors) == 1
+        assert result.errors[0].code == "TEST_ERROR"
+        assert result.is_valid is False  # Errors invalidate
+
+    def test_merge_results(self):
+        """Test merging two validation results."""
+        result1 = ValidationResult(is_valid=True)
+        result1.add_warning("WARN1", "Warning 1")
+
+        result2 = ValidationResult(is_valid=True)
+        result2.add_error("ERR1", "Error 1")
+
+        merged = result1.merge(result2)
+
+        assert len(merged.issues) == 2
+        assert merged.is_valid is False
+        assert len(merged.warnings) == 1
+        assert len(merged.errors) == 1
+
+    def test_summary(self):
+        """Test summary generation."""
+        result = ValidationResult(is_valid=True)
+        result.add_warning("WARN", "A warning")
+        result.add_error("ERR", "An error")
+
+        summary = result.summary()
+        assert "Valid: False" in summary
+        assert "Warnings: 1" in summary
+        assert "Errors: 1" in summary
+
+
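These tests, together with the `__str__` check in TestValidationIssue just below, fully specify the small result types' behavior: warnings never flip is_valid, errors always do, merge concatenates issues and ANDs validity, and summary reports the counts. A sketch of one plausible design that satisfies every assertion, with warnings/errors as views over a single issues list — the actual earthcatalog/validation.py code is not shown in this diff:

```python
from dataclasses import dataclass, field


@dataclass
class ValidationIssueSketch:
    level: str
    code: str
    message: str
    context: dict = field(default_factory=dict)

    def __str__(self) -> str:
        # Matches the asserted "[WARNING] TEST_CODE: Test message" format.
        return f"[{self.level.upper()}] {self.code}: {self.message}"


@dataclass
class ValidationResultSketch:
    is_valid: bool
    issues: list = field(default_factory=list)
    metadata: dict = field(default_factory=dict)

    @property
    def warnings(self) -> list:
        return [i for i in self.issues if i.level == "warning"]

    @property
    def errors(self) -> list:
        return [i for i in self.issues if i.level == "error"]

    def add_warning(self, code: str, message: str, **context) -> None:
        # Warnings are recorded but do not flip is_valid.
        self.issues.append(ValidationIssueSketch("warning", code, message, context))

    def add_error(self, code: str, message: str, **context) -> None:
        # Any error invalidates the result.
        self.issues.append(ValidationIssueSketch("error", code, message, context))
        self.is_valid = False

    def merge(self, other: "ValidationResultSketch") -> "ValidationResultSketch":
        # Concatenate issues; validity is the AND of both sides.
        merged = ValidationResultSketch(is_valid=self.is_valid and other.is_valid)
        merged.issues = self.issues + other.issues
        return merged

    def summary(self) -> str:
        return (
            f"Valid: {self.is_valid} | "
            f"Warnings: {len(self.warnings)} | Errors: {len(self.errors)}"
        )
```

Deriving warnings and errors from one issues list keeps the three collections from drifting out of sync; storing them separately would pass these tests too.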
+class TestValidationIssue:
+    """Tests for ValidationIssue data class."""
+
+    def test_str_representation(self):
+        """Test string representation of issue."""
+        issue = ValidationIssue(
+            level="warning",
+            code="TEST_CODE",
+            message="Test message",
+            context={"key": "value"},
+        )
+        assert "[WARNING] TEST_CODE: Test message" == str(issue)
+
+
+class TestSTACItemValidation:
+    """Tests for STAC item validation."""
+
+    @pytest.fixture
+    def valid_stac_item(self):
+        """Create a valid STAC item for testing."""
+        return {
+            "type": "Feature",
+            "stac_version": "1.0.0",
+            "id": "test-item-001",
+            "geometry": {
+                "type": "Polygon",
+                "coordinates": [[[-10, -10], [10, -10], [10, 10], [-10, 10], [-10, -10]]],
+            },
+            "bbox": [-10, -10, 10, 10],
+            "properties": {
+                "datetime": "2024-01-15T10:00:00Z",
+            },
+            "links": [],
+            "assets": {},
+        }
+
+    def test_validate_valid_item(self, valid_stac_item):
+        """Test validation of a valid STAC item."""
+        result, corrected = validate_stac_item(valid_stac_item)
+
+        assert result.is_valid is True
+        assert len(result.errors) == 0
+        assert corrected is not None
+
+    def test_validate_missing_required_fields(self):
+        """Test validation catches missing required fields."""
+        item = {
+            "properties": {"datetime": "2024-01-15T10:00:00Z"},
+        }
+        result, corrected = validate_stac_item(item)
+
+        # Should have warnings for missing id, type, geometry
+        assert len(result.warnings) >= 2
+        warning_codes = [w.code for w in result.warnings]
+        assert "MISSING_FIELD" in warning_codes or "NULL_GEOMETRY" in warning_codes
+
+    def test_validate_invalid_type(self):
+        """Test validation warns on wrong type."""
+        item = {
+            "type": "Collection",  # Wrong type
+            "id": "test",
+            "geometry": {"type": "Point", "coordinates": [0, 0]},
+            "properties": {"datetime": "2024-01-15T10:00:00Z"},
+        }
+        result, _ = validate_stac_item(item)
+
+        warning_codes = [w.code for w in result.warnings]
+        assert "INVALID_TYPE" in warning_codes
+
+    def test_validate_null_geometry(self):
+        """Test validation handles null geometry."""
+        item = {
+            "type": "Feature",
+            "id": "test",
+            "geometry": None,
+            "properties": {"datetime": "2024-01-15T10:00:00Z"},
+        }
+        result, _ = validate_stac_item(item)
+
+        warning_codes = [w.code for w in result.warnings]
+        assert "NULL_GEOMETRY" in warning_codes
+
+    def test_validate_invalid_geometry_fixed(self):
+        """Test that invalid geometry can be fixed."""
+        # Self-intersecting polygon (bowtie)
+        item = {
+            "type": "Feature",
+            "id": "test",
+            "geometry": {
+                "type": "Polygon",
+                "coordinates": [[[0, 0], [10, 10], [10, 0], [0, 10], [0, 0]]],
+            },
+            "properties": {"datetime": "2024-01-15T10:00:00Z"},
+        }
+        result, corrected = validate_stac_item(item, fix_geometry=True)
+
+        # Should have warning about invalid geometry
+        warning_codes = [w.code for w in result.warnings]
+        assert "INVALID_GEOMETRY" in warning_codes
+
+        # Should have been fixed
+        if result.metadata.get("geometry_fixed"):
+            from shapely.geometry import shape
+
+            fixed_geom = shape(corrected["geometry"])
+            assert fixed_geom.is_valid
+
+    def test_validate_bbox_mismatch(self, valid_stac_item):
+        """Test validation catches bbox mismatch."""
+        # Set incorrect bbox
+        valid_stac_item["bbox"] = [0, 0, 5, 5]  # Doesn't match geometry
+
+        result, corrected = validate_stac_item(valid_stac_item)
+
+        warning_codes = [w.code for w in result.warnings]
+        assert "BBOX_MISMATCH" in warning_codes
+
+        # Should have corrected bbox
+        if result.metadata.get("bbox_corrected"):
+            assert corrected["bbox"] == [-10, -10, 10, 10]
+
+    def test_validate_missing_datetime(self):
+        """Test validation warns on missing datetime."""
+        item = {
+            "type": "Feature",
+            "id": "test",
+            "geometry": {"type": "Point", "coordinates": [0, 0]},
+            "properties": {},  # No datetime
+        }
+        result, _ = validate_stac_item(item)
+
+        warning_codes = [w.code for w in result.warnings]
+        assert "MISSING_DATETIME" in warning_codes
+
+    def test_validate_datetime_range_valid(self):
+        """Test validation accepts datetime range."""
+        item = {
+            "type": "Feature",
+            "id": "test",
+            "geometry": {"type": "Point", "coordinates": [0, 0]},
+            "properties": {
+                "datetime": None,
+                "start_datetime": "2024-01-01T00:00:00Z",
+                "end_datetime": "2024-01-31T23:59:59Z",
+            },
+        }
+        result, _ = validate_stac_item(item)
+
+        # Should NOT have MISSING_DATETIME warning
+        warning_codes = [w.code for w in result.warnings]
+        assert "MISSING_DATETIME" not in warning_codes
+
+
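The geometry tests leave the fixing strategy open: they only require that a valid geometry comes out and that a corrected bbox matches the geometry's extent. shapely's make_valid plus the geometry's bounds is the common way to meet both, and it handles the bowtie above. A sketch, assuming shapely >= 2.0; whether earthcatalog uses make_valid or another repair such as buffer(0) internally is not shown in this diff:

```python
from shapely.geometry import mapping, shape
from shapely.validation import make_valid

# The bowtie polygon from test_validate_invalid_geometry_fixed.
bowtie = shape({
    "type": "Polygon",
    "coordinates": [[[0, 0], [10, 10], [10, 0], [0, 10], [0, 0]]],
})
assert not bowtie.is_valid  # self-intersects at (5, 5)

fixed = make_valid(bowtie)  # typically a MultiPolygon of the two lobes
assert fixed.is_valid

# A corrected bbox is just the fixed geometry's bounds, as a list:
bbox = list(fixed.bounds)  # [minx, miny, maxx, maxy] -> [0.0, 0.0, 10.0, 10.0]

# Back to a GeoJSON-style dict for the corrected STAC item.
corrected_geometry = mapping(fixed)
```

The same bounds trick explains the BBOX_MISMATCH test: recomputing [-10, -10, 10, 10] from the fixture's polygon exposes the deliberately wrong [0, 0, 5, 5].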
+class TestBatchValidation:
+    """Tests for batch STAC item validation."""
+
+    def test_validate_batch(self):
+        """Test batch validation of multiple items."""
+        items = [
+            {
+                "type": "Feature",
+                "id": f"item-{i}",
+                "geometry": {"type": "Point", "coordinates": [i, i]},
+                "properties": {"datetime": "2024-01-15T10:00:00Z"},
+            }
+            for i in range(5)
+        ]
+
+        results, corrected = validate_stac_items_batch(items)
+
+        assert len(results) == 5
+        assert len(corrected) == 5
+        assert all(r.is_valid for r in results)
+
+
+class TestGeoParquetValidation:
+    """Tests for GeoParquet file validation."""
+
+    @pytest.fixture
+    def valid_geoparquet(self, tmp_path):
+        """Create a valid GeoParquet file for testing."""
+        # Create a GeoDataFrame
+        gdf = gpd.GeoDataFrame(
+            {
+                "id": ["item-1", "item-2", "item-3"],
+                "datetime": ["2024-01-01", "2024-01-02", "2024-01-03"],
+                "geometry": [
+                    Point(0, 0),
+                    Point(1, 1),
+                    Point(2, 2),
+                ],
+            },
+            crs="EPSG:4326",
+        )
+
+        file_path = tmp_path / "test.parquet"
+        gdf.to_parquet(file_path)
+        return file_path
+
+    def test_validate_valid_geoparquet(self, valid_geoparquet):
+        """Test validation of a valid GeoParquet file."""
+        result = validate_geoparquet_file(valid_geoparquet)
+
+        assert result.is_valid is True
+        assert "num_rows" in result.metadata
+        assert result.metadata["num_rows"] == 3
+
+    def test_validate_nonexistent_file(self, tmp_path):
+        """Test validation of nonexistent file."""
+        result = validate_geoparquet_file(tmp_path / "nonexistent.parquet")
+
+        assert result.is_valid is False
+        error_codes = [e.code for e in result.errors]
+        assert "FILE_NOT_FOUND" in error_codes
+
+    def test_validate_geoparquet_has_geo_metadata(self, valid_geoparquet):
+        """Test that geo metadata is validated."""
+        result = validate_geoparquet_file(valid_geoparquet)
+
+        # GeoPandas creates proper geo metadata
+        assert "primary_column" in result.metadata
+        assert result.metadata["primary_column"] == "geometry"
+
+    def test_get_geoparquet_metadata(self, valid_geoparquet):
+        """Test extracting geo metadata from file."""
+        metadata = get_geoparquet_metadata(valid_geoparquet)
+
+        assert "primary_column" in metadata
+        assert "columns" in metadata
+        assert "geometry" in metadata["columns"]
+
+
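get_geoparquet_metadata's internals aren't shown, but the keys asserted above (primary_column, columns) are the standard GeoParquet file metadata that geopandas writes on to_parquet. A sketch of reading that metadata directly with pyarrow; the file name is illustrative:

```python
import json

import pyarrow.parquet as pq

# GeoParquet writers (geopandas included) store a JSON blob under the b"geo"
# key of the Parquet file metadata; this is the structure the assertions above
# inspect.
geo = json.loads(pq.read_schema("test.parquet").metadata[b"geo"])

geo["primary_column"]                   # "geometry"
sorted(geo["columns"])                  # per-column entries, keyed by column name
geo["columns"]["geometry"]["encoding"]  # "WKB" in standard GeoParquet
```

Because the metadata lives in the schema footer, a validator can check it without deserializing any row groups, which keeps catalog-wide scans cheap.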
+class TestCatalogValidation:
+    """Tests for catalog-level validation."""
+
+    @pytest.fixture
+    def catalog_with_files(self, tmp_path):
+        """Create a catalog directory with multiple GeoParquet files."""
+        # Create subdirectory structure
+        partition1 = tmp_path / "partition1"
+        partition2 = tmp_path / "partition2"
+        partition1.mkdir()
+        partition2.mkdir()
+
+        # Create GeoDataFrames
+        for i, partition_dir in enumerate([partition1, partition2]):
+            gdf = gpd.GeoDataFrame(
+                {
+                    "id": [f"item-{i}-{j}" for j in range(3)],
+                    "geometry": [Point(j + i * 10, j) for j in range(3)],
+                },
+                crs="EPSG:4326",
+            )
+            gdf.to_parquet(partition_dir / "data.parquet")
+
+        return tmp_path
+
+    def test_validate_catalog(self, catalog_with_files):
+        """Test validating an entire catalog."""
+        result = validate_catalog(catalog_with_files)
+
+        assert result.total_files == 2
+        assert result.valid_files >= 0  # May have warnings but should parse
+        assert result.is_valid or result.warnings_count > 0
+
+    def test_validate_catalog_summary(self, catalog_with_files):
+        """Test catalog validation summary."""
+        result = validate_catalog(catalog_with_files)
+        summary = result.summary()
+
+        assert "Total files: 2" in summary
+
+    def test_validate_single_file_as_catalog(self, tmp_path):
+        """Test validating a single file through catalog interface."""
+        gdf = gpd.GeoDataFrame(
+            {"id": ["item-1"], "geometry": [Point(0, 0)]},
+            crs="EPSG:4326",
+        )
+        file_path = tmp_path / "single.parquet"
+        gdf.to_parquet(file_path)
+
+        result = validate_catalog(file_path)
+
+        assert result.total_files == 1
+
+
+class TestCatalogValidationResult:
+    """Tests for CatalogValidationResult data class."""
+
+    def test_add_file_result(self):
+        """Test adding file results."""
+        catalog_result = CatalogValidationResult()
+
+        file_result1 = ValidationResult(is_valid=True)
+        file_result1.add_warning("WARN", "warning")
+
+        file_result2 = ValidationResult(is_valid=False)
+        file_result2.add_error("ERR", "error")
+
+        catalog_result.add_file_result("file1.parquet", file_result1)
+        catalog_result.add_file_result("file2.parquet", file_result2)
+
+        assert catalog_result.total_files == 2
+        assert catalog_result.valid_files == 1
+        assert catalog_result.invalid_files == 1
+        assert catalog_result.warnings_count == 1
+        assert catalog_result.errors_count == 1
+        assert catalog_result.is_valid is False
+
+    def test_summary(self):
+        """Test catalog summary generation."""
+        catalog_result = CatalogValidationResult()
+        file_result = ValidationResult(is_valid=False)
+        file_result.add_error("ERR", "error message")
+        catalog_result.add_file_result("bad.parquet", file_result)
+
+        summary = catalog_result.summary()
+        assert "Invalid files: 1" in summary
+        assert "bad.parquet" in summary
+
+
+class TestIntegrationWithPipeline:
+    """Integration tests for validation with ingestion pipeline."""
+
+    def test_processing_config_has_validation_options(self):
+        """Test that ProcessingConfig has validation options."""
+        from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+        config = ProcessingConfig(
+            input_file="test.parquet",
+            output_catalog="./output",
+            scratch_location="./scratch",
+            enable_validation=True,
+            fix_invalid_geometry=True,
+            fix_bbox_mismatch=True,
+            bbox_tolerance=1e-6,
+            log_validation_warnings=True,
+        )
+
+        assert config.enable_validation is True
+        assert config.fix_invalid_geometry is True
+        assert config.fix_bbox_mismatch is True
+        assert config.bbox_tolerance == 1e-6
+        assert config.log_validation_warnings is True
+
+    def test_processing_config_validation_defaults(self):
+        """Test that ProcessingConfig has sensible validation defaults."""
+        from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+        config = ProcessingConfig(
+            input_file="test.parquet",
+            output_catalog="./output",
+            scratch_location="./scratch",
+        )
+
+        # Validation should be enabled by default
+        assert config.enable_validation is True
+        assert config.fix_invalid_geometry is True
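
The integration tests imply validation is on by default during ingestion, and the catalog tests show the same checks can run standalone after the fact. A usage sketch of the standalone path — the "./output" name is borrowed from the config above, and it is an assumption that validate_catalog accepts plain strings as well as the Path objects the tests pass:

```python
from earthcatalog.validation import validate_catalog

# Post-ingestion spot check, mirroring TestCatalogValidation above.
result = validate_catalog("./output")
print(result.summary())  # includes "Total files: ..." and any invalid files

if not result.is_valid:
    raise SystemExit(f"{result.invalid_files} invalid file(s) in catalog")
```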
|