earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_storage_backends.py
@@ -0,0 +1,236 @@
+ # test_storage_backends.py
+ """Tests for storage backend enhancements.
+
+ This module tests the new storage backend methods:
+ - get_etag: Get file checksum/ETag for change detection
+ - rmtree: Recursive directory deletion
+ - list_files: List files in a directory with pattern matching
+ - list_dirs: List immediate subdirectories
+ """
+
+ import tempfile
+ from pathlib import Path
+
+ import pytest
+
+ from earthcatalog.storage_backends import LocalStorage, get_storage_backend
+
+
+ class TestLocalStorageEnhancements:
+     """Test LocalStorage new methods."""
+
+     @pytest.fixture
+     def temp_dir(self):
+         """Create a temporary directory for testing."""
+         with tempfile.TemporaryDirectory() as tmpdir:
+             yield tmpdir
+
+     @pytest.fixture
+     def local_storage(self, temp_dir):
+         """Create a LocalStorage instance."""
+         return LocalStorage(temp_dir)
+
+     def test_get_etag_returns_hash_for_existing_file(self, local_storage, temp_dir):
+         """get_etag should return a hash string for existing files."""
+         # Create a test file
+         test_file = Path(temp_dir) / "test.txt"
+         test_file.write_text("hello world")
+
+         etag = local_storage.get_etag(str(test_file))
+
+         assert etag is not None
+         assert isinstance(etag, str)
+         assert len(etag) > 0
+
+     def test_get_etag_returns_none_for_missing_file(self, local_storage, temp_dir):
+         """get_etag should return None for non-existent files."""
+         missing_file = Path(temp_dir) / "nonexistent.txt"
+
+         etag = local_storage.get_etag(str(missing_file))
+
+         assert etag is None
+
+     def test_get_etag_different_content_different_hash(self, local_storage, temp_dir):
+         """Different file contents should produce different ETags."""
+         file1 = Path(temp_dir) / "file1.txt"
+         file2 = Path(temp_dir) / "file2.txt"
+         file1.write_text("content one")
+         file2.write_text("content two")
+
+         etag1 = local_storage.get_etag(str(file1))
+         etag2 = local_storage.get_etag(str(file2))
+
+         assert etag1 != etag2
+
+     def test_get_etag_same_content_same_hash(self, local_storage, temp_dir):
+         """Same file contents should produce same ETags."""
+         file1 = Path(temp_dir) / "file1.txt"
+         file2 = Path(temp_dir) / "file2.txt"
+         file1.write_text("identical content")
+         file2.write_text("identical content")
+
+         etag1 = local_storage.get_etag(str(file1))
+         etag2 = local_storage.get_etag(str(file2))
+
+         assert etag1 == etag2
+
+     def test_rmtree_removes_directory_recursively(self, local_storage, temp_dir):
+         """rmtree should remove directory and all contents."""
+         # Create nested directory structure
+         nested_dir = Path(temp_dir) / "parent" / "child"
+         nested_dir.mkdir(parents=True)
+         (nested_dir / "file.txt").write_text("content")
+         (Path(temp_dir) / "parent" / "sibling.txt").write_text("other")
+
+         parent_path = str(Path(temp_dir) / "parent")
+         local_storage.rmtree(parent_path)
+
+         assert not Path(parent_path).exists()
+
+     def test_rmtree_handles_missing_directory(self, local_storage, temp_dir):
+         """rmtree should not raise error for non-existent directory."""
+         missing_dir = Path(temp_dir) / "does_not_exist"
+
+         # Should not raise
+         local_storage.rmtree(str(missing_dir))
+
+     def test_rmtree_handles_empty_directory(self, local_storage, temp_dir):
+         """rmtree should handle empty directories."""
+         empty_dir = Path(temp_dir) / "empty"
+         empty_dir.mkdir()
+
+         local_storage.rmtree(str(empty_dir))
+
+         assert not empty_dir.exists()
+
+     def test_list_files_returns_matching_files(self, local_storage, temp_dir):
+         """list_files should return files matching the pattern."""
+         # Create test files
+         (Path(temp_dir) / "data1.parquet").write_text("data")
+         (Path(temp_dir) / "data2.parquet").write_text("data")
+         (Path(temp_dir) / "readme.txt").write_text("text")
+
+         files = local_storage.list_files(temp_dir, "*.parquet")
+
+         assert len(files) == 2
+         assert all(".parquet" in f for f in files)
+
+     def test_list_files_returns_empty_for_no_matches(self, local_storage, temp_dir):
+         """list_files should return empty list when no files match."""
+         (Path(temp_dir) / "data.txt").write_text("data")
+
+         files = local_storage.list_files(temp_dir, "*.parquet")
+
+         assert files == []
+
+     def test_list_files_handles_missing_directory(self, local_storage, temp_dir):
+         """list_files should return empty list for non-existent directory."""
+         missing_dir = Path(temp_dir) / "nonexistent"
+
+         files = local_storage.list_files(str(missing_dir), "*")
+
+         assert files == []
+
+     def test_list_files_recursive_with_glob_pattern(self, local_storage, temp_dir):
+         """list_files with ** should match files recursively."""
+         # Create nested structure
+         subdir = Path(temp_dir) / "subdir"
+         subdir.mkdir()
+         (Path(temp_dir) / "root.parquet").write_text("data")
+         (subdir / "nested.parquet").write_text("data")
+
+         files = local_storage.list_files(temp_dir, "**/*.parquet")
+
+         # Should find at least the nested file; ** may also match root.parquet
+         assert len(files) >= 1
+
+     def test_list_files_default_pattern_matches_all(self, local_storage, temp_dir):
+         """list_files with default pattern should match all files."""
+         (Path(temp_dir) / "file1.txt").write_text("data")
+         (Path(temp_dir) / "file2.json").write_text("data")
+
+         files = local_storage.list_files(temp_dir)
+
+         assert len(files) == 2
+
+     def test_list_dirs_returns_subdirectories(self, local_storage, temp_dir):
+         """list_dirs should return immediate subdirectories."""
+         # Create subdirectories
+         (Path(temp_dir) / "subdir1").mkdir()
+         (Path(temp_dir) / "subdir2").mkdir()
+         # Create a file (should not be included)
+         (Path(temp_dir) / "file.txt").write_text("data")
+
+         dirs = local_storage.list_dirs(temp_dir)
+
+         assert len(dirs) == 2
+         dir_names = [Path(d).name for d in dirs]
+         assert "subdir1" in dir_names
+         assert "subdir2" in dir_names
+
+     def test_list_dirs_returns_empty_for_no_subdirs(self, local_storage, temp_dir):
+         """list_dirs should return empty list when no subdirectories exist."""
+         # Create only files
+         (Path(temp_dir) / "file1.txt").write_text("data")
+         (Path(temp_dir) / "file2.txt").write_text("data")
+
+         dirs = local_storage.list_dirs(temp_dir)
+
+         assert dirs == []
+
+     def test_list_dirs_handles_missing_directory(self, local_storage, temp_dir):
+         """list_dirs should return empty list for non-existent directory."""
+         missing_dir = Path(temp_dir) / "nonexistent"
+
+         dirs = local_storage.list_dirs(str(missing_dir))
+
+         assert dirs == []
+
+     def test_list_dirs_does_not_recurse(self, local_storage, temp_dir):
+         """list_dirs should only return immediate children, not nested."""
+         # Create nested structure
+         (Path(temp_dir) / "parent").mkdir()
+         (Path(temp_dir) / "parent" / "child").mkdir()
+
+         dirs = local_storage.list_dirs(temp_dir)
+
+         assert len(dirs) == 1
+         assert Path(dirs[0]).name == "parent"
+
+
+ class TestGetStorageBackend:
+     """Test the storage backend factory function."""
+
+     def test_local_path_returns_local_storage(self):
+         """Local paths should return LocalStorage."""
+         storage = get_storage_backend("/tmp/test")
+         assert isinstance(storage, LocalStorage)
+
+     def test_relative_path_returns_local_storage(self):
+         """Relative paths should return LocalStorage."""
+         storage = get_storage_backend("./catalog")
+         assert isinstance(storage, LocalStorage)
+
+
+ # S3 tests would require moto or similar mocking
+ class TestS3StorageEnhancements:
+     """Test S3Storage new methods (requires mocking or live S3)."""
+
+     @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+     def test_get_etag_returns_s3_etag(self):
+         """S3 get_etag should return the S3 ETag header value."""
+         pass
+
+     @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+     def test_rmtree_deletes_s3_prefix(self):
+         """S3 rmtree should delete all objects with the given prefix."""
+         pass
+
+     @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+     def test_list_files_lists_s3_objects(self):
+         """S3 list_files should list objects matching pattern."""
+         pass
+
+     @pytest.mark.skip(reason="Requires S3 credentials or mocking")
+     def test_list_dirs_lists_s3_prefixes(self):
+         """S3 list_dirs should list directory-like prefixes."""
+         pass
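
The suite above pins down the LocalStorage contract: content-derived ETags, an rmtree that tolerates missing paths, and glob-based listing that returns empty lists rather than raising. A minimal sketch that would satisfy it, assuming an MD5 content hash (which the tests do not mandate) and the hypothetical name SketchLocalStorage; the shipped earthcatalog/storage_backends.py may differ:

from __future__ import annotations

import hashlib
import shutil
from pathlib import Path


class SketchLocalStorage:
    """Hypothetical stand-in for LocalStorage; illustrates the tested contract only."""

    def __init__(self, base_path: str):
        self.base_path = Path(base_path)

    def get_etag(self, path: str) -> str | None:
        # A content hash doubles as an ETag; None signals "file missing".
        p = Path(path)
        if not p.is_file():
            return None
        return hashlib.md5(p.read_bytes()).hexdigest()

    def rmtree(self, path: str) -> None:
        # Recursive delete that silently ignores a missing directory.
        shutil.rmtree(path, ignore_errors=True)

    def list_files(self, path: str, pattern: str = "*") -> list[str]:
        # Empty list, not an exception, for a missing directory.
        p = Path(path)
        if not p.is_dir():
            return []
        return [str(f) for f in p.glob(pattern) if f.is_file()]

    def list_dirs(self, path: str) -> list[str]:
        # Immediate children only; recursive listing goes through list_files("**/...").
        p = Path(path)
        if not p.is_dir():
            return []
        return [str(d) for d in p.iterdir() if d.is_dir()]
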
earthcatalog/tests/test_validation.py
@@ -0,0 +1,435 @@
+ """Tests for STAC GeoParquet validation module."""
+
+ import geopandas as gpd
+ import pytest
+ from shapely.geometry import Point
+
+ from earthcatalog.validation import (
+     CatalogValidationResult,
+     ValidationIssue,
+     ValidationResult,
+     get_geoparquet_metadata,
+     validate_catalog,
+     validate_geoparquet_file,
+     validate_stac_item,
+     validate_stac_items_batch,
+ )
+
+
+ class TestValidationResult:
+     """Tests for ValidationResult data class."""
+
+     def test_validation_result_defaults(self):
+         """Test that ValidationResult has correct defaults."""
+         result = ValidationResult(is_valid=True)
+         assert result.is_valid is True
+         assert result.issues == []
+         assert result.metadata == {}
+         assert result.warnings == []
+         assert result.errors == []
+
+     def test_add_warning(self):
+         """Test adding warnings to result."""
+         result = ValidationResult(is_valid=True)
+         result.add_warning("TEST_WARNING", "Test message", key="value")
+
+         assert len(result.warnings) == 1
+         assert result.warnings[0].code == "TEST_WARNING"
+         assert result.warnings[0].message == "Test message"
+         assert result.warnings[0].context == {"key": "value"}
+         assert result.is_valid is True  # Warnings don't invalidate
+
+     def test_add_error(self):
+         """Test adding errors to result."""
+         result = ValidationResult(is_valid=True)
+         result.add_error("TEST_ERROR", "Test error message")
+
+         assert len(result.errors) == 1
+         assert result.errors[0].code == "TEST_ERROR"
+         assert result.is_valid is False  # Errors invalidate
+
+     def test_merge_results(self):
+         """Test merging two validation results."""
+         result1 = ValidationResult(is_valid=True)
+         result1.add_warning("WARN1", "Warning 1")
+
+         result2 = ValidationResult(is_valid=True)
+         result2.add_error("ERR1", "Error 1")
+
+         merged = result1.merge(result2)
+
+         assert len(merged.issues) == 2
+         assert merged.is_valid is False
+         assert len(merged.warnings) == 1
+         assert len(merged.errors) == 1
+
+     def test_summary(self):
+         """Test summary generation."""
+         result = ValidationResult(is_valid=True)
+         result.add_warning("WARN", "A warning")
+         result.add_error("ERR", "An error")
+
+         summary = result.summary()
+         assert "Valid: False" in summary
+         assert "Warnings: 1" in summary
+         assert "Errors: 1" in summary
+
+
+ class TestValidationIssue:
+     """Tests for ValidationIssue data class."""
+
+     def test_str_representation(self):
+         """Test string representation of issue."""
+         issue = ValidationIssue(
+             level="warning",
+             code="TEST_CODE",
+             message="Test message",
+             context={"key": "value"},
+         )
+         assert "[WARNING] TEST_CODE: Test message" == str(issue)
+
+
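The two test classes above fix the observable shape of ValidationIssue and ValidationResult: warnings accumulate without invalidating, errors flip is_valid, merge concatenates issues, and summary reports counts. A sketch of that implied contract (the Sketch* names are hypothetical; the shipped earthcatalog/validation.py may store state differently):

from dataclasses import dataclass, field
from typing import Any


@dataclass
class SketchValidationIssue:
    level: str
    code: str
    message: str
    context: dict[str, Any] = field(default_factory=dict)

    def __str__(self) -> str:
        return f"[{self.level.upper()}] {self.code}: {self.message}"


@dataclass
class SketchValidationResult:
    is_valid: bool
    issues: list[SketchValidationIssue] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def warnings(self):
        return [i for i in self.issues if i.level == "warning"]

    @property
    def errors(self):
        return [i for i in self.issues if i.level == "error"]

    def add_warning(self, code, message, **context):
        # Warnings are recorded but never flip is_valid.
        self.issues.append(SketchValidationIssue("warning", code, message, context))

    def add_error(self, code, message, **context):
        # Any error invalidates the result.
        self.issues.append(SketchValidationIssue("error", code, message, context))
        self.is_valid = False

    def merge(self, other):
        merged = SketchValidationResult(is_valid=self.is_valid and other.is_valid)
        merged.issues = self.issues + other.issues
        return merged

    def summary(self) -> str:
        return (
            f"Valid: {self.is_valid}, "
            f"Warnings: {len(self.warnings)}, Errors: {len(self.errors)}"
        )
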
+ class TestSTACItemValidation:
+     """Tests for STAC item validation."""
+
+     @pytest.fixture
+     def valid_stac_item(self):
+         """Create a valid STAC item for testing."""
+         return {
+             "type": "Feature",
+             "stac_version": "1.0.0",
+             "id": "test-item-001",
+             "geometry": {
+                 "type": "Polygon",
+                 "coordinates": [[[-10, -10], [10, -10], [10, 10], [-10, 10], [-10, -10]]],
+             },
+             "bbox": [-10, -10, 10, 10],
+             "properties": {
+                 "datetime": "2024-01-15T10:00:00Z",
+             },
+             "links": [],
+             "assets": {},
+         }
+
+     def test_validate_valid_item(self, valid_stac_item):
+         """Test validation of a valid STAC item."""
+         result, corrected = validate_stac_item(valid_stac_item)
+
+         assert result.is_valid is True
+         assert len(result.errors) == 0
+         assert corrected is not None
+
+     def test_validate_missing_required_fields(self):
+         """Test validation catches missing required fields."""
+         item = {
+             "properties": {"datetime": "2024-01-15T10:00:00Z"},
+         }
+         result, corrected = validate_stac_item(item)
+
+         # Should have warnings for missing id, type, geometry
+         assert len(result.warnings) >= 2
+         warning_codes = [w.code for w in result.warnings]
+         assert "MISSING_FIELD" in warning_codes or "NULL_GEOMETRY" in warning_codes
+
+     def test_validate_invalid_type(self):
+         """Test validation warns on wrong type."""
+         item = {
+             "type": "Collection",  # Wrong type
+             "id": "test",
+             "geometry": {"type": "Point", "coordinates": [0, 0]},
+             "properties": {"datetime": "2024-01-15T10:00:00Z"},
+         }
+         result, _ = validate_stac_item(item)
+
+         warning_codes = [w.code for w in result.warnings]
+         assert "INVALID_TYPE" in warning_codes
+
+     def test_validate_null_geometry(self):
+         """Test validation handles null geometry."""
+         item = {
+             "type": "Feature",
+             "id": "test",
+             "geometry": None,
+             "properties": {"datetime": "2024-01-15T10:00:00Z"},
+         }
+         result, _ = validate_stac_item(item)
+
+         warning_codes = [w.code for w in result.warnings]
+         assert "NULL_GEOMETRY" in warning_codes
+
+     def test_validate_invalid_geometry_fixed(self):
+         """Test that invalid geometry can be fixed."""
+         # Self-intersecting polygon (bowtie)
+         item = {
+             "type": "Feature",
+             "id": "test",
+             "geometry": {
+                 "type": "Polygon",
+                 "coordinates": [[[0, 0], [10, 10], [10, 0], [0, 10], [0, 0]]],
+             },
+             "properties": {"datetime": "2024-01-15T10:00:00Z"},
+         }
+         result, corrected = validate_stac_item(item, fix_geometry=True)
+
+         # Should have warning about invalid geometry
+         warning_codes = [w.code for w in result.warnings]
+         assert "INVALID_GEOMETRY" in warning_codes
+
+         # Should have been fixed
+         if result.metadata.get("geometry_fixed"):
+             from shapely.geometry import shape
+
+             fixed_geom = shape(corrected["geometry"])
+             assert fixed_geom.is_valid
+
+     def test_validate_bbox_mismatch(self, valid_stac_item):
+         """Test validation catches bbox mismatch."""
+         # Set incorrect bbox
+         valid_stac_item["bbox"] = [0, 0, 5, 5]  # Doesn't match geometry
+
+         result, corrected = validate_stac_item(valid_stac_item)
+
+         warning_codes = [w.code for w in result.warnings]
+         assert "BBOX_MISMATCH" in warning_codes
+
+         # Should have corrected bbox
+         if result.metadata.get("bbox_corrected"):
+             assert corrected["bbox"] == [-10, -10, 10, 10]
+
+     def test_validate_missing_datetime(self):
+         """Test validation warns on missing datetime."""
+         item = {
+             "type": "Feature",
+             "id": "test",
+             "geometry": {"type": "Point", "coordinates": [0, 0]},
+             "properties": {},  # No datetime
+         }
+         result, _ = validate_stac_item(item)
+
+         warning_codes = [w.code for w in result.warnings]
+         assert "MISSING_DATETIME" in warning_codes
+
+     def test_validate_datetime_range_valid(self):
+         """Test validation accepts datetime range."""
+         item = {
+             "type": "Feature",
+             "id": "test",
+             "geometry": {"type": "Point", "coordinates": [0, 0]},
+             "properties": {
+                 "datetime": None,
+                 "start_datetime": "2024-01-01T00:00:00Z",
+                 "end_datetime": "2024-01-31T23:59:59Z",
+             },
+         }
+         result, _ = validate_stac_item(item)
+
+         # Should NOT have MISSING_DATETIME warning
+         warning_codes = [w.code for w in result.warnings]
+         assert "MISSING_DATETIME" not in warning_codes
+
+
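The BBOX_MISMATCH and bbox_corrected expectations above imply that validation recomputes the bbox from the geometry and compares the declared value within a tolerance. A sketch of that check with shapely (check_bbox is a hypothetical helper, not part of the package):

from shapely.geometry import shape


def check_bbox(item: dict, tol: float = 1e-6) -> tuple[bool, list[float]]:
    """Return (declared bbox matches geometry, recomputed bbox)."""
    geom = shape(item["geometry"])
    recomputed = list(geom.bounds)  # (minx, miny, maxx, maxy)
    declared = item.get("bbox")
    matches = declared is not None and all(
        abs(a - b) <= tol for a, b in zip(declared, recomputed)
    )
    return matches, recomputed


# For the fixture above: declared [0, 0, 5, 5] vs recomputed [-10, -10, 10, 10]
# fails the comparison, which is exactly the BBOX_MISMATCH case the test asserts.
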
+ class TestBatchValidation:
+     """Tests for batch STAC item validation."""
+
+     def test_validate_batch(self):
+         """Test batch validation of multiple items."""
+         items = [
+             {
+                 "type": "Feature",
+                 "id": f"item-{i}",
+                 "geometry": {"type": "Point", "coordinates": [i, i]},
+                 "properties": {"datetime": "2024-01-15T10:00:00Z"},
+             }
+             for i in range(5)
+         ]
+
+         results, corrected = validate_stac_items_batch(items)
+
+         assert len(results) == 5
+         assert len(corrected) == 5
+         assert all(r.is_valid for r in results)
+
+
+ class TestGeoParquetValidation:
+     """Tests for GeoParquet file validation."""
+
+     @pytest.fixture
+     def valid_geoparquet(self, tmp_path):
+         """Create a valid GeoParquet file for testing."""
+         # Create a GeoDataFrame
+         gdf = gpd.GeoDataFrame(
+             {
+                 "id": ["item-1", "item-2", "item-3"],
+                 "datetime": ["2024-01-01", "2024-01-02", "2024-01-03"],
+                 "geometry": [
+                     Point(0, 0),
+                     Point(1, 1),
+                     Point(2, 2),
+                 ],
+             },
+             crs="EPSG:4326",
+         )
+
+         file_path = tmp_path / "test.parquet"
+         gdf.to_parquet(file_path)
+         return file_path
+
+     def test_validate_valid_geoparquet(self, valid_geoparquet):
+         """Test validation of a valid GeoParquet file."""
+         result = validate_geoparquet_file(valid_geoparquet)
+
+         assert result.is_valid is True
+         assert "num_rows" in result.metadata
+         assert result.metadata["num_rows"] == 3
+
+     def test_validate_nonexistent_file(self, tmp_path):
+         """Test validation of nonexistent file."""
+         result = validate_geoparquet_file(tmp_path / "nonexistent.parquet")
+
+         assert result.is_valid is False
+         error_codes = [e.code for e in result.errors]
+         assert "FILE_NOT_FOUND" in error_codes
+
+     def test_validate_geoparquet_has_geo_metadata(self, valid_geoparquet):
+         """Test that geo metadata is validated."""
+         result = validate_geoparquet_file(valid_geoparquet)
+
+         # GeoPandas creates proper geo metadata
+         assert "primary_column" in result.metadata
+         assert result.metadata["primary_column"] == "geometry"
+
+     def test_get_geoparquet_metadata(self, valid_geoparquet):
+         """Test extracting geo metadata from file."""
+         metadata = get_geoparquet_metadata(valid_geoparquet)
+
+         assert "primary_column" in metadata
+         assert "columns" in metadata
+         assert "geometry" in metadata["columns"]
+
+
+ class TestCatalogValidation:
+     """Tests for catalog-level validation."""
+
+     @pytest.fixture
+     def catalog_with_files(self, tmp_path):
+         """Create a catalog directory with multiple GeoParquet files."""
+         # Create subdirectory structure
+         partition1 = tmp_path / "partition1"
+         partition2 = tmp_path / "partition2"
+         partition1.mkdir()
+         partition2.mkdir()
+
+         # Create GeoDataFrames
+         for i, partition_dir in enumerate([partition1, partition2]):
+             gdf = gpd.GeoDataFrame(
+                 {
+                     "id": [f"item-{i}-{j}" for j in range(3)],
+                     "geometry": [Point(j + i * 10, j) for j in range(3)],
+                 },
+                 crs="EPSG:4326",
+             )
+             gdf.to_parquet(partition_dir / "data.parquet")
+
+         return tmp_path
+
+     def test_validate_catalog(self, catalog_with_files):
+         """Test validating an entire catalog."""
+         result = validate_catalog(catalog_with_files)
+
+         assert result.total_files == 2
+         assert result.valid_files >= 0  # May have warnings but should parse
+         assert result.is_valid or result.warnings_count > 0
+
+     def test_validate_catalog_summary(self, catalog_with_files):
+         """Test catalog validation summary."""
+         result = validate_catalog(catalog_with_files)
+         summary = result.summary()
+
+         assert "Total files: 2" in summary
+
+     def test_validate_single_file_as_catalog(self, tmp_path):
+         """Test validating a single file through catalog interface."""
+         gdf = gpd.GeoDataFrame(
+             {"id": ["item-1"], "geometry": [Point(0, 0)]},
+             crs="EPSG:4326",
+         )
+         file_path = tmp_path / "single.parquet"
+         gdf.to_parquet(file_path)
+
+         result = validate_catalog(file_path)
+
+         assert result.total_files == 1
+
+
+ class TestCatalogValidationResult:
+     """Tests for CatalogValidationResult data class."""
+
+     def test_add_file_result(self):
+         """Test adding file results."""
+         catalog_result = CatalogValidationResult()
+
+         file_result1 = ValidationResult(is_valid=True)
+         file_result1.add_warning("WARN", "warning")
+
+         file_result2 = ValidationResult(is_valid=False)
+         file_result2.add_error("ERR", "error")
+
+         catalog_result.add_file_result("file1.parquet", file_result1)
+         catalog_result.add_file_result("file2.parquet", file_result2)
+
+         assert catalog_result.total_files == 2
+         assert catalog_result.valid_files == 1
+         assert catalog_result.invalid_files == 1
+         assert catalog_result.warnings_count == 1
+         assert catalog_result.errors_count == 1
+         assert catalog_result.is_valid is False
+
+     def test_summary(self):
+         """Test catalog summary generation."""
+         catalog_result = CatalogValidationResult()
+         file_result = ValidationResult(is_valid=False)
+         file_result.add_error("ERR", "error message")
+         catalog_result.add_file_result("bad.parquet", file_result)
+
+         summary = catalog_result.summary()
+         assert "Invalid files: 1" in summary
+         assert "bad.parquet" in summary
+
+
+ class TestIntegrationWithPipeline:
+     """Integration tests for validation with ingestion pipeline."""
+
+     def test_processing_config_has_validation_options(self):
+         """Test that ProcessingConfig has validation options."""
+         from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+         config = ProcessingConfig(
+             input_file="test.parquet",
+             output_catalog="./output",
+             scratch_location="./scratch",
+             enable_validation=True,
+             fix_invalid_geometry=True,
+             fix_bbox_mismatch=True,
+             bbox_tolerance=1e-6,
+             log_validation_warnings=True,
+         )
+
+         assert config.enable_validation is True
+         assert config.fix_invalid_geometry is True
+         assert config.fix_bbox_mismatch is True
+         assert config.bbox_tolerance == 1e-6
+         assert config.log_validation_warnings is True
+
+     def test_processing_config_validation_defaults(self):
+         """Test that ProcessingConfig has sensible validation defaults."""
+         from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+         config = ProcessingConfig(
+             input_file="test.parquet",
+             output_catalog="./output",
+             scratch_location="./scratch",
+         )
+
+         # Validation should be enabled by default
+         assert config.enable_validation is True
+         assert config.fix_invalid_geometry is True
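
Taken together, the two test modules document a small public surface. A usage sketch built only from calls exercised above; the item and paths are illustrative:

from earthcatalog.validation import validate_catalog, validate_stac_item

item = {
    "type": "Feature",
    "id": "demo-001",
    "geometry": {"type": "Point", "coordinates": [12.5, 41.9]},
    "properties": {"datetime": "2024-01-15T10:00:00Z"},
}

# Validate one item; the second return value is the (possibly corrected) item.
result, corrected = validate_stac_item(item, fix_geometry=True)
for issue in result.warnings:
    print(issue)

# Validate every GeoParquet file under a catalog root and print the report.
catalog_result = validate_catalog("./output")
print(catalog_result.summary())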