earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,413 @@
1
+ """Tests for spatial partition resolver functionality."""
2
+
3
+ import json
4
+ import shutil
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
+ import pytest
9
+ from shapely.geometry import Point, box
10
+
11
+ # Import from earthcatalog package
12
+ from earthcatalog.spatial_resolver import (
13
+ SpatialPartitionResolver,
14
+ resolve_and_query,
15
+ spatial_resolver,
16
+ )
17
+
18
+
19
class TestSpatialPartitionResolver:
    """Unit tests for the SpatialPartitionResolver class."""

    def setup_method(self):
        """Create a scratch catalog directory and mock schema configurations."""
        self.temp_dir = tempfile.mkdtemp()
        self.catalog_path = Path(self.temp_dir) / "catalog"
        self.catalog_path.mkdir(parents=True, exist_ok=True)

        # H3-based schema with global partitioning enabled.
        self.h3_schema = {
            "spatial_partitioning": {
                "grid_system": "h3",
                "resolution": 6,
                "coordinate_system": "EPSG:4326",
                "example_paths": [
                    "testmission/partition=h3/level=6/86283082fffffff/year=2024/month=01/items.parquet",
                    "testmission/partition=h3/level=6/global/year=2024/month=01/items.parquet",
                ],
            },
            "global_partitioning": {
                "enabled": True,
                "threshold": 50,
                "description": "Items spanning >50 H3 cells go to /global/",
            },
        }

        # Custom-GeoJSON schema with global partitioning disabled.
        self.geojson_schema = {
            "spatial_partitioning": {
                "grid_system": "geojson",
                "custom_grid": True,
                "geojson_source": str(Path(__file__).parent.parent / "examples" / "custom_tiles.geojson"),
                "custom_tiles": {
                    "total_tiles": 3,
                    "tile_ids": ["region_a", "region_b", "region_c"],
                },
            },
            "global_partitioning": {"enabled": False},
        }

    def teardown_method(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_resolver_initialization(self):
        """Resolver picks up grid system, global config, and catalog path."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        assert resolver.grid_system == "h3"
        assert resolver.global_enabled is True
        assert resolver.global_threshold == 50
        assert resolver.catalog_path == self.catalog_path

    def test_h3_point_resolution(self):
        """A single point maps to exactly one H3 cell."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        point = Point(-122.4194, 37.7749)  # San Francisco
        result = resolver.resolve_partitions(point)

        assert len(result) == 1
        assert isinstance(result[0], str)
        assert len(result[0]) == 15  # length of an H3 cell ID
        assert "global" not in result  # a tiny query never needs /global/

    def test_h3_polygon_resolution(self):
        """A polygon resolves to the set of H3 cells it covers."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        bay = box(-122.5, 37.5, -122.0, 38.0)  # San Francisco Bay Area
        cells = resolver.resolve_partitions(bay, overlap=True)

        assert len(cells) > 10  # must span many cells
        assert all(isinstance(c, str) for c in cells if c != "global")
        # A moderate area may or may not include /global/ depending on cell count.

    def test_h3_buffer_cells(self):
        """buffer_cells expands the result around the base cell."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        point = Point(-122.4194, 37.7749)

        without_buffer = resolver.resolve_partitions(point, buffer_cells=0)
        with_buffer = resolver.resolve_partitions(point, buffer_cells=2)

        assert len(without_buffer) == 1
        assert len(with_buffer) > len(without_buffer)
        assert without_buffer[0] in with_buffer

    def test_global_partition_threshold_logic(self):
        """Only areas exceeding the cell-count threshold include /global/."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        # Tiny area: stays below the threshold.
        tiny = Point(-122.4194, 37.7749).buffer(0.01)
        tiny_result = resolver.resolve_partitions(tiny)
        assert "global" not in tiny_result

        # Statewide area: crosses the threshold.
        statewide = box(-125.0, 32.0, -115.0, 42.0)  # Entire California
        wide_result = resolver.resolve_partitions(statewide)
        assert "global" in wide_result
        assert len([c for c in wide_result if c != "global"]) > 50

    def test_global_partition_geometry_size_logic(self):
        """A continental-scale geometry always pulls in /global/."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        continental = box(-130.0, 25.0, -65.0, 50.0)
        result = resolver.resolve_partitions(continental)

        assert "global" in result
        # The large-geometry rule should fire even if the cell-count rule doesn't.

    def test_global_partition_manual_override(self):
        """include_global explicitly forces /global/ on or off."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        tiny = Point(-122.4194, 37.7749).buffer(0.01)
        statewide = box(-125.0, 32.0, -115.0, 42.0)

        # Forcing global on a tiny area.
        forced = resolver.resolve_partitions(tiny, include_global=True)
        assert "global" in forced

        # Suppressing global on a large area.
        suppressed = resolver.resolve_partitions(statewide, include_global=False)
        assert "global" not in suppressed

    def test_global_partition_disabled(self):
        """With global partitioning disabled, /global/ never appears."""
        no_global_schema = {
            "spatial_partitioning": {"grid_system": "h3", "resolution": 6},
            "global_partitioning": {"enabled": False},
        }

        resolver = SpatialPartitionResolver(no_global_schema, str(self.catalog_path))

        statewide = box(-125.0, 32.0, -115.0, 42.0)
        result = resolver.resolve_partitions(statewide)

        assert "global" not in result
        assert resolver.global_enabled is False

    def test_geojson_resolution(self):
        """Custom GeoJSON tiles resolve by intersection."""
        resolver = SpatialPartitionResolver(self.geojson_schema, str(self.catalog_path))

        # Should hit region_a (the northern region: -120..-110, 40..50).
        northern = box(-119, 41, -111, 49)
        hits = resolver.resolve_partitions(northern)

        assert "region_a" in hits
        assert len(hits) >= 1

        # An area spanning more than one region.
        wide = box(-119, 32, -105, 48)
        spanning = resolver.resolve_partitions(wide)

        assert len(spanning) > 1
        assert "region_a" in spanning
        assert "region_b" in spanning

    def test_path_existence_checking(self):
        """Only partitions that exist on disk are returned."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        all_ids = ["86283082fffffff", "862830827ffffff", "global"]
        created = all_ids[:2]  # deliberately leave the third missing

        base = self.catalog_path / "testmission" / "partition=h3" / "level=6"
        for cell_id in created:
            (base / cell_id).mkdir(parents=True, exist_ok=True)

        found = resolver.get_existing_partition_paths(all_ids)

        assert len(found) == len(created)
        for cell_id in created:
            assert any(cell_id in path for path in found)

    def test_query_path_generation(self):
        """Query patterns respect Hive-style temporal partitioning."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        ids = ["86283082fffffff", "global"]
        base = self.catalog_path / "testmission" / "partition=h3" / "level=6"
        for cell_id in ids:
            (base / cell_id).mkdir(parents=True, exist_ok=True)

        # No temporal filter: glob every items file under the partition.
        patterns = resolver.generate_query_paths(ids)
        assert len(patterns) == len(ids)
        assert all("/**/items.parquet" in p for p in patterns)

        # Temporal filter: converted into a Hive-style path segment.
        monthly = resolver.generate_query_paths(ids, "2024-01")
        assert len(monthly) == len(ids)
        assert all("year=2024/month=01/items.parquet" in p for p in monthly)

    def test_different_grid_systems(self):
        """Resolution works for each supported grid system."""
        point = Point(-122.4194, 37.7749)

        configs = {
            "h3": {"spatial_partitioning": {"grid_system": "h3", "resolution": 6}},
            "s2": {"spatial_partitioning": {"grid_system": "s2", "level": 13}},
            "utm": {"spatial_partitioning": {"grid_system": "utm", "precision": 1}},
            "latlon": {"spatial_partitioning": {"grid_system": "latlon", "cell_size_degrees": 0.1}},
        }

        for name, cfg in configs.items():
            cfg["global_partitioning"] = {"enabled": False}  # keep cases simple

            try:
                resolver = SpatialPartitionResolver(cfg, str(self.catalog_path))
                ids = resolver.resolve_partitions(point)

                assert len(ids) >= 1
                assert all(isinstance(i, str) for i in ids)

            except ImportError:
                # Some grid backends are optional extras.
                pytest.skip(f"Optional dependency for {name} grid not available")

    def test_effective_global_threshold(self):
        """Threshold comes from config, with per-resolution overrides honored."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        # Configured default.
        assert resolver._get_effective_global_threshold() == 50

        # Per-grid/per-resolution custom thresholds take precedence.
        overridden = self.h3_schema.copy()
        overridden["custom_thresholds"] = {"h3": {"6": 100, "7": 200}}

        custom_resolver = SpatialPartitionResolver(overridden, str(self.catalog_path))
        assert custom_resolver._get_effective_global_threshold() == 100  # H3 res 6 override

    def test_parse_temporal_filter_to_hive(self):
        """Temporal filter strings convert to Hive path fragments."""
        resolver = SpatialPartitionResolver(self.h3_schema, str(self.catalog_path))

        cases = {
            "2024": "year=2024",                            # year only
            "2024-01": "year=2024/month=01",                # year-month
            "2024-01-15": "year=2024/month=01/day=15",      # year-month-day
            "2024-*": "year=2024/*",                        # year with wildcard
            "2024-01-*": "year=2024/month=01/*",            # year-month with wildcard
            "*": "*",                                       # wildcard only
            "": "*",                                        # empty filter
            "2024-1": "year=2024/month=01",                 # single-digit month is padded
        }
        for raw, expected in cases.items():
            assert resolver._parse_temporal_filter_to_hive(raw) == expected
class TestConvenienceFunctions:
    """Tests for the module-level convenience helpers."""

    def setup_method(self):
        """Write a minimal schema file into a scratch directory."""
        self.temp_dir = tempfile.mkdtemp()
        self.schema_file = Path(self.temp_dir) / "schema.json"
        self.catalog_path = Path(self.temp_dir) / "catalog"

        # Minimal H3 schema with global partitioning enabled.
        schema = {
            "spatial_partitioning": {"grid_system": "h3", "resolution": 7},
            "global_partitioning": {"enabled": True, "threshold": 50},
        }
        self.schema_file.write_text(json.dumps(schema))

    def teardown_method(self):
        """Remove the scratch directory."""
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_create_resolver_from_schema_file(self):
        """spatial_resolver builds a resolver from a schema file path."""
        resolver = spatial_resolver(str(self.schema_file), str(self.catalog_path))

        assert isinstance(resolver, SpatialPartitionResolver)
        assert resolver.grid_system == "h3"
        assert resolver.spatial_config["resolution"] == 7

    def test_spatial_resolver_function(self):
        """spatial_resolver works with and without an explicit catalog path."""
        explicit = spatial_resolver(str(self.schema_file), str(self.catalog_path))

        assert isinstance(explicit, SpatialPartitionResolver)
        assert explicit.grid_system == "h3"
        assert explicit.spatial_config["resolution"] == 7

        # Omitting catalog_path defaults it to the schema's directory.
        defaulted = spatial_resolver(str(self.schema_file))
        assert isinstance(defaulted, SpatialPartitionResolver)

    def test_remote_schema_support(self):
        """Remote schema URLs require an explicit catalog_path."""
        remote_urls = [
            "s3://bucket/schema.json",
            "s3a://bucket/schema.json",
            "s3n://bucket/schema.json",
            "gcs://bucket/schema.json",
            "gs://bucket/schema.json",
            "https://example.com/schema.json",
            "abfs://container/schema.json",
        ]

        for url in remote_urls:
            with pytest.raises(ValueError, match="catalog_path must be explicitly provided"):
                spatial_resolver(url)

    def test_resolve_and_query_function(self):
        """resolve_and_query returns partition IDs plus a query string."""
        aoi = box(-122.5, 37.7, -122.0, 38.0)

        partition_ids, query = resolve_and_query(
            schema_path=str(self.schema_file),
            catalog_path=str(self.catalog_path),
            aoi_geometry=aoi,
            temporal_filter="2024-*",
            overlap=True,
        )

        assert isinstance(partition_ids, list)
        assert len(partition_ids) > 0
        assert isinstance(query, str)
        # An empty query string is expected here: no partition paths exist on disk.
374
class TestErrorHandling:
    """Test error handling and edge cases."""

    def test_unsupported_grid_system(self):
        """An unknown grid system raises ValueError at resolve time."""
        schema = {
            "spatial_partitioning": {"grid_system": "invalid_grid"},
            "global_partitioning": {"enabled": False},
        }

        # Construction succeeds; the failure surfaces on resolve_partitions.
        resolver = SpatialPartitionResolver(schema, "/tmp")

        with pytest.raises(ValueError, match="Unsupported grid system"):
            resolver.resolve_partitions(Point(0, 0))

    def test_missing_dependencies(self):
        """Test handling of missing optional dependencies."""
        # BUGFIX: this was a bare `pass`, which reports green in CI and hides
        # the fact that nothing is verified. Skip explicitly so the test
        # status is honest until import-failure mocking is implemented.
        pytest.skip("not implemented: requires mocking optional-dependency import failures")

    def test_invalid_geometry(self):
        """A malformed geometry dict raises shapely's GeometryTypeError."""
        from shapely.errors import GeometryTypeError

        schema = {
            "spatial_partitioning": {"grid_system": "h3", "resolution": 6},
            "global_partitioning": {"enabled": False},
        }

        resolver = SpatialPartitionResolver(schema, "/tmp")

        # A dict with an invalid GeoJSON "type" cannot be converted to a geometry.
        with pytest.raises(GeometryTypeError):
            resolver.resolve_partitions({"type": "Invalid"})
411
if __name__ == "__main__":
    # Allow running this file directly. BUGFIX: pytest.main() returns an exit
    # code that was previously discarded, so direct runs exited 0 even when
    # tests failed; propagate it so shells and CI see the real status.
    raise SystemExit(pytest.main([__file__]))