earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,245 @@
1
+ """Tests for file structure features."""
2
+
3
import shutil
import tempfile
import unittest
from pathlib import Path

from earthcatalog.ingestion_pipeline import LocalProcessor, ProcessingConfig, STACIngestionPipeline
from earthcatalog.spatial_resolver import SpatialPartitionResolver


11
class TestFileStructure(unittest.TestCase):
    """Test file structure features.

    Exercises mission extraction and sanitization, Hive-style partition
    key generation, output-format defaults/validation, and spatial
    resolver structure detection in the ingestion pipeline.
    """

    def setUp(self):
        """Set up test environment: scratch dir plus a minimal STAC item fixture."""
        self.temp_dir = Path(tempfile.mkdtemp())
        self.sample_stac_item = {
            "type": "Feature",
            "id": "test_001",
            "geometry": {"type": "Point", "coordinates": [-105.0, 40.0]},
            "properties": {
                "datetime": "2024-01-15T10:30:00Z",
                "dataset_id": "landsat8_test",
                "collection": "test-collection",
            },
        }

    def tearDown(self):
        """Clean up test environment (scratch dir from setUp)."""
        # ignore_errors so a partially-deleted tree doesn't fail the test run
        shutil.rmtree(self.temp_dir, ignore_errors=True)

    def test_mission_extraction_from_dataset_id(self):
        """Test mission extraction from dataset_id field."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            mission_field="dataset_id",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Test mission extraction
        mission = pipeline._extract_mission(self.sample_stac_item)
        self.assertEqual(mission, "landsat8_test")

    def test_mission_extraction_fallback_to_collection(self):
        """Test mission extraction falls back to collection field."""
        item_without_dataset_id = {
            "type": "Feature",
            "properties": {"datetime": "2024-01-15T10:30:00Z", "collection": "sentinel2-collection"},
        }

        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            mission_field="dataset_id",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Hyphen in "sentinel2-collection" is expected to be sanitized to "_"
        mission = pipeline._extract_mission(item_without_dataset_id)
        self.assertEqual(mission, "sentinel2_collection")

    def test_mission_sanitization(self):
        """Test mission name sanitization for filesystem compatibility."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Test various mission names that need sanitization
        # (input, expected sanitized output)
        test_cases = [
            ("Landsat-8 Collection", "landsat_8_collection"),
            ("MODIS/Terra", "modis_terra"),
            ("Sentinel-2A/B", "sentinel_2a_b"),
            ("Test@Collection#1", "test_collection_1"),
            ("___test___", "test"),
            ("", "unnamed"),
        ]

        for input_name, expected in test_cases:
            sanitized = pipeline._sanitize_mission_name(input_name)
            self.assertEqual(sanitized, expected)

    def test_partition_key_format(self):
        """Test new partition key format with Hive-style temporal partitioning."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            grid_resolution=2,
            temporal_bin="month",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        partition_key = pipeline._compute_partition_key(self.sample_stac_item)

        # Should follow Hive-style format: mission/partition=h3/level=2/cell_id/year=2024/month=01
        parts = partition_key.split("/")
        self.assertEqual(len(parts), 6)
        self.assertEqual(parts[0], "landsat8_test")  # mission
        self.assertEqual(parts[1], "partition=h3")  # partition type
        self.assertEqual(parts[2], "level=2")  # resolution level
        self.assertTrue(parts[3].startswith("8"))  # H3 cell (starts with 8)
        self.assertEqual(parts[4], "year=2024")  # Hive-style year
        self.assertEqual(parts[5], "month=01")  # Hive-style month

    def test_h3_level_2_default(self):
        """Test H3 level 2 as new default."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
        )

        # Should default to level 2
        self.assertEqual(config.grid_resolution, 2)

    def test_output_format_geoparquet_default(self):
        """Test that GeoParquet is the default output format."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
        )

        self.assertEqual(config.output_format, "geoparquet")

    def test_output_format_ndjson_option(self):
        """Test NDJSON as an output format option."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="ndjson",
        )

        self.assertEqual(config.output_format, "ndjson")

    def test_final_partition_path_geoparquet(self):
        """Test final partition path generation with GeoParquet using Hive-style."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="geoparquet",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Hive-style partition key
        partition_key = "landsat8_test/partition=h3/level=2/821f7ffffffffff/year=2024/month=01"
        expected_path = f"{config.output_catalog}/{partition_key}/items.parquet"

        result_path = pipeline._get_final_partition_path(partition_key)
        self.assertEqual(result_path, expected_path)

    def test_final_partition_path_ndjson(self):
        """Test final partition path generation with NDJSON using Hive-style."""
        config = ProcessingConfig(
            input_file=str(self.temp_dir / "test.parquet"),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="ndjson",
        )

        processor = LocalProcessor(n_workers=1)
        pipeline = STACIngestionPipeline(config, processor)

        # Hive-style partition key
        partition_key = "landsat8_test/partition=h3/level=2/821f7ffffffffff/year=2024/month=01"
        expected_path = f"{config.output_catalog}/{partition_key}/items.ndjson"

        result_path = pipeline._get_final_partition_path(partition_key)
        self.assertEqual(result_path, expected_path)

    def test_spatial_resolver_structure_detection(self):
        """Test spatial resolver detects file structure."""
        # Create a schema describing the catalog layout; the resolver is
        # expected to infer the mission names from example_paths.
        schema = {
            "spatial_partitioning": {
                "grid_system": "h3",
                "resolution": 2,
                "partitioning_scheme": "default",
                "structure": "/{mission}/partition={grid}/level={resolution}/{spatial_id}/{temporal}.parquet",
                "example_paths": [
                    "landsat8/partition=h3/level=2/821f7ffffffffff/2024-01.parquet",
                    "sentinel2/partition=h3/level=2/821f7ffffffffff/2024-01.parquet",
                ],
            },
            "global_partitioning": {"enabled": True, "threshold": 1},
        }

        resolver = SpatialPartitionResolver(schema, str(self.temp_dir))

        self.assertIn("landsat8", resolver.missions)
        self.assertIn("sentinel2", resolver.missions)

    def test_config_validation_output_format(self):
        """Test configuration validation for output format."""
        # Create test file
        test_file = self.temp_dir / "test.parquet"
        test_file.write_text("")  # Create empty file for validation

        # Valid format should not raise
        config = ProcessingConfig(
            input_file=str(test_file),
            output_catalog=str(self.temp_dir / "catalog"),
            scratch_location=str(self.temp_dir / "scratch"),
            output_format="geoparquet",
        )
        # This should not raise an exception
        try:
            config.validate()
        except ValueError:
            self.fail("Valid configuration should not raise ValueError")

        # Invalid format should raise
        with self.assertRaises(ValueError) as context:
            config = ProcessingConfig(
                input_file=str(self.temp_dir / "test.parquet"),
                output_catalog=str(self.temp_dir / "catalog"),
                scratch_location=str(self.temp_dir / "scratch"),
                output_format="invalid_format",
            )
            config.validate()

        self.assertIn("output_format must be 'geoparquet' or 'ndjson'", str(context.exception))
244
# Allow running this test module directly (e.g. `python test_file_structure.py`).
if __name__ == "__main__":
    unittest.main()