earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,477 @@
1
+ """Tests for the statistics module."""
2
+
3
+ import time
4
+
5
+ import pytest
6
+
7
+ from earthcatalog.statistics import HyperLogLog, IngestionStatistics
8
+
9
+
10
class TestHyperLogLog:
    """Behavioral tests for the HyperLogLog approximate-cardinality sketch."""

    def test_empty_hll(self):
        """A sketch with no additions reports zero cardinality."""
        sketch = HyperLogLog(precision=14)
        assert sketch.count() == 0

    def test_single_element(self):
        """A single insertion yields a count of at least one."""
        sketch = HyperLogLog(precision=14)
        sketch.add("item_001")
        assert sketch.count() >= 1

    def test_duplicate_elements(self):
        """Re-adding the same value repeatedly must not inflate the estimate."""
        sketch = HyperLogLog(precision=14)
        for _ in range(100):
            sketch.add("same_item")
        # Estimate stays ~1 no matter how many duplicates went in.
        assert sketch.count() <= 2

    def test_many_unique_elements(self):
        """An estimate over 10k distinct keys stays within 5% relative error."""
        sketch = HyperLogLog(precision=14)
        total = 10000
        for idx in range(total):
            sketch.add(f"item_{idx:08d}")

        # precision=14 gives ~0.8% standard error; 5% leaves ample slack.
        rel_error = abs(sketch.count() - total) / total
        assert rel_error < 0.05, f"Error {rel_error:.2%} exceeds 5% threshold"

    def test_precision_affects_accuracy(self):
        """Low- and high-precision sketches each stay within their error band."""
        total = 5000

        coarse = HyperLogLog(precision=10)  # fewer registers, less accurate
        fine = HyperLogLog(precision=14)    # more registers, more accurate
        for idx in range(total):
            key = f"item_{idx}"
            coarse.add(key)
            fine.add(key)

        # A tighter bound for higher precision is not guaranteed on any single
        # run, so each sketch is only checked against its own band.
        assert abs(coarse.count() - total) / total < 0.15  # within 15%
        assert abs(fine.count() - total) / total < 0.10    # within 10%

    def test_merge_hlls(self):
        """Merging two disjoint sketches estimates the union cardinality."""
        left = HyperLogLog(precision=14)
        right = HyperLogLog(precision=14)

        # Each sketch receives 1000 keys drawn from disjoint namespaces.
        for idx in range(1000):
            left.add(f"item_a_{idx}")
            right.add(f"item_b_{idx}")

        left.merge(right)

        # The union holds ~2000 distinct keys.
        assert abs(left.count() - 2000) / 2000 < 0.10

    def test_merge_with_overlap(self):
        """Merging sketches with shared keys counts each key only once."""
        left = HyperLogLog(precision=14)
        right = HyperLogLog(precision=14)

        for idx in range(1000):
            left.add(f"item_{idx}")
        # 500 of these keys overlap with `left`; 500 are new.
        for idx in range(500, 1500):
            right.add(f"item_{idx}")

        left.merge(right)

        # The union holds ~1500 distinct keys.
        assert abs(left.count() - 1500) / 1500 < 0.10

    def test_merge_different_precision_fails(self):
        """Sketches with mismatched precision refuse to merge."""
        coarse = HyperLogLog(precision=10)
        fine = HyperLogLog(precision=14)

        with pytest.raises(ValueError, match="different precision"):
            coarse.merge(fine)

    def test_invalid_precision(self):
        """Precision values outside the supported range are rejected."""
        for bad_precision in (3, 17):  # one below, one above the valid window
            with pytest.raises(ValueError):
                HyperLogLog(precision=bad_precision)
120
+
121
+
122
class TestIngestionStatistics:
    """Tests for the IngestionStatistics collector.

    Covers counting (unique granules vs. stored references), spanning/global
    routing overhead, temporal and spatial distributions, quality metrics,
    URL/timing bookkeeping, worker-merge semantics, and datetime parsing.
    """

    @pytest.fixture
    def sample_item(self):
        """Create a sample STAC item (Polygon over the SF Bay area) for testing."""
        return {
            "id": "test_item_001",
            "type": "Feature",
            "geometry": {
                "type": "Polygon",
                "coordinates": [
                    [
                        [-122.5, 37.5],
                        [-122.0, 37.5],
                        [-122.0, 38.0],
                        [-122.5, 38.0],
                        [-122.5, 37.5],
                    ]
                ],
            },
            "properties": {
                "datetime": "2024-06-15T10:30:00Z",
                "collection": "sentinel-2",
            },
        }

    @pytest.fixture
    def stats(self):
        """Create a fresh statistics instance."""
        return IngestionStatistics()

    def test_empty_stats(self, stats):
        """Test empty statistics summary: all counters start at zero."""
        summary = stats.get_summary()

        assert summary["unique_granules"] == 0
        assert summary["stored_references"] == 0
        assert summary["overhead"]["spanning_items"] == 0

    def test_record_single_item(self, stats, sample_item):
        """Test recording a single item lands in every relevant counter."""
        stats.record_item(
            item=sample_item,
            tiles=["abc123"],
            is_spanning=False,
            routed_to_global=False,
            mission="sentinel2",
        )

        summary = stats.get_summary()

        # One item in one tile: granules == references.
        assert summary["unique_granules"] == 1
        assert summary["stored_references"] == 1
        assert summary["missions"]["sentinel2"] == 1
        assert summary["quality"]["geometry_types"]["Polygon"] == 1

    def test_record_spanning_item(self, stats, sample_item):
        """Test recording a spanning item (multiple tiles)."""
        stats.record_item(
            item=sample_item,
            tiles=["tile1", "tile2", "tile3"],
            is_spanning=True,
            routed_to_global=False,
            mission="landsat",
        )

        summary = stats.get_summary()

        assert summary["unique_granules"] == 1
        assert summary["stored_references"] == 3  # Stored in 3 tiles
        assert summary["overhead"]["spanning_items"] == 1
        assert summary["overhead"]["max_tiles_per_item"] == 3
        # duplication_ratio = stored_references / unique_granules
        assert summary["overhead"]["duplication_ratio"] == 3.0

    def test_record_global_routed_item(self, stats, sample_item):
        """Test recording an item routed to the global partition."""
        stats.record_item(
            item=sample_item,
            tiles=["t1", "t2", "t3", "t4", "t5"],  # Would span 5 tiles
            is_spanning=True,
            routed_to_global=True,
            mission="modis",
        )

        summary = stats.get_summary()

        # Routed to global = only 1 stored reference
        assert summary["stored_references"] == 1
        assert summary["global_partition"]["items_routed_to_global"] == 1

    def test_temporal_tracking(self, stats):
        """Test temporal distribution tracking (per-year/per-month buckets)."""
        # Six items spread over months 01 (x2), 02 (x1), 03 (x3) of 2024.
        items = [
            {"id": f"item_{i}", "geometry": None, "properties": {"datetime": f"2024-0{m}-15T00:00:00Z"}}
            for i, m in enumerate([1, 1, 2, 3, 3, 3], 1)
        ]

        for item in items:
            stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()

        assert "2024" in summary["temporal"]["years_with_data"]
        assert summary["temporal"]["distribution"]["2024"]["months"]["01"] == 2
        assert summary["temporal"]["distribution"]["2024"]["months"]["02"] == 1
        assert summary["temporal"]["distribution"]["2024"]["months"]["03"] == 3

    def test_spatial_bbox_tracking(self, stats):
        """Test spatial bounding-box tracking across multiple point geometries."""
        items = [
            {
                "id": "item1",
                "geometry": {"type": "Point", "coordinates": [-100, 30]},
                "properties": {},
            },
            {
                "id": "item2",
                "geometry": {"type": "Point", "coordinates": [-80, 45]},
                "properties": {},
            },
        ]

        for item in items:
            stats.record_item(item, tiles=["t1"], is_spanning=False, mission="test")

        summary = stats.get_summary()
        # bbox is [min_lon, min_lat, max_lon, max_lat].
        bbox = summary["spatial"]["bbox"]

        assert bbox[0] == -100  # min lon
        assert bbox[1] == 30  # min lat
        assert bbox[2] == -80  # max lon
        assert bbox[3] == 45  # max lat

    def test_quality_metrics(self, stats):
        """Test data-quality metric tracking (null geometry, missing datetime)."""
        # Item with null geometry
        stats.record_item(
            {"id": "null_geom", "geometry": None, "properties": {}}, tiles=[], is_spanning=False, mission="test"
        )

        # Item with missing datetime
        stats.record_item(
            {"id": "no_dt", "geometry": {"type": "Point", "coordinates": [0, 0]}, "properties": {}},
            tiles=["t1"],
            is_spanning=False,
            mission="test",
        )

        summary = stats.get_summary()

        assert summary["quality"]["null_geometries"] == 1
        assert summary["quality"]["missing_datetime"] == 2  # Both items missing datetime

    def test_url_processing_stats(self, stats):
        """Test URL processing statistics (totals, failures, success rate)."""
        stats.record_url_processed(success=True)
        stats.record_url_processed(success=True)
        stats.record_url_processed(success=False)

        summary = stats.get_summary()

        assert summary["processing"]["urls_processed"] == 3
        assert summary["processing"]["urls_failed"] == 1
        # success_rate is expressed as a percentage (2/3 -> ~66.67).
        assert summary["processing"]["success_rate"] == pytest.approx(66.67, rel=0.01)

    def test_timing_metrics(self, stats):
        """Test processing timing metrics via start/finish bracketing."""
        stats.start_processing()
        time.sleep(0.1)  # Brief delay
        stats.finish_processing()

        summary = stats.get_summary()

        assert summary["processing"]["duration_seconds"] >= 0.1

    def test_hotspot_detection(self, stats):
        """Test hotspot cell detection on a deliberately uneven distribution."""
        # 100 items into one cell vs. 10 into another.
        for i in range(100):
            stats.record_item(
                {"id": f"item_{i}", "geometry": None, "properties": {}},
                tiles=["hot_cell"],
                is_spanning=False,
                mission="test",
            )
        for i in range(10):
            stats.record_item(
                {"id": f"cold_item_{i}", "geometry": None, "properties": {}},
                tiles=["cold_cell"],
                is_spanning=False,
                mission="test",
            )

        summary = stats.get_summary()
        hotspots = summary["spatial"]["hotspot_cells"]

        # Hot cell should be first (hotspots appear sorted by count, descending).
        assert hotspots[0]["cell"] == "hot_cell"
        assert hotspots[0]["count"] == 100

    def test_merge_statistics(self, stats, sample_item):
        """Test merging statistics from multiple workers."""
        stats2 = IngestionStatistics()

        # Record items in first stats.
        # NOTE: .copy() is shallow, which is safe here — only the top-level
        # "id" key is reassigned per item.
        for i in range(5):
            item = sample_item.copy()
            item["id"] = f"item_a_{i}"
            stats.record_item(item, tiles=["cell_a"], is_spanning=False, mission="mission_a")

        # Record items in second stats
        for i in range(3):
            item = sample_item.copy()
            item["id"] = f"item_b_{i}"
            stats2.record_item(item, tiles=["cell_b"], is_spanning=False, mission="mission_b")

        # Merge
        stats.merge(stats2)

        summary = stats.get_summary()

        # Counters are additive across the merge; per-mission tallies survive.
        assert summary["unique_granules"] == 8
        assert summary["stored_references"] == 8
        assert summary["missions"]["mission_a"] == 5
        assert summary["missions"]["mission_b"] == 3

    def test_consolidation_stats(self, stats):
        """Test consolidation statistics recording accumulates across calls."""
        stats.record_consolidation(new_items=100, existing_items=50, duplicates_removed=10)
        stats.record_consolidation(new_items=200, existing_items=100, duplicates_removed=5)

        summary = stats.get_summary()

        assert summary["processing"]["new_items"] == 300
        assert summary["processing"]["existing_items"] == 150
        assert summary["processing"]["duplicates_removed"] == 15

    def test_overhead_percentage(self, stats, sample_item):
        """Test overhead percentage calculation."""
        # Add 10 unique items, 3 of which span 2 tiles each
        for i in range(7):
            item = sample_item.copy()
            item["id"] = f"single_{i}"
            stats.record_item(item, tiles=[f"t{i}"], is_spanning=False, mission="test")

        for i in range(3):
            item = sample_item.copy()
            item["id"] = f"spanning_{i}"
            stats.record_item(item, tiles=["tA", "tB"], is_spanning=True, mission="test")

        summary = stats.get_summary()

        # 10 unique items, 7 + 6 = 13 stored references
        assert summary["unique_granules"] == 10
        assert summary["stored_references"] == 13
        assert summary["overhead"]["overhead_percentage"] == 30.0  # (13-10)/10 * 100

    def test_tiles_distribution_histogram(self, stats, sample_item):
        """Test the tiles-per-spanning-item histogram."""
        # Add spanning items with different tile counts
        for i in range(5):
            item = sample_item.copy()
            item["id"] = f"span2_{i}"
            stats.record_item(item, tiles=["a", "b"], is_spanning=True, mission="test")

        for i in range(3):
            item = sample_item.copy()
            item["id"] = f"span5_{i}"
            stats.record_item(item, tiles=["a", "b", "c", "d", "e"], is_spanning=True, mission="test")

        summary = stats.get_summary()

        # Check histogram buckets
        # NOTE(review): bucket 5 appears to cover items spanning 3-5 tiles,
        # per the original comment — confirm against the histogram's bucketing.
        assert summary["overhead"]["tiles_distribution"][2] == 5  # 5 items with 2 tiles
        assert summary["overhead"]["tiles_distribution"][5] == 3  # 3 items with 3-5 tiles

    def test_datetime_parsing_z_suffix(self, stats):
        """Test datetime parsing with Z suffix (common in STAC)."""
        item = {
            "id": "z_suffix_item",
            "geometry": None,
            "properties": {"datetime": "2024-06-15T10:30:00Z"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["06"] == 1
        assert summary["quality"]["missing_datetime"] == 0

    def test_datetime_parsing_with_timezone(self, stats):
        """Test datetime parsing with explicit timezone offset."""
        item = {
            "id": "tz_item",
            "geometry": None,
            "properties": {"datetime": "2024-03-20T15:45:30+05:30"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["03"] == 1

    def test_datetime_parsing_no_timezone(self, stats):
        """Test datetime parsing without timezone (naive datetime)."""
        item = {
            "id": "naive_item",
            "geometry": None,
            "properties": {"datetime": "2024-12-25T00:00:00"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["12"] == 1

    def test_datetime_parsing_with_microseconds(self, stats):
        """Test datetime parsing with microseconds."""
        item = {
            "id": "micro_item",
            "geometry": None,
            "properties": {"datetime": "2024-01-01T12:00:00.123456Z"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["temporal"]["distribution"]["2024"]["months"]["01"] == 1

    def test_datetime_parsing_invalid_format(self, stats):
        """Test that invalid datetime formats are counted as missing."""
        item = {
            "id": "invalid_dt",
            "geometry": None,
            "properties": {"datetime": "not-a-date"},
        }
        stats.record_item(item, tiles=[], is_spanning=False, mission="test")

        summary = stats.get_summary()
        assert summary["quality"]["missing_datetime"] == 1
459
+
460
+
461
class TestHyperLogLogLargeScale:
    """Slow, large-cardinality checks for HyperLogLog (opt-in via the `slow` mark)."""

    @pytest.mark.slow
    def test_million_elements(self):
        """One million distinct keys are estimated within 2% at precision=14."""
        sketch = HyperLogLog(precision=14)
        total = 1_000_000

        for idx in range(total):
            sketch.add(f"item_{idx:08d}")

        # precision=14 gives roughly 0.8% standard error, so 2% is comfortable.
        rel_error = abs(sketch.count() - total) / total
        assert rel_error < 0.02, f"Error {rel_error:.2%} exceeds 2% threshold"