earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_async_http.py
@@ -0,0 +1,700 @@
+"""
+Tests for async HTTP client functionality in EarthCatalog.
+
+These tests validate the concurrent HTTP processing capabilities including
+performance improvements, error handling, and integration with the existing
+pipeline architecture.
+"""
+
+# mypy: ignore-errors
+
+import asyncio
+import time
+from unittest.mock import patch
+
+import pytest
+
+# Import async HTTP testing utilities
+try:
+    import aiohttp
+    from aioresponses import aioresponses
+
+    HAS_ASYNC_TEST_SUPPORT = True
+except ImportError:
+    HAS_ASYNC_TEST_SUPPORT = False
+    aioresponses = None  # type: ignore
+    aiohttp = None  # type: ignore
+
+# Import grid systems for validation tests
+from earthcatalog.grid_systems import (
+    H3GridSystem,
+    MGRSGridSystem,
+    S2GridSystem,
+    SimpleLatLonGrid,
+)
+
+# Import EarthCatalog modules
+from earthcatalog.ingestion_pipeline import ProcessingConfig, STACIngestionPipeline
+
+# Grid systems are now managed by the pipeline internally
+
+try:
+    from earthcatalog.async_http_client import (
+        HAS_ASYNC_HTTP,
+        AsyncHTTPClient,
+        BatchDownloader,
+        ErrorType,
+        RequestResult,
+        download_stac_items_async,
+    )
+except ImportError:
+    HAS_ASYNC_HTTP = False
+    # Type stubs for mypy
+    AsyncHTTPClient = None  # type: ignore
+    BatchDownloader = None  # type: ignore
+    RequestResult = None  # type: ignore
+    ErrorType = None  # type: ignore
+    download_stac_items_async = None  # type: ignore
+
+
+@pytest.mark.skipif(not HAS_ASYNC_HTTP, reason="Async HTTP client not available")
+class TestAsyncHTTPClient:
+    """Test suite for AsyncHTTPClient functionality."""
+
+    def setup_method(self):
+        """Set up test fixtures."""
+        self.test_config = {
+            "concurrent_requests": 10,
+            "connection_pool_size": 20,
+            "request_timeout": 5,
+            "retry_attempts": 2,
+            "retry_delay": 0.1,
+        }
+
+        # Sample STAC item for responses
+        self.sample_stac_item = {
+            "id": "test_item",
+            "type": "Feature",
+            "geometry": {"type": "Point", "coordinates": [-122.4, 37.8]},
+            "properties": {"datetime": "2024-01-01T00:00:00Z", "collection": "test_collection"},
+        }
+
+    @pytest.mark.asyncio
+    async def test_async_http_client_initialization(self):
+        """Test AsyncHTTPClient can be initialized with proper configuration."""
+        async with AsyncHTTPClient(**self.test_config) as client:
+            assert client.concurrent_requests == 10
+            assert client.connection_pool_size == 20
+            assert client.request_timeout == 5
+            assert client.retry_attempts == 2
+            assert client.retry_delay == 0.1
+            assert client.session is not None
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_single_successful_request(self):
+        """Test successful download of a single STAC item."""
+        url = "https://example.com/item.json"
+
+        with aioresponses() as mock_responses:
+            mock_responses.get(url, payload=self.sample_stac_item)
+
+            async with AsyncHTTPClient(**self.test_config) as client:
+                result = await client._fetch_single_url(url)
+
+                assert result.success is True
+                assert result.data == self.sample_stac_item
+                assert result.error is None
+                assert result.attempts == 1
+                assert result.response_time > 0
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_batch_download(self):
+        """Test concurrent download of multiple STAC items."""
+        urls = [f"https://example.com/item_{i}.json" for i in range(100)]
+
+        with aioresponses() as mock_responses:
+            # Mock all URLs with successful responses
+            for i, url in enumerate(urls):
+                item = self.sample_stac_item.copy()
+                item["id"] = f"item_{i}"
+                mock_responses.get(url, payload=item)
+
+            async with AsyncHTTPClient(**self.test_config) as client:
+                start_time = time.time()
+                results = await client.download_batch(urls)
+                duration = time.time() - start_time
+
+                # Validate results
+                assert len(results) == 100
+                successful = [r for r in results if r.success]
+                assert len(successful) == 100
+
+                # Should complete much faster than sequential (under 2 seconds)
+                assert duration < 2.0
+
+                # Validate data integrity
+                for i, result in enumerate(successful):
+                    assert result.data is not None
+                    assert result.data["id"] == f"item_{i}"
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_error_handling_and_retries(self):
+        """Test error handling with different HTTP error conditions using mocked responses."""
+        # Use mocked responses for fast, reliable testing
+        urls = [
+            "https://example.com/server_error.json",  # Will mock 500 error
+            "https://example.com/not_found.json",  # Will mock 404 error
+            "https://example.com/rate_limit.json",  # Will mock 429 error
+            "https://example.com/success.json",  # Will mock success
+            "https://example.com/timeout.json",  # Will mock timeout
+        ]
+
+        # Fast test configuration
+        test_config = {
+            "concurrent_requests": 5,
+            "connection_pool_size": 10,
+            "request_timeout": 1,  # Short timeout for fast tests
+            "retry_attempts": 2,  # Limited retries for speed
+            "retry_delay": 0.01,  # Minimal delay for speed
+        }
+
+        with aioresponses() as mock_responses:
+            # Mock different error conditions
+            mock_responses.get(urls[0], status=500)  # Server error
+            mock_responses.get(urls[1], status=404)  # Not found
+            mock_responses.get(urls[2], status=429)  # Rate limit
+            mock_responses.get(urls[3], payload={"id": "success", "type": "Feature"})  # Success
+            # urls[4] not mocked - will timeout quickly
+
+            async with AsyncHTTPClient(**test_config) as client:
+                results = await client.download_batch(urls)
+
+                # Validate we got results for all URLs
+                assert len(results) == len(urls)
+
+                # Analyze results
+                success_count = sum(1 for r in results if r.success)
+                failure_count = sum(1 for r in results if not r.success)
+
+                # Should have exactly one success (the mocked success)
+                assert success_count >= 1, "Should have at least one successful request"
+                assert failure_count >= 1, "Should have failed requests"
+
+                # Check that different error types are detected
+                error_types = {r.error_type for r in results if not r.success and r.error_type}
+                assert len(error_types) >= 1, "Should detect different error types"
+
+                # Validate retry behavior - failed requests should have retry attempts
+                retry_counts = [r.attempts for r in results if not r.success]
+                if retry_counts:  # Only check if there are failed requests
+                    max_retries = max(retry_counts)
+                    assert max_retries >= 1, "Should have retry attempts for failed requests"
+
+
+@pytest.mark.skipif(not HAS_ASYNC_HTTP, reason="Async HTTP client not available")
+class TestBatchDownloader:
+    """Test suite for BatchDownloader functionality."""
+
+    def setup_method(self):
+        """Set up test fixtures."""
+        self.downloader = BatchDownloader(batch_size=50, concurrent_requests=10, request_timeout=5, retry_attempts=2)
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_batch_processing_with_memory_management(self):
+        """Test batch processing handles memory efficiently."""
+        # Create 1000 URLs to test batch processing
+        urls = [f"https://example.com/item_{i}.json" for i in range(1000)]
+
+        with aioresponses() as mock_responses:
+            # Mock all URLs with successful responses
+            for i, url in enumerate(urls):
+                item = {
+                    "id": f"item_{i}",
+                    "type": "Feature",
+                    "geometry": {"type": "Point", "coordinates": [0, 0]},
+                    "properties": {"datetime": "2024-01-01T00:00:00Z"},
+                }
+                mock_responses.get(url, payload=item)
+
+            # Download all items
+            items = await self.downloader.download_all(urls)
+
+            # Validate results
+            assert len(items) == 1000
+            for i, item in enumerate(items):
+                assert item["id"] == f"item_{i}"
+
+
+class TestAsyncHTTPIntegration:
+    """Integration tests for async HTTP with EarthCatalog pipeline."""
+
+    def setup_method(self):
+        """Set up test fixtures."""
+        import tempfile
+
+        import pandas as pd
+
+        # Create temporary input file
+        self.temp_input_file = tempfile.NamedTemporaryFile(suffix=".parquet", delete=False)
+        df = pd.DataFrame({"url": ["https://example.com/test.json"]})
+        df.to_parquet(self.temp_input_file.name)
+
+        self.config = ProcessingConfig(
+            input_file=self.temp_input_file.name,
+            output_catalog="/tmp/test_catalog",
+            scratch_location="/tmp/test_scratch",
+            grid_system="h3",
+            grid_resolution=2,
+            temporal_bin="month",
+            # Enable async HTTP
+            enable_concurrent_http=True,
+            concurrent_requests=5,
+            batch_size=10,
+        )
+
+        # Create a local processor for testing
+        from earthcatalog.ingestion_pipeline import LocalProcessor
+
+        self.processor = LocalProcessor()
+        self.pipeline = STACIngestionPipeline(self.config, self.processor)
+
+    def teardown_method(self):
+        """Clean up test fixtures."""
+        import os
+
+        if hasattr(self, "temp_input_file") and os.path.exists(self.temp_input_file.name):
+            os.unlink(self.temp_input_file.name)
+        if hasattr(self, "processor"):
+            self.processor.close()
+
+    def test_configuration_validation(self):
+        """Test async HTTP configuration validation."""
+        # Valid configuration should pass
+        self.config.validate()
+
+        # Invalid concurrent_requests should fail
+        invalid_config = ProcessingConfig(
+            input_file=self.temp_input_file.name,  # Use existing temp file
+            output_catalog="/tmp/test_catalog",
+            scratch_location="/tmp/test_scratch",
+            concurrent_requests=0,
+        )
+
+        with pytest.raises(ValueError, match="concurrent_requests must be positive"):
+            invalid_config.validate()
+
+    def test_async_fallback_behavior(self):
+        """Test graceful fallback when async HTTP is not available."""
+        # Create config with async enabled
+        config = self.config
+        config.enable_concurrent_http = True
+
+        # Mock HAS_ASYNC_HTTP to False to test fallback
+        with patch("earthcatalog.ingestion_pipeline.HAS_ASYNC_HTTP", False):
+            # Should still create pipeline without errors
+            pipeline = STACIngestionPipeline(config, self.processor)
+
+            # Should be able to process small batch (will use sync processing)
+            urls = ["https://example.com/item1.json", "https://example.com/item2.json"]
+
+            with patch.object(pipeline, "_download_stac_item") as mock_download:
+                mock_download.return_value = {
+                    "id": "test_item",
+                    "type": "Feature",
+                    "geometry": {"type": "Point", "coordinates": [0, 0]},
+                    "properties": {"datetime": "2024-01-01T00:00:00Z"},
+                }
+
+                # This should use sync processing
+                items = pipeline._download_stac_items_batch_async(urls, 1)
+                assert len(items) == 2
+                assert mock_download.call_count == 2
+
+    @pytest.mark.skipif(not HAS_ASYNC_HTTP, reason="Async HTTP client not available")
+    def test_async_processing_configuration(self):
+        """Test that pipeline correctly configures async vs sync processing."""
+        # Test that async is enabled in the configuration
+        assert self.pipeline.config.enable_concurrent_http is True
+        assert self.pipeline.config.concurrent_requests == 5  # As configured in setup_method
+        assert self.pipeline.config.batch_size == 10  # As configured in setup_method
+
+        # Test that HAS_ASYNC_HTTP is properly detected
+        from earthcatalog.async_http_client import HAS_ASYNC_HTTP
+
+        if HAS_ASYNC_HTTP:
+            # Test async configuration is valid
+            assert self.pipeline.config.concurrent_requests > 0
+            assert self.pipeline.config.request_timeout > 0
+            assert self.pipeline.config.retry_attempts > 0
+        else:
+            # If async not available, should still work
+            assert self.pipeline.config.enable_concurrent_http is True  # Config can be True even if not available
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_performance_improvement_validation(self):
+        """Test that async processing provides performance improvement."""
+        # Create test URLs
+        test_urls = [f"https://example.com/item_{i}.json" for i in range(100)]
+
+        sample_item = {
+            "id": "test_item",
+            "type": "Feature",
+            "geometry": {"type": "Point", "coordinates": [0, 0]},
+            "properties": {"datetime": "2024-01-01T00:00:00Z"},
+        }
+
+        with aioresponses() as mock_responses:
+            # Mock all URLs with small delay to simulate network
+            for url in test_urls:
+                mock_responses.get(url, payload=sample_item)
+
+            # Test async performance
+            start_time = time.time()
+            async_items = await download_stac_items_async(urls=test_urls, concurrent_requests=20, batch_size=50)
+            async_duration = time.time() - start_time
+
+            # Validate async results
+            assert len(async_items) == 100
+            assert async_duration < 5.0  # Should complete quickly
+
+    def test_error_logging_compatibility(self):
+        """Test that async errors are logged in the same format as sync errors."""
+        with patch("earthcatalog.ingestion_pipeline.logger"):
+            # Create a temporary config with fast timeout for failed requests
+            import tempfile

+            import pandas as pd
+
+            # Create temp file for fast test
+            with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
+                df = pd.DataFrame({"url": ["https://invalid-url-fast-fail.example.com/item.json"]})
+                df.to_parquet(tmp_file.name)
+
+                # Create config with very short timeouts to fail fast
+                fast_config = ProcessingConfig(
+                    input_file=tmp_file.name,
+                    output_catalog="/tmp/test_catalog",
+                    scratch_location="/tmp/test_scratch",
+                    enable_concurrent_http=True,
+                    concurrent_requests=1,
+                    request_timeout=1,  # Very short timeout for fast failure
+                    retry_attempts=1,  # Minimal retries for speed
+                    retry_delay=0.01,  # Minimal delay
+                )
+
+                from earthcatalog.ingestion_pipeline import LocalProcessor, STACIngestionPipeline
+
+                processor = LocalProcessor()
+                pipeline = STACIngestionPipeline(fast_config, processor)
+
+                # Test with URLs that will fail quickly
+                failed_urls = ["https://invalid-url-fast-fail.example.com/item.json"]
+
+                # This should fail quickly and generate logs
+                items = pipeline._download_stac_items_batch_async(failed_urls, 1)
+
+                # Verify error handling occurred (items should be empty or contain errors)
+                assert isinstance(items, list)  # Should return a list even on failure
+
+                # Clean up
+                import os
+
+                os.unlink(tmp_file.name)
+                processor.close()
+
+
+@pytest.mark.skipif(not HAS_ASYNC_HTTP, reason="Async HTTP client not available")
+class TestAsyncHTTPClientEdgeCases:
+    """Test edge cases and error conditions for async HTTP client."""
+
+    @pytest.mark.asyncio
+    async def test_client_without_context_manager(self):
+        """Test that client properly fails when used without context manager."""
+        client = AsyncHTTPClient()
+
+        with pytest.raises(RuntimeError, match="not initialized"):
+            await client._fetch_single_url("https://example.com/test.json")
+
+    @pytest.mark.asyncio
+    async def test_empty_url_list(self):
+        """Test handling of empty URL list."""
+        async with AsyncHTTPClient() as client:
+            results = await client.download_batch([])
+            assert results == []
+
+    @pytest.mark.asyncio
+    async def test_malformed_urls(self):
+        """Test handling of malformed URLs."""
+        malformed_urls = ["not-a-url", "ftp://invalid-protocol.com", ""]
+
+        async with AsyncHTTPClient(retry_attempts=1) as client:
+            results = await client.download_batch(malformed_urls)
+
+            # All should fail with connection errors
+            assert len(results) == 3
+            for result in results:
+                assert result.success is False
+                assert result.error_type in [ErrorType.CONNECTION, ErrorType.PARSE_ERROR]
+
+
+def test_import_compatibility():
+    """Test that module imports work correctly even without async dependencies."""
+    # This test should always pass, testing import structure
+    from earthcatalog.ingestion_pipeline import ProcessingConfig
+
+    config = ProcessingConfig(
+        input_file="/tmp/test.parquet", output_catalog="/tmp/test_catalog", scratch_location="/tmp/test_scratch"
+    )
+
+    # Should be able to create config even if async HTTP not available
+    assert config.enable_concurrent_http is True  # Default should be True
+    assert config.concurrent_requests == 50  # Default value
+    assert config.batch_size == 1000  # Default value
+
+
+def test_backward_compatibility():
+    """Test that all existing functionality works unchanged."""
+    import os
+    import tempfile
+
+    import pandas as pd
+
+    # Create a temporary parquet file
+    with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp_file:
+        df = pd.DataFrame({"url": ["https://example.com/test.json"]})
+        df.to_parquet(tmp_file.name)
+
+        config = ProcessingConfig(
+            input_file=tmp_file.name,
+            output_catalog="/tmp/test_catalog",
+            scratch_location="/tmp/test_scratch",
+            enable_concurrent_http=False,  # Explicitly disable async
+        )
+
+        from earthcatalog.ingestion_pipeline import LocalProcessor
+
+        processor = LocalProcessor()
+        pipeline = STACIngestionPipeline(config, processor)
+
+        # Should be able to create pipeline and call existing methods
+        assert hasattr(pipeline, "_download_stac_item")
+        assert hasattr(pipeline, "_compute_partition_key")
+
+        # Configuration validation should work
+        config.validate()
+
+        # Clean up
+        os.unlink(tmp_file.name)
+
+
+@pytest.mark.skipif(not HAS_ASYNC_HTTP, reason="Async HTTP client not available")
+class TestSessionLockAndCleanup:
+    """Test session lock and cleanup improvements (c1, c2)."""
+
+    @pytest.mark.asyncio
+    async def test_session_lock_prevents_race_condition(self):
+        """Test that session lock prevents race conditions during concurrent initialization."""
+        client = AsyncHTTPClient(concurrent_requests=10)
+
+        # Create multiple concurrent tasks that all try to enter the context
+        async def enter_context():
+            async with client:
+                # Session should be properly initialized
+                assert client.session is not None
+                return True
+
+        # Run multiple concurrent context manager entries
+        # This would cause issues without the session lock
+        tasks = [enter_context() for _ in range(5)]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # All tasks should complete successfully
+        for result in results:
+            assert result is True or isinstance(result, Exception)
+
+    @pytest.mark.asyncio
+    async def test_session_cleanup_on_normal_exit(self):
+        """Test session is properly cleaned up on normal exit."""
+        client = AsyncHTTPClient()
+
+        async with client:
+            assert client.session is not None
+
+        # After exit, session should be None
+        assert client.session is None
+
+    @pytest.mark.asyncio
+    async def test_session_cleanup_on_exception(self):
+        """Test session is properly cleaned up even when exception occurs."""
+        client = AsyncHTTPClient()
+
+        class TestException(Exception):
+            pass
+
+        try:
+            async with client:
+                assert client.session is not None
+                raise TestException("Test exception")
+        except TestException:
+            pass
+
+        # Session should still be cleaned up
+        assert client.session is None
+
+    @pytest.mark.asyncio
+    async def test_concurrent_session_access_with_lock(self):
+        """Test that multiple coroutines can safely share a session."""
+
+        async with AsyncHTTPClient(concurrent_requests=5) as client:
+            # Verify session is created
+            assert client.session is not None
+
+            # Multiple coroutines accessing session should work safely
+            async def check_session():
+                return client.session is not None
+
+            results = await asyncio.gather(*[check_session() for _ in range(10)])
+            assert all(results)
+
+
+@pytest.mark.skipif(not HAS_ASYNC_HTTP, reason="Async HTTP client not available")
+class TestDownloadResultAndFailureTracking:
+    """Test DownloadResult dataclass and failure tracking (c3)."""
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_download_result_structure(self):
+        """Test DownloadResult dataclass has correct structure."""
+        from earthcatalog.async_http_client import DownloadResult
+
+        # Create a mock result
+        result = DownloadResult(
+            items=[{"id": "item1", "type": "Feature"}],
+            failed_urls=[{"url": "http://fail.com", "error": "404", "error_type": "http_error"}],
+            metrics={"total_requests": 2, "success_rate_percent": 50.0},
+        )
+
+        assert len(result.items) == 1
+        assert len(result.failed_urls) == 1
+        assert result.metrics["success_rate_percent"] == 50.0
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_download_with_failures_tracking(self):
+        """Test that download_all_with_failures properly tracks failures."""
+        from earthcatalog.async_http_client import BatchDownloader
+
+        urls = [
+            "https://example.com/success.json",
+            "https://example.com/fail_404.json",
+            "https://example.com/success2.json",
+        ]
+
+        with aioresponses() as mock_responses:
+            # Mock responses
+            mock_responses.get(urls[0], payload={"id": "item1", "type": "Feature"})
+            mock_responses.get(urls[1], status=404)
+            mock_responses.get(urls[2], payload={"id": "item2", "type": "Feature"})
+
+            downloader = BatchDownloader(batch_size=10, concurrent_requests=5, request_timeout=5, retry_attempts=1)
+
+            result = await downloader.download_all_with_failures(urls)
+
+            # Should have 2 successful items
+            assert len(result.items) == 2
+            assert result.items[0]["id"] == "item1"
+            assert result.items[1]["id"] == "item2"
+
+            # Should have 1 failed URL with details
+            assert len(result.failed_urls) == 1
+            assert result.failed_urls[0]["url"] == urls[1]
+            assert "error" in result.failed_urls[0]
+            assert "error_type" in result.failed_urls[0]
+
+            # Metrics should be populated
+            assert "total_requests" in result.metrics
+            assert "success_rate_percent" in result.metrics
+
+    @pytest.mark.skipif(not HAS_ASYNC_TEST_SUPPORT, reason="aioresponses not available")
+    @pytest.mark.asyncio
+    async def test_download_stac_items_async_with_failures_function(self):
+        """Test the convenience function download_stac_items_async_with_failures."""
+        from earthcatalog.async_http_client import download_stac_items_async_with_failures
+
+        urls = ["https://example.com/item1.json", "https://example.com/item2.json"]
+
+        with aioresponses() as mock_responses:
+            mock_responses.get(urls[0], payload={"id": "item1", "type": "Feature"})
+            mock_responses.get(urls[1], status=500)  # Server error
+
+            result = await download_stac_items_async_with_failures(
+                urls=urls, concurrent_requests=5, batch_size=10, retry_attempts=1
+            )
+
+            assert isinstance(result.items, list)
+            assert isinstance(result.failed_urls, list)
+            assert isinstance(result.metrics, dict)
+
+
+class TestGridParameterValidation:
+    """Test grid parameter validation (h1)."""
+
+    def test_h3_resolution_validation(self):
+        """Test H3 resolution validation (0-15)."""
+        # Valid resolutions
+        H3GridSystem(resolution=0)
+        H3GridSystem(resolution=15)
+        H3GridSystem(resolution=6)
+
+        # Invalid resolutions
+        with pytest.raises(ValueError, match="H3 resolution must be 0-15"):
+            H3GridSystem(resolution=-1)
+        with pytest.raises(ValueError, match="H3 resolution must be 0-15"):
+            H3GridSystem(resolution=16)
+
+    def test_s2_resolution_validation(self):
+        """Test S2 resolution validation (0-30)."""
+        # Valid resolutions
+        S2GridSystem(resolution=0)
+        S2GridSystem(resolution=30)
+        S2GridSystem(resolution=13)
+
+        # Invalid resolutions
+        with pytest.raises(ValueError, match="S2 resolution must be 0-30"):
+            S2GridSystem(resolution=-1)
+        with pytest.raises(ValueError, match="S2 resolution must be 0-30"):
+            S2GridSystem(resolution=31)
+
+    def test_mgrs_resolution_validation(self):
+        """Test MGRS resolution validation (1-5)."""
+        # Valid resolutions
+        MGRSGridSystem(resolution=1)
+        MGRSGridSystem(resolution=5)
+        MGRSGridSystem(resolution=3)
+
+        # Invalid resolutions
+        with pytest.raises(ValueError, match="MGRS resolution must be 1-5"):
+            MGRSGridSystem(resolution=0)
+        with pytest.raises(ValueError, match="MGRS resolution must be 1-5"):
+            MGRSGridSystem(resolution=6)
+
+    def test_latlon_resolution_validation(self):
+        """Test LatLon resolution validation (must be positive)."""
+        # Valid resolutions
+        SimpleLatLonGrid(resolution=1)
+        SimpleLatLonGrid(resolution=10)
+
+        # Invalid resolutions
+        with pytest.raises(ValueError, match="LatLon resolution must be positive"):
+            SimpleLatLonGrid(resolution=0)
+        with pytest.raises(ValueError, match="LatLon resolution must be positive"):
+            SimpleLatLonGrid(resolution=-1)
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])