earthcatalog-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/tests/test_e2e_synthetic.py
@@ -0,0 +1,1624 @@
"""End-to-end tests using synthetic STAC items.

This module provides comprehensive end-to-end testing of the EarthCatalog ingestion
pipeline using synthetically generated STAC items. These tests are designed to verify
the complete workflow from URL ingestion through consolidation with realistic,
parameterizable data.

These tests are NOT run as part of the regular test suite. They are intended to be
run on-demand for integration testing and performance benchmarking.

Run with:
    pytest earthcatalog/tests/test_e2e_synthetic.py -v -m e2e

Or for specific test configurations:
    pytest earthcatalog/tests/test_e2e_synthetic.py -v -k "test_e2e_pipeline" --e2e-items=1000
"""

import json
import random
import shutil
import tempfile
from dataclasses import dataclass
from datetime import datetime, timedelta
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
from threading import Thread
from typing import Any

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest
from jinja2 import Environment, FileSystemLoader

from earthcatalog.ingestion_pipeline import (
    LocalProcessor,
    ProcessingConfig,
    STACIngestionPipeline,
)

# Mark entire module for e2e testing - skipped by default
pytestmark = pytest.mark.e2e


@dataclass
class GeometrySpec:
    """Specification for geometry size/shape."""

    width_deg: float  # Width in degrees longitude
    height_deg: float  # Height in degrees latitude
    name: str  # Descriptive name


# Standard Landsat scene size (approximately 185km x 180km, ~1.7deg x 1.6deg at equator)
LANDSAT_SCENE = GeometrySpec(width_deg=1.7, height_deg=1.6, name="landsat")

# Sentinel-2 tile (100km x 100km, ~0.9deg x 0.9deg at equator)
SENTINEL2_TILE = GeometrySpec(width_deg=0.9, height_deg=0.9, name="sentinel2")

# Very small geometry (outlier - too small, like a single point observation)
TINY_GEOMETRY = GeometrySpec(width_deg=0.001, height_deg=0.001, name="tiny")

# Very large geometry (outlier - spans many grid cells, like MODIS swath)
HUGE_GEOMETRY = GeometrySpec(width_deg=20.0, height_deg=15.0, name="huge")

# Continental scale (extreme outlier - should go to global partition)
CONTINENTAL = GeometrySpec(width_deg=50.0, height_deg=30.0, name="continental")


class SyntheticSTACGenerator:
    """Generator for synthetic STAC items using Jinja templates.

    This class creates realistic STAC items with configurable parameters for
    testing the ingestion pipeline with various geometry sizes, temporal
    distributions, and outlier conditions.

    Attributes:
        template_dir: Directory containing Jinja templates
        output_dir: Directory for generated STAC JSON files
        base_geometry: Default geometry specification for items
        outlier_percent: Percentage of items with outlier geometries
        seed: Random seed for reproducibility
    """

    def __init__(
        self,
        output_dir: Path,
        base_geometry: GeometrySpec = LANDSAT_SCENE,
        outlier_tiny_percent: float = 5.0,
        outlier_huge_percent: float = 5.0,
        seed: int | None = None,
    ):
        """Initialize the synthetic STAC generator.

        Args:
            output_dir: Directory to write generated STAC JSON files
            base_geometry: Default geometry size specification
            outlier_tiny_percent: Percentage of items with tiny geometries (0-100)
            outlier_huge_percent: Percentage of items with huge geometries (0-100)
            seed: Random seed for reproducibility
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.base_geometry = base_geometry
        self.outlier_tiny_percent = outlier_tiny_percent
        self.outlier_huge_percent = outlier_huge_percent

        # Set random seed for reproducibility
        if seed is not None:
            random.seed(seed)

        # Initialize Jinja environment
        template_dir = Path(__file__).parent / "fixtures"
        self.env = Environment(loader=FileSystemLoader(template_dir))
        self.template = self.env.get_template("stac_item_template.jinja2")

        # Track generated items
        self.generated_items: list[dict[str, Any]] = []
        self.generated_files: list[Path] = []

    def _select_geometry_spec(self) -> GeometrySpec:
        """Select geometry specification based on outlier percentages."""
        roll = random.uniform(0, 100)

        if roll < self.outlier_tiny_percent:
            return TINY_GEOMETRY
        elif roll < self.outlier_tiny_percent + self.outlier_huge_percent:
            # Randomly choose between huge and continental for large outliers
            return random.choice([HUGE_GEOMETRY, CONTINENTAL])
        else:
            return self.base_geometry

    def _generate_random_bbox(self, geom_spec: GeometrySpec) -> tuple[float, float, float, float]:
        """Generate a random bounding box within valid global bounds.

        Returns:
            Tuple of (min_lon, min_lat, max_lon, max_lat)
        """
        # Ensure geometry fits within global bounds
        max_start_lon = 180.0 - geom_spec.width_deg
        max_start_lat = 90.0 - geom_spec.height_deg

        min_lon = random.uniform(-180.0, max_start_lon)
        min_lat = random.uniform(-90.0, max_start_lat)

        max_lon = min_lon + geom_spec.width_deg
        max_lat = min_lat + geom_spec.height_deg

        return (min_lon, min_lat, max_lon, max_lat)

    def _generate_random_datetime(
        self,
        start_date: datetime = datetime(2020, 1, 1),
        end_date: datetime = datetime(2024, 12, 31),
    ) -> datetime:
        """Generate a random datetime within the specified range."""
        delta = end_date - start_date
        random_days = random.randint(0, delta.days)
        random_seconds = random.randint(0, 86400)
        return start_date + timedelta(days=random_days, seconds=random_seconds)

    def generate_item(
        self,
        item_index: int,
        dataset_id: str = "landsat_c2l2",
        collection: str = "landsat-c2-l2",
    ) -> dict[str, Any]:
        """Generate a single synthetic STAC item.

        Args:
            item_index: Index of this item (for unique ID generation)
            dataset_id: Dataset identifier for partitioning
            collection: STAC collection name

        Returns:
            Dictionary containing the generated STAC item
        """
        # Select geometry specification (may be outlier)
        geom_spec = self._select_geometry_spec()

        # Generate random bbox
        min_lon, min_lat, max_lon, max_lat = self._generate_random_bbox(geom_spec)

        # Generate random datetime
        dt = self._generate_random_datetime()
        dt_str = dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        created_str = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

        # Generate unique identifiers
        item_id = f"LC08_L2SP_{random.randint(1, 999):03d}{random.randint(1, 999):03d}_{dt.strftime('%Y%m%d')}_{item_index:06d}"
        scene_id = f"LC08_{random.randint(1, 999):03d}{random.randint(1, 999):03d}_{dt.strftime('%Y%m%d')}_02_T1"

        # WRS path/row (Landsat grid)
        wrs_path = random.randint(1, 233)
        wrs_row = random.randint(1, 248)

        # Base URL for assets
        base_url = f"https://example.com/data/{collection}/{item_id}"

        # Template context
        context = {
            # Identification
            "item_id": item_id,
            "scene_id": scene_id,
            # Geometry
            "min_lon": round(min_lon, 6),
            "min_lat": round(min_lat, 6),
            "max_lon": round(max_lon, 6),
            "max_lat": round(max_lat, 6),
            # Temporal
            "datetime": dt_str,
            "created": created_str,
            "updated": created_str,
            # Platform info
            "platform": "landsat-8",
            "instruments": ["oli", "tirs"],
            "constellation": "landsat",
            "mission": "landsat-8",
            "gsd": 30,
            # Collection info
            "collection": collection,
            "dataset_id": dataset_id,
            # Scene properties
            "cloud_cover": round(random.uniform(0, 100), 2),
            "off_nadir": round(random.uniform(0, 15), 2),
            "sun_azimuth": round(random.uniform(0, 360), 2),
            "sun_elevation": round(random.uniform(10, 80), 2),
            # Projection
            "epsg": 32600 + random.randint(1, 60),  # UTM zones
            "proj_shape": [7611, 7531],  # Standard Landsat dimensions
            "proj_transform": [30.0, 0.0, min_lon * 111000, 0.0, -30.0, max_lat * 111000],
            # Landsat-specific
            "wrs_path": wrs_path,
            "wrs_row": wrs_row,
            "collection_category": "T1",
            "collection_number": "02",
            # Links
            "self_href": f"{base_url}/{item_id}.json",
            "parent_href": f"https://example.com/collections/{collection}",
            "collection_href": f"https://example.com/collections/{collection}",
            "root_href": "https://example.com",
            # Asset URLs
            "thumbnail_href": f"{base_url}/{item_id}_thumb_small.jpeg",
            "browse_href": f"{base_url}/{item_id}_thumb_large.jpeg",
            "mtl_href": f"{base_url}/{item_id}_MTL.json",
            "mtl_txt_href": f"{base_url}/{item_id}_MTL.txt",
            "mtl_xml_href": f"{base_url}/{item_id}_MTL.xml",
            "ang_href": f"{base_url}/{item_id}_ANG.txt",
            "qa_pixel_href": f"{base_url}/{item_id}_QA_PIXEL.TIF",
            "qa_radsat_href": f"{base_url}/{item_id}_QA_RADSAT.TIF",
            "coastal_href": f"{base_url}/{item_id}_SR_B1.TIF",
            "blue_href": f"{base_url}/{item_id}_SR_B2.TIF",
            "green_href": f"{base_url}/{item_id}_SR_B3.TIF",
            "red_href": f"{base_url}/{item_id}_SR_B4.TIF",
            "nir08_href": f"{base_url}/{item_id}_SR_B5.TIF",
            "swir16_href": f"{base_url}/{item_id}_SR_B6.TIF",
            "swir22_href": f"{base_url}/{item_id}_SR_B7.TIF",
            "lwir11_href": f"{base_url}/{item_id}_ST_B10.TIF",
        }

        # Render template
        rendered = self.template.render(**context)
        item = json.loads(rendered)

        # Store metadata about geometry type for analysis
        item["_synthetic_metadata"] = {
            "geometry_type": geom_spec.name,
            "index": item_index,
        }

        return item

    def generate_items(
        self,
        n_items: int,
        dataset_ids: list[str] | None = None,
        shuffle: bool = True,
    ) -> list[dict[str, Any]]:
        """Generate multiple synthetic STAC items.

        Args:
            n_items: Number of items to generate
            dataset_ids: List of dataset IDs to distribute items across
            shuffle: Whether to shuffle the items (simulates unordered real-world data)

        Returns:
            List of generated STAC items
        """
        if dataset_ids is None:
            dataset_ids = ["landsat_c2l2", "sentinel2_l2a", "modis_mcd43a4"]

        items = []
        for i in range(n_items):
            dataset_id = random.choice(dataset_ids)
            collection = dataset_id.replace("_", "-")
            item = self.generate_item(i, dataset_id=dataset_id, collection=collection)
            items.append(item)

        if shuffle:
            random.shuffle(items)

        self.generated_items = items
        return items

    def write_items_to_files(self, items: list[dict[str, Any]] | None = None) -> list[Path]:
        """Write STAC items to individual JSON files.

        Args:
            items: List of items to write. Uses self.generated_items if None.

        Returns:
            List of paths to generated files
        """
        if items is None:
            items = self.generated_items

        files = []
        for item in items:
            # Remove synthetic metadata before writing
            item_clean = {k: v for k, v in item.items() if not k.startswith("_")}
            item_id = item_clean["id"]
            file_path = self.output_dir / f"{item_id}.json"

            with open(file_path, "w") as f:
                json.dump(item_clean, f, indent=2)

            files.append(file_path)

        self.generated_files = files
        return files

    def create_url_parquet(
        self,
        base_url: str,
        output_path: Path,
        shuffle: bool = True,
    ) -> Path:
        """Create a Parquet file with URLs pointing to generated STAC items.

        Args:
            base_url: Base URL where items will be served (e.g., http://localhost:8000)
            output_path: Path for the output Parquet file
            shuffle: Whether to shuffle URLs (simulates unordered real-world data)

        Returns:
            Path to the created Parquet file
        """
        urls = [f"{base_url}/{f.name}" for f in self.generated_files]

        if shuffle:
            random.shuffle(urls)

        # Create DataFrame and write to Parquet
        df = pd.DataFrame({"url": urls})
        table = pa.Table.from_pandas(df)
        pq.write_table(table, output_path)

        return output_path

    def get_geometry_distribution(self) -> dict[str, int]:
        """Get distribution of geometry types in generated items."""
        distribution: dict[str, int] = {}
        for item in self.generated_items:
            geom_type = item.get("_synthetic_metadata", {}).get("geometry_type", "unknown")
            distribution[geom_type] = distribution.get(geom_type, 0) + 1
        return distribution

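def _example_generator_usage(tmp_path: Path) -> None:
    # Illustrative sketch only, added for this write-up and not part of the
    # released package: the expected call sequence for SyntheticSTACGenerator,
    # assuming the fixtures/stac_item_template.jinja2 template referenced in
    # __init__ exists next to this module.
    gen = SyntheticSTACGenerator(output_dir=tmp_path / "items", seed=7)
    items = gen.generate_items(n_items=10)  # ~90% Landsat-sized footprints by default
    gen.write_items_to_files(items)  # writes one <item_id>.json per item
    print(gen.get_geometry_distribution())  # e.g. {"landsat": 9, "tiny": 1}
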
class SimpleSTACServer:
    """Simple HTTP server for serving STAC items during tests."""

    def __init__(self, directory: Path, port: int = 0):
        """Initialize the server.

        Args:
            directory: Directory containing STAC JSON files to serve
            port: Port to listen on (0 for auto-assign)
        """
        self.directory = directory
        self.port = port
        self.server: HTTPServer | None = None
        self.thread: Thread | None = None

    def start(self) -> str:
        """Start the HTTP server in a background thread.

        Returns:
            Base URL of the server
        """
        import functools

        # Create handler that serves from our directory
        handler = functools.partial(SimpleHTTPRequestHandler, directory=str(self.directory))

        self.server = HTTPServer(("localhost", self.port), handler)
        self.port = self.server.server_port

        self.thread = Thread(target=self.server.serve_forever, daemon=True)
        self.thread.start()

        return f"http://localhost:{self.port}"

    def stop(self):
        """Stop the HTTP server."""
        if self.server:
            self.server.shutdown()
            self.server = None
        if self.thread:
            self.thread.join(timeout=5)
            self.thread = None

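def _example_server_usage(items_dir: Path) -> None:
    # Illustrative sketch only, not part of the released package: serve a
    # directory of generated items over HTTP and derive ingestible URLs from it.
    server = SimpleSTACServer(items_dir)
    base_url = server.start()  # e.g. "http://localhost:<auto-assigned-port>"
    try:
        urls = [f"{base_url}/{p.name}" for p in sorted(items_dir.glob("*.json"))]
        print(f"serving {len(urls)} items from {base_url}")
    finally:
        server.stop()
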
# Pytest fixtures
# Note: pytest_addoption, pytest_configure, and pytest_collection_modifyitems
# are defined in conftest.py to be picked up by pytest


@pytest.fixture(scope="module")
def e2e_config(request):
    """Get e2e test configuration from command line options."""

    class E2EConfig:
        n_items = int(request.config.getoption("--e2e-items", 100))
        outlier_tiny_percent = float(request.config.getoption("--e2e-outlier-tiny", 5))
        outlier_huge_percent = float(request.config.getoption("--e2e-outlier-huge", 5))
        seed = request.config.getoption("--e2e-seed", None)
        if seed is not None:
            seed = int(seed)

    return E2EConfig()


@pytest.fixture(scope="module")
def temp_test_dir():
    """Create a temporary directory for test files."""
    temp_dir = Path(tempfile.mkdtemp(prefix="earthcatalog_e2e_"))
    yield temp_dir
    # Cleanup after all tests in module
    shutil.rmtree(temp_dir, ignore_errors=True)


@pytest.fixture(scope="module")
def synthetic_generator(temp_test_dir, e2e_config):
    """Create and configure a synthetic STAC generator."""
    items_dir = temp_test_dir / "stac_items"
    items_dir.mkdir(parents=True, exist_ok=True)

    generator = SyntheticSTACGenerator(
        output_dir=items_dir,
        base_geometry=LANDSAT_SCENE,
        outlier_tiny_percent=e2e_config.outlier_tiny_percent,
        outlier_huge_percent=e2e_config.outlier_huge_percent,
        seed=e2e_config.seed,
    )

    return generator


@pytest.fixture(scope="module")
def generated_items(synthetic_generator, e2e_config):
    """Generate synthetic STAC items for testing."""
    items = synthetic_generator.generate_items(
        n_items=e2e_config.n_items,
        shuffle=True,
    )
    synthetic_generator.write_items_to_files(items)
    return items


@pytest.fixture(scope="module")
def stac_server(synthetic_generator, generated_items):
    """Start HTTP server to serve STAC items."""
    server = SimpleSTACServer(synthetic_generator.output_dir)
    base_url = server.start()
    yield base_url
    server.stop()


@pytest.fixture(scope="module")
def urls_parquet(synthetic_generator, stac_server, temp_test_dir):
    """Create Parquet file with URLs pointing to served STAC items."""
    parquet_path = temp_test_dir / "stac_urls.parquet"
    synthetic_generator.create_url_parquet(stac_server, parquet_path, shuffle=True)
    return parquet_path


class TestE2ESyntheticPipeline:
    """End-to-end tests using synthetic STAC items and the full ingestion pipeline."""

    @pytest.mark.e2e
    def test_e2e_pipeline_basic(self, urls_parquet, temp_test_dir, e2e_config):
        """Test basic end-to-end pipeline with synthetic data.

        This test verifies:
        1. Pipeline can ingest URLs from Parquet file
        2. Items are correctly partitioned spatially and temporally
        3. Consolidation produces valid GeoParquet output
        4. Schema generation works correctly
        """
        output_catalog = temp_test_dir / "catalog_basic"
        scratch_location = temp_test_dir / "scratch_basic"

        config = ProcessingConfig(
            input_file=str(urls_parquet),
            output_catalog=str(output_catalog),
            scratch_location=str(scratch_location),
            grid_system="h3",
            grid_resolution=2,
            temporal_bin="month",
            generate_schema=True,
            enable_concurrent_http=True,
            concurrent_requests=10,
            batch_size=50,
            max_workers=2,
            items_per_shard=100,
        )

        processor = LocalProcessor(n_workers=config.max_workers)
        pipeline = STACIngestionPipeline(config, processor)

        try:
            stats = pipeline.run()

            # Verify output
            assert len(stats) > 0, "Pipeline should produce at least one partition"

            total_items = sum(s["total_items"] for s in stats.values())
            assert total_items > 0, "Pipeline should ingest at least some items"

            # Check catalog directory structure
            assert output_catalog.exists(), "Output catalog should exist"

            # Check schema file was created
            schema_file = output_catalog / "catalog_schema.json"
            assert schema_file.exists(), "Schema file should be generated"

            with open(schema_file) as f:
                schema = json.load(f)
                assert "spatial_partitioning" in schema
                assert schema["spatial_partitioning"]["grid_system"] == "h3"

            # Verify Hive-style directory structure
            parquet_files = list(output_catalog.rglob("items.parquet"))
            assert len(parquet_files) > 0, "Should produce at least one items.parquet file"

            # Check for Hive-style directories
            for pf in parquet_files:
                path_str = str(pf)
                assert "partition=h3" in path_str or "partition=geojson" in path_str, (
                    f"Path should have Hive-style partition: {path_str}"
                )

        finally:
            processor.close()

    @pytest.mark.e2e
    def test_e2e_pipeline_with_global_partition(self, urls_parquet, temp_test_dir, e2e_config, synthetic_generator):
        """Test that large geometries are routed to global partition.

        This test verifies:
        1. Items with geometries spanning many cells go to global partition
        2. Regular items go to spatial partitions
        3. Global partition threshold is respected
        """
        # Ensure we have some large geometries
        geom_dist = synthetic_generator.get_geometry_distribution()
        large_count = geom_dist.get("huge", 0) + geom_dist.get("continental", 0)

        if large_count == 0:
            pytest.skip("No large geometry outliers in generated data")

        output_catalog = temp_test_dir / "catalog_global"
        scratch_location = temp_test_dir / "scratch_global"

        config = ProcessingConfig(
            input_file=str(urls_parquet),
            output_catalog=str(output_catalog),
            scratch_location=str(scratch_location),
            grid_system="h3",
            grid_resolution=2,
            temporal_bin="month",
            enable_global_partitioning=True,
            global_partition_threshold=5,  # Low threshold to trigger global routing
            generate_schema=True,
            enable_concurrent_http=True,
            concurrent_requests=10,
            batch_size=50,
            max_workers=2,
        )

        processor = LocalProcessor(n_workers=config.max_workers)
        pipeline = STACIngestionPipeline(config, processor)

        try:
            stats = pipeline.run()

            # Check if any partitions have "global" in their path
            _global_partitions = [k for k in stats.keys() if "global" in k.lower()]

            # Note: Global partition may or may not exist depending on geometry distribution
            # The test mainly verifies the pipeline handles this correctly without errors

            total_items = sum(s["total_items"] for s in stats.values())
            assert total_items > 0, "Pipeline should ingest items"

        finally:
            processor.close()

    @pytest.mark.e2e
    def test_e2e_pipeline_temporal_partitions(self, urls_parquet, temp_test_dir, e2e_config):
        """Test temporal partitioning creates correct Hive-style directories.

        This test verifies:
        1. Year/month directories are created with Hive naming
        2. Items are correctly distributed across temporal partitions
        """
        output_catalog = temp_test_dir / "catalog_temporal"
        scratch_location = temp_test_dir / "scratch_temporal"

        config = ProcessingConfig(
            input_file=str(urls_parquet),
            output_catalog=str(output_catalog),
            scratch_location=str(scratch_location),
            grid_system="h3",
            grid_resolution=2,
            temporal_bin="month",
            generate_schema=True,
            enable_concurrent_http=True,
            concurrent_requests=10,
            batch_size=50,
            max_workers=2,
        )

        processor = LocalProcessor(n_workers=config.max_workers)
        pipeline = STACIngestionPipeline(config, processor)

        try:
            _stats = pipeline.run()

            # Check for Hive-style temporal directories
            parquet_files = list(output_catalog.rglob("items.parquet"))
            assert len(parquet_files) > 0

            year_dirs = set()
            month_dirs = set()

            for pf in parquet_files:
                path_parts = str(pf).split("/")
                for part in path_parts:
                    if part.startswith("year="):
                        year_dirs.add(part)
                    elif part.startswith("month="):
                        month_dirs.add(part)

            # Should have at least one year and month partition
            assert len(year_dirs) > 0, "Should have year= directories"
            assert len(month_dirs) > 0, "Should have month= directories"

        finally:
            processor.close()

    @pytest.mark.e2e
    def test_e2e_pipeline_multiple_datasets(self, temp_test_dir, e2e_config):
        """Test pipeline with items from multiple datasets.

        This test verifies:
        1. Items are partitioned by dataset/mission
        2. Multiple dataset directories are created
        """
        # Generate items with multiple datasets
        items_dir = temp_test_dir / "multi_dataset_items"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(
            output_dir=items_dir,
            base_geometry=LANDSAT_SCENE,
            outlier_tiny_percent=0,  # No outliers for this test
            outlier_huge_percent=0,
            seed=42,
        )

        # Generate items for 3 different datasets
        dataset_ids = ["landsat_c2l2", "sentinel2_l2a", "modis_mcd43a4"]
        items = generator.generate_items(
            n_items=min(e2e_config.n_items, 60),  # At least 20 per dataset
            dataset_ids=dataset_ids,
            shuffle=True,
        )
        generator.write_items_to_files(items)

        # Start server and create URL parquet
        server = SimpleSTACServer(items_dir)
        base_url = server.start()

        try:
            parquet_path = temp_test_dir / "multi_dataset_urls.parquet"
            generator.create_url_parquet(base_url, parquet_path, shuffle=True)

            output_catalog = temp_test_dir / "catalog_multi_dataset"
            scratch_location = temp_test_dir / "scratch_multi_dataset"

            config = ProcessingConfig(
                input_file=str(parquet_path),
                output_catalog=str(output_catalog),
                scratch_location=str(scratch_location),
                grid_system="h3",
                grid_resolution=2,
                temporal_bin="month",
                generate_schema=True,
                enable_concurrent_http=True,
                concurrent_requests=10,
                batch_size=50,
                max_workers=2,
            )

            processor = LocalProcessor(n_workers=config.max_workers)
            pipeline = STACIngestionPipeline(config, processor)

            try:
                _stats = pipeline.run()

                # Check for multiple dataset directories
                assert output_catalog.exists()

                # Get top-level directories (should be dataset names)
                top_dirs = [d.name for d in output_catalog.iterdir() if d.is_dir()]

                # Should have directories for multiple datasets
                # (some items may fail to download, so just check we have at least 1)
                assert len(top_dirs) >= 1, f"Should have dataset directories, found: {top_dirs}"

            finally:
                processor.close()

        finally:
            server.stop()

    @pytest.mark.e2e
    def test_e2e_consolidation_deduplication(self, temp_test_dir, e2e_config):
        """Test that consolidation properly deduplicates items.

        This test verifies:
        1. Running pipeline twice with overlapping data deduplicates correctly
        2. Newer items override older ones (keep='last' behavior)
        """
        items_dir = temp_test_dir / "dedup_items"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(
            output_dir=items_dir,
            base_geometry=LANDSAT_SCENE,
            outlier_tiny_percent=0,
            outlier_huge_percent=0,
            seed=42,
        )

        # Generate initial set of items
        items = generator.generate_items(n_items=20, shuffle=False)
        generator.write_items_to_files(items)

        server = SimpleSTACServer(items_dir)
        base_url = server.start()

        try:
            parquet_path = temp_test_dir / "dedup_urls.parquet"
            generator.create_url_parquet(base_url, parquet_path, shuffle=True)

            output_catalog = temp_test_dir / "catalog_dedup"
            scratch_location = temp_test_dir / "scratch_dedup"

            config = ProcessingConfig(
                input_file=str(parquet_path),
                output_catalog=str(output_catalog),
                scratch_location=str(scratch_location),
                grid_system="h3",
                grid_resolution=2,
                temporal_bin="month",
                enable_concurrent_http=True,
                concurrent_requests=5,
                batch_size=20,
                max_workers=1,
            )

            processor = LocalProcessor(n_workers=1)
            pipeline = STACIngestionPipeline(config, processor)

            try:
                # First run
                stats1 = pipeline.run()
                total1 = sum(s["total_items"] for s in stats1.values())

                # Second run with same data (should deduplicate)
                stats2 = pipeline.run()
                total2 = sum(s["total_items"] for s in stats2.values())

                # Should have same total (deduplication working)
                assert total2 == total1, f"Second run should have same total due to deduplication: {total2} vs {total1}"

            finally:
                processor.close()

        finally:
            server.stop()


class TestSyntheticGenerator:
    """Tests for the synthetic STAC generator itself."""

    @pytest.mark.e2e
    def test_generator_produces_valid_items(self, synthetic_generator, e2e_config):
        """Test that generated items have valid STAC structure."""
        items = synthetic_generator.generate_items(n_items=10, shuffle=False)

        for item in items:
            # Check required STAC fields
            assert "type" in item and item["type"] == "Feature"
            assert "stac_version" in item
            assert "id" in item
            assert "geometry" in item
            assert "bbox" in item
            assert "properties" in item
            assert "assets" in item
            assert "links" in item

            # Check geometry structure
            geom = item["geometry"]
            assert geom["type"] == "Polygon"
            assert "coordinates" in geom
            assert len(geom["coordinates"][0]) == 5  # Closed polygon

            # Check bbox consistency with geometry
            bbox = item["bbox"]
            coords = geom["coordinates"][0]
            min_lon = min(c[0] for c in coords)
            max_lon = max(c[0] for c in coords)
            min_lat = min(c[1] for c in coords)
            max_lat = max(c[1] for c in coords)

            assert abs(bbox[0] - min_lon) < 0.0001
            assert abs(bbox[1] - min_lat) < 0.0001
            assert abs(bbox[2] - max_lon) < 0.0001
            assert abs(bbox[3] - max_lat) < 0.0001

    @pytest.mark.e2e
    def test_generator_outlier_distribution(self, temp_test_dir):
        """Test that outlier percentages are respected."""
        items_dir = temp_test_dir / "outlier_test"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(
            output_dir=items_dir,
            outlier_tiny_percent=20.0,
            outlier_huge_percent=10.0,
            seed=42,
        )

        # Generate many items to get statistical significance
        items = generator.generate_items(n_items=1000, shuffle=False)
        dist = generator.get_geometry_distribution()

        # Check distribution is roughly correct (within tolerance)
        total = len(items)
        tiny_pct = (dist.get("tiny", 0) / total) * 100
        large_pct = ((dist.get("huge", 0) + dist.get("continental", 0)) / total) * 100

        # Allow 50% tolerance due to randomness
        assert 10 < tiny_pct < 30, f"Tiny outlier percentage {tiny_pct:.1f}% not in expected range"
        assert 5 < large_pct < 20, f"Large outlier percentage {large_pct:.1f}% not in expected range"

    @pytest.mark.e2e
    def test_generator_shuffle(self, temp_test_dir):
        """Test that shuffle parameter works correctly."""
        items_dir = temp_test_dir / "shuffle_test"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(output_dir=items_dir, seed=42)

        # Generate unshuffled
        items_unshuffled = generator.generate_items(n_items=50, shuffle=False)
        ids_unshuffled = [item["id"] for item in items_unshuffled]

        # Reset and generate shuffled (same seed means same items, different order)
        generator2 = SyntheticSTACGenerator(output_dir=items_dir, seed=42)
        items_shuffled = generator2.generate_items(n_items=50, shuffle=True)
        ids_shuffled = [item["id"] for item in items_shuffled]

        # Should have same IDs but different order
        assert set(ids_unshuffled) == set(ids_shuffled)
        assert ids_unshuffled != ids_shuffled, "Shuffled order should differ from unshuffled"


# =============================================================================
# Query Performance Profiling Tests
# =============================================================================


@dataclass
class QueryProfileResult:
    """Result of a single query profile run."""

    engine: str
    query_type: str
    iteration: int
    duration_ms: float
    rows_returned: int
    partitions_scanned: int
    error: str | None = None


@dataclass
class ProfileSummary:
    """Summary statistics for a profiling run."""

    engine: str
    query_type: str
    iterations: int
    min_ms: float
    max_ms: float
    mean_ms: float
    median_ms: float
    std_ms: float
    total_rows: int
    partitions_scanned: int


class QueryProfiler:
    """Profiler for measuring query performance across different engines.

    Supports profiling queries using:
    - DuckDB (with spatial extension)
    - rustac (Rust-based STAC querying)
    - Spatial resolver (partition resolution only)
    """

    def __init__(
        self,
        catalog_path: Path,
        schema_path: Path,
        iterations: int = 10,
    ):
        """Initialize the query profiler.

        Args:
            catalog_path: Path to the catalog directory
            schema_path: Path to the catalog schema JSON file
            iterations: Number of iterations per query for averaging
        """
        self.catalog_path = catalog_path
        self.schema_path = schema_path
        self.iterations = iterations
        self.results: list[QueryProfileResult] = []

    def _get_test_geometries(self) -> dict[str, Any]:
        """Generate test geometries of different sizes for profiling."""
        from shapely.geometry import box

        return {
            "point_query": box(-105.0, 40.0, -104.99, 40.01),  # ~1km bbox
            "small_region": box(-106.0, 39.0, -104.0, 41.0),  # ~200km bbox
            "medium_region": box(-110.0, 35.0, -100.0, 45.0),  # ~1000km bbox
            "large_region": box(-125.0, 25.0, -65.0, 50.0),  # Continental US
        }

    def profile_spatial_resolver(self, geometry, query_name: str) -> list[QueryProfileResult]:
        """Profile spatial resolver partition resolution performance."""
        import time

        from earthcatalog.spatial_resolver import spatial_resolver

        results = []

        for i in range(self.iterations):
            try:
                start = time.perf_counter()

                resolver = spatial_resolver(str(self.schema_path), str(self.catalog_path))
                partitions = resolver.resolve_partitions(geometry, overlap=True)
                _query_paths = resolver.generate_query_paths(partitions, temporal_filter=None)

                duration_ms = (time.perf_counter() - start) * 1000

                results.append(
                    QueryProfileResult(
                        engine="spatial_resolver",
                        query_type=query_name,
                        iteration=i,
                        duration_ms=duration_ms,
                        rows_returned=0,
                        partitions_scanned=len(partitions),
                    )
                )

            except (ValueError, TypeError, AttributeError, OSError) as e:
                results.append(
                    QueryProfileResult(
                        engine="spatial_resolver",
                        query_type=query_name,
                        iteration=i,
                        duration_ms=0,
                        rows_returned=0,
                        partitions_scanned=0,
                        error=str(e),
                    )
                )

        return results

    def profile_duckdb(self, geometry, query_name: str, temporal_filter: str | None = None) -> list[QueryProfileResult]:
        """Profile DuckDB query performance."""
        import time

        try:
            import duckdb
        except ImportError:
            return [
                QueryProfileResult(
                    engine="duckdb",
                    query_type=query_name,
                    iteration=0,
                    duration_ms=0,
                    rows_returned=0,
                    partitions_scanned=0,
                    error="duckdb not installed",
                )
            ]

        from earthcatalog.spatial_resolver import spatial_resolver

        results = []

        for i in range(self.iterations):
            try:
                # Get partition paths
                resolver = spatial_resolver(str(self.schema_path), str(self.catalog_path))
                partitions = resolver.resolve_partitions(geometry, overlap=True)
                query_paths = resolver.generate_query_paths(partitions, temporal_filter)

                if not query_paths:
                    results.append(
                        QueryProfileResult(
                            engine="duckdb",
                            query_type=query_name,
                            iteration=i,
                            duration_ms=0,
                            rows_returned=0,
                            partitions_scanned=0,
                            error="no partitions found",
                        )
                    )
                    continue

                # Build DuckDB query
                # Handle glob patterns in paths
                file_patterns = []
                for path in query_paths:
                    if "*" in path:
                        # Use glob for pattern matching
                        from pathlib import Path as P

                        matching_files = list(P(self.catalog_path).glob(path.replace(str(self.catalog_path) + "/", "")))
                        file_patterns.extend([str(f) for f in matching_files])
                    else:
                        file_patterns.append(path)

                if not file_patterns:
                    results.append(
                        QueryProfileResult(
                            engine="duckdb",
                            query_type=query_name,
                            iteration=i,
                            duration_ms=0,
                            rows_returned=0,
                            partitions_scanned=len(partitions),
                            error="no matching files",
                        )
                    )
                    continue

                # Create file list for DuckDB
                files_str = ", ".join([f"'{f}'" for f in file_patterns])
                query = f"SELECT COUNT(*) as cnt FROM read_parquet([{files_str}])"

                start = time.perf_counter()
                result = duckdb.sql(query).fetchone()
                duration_ms = (time.perf_counter() - start) * 1000

                row_count = result[0] if result else 0

                results.append(
                    QueryProfileResult(
                        engine="duckdb",
                        query_type=query_name,
                        iteration=i,
                        duration_ms=duration_ms,
                        rows_returned=row_count,
                        partitions_scanned=len(partitions),
                    )
                )

            except (ValueError, TypeError, AttributeError, OSError) as e:
                results.append(
                    QueryProfileResult(
                        engine="duckdb",
                        query_type=query_name,
                        iteration=i,
                        duration_ms=0,
                        rows_returned=0,
                        partitions_scanned=0,
                        error=str(e),
                    )
                )

        return results

    def profile_rustac(self, geometry, query_name: str, temporal_filter: str | None = None) -> list[QueryProfileResult]:
        """Profile rustac query performance."""
        import time

        try:
            import rustac
        except ImportError:
            return [
                QueryProfileResult(
                    engine="rustac",
                    query_type=query_name,
                    iteration=0,
                    duration_ms=0,
                    rows_returned=0,
                    partitions_scanned=0,
                    error="rustac not installed",
                )
            ]

        from earthcatalog.spatial_resolver import spatial_resolver

        results = []

        for i in range(self.iterations):
            try:
                # Get partition paths
                resolver = spatial_resolver(str(self.schema_path), str(self.catalog_path))
                partitions = resolver.resolve_partitions(geometry, overlap=True)
                query_paths = resolver.generate_query_paths(partitions, temporal_filter)

                if not query_paths:
                    results.append(
                        QueryProfileResult(
                            engine="rustac",
                            query_type=query_name,
                            iteration=i,
                            duration_ms=0,
                            rows_returned=0,
                            partitions_scanned=0,
                            error="no partitions found",
                        )
                    )
                    continue

                # Resolve glob patterns to actual files
                file_patterns = []
                for path in query_paths:
                    if "*" in path:
                        from pathlib import Path as P

                        matching_files = list(P(self.catalog_path).glob(path.replace(str(self.catalog_path) + "/", "")))
                        file_patterns.extend([str(f) for f in matching_files])
                    else:
                        file_patterns.append(path)

                if not file_patterns:
                    results.append(
                        QueryProfileResult(
                            engine="rustac",
                            query_type=query_name,
                            iteration=i,
                            duration_ms=0,
                            rows_returned=0,
                            partitions_scanned=len(partitions),
                            error="no matching files",
                        )
                    )
                    continue

                start = time.perf_counter()

                # Use rustac to read items (rustac.read is async)
                import asyncio

                async def count_items_async(paths: list[str]) -> int:
                    count = 0
                    for file_path in paths:
                        if Path(file_path).exists():
                            items = await rustac.read(file_path)
                            count += len(items)
                    return count

                total_items = asyncio.run(count_items_async(file_patterns))

                duration_ms = (time.perf_counter() - start) * 1000

                results.append(
                    QueryProfileResult(
                        engine="rustac",
                        query_type=query_name,
                        iteration=i,
                        duration_ms=duration_ms,
                        rows_returned=total_items,
                        partitions_scanned=len(partitions),
                    )
                )

            except (ValueError, TypeError, AttributeError, OSError) as e:
                results.append(
                    QueryProfileResult(
                        engine="rustac",
                        query_type=query_name,
                        iteration=i,
                        duration_ms=0,
                        rows_returned=0,
                        partitions_scanned=0,
                        error=str(e),
                    )
                )

        return results

    def run_all_profiles(self, engines: list[str] | None = None) -> list[QueryProfileResult]:
        """Run all profiling queries across specified engines.

        Args:
            engines: List of engines to profile. Options: 'spatial_resolver', 'duckdb', 'rustac'
        """
        if engines is None:
            engines = ["spatial_resolver", "duckdb", "rustac"]

        all_results = []
        test_geometries = self._get_test_geometries()

        for query_name, geometry in test_geometries.items():
            for engine in engines:
                if engine == "spatial_resolver":
                    results = self.profile_spatial_resolver(geometry, query_name)
                elif engine == "duckdb":
                    results = self.profile_duckdb(geometry, query_name)
                elif engine == "rustac":
                    results = self.profile_rustac(geometry, query_name)
                else:
                    continue

                all_results.extend(results)

        self.results = all_results
        return all_results

    def get_summary(self) -> list[ProfileSummary]:
        """Calculate summary statistics for profiling results."""
        import statistics
        from collections import defaultdict

        # Group by engine and query type
        groups: dict[tuple[str, str], list[QueryProfileResult]] = defaultdict(list)
        for r in self.results:
            if r.error is None:
                groups[(r.engine, r.query_type)].append(r)

        summaries = []
        for (engine, query_type), results in groups.items():
            if not results:
                continue

            durations = [r.duration_ms for r in results]
            summaries.append(
                ProfileSummary(
                    engine=engine,
                    query_type=query_type,
                    iterations=len(results),
                    min_ms=min(durations),
                    max_ms=max(durations),
                    mean_ms=statistics.mean(durations),
                    median_ms=statistics.median(durations),
                    std_ms=statistics.stdev(durations) if len(durations) > 1 else 0,
                    total_rows=sum(r.rows_returned for r in results),
                    partitions_scanned=results[0].partitions_scanned if results else 0,
                )
            )

        return summaries

    def print_report(self):
        """Print a formatted profiling report."""
        summaries = self.get_summary()

        print("\n" + "=" * 80)
        print("QUERY PERFORMANCE PROFILING REPORT")
        print("=" * 80)

        # Group by query type
        from collections import defaultdict

        by_query: dict[str, list[ProfileSummary]] = defaultdict(list)
        for s in summaries:
            by_query[s.query_type].append(s)

        for query_type, engine_summaries in sorted(by_query.items()):
            print(f"\n{query_type}:")
            print("-" * 60)
            print(f"{'Engine':<20} {'Mean (ms)':<12} {'Median (ms)':<12} {'Std (ms)':<10} {'Partitions':<10}")
            print("-" * 60)

            for s in sorted(engine_summaries, key=lambda x: x.mean_ms):
                print(
                    f"{s.engine:<20} {s.mean_ms:<12.2f} {s.median_ms:<12.2f} {s.std_ms:<10.2f} {s.partitions_scanned:<10}"
                )

        print("\n" + "=" * 80)

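def _example_profiler_usage(catalog_dir: Path) -> None:
    # Illustrative sketch only, not part of the released package: profile an
    # already-built catalog. The schema filename matches the one the pipeline
    # writes (catalog_schema.json) and the engine names match run_all_profiles().
    profiler = QueryProfiler(
        catalog_path=catalog_dir,
        schema_path=catalog_dir / "catalog_schema.json",
        iterations=5,
    )
    profiler.run_all_profiles(engines=["spatial_resolver", "duckdb"])
    profiler.print_report()
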
|
|
1319
|
+
@pytest.fixture(scope="module")
|
|
1320
|
+
def grid_config(request):
|
|
1321
|
+
"""Get grid configuration from command line options."""
|
|
1322
|
+
|
|
1323
|
+
class GridConfig:
|
|
1324
|
+
grid_system = request.config.getoption("--e2e-grid", "h3")
|
|
1325
|
+
grid_level = int(request.config.getoption("--e2e-grid-level", 2))
|
|
1326
|
+
temporal_bin = request.config.getoption("--e2e-temporal", "month")
|
|
1327
|
+
|
|
1328
|
+
return GridConfig()
|
|
1329
|
+
|
|
1330
|
+
|
|
1331
|
+
@pytest.fixture(scope="module")
|
|
1332
|
+
def profile_config(request):
|
|
1333
|
+
"""Get profiling configuration from command line options."""
|
|
1334
|
+
|
|
1335
|
+
class ProfileConfig:
|
|
1336
|
+
enabled = request.config.getoption("--e2e-profile-queries", False)
|
|
1337
|
+
iterations = int(request.config.getoption("--e2e-query-iterations", 10))
|
|
1338
|
+
engines = request.config.getoption("--e2e-query-engines", "duckdb,rustac").split(",")
|
|
1339
|
+
|
|
1340
|
+
return ProfileConfig()
|
|
1341
|
+
|
|
1342
|
+
|
|
1343
|
+
class TestQueryPerformance:
    """Tests for query performance profiling with configurable grid systems."""

    @pytest.mark.e2e
    def test_configurable_grid_pipeline(self, temp_test_dir, e2e_config, grid_config):
        """Test pipeline with configurable grid system and resolution.

        Run with:
            pytest -m e2e -k "test_configurable_grid" --e2e-grid=h3 --e2e-grid-level=4 --e2e-temporal=year
        """
        items_dir = temp_test_dir / "grid_config_items"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(
            output_dir=items_dir,
            base_geometry=LANDSAT_SCENE,
            outlier_tiny_percent=e2e_config.outlier_tiny_percent,
            outlier_huge_percent=e2e_config.outlier_huge_percent,
            seed=e2e_config.seed,
        )

        items = generator.generate_items(n_items=e2e_config.n_items, shuffle=True)
        generator.write_items_to_files(items)

        server = SimpleSTACServer(items_dir)
        base_url = server.start()

        try:
            parquet_path = temp_test_dir / "grid_config_urls.parquet"
            generator.create_url_parquet(base_url, parquet_path, shuffle=True)

            output_catalog = temp_test_dir / "catalog_grid_config"
            scratch_location = temp_test_dir / "scratch_grid_config"

            config = ProcessingConfig(
                input_file=str(parquet_path),
                output_catalog=str(output_catalog),
                scratch_location=str(scratch_location),
                grid_system=grid_config.grid_system,
                grid_resolution=grid_config.grid_level,
                temporal_bin=grid_config.temporal_bin,
                generate_schema=True,
                enable_concurrent_http=True,
                concurrent_requests=10,
                batch_size=50,
                max_workers=2,
            )

            processor = LocalProcessor(n_workers=config.max_workers)
            pipeline = STACIngestionPipeline(config, processor)

            try:
                stats = pipeline.run()

                assert len(stats) > 0, "Pipeline should produce partitions"
                total_items = sum(s["total_items"] for s in stats.values())
                assert total_items > 0, "Pipeline should ingest items"

                # Verify schema reflects configuration
                schema_file = output_catalog / "catalog_schema.json"
                assert schema_file.exists()

                with open(schema_file) as f:
                    schema = json.load(f)
                assert schema["spatial_partitioning"]["grid_system"] == grid_config.grid_system
                assert schema["temporal_partitioning"]["temporal_bin"] == grid_config.temporal_bin

                print("\nGrid Config Test Results:")
                print(f"  Grid System: {grid_config.grid_system}")
                print(f"  Grid Level: {grid_config.grid_level}")
                print(f"  Temporal Bin: {grid_config.temporal_bin}")
                print(f"  Partitions Created: {len(stats)}")
                print(f"  Items Ingested: {total_items}")

            finally:
                processor.close()

        finally:
            server.stop()

    @pytest.mark.e2e
    def test_query_performance_profiling(self, temp_test_dir, e2e_config, grid_config, profile_config):
        """Profile query performance across different engines.

        Run with:
            pytest -m e2e -k "test_query_performance" \\
                --e2e-items=500 \\
                --e2e-grid=h3 --e2e-grid-level=2 \\
                --e2e-profile-queries \\
                --e2e-query-iterations=5 \\
                --e2e-query-engines=duckdb,rustac
        """
        if not profile_config.enabled:
            pytest.skip("Query profiling not enabled. Use --e2e-profile-queries to enable.")

        # First, create a catalog with enough items for meaningful profiling
        items_dir = temp_test_dir / "profile_items"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(
            output_dir=items_dir,
            base_geometry=LANDSAT_SCENE,
            outlier_tiny_percent=e2e_config.outlier_tiny_percent,
            outlier_huge_percent=e2e_config.outlier_huge_percent,
            seed=e2e_config.seed,
        )

        items = generator.generate_items(n_items=e2e_config.n_items, shuffle=True)
        generator.write_items_to_files(items)

        server = SimpleSTACServer(items_dir)
        base_url = server.start()

        try:
            parquet_path = temp_test_dir / "profile_urls.parquet"
            generator.create_url_parquet(base_url, parquet_path, shuffle=True)

            output_catalog = temp_test_dir / "catalog_profile"
            scratch_location = temp_test_dir / "scratch_profile"

            config = ProcessingConfig(
                input_file=str(parquet_path),
                output_catalog=str(output_catalog),
                scratch_location=str(scratch_location),
                grid_system=grid_config.grid_system,
                grid_resolution=grid_config.grid_level,
                temporal_bin=grid_config.temporal_bin,
                generate_schema=True,
                enable_concurrent_http=True,
                concurrent_requests=20,
                batch_size=100,
                max_workers=4,
            )

            processor = LocalProcessor(n_workers=config.max_workers)
            pipeline = STACIngestionPipeline(config, processor)

            try:
                print(f"\n{'=' * 60}")
                print("INGESTING CATALOG FOR PROFILING")
                print(f"{'=' * 60}")
                stats = pipeline.run()

                total_items = sum(s["total_items"] for s in stats.values())
                print(f"\nCatalog created: {len(stats)} partitions, {total_items} items")

            finally:
                processor.close()

            # Run query profiling
            schema_path = output_catalog / "catalog_schema.json"

            profiler = QueryProfiler(
                catalog_path=output_catalog,
                schema_path=schema_path,
                iterations=profile_config.iterations,
            )

            print(f"\n{'=' * 60}")
            print("RUNNING QUERY PERFORMANCE PROFILING")
            print(f"{'=' * 60}")
            print(f"Engines: {profile_config.engines}")
            print(f"Iterations per query: {profile_config.iterations}")

            # Add spatial_resolver to engines for comparison
            engines = ["spatial_resolver"] + profile_config.engines
            profiler.run_all_profiles(engines=engines)

            # Print report
            profiler.print_report()

            # Verify we got results
            assert len(profiler.results) > 0, "Should have profiling results"

            # Check for errors
            errors = [r for r in profiler.results if r.error is not None]
            if errors:
                print(f"\nWarning: {len(errors)} queries had errors:")
                for e in errors[:5]:  # Show first 5 errors
                    print(f"  {e.engine}/{e.query_type}: {e.error}")

        finally:
            server.stop()

    @pytest.mark.e2e
    def test_temporal_query_performance(self, temp_test_dir, e2e_config, grid_config, profile_config):
        """Profile query performance with temporal filtering.

        Run with:
            pytest -m e2e -k "test_temporal_query" \\
                --e2e-items=200 \\
                --e2e-temporal=month \\
                --e2e-profile-queries
        """
        if not profile_config.enabled:
            pytest.skip("Query profiling not enabled. Use --e2e-profile-queries to enable.")

        items_dir = temp_test_dir / "temporal_profile_items"
        items_dir.mkdir(parents=True, exist_ok=True)

        generator = SyntheticSTACGenerator(
            output_dir=items_dir,
            base_geometry=LANDSAT_SCENE,
            seed=e2e_config.seed,
        )

        items = generator.generate_items(n_items=e2e_config.n_items, shuffle=True)
        generator.write_items_to_files(items)

        server = SimpleSTACServer(items_dir)
        base_url = server.start()

        try:
            parquet_path = temp_test_dir / "temporal_profile_urls.parquet"
            generator.create_url_parquet(base_url, parquet_path, shuffle=True)

            output_catalog = temp_test_dir / "catalog_temporal_profile"
            scratch_location = temp_test_dir / "scratch_temporal_profile"

            config = ProcessingConfig(
                input_file=str(parquet_path),
                output_catalog=str(output_catalog),
                scratch_location=str(scratch_location),
                grid_system=grid_config.grid_system,
                grid_resolution=grid_config.grid_level,
                temporal_bin=grid_config.temporal_bin,
                generate_schema=True,
                enable_concurrent_http=True,
                max_workers=2,
            )

            processor = LocalProcessor(n_workers=config.max_workers)
            pipeline = STACIngestionPipeline(config, processor)

            try:
                stats = pipeline.run()
                total_items = sum(s["total_items"] for s in stats.values())
                print(f"\nCatalog created: {len(stats)} partitions, {total_items} items")
            finally:
                processor.close()

            # Profile with different temporal filters
            from shapely.geometry import box

            test_geometry = box(-110.0, 35.0, -100.0, 45.0)
            schema_path = output_catalog / "catalog_schema.json"

            from earthcatalog.spatial_resolver import spatial_resolver

            resolver = spatial_resolver(str(schema_path), str(output_catalog))

            temporal_filters = [
                None,  # No filter
                "year=2024/*",  # Specific year
                "year=202*/*",  # Multiple years
                "year=2024/month=0*/*",  # Months 01-09 of 2024
            ]
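
            # The glob patterns above assume Hive-style temporal partition directories
            # (e.g. <catalog>/<grid_cell>/year=2024/month=06/...); the exact ordering of
            # spatial and temporal path segments is an assumption here, not something this
            # test verifies. Under that layout, "year=2024/*" narrows paths to one year and
            # "year=2024/month=0*/*" to the zero-padded months 01-09 within it.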
print(f"\n{'=' * 60}")
|
|
1602
|
+
print("TEMPORAL FILTERING PERFORMANCE")
|
|
1603
|
+
print(f"{'=' * 60}")
|
|
1604
|
+
|
|
1605
|
+
import time
|
|
1606
|
+
|
|
1607
|
+
for tf in temporal_filters:
|
|
1608
|
+
start = time.perf_counter()
|
|
1609
|
+
partitions = resolver.resolve_partitions(test_geometry, overlap=True)
|
|
1610
|
+
paths = resolver.generate_query_paths(partitions, tf)
|
|
1611
|
+
duration_ms = (time.perf_counter() - start) * 1000
|
|
1612
|
+
|
|
1613
|
+
print(f"\nFilter: {tf or 'None'}")
|
|
1614
|
+
print(f" Partitions: {len(partitions)}")
|
|
1615
|
+
print(f" Paths generated: {len(paths)}")
|
|
1616
|
+
print(f" Duration: {duration_ms:.2f} ms")
|
|
1617
|
+
|
|
1618
|
+
finally:
|
|
1619
|
+
server.stop()
|
|
1620
|
+
|
|
1621
|
+
|
|
1622
|
+
if __name__ == "__main__":
|
|
1623
|
+
# Allow running tests directly
|
|
1624
|
+
pytest.main([__file__, "-v", "-m", "e2e"])
|