earthcatalog-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1624 @@
+ """End-to-end tests using synthetic STAC items.
+
+ This module provides comprehensive end-to-end testing of the EarthCatalog ingestion
+ pipeline using synthetically generated STAC items. These tests are designed to verify
+ the complete workflow from URL ingestion through consolidation with realistic,
+ parameterizable data.
+
+ These tests are NOT run as part of the regular test suite. They are intended to be
+ run on-demand for integration testing and performance benchmarking.
+
+ Run with:
+     pytest earthcatalog/tests/test_e2e_synthetic.py -v -m e2e
+
+ Or for specific test configurations:
+     pytest earthcatalog/tests/test_e2e_synthetic.py -v -k "test_e2e_pipeline" --e2e-items=1000
+ """
+
+ import json
+ import random
+ import shutil
+ import tempfile
+ from dataclasses import dataclass
+ from datetime import datetime, timedelta
+ from http.server import HTTPServer, SimpleHTTPRequestHandler
+ from pathlib import Path
+ from threading import Thread
+ from typing import Any
+
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import pytest
+ from jinja2 import Environment, FileSystemLoader
+
+ from earthcatalog.ingestion_pipeline import (
+     LocalProcessor,
+     ProcessingConfig,
+     STACIngestionPipeline,
+ )
+
+ # Mark entire module for e2e testing - skipped by default
+ pytestmark = pytest.mark.e2e
+
+
+ @dataclass
+ class GeometrySpec:
+     """Specification for geometry size/shape."""
+
+     width_deg: float  # Width in degrees longitude
+     height_deg: float  # Height in degrees latitude
+     name: str  # Descriptive name
+
+
+ # Standard Landsat scene size (approximately 185km x 180km, ~1.7deg x 1.6deg at equator)
+ LANDSAT_SCENE = GeometrySpec(width_deg=1.7, height_deg=1.6, name="landsat")
+
+ # Sentinel-2 tile (100km x 100km, ~0.9deg x 0.9deg at equator)
+ SENTINEL2_TILE = GeometrySpec(width_deg=0.9, height_deg=0.9, name="sentinel2")
+
+ # Very small geometry (outlier - too small, like a single point observation)
+ TINY_GEOMETRY = GeometrySpec(width_deg=0.001, height_deg=0.001, name="tiny")
+
+ # Very large geometry (outlier - spans many grid cells, like MODIS swath)
+ HUGE_GEOMETRY = GeometrySpec(width_deg=20.0, height_deg=15.0, name="huge")
+
+ # Continental scale (extreme outlier - should go to global partition)
+ CONTINENTAL = GeometrySpec(width_deg=50.0, height_deg=30.0, name="continental")
+
+
+ class SyntheticSTACGenerator:
+     """Generator for synthetic STAC items using Jinja templates.
+
+     This class creates realistic STAC items with configurable parameters for
+     testing the ingestion pipeline with various geometry sizes, temporal
+     distributions, and outlier conditions.
+
+     Attributes:
+         template_dir: Directory containing Jinja templates
+         output_dir: Directory for generated STAC JSON files
+         base_geometry: Default geometry specification for items
+         outlier_percent: Percentage of items with outlier geometries
+         seed: Random seed for reproducibility
+     """
+
+     def __init__(
+         self,
+         output_dir: Path,
+         base_geometry: GeometrySpec = LANDSAT_SCENE,
+         outlier_tiny_percent: float = 5.0,
+         outlier_huge_percent: float = 5.0,
+         seed: int | None = None,
+     ):
+         """Initialize the synthetic STAC generator.
+
+         Args:
+             output_dir: Directory to write generated STAC JSON files
+             base_geometry: Default geometry size specification
+             outlier_tiny_percent: Percentage of items with tiny geometries (0-100)
+             outlier_huge_percent: Percentage of items with huge geometries (0-100)
+             seed: Random seed for reproducibility
+         """
+         self.output_dir = Path(output_dir)
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         self.base_geometry = base_geometry
+         self.outlier_tiny_percent = outlier_tiny_percent
+         self.outlier_huge_percent = outlier_huge_percent
+
+         # Set random seed for reproducibility
+         if seed is not None:
+             random.seed(seed)
+
+         # Initialize Jinja environment
+         template_dir = Path(__file__).parent / "fixtures"
+         self.env = Environment(loader=FileSystemLoader(template_dir))
+         self.template = self.env.get_template("stac_item_template.jinja2")
+
+         # Track generated items
+         self.generated_items: list[dict[str, Any]] = []
+         self.generated_files: list[Path] = []
+
+     def _select_geometry_spec(self) -> GeometrySpec:
+         """Select geometry specification based on outlier percentages."""
+         roll = random.uniform(0, 100)
+
+         if roll < self.outlier_tiny_percent:
+             return TINY_GEOMETRY
+         elif roll < self.outlier_tiny_percent + self.outlier_huge_percent:
+             # Randomly choose between huge and continental for large outliers
+             return random.choice([HUGE_GEOMETRY, CONTINENTAL])
+         else:
+             return self.base_geometry
+
+     def _generate_random_bbox(self, geom_spec: GeometrySpec) -> tuple[float, float, float, float]:
+         """Generate a random bounding box within valid global bounds.
+
+         Returns:
+             Tuple of (min_lon, min_lat, max_lon, max_lat)
+         """
+         # Ensure geometry fits within global bounds
+         max_start_lon = 180.0 - geom_spec.width_deg
+         max_start_lat = 90.0 - geom_spec.height_deg
+
+         min_lon = random.uniform(-180.0, max_start_lon)
+         min_lat = random.uniform(-90.0, max_start_lat)
+
+         max_lon = min_lon + geom_spec.width_deg
+         max_lat = min_lat + geom_spec.height_deg
+
+         return (min_lon, min_lat, max_lon, max_lat)
+
+     def _generate_random_datetime(
+         self,
+         start_date: datetime = datetime(2020, 1, 1),
+         end_date: datetime = datetime(2024, 12, 31),
+     ) -> datetime:
+         """Generate a random datetime within the specified range."""
+         delta = end_date - start_date
+         random_days = random.randint(0, delta.days)
+         random_seconds = random.randint(0, 86400)
+         return start_date + timedelta(days=random_days, seconds=random_seconds)
+
+     def generate_item(
+         self,
+         item_index: int,
+         dataset_id: str = "landsat_c2l2",
+         collection: str = "landsat-c2-l2",
+     ) -> dict[str, Any]:
+         """Generate a single synthetic STAC item.
+
+         Args:
+             item_index: Index of this item (for unique ID generation)
+             dataset_id: Dataset identifier for partitioning
+             collection: STAC collection name
+
+         Returns:
+             Dictionary containing the generated STAC item
+         """
+         # Select geometry specification (may be outlier)
+         geom_spec = self._select_geometry_spec()
+
+         # Generate random bbox
+         min_lon, min_lat, max_lon, max_lat = self._generate_random_bbox(geom_spec)
+
+         # Generate random datetime
+         dt = self._generate_random_datetime()
+         dt_str = dt.strftime("%Y-%m-%dT%H:%M:%SZ")
+         created_str = datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")
+
+         # Generate unique identifiers
+         item_id = f"LC08_L2SP_{random.randint(1, 999):03d}{random.randint(1, 999):03d}_{dt.strftime('%Y%m%d')}_{item_index:06d}"
+         scene_id = f"LC08_{random.randint(1, 999):03d}{random.randint(1, 999):03d}_{dt.strftime('%Y%m%d')}_02_T1"
+
+         # WRS path/row (Landsat grid)
+         wrs_path = random.randint(1, 233)
+         wrs_row = random.randint(1, 248)
+
+         # Base URL for assets
+         base_url = f"https://example.com/data/{collection}/{item_id}"
+
+         # Template context
+         context = {
+             # Identification
+             "item_id": item_id,
+             "scene_id": scene_id,
+             # Geometry
+             "min_lon": round(min_lon, 6),
+             "min_lat": round(min_lat, 6),
+             "max_lon": round(max_lon, 6),
+             "max_lat": round(max_lat, 6),
+             # Temporal
+             "datetime": dt_str,
+             "created": created_str,
+             "updated": created_str,
+             # Platform info
+             "platform": "landsat-8",
+             "instruments": ["oli", "tirs"],
+             "constellation": "landsat",
+             "mission": "landsat-8",
+             "gsd": 30,
+             # Collection info
+             "collection": collection,
+             "dataset_id": dataset_id,
+             # Scene properties
+             "cloud_cover": round(random.uniform(0, 100), 2),
+             "off_nadir": round(random.uniform(0, 15), 2),
+             "sun_azimuth": round(random.uniform(0, 360), 2),
+             "sun_elevation": round(random.uniform(10, 80), 2),
+             # Projection
+             "epsg": 32600 + random.randint(1, 60),  # UTM zones
+             "proj_shape": [7611, 7531],  # Standard Landsat dimensions
+             "proj_transform": [30.0, 0.0, min_lon * 111000, 0.0, -30.0, max_lat * 111000],
+             # Landsat-specific
+             "wrs_path": wrs_path,
+             "wrs_row": wrs_row,
+             "collection_category": "T1",
+             "collection_number": "02",
+             # Links
+             "self_href": f"{base_url}/{item_id}.json",
+             "parent_href": f"https://example.com/collections/{collection}",
+             "collection_href": f"https://example.com/collections/{collection}",
+             "root_href": "https://example.com",
+             # Asset URLs
+             "thumbnail_href": f"{base_url}/{item_id}_thumb_small.jpeg",
+             "browse_href": f"{base_url}/{item_id}_thumb_large.jpeg",
+             "mtl_href": f"{base_url}/{item_id}_MTL.json",
+             "mtl_txt_href": f"{base_url}/{item_id}_MTL.txt",
+             "mtl_xml_href": f"{base_url}/{item_id}_MTL.xml",
+             "ang_href": f"{base_url}/{item_id}_ANG.txt",
+             "qa_pixel_href": f"{base_url}/{item_id}_QA_PIXEL.TIF",
+             "qa_radsat_href": f"{base_url}/{item_id}_QA_RADSAT.TIF",
+             "coastal_href": f"{base_url}/{item_id}_SR_B1.TIF",
+             "blue_href": f"{base_url}/{item_id}_SR_B2.TIF",
+             "green_href": f"{base_url}/{item_id}_SR_B3.TIF",
+             "red_href": f"{base_url}/{item_id}_SR_B4.TIF",
+             "nir08_href": f"{base_url}/{item_id}_SR_B5.TIF",
+             "swir16_href": f"{base_url}/{item_id}_SR_B6.TIF",
+             "swir22_href": f"{base_url}/{item_id}_SR_B7.TIF",
+             "lwir11_href": f"{base_url}/{item_id}_ST_B10.TIF",
+         }
+
+         # Render template
+         rendered = self.template.render(**context)
+         item = json.loads(rendered)
+
+         # Store metadata about geometry type for analysis
+         item["_synthetic_metadata"] = {
+             "geometry_type": geom_spec.name,
+             "index": item_index,
+         }
+
+         return item
+
+     def generate_items(
+         self,
+         n_items: int,
+         dataset_ids: list[str] | None = None,
+         shuffle: bool = True,
+     ) -> list[dict[str, Any]]:
+         """Generate multiple synthetic STAC items.
+
+         Args:
+             n_items: Number of items to generate
+             dataset_ids: List of dataset IDs to distribute items across
+             shuffle: Whether to shuffle the items (simulates unordered real-world data)
+
+         Returns:
+             List of generated STAC items
+         """
+         if dataset_ids is None:
+             dataset_ids = ["landsat_c2l2", "sentinel2_l2a", "modis_mcd43a4"]
+
+         items = []
+         for i in range(n_items):
+             dataset_id = random.choice(dataset_ids)
+             collection = dataset_id.replace("_", "-")
+             item = self.generate_item(i, dataset_id=dataset_id, collection=collection)
+             items.append(item)
+
+         if shuffle:
+             random.shuffle(items)
+
+         self.generated_items = items
+         return items
+
+     def write_items_to_files(self, items: list[dict[str, Any]] | None = None) -> list[Path]:
+         """Write STAC items to individual JSON files.
+
+         Args:
+             items: List of items to write. Uses self.generated_items if None.
+
+         Returns:
+             List of paths to generated files
+         """
+         if items is None:
+             items = self.generated_items
+
+         files = []
+         for item in items:
+             # Remove synthetic metadata before writing
+             item_clean = {k: v for k, v in item.items() if not k.startswith("_")}
+             item_id = item_clean["id"]
+             file_path = self.output_dir / f"{item_id}.json"
+
+             with open(file_path, "w") as f:
+                 json.dump(item_clean, f, indent=2)
+
+             files.append(file_path)
+
+         self.generated_files = files
+         return files
+
+     def create_url_parquet(
+         self,
+         base_url: str,
+         output_path: Path,
+         shuffle: bool = True,
+     ) -> Path:
+         """Create a Parquet file with URLs pointing to generated STAC items.
+
+         Args:
+             base_url: Base URL where items will be served (e.g., http://localhost:8000)
+             output_path: Path for the output Parquet file
+             shuffle: Whether to shuffle URLs (simulates unordered real-world data)
+
+         Returns:
+             Path to the created Parquet file
+         """
+         urls = [f"{base_url}/{f.name}" for f in self.generated_files]
+
+         if shuffle:
+             random.shuffle(urls)
+
+         # Create DataFrame and write to Parquet
+         df = pd.DataFrame({"url": urls})
+         table = pa.Table.from_pandas(df)
+         pq.write_table(table, output_path)
+
+         return output_path
+
+     def get_geometry_distribution(self) -> dict[str, int]:
+         """Get distribution of geometry types in generated items."""
+         distribution: dict[str, int] = {}
+         for item in self.generated_items:
+             geom_type = item.get("_synthetic_metadata", {}).get("geometry_type", "unknown")
+             distribution[geom_type] = distribution.get(geom_type, 0) + 1
+         return distribution
+
+
+ class SimpleSTACServer:
+     """Simple HTTP server for serving STAC items during tests."""
+
+     def __init__(self, directory: Path, port: int = 0):
+         """Initialize the server.
+
+         Args:
+             directory: Directory containing STAC JSON files to serve
+             port: Port to listen on (0 for auto-assign)
+         """
+         self.directory = directory
+         self.port = port
+         self.server: HTTPServer | None = None
+         self.thread: Thread | None = None
+
+     def start(self) -> str:
+         """Start the HTTP server in a background thread.
+
+         Returns:
+             Base URL of the server
+         """
+         import functools
+
+         # Create handler that serves from our directory
+         handler = functools.partial(SimpleHTTPRequestHandler, directory=str(self.directory))
+
+         self.server = HTTPServer(("localhost", self.port), handler)
+         self.port = self.server.server_port
+
+         self.thread = Thread(target=self.server.serve_forever, daemon=True)
+         self.thread.start()
+
+         return f"http://localhost:{self.port}"
+
+     def stop(self):
+         """Stop the HTTP server."""
+         if self.server:
+             self.server.shutdown()
+             self.server = None
+         if self.thread:
+             self.thread.join(timeout=5)
+             self.thread = None
+
+
+ # Pytest fixtures
+ # Note: pytest_addoption, pytest_configure, and pytest_collection_modifyitems
+ # are defined in conftest.py to be picked up by pytest
+
+
+ @pytest.fixture(scope="module")
+ def e2e_config(request):
+     """Get e2e test configuration from command line options."""
+
+     class E2EConfig:
+         n_items = int(request.config.getoption("--e2e-items", 100))
+         outlier_tiny_percent = float(request.config.getoption("--e2e-outlier-tiny", 5))
+         outlier_huge_percent = float(request.config.getoption("--e2e-outlier-huge", 5))
+         seed = request.config.getoption("--e2e-seed", None)
+         if seed is not None:
+             seed = int(seed)
+
+     return E2EConfig()
+
+
+ @pytest.fixture(scope="module")
+ def temp_test_dir():
+     """Create a temporary directory for test files."""
+     temp_dir = Path(tempfile.mkdtemp(prefix="earthcatalog_e2e_"))
+     yield temp_dir
+     # Cleanup after all tests in module
+     shutil.rmtree(temp_dir, ignore_errors=True)
+
+
+ @pytest.fixture(scope="module")
+ def synthetic_generator(temp_test_dir, e2e_config):
+     """Create and configure a synthetic STAC generator."""
+     items_dir = temp_test_dir / "stac_items"
+     items_dir.mkdir(parents=True, exist_ok=True)
+
+     generator = SyntheticSTACGenerator(
+         output_dir=items_dir,
+         base_geometry=LANDSAT_SCENE,
+         outlier_tiny_percent=e2e_config.outlier_tiny_percent,
+         outlier_huge_percent=e2e_config.outlier_huge_percent,
+         seed=e2e_config.seed,
+     )
+
+     return generator
+
+
+ @pytest.fixture(scope="module")
+ def generated_items(synthetic_generator, e2e_config):
+     """Generate synthetic STAC items for testing."""
+     items = synthetic_generator.generate_items(
+         n_items=e2e_config.n_items,
+         shuffle=True,
+     )
+     synthetic_generator.write_items_to_files(items)
+     return items
+
+
+ @pytest.fixture(scope="module")
+ def stac_server(synthetic_generator, generated_items):
+     """Start HTTP server to serve STAC items."""
+     server = SimpleSTACServer(synthetic_generator.output_dir)
+     base_url = server.start()
+     yield base_url
+     server.stop()
+
+
+ @pytest.fixture(scope="module")
+ def urls_parquet(synthetic_generator, stac_server, temp_test_dir):
+     """Create Parquet file with URLs pointing to served STAC items."""
+     parquet_path = temp_test_dir / "stac_urls.parquet"
+     synthetic_generator.create_url_parquet(stac_server, parquet_path, shuffle=True)
+     return parquet_path
+
+
+ class TestE2ESyntheticPipeline:
+     """End-to-end tests using synthetic STAC items and the full ingestion pipeline."""
+
+     @pytest.mark.e2e
+     def test_e2e_pipeline_basic(self, urls_parquet, temp_test_dir, e2e_config):
+         """Test basic end-to-end pipeline with synthetic data.
+
+         This test verifies:
+         1. Pipeline can ingest URLs from Parquet file
+         2. Items are correctly partitioned spatially and temporally
+         3. Consolidation produces valid GeoParquet output
+         4. Schema generation works correctly
+         """
+         output_catalog = temp_test_dir / "catalog_basic"
+         scratch_location = temp_test_dir / "scratch_basic"
+
+         config = ProcessingConfig(
+             input_file=str(urls_parquet),
+             output_catalog=str(output_catalog),
+             scratch_location=str(scratch_location),
+             grid_system="h3",
+             grid_resolution=2,
+             temporal_bin="month",
+             generate_schema=True,
+             enable_concurrent_http=True,
+             concurrent_requests=10,
+             batch_size=50,
+             max_workers=2,
+             items_per_shard=100,
+         )
+
+         processor = LocalProcessor(n_workers=config.max_workers)
+         pipeline = STACIngestionPipeline(config, processor)
+
+         try:
+             stats = pipeline.run()
+
+             # Verify output
+             assert len(stats) > 0, "Pipeline should produce at least one partition"
+
+             total_items = sum(s["total_items"] for s in stats.values())
+             assert total_items > 0, "Pipeline should ingest at least some items"
+
+             # Check catalog directory structure
+             assert output_catalog.exists(), "Output catalog should exist"
+
+             # Check schema file was created
+             schema_file = output_catalog / "catalog_schema.json"
+             assert schema_file.exists(), "Schema file should be generated"
+
+             with open(schema_file) as f:
+                 schema = json.load(f)
+                 assert "spatial_partitioning" in schema
+                 assert schema["spatial_partitioning"]["grid_system"] == "h3"
+
+             # Verify Hive-style directory structure
+             parquet_files = list(output_catalog.rglob("items.parquet"))
+             assert len(parquet_files) > 0, "Should produce at least one items.parquet file"
+
+             # Check for Hive-style directories
+             for pf in parquet_files:
+                 path_str = str(pf)
+                 assert "partition=h3" in path_str or "partition=geojson" in path_str, (
+                     f"Path should have Hive-style partition: {path_str}"
+                 )
+
+         finally:
+             processor.close()
+
+     @pytest.mark.e2e
+     def test_e2e_pipeline_with_global_partition(self, urls_parquet, temp_test_dir, e2e_config, synthetic_generator):
+         """Test that large geometries are routed to global partition.
+
+         This test verifies:
+         1. Items with geometries spanning many cells go to global partition
+         2. Regular items go to spatial partitions
+         3. Global partition threshold is respected
+         """
+         # Ensure we have some large geometries
+         geom_dist = synthetic_generator.get_geometry_distribution()
+         large_count = geom_dist.get("huge", 0) + geom_dist.get("continental", 0)
+
+         if large_count == 0:
+             pytest.skip("No large geometry outliers in generated data")
+
+         output_catalog = temp_test_dir / "catalog_global"
+         scratch_location = temp_test_dir / "scratch_global"
+
+         config = ProcessingConfig(
+             input_file=str(urls_parquet),
+             output_catalog=str(output_catalog),
+             scratch_location=str(scratch_location),
+             grid_system="h3",
+             grid_resolution=2,
+             temporal_bin="month",
+             enable_global_partitioning=True,
+             global_partition_threshold=5,  # Low threshold to trigger global routing
+             generate_schema=True,
+             enable_concurrent_http=True,
+             concurrent_requests=10,
+             batch_size=50,
+             max_workers=2,
+         )
+
+         processor = LocalProcessor(n_workers=config.max_workers)
+         pipeline = STACIngestionPipeline(config, processor)
+
+         try:
+             stats = pipeline.run()
+
+             # Check if any partitions have "global" in their path
+             _global_partitions = [k for k in stats.keys() if "global" in k.lower()]
+
+             # Note: Global partition may or may not exist depending on geometry distribution
+             # The test mainly verifies the pipeline handles this correctly without errors
+
+             total_items = sum(s["total_items"] for s in stats.values())
+             assert total_items > 0, "Pipeline should ingest items"
+
+         finally:
+             processor.close()
+
+     @pytest.mark.e2e
+     def test_e2e_pipeline_temporal_partitions(self, urls_parquet, temp_test_dir, e2e_config):
+         """Test temporal partitioning creates correct Hive-style directories.
+
+         This test verifies:
+         1. Year/month directories are created with Hive naming
+         2. Items are correctly distributed across temporal partitions
+         """
+         output_catalog = temp_test_dir / "catalog_temporal"
+         scratch_location = temp_test_dir / "scratch_temporal"
+
+         config = ProcessingConfig(
+             input_file=str(urls_parquet),
+             output_catalog=str(output_catalog),
+             scratch_location=str(scratch_location),
+             grid_system="h3",
+             grid_resolution=2,
+             temporal_bin="month",
+             generate_schema=True,
+             enable_concurrent_http=True,
+             concurrent_requests=10,
+             batch_size=50,
+             max_workers=2,
+         )
+
+         processor = LocalProcessor(n_workers=config.max_workers)
+         pipeline = STACIngestionPipeline(config, processor)
+
+         try:
+             _stats = pipeline.run()
+
+             # Check for Hive-style temporal directories
+             parquet_files = list(output_catalog.rglob("items.parquet"))
+             assert len(parquet_files) > 0
+
+             year_dirs = set()
+             month_dirs = set()
+
+             for pf in parquet_files:
+                 path_parts = str(pf).split("/")
+                 for part in path_parts:
+                     if part.startswith("year="):
+                         year_dirs.add(part)
+                     elif part.startswith("month="):
+                         month_dirs.add(part)
+
+             # Should have at least one year and month partition
+             assert len(year_dirs) > 0, "Should have year= directories"
+             assert len(month_dirs) > 0, "Should have month= directories"
+
+         finally:
+             processor.close()
+
+     @pytest.mark.e2e
+     def test_e2e_pipeline_multiple_datasets(self, temp_test_dir, e2e_config):
+         """Test pipeline with items from multiple datasets.
+
+         This test verifies:
+         1. Items are partitioned by dataset/mission
+         2. Multiple dataset directories are created
+         """
+         # Generate items with multiple datasets
+         items_dir = temp_test_dir / "multi_dataset_items"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(
+             output_dir=items_dir,
+             base_geometry=LANDSAT_SCENE,
+             outlier_tiny_percent=0,  # No outliers for this test
+             outlier_huge_percent=0,
+             seed=42,
+         )
+
+         # Generate items for 3 different datasets
+         dataset_ids = ["landsat_c2l2", "sentinel2_l2a", "modis_mcd43a4"]
+         items = generator.generate_items(
+             n_items=min(e2e_config.n_items, 60),  # At least 20 per dataset
+             dataset_ids=dataset_ids,
+             shuffle=True,
+         )
+         generator.write_items_to_files(items)
+
+         # Start server and create URL parquet
+         server = SimpleSTACServer(items_dir)
+         base_url = server.start()
+
+         try:
+             parquet_path = temp_test_dir / "multi_dataset_urls.parquet"
+             generator.create_url_parquet(base_url, parquet_path, shuffle=True)
+
+             output_catalog = temp_test_dir / "catalog_multi_dataset"
+             scratch_location = temp_test_dir / "scratch_multi_dataset"
+
+             config = ProcessingConfig(
+                 input_file=str(parquet_path),
+                 output_catalog=str(output_catalog),
+                 scratch_location=str(scratch_location),
+                 grid_system="h3",
+                 grid_resolution=2,
+                 temporal_bin="month",
+                 generate_schema=True,
+                 enable_concurrent_http=True,
+                 concurrent_requests=10,
+                 batch_size=50,
+                 max_workers=2,
+             )
+
+             processor = LocalProcessor(n_workers=config.max_workers)
+             pipeline = STACIngestionPipeline(config, processor)
+
+             try:
+                 _stats = pipeline.run()
+
+                 # Check for multiple dataset directories
+                 assert output_catalog.exists()
+
+                 # Get top-level directories (should be dataset names)
+                 top_dirs = [d.name for d in output_catalog.iterdir() if d.is_dir()]
+
+                 # Should have directories for multiple datasets
+                 # (some items may fail to download, so just check we have at least 1)
+                 assert len(top_dirs) >= 1, f"Should have dataset directories, found: {top_dirs}"
+
+             finally:
+                 processor.close()
+
+         finally:
+             server.stop()
+
+     @pytest.mark.e2e
+     def test_e2e_consolidation_deduplication(self, temp_test_dir, e2e_config):
+         """Test that consolidation properly deduplicates items.
+
+         This test verifies:
+         1. Running pipeline twice with overlapping data deduplicates correctly
+         2. Newer items override older ones (keep='last' behavior)
+         """
+         items_dir = temp_test_dir / "dedup_items"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(
+             output_dir=items_dir,
+             base_geometry=LANDSAT_SCENE,
+             outlier_tiny_percent=0,
+             outlier_huge_percent=0,
+             seed=42,
+         )
+
+         # Generate initial set of items
+         items = generator.generate_items(n_items=20, shuffle=False)
+         generator.write_items_to_files(items)
+
+         server = SimpleSTACServer(items_dir)
+         base_url = server.start()
+
+         try:
+             parquet_path = temp_test_dir / "dedup_urls.parquet"
+             generator.create_url_parquet(base_url, parquet_path, shuffle=True)
+
+             output_catalog = temp_test_dir / "catalog_dedup"
+             scratch_location = temp_test_dir / "scratch_dedup"
+
+             config = ProcessingConfig(
+                 input_file=str(parquet_path),
+                 output_catalog=str(output_catalog),
+                 scratch_location=str(scratch_location),
+                 grid_system="h3",
+                 grid_resolution=2,
+                 temporal_bin="month",
+                 enable_concurrent_http=True,
+                 concurrent_requests=5,
+                 batch_size=20,
+                 max_workers=1,
+             )
+
+             processor = LocalProcessor(n_workers=1)
+             pipeline = STACIngestionPipeline(config, processor)
+
+             try:
+                 # First run
+                 stats1 = pipeline.run()
+                 total1 = sum(s["total_items"] for s in stats1.values())
+
+                 # Second run with same data (should deduplicate)
+                 stats2 = pipeline.run()
+                 total2 = sum(s["total_items"] for s in stats2.values())
+
+                 # Should have same total (deduplication working)
+                 assert total2 == total1, f"Second run should have same total due to deduplication: {total2} vs {total1}"
+
+             finally:
+                 processor.close()
+
+         finally:
+             server.stop()
+
+
+ class TestSyntheticGenerator:
+     """Tests for the synthetic STAC generator itself."""
+
+     @pytest.mark.e2e
+     def test_generator_produces_valid_items(self, synthetic_generator, e2e_config):
+         """Test that generated items have valid STAC structure."""
+         items = synthetic_generator.generate_items(n_items=10, shuffle=False)
+
+         for item in items:
+             # Check required STAC fields
+             assert "type" in item and item["type"] == "Feature"
+             assert "stac_version" in item
+             assert "id" in item
+             assert "geometry" in item
+             assert "bbox" in item
+             assert "properties" in item
+             assert "assets" in item
+             assert "links" in item
+
+             # Check geometry structure
+             geom = item["geometry"]
+             assert geom["type"] == "Polygon"
+             assert "coordinates" in geom
+             assert len(geom["coordinates"][0]) == 5  # Closed polygon
+
+             # Check bbox consistency with geometry
+             bbox = item["bbox"]
+             coords = geom["coordinates"][0]
+             min_lon = min(c[0] for c in coords)
+             max_lon = max(c[0] for c in coords)
+             min_lat = min(c[1] for c in coords)
+             max_lat = max(c[1] for c in coords)
+
+             assert abs(bbox[0] - min_lon) < 0.0001
+             assert abs(bbox[1] - min_lat) < 0.0001
+             assert abs(bbox[2] - max_lon) < 0.0001
+             assert abs(bbox[3] - max_lat) < 0.0001
+
+     @pytest.mark.e2e
+     def test_generator_outlier_distribution(self, temp_test_dir):
+         """Test that outlier percentages are respected."""
+         items_dir = temp_test_dir / "outlier_test"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(
+             output_dir=items_dir,
+             outlier_tiny_percent=20.0,
+             outlier_huge_percent=10.0,
+             seed=42,
+         )
+
+         # Generate many items to get statistical significance
+         items = generator.generate_items(n_items=1000, shuffle=False)
+         dist = generator.get_geometry_distribution()
+
+         # Check distribution is roughly correct (within tolerance)
+         total = len(items)
+         tiny_pct = (dist.get("tiny", 0) / total) * 100
+         large_pct = ((dist.get("huge", 0) + dist.get("continental", 0)) / total) * 100
+
+         # Allow 50% tolerance due to randomness
+         assert 10 < tiny_pct < 30, f"Tiny outlier percentage {tiny_pct:.1f}% not in expected range"
+         assert 5 < large_pct < 20, f"Large outlier percentage {large_pct:.1f}% not in expected range"
+
+     @pytest.mark.e2e
+     def test_generator_shuffle(self, temp_test_dir):
+         """Test that shuffle parameter works correctly."""
+         items_dir = temp_test_dir / "shuffle_test"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(output_dir=items_dir, seed=42)
+
+         # Generate unshuffled
+         items_unshuffled = generator.generate_items(n_items=50, shuffle=False)
+         ids_unshuffled = [item["id"] for item in items_unshuffled]
+
+         # Reset and generate shuffled (same seed means same items, different order)
+         generator2 = SyntheticSTACGenerator(output_dir=items_dir, seed=42)
+         items_shuffled = generator2.generate_items(n_items=50, shuffle=True)
+         ids_shuffled = [item["id"] for item in items_shuffled]
+
+         # Should have same IDs but different order
+         assert set(ids_unshuffled) == set(ids_shuffled)
+         assert ids_unshuffled != ids_shuffled, "Shuffled order should differ from unshuffled"
+
+
+ # =============================================================================
+ # Query Performance Profiling Tests
+ # =============================================================================
+
+
+ @dataclass
+ class QueryProfileResult:
+     """Result of a single query profile run."""
+
+     engine: str
+     query_type: str
+     iteration: int
+     duration_ms: float
+     rows_returned: int
+     partitions_scanned: int
+     error: str | None = None
+
+
+ @dataclass
+ class ProfileSummary:
+     """Summary statistics for a profiling run."""
+
+     engine: str
+     query_type: str
+     iterations: int
+     min_ms: float
+     max_ms: float
+     mean_ms: float
+     median_ms: float
+     std_ms: float
+     total_rows: int
+     partitions_scanned: int
+
+
+ class QueryProfiler:
+     """Profiler for measuring query performance across different engines.
+
+     Supports profiling queries using:
+     - DuckDB (with spatial extension)
+     - rustac (Rust-based STAC querying)
+     - Spatial resolver (partition resolution only)
+     """
+
+     def __init__(
+         self,
+         catalog_path: Path,
+         schema_path: Path,
+         iterations: int = 10,
+     ):
+         """Initialize the query profiler.
+
+         Args:
+             catalog_path: Path to the catalog directory
+             schema_path: Path to the catalog schema JSON file
+             iterations: Number of iterations per query for averaging
+         """
+         self.catalog_path = catalog_path
+         self.schema_path = schema_path
+         self.iterations = iterations
+         self.results: list[QueryProfileResult] = []
+
+     def _get_test_geometries(self) -> dict[str, Any]:
+         """Generate test geometries of different sizes for profiling."""
+         from shapely.geometry import box
+
+         return {
+             "point_query": box(-105.0, 40.0, -104.99, 40.01),  # ~1km bbox
+             "small_region": box(-106.0, 39.0, -104.0, 41.0),  # ~200km bbox
+             "medium_region": box(-110.0, 35.0, -100.0, 45.0),  # ~1000km bbox
+             "large_region": box(-125.0, 25.0, -65.0, 50.0),  # Continental US
+         }
+
+     def profile_spatial_resolver(self, geometry, query_name: str) -> list[QueryProfileResult]:
+         """Profile spatial resolver partition resolution performance."""
+         import time
+
+         from earthcatalog.spatial_resolver import spatial_resolver
+
+         results = []
+
+         for i in range(self.iterations):
+             try:
+                 start = time.perf_counter()
+
+                 resolver = spatial_resolver(str(self.schema_path), str(self.catalog_path))
+                 partitions = resolver.resolve_partitions(geometry, overlap=True)
+                 _query_paths = resolver.generate_query_paths(partitions, temporal_filter=None)
+
+                 duration_ms = (time.perf_counter() - start) * 1000
+
+                 results.append(
+                     QueryProfileResult(
+                         engine="spatial_resolver",
+                         query_type=query_name,
+                         iteration=i,
+                         duration_ms=duration_ms,
+                         rows_returned=0,
+                         partitions_scanned=len(partitions),
+                     )
+                 )
+
+             except (ValueError, TypeError, AttributeError, OSError) as e:
+                 results.append(
+                     QueryProfileResult(
+                         engine="spatial_resolver",
+                         query_type=query_name,
+                         iteration=i,
+                         duration_ms=0,
+                         rows_returned=0,
+                         partitions_scanned=0,
+                         error=str(e),
+                     )
+                 )
+
+         return results
+
+     def profile_duckdb(self, geometry, query_name: str, temporal_filter: str | None = None) -> list[QueryProfileResult]:
+         """Profile DuckDB query performance."""
+         import time
+
+         try:
+             import duckdb
+         except ImportError:
+             return [
+                 QueryProfileResult(
+                     engine="duckdb",
+                     query_type=query_name,
+                     iteration=0,
+                     duration_ms=0,
+                     rows_returned=0,
+                     partitions_scanned=0,
+                     error="duckdb not installed",
+                 )
+             ]
+
+         from earthcatalog.spatial_resolver import spatial_resolver
+
+         results = []
+
+         for i in range(self.iterations):
+             try:
+                 # Get partition paths
+                 resolver = spatial_resolver(str(self.schema_path), str(self.catalog_path))
+                 partitions = resolver.resolve_partitions(geometry, overlap=True)
+                 query_paths = resolver.generate_query_paths(partitions, temporal_filter)
+
+                 if not query_paths:
+                     results.append(
+                         QueryProfileResult(
+                             engine="duckdb",
+                             query_type=query_name,
+                             iteration=i,
+                             duration_ms=0,
+                             rows_returned=0,
+                             partitions_scanned=0,
+                             error="no partitions found",
+                         )
+                     )
+                     continue
+
+                 # Build DuckDB query
+                 # Handle glob patterns in paths
+                 file_patterns = []
+                 for path in query_paths:
+                     if "*" in path:
+                         # Use glob for pattern matching
+                         from pathlib import Path as P
+
+                         matching_files = list(P(self.catalog_path).glob(path.replace(str(self.catalog_path) + "/", "")))
+                         file_patterns.extend([str(f) for f in matching_files])
+                     else:
+                         file_patterns.append(path)
+
+                 if not file_patterns:
+                     results.append(
+                         QueryProfileResult(
+                             engine="duckdb",
+                             query_type=query_name,
+                             iteration=i,
+                             duration_ms=0,
+                             rows_returned=0,
+                             partitions_scanned=len(partitions),
+                             error="no matching files",
+                         )
+                     )
+                     continue
+
+                 # Create file list for DuckDB
+                 files_str = ", ".join([f"'{f}'" for f in file_patterns])
+                 query = f"SELECT COUNT(*) as cnt FROM read_parquet([{files_str}])"
+
+                 start = time.perf_counter()
+                 result = duckdb.sql(query).fetchone()
+                 duration_ms = (time.perf_counter() - start) * 1000
+
+                 row_count = result[0] if result else 0
+
+                 results.append(
+                     QueryProfileResult(
+                         engine="duckdb",
+                         query_type=query_name,
+                         iteration=i,
+                         duration_ms=duration_ms,
+                         rows_returned=row_count,
+                         partitions_scanned=len(partitions),
+                     )
+                 )
+
+             except (ValueError, TypeError, AttributeError, OSError) as e:
+                 results.append(
+                     QueryProfileResult(
+                         engine="duckdb",
+                         query_type=query_name,
+                         iteration=i,
+                         duration_ms=0,
+                         rows_returned=0,
+                         partitions_scanned=0,
+                         error=str(e),
+                     )
+                 )
+
+         return results
+
+     def profile_rustac(self, geometry, query_name: str, temporal_filter: str | None = None) -> list[QueryProfileResult]:
+         """Profile rustac query performance."""
+         import time
+
+         try:
+             import rustac
+         except ImportError:
+             return [
+                 QueryProfileResult(
+                     engine="rustac",
+                     query_type=query_name,
+                     iteration=0,
+                     duration_ms=0,
+                     rows_returned=0,
+                     partitions_scanned=0,
+                     error="rustac not installed",
+                 )
+             ]
+
+         from earthcatalog.spatial_resolver import spatial_resolver
+
+         results = []
+
+         for i in range(self.iterations):
+             try:
+                 # Get partition paths
+                 resolver = spatial_resolver(str(self.schema_path), str(self.catalog_path))
+                 partitions = resolver.resolve_partitions(geometry, overlap=True)
+                 query_paths = resolver.generate_query_paths(partitions, temporal_filter)
+
+                 if not query_paths:
+                     results.append(
+                         QueryProfileResult(
+                             engine="rustac",
+                             query_type=query_name,
+                             iteration=i,
+                             duration_ms=0,
+                             rows_returned=0,
+                             partitions_scanned=0,
+                             error="no partitions found",
+                         )
+                     )
+                     continue
+
+                 # Resolve glob patterns to actual files
+                 file_patterns = []
+                 for path in query_paths:
+                     if "*" in path:
+                         from pathlib import Path as P
+
+                         matching_files = list(P(self.catalog_path).glob(path.replace(str(self.catalog_path) + "/", "")))
+                         file_patterns.extend([str(f) for f in matching_files])
+                     else:
+                         file_patterns.append(path)
+
+                 if not file_patterns:
+                     results.append(
+                         QueryProfileResult(
+                             engine="rustac",
+                             query_type=query_name,
+                             iteration=i,
+                             duration_ms=0,
+                             rows_returned=0,
+                             partitions_scanned=len(partitions),
+                             error="no matching files",
+                         )
+                     )
+                     continue
+
+                 start = time.perf_counter()
+
+                 # Use rustac to read items (rustac.read is async)
+                 import asyncio
+
+                 async def count_items_async(paths: list[str]) -> int:
+                     count = 0
+                     for file_path in paths:
+                         if Path(file_path).exists():
+                             items = await rustac.read(file_path)
+                             count += len(items)
+                     return count
+
+                 total_items = asyncio.run(count_items_async(file_patterns))
+
+                 duration_ms = (time.perf_counter() - start) * 1000
+
+                 results.append(
+                     QueryProfileResult(
+                         engine="rustac",
+                         query_type=query_name,
+                         iteration=i,
+                         duration_ms=duration_ms,
+                         rows_returned=total_items,
+                         partitions_scanned=len(partitions),
+                     )
+                 )
+
+             except (ValueError, TypeError, AttributeError, OSError) as e:
+                 results.append(
+                     QueryProfileResult(
+                         engine="rustac",
+                         query_type=query_name,
+                         iteration=i,
+                         duration_ms=0,
+                         rows_returned=0,
+                         partitions_scanned=0,
+                         error=str(e),
+                     )
+                 )
+
+         return results
+
+     def run_all_profiles(self, engines: list[str] | None = None) -> list[QueryProfileResult]:
+         """Run all profiling queries across specified engines.
+
+         Args:
+             engines: List of engines to profile. Options: 'spatial_resolver', 'duckdb', 'rustac'
+         """
+         if engines is None:
+             engines = ["spatial_resolver", "duckdb", "rustac"]
+
+         all_results = []
+         test_geometries = self._get_test_geometries()
+
+         for query_name, geometry in test_geometries.items():
+             for engine in engines:
+                 if engine == "spatial_resolver":
+                     results = self.profile_spatial_resolver(geometry, query_name)
+                 elif engine == "duckdb":
+                     results = self.profile_duckdb(geometry, query_name)
+                 elif engine == "rustac":
+                     results = self.profile_rustac(geometry, query_name)
+                 else:
+                     continue
+
+                 all_results.extend(results)
+
+         self.results = all_results
+         return all_results
+
+     def get_summary(self) -> list[ProfileSummary]:
+         """Calculate summary statistics for profiling results."""
+         import statistics
+         from collections import defaultdict
+
+         # Group by engine and query type
+         groups: dict[tuple[str, str], list[QueryProfileResult]] = defaultdict(list)
+         for r in self.results:
+             if r.error is None:
+                 groups[(r.engine, r.query_type)].append(r)
+
+         summaries = []
+         for (engine, query_type), results in groups.items():
+             if not results:
+                 continue
+
+             durations = [r.duration_ms for r in results]
+             summaries.append(
+                 ProfileSummary(
+                     engine=engine,
+                     query_type=query_type,
+                     iterations=len(results),
+                     min_ms=min(durations),
+                     max_ms=max(durations),
+                     mean_ms=statistics.mean(durations),
+                     median_ms=statistics.median(durations),
+                     std_ms=statistics.stdev(durations) if len(durations) > 1 else 0,
+                     total_rows=sum(r.rows_returned for r in results),
+                     partitions_scanned=results[0].partitions_scanned if results else 0,
+                 )
+             )
+
+         return summaries
+
+     def print_report(self):
+         """Print a formatted profiling report."""
+         summaries = self.get_summary()
+
+         print("\n" + "=" * 80)
+         print("QUERY PERFORMANCE PROFILING REPORT")
+         print("=" * 80)
+
+         # Group by query type
+         from collections import defaultdict
+
+         by_query: dict[str, list[ProfileSummary]] = defaultdict(list)
+         for s in summaries:
+             by_query[s.query_type].append(s)
+
+         for query_type, engine_summaries in sorted(by_query.items()):
+             print(f"\n{query_type}:")
+             print("-" * 60)
+             print(f"{'Engine':<20} {'Mean (ms)':<12} {'Median (ms)':<12} {'Std (ms)':<10} {'Partitions':<10}")
+             print("-" * 60)
+
+             for s in sorted(engine_summaries, key=lambda x: x.mean_ms):
+                 print(
+                     f"{s.engine:<20} {s.mean_ms:<12.2f} {s.median_ms:<12.2f} {s.std_ms:<10.2f} {s.partitions_scanned:<10}"
+                 )
+
+         print("\n" + "=" * 80)
+
+
+ @pytest.fixture(scope="module")
+ def grid_config(request):
+     """Get grid configuration from command line options."""
+
+     class GridConfig:
+         grid_system = request.config.getoption("--e2e-grid", "h3")
+         grid_level = int(request.config.getoption("--e2e-grid-level", 2))
+         temporal_bin = request.config.getoption("--e2e-temporal", "month")
+
+     return GridConfig()
+
+
+ @pytest.fixture(scope="module")
+ def profile_config(request):
+     """Get profiling configuration from command line options."""
+
+     class ProfileConfig:
+         enabled = request.config.getoption("--e2e-profile-queries", False)
+         iterations = int(request.config.getoption("--e2e-query-iterations", 10))
+         engines = request.config.getoption("--e2e-query-engines", "duckdb,rustac").split(",")
+
+     return ProfileConfig()
+
+
+ class TestQueryPerformance:
+     """Tests for query performance profiling with configurable grid systems."""
+
+     @pytest.mark.e2e
+     def test_configurable_grid_pipeline(self, temp_test_dir, e2e_config, grid_config):
+         """Test pipeline with configurable grid system and resolution.
+
+         Run with:
+             pytest -m e2e -k "test_configurable_grid" --e2e-grid=h3 --e2e-grid-level=4 --e2e-temporal=year
+         """
+         items_dir = temp_test_dir / "grid_config_items"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(
+             output_dir=items_dir,
+             base_geometry=LANDSAT_SCENE,
+             outlier_tiny_percent=e2e_config.outlier_tiny_percent,
+             outlier_huge_percent=e2e_config.outlier_huge_percent,
+             seed=e2e_config.seed,
+         )
+
+         items = generator.generate_items(n_items=e2e_config.n_items, shuffle=True)
+         generator.write_items_to_files(items)
+
+         server = SimpleSTACServer(items_dir)
+         base_url = server.start()
+
+         try:
+             parquet_path = temp_test_dir / "grid_config_urls.parquet"
+             generator.create_url_parquet(base_url, parquet_path, shuffle=True)
+
+             output_catalog = temp_test_dir / "catalog_grid_config"
+             scratch_location = temp_test_dir / "scratch_grid_config"
+
+             config = ProcessingConfig(
+                 input_file=str(parquet_path),
+                 output_catalog=str(output_catalog),
+                 scratch_location=str(scratch_location),
+                 grid_system=grid_config.grid_system,
+                 grid_resolution=grid_config.grid_level,
+                 temporal_bin=grid_config.temporal_bin,
+                 generate_schema=True,
+                 enable_concurrent_http=True,
+                 concurrent_requests=10,
+                 batch_size=50,
+                 max_workers=2,
+             )
+
+             processor = LocalProcessor(n_workers=config.max_workers)
+             pipeline = STACIngestionPipeline(config, processor)
+
+             try:
+                 stats = pipeline.run()
+
+                 assert len(stats) > 0, "Pipeline should produce partitions"
+                 total_items = sum(s["total_items"] for s in stats.values())
+                 assert total_items > 0, "Pipeline should ingest items"
+
+                 # Verify schema reflects configuration
+                 schema_file = output_catalog / "catalog_schema.json"
+                 assert schema_file.exists()
+
+                 with open(schema_file) as f:
+                     schema = json.load(f)
+                     assert schema["spatial_partitioning"]["grid_system"] == grid_config.grid_system
+                     assert schema["temporal_partitioning"]["temporal_bin"] == grid_config.temporal_bin
+
+                 print("\nGrid Config Test Results:")
+                 print(f" Grid System: {grid_config.grid_system}")
+                 print(f" Grid Level: {grid_config.grid_level}")
+                 print(f" Temporal Bin: {grid_config.temporal_bin}")
+                 print(f" Partitions Created: {len(stats)}")
+                 print(f" Items Ingested: {total_items}")
+
+             finally:
+                 processor.close()
+
+         finally:
+             server.stop()
+
+     @pytest.mark.e2e
+     def test_query_performance_profiling(self, temp_test_dir, e2e_config, grid_config, profile_config):
+         """Profile query performance across different engines.
+
+         Run with:
+             pytest -m e2e -k "test_query_performance" \\
+                 --e2e-items=500 \\
+                 --e2e-grid=h3 --e2e-grid-level=2 \\
+                 --e2e-profile-queries \\
+                 --e2e-query-iterations=5 \\
+                 --e2e-query-engines=duckdb,rustac
+         """
+         if not profile_config.enabled:
+             pytest.skip("Query profiling not enabled. Use --e2e-profile-queries to enable.")
+
+         # First, create a catalog with enough items for meaningful profiling
+         items_dir = temp_test_dir / "profile_items"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(
+             output_dir=items_dir,
+             base_geometry=LANDSAT_SCENE,
+             outlier_tiny_percent=e2e_config.outlier_tiny_percent,
+             outlier_huge_percent=e2e_config.outlier_huge_percent,
+             seed=e2e_config.seed,
+         )
+
+         items = generator.generate_items(n_items=e2e_config.n_items, shuffle=True)
+         generator.write_items_to_files(items)
+
+         server = SimpleSTACServer(items_dir)
+         base_url = server.start()
+
+         try:
+             parquet_path = temp_test_dir / "profile_urls.parquet"
+             generator.create_url_parquet(base_url, parquet_path, shuffle=True)
+
+             output_catalog = temp_test_dir / "catalog_profile"
+             scratch_location = temp_test_dir / "scratch_profile"
+
+             config = ProcessingConfig(
+                 input_file=str(parquet_path),
+                 output_catalog=str(output_catalog),
+                 scratch_location=str(scratch_location),
+                 grid_system=grid_config.grid_system,
+                 grid_resolution=grid_config.grid_level,
+                 temporal_bin=grid_config.temporal_bin,
+                 generate_schema=True,
+                 enable_concurrent_http=True,
+                 concurrent_requests=20,
+                 batch_size=100,
+                 max_workers=4,
+             )
+
+             processor = LocalProcessor(n_workers=config.max_workers)
+             pipeline = STACIngestionPipeline(config, processor)
+
+             try:
+                 print(f"\n{'=' * 60}")
+                 print("INGESTING CATALOG FOR PROFILING")
+                 print(f"{'=' * 60}")
+                 stats = pipeline.run()
+
+                 total_items = sum(s["total_items"] for s in stats.values())
+                 print(f"\nCatalog created: {len(stats)} partitions, {total_items} items")
+
+             finally:
+                 processor.close()
+
+             # Run query profiling
+             schema_path = output_catalog / "catalog_schema.json"
+
+             profiler = QueryProfiler(
+                 catalog_path=output_catalog,
+                 schema_path=schema_path,
+                 iterations=profile_config.iterations,
+             )
+
+             print(f"\n{'=' * 60}")
+             print("RUNNING QUERY PERFORMANCE PROFILING")
+             print(f"{'=' * 60}")
+             print(f"Engines: {profile_config.engines}")
+             print(f"Iterations per query: {profile_config.iterations}")
+
+             # Add spatial_resolver to engines for comparison
+             engines = ["spatial_resolver"] + profile_config.engines
+             profiler.run_all_profiles(engines=engines)
+
+             # Print report
+             profiler.print_report()
+
+             # Verify we got results
+             assert len(profiler.results) > 0, "Should have profiling results"
+
+             # Check for errors
+             errors = [r for r in profiler.results if r.error is not None]
+             if errors:
+                 print(f"\nWarning: {len(errors)} queries had errors:")
+                 for e in errors[:5]:  # Show first 5 errors
+                     print(f" {e.engine}/{e.query_type}: {e.error}")
+
+         finally:
+             server.stop()
+
+     @pytest.mark.e2e
+     def test_temporal_query_performance(self, temp_test_dir, e2e_config, grid_config, profile_config):
+         """Profile query performance with temporal filtering.
+
+         Run with:
+             pytest -m e2e -k "test_temporal_query" \\
+                 --e2e-items=200 \\
+                 --e2e-temporal=month \\
+                 --e2e-profile-queries
+         """
+         if not profile_config.enabled:
+             pytest.skip("Query profiling not enabled. Use --e2e-profile-queries to enable.")
+
+         items_dir = temp_test_dir / "temporal_profile_items"
+         items_dir.mkdir(parents=True, exist_ok=True)
+
+         generator = SyntheticSTACGenerator(
+             output_dir=items_dir,
+             base_geometry=LANDSAT_SCENE,
+             seed=e2e_config.seed,
+         )
+
+         items = generator.generate_items(n_items=e2e_config.n_items, shuffle=True)
+         generator.write_items_to_files(items)
+
+         server = SimpleSTACServer(items_dir)
+         base_url = server.start()
+
+         try:
+             parquet_path = temp_test_dir / "temporal_profile_urls.parquet"
+             generator.create_url_parquet(base_url, parquet_path, shuffle=True)
+
+             output_catalog = temp_test_dir / "catalog_temporal_profile"
+             scratch_location = temp_test_dir / "scratch_temporal_profile"
+
+             config = ProcessingConfig(
+                 input_file=str(parquet_path),
+                 output_catalog=str(output_catalog),
+                 scratch_location=str(scratch_location),
+                 grid_system=grid_config.grid_system,
+                 grid_resolution=grid_config.grid_level,
+                 temporal_bin=grid_config.temporal_bin,
+                 generate_schema=True,
+                 enable_concurrent_http=True,
+                 max_workers=2,
+             )
+
+             processor = LocalProcessor(n_workers=config.max_workers)
+             pipeline = STACIngestionPipeline(config, processor)
+
+             try:
+                 stats = pipeline.run()
+                 total_items = sum(s["total_items"] for s in stats.values())
+                 print(f"\nCatalog created: {len(stats)} partitions, {total_items} items")
+             finally:
+                 processor.close()
+
+             # Profile with different temporal filters
+             from shapely.geometry import box
+
+             test_geometry = box(-110.0, 35.0, -100.0, 45.0)
+             schema_path = output_catalog / "catalog_schema.json"
+
+             from earthcatalog.spatial_resolver import spatial_resolver
+
+             resolver = spatial_resolver(str(schema_path), str(output_catalog))
+
+             temporal_filters = [
+                 None,  # No filter
+                 "year=2024/*",  # Specific year
+                 "year=202*/*",  # Multiple years
+                 "year=2024/month=0*/*",  # First half of year
+             ]
+
+             print(f"\n{'=' * 60}")
+             print("TEMPORAL FILTERING PERFORMANCE")
+             print(f"{'=' * 60}")
+
+             import time
+
+             for tf in temporal_filters:
+                 start = time.perf_counter()
+                 partitions = resolver.resolve_partitions(test_geometry, overlap=True)
+                 paths = resolver.generate_query_paths(partitions, tf)
+                 duration_ms = (time.perf_counter() - start) * 1000
+
+                 print(f"\nFilter: {tf or 'None'}")
+                 print(f" Partitions: {len(partitions)}")
+                 print(f" Paths generated: {len(paths)}")
+                 print(f" Duration: {duration_ms:.2f} ms")
+
+         finally:
+             server.stop()
+
+
+ if __name__ == "__main__":
+     # Allow running tests directly
+     pytest.main([__file__, "-v", "-m", "e2e"])