earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1207 @@
1
+ """Spatial partition resolver for efficient geospatial catalog querying and analysis.
2
+
3
+ This module provides intelligent spatial partition resolution capabilities that automatically
4
+ determine which catalog partitions intersect with areas of interest. Eliminates the need
5
+ for manual partition management and enables efficient spatial queries across massive
6
+ geospatial datasets organized in EarthCatalog's partition structure.
7
+
8
+ Core Capabilities:
9
+ - Automatic spatial partition detection for any geometry
10
+ - Multi-grid system support (H3, S2, UTM, MGRS, LatLon, ITSLive, GeoJSON)
11
+ - Global partition handling for large geometries spanning many tiles
12
+ - Buffer zone support for boundary analysis and edge effects
13
+ - Temporal filtering integration for time-series spatial analysis
14
+ - Remote catalog support (S3, GCS, Azure, HTTP) via fsspec
15
+
16
+ Key Components:
17
+ SpatialPartitionResolver: Main class for partition resolution and querying
18
+ spatial_resolver(): Primary entry point for creating resolvers from schema files
19
+ resolve_and_query(): One-stop function combining resolution with DuckDB query generation
20
+
21
+ Performance Benefits:
22
+ - Sub-second query planning for catalogs with millions of partitions
23
+ - Eliminates need to scan entire catalogs for spatial queries
24
+ - Optimized spatial indexing for complex geometries
25
+ - Minimal memory usage regardless of catalog size
26
+ - Smart caching for repeated queries with similar geometries
27
+
28
+ Query Optimization:
29
+ The resolver generates optimized file access patterns by:
30
+ - Identifying only relevant spatial partitions
31
+ - Supporting efficient temporal filtering with glob patterns
32
+ - Providing buffer zones for boundary analysis
33
+ - Generating DuckDB-compatible SQL for immediate use
34
+
35
+ Integration Patterns:
36
+ >>> # Simple spatial query
37
+ >>> resolver = spatial_resolver('catalog_schema.json')
38
+ >>> partitions = resolver.resolve_partitions(aoi_geometry)
39
+ >>>
40
+ >>> # Combined spatial-temporal query with DuckDB
41
+ >>> partitions, query = resolve_and_query(
42
+ ... 'schema.json', 'catalog/', aoi_geometry,
43
+ ... temporal_filter='2024-*'
44
+ ... )
45
+ >>> results = duckdb.sql(query).to_df()
46
+
47
+ Use Cases:
48
+ - Interactive geospatial analysis in Jupyter notebooks
49
+ - Large-scale spatial analytics with optimized data access
50
+ - Time-series analysis combining spatial and temporal filtering
51
+ - Boundary analysis with automatic buffer zone handling
52
+ - Multi-resolution spatial analysis across different grid systems
53
+
54
+ Remote Catalog Support:
55
+ Full support for cloud-hosted catalogs via fsspec protocols:
56
+ - S3: s3://bucket/catalog/ (with s3fs)
57
+ - Google Cloud: gs://bucket/catalog/ (with gcsfs)
58
+ - Azure: abfs://container/catalog/ (with adlfs)
59
+ - HTTP: https://example.com/catalog/ (built-in)
60
+ """
61
+
62
+ import json
63
+ import logging
64
+ from pathlib import Path
65
+ from typing import Any
66
+
67
+ try:
68
+ import h3
69
+ except ImportError:
70
+ h3 = None
71
+
72
+ try:
73
+ import s2sphere
74
+ except ImportError:
75
+ s2sphere = None
76
+
77
+ from shapely.geometry import Point, box, shape
78
+ from shapely.geometry.base import BaseGeometry
79
+
80
+ logger = logging.getLogger(__name__)
81
+
82
+
83
+ class SpatialPartitionResolver:
84
+ """High-performance spatial partition resolver for efficient catalog querying and analysis.
85
+
86
+ This class provides the core functionality for determining which spatial partitions
87
+ in an EarthCatalog intersect with areas of interest. Optimized for both simple
88
+ point queries and complex polygon analysis across catalogs with millions of partitions.
89
+
90
+ The resolver automatically handles:
91
+ - Multi-grid system support with consistent interfaces
92
+ - Global partition detection for large geometries
93
+ - Boundary buffering for edge analysis
94
+ - Temporal filtering integration
95
+ - Performance optimization for repeated queries
96
+
97
+ Grid System Compatibility:
98
+ Supports all EarthCatalog grid systems with native optimizations:
99
+ - H3: Hexagonal grid with excellent global properties
100
+ - S2: Spherical geometry optimized for polar regions
101
+ - UTM: High-precision zoned coordinate system
102
+ - MGRS: Military grid reference system
103
+ - LatLon: Simple latitude/longitude grid
104
+ - ITSLive: ITS_LIVE center-based 10-degree grid for ice datasets
105
+ - GeoJSON: Custom polygon-based partitioning
106
+
107
+ Performance Characteristics:
108
+ - O(log n) partition resolution for most grid systems
109
+ - Sub-second response for catalogs with 10M+ partitions
110
+ - Efficient memory usage with lazy partition loading
111
+ - Optimized spatial computations using grid-native methods
112
+ - Smart caching for geometric operations
113
+
114
+ Thread Safety:
115
+ This class is thread-safe for read operations after initialization.
116
+ Multiple threads can safely call resolve_partitions() and query_partitions()
117
+ concurrently on the same resolver instance.
118
+
119
+ Example:
120
+ >>> # Initialize from catalog schema
121
+ >>> resolver = SpatialPartitionResolver(schema_dict, catalog_path)
122
+ >>>
123
+ >>> # Find intersecting partitions
124
+ >>> partitions = resolver.resolve_partitions(aoi_geometry)
125
+ >>>
126
+ >>> # Get file paths with temporal filtering
127
+ >>> files = resolver.query_partitions(
128
+ ... aoi_geometry,
129
+ ... temporal_filter='2024-*',
130
+ ... buffer_cells=1
131
+ ... )
132
+ >>>
133
+ >>> # Check if geometry uses global partition
134
+ >>> uses_global = resolver.should_use_global_partition(large_geometry)
135
+ """
136
+
137
+ def __init__(self, catalog_schema: dict[str, Any], catalog_path: str):
138
+ """Initialize resolver with catalog schema and path.
139
+
140
+ Args:
141
+ catalog_schema: Loaded catalog schema dictionary
142
+ catalog_path: Path to the catalog directory
143
+ """
144
+ self.schema = catalog_schema
145
+ self.catalog_path = Path(catalog_path)
146
+ self.grid_system = catalog_schema["spatial_partitioning"]["grid_system"]
147
+ self.spatial_config = catalog_schema["spatial_partitioning"]
148
+ self.global_config = catalog_schema.get("global_partitioning", {})
149
+ self.global_enabled = bool(self.global_config.get("enabled", False))
150
+ self.global_threshold = int(self.global_config.get("threshold", 1))
151
+
152
+ # Extract mission information from catalog structure
153
+ self.missions = self._extract_available_missions(catalog_schema)
154
+
155
+ def _extract_available_missions(self, schema: dict[str, Any]) -> list[str]:
156
+ """Extract available missions from schema metadata."""
157
+ # Extract from partition statistics or structure examples
158
+ partitioning = schema.get("spatial_partitioning", {})
159
+
160
+ if "example_paths" in partitioning:
161
+ missions = set()
162
+ for path in partitioning["example_paths"]:
163
+ if "/" in path:
164
+ mission = path.split("/")[0]
165
+ missions.add(mission)
166
+ return list(missions)
167
+
168
+ return ["unknown"] # Fallback
169
+
170
+ def resolve_partitions(
171
+ self,
172
+ geometry: dict[str, Any] | BaseGeometry,
173
+ overlap: bool = True,
174
+ buffer_cells: int = 0,
175
+ include_global: bool | None = None,
176
+ ) -> list[str]:
177
+ """Resolve spatial partitions that intersect with the given geometry.
178
+
179
+ Args:
180
+ geometry: GeoJSON geometry dict or Shapely geometry
181
+ overlap: Whether to include overlapping cells (True) or only covering cells (False)
182
+ buffer_cells: Number of additional cells to include around the boundary
183
+ include_global: Whether to include global partition. If None, auto-detect based on threshold
184
+
185
+ Returns:
186
+ List of spatial partition IDs that intersect with the geometry
187
+ """
188
+ if isinstance(geometry, dict):
189
+ shapely_geom = shape(geometry)
190
+ else:
191
+ shapely_geom = geometry
192
+
193
+ # Route to appropriate grid system handler
194
+ if self.grid_system == "h3":
195
+ spatial_partitions = self._resolve_h3_partitions(shapely_geom, overlap, buffer_cells)
196
+ elif self.grid_system == "s2":
197
+ spatial_partitions = self._resolve_s2_partitions(shapely_geom, overlap, buffer_cells)
198
+ elif self.grid_system == "mgrs":
199
+ spatial_partitions = self._resolve_mgrs_partitions(shapely_geom, overlap, buffer_cells)
200
+ elif self.grid_system == "utm":
201
+ spatial_partitions = self._resolve_utm_partitions(shapely_geom, overlap, buffer_cells)
202
+ elif self.grid_system == "latlon":
203
+ spatial_partitions = self._resolve_latlon_partitions(shapely_geom, overlap, buffer_cells)
204
+ elif self.grid_system == "itslive":
205
+ spatial_partitions = self._resolve_itslive_partitions(shapely_geom, overlap, buffer_cells)
206
+ elif self.grid_system == "geojson":
207
+ spatial_partitions = self._resolve_geojson_partitions(shapely_geom, overlap, buffer_cells)
208
+ else:
209
+ raise ValueError(f"Unsupported grid system: {self.grid_system}")
210
+
211
+ # Check if we should include the global partition
212
+ should_include_global = self._should_include_global_partition(spatial_partitions, shapely_geom, include_global)
213
+
214
+ if should_include_global:
215
+ spatial_partitions.append("global")
216
+
217
+ return spatial_partitions
218
+
219
+ def _should_include_global_partition(
220
+ self, spatial_partitions: list[str], geometry: BaseGeometry, include_global: bool | None = None
221
+ ) -> bool:
222
+ """Determine whether to include the global partition in results.
223
+
224
+ Args:
225
+ spatial_partitions: List of resolved spatial partitions
226
+ geometry: The query geometry
227
+ include_global: Explicit override (True/False/None for auto-detect)
228
+
229
+ Returns:
230
+ True if global partition should be included
231
+ """
232
+ # If explicitly specified, use that
233
+ if include_global is not None:
234
+ return include_global and self.global_enabled
235
+
236
+ # If global partitioning is disabled, never include
237
+ if not self.global_enabled:
238
+ return False
239
+
240
+ # Auto-detect based on threshold logic:
241
+ # Include global if the query spans more cells than the threshold,
242
+ # because large geometries are likely stored in global partition
243
+
244
+ # Get the threshold for this grid system and resolution
245
+ threshold = self._get_effective_global_threshold()
246
+
247
+ # Check if query exceeds threshold
248
+ if len(spatial_partitions) > threshold:
249
+ logger.debug(
250
+ f"Query spans {len(spatial_partitions)} partitions > threshold {threshold}, including global partition"
251
+ )
252
+ return True
253
+
254
+ # Additional logic: If geometry is very large, include global even if
255
+ # partition count is low (e.g., geometry spans continents but only touches
256
+ # a few high-level cells)
257
+ geometry_area = geometry.area # In square degrees
258
+ if geometry_area > self._get_large_geometry_threshold():
259
+ logger.debug(f"Geometry area {geometry_area:.2f} sq degrees is very large, including global partition")
260
+ return True
261
+
262
+ return False
263
+
264
+ def _get_effective_global_threshold(self) -> int:
265
+ """Get the effective global partition threshold for current grid system and resolution."""
266
+ grid_resolution = self._get_grid_resolution_key()
267
+
268
+ # Check for custom thresholds first
269
+ custom_thresholds = self.schema.get("custom_thresholds", {})
270
+ if custom_thresholds:
271
+ grid_thresholds = custom_thresholds.get(self.grid_system, {})
272
+ if str(grid_resolution) in grid_thresholds:
273
+ threshold = grid_thresholds[str(grid_resolution)]
274
+ return int(threshold)
275
+
276
+ # Fall back to configured threshold
277
+ return self.global_threshold
278
+
279
+ def _get_grid_resolution_key(self) -> int | float | str:
280
+ """Get the resolution/level key for the current grid system."""
281
+ if self.grid_system == "h3":
282
+ resolution = self.spatial_config.get("resolution", 6)
283
+ return int(resolution) if resolution is not None else 6
284
+ elif self.grid_system == "s2":
285
+ level = self.spatial_config.get("level", self.spatial_config.get("resolution", 13))
286
+ return int(level) if level is not None else 13
287
+ elif self.grid_system == "mgrs":
288
+ precision = self.spatial_config.get("precision", self.spatial_config.get("resolution", 3))
289
+ return int(precision) if precision is not None else 3
290
+ elif self.grid_system == "utm":
291
+ precision = self.spatial_config.get("precision", self.spatial_config.get("resolution", 1))
292
+ return int(precision) if precision is not None else 1
293
+ elif self.grid_system == "latlon":
294
+ cell_size = self.spatial_config.get("cell_size_degrees", self.spatial_config.get("resolution", 1.0))
295
+ return float(cell_size) if cell_size is not None else 1.0
296
+ elif self.grid_system == "itslive":
297
+ return 10 # ITSLive has fixed 10-degree resolution
298
+ else:
299
+ return 1
300
+
301
+ def _get_large_geometry_threshold(self) -> float:
302
+ """Get threshold for considering a geometry 'large' (in square degrees)."""
303
+ # Thresholds based on grid system characteristics
304
+ thresholds = {
305
+ "h3": 10.0, # ~1000km x 1000km at equator
306
+ "s2": 10.0, # Similar to H3
307
+ "mgrs": 5.0, # Military grids are more regional
308
+ "utm": 50.0, # UTM zones are large (6 degrees wide)
309
+ "latlon": 100.0, # Simple grids can handle large areas
310
+ "itslive": 50.0, # ITS_LIVE 10-degree cells are large
311
+ "geojson": 5.0, # Custom grids usually regional
312
+ }
313
+ return thresholds.get(self.grid_system, 10.0)
314
+
315
+ def get_existing_partition_paths(self, partition_ids: list[str], missions: list[str] | None = None) -> list[str]:
316
+ """Filter partition IDs to only those that exist in the catalog.
317
+
318
+ Args:
319
+ partition_ids: List of spatial partition IDs (H3 cells, etc.)
320
+ missions: Optional list of missions to filter. If None, include all.
321
+
322
+ Returns:
323
+ List of partition directory paths that actually exist
324
+ """
325
+ existing_paths = []
326
+
327
+ # File structure: mission/partition=h3/level=X/spatial_id/
328
+ missions_to_check = missions or self.missions
329
+ resolution = self.spatial_config.get("resolution", 2)
330
+ grid_system = self.grid_system
331
+
332
+ for mission in missions_to_check:
333
+ for partition_id in partition_ids:
334
+ partition_path = (
335
+ self.catalog_path / mission / f"partition={grid_system}" / f"level={resolution}" / partition_id
336
+ )
337
+ if partition_path.exists() and partition_path.is_dir():
338
+ existing_paths.append(str(partition_path))
339
+
340
+ return existing_paths
341
+
342
+ def _parse_temporal_filter_to_hive(self, temporal_filter: str) -> str:
343
+ """Convert temporal filter to Hive-style path pattern.
344
+
345
+ Converts user-friendly temporal filters into Hive partition directory patterns
346
+ for efficient directory-level pruning in DuckDB, Athena, and Spark.
347
+
348
+ Args:
349
+ temporal_filter: Temporal filter string in various formats
350
+
351
+ Returns:
352
+ Hive-style path pattern for directory matching
353
+
354
+ Examples:
355
+ "2024" → "year=2024"
356
+ "2024-*" → "year=2024/*"
357
+ "2024-01" → "year=2024/month=01"
358
+ "2024-01-*" → "year=2024/month=01/*"
359
+ "2024-01-15" → "year=2024/month=01/day=15"
360
+ "*" → "*"
361
+ """
362
+ if not temporal_filter or temporal_filter == "*":
363
+ return "*"
364
+
365
+ # Remove trailing wildcard for parsing, we'll handle it separately
366
+ has_wildcard = temporal_filter.endswith("*")
367
+ clean_filter = temporal_filter.rstrip("-*")
368
+
369
+ parts = clean_filter.split("-")
370
+
371
+ if len(parts) >= 1 and parts[0]:
372
+ year = parts[0]
373
+ result = f"year={year}"
374
+
375
+ if len(parts) >= 2 and parts[1]:
376
+ month = parts[1].zfill(2)
377
+ result += f"/month={month}"
378
+
379
+ if len(parts) >= 3 and parts[2]:
380
+ day = parts[2].zfill(2)
381
+ result += f"/day={day}"
382
+
383
+ # Add wildcard back if original had one
384
+ if has_wildcard:
385
+ result += "/*"
386
+
387
+ return result
388
+
389
+ return temporal_filter # Return as-is if can't parse
390
+
391
+ def generate_query_paths(
392
+ self,
393
+ partition_ids: list[str],
394
+ temporal_filter: str | None = None,
395
+ missions: list[str] | None = None,
396
+ output_format: str = "parquet",
397
+ ) -> list[str]:
398
+ """Generate file path patterns for querying specific partitions.
399
+
400
+ Uses Hive-style temporal partitioning for efficient directory-level pruning.
401
+ The temporal_filter is converted to Hive partition directories
402
+ (e.g., "2024-01" becomes "year=2024/month=01").
403
+
404
+ Args:
405
+ partition_ids: List of spatial partition IDs
406
+ temporal_filter: Optional temporal filter (e.g., "2024-*", "2024-01", "2024-01-15")
407
+ missions: Optional list of missions to include
408
+ output_format: File format ("parquet" or "ndjson")
409
+
410
+ Returns:
411
+ List of file path patterns for use with read_parquet() or similar
412
+ """
413
+ existing_paths = self.get_existing_partition_paths(partition_ids, missions)
414
+
415
+ file_extension = f".{output_format}" if output_format != "geoparquet" else ".parquet"
416
+
417
+ if temporal_filter:
418
+ # Convert temporal filter to Hive-style path pattern
419
+ hive_temporal = self._parse_temporal_filter_to_hive(temporal_filter)
420
+ patterns = [f"{path}/{hive_temporal}/items{file_extension}" for path in existing_paths]
421
+ else:
422
+ # Include all temporal partitions - glob for items files in any temporal directory
423
+ patterns = [f"{path}/**/items{file_extension}" for path in existing_paths]
424
+
425
+ return patterns
426
+
427
+ def _resolve_h3_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
428
+ """Resolve H3 partitions for the given geometry."""
429
+ if h3 is None:
430
+ raise ImportError("h3 library required for H3 grid resolution: pip install h3")
431
+
432
+ resolution = self.spatial_config["resolution"]
433
+
434
+ try:
435
+ # Convert Shapely geometry to H3 geometry format
436
+ if isinstance(geometry, Point):
437
+ # For points, get the single cell and optionally buffer
438
+ lat, lon = geometry.y, geometry.x
439
+ center_cell = h3.latlng_to_cell(lat, lon, resolution)
440
+
441
+ if buffer_cells > 0:
442
+ # Get cells within buffer distance
443
+ cells = {center_cell}
444
+ for ring in range(1, buffer_cells + 1):
445
+ cells.update(h3.grid_ring(center_cell, ring))
446
+ return list(cells)
447
+ else:
448
+ return [center_cell]
449
+
450
+ else:
451
+ # For polygons, use the experimental shape-to-cells function
452
+ try:
453
+ # Convert geometry to GeoJSON-like format for H3
454
+ geom_dict = geometry.__geo_interface__
455
+
456
+ # Use h3shape_to_cells_experimental if available
457
+ if hasattr(h3, "h3shape_to_cells_experimental"):
458
+ h3_shape = h3.geo_to_h3shape(geom_dict)
459
+ contain_mode = "overlap" if overlap else "center"
460
+ cells_result = h3.h3shape_to_cells_experimental(h3_shape, resolution, contain_mode)
461
+ initial_cells: list[str] = list(cells_result)
462
+ else:
463
+ # Use standard polygon_to_cells method
464
+ cells_result = h3.polygon_to_cells(geom_dict, resolution)
465
+ initial_cells = list(cells_result)
466
+
467
+ # Add buffer cells if requested
468
+ if buffer_cells > 0:
469
+ buffered_cells: set[str] = set(initial_cells)
470
+ for cell in initial_cells:
471
+ for ring in range(1, buffer_cells + 1):
472
+ try:
473
+ ring_cells = h3.grid_ring(cell, ring)
474
+ buffered_cells.update(ring_cells)
475
+ except (ValueError, TypeError, RuntimeError) as e:
476
+ # Some cells may not have valid rings at boundaries
477
+ logger.debug(f"Failed to get ring for cell: {e}")
478
+ return list(buffered_cells)
479
+
480
+ return initial_cells
481
+
482
+ except (ValueError, TypeError, AttributeError, RuntimeError) as e:
483
+ logger.warning(f"H3 experimental method failed, using fallback: {e}")
484
+ # Fallback to bounding box sampling approach
485
+ return self._h3_fallback_sampling(geometry, resolution, buffer_cells, h3)
486
+
487
+ except (ValueError, TypeError, AttributeError, RuntimeError) as e:
488
+ logger.error(f"Failed to resolve H3 partitions: {e}")
489
+ return []
490
+
491
+ def _h3_fallback_sampling(
492
+ self, geometry: BaseGeometry, resolution: int, buffer_cells: int, h3_module: Any
493
+ ) -> list[str]:
494
+ """Fallback H3 resolution using bounding box sampling."""
495
+ import numpy as np
496
+
497
+ bounds = geometry.bounds
498
+ min_lon, min_lat, max_lon, max_lat = bounds
499
+
500
+ # Generate sample points across the geometry
501
+ n_samples = min(1000, max(50, int(geometry.area * 10000)))
502
+
503
+ cells = set()
504
+ for _ in range(n_samples):
505
+ lat = np.random.uniform(min_lat, max_lat)
506
+ lon = np.random.uniform(min_lon, max_lon)
507
+ point = Point(lon, lat)
508
+
509
+ if geometry.contains(point) or geometry.intersects(point):
510
+ cell = h3_module.latlng_to_cell(lat, lon, resolution)
511
+ cells.add(cell)
512
+
513
+ # Add buffer if requested
514
+ if buffer_cells > 0:
515
+ buffered_cells = set(cells)
516
+ for cell in cells:
517
+ for ring in range(1, buffer_cells + 1):
518
+ try:
519
+ buffered_cells.update(h3_module.grid_ring(cell, ring))
520
+ except (ValueError, TypeError, RuntimeError) as e:
521
+ logger.debug(f"Failed to get H3 ring: {e}")
522
+ cells = buffered_cells
523
+
524
+ return list(cells)
525
+
526
+ def _resolve_s2_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
527
+ """Resolve S2 partitions for the given geometry."""
528
+ if s2sphere is None:
529
+ raise ImportError("s2sphere library required for S2 grid resolution: pip install s2sphere")
530
+
531
+ level = self.spatial_config.get("level", self.spatial_config.get("resolution"))
532
+
533
+ try:
534
+ # Convert geometry to S2 cells
535
+ if isinstance(geometry, Point):
536
+ # Type checking: s2sphere is guaranteed to be available after ImportError check above
537
+ assert s2sphere is not None
538
+ lat_lng = s2sphere.LatLng.from_degrees(geometry.y, geometry.x) # type: ignore
539
+ cell_id = s2sphere.CellId.from_lat_lng(lat_lng) # type: ignore
540
+ cell_id = cell_id.parent(level) # type: ignore
541
+ return [str(cell_id.id())] # type: ignore
542
+ else:
543
+ # For complex geometries, sample points and find covering cells
544
+ bounds = geometry.bounds
545
+ min_lon, min_lat, max_lon, max_lat = bounds
546
+
547
+ cells = set()
548
+ # Sample points across geometry bounding box
549
+ n_samples = min(500, max(25, int(geometry.area * 1000)))
550
+ import numpy as np
551
+
552
+ for _ in range(n_samples):
553
+ lat = np.random.uniform(min_lat, max_lat)
554
+ lon = np.random.uniform(min_lon, max_lon)
555
+ point = Point(lon, lat)
556
+
557
+ if geometry.contains(point) or geometry.intersects(point):
558
+ lat_lng = s2sphere.LatLng.from_degrees(lat, lon) # type: ignore
559
+ cell_id = s2sphere.CellId.from_lat_lng(lat_lng) # type: ignore
560
+ cell_id = cell_id.parent(level) # type: ignore
561
+ cells.add(str(cell_id.id())) # type: ignore
562
+
563
+ return list(cells)
564
+
565
+ except (ValueError, TypeError, AttributeError, RuntimeError) as e:
566
+ logger.error(f"Failed to resolve S2 partitions: {e}")
567
+ return []
568
+
569
+ def _resolve_mgrs_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
570
+ """Resolve MGRS partitions for the given geometry."""
571
+ try:
572
+ import mgrs
573
+ except ImportError as e:
574
+ raise ImportError("mgrs library required for MGRS grid resolution: pip install mgrs") from e
575
+
576
+ precision = self.spatial_config.get("precision", self.spatial_config.get("resolution"))
577
+ m = mgrs.MGRS()
578
+
579
+ try:
580
+ if isinstance(geometry, Point):
581
+ # Single point conversion
582
+ mgrs_code = m.toMGRS(geometry.y, geometry.x, MGRSPrecision=precision)
583
+ return [mgrs_code[: 2 + 2 + precision * 2]] # Format based on precision
584
+ else:
585
+ # Sample points across geometry
586
+ bounds = geometry.bounds
587
+ min_lon, min_lat, max_lon, max_lat = bounds
588
+
589
+ cells = set()
590
+ n_samples = min(200, max(20, int(geometry.area * 100)))
591
+ import numpy as np
592
+
593
+ for _ in range(n_samples):
594
+ lat = np.random.uniform(min_lat, max_lat)
595
+ lon = np.random.uniform(min_lon, max_lon)
596
+ point = Point(lon, lat)
597
+
598
+ if geometry.contains(point) or geometry.intersects(point):
599
+ try:
600
+ mgrs_code = m.toMGRS(lat, lon, MGRSPrecision=precision)
601
+ cells.add(mgrs_code[: 2 + 2 + precision * 2])
602
+ except (ValueError, TypeError, AttributeError) as e:
603
+ logger.debug(f"Invalid MGRS coordinates: {e}")
604
+
605
+ return list(cells)
606
+
607
+ except (ValueError, TypeError, AttributeError, RuntimeError) as e:
608
+ logger.error(f"Failed to resolve MGRS partitions: {e}")
609
+ return []
610
+
611
+ def _resolve_utm_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
612
+ """Resolve UTM partitions for the given geometry."""
613
+ # UTM zones are based on longitude ranges (6 degrees each)
614
+ bounds = geometry.bounds
615
+ min_lon, min_lat, max_lon, max_lat = bounds
616
+
617
+ zones = set()
618
+
619
+ # Calculate UTM zones that intersect the geometry
620
+ start_zone = int((min_lon + 180) // 6) + 1
621
+ end_zone = int((max_lon + 180) // 6) + 1
622
+
623
+ for zone in range(start_zone, end_zone + 1):
624
+ if 1 <= zone <= 60: # Valid UTM zones
625
+ # Determine hemisphere(s) using the same format as UTMGridSystem
626
+ if max_lat >= 0:
627
+ zones.add(f"{zone}N")
628
+ if min_lat < 0:
629
+ zones.add(f"{zone}S")
630
+
631
+ return list(zones)
632
+
633
+ def _resolve_latlon_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
634
+ """Resolve LatLon grid partitions for the given geometry."""
635
+ cell_size = self.spatial_config.get("cell_size_degrees", self.spatial_config.get("resolution"))
636
+
637
+ bounds = geometry.bounds
638
+ min_lon, min_lat, max_lon, max_lat = bounds
639
+
640
+ # Calculate grid cells that intersect the bounding box
641
+ min_cell_lon = int(min_lon // cell_size) * cell_size
642
+ max_cell_lon = int(max_lon // cell_size) * cell_size
643
+ min_cell_lat = int(min_lat // cell_size) * cell_size
644
+ max_cell_lat = int(max_lat // cell_size) * cell_size
645
+
646
+ cells = []
647
+ lat = min_cell_lat
648
+ while lat <= max_cell_lat:
649
+ lon = min_cell_lon
650
+ while lon <= max_cell_lon:
651
+ # Create cell geometry to test intersection
652
+ cell_geom = box(lon, lat, lon + cell_size, lat + cell_size)
653
+ if geometry.intersects(cell_geom):
654
+ cell_id = f"latlon_{lat}_{lon}"
655
+ cells.append(cell_id)
656
+ lon += cell_size
657
+ lat += cell_size
658
+
659
+ return cells
660
+
661
+ def _resolve_itslive_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
662
+ """Resolve ITS_LIVE grid partitions for the given geometry."""
663
+ from earthcatalog.grid_systems import ITSLiveGridSystem
664
+
665
+ # Create ITSLive grid system to handle the resolution
666
+ grid_system = ITSLiveGridSystem()
667
+
668
+ # Convert shapely geometry to GeoJSON format
669
+ if hasattr(geometry, "__geo_interface__"):
670
+ geom_dict = geometry.__geo_interface__
671
+ else:
672
+ # Fallback for basic geometries
673
+ # Fallback: use centroid for complex geometries
674
+ centroid = geometry.centroid
675
+ geom_dict = {"type": "Point", "coordinates": [centroid.x, centroid.y]}
676
+
677
+ # Get ITS_LIVE grid cells
678
+ cells = grid_system.tiles_for_geometry(geom_dict)
679
+
680
+ # Handle buffering if requested
681
+ if buffer_cells > 0:
682
+ # For ITS_LIVE, buffering means adding neighboring 10-degree cells
683
+ buffered_cells = set(cells)
684
+
685
+ for cell in cells:
686
+ # Parse cell name to get center coordinates
687
+ # Format: {N|S}{lat:02d}{E|W}{lon:03d}
688
+ if len(cell) >= 7:
689
+ lat_part = cell[1:3]
690
+ lon_part = cell[4:7]
691
+ lat_sign = 1 if cell[0] == "N" else -1
692
+ lon_sign = 1 if cell[3] == "E" else -1
693
+
694
+ try:
695
+ lat_center = lat_sign * int(lat_part)
696
+ lon_center = lon_sign * int(lon_part)
697
+
698
+ # Add neighboring cells
699
+ for lat_offset in range(-buffer_cells, buffer_cells + 1):
700
+ for lon_offset in range(-buffer_cells, buffer_cells + 1):
701
+ new_lat = lat_center + (lat_offset * 10)
702
+ new_lon = lon_center + (lon_offset * 10)
703
+
704
+ # Keep within valid coordinate ranges
705
+ if -90 <= new_lat <= 90 and -180 <= new_lon < 180:
706
+ neighbor_cell = grid_system._format_cell_name(new_lat, new_lon)
707
+ buffered_cells.add(neighbor_cell)
708
+ except (ValueError, IndexError):
709
+ # Skip invalid cell names
710
+ pass
711
+
712
+ return list(buffered_cells)
713
+
714
+ return cells
715
+
716
+ def _resolve_geojson_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
717
+ """Resolve GeoJSON partitions for the given geometry."""
718
+ # For custom GeoJSON grids, we need to check against each feature
719
+ custom_tiles = self.spatial_config.get("custom_tiles", {})
720
+
721
+ if not custom_tiles or "tile_ids" not in custom_tiles:
722
+ logger.warning("No custom tiles information available for GeoJSON grid")
723
+ return []
724
+
725
+ # If we have the source file, load it and check intersections
726
+ geojson_source = self.spatial_config.get("geojson_source")
727
+ if geojson_source and Path(geojson_source).exists():
728
+ try:
729
+ with open(geojson_source) as f:
730
+ geojson_data = json.load(f)
731
+
732
+ intersecting_tiles = []
733
+ for feature in geojson_data.get("features", []):
734
+ tile_id = feature.get("properties", {}).get("id")
735
+ if tile_id:
736
+ tile_geom = shape(feature["geometry"])
737
+ if geometry.intersects(tile_geom):
738
+ intersecting_tiles.append(tile_id)
739
+
740
+ return intersecting_tiles
741
+
742
+ except (OSError, json.JSONDecodeError, ValueError, TypeError) as e:
743
+ logger.error(f"Failed to load GeoJSON file {geojson_source}: {e}")
744
+
745
+ # Fallback: return all known tile IDs (user will need to filter)
746
+ logger.warning("Cannot perform spatial intersection, returning all tiles")
747
+ tile_ids = custom_tiles.get("tile_ids", [])
748
+ return list(tile_ids) if tile_ids is not None else []
749
+
750
+
751
+ def spatial_resolver(schema: str, catalog_path: str | None = None) -> SpatialPartitionResolver:
752
+ """Create a spatial partition resolver from catalog schema with automatic path detection.
753
+
754
+ This is the primary entry point for creating spatial resolvers in EarthCatalog.
755
+ Automatically handles both local and remote schema files with intelligent path
756
+ resolution for cloud storage systems (S3, GCS, Azure) and HTTP endpoints.
757
+
758
+ The resolver enables efficient spatial queries by determining which catalog
759
+ partitions intersect with areas of interest, eliminating the need to scan
760
+ entire catalogs for geospatial queries.
761
+
762
+ Supported Schema Sources:
763
+ - Local filesystem paths (absolute or relative)
764
+ - S3: s3://, s3a://, s3n:// protocols
765
+ - Google Cloud Storage: gcs://, gs:// protocols
766
+ - Azure: abfs://, az://, azure:// protocols
767
+ - HTTP/HTTPS URLs for publicly accessible schemas
768
+ - Any fsspec-supported filesystem protocol
769
+
770
+ Auto-Path Resolution:
771
+ For local files, catalog_path defaults to the schema file's directory.
772
+ For remote files, catalog_path must be explicitly provided since the
773
+ catalog and schema may be in different locations.
774
+
775
+ Args:
776
+ schema: Path or URL to the catalog schema JSON file. Can be local path
777
+ (e.g., './catalog_schema.json') or any fsspec-supported URL
778
+ (e.g., 's3://bucket/schema.json', 'https://example.com/schema.json').
779
+ catalog_path: Path to the catalog root directory. Required for remote
780
+ schema files, optional for local files (defaults to schema directory).
781
+ Should point to the directory containing partition subdirectories.
782
+
783
+ Returns:
784
+ SpatialPartitionResolver: Configured resolver ready for spatial queries.
785
+ Use resolver.resolve_partitions(geometry) to find intersecting partitions
786
+ or resolver.query_partitions(geometry) for direct file path resolution.
787
+
788
+ Raises:
789
+ ValueError: If catalog_path is not provided when required for remote schema
790
+ files, or if the schema file format is invalid.
791
+ FileNotFoundError: If the schema file cannot be found or accessed.
792
+ ImportError: If required dependencies (fsspec, s3fs, gcsfs, etc.) are not
793
+ installed for remote file access.
794
+ json.JSONDecodeError: If the schema file contains invalid JSON.
795
+
796
+ Example:
797
+ >>> # Local catalog
798
+ >>> resolver = spatial_resolver('./catalog_schema.json')
799
+ >>>
800
+ >>> # S3 catalog with explicit catalog path
801
+ >>> resolver = spatial_resolver(
802
+ ... 's3://bucket/metadata/schema.json',
803
+ ... catalog_path='s3://bucket/catalog/'
804
+ ... )
805
+ >>>
806
+ >>> # Find partitions for area of interest
807
+ >>> partitions = resolver.resolve_partitions(aoi_geometry)
808
+ >>> print(f"AOI intersects {len(partitions)} catalog partitions")
809
+ >>>
810
+ >>> # Get file paths directly
811
+ >>> files = resolver.query_partitions(aoi_geometry)
812
+ >>> print(f"Found {len(files)} data files in AOI")
813
+
814
+ Performance:
815
+ - Schema loading is cached for repeated use
816
+ - Spatial computations are optimized for the underlying grid system
817
+ - Large geometry handling includes global partition detection
818
+ - Minimal memory overhead regardless of catalog size
819
+
820
+ Integration:
821
+ Works seamlessly with catalogs created by STACIngestionPipeline and
822
+ integrates with popular geospatial libraries (geopandas, shapely, etc.).
823
+ """
824
+ import os
825
+
826
+ # Check if schema is a remote URL (S3, GCS, Azure, HTTP, etc.)
827
+ is_remote = any(
828
+ schema.startswith(prefix)
829
+ for prefix in [
830
+ "s3://",
831
+ "s3a://",
832
+ "s3n://", # S3 variants
833
+ "gcs://",
834
+ "gs://", # Google Cloud Storage
835
+ "abfs://",
836
+ "azure://", # Azure Blob Storage
837
+ "http://",
838
+ "https://", # HTTP/HTTPS
839
+ "ftp://",
840
+ "sftp://", # FTP variants
841
+ ]
842
+ )
843
+
844
+ if is_remote:
845
+ if catalog_path is None:
846
+ raise ValueError(
847
+ "catalog_path must be explicitly provided when schema is a remote URL. "
848
+ "Cannot auto-detect catalog directory for remote URLs."
849
+ )
850
+
851
+ try:
852
+ import fsspec
853
+ except ImportError as e:
854
+ raise ImportError("fsspec is required for remote file access. Install with: pip install fsspec[s3]") from e
855
+
856
+ # Use fsspec to read remote schema file
857
+ try:
858
+ # Type checking: fsspec is guaranteed to be available after ImportError check above
859
+ with fsspec.open(schema, "r") as f: # type: ignore
860
+ content = f.read() # type: ignore
861
+ schema_data = json.loads(content)
862
+ except (OSError, json.JSONDecodeError, ValueError, ConnectionError) as e:
863
+ raise FileNotFoundError(f"Failed to read schema from remote location: {schema}") from e
864
+ else:
865
+ # Local file handling
866
+ with open(schema) as f:
867
+ schema_data = json.load(f)
868
+
869
+ # If catalog_path not provided, use the directory containing the schema file
870
+ if catalog_path is None:
871
+ catalog_path = os.path.dirname(os.path.abspath(schema))
872
+
873
+ return SpatialPartitionResolver(schema_data, catalog_path)
874
+
875
+
876
+ def resolve_and_query(
877
+ schema_path: str,
878
+ catalog_path: str,
879
+ aoi_geometry: dict[str, Any] | BaseGeometry,
880
+ temporal_filter: str | None = None,
881
+ overlap: bool = True,
882
+ buffer_cells: int = 0,
883
+ ) -> tuple[list[str], str]:
884
+ """One-stop convenience function for spatial partition resolution with ready-to-use DuckDB queries.
885
+
886
+ This high-level function combines spatial partition resolution with DuckDB query
887
+ generation, providing an immediate solution for geospatial catalog queries.
888
+ Ideal for interactive analysis, Jupyter notebooks, and quick data exploration.
889
+
890
+ The function handles the complete workflow:
891
+ 1. Load catalog schema and initialize spatial resolver
892
+ 2. Resolve spatial partitions that intersect the area of interest
893
+ 3. Generate optimized DuckDB SQL query for the intersecting partitions
894
+ 4. Apply optional temporal filtering and buffering
895
+
896
+ Query Optimization Features:
897
+ - Generates efficient partition-aware SELECT statements
898
+ - Includes spatial filtering for precise boundary handling
899
+ - Supports temporal glob patterns for time-series data
900
+ - Optimizes buffer zones for boundary analysis
901
+ - Handles both point and polygon geometries efficiently
902
+
903
+ Use Cases:
904
+ - Interactive data exploration in Jupyter notebooks
905
+ - Quick spatial queries without manual partition management
906
+ - Temporal analysis with combined spatial/time filtering
907
+ - Boundary analysis with automatic buffer zones
908
+ - Prototype development and data discovery
909
+
910
+ Args:
911
+ schema_path: Path to catalog schema JSON file. Can be local file path
912
+ or any fsspec-supported URL (s3://, gcs://, https://, etc.).
913
+ catalog_path: Path to catalog root directory containing partition
914
+ subdirectories. Use same protocol as schema_path for consistency.
915
+ aoi_geometry: Area of interest as GeoJSON dictionary (e.g., from geojson.load())
916
+ or Shapely geometry object (Point, Polygon, MultiPolygon, etc.).
917
+ temporal_filter: Optional temporal filter using glob patterns.
918
+ Examples: "2024-*" (year 2024), "2024-0[1-6]*" (Jan-Jun 2024),
919
+ "*2024-12-*" (December 2024 across all years).
920
+ overlap: Include geometries that overlap partition boundaries.
921
+ True (default) includes all intersecting partitions, False only
922
+ includes partitions where geometry centroids fall within.
923
+ buffer_cells: Number of additional grid cells to include around the
924
+ geometry boundary. Useful for analysis requiring spatial context
925
+ or edge effects handling. Applied using grid system's native buffering.
926
+
927
+ Returns:
928
+ tuple[list[str], str]: Two-element tuple containing:
929
+ - List of partition identifiers that intersect the geometry
930
+ - Ready-to-execute DuckDB SQL query string for spatial analysis
931
+
932
+ Raises:
933
+ FileNotFoundError: If schema_path or catalog_path cannot be accessed.
934
+ ValueError: If aoi_geometry format is invalid or unsupported.
935
+ ImportError: If required dependencies are missing for remote file access.
936
+
937
+ Example:
938
+ >>> from shapely.geometry import box
939
+ >>>
940
+ >>> # Define area of interest (bounding box for New York City)
941
+ >>> nyc_bbox = box(-74.25, 40.49, -73.70, 40.92)
942
+ >>>
943
+ >>> # Get partitions and query for recent data
944
+ >>> partitions, query = resolve_and_query(
945
+ ... schema_path='s3://catalog/schema.json',
946
+ ... catalog_path='s3://catalog/data/',
947
+ ... aoi_geometry=nyc_bbox,
948
+ ... temporal_filter='2024-*',
949
+ ... buffer_cells=1 # Include surrounding context
950
+ ... )
951
+ >>>
952
+ >>> print(f"Found {len(partitions)} intersecting partitions")
953
+ >>> print("Generated DuckDB query:")
954
+ >>> print(query)
955
+ >>>
956
+ >>> # Execute query with DuckDB
957
+ >>> import duckdb
958
+ >>> results = duckdb.sql(query).to_df()
959
+ >>> print(f"Retrieved {len(results)} STAC items in AOI")
960
+
961
+ Performance:
962
+ - Optimized for interactive use with sub-second response times
963
+ - Efficient even for complex geometries and large catalogs
964
+ - Query generation scales linearly with partition count
965
+ - Minimal memory usage regardless of catalog size
966
+
967
+ Integration:
968
+ Works seamlessly with DuckDB, GeoPandas, and other spatial analysis tools.
969
+ Generated queries are compatible with DuckDB's spatial extension and
970
+ can be modified for advanced spatial operations.
971
+ """
972
+ resolver = spatial_resolver(schema_path, catalog_path)
973
+ partition_ids = resolver.resolve_partitions(aoi_geometry, overlap, buffer_cells)
974
+ query_patterns = resolver.generate_query_paths(partition_ids, temporal_filter)
975
+
976
+ if not query_patterns:
977
+ return partition_ids, ""
978
+
979
+ # Generate DuckDB query
980
+ patterns_str = "', '".join(query_patterns)
981
+ # Convert geometry to GeoJSON format for the query
982
+ if hasattr(aoi_geometry, "__geo_interface__"):
983
+ geom_geojson = aoi_geometry.__geo_interface__ # type: ignore
984
+ else:
985
+ geom_geojson = aoi_geometry
986
+
987
+ query = f"""
988
+ SELECT * FROM read_parquet(['{patterns_str}'])
989
+ WHERE ST_Intersects(geometry, ST_GeomFromGeoJSON('{json.dumps(geom_geojson)}'))
990
+ """.strip()
991
+
992
+ return partition_ids, query
993
+
994
+
995
+ def infer_catalog_schema(catalog_path: str) -> dict[str, Any]:
996
+ """Infer catalog schema from directory structure when no schema file is available.
997
+
998
+ This function analyzes the directory structure of an existing catalog to determine
999
+ its grid system, resolution, and available missions. Useful as a fallback when
1000
+ the catalog schema file is not available.
1001
+
1002
+ The function looks for patterns like:
1003
+ - partition=h3, partition=s2, partition=utm, etc.
1004
+ - level=2, level=5, etc.
1005
+ - Mission directories (landsat8, sentinel2, etc.)
1006
+
1007
+ Args:
1008
+ catalog_path: Path to the catalog root directory (local or remote).
1009
+ Supports S3, GCS, Azure via fsspec.
1010
+
1011
+ Returns:
1012
+ dict: Inferred schema dictionary with structure:
1013
+ {
1014
+ "spatial_partitioning": {
1015
+ "grid_system": str, # e.g., "h3", "s2", "itslive"
1016
+ "resolution": int,
1017
+ "partitioning_scheme": "default"
1018
+ },
1019
+ "global_partitioning": {"enabled": bool, "threshold": int},
1020
+ "inferred": True # Marker indicating this was auto-detected
1021
+ }
1022
+
1023
+ Raises:
1024
+ FileNotFoundError: If catalog_path doesn't exist or can't be accessed.
1025
+ ValueError: If the directory structure doesn't match expected patterns.
1026
+
1027
+ Example:
1028
+ >>> schema = infer_catalog_schema('./my_catalog/')
1029
+ >>> resolver = SpatialPartitionResolver(schema, './my_catalog/')
1030
+
1031
+ >>> # Or use with spatial_resolver_from_path
1032
+ >>> resolver = spatial_resolver_from_path('./my_catalog/')
1033
+ """
1034
+ import os
1035
+ import re
1036
+
1037
+ # Supported grid systems and their directory patterns
1038
+ grid_patterns = {
1039
+ "h3": re.compile(r"partition=h3", re.IGNORECASE),
1040
+ "s2": re.compile(r"partition=s2", re.IGNORECASE),
1041
+ "utm": re.compile(r"partition=utm", re.IGNORECASE),
1042
+ "mgrs": re.compile(r"partition=mgrs", re.IGNORECASE),
1043
+ "latlon": re.compile(r"partition=latlon", re.IGNORECASE),
1044
+ "itslive": re.compile(r"partition=itslive|[NS]\d{2}[EW]\d{3}", re.IGNORECASE),
1045
+ "geojson": re.compile(r"partition=geojson", re.IGNORECASE),
1046
+ }
1047
+
1048
+ resolution_pattern = re.compile(r"level=(\d+)")
1049
+
1050
+ # Check if path is remote
1051
+ remote_prefixes = ("s3://", "s3a://", "gs://", "gcs://", "az://", "abfs://", "azure://", "http://", "https://")
1052
+ is_remote = any(catalog_path.startswith(p) for p in remote_prefixes)
1053
+
1054
+ detected_grid = None
1055
+ detected_resolution = None
1056
+ detected_missions: set[str] = set()
1057
+ has_global = False
1058
+
1059
+ if is_remote:
1060
+ try:
1061
+ import fsspec
1062
+ except ImportError:
1063
+ raise ImportError("fsspec required for remote paths: pip install fsspec") from None
1064
+
1065
+ fs, path = fsspec.core.url_to_fs(catalog_path)
1066
+
1067
+ # List top-level directories
1068
+ try:
1069
+ entries = fs.ls(path, detail=False)
1070
+ except (OSError, ConnectionError, ValueError) as e:
1071
+ raise FileNotFoundError(f"Cannot access catalog path: {catalog_path}") from e
1072
+
1073
+ for entry in entries[:50]: # Limit scan
1074
+ entry_str = str(entry)
1075
+
1076
+ # Check for grid pattern
1077
+ for grid_name, pattern in grid_patterns.items():
1078
+ if pattern.search(entry_str):
1079
+ detected_grid = grid_name
1080
+ break
1081
+
1082
+ # Check for resolution
1083
+ res_match = resolution_pattern.search(entry_str)
1084
+ if res_match:
1085
+ detected_resolution = int(res_match.group(1))
1086
+
1087
+ # Check for global partition
1088
+ if "global" in entry_str.lower():
1089
+ has_global = True
1090
+
1091
+ # Extract mission names (top-level dirs that aren't system dirs)
1092
+ basename = os.path.basename(entry.rstrip("/"))
1093
+ if basename and not basename.startswith(".") and basename not in ["global"]:
1094
+ if "partition=" not in entry_str:
1095
+ detected_missions.add(basename)
1096
+ else:
1097
+ # Local path
1098
+ catalog_dir = Path(catalog_path)
1099
+ if not catalog_dir.exists():
1100
+ raise FileNotFoundError(f"Catalog path does not exist: {catalog_path}")
1101
+
1102
+ # Walk directory to find patterns
1103
+ for entry in catalog_dir.iterdir():
1104
+ if entry.is_dir():
1105
+ entry_str = str(entry)
1106
+ entry_name = entry.name
1107
+
1108
+ # Check for grid pattern
1109
+ for grid_name, pattern in grid_patterns.items():
1110
+ if pattern.search(entry_str):
1111
+ detected_grid = grid_name
1112
+ break
1113
+
1114
+ # Also check subdirectories
1115
+ for subentry in entry.iterdir():
1116
+ subentry_str = str(subentry)
1117
+
1118
+ for grid_name, pattern in grid_patterns.items():
1119
+ if pattern.search(subentry_str):
1120
+ detected_grid = grid_name
1121
+ break
1122
+
1123
+ res_match = resolution_pattern.search(subentry_str)
1124
+ if res_match:
1125
+ detected_resolution = int(res_match.group(1))
1126
+
1127
+ if detected_grid and detected_resolution:
1128
+ break
1129
+
1130
+ # Check for global partition
1131
+ if entry_name.lower() == "global":
1132
+ has_global = True
1133
+
1134
+ # Potential mission directory
1135
+ if not entry_name.startswith("."):
1136
+ detected_missions.add(entry_name)
1137
+
1138
+ # Validate we found something
1139
+ if detected_grid is None:
1140
+ raise ValueError(
1141
+ f"Could not detect grid system from catalog structure at {catalog_path}. "
1142
+ "Expected directories matching 'partition=h3', 'partition=s2', etc."
1143
+ )
1144
+
1145
+ # Default resolution if not found
1146
+ if detected_resolution is None:
1147
+ default_resolutions = {"h3": 2, "s2": 13, "utm": 1, "mgrs": 5, "latlon": 1, "itslive": 10, "geojson": 1}
1148
+ detected_resolution = default_resolutions.get(detected_grid, 1)
1149
+ logger.warning(f"Could not detect resolution, using default: {detected_resolution}")
1150
+
1151
+ # Build schema
1152
+ schema: dict[str, Any] = {
1153
+ "spatial_partitioning": {
1154
+ "grid_system": detected_grid,
1155
+ "resolution": detected_resolution,
1156
+ "partitioning_scheme": "default",
1157
+ },
1158
+ "global_partitioning": {
1159
+ "enabled": has_global,
1160
+ "threshold": 10 if has_global else 100,
1161
+ },
1162
+ "inferred": True, # Marker for auto-detected schema
1163
+ }
1164
+
1165
+ # Add missions if found
1166
+ if detected_missions:
1167
+ schema["spatial_partitioning"]["example_paths"] = [
1168
+ f"{m}/partition={detected_grid}/level={detected_resolution}/" for m in list(detected_missions)[:3]
1169
+ ]
1170
+
1171
+ logger.info(f"Inferred catalog schema: grid={detected_grid}, resolution={detected_resolution}")
1172
+ return schema
1173
+
1174
+
1175
+ def spatial_resolver_from_path(catalog_path: str) -> SpatialPartitionResolver:
1176
+ """Create a spatial resolver by inferring schema from catalog directory structure.
1177
+
1178
+ This is a convenience function for working with catalogs that don't have a
1179
+ schema file. It analyzes the directory structure to determine the grid system
1180
+ and resolution, then creates a resolver.
1181
+
1182
+ Use this when:
1183
+ - You have an existing catalog without a schema.json file
1184
+ - You're exploring a catalog and don't know its structure
1185
+ - The schema file is missing or inaccessible
1186
+
1187
+ Args:
1188
+ catalog_path: Path to the catalog root directory (local or remote).
1189
+
1190
+ Returns:
1191
+ SpatialPartitionResolver: Configured resolver based on inferred schema.
1192
+
1193
+ Raises:
1194
+ ValueError: If the directory structure doesn't match expected patterns.
1195
+ FileNotFoundError: If catalog_path doesn't exist.
1196
+
1197
+ Example:
1198
+ >>> # Create resolver by inferring structure
1199
+ >>> resolver = spatial_resolver_from_path('s3://bucket/my-catalog/')
1200
+ >>> partitions = resolver.resolve_partitions(my_geometry)
1201
+ >>>
1202
+ >>> # Equivalent to:
1203
+ >>> schema = infer_catalog_schema('s3://bucket/my-catalog/')
1204
+ >>> resolver = SpatialPartitionResolver(schema, 's3://bucket/my-catalog/')
1205
+ """
1206
+ schema = infer_catalog_schema(catalog_path)
1207
+ return SpatialPartitionResolver(schema, catalog_path)