earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0

@@ -0,0 +1,1207 @@

"""Spatial partition resolver for efficient geospatial catalog querying and analysis.

This module provides intelligent spatial partition resolution capabilities that automatically
determine which catalog partitions intersect with areas of interest. It eliminates the need
for manual partition management and enables efficient spatial queries across massive
geospatial datasets organized in EarthCatalog's partition structure.

Core Capabilities:
    - Automatic spatial partition detection for any geometry
    - Multi-grid system support (H3, S2, UTM, MGRS, LatLon, ITSLive, GeoJSON)
    - Global partition handling for large geometries spanning many tiles
    - Buffer zone support for boundary analysis and edge effects
    - Temporal filtering integration for time-series spatial analysis
    - Remote catalog support (S3, GCS, Azure, HTTP) via fsspec

Key Components:
    SpatialPartitionResolver: Main class for partition resolution and querying
    spatial_resolver(): Primary entry point for creating resolvers from schema files
    resolve_and_query(): One-stop function combining resolution with DuckDB query generation

Performance Benefits:
    - Sub-second query planning for catalogs with millions of partitions
    - Eliminates the need to scan entire catalogs for spatial queries
    - Optimized spatial indexing for complex geometries
    - Minimal memory usage regardless of catalog size
    - Smart caching for repeated queries with similar geometries

Query Optimization:
    The resolver generates optimized file access patterns by:
    - Identifying only relevant spatial partitions
    - Supporting efficient temporal filtering with glob patterns
    - Providing buffer zones for boundary analysis
    - Generating DuckDB-compatible SQL for immediate use

Integration Patterns:
    >>> # Simple spatial query
    >>> resolver = spatial_resolver('catalog_schema.json')
    >>> partitions = resolver.resolve_partitions(aoi_geometry)
    >>>
    >>> # Combined spatial-temporal query with DuckDB
    >>> partitions, query = resolve_and_query(
    ...     'schema.json', 'catalog/', aoi_geometry,
    ...     temporal_filter='2024-*'
    ... )
    >>> results = duckdb.sql(query).to_df()

Use Cases:
    - Interactive geospatial analysis in Jupyter notebooks
    - Large-scale spatial analytics with optimized data access
    - Time-series analysis combining spatial and temporal filtering
    - Boundary analysis with automatic buffer zone handling
    - Multi-resolution spatial analysis across different grid systems

Remote Catalog Support:
    Full support for cloud-hosted catalogs via fsspec protocols:
    - S3: s3://bucket/catalog/ (with s3fs)
    - Google Cloud: gs://bucket/catalog/ (with gcsfs)
    - Azure: abfs://container/catalog/ (with adlfs)
    - HTTP: https://example.com/catalog/ (built-in)
"""

import json
import logging
from pathlib import Path
from typing import Any

try:
    import h3
except ImportError:
    h3 = None

try:
    import s2sphere
except ImportError:
    s2sphere = None

from shapely.geometry import Point, box, shape
from shapely.geometry.base import BaseGeometry

logger = logging.getLogger(__name__)


class SpatialPartitionResolver:
    """High-performance spatial partition resolver for efficient catalog querying and analysis.

    This class provides the core functionality for determining which spatial partitions
    in an EarthCatalog intersect with areas of interest. Optimized for both simple
    point queries and complex polygon analysis across catalogs with millions of partitions.

    The resolver automatically handles:
    - Multi-grid system support with consistent interfaces
    - Global partition detection for large geometries
    - Boundary buffering for edge analysis
    - Temporal filtering integration
    - Performance optimization for repeated queries

    Grid System Compatibility:
        Supports all EarthCatalog grid systems with native optimizations:
        - H3: Hexagonal grid with excellent global properties
        - S2: Spherical geometry optimized for polar regions
        - UTM: High-precision zoned coordinate system
        - MGRS: Military grid reference system
        - LatLon: Simple latitude/longitude grid
        - ITSLive: ITS_LIVE center-based 10-degree grid for ice datasets
        - GeoJSON: Custom polygon-based partitioning

    Performance Characteristics:
        - O(log n) partition resolution for most grid systems
        - Sub-second response for catalogs with 10M+ partitions
        - Efficient memory usage with lazy partition loading
        - Optimized spatial computations using grid-native methods
        - Smart caching for geometric operations

    Thread Safety:
        This class is thread-safe for read operations after initialization.
        Multiple threads can safely call resolve_partitions() and generate_query_paths()
        concurrently on the same resolver instance.

    Example:
        >>> # Initialize from catalog schema
        >>> resolver = SpatialPartitionResolver(schema_dict, catalog_path)
        >>>
        >>> # Find intersecting partitions, with one extra ring of buffer cells
        >>> partitions = resolver.resolve_partitions(aoi_geometry, buffer_cells=1)
        >>>
        >>> # Get file path patterns with temporal filtering
        >>> patterns = resolver.generate_query_paths(
        ...     partitions,
        ...     temporal_filter='2024-*',
        ... )
        >>>
        >>> # Force the global partition into the result for a large geometry
        >>> partitions = resolver.resolve_partitions(large_geometry, include_global=True)
    """

    def __init__(self, catalog_schema: dict[str, Any], catalog_path: str):
        """Initialize resolver with catalog schema and path.

        Args:
            catalog_schema: Loaded catalog schema dictionary
            catalog_path: Path to the catalog directory
        """
        self.schema = catalog_schema
        self.catalog_path = Path(catalog_path)
        self.grid_system = catalog_schema["spatial_partitioning"]["grid_system"]
        self.spatial_config = catalog_schema["spatial_partitioning"]
        self.global_config = catalog_schema.get("global_partitioning", {})
        self.global_enabled = bool(self.global_config.get("enabled", False))
        self.global_threshold = int(self.global_config.get("threshold", 1))

        # Extract mission information from catalog structure
        self.missions = self._extract_available_missions(catalog_schema)

    def _extract_available_missions(self, schema: dict[str, Any]) -> list[str]:
        """Extract available missions from schema metadata."""
        # Extract from partition statistics or structure examples
        partitioning = schema.get("spatial_partitioning", {})

        if "example_paths" in partitioning:
            missions = set()
            for path in partitioning["example_paths"]:
                if "/" in path:
                    mission = path.split("/")[0]
                    missions.add(mission)
            return list(missions)

        return ["unknown"]  # Fallback

    def resolve_partitions(
        self,
        geometry: dict[str, Any] | BaseGeometry,
        overlap: bool = True,
        buffer_cells: int = 0,
        include_global: bool | None = None,
    ) -> list[str]:
        """Resolve spatial partitions that intersect with the given geometry.

        Args:
            geometry: GeoJSON geometry dict or Shapely geometry
            overlap: Whether to include overlapping cells (True) or only covering cells (False)
            buffer_cells: Number of additional cells to include around the boundary
            include_global: Whether to include global partition. If None, auto-detect based on threshold

        Returns:
            List of spatial partition IDs that intersect with the geometry
        """
        if isinstance(geometry, dict):
            shapely_geom = shape(geometry)
        else:
            shapely_geom = geometry

        # Route to appropriate grid system handler
        if self.grid_system == "h3":
            spatial_partitions = self._resolve_h3_partitions(shapely_geom, overlap, buffer_cells)
        elif self.grid_system == "s2":
            spatial_partitions = self._resolve_s2_partitions(shapely_geom, overlap, buffer_cells)
        elif self.grid_system == "mgrs":
            spatial_partitions = self._resolve_mgrs_partitions(shapely_geom, overlap, buffer_cells)
        elif self.grid_system == "utm":
            spatial_partitions = self._resolve_utm_partitions(shapely_geom, overlap, buffer_cells)
        elif self.grid_system == "latlon":
            spatial_partitions = self._resolve_latlon_partitions(shapely_geom, overlap, buffer_cells)
        elif self.grid_system == "itslive":
            spatial_partitions = self._resolve_itslive_partitions(shapely_geom, overlap, buffer_cells)
        elif self.grid_system == "geojson":
            spatial_partitions = self._resolve_geojson_partitions(shapely_geom, overlap, buffer_cells)
        else:
            raise ValueError(f"Unsupported grid system: {self.grid_system}")

        # Check if we should include the global partition
        should_include_global = self._should_include_global_partition(spatial_partitions, shapely_geom, include_global)

        if should_include_global:
            spatial_partitions.append("global")

        return spatial_partitions
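
    # Usage sketch (comment only, not executed on import; the schema dict and AOI
    # below are hypothetical): resolving partitions on an H3-partitioned catalog.
    #
    #   from shapely.geometry import box
    #   resolver = SpatialPartitionResolver(
    #       {"spatial_partitioning": {"grid_system": "h3", "resolution": 2}},
    #       "./catalog",
    #   )
    #   cells = resolver.resolve_partitions(box(-105.3, 39.9, -105.1, 40.1))
    #   # -> a short list of resolution-2 H3 cell IDs covering the box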

    def _should_include_global_partition(
        self, spatial_partitions: list[str], geometry: BaseGeometry, include_global: bool | None = None
    ) -> bool:
        """Determine whether to include the global partition in results.

        Args:
            spatial_partitions: List of resolved spatial partitions
            geometry: The query geometry
            include_global: Explicit override (True/False/None for auto-detect)

        Returns:
            True if global partition should be included
        """
        # If explicitly specified, use that
        if include_global is not None:
            return include_global and self.global_enabled

        # If global partitioning is disabled, never include
        if not self.global_enabled:
            return False

        # Auto-detect based on threshold logic:
        # Include global if the query spans more cells than the threshold,
        # because large geometries are likely stored in global partition

        # Get the threshold for this grid system and resolution
        threshold = self._get_effective_global_threshold()

        # Check if query exceeds threshold
        if len(spatial_partitions) > threshold:
            logger.debug(
                f"Query spans {len(spatial_partitions)} partitions > threshold {threshold}, including global partition"
            )
            return True

        # Additional logic: If geometry is very large, include global even if
        # partition count is low (e.g., geometry spans continents but only touches
        # a few high-level cells)
        geometry_area = geometry.area  # In square degrees
        if geometry_area > self._get_large_geometry_threshold():
            logger.debug(f"Geometry area {geometry_area:.2f} sq degrees is very large, including global partition")
            return True

        return False
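
    # Worked example of the auto-detect path (hypothetical numbers): with an
    # effective threshold of 8, a query resolving to 12 cells exceeds the
    # threshold and pulls in "global"; a 3-cell query over a 4 sq-degree AOI on
    # an H3 catalog stays under both the cell threshold and the 10.0 sq-degree
    # area threshold, so "global" is skipped.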

    def _get_effective_global_threshold(self) -> int:
        """Get the effective global partition threshold for current grid system and resolution."""
        grid_resolution = self._get_grid_resolution_key()

        # Check for custom thresholds first
        custom_thresholds = self.schema.get("custom_thresholds", {})
        if custom_thresholds:
            grid_thresholds = custom_thresholds.get(self.grid_system, {})
            if str(grid_resolution) in grid_thresholds:
                threshold = grid_thresholds[str(grid_resolution)]
                return int(threshold)

        # Fall back to configured threshold
        return self.global_threshold

    def _get_grid_resolution_key(self) -> int | float | str:
        """Get the resolution/level key for the current grid system."""
        if self.grid_system == "h3":
            resolution = self.spatial_config.get("resolution", 6)
            return int(resolution) if resolution is not None else 6
        elif self.grid_system == "s2":
            level = self.spatial_config.get("level", self.spatial_config.get("resolution", 13))
            return int(level) if level is not None else 13
        elif self.grid_system == "mgrs":
            precision = self.spatial_config.get("precision", self.spatial_config.get("resolution", 3))
            return int(precision) if precision is not None else 3
        elif self.grid_system == "utm":
            precision = self.spatial_config.get("precision", self.spatial_config.get("resolution", 1))
            return int(precision) if precision is not None else 1
        elif self.grid_system == "latlon":
            cell_size = self.spatial_config.get("cell_size_degrees", self.spatial_config.get("resolution", 1.0))
            return float(cell_size) if cell_size is not None else 1.0
        elif self.grid_system == "itslive":
            return 10  # ITSLive has fixed 10-degree resolution
        else:
            return 1

    def _get_large_geometry_threshold(self) -> float:
        """Get threshold for considering a geometry 'large' (in square degrees)."""
        # Thresholds based on grid system characteristics
        thresholds = {
            "h3": 10.0,  # ~1000km x 1000km at equator
            "s2": 10.0,  # Similar to H3
            "mgrs": 5.0,  # Military grids are more regional
            "utm": 50.0,  # UTM zones are large (6 degrees wide)
            "latlon": 100.0,  # Simple grids can handle large areas
            "itslive": 50.0,  # ITS_LIVE 10-degree cells are large
            "geojson": 5.0,  # Custom grids usually regional
        }
        return thresholds.get(self.grid_system, 10.0)

    def get_existing_partition_paths(self, partition_ids: list[str], missions: list[str] | None = None) -> list[str]:
        """Filter partition IDs to only those that exist in the catalog.

        Args:
            partition_ids: List of spatial partition IDs (H3 cells, etc.)
            missions: Optional list of missions to filter. If None, include all.

        Returns:
            List of partition directory paths that actually exist
        """
        existing_paths = []

        # File structure: mission/partition=h3/level=X/spatial_id/
        missions_to_check = missions or self.missions
        resolution = self.spatial_config.get("resolution", 2)
        grid_system = self.grid_system

        for mission in missions_to_check:
            for partition_id in partition_ids:
                partition_path = (
                    self.catalog_path / mission / f"partition={grid_system}" / f"level={resolution}" / partition_id
                )
                if partition_path.exists() and partition_path.is_dir():
                    existing_paths.append(str(partition_path))

        return existing_paths
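
    # Layout sketch (hypothetical mission and cell names): with
    # missions=["sentinel2"], grid_system="h3", and resolution=2, a partition ID
    # "8226e7fffffffff" is probed at
    #   <catalog_path>/sentinel2/partition=h3/level=2/8226e7fffffffff/
    # and kept only if that directory exists.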

    def _parse_temporal_filter_to_hive(self, temporal_filter: str) -> str:
        """Convert temporal filter to Hive-style path pattern.

        Converts user-friendly temporal filters into Hive partition directory patterns
        for efficient directory-level pruning in DuckDB, Athena, and Spark.

        Args:
            temporal_filter: Temporal filter string in various formats

        Returns:
            Hive-style path pattern for directory matching

        Examples:
            "2024" → "year=2024"
            "2024-*" → "year=2024/*"
            "2024-01" → "year=2024/month=01"
            "2024-01-*" → "year=2024/month=01/*"
            "2024-01-15" → "year=2024/month=01/day=15"
            "*" → "*"
        """
        if not temporal_filter or temporal_filter == "*":
            return "*"

        # Remove trailing wildcard for parsing, we'll handle it separately
        has_wildcard = temporal_filter.endswith("*")
        clean_filter = temporal_filter.rstrip("-*")

        parts = clean_filter.split("-")

        if len(parts) >= 1 and parts[0]:
            year = parts[0]
            result = f"year={year}"

            if len(parts) >= 2 and parts[1]:
                month = parts[1].zfill(2)
                result += f"/month={month}"

            if len(parts) >= 3 and parts[2]:
                day = parts[2].zfill(2)
                result += f"/day={day}"

            # Add wildcard back if original had one
            if has_wildcard:
                result += "/*"

            return result

        return temporal_filter  # Return as-is if can't parse
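
    # Padding check (worked example): "2024-1-5" splits into ["2024", "1", "5"],
    # and zfill(2) normalizes the month and day, yielding
    # "year=2024/month=01/day=05" -- matching zero-padded Hive directories.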

    def generate_query_paths(
        self,
        partition_ids: list[str],
        temporal_filter: str | None = None,
        missions: list[str] | None = None,
        output_format: str = "parquet",
    ) -> list[str]:
        """Generate file path patterns for querying specific partitions.

        Uses Hive-style temporal partitioning for efficient directory-level pruning.
        The temporal_filter is converted to Hive partition directories
        (e.g., "2024-01" becomes "year=2024/month=01").

        Args:
            partition_ids: List of spatial partition IDs
            temporal_filter: Optional temporal filter (e.g., "2024-*", "2024-01", "2024-01-15")
            missions: Optional list of missions to include
            output_format: File format ("parquet" or "ndjson")

        Returns:
            List of file path patterns for use with read_parquet() or similar
        """
        existing_paths = self.get_existing_partition_paths(partition_ids, missions)

        file_extension = f".{output_format}" if output_format != "geoparquet" else ".parquet"

        if temporal_filter:
            # Convert temporal filter to Hive-style path pattern
            hive_temporal = self._parse_temporal_filter_to_hive(temporal_filter)
            patterns = [f"{path}/{hive_temporal}/items{file_extension}" for path in existing_paths]
        else:
            # Include all temporal partitions - glob for items files in any temporal directory
            patterns = [f"{path}/**/items{file_extension}" for path in existing_paths]

        return patterns
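
    # Pattern sketch (hypothetical paths): for one existing partition directory
    # <catalog_path>/sentinel2/partition=h3/level=2/8226e7fffffffff and
    # temporal_filter="2024-01", the generated pattern is
    #   .../8226e7fffffffff/year=2024/month=01/items.parquet
    # Without a filter it falls back to the recursive glob .../**/items.parquet.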

    def _resolve_h3_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve H3 partitions for the given geometry."""
        if h3 is None:
            raise ImportError("h3 library required for H3 grid resolution: pip install h3")

        resolution = self.spatial_config["resolution"]

        try:
            # Convert Shapely geometry to H3 geometry format
            if isinstance(geometry, Point):
                # For points, get the single cell and optionally buffer
                lat, lon = geometry.y, geometry.x
                center_cell = h3.latlng_to_cell(lat, lon, resolution)

                if buffer_cells > 0:
                    # Get cells within buffer distance
                    cells = {center_cell}
                    for ring in range(1, buffer_cells + 1):
                        cells.update(h3.grid_ring(center_cell, ring))
                    return list(cells)
                else:
                    return [center_cell]

            else:
                # For polygons, use the experimental shape-to-cells function
                try:
                    # Convert geometry to GeoJSON-like format for H3
                    geom_dict = geometry.__geo_interface__

                    # Use h3shape_to_cells_experimental if available
                    if hasattr(h3, "h3shape_to_cells_experimental"):
                        h3_shape = h3.geo_to_h3shape(geom_dict)
                        contain_mode = "overlap" if overlap else "center"
                        cells_result = h3.h3shape_to_cells_experimental(h3_shape, resolution, contain_mode)
                        initial_cells: list[str] = list(cells_result)
                    else:
                        # Use the standard polygon_to_cells method; it expects an
                        # H3Shape, so convert the GeoJSON dict first
                        h3_shape = h3.geo_to_h3shape(geom_dict)
                        cells_result = h3.polygon_to_cells(h3_shape, resolution)
                        initial_cells = list(cells_result)

                    # Add buffer cells if requested
                    if buffer_cells > 0:
                        buffered_cells: set[str] = set(initial_cells)
                        for cell in initial_cells:
                            for ring in range(1, buffer_cells + 1):
                                try:
                                    ring_cells = h3.grid_ring(cell, ring)
                                    buffered_cells.update(ring_cells)
                                except (ValueError, TypeError, RuntimeError) as e:
                                    # Some cells may not have valid rings at boundaries
                                    logger.debug(f"Failed to get ring for cell: {e}")
                        return list(buffered_cells)

                    return initial_cells

                except (ValueError, TypeError, AttributeError, RuntimeError) as e:
                    logger.warning(f"H3 experimental method failed, using fallback: {e}")
                    # Fallback to bounding box sampling approach
                    return self._h3_fallback_sampling(geometry, resolution, buffer_cells, h3)

        except (ValueError, TypeError, AttributeError, RuntimeError) as e:
            logger.error(f"Failed to resolve H3 partitions: {e}")
            return []

    def _h3_fallback_sampling(
        self, geometry: BaseGeometry, resolution: int, buffer_cells: int, h3_module: Any
    ) -> list[str]:
        """Fallback H3 resolution using bounding box sampling."""
        import numpy as np

        bounds = geometry.bounds
        min_lon, min_lat, max_lon, max_lat = bounds

        # Generate sample points across the geometry
        n_samples = min(1000, max(50, int(geometry.area * 10000)))

        cells = set()
        for _ in range(n_samples):
            lat = np.random.uniform(min_lat, max_lat)
            lon = np.random.uniform(min_lon, max_lon)
            point = Point(lon, lat)

            if geometry.contains(point) or geometry.intersects(point):
                cell = h3_module.latlng_to_cell(lat, lon, resolution)
                cells.add(cell)

        # Add buffer if requested
        if buffer_cells > 0:
            buffered_cells = set(cells)
            for cell in cells:
                for ring in range(1, buffer_cells + 1):
                    try:
                        buffered_cells.update(h3_module.grid_ring(cell, ring))
                    except (ValueError, TypeError, RuntimeError) as e:
                        logger.debug(f"Failed to get H3 ring: {e}")
            cells = buffered_cells

        return list(cells)

    def _resolve_s2_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve S2 partitions for the given geometry."""
        if s2sphere is None:
            raise ImportError("s2sphere library required for S2 grid resolution: pip install s2sphere")

        level = self.spatial_config.get("level", self.spatial_config.get("resolution"))

        try:
            # Convert geometry to S2 cells
            if isinstance(geometry, Point):
                # Type checking: s2sphere is guaranteed to be available after ImportError check above
                assert s2sphere is not None
                lat_lng = s2sphere.LatLng.from_degrees(geometry.y, geometry.x)  # type: ignore
                cell_id = s2sphere.CellId.from_lat_lng(lat_lng)  # type: ignore
                cell_id = cell_id.parent(level)  # type: ignore
                return [str(cell_id.id())]  # type: ignore
            else:
                # For complex geometries, sample points and find covering cells
                bounds = geometry.bounds
                min_lon, min_lat, max_lon, max_lat = bounds

                cells = set()
                # Sample points across geometry bounding box
                n_samples = min(500, max(25, int(geometry.area * 1000)))
                import numpy as np

                for _ in range(n_samples):
                    lat = np.random.uniform(min_lat, max_lat)
                    lon = np.random.uniform(min_lon, max_lon)
                    point = Point(lon, lat)

                    if geometry.contains(point) or geometry.intersects(point):
                        lat_lng = s2sphere.LatLng.from_degrees(lat, lon)  # type: ignore
                        cell_id = s2sphere.CellId.from_lat_lng(lat_lng)  # type: ignore
                        cell_id = cell_id.parent(level)  # type: ignore
                        cells.add(str(cell_id.id()))  # type: ignore

                return list(cells)

        except (ValueError, TypeError, AttributeError, RuntimeError) as e:
            logger.error(f"Failed to resolve S2 partitions: {e}")
            return []

    def _resolve_mgrs_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve MGRS partitions for the given geometry."""
        try:
            import mgrs
        except ImportError as e:
            raise ImportError("mgrs library required for MGRS grid resolution: pip install mgrs") from e

        precision = self.spatial_config.get("precision", self.spatial_config.get("resolution"))
        m = mgrs.MGRS()

        try:
            if isinstance(geometry, Point):
                # Single point conversion
                mgrs_code = m.toMGRS(geometry.y, geometry.x, MGRSPrecision=precision)
                return [mgrs_code[: 2 + 2 + precision * 2]]  # Format based on precision
            else:
                # Sample points across geometry
                bounds = geometry.bounds
                min_lon, min_lat, max_lon, max_lat = bounds

                cells = set()
                n_samples = min(200, max(20, int(geometry.area * 100)))
                import numpy as np

                for _ in range(n_samples):
                    lat = np.random.uniform(min_lat, max_lat)
                    lon = np.random.uniform(min_lon, max_lon)
                    point = Point(lon, lat)

                    if geometry.contains(point) or geometry.intersects(point):
                        try:
                            mgrs_code = m.toMGRS(lat, lon, MGRSPrecision=precision)
                            cells.add(mgrs_code[: 2 + 2 + precision * 2])
                        except (ValueError, TypeError, AttributeError) as e:
                            logger.debug(f"Invalid MGRS coordinates: {e}")

                return list(cells)

        except (ValueError, TypeError, AttributeError, RuntimeError) as e:
            logger.error(f"Failed to resolve MGRS partitions: {e}")
            return []

    def _resolve_utm_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve UTM partitions for the given geometry."""
        # UTM zones are based on longitude ranges (6 degrees each)
        bounds = geometry.bounds
        min_lon, min_lat, max_lon, max_lat = bounds

        zones = set()

        # Calculate UTM zones that intersect the geometry
        start_zone = int((min_lon + 180) // 6) + 1
        end_zone = int((max_lon + 180) // 6) + 1

        for zone in range(start_zone, end_zone + 1):
            if 1 <= zone <= 60:  # Valid UTM zones
                # Determine hemisphere(s) using the same format as UTMGridSystem
                if max_lat >= 0:
                    zones.add(f"{zone}N")
                if min_lat < 0:
                    zones.add(f"{zone}S")

        return list(zones)
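
    # Zone-math check (worked example): for an AOI spanning lon -105.3 to -104.9,
    # int((-105.3 + 180) // 6) + 1 = int(74.7 // 6) + 1 = 12 + 1 = 13, and the
    # eastern edge yields the same zone, so a northern-hemisphere AOI resolves
    # to ["13N"] (the UTM zone covering Colorado).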

    def _resolve_latlon_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve LatLon grid partitions for the given geometry."""
        cell_size = self.spatial_config.get("cell_size_degrees", self.spatial_config.get("resolution"))

        bounds = geometry.bounds
        min_lon, min_lat, max_lon, max_lat = bounds

        # Calculate grid cells that intersect the bounding box
        min_cell_lon = int(min_lon // cell_size) * cell_size
        max_cell_lon = int(max_lon // cell_size) * cell_size
        min_cell_lat = int(min_lat // cell_size) * cell_size
        max_cell_lat = int(max_lat // cell_size) * cell_size

        cells = []
        lat = min_cell_lat
        while lat <= max_cell_lat:
            lon = min_cell_lon
            while lon <= max_cell_lon:
                # Create cell geometry to test intersection
                cell_geom = box(lon, lat, lon + cell_size, lat + cell_size)
                if geometry.intersects(cell_geom):
                    cell_id = f"latlon_{lat}_{lon}"
                    cells.append(cell_id)
                lon += cell_size
            lat += cell_size

        return cells
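
    # Snapping check (worked example, cell_size=1.0): an AOI corner at
    # lon=-105.3, lat=39.9 snaps to int(-105.3 // 1.0) * 1.0 = -106.0 and
    # int(39.9 // 1.0) * 1.0 = 39.0, so the candidate cell ID is
    # "latlon_39.0_-106.0"; floor division keeps negative coordinates anchored
    # to the south-west corner of each cell.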

    def _resolve_itslive_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve ITS_LIVE grid partitions for the given geometry."""
        from earthcatalog.grid_systems import ITSLiveGridSystem

        # Create ITSLive grid system to handle the resolution
        grid_system = ITSLiveGridSystem()

        # Convert shapely geometry to GeoJSON format
        if hasattr(geometry, "__geo_interface__"):
            geom_dict = geometry.__geo_interface__
        else:
            # Fallback: approximate geometries lacking __geo_interface__ by their centroid
            centroid = geometry.centroid
            geom_dict = {"type": "Point", "coordinates": [centroid.x, centroid.y]}

        # Get ITS_LIVE grid cells
        cells = grid_system.tiles_for_geometry(geom_dict)

        # Handle buffering if requested
        if buffer_cells > 0:
            # For ITS_LIVE, buffering means adding neighboring 10-degree cells
            buffered_cells = set(cells)

            for cell in cells:
                # Parse cell name to get center coordinates
                # Format: {N|S}{lat:02d}{E|W}{lon:03d}
                if len(cell) >= 7:
                    lat_part = cell[1:3]
                    lon_part = cell[4:7]
                    lat_sign = 1 if cell[0] == "N" else -1
                    lon_sign = 1 if cell[3] == "E" else -1

                    try:
                        lat_center = lat_sign * int(lat_part)
                        lon_center = lon_sign * int(lon_part)

                        # Add neighboring cells
                        for lat_offset in range(-buffer_cells, buffer_cells + 1):
                            for lon_offset in range(-buffer_cells, buffer_cells + 1):
                                new_lat = lat_center + (lat_offset * 10)
                                new_lon = lon_center + (lon_offset * 10)

                                # Keep within valid coordinate ranges
                                if -90 <= new_lat <= 90 and -180 <= new_lon < 180:
                                    neighbor_cell = grid_system._format_cell_name(new_lat, new_lon)
                                    buffered_cells.add(neighbor_cell)
                    except (ValueError, IndexError):
                        # Skip invalid cell names
                        pass

            return list(buffered_cells)

        return cells
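
    # Name-parsing check (worked example): for a cell named "N60W140",
    # cell[0]="N" and cell[3]="W" give the signs, cell[1:3]="60" and
    # cell[4:7]="140" give the magnitudes, so the center is (lat=60, lon=-140);
    # buffer_cells=1 then scans the 3x3 block of 10-degree neighbors around it.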

    def _resolve_geojson_partitions(self, geometry: BaseGeometry, overlap: bool, buffer_cells: int) -> list[str]:
        """Resolve GeoJSON partitions for the given geometry."""
        # For custom GeoJSON grids, we need to check against each feature
        custom_tiles = self.spatial_config.get("custom_tiles", {})

        if not custom_tiles or "tile_ids" not in custom_tiles:
            logger.warning("No custom tiles information available for GeoJSON grid")
            return []

        # If we have the source file, load it and check intersections
        geojson_source = self.spatial_config.get("geojson_source")
        if geojson_source and Path(geojson_source).exists():
            try:
                with open(geojson_source) as f:
                    geojson_data = json.load(f)

                intersecting_tiles = []
                for feature in geojson_data.get("features", []):
                    tile_id = feature.get("properties", {}).get("id")
                    if tile_id:
                        tile_geom = shape(feature["geometry"])
                        if geometry.intersects(tile_geom):
                            intersecting_tiles.append(tile_id)

                return intersecting_tiles

            except (OSError, json.JSONDecodeError, ValueError, TypeError) as e:
                logger.error(f"Failed to load GeoJSON file {geojson_source}: {e}")

        # Fallback: return all known tile IDs (user will need to filter)
        logger.warning("Cannot perform spatial intersection, returning all tiles")
        tile_ids = custom_tiles.get("tile_ids", [])
        return list(tile_ids) if tile_ids is not None else []


def spatial_resolver(schema: str, catalog_path: str | None = None) -> SpatialPartitionResolver:
    """Create a spatial partition resolver from catalog schema with automatic path detection.

    This is the primary entry point for creating spatial resolvers in EarthCatalog.
    Automatically handles both local and remote schema files with intelligent path
    resolution for cloud storage systems (S3, GCS, Azure) and HTTP endpoints.

    The resolver enables efficient spatial queries by determining which catalog
    partitions intersect with areas of interest, eliminating the need to scan
    entire catalogs for geospatial queries.

    Supported Schema Sources:
    - Local filesystem paths (absolute or relative)
    - S3: s3://, s3a://, s3n:// protocols
    - Google Cloud Storage: gcs://, gs:// protocols
    - Azure: abfs://, az://, azure:// protocols
    - HTTP/HTTPS URLs for publicly accessible schemas
    - Any fsspec-supported filesystem protocol

    Auto-Path Resolution:
        For local files, catalog_path defaults to the schema file's directory.
        For remote files, catalog_path must be explicitly provided since the
        catalog and schema may be in different locations.

    Args:
        schema: Path or URL to the catalog schema JSON file. Can be a local path
            (e.g., './catalog_schema.json') or any fsspec-supported URL
            (e.g., 's3://bucket/schema.json', 'https://example.com/schema.json').
        catalog_path: Path to the catalog root directory. Required for remote
            schema files, optional for local files (defaults to schema directory).
            Should point to the directory containing partition subdirectories.

    Returns:
        SpatialPartitionResolver: Configured resolver ready for spatial queries.
        Use resolver.resolve_partitions(geometry) to find intersecting partitions
        or resolver.generate_query_paths(partitions) for file path patterns.

    Raises:
        ValueError: If catalog_path is not provided when required for remote schema
            files, or if the schema file format is invalid.
        FileNotFoundError: If the schema file cannot be found or accessed.
        ImportError: If required dependencies (fsspec, s3fs, gcsfs, etc.) are not
            installed for remote file access.
        json.JSONDecodeError: If the schema file contains invalid JSON.

    Example:
        >>> # Local catalog
        >>> resolver = spatial_resolver('./catalog_schema.json')
        >>>
        >>> # S3 catalog with explicit catalog path
        >>> resolver = spatial_resolver(
        ...     's3://bucket/metadata/schema.json',
        ...     catalog_path='s3://bucket/catalog/'
        ... )
        >>>
        >>> # Find partitions for area of interest
        >>> partitions = resolver.resolve_partitions(aoi_geometry)
        >>> print(f"AOI intersects {len(partitions)} catalog partitions")
        >>>
        >>> # Get file path patterns directly
        >>> patterns = resolver.generate_query_paths(partitions)
        >>> print(f"Generated {len(patterns)} path patterns for the AOI")

    Performance:
        - Schema loading is cached for repeated use
        - Spatial computations are optimized for the underlying grid system
        - Large geometry handling includes global partition detection
        - Minimal memory overhead regardless of catalog size

    Integration:
        Works seamlessly with catalogs created by STACIngestionPipeline and
        integrates with popular geospatial libraries (geopandas, shapely, etc.).
    """
    import os

    # Check if schema is a remote URL (S3, GCS, Azure, HTTP, etc.)
    is_remote = any(
        schema.startswith(prefix)
        for prefix in [
            "s3://",
            "s3a://",
            "s3n://",  # S3 variants
            "gcs://",
            "gs://",  # Google Cloud Storage
            "abfs://",
            "az://",
            "azure://",  # Azure Blob Storage
            "http://",
            "https://",  # HTTP/HTTPS
            "ftp://",
            "sftp://",  # FTP variants
        ]
    )

    if is_remote:
        if catalog_path is None:
            raise ValueError(
                "catalog_path must be explicitly provided when schema is a remote URL. "
                "Cannot auto-detect catalog directory for remote URLs."
            )

        try:
            import fsspec
        except ImportError as e:
            raise ImportError("fsspec is required for remote file access. Install with: pip install fsspec[s3]") from e

        # Use fsspec to read remote schema file
        try:
            # Type checking: fsspec is guaranteed to be available after ImportError check above
            with fsspec.open(schema, "r") as f:  # type: ignore
                content = f.read()  # type: ignore
                schema_data = json.loads(content)
        except (OSError, json.JSONDecodeError, ValueError, ConnectionError) as e:
            raise FileNotFoundError(f"Failed to read schema from remote location: {schema}") from e
    else:
        # Local file handling
        with open(schema) as f:
            schema_data = json.load(f)

        # If catalog_path not provided, use the directory containing the schema file
        if catalog_path is None:
            catalog_path = os.path.dirname(os.path.abspath(schema))

    return SpatialPartitionResolver(schema_data, catalog_path)


def resolve_and_query(
    schema_path: str,
    catalog_path: str,
    aoi_geometry: dict[str, Any] | BaseGeometry,
    temporal_filter: str | None = None,
    overlap: bool = True,
    buffer_cells: int = 0,
) -> tuple[list[str], str]:
    """One-stop convenience function for spatial partition resolution with ready-to-use DuckDB queries.

    This high-level function combines spatial partition resolution with DuckDB query
    generation, providing an immediate solution for geospatial catalog queries.
    Ideal for interactive analysis, Jupyter notebooks, and quick data exploration.

    The function handles the complete workflow:
    1. Load catalog schema and initialize spatial resolver
    2. Resolve spatial partitions that intersect the area of interest
    3. Generate optimized DuckDB SQL query for the intersecting partitions
    4. Apply optional temporal filtering and buffering

    Query Optimization Features:
    - Generates efficient partition-aware SELECT statements
    - Includes spatial filtering for precise boundary handling
    - Supports temporal glob patterns for time-series data
    - Optimizes buffer zones for boundary analysis
    - Handles both point and polygon geometries efficiently

    Use Cases:
    - Interactive data exploration in Jupyter notebooks
    - Quick spatial queries without manual partition management
    - Temporal analysis with combined spatial/time filtering
    - Boundary analysis with automatic buffer zones
    - Prototype development and data discovery

    Args:
        schema_path: Path to catalog schema JSON file. Can be a local file path
            or any fsspec-supported URL (s3://, gcs://, https://, etc.).
        catalog_path: Path to catalog root directory containing partition
            subdirectories. Use the same protocol as schema_path for consistency.
        aoi_geometry: Area of interest as GeoJSON dictionary (e.g., from geojson.load())
            or Shapely geometry object (Point, Polygon, MultiPolygon, etc.).
        temporal_filter: Optional temporal filter.
            Examples: "2024-*" (all of 2024), "2024-01" (January 2024),
            "2024-01-15" (a single day).
        overlap: Include cells that merely overlap the geometry boundary.
            True (default) includes all intersecting cells; False restricts to
            cells whose centers fall within the geometry (grid-system dependent).
        buffer_cells: Number of additional grid cells to include around the
            geometry boundary. Useful for analysis requiring spatial context
            or edge effects handling. Applied using the grid system's native buffering.

    Returns:
        tuple[list[str], str]: Two-element tuple containing:
            - List of partition identifiers that intersect the geometry
            - Ready-to-execute DuckDB SQL query string for spatial analysis

    Raises:
        FileNotFoundError: If schema_path or catalog_path cannot be accessed.
        ValueError: If aoi_geometry format is invalid or unsupported.
        ImportError: If required dependencies are missing for remote file access.

    Example:
        >>> from shapely.geometry import box
        >>>
        >>> # Define area of interest (bounding box for New York City)
        >>> nyc_bbox = box(-74.25, 40.49, -73.70, 40.92)
        >>>
        >>> # Get partitions and query for recent data
        >>> partitions, query = resolve_and_query(
        ...     schema_path='s3://catalog/schema.json',
        ...     catalog_path='s3://catalog/data/',
        ...     aoi_geometry=nyc_bbox,
        ...     temporal_filter='2024-*',
        ...     buffer_cells=1  # Include surrounding context
        ... )
        >>>
        >>> print(f"Found {len(partitions)} intersecting partitions")
        >>> print("Generated DuckDB query:")
        >>> print(query)
        >>>
        >>> # Execute query with DuckDB
        >>> import duckdb
        >>> results = duckdb.sql(query).to_df()
        >>> print(f"Retrieved {len(results)} STAC items in AOI")

    Performance:
        - Optimized for interactive use with sub-second response times
        - Efficient even for complex geometries and large catalogs
        - Query generation scales linearly with partition count
        - Minimal memory usage regardless of catalog size

    Integration:
        Works seamlessly with DuckDB, GeoPandas, and other spatial analysis tools.
        Generated queries are compatible with DuckDB's spatial extension and
        can be modified for advanced spatial operations.
    """
    resolver = spatial_resolver(schema_path, catalog_path)
    partition_ids = resolver.resolve_partitions(aoi_geometry, overlap, buffer_cells)
    query_patterns = resolver.generate_query_paths(partition_ids, temporal_filter)

    if not query_patterns:
        return partition_ids, ""

    # Generate DuckDB query
    patterns_str = "', '".join(query_patterns)
    # Convert geometry to GeoJSON format for the query
    if hasattr(aoi_geometry, "__geo_interface__"):
        geom_geojson = aoi_geometry.__geo_interface__  # type: ignore
    else:
        geom_geojson = aoi_geometry

    query = f"""
    SELECT * FROM read_parquet(['{patterns_str}'])
    WHERE ST_Intersects(geometry, ST_GeomFromGeoJSON('{json.dumps(geom_geojson)}'))
    """.strip()

    return partition_ids, query
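
# Query-shape sketch (hypothetical paths): for a single matched partition the
# generated SQL looks like
#   SELECT * FROM read_parquet(['<catalog>/sentinel2/partition=h3/level=2/<cell>/year=2024/*/items.parquet'])
#   WHERE ST_Intersects(geometry, ST_GeomFromGeoJSON('{"type": "Polygon", ...}'))
# Executing it requires DuckDB's spatial extension (INSTALL spatial; LOAD spatial;).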


def infer_catalog_schema(catalog_path: str) -> dict[str, Any]:
    """Infer catalog schema from directory structure when no schema file is available.

    This function analyzes the directory structure of an existing catalog to determine
    its grid system, resolution, and available missions. Useful as a fallback when
    the catalog schema file is not available.

    The function looks for patterns like:
    - partition=h3, partition=s2, partition=utm, etc.
    - level=2, level=5, etc.
    - Mission directories (landsat8, sentinel2, etc.)

    Args:
        catalog_path: Path to the catalog root directory (local or remote).
            Supports S3, GCS, Azure via fsspec.

    Returns:
        dict: Inferred schema dictionary with structure:
            {
                "spatial_partitioning": {
                    "grid_system": str,  # e.g., "h3", "s2", "itslive"
                    "resolution": int,
                    "partitioning_scheme": "default"
                },
                "global_partitioning": {"enabled": bool, "threshold": int},
                "inferred": True  # Marker indicating this was auto-detected
            }

    Raises:
        FileNotFoundError: If catalog_path doesn't exist or can't be accessed.
        ValueError: If the directory structure doesn't match expected patterns.

    Example:
        >>> schema = infer_catalog_schema('./my_catalog/')
        >>> resolver = SpatialPartitionResolver(schema, './my_catalog/')

        >>> # Or use with spatial_resolver_from_path
        >>> resolver = spatial_resolver_from_path('./my_catalog/')
    """
    import os
    import re

    # Supported grid systems and their directory patterns
    grid_patterns = {
        "h3": re.compile(r"partition=h3", re.IGNORECASE),
        "s2": re.compile(r"partition=s2", re.IGNORECASE),
        "utm": re.compile(r"partition=utm", re.IGNORECASE),
        "mgrs": re.compile(r"partition=mgrs", re.IGNORECASE),
        "latlon": re.compile(r"partition=latlon", re.IGNORECASE),
        "itslive": re.compile(r"partition=itslive|[NS]\d{2}[EW]\d{3}", re.IGNORECASE),
        "geojson": re.compile(r"partition=geojson", re.IGNORECASE),
    }

    resolution_pattern = re.compile(r"level=(\d+)")

    # Check if path is remote
    remote_prefixes = ("s3://", "s3a://", "gs://", "gcs://", "az://", "abfs://", "azure://", "http://", "https://")
    is_remote = any(catalog_path.startswith(p) for p in remote_prefixes)

    detected_grid = None
    detected_resolution = None
    detected_missions: set[str] = set()
    has_global = False

    if is_remote:
        try:
            import fsspec
        except ImportError:
            raise ImportError("fsspec required for remote paths: pip install fsspec") from None

        fs, path = fsspec.core.url_to_fs(catalog_path)

        # List top-level directories
        try:
            entries = fs.ls(path, detail=False)
        except (OSError, ConnectionError, ValueError) as e:
            raise FileNotFoundError(f"Cannot access catalog path: {catalog_path}") from e

        for entry in entries[:50]:  # Limit scan
            entry_str = str(entry)

            # Check for grid pattern
            for grid_name, pattern in grid_patterns.items():
                if pattern.search(entry_str):
                    detected_grid = grid_name
                    break

            # Check for resolution
            res_match = resolution_pattern.search(entry_str)
            if res_match:
                detected_resolution = int(res_match.group(1))

            # Check for global partition
            if "global" in entry_str.lower():
                has_global = True

            # Extract mission names (top-level dirs that aren't system dirs)
            basename = os.path.basename(entry.rstrip("/"))
            if basename and not basename.startswith(".") and basename not in ["global"]:
                if "partition=" not in entry_str:
                    detected_missions.add(basename)
    else:
        # Local path
        catalog_dir = Path(catalog_path)
        if not catalog_dir.exists():
            raise FileNotFoundError(f"Catalog path does not exist: {catalog_path}")

        # Walk directory to find patterns
        for entry in catalog_dir.iterdir():
            if entry.is_dir():
                entry_str = str(entry)
                entry_name = entry.name

                # Check for grid pattern
                for grid_name, pattern in grid_patterns.items():
                    if pattern.search(entry_str):
                        detected_grid = grid_name
                        break

                # Also check subdirectories
                for subentry in entry.iterdir():
                    subentry_str = str(subentry)

                    for grid_name, pattern in grid_patterns.items():
                        if pattern.search(subentry_str):
                            detected_grid = grid_name
                            break

                    res_match = resolution_pattern.search(subentry_str)
                    if res_match:
                        detected_resolution = int(res_match.group(1))

                    if detected_grid and detected_resolution:
                        break

                # Check for global partition
                if entry_name.lower() == "global":
                    has_global = True

                # Potential mission directory (the global partition is excluded,
                # matching the remote branch above)
                if not entry_name.startswith(".") and entry_name.lower() != "global":
                    detected_missions.add(entry_name)

    # Validate we found something
    if detected_grid is None:
        raise ValueError(
            f"Could not detect grid system from catalog structure at {catalog_path}. "
            "Expected directories matching 'partition=h3', 'partition=s2', etc."
        )

    # Default resolution if not found
    if detected_resolution is None:
        default_resolutions = {"h3": 2, "s2": 13, "utm": 1, "mgrs": 5, "latlon": 1, "itslive": 10, "geojson": 1}
        detected_resolution = default_resolutions.get(detected_grid, 1)
        logger.warning(f"Could not detect resolution, using default: {detected_resolution}")

    # Build schema
    schema: dict[str, Any] = {
        "spatial_partitioning": {
            "grid_system": detected_grid,
            "resolution": detected_resolution,
            "partitioning_scheme": "default",
        },
        "global_partitioning": {
            "enabled": has_global,
            "threshold": 10 if has_global else 100,
        },
        "inferred": True,  # Marker for auto-detected schema
    }

    # Add missions if found
    if detected_missions:
        schema["spatial_partitioning"]["example_paths"] = [
            f"{m}/partition={detected_grid}/level={detected_resolution}/" for m in list(detected_missions)[:3]
        ]

    logger.info(f"Inferred catalog schema: grid={detected_grid}, resolution={detected_resolution}")
    return schema
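
# Shape sketch (illustrative values): for a local catalog laid out as
#   my_catalog/sentinel2/partition=h3/level=2/...
# the inferred schema comes back roughly as
#   {
#       "spatial_partitioning": {
#           "grid_system": "h3", "resolution": 2, "partitioning_scheme": "default",
#           "example_paths": ["sentinel2/partition=h3/level=2/"],
#       },
#       "global_partitioning": {"enabled": False, "threshold": 100},
#       "inferred": True,
#   }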


def spatial_resolver_from_path(catalog_path: str) -> SpatialPartitionResolver:
    """Create a spatial resolver by inferring schema from catalog directory structure.

    This is a convenience function for working with catalogs that don't have a
    schema file. It analyzes the directory structure to determine the grid system
    and resolution, then creates a resolver.

    Use this when:
    - You have an existing catalog without a schema.json file
    - You're exploring a catalog and don't know its structure
    - The schema file is missing or inaccessible

    Args:
        catalog_path: Path to the catalog root directory (local or remote).

    Returns:
        SpatialPartitionResolver: Configured resolver based on inferred schema.

    Raises:
        ValueError: If the directory structure doesn't match expected patterns.
        FileNotFoundError: If catalog_path doesn't exist.

    Example:
        >>> # Create resolver by inferring structure
        >>> resolver = spatial_resolver_from_path('s3://bucket/my-catalog/')
        >>> partitions = resolver.resolve_partitions(my_geometry)
        >>>
        >>> # Equivalent to:
        >>> schema = infer_catalog_schema('s3://bucket/my-catalog/')
        >>> resolver = SpatialPartitionResolver(schema, 's3://bucket/my-catalog/')
    """
    schema = infer_catalog_schema(catalog_path)
    return SpatialPartitionResolver(schema, catalog_path)