earthcatalog 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcatalog/__init__.py +164 -0
- earthcatalog/async_http_client.py +1006 -0
- earthcatalog/config.py +97 -0
- earthcatalog/engines/__init__.py +308 -0
- earthcatalog/engines/rustac_engine.py +142 -0
- earthcatalog/engines/stac_geoparquet_engine.py +126 -0
- earthcatalog/exceptions.py +471 -0
- earthcatalog/grid_systems.py +1114 -0
- earthcatalog/ingestion_pipeline.py +2281 -0
- earthcatalog/input_readers.py +603 -0
- earthcatalog/job_tracking.py +485 -0
- earthcatalog/pipeline.py +606 -0
- earthcatalog/schema_generator.py +911 -0
- earthcatalog/spatial_resolver.py +1207 -0
- earthcatalog/stac_hooks.py +754 -0
- earthcatalog/statistics.py +677 -0
- earthcatalog/storage_backends.py +548 -0
- earthcatalog/tests/__init__.py +1 -0
- earthcatalog/tests/conftest.py +76 -0
- earthcatalog/tests/test_all_grids.py +793 -0
- earthcatalog/tests/test_async_http.py +700 -0
- earthcatalog/tests/test_cli_and_storage.py +230 -0
- earthcatalog/tests/test_config.py +245 -0
- earthcatalog/tests/test_dask_integration.py +580 -0
- earthcatalog/tests/test_e2e_synthetic.py +1624 -0
- earthcatalog/tests/test_engines.py +272 -0
- earthcatalog/tests/test_exceptions.py +346 -0
- earthcatalog/tests/test_file_structure.py +245 -0
- earthcatalog/tests/test_input_readers.py +666 -0
- earthcatalog/tests/test_integration.py +200 -0
- earthcatalog/tests/test_integration_async.py +283 -0
- earthcatalog/tests/test_job_tracking.py +603 -0
- earthcatalog/tests/test_multi_file_input.py +336 -0
- earthcatalog/tests/test_passthrough_hook.py +196 -0
- earthcatalog/tests/test_pipeline.py +684 -0
- earthcatalog/tests/test_pipeline_components.py +665 -0
- earthcatalog/tests/test_schema_generator.py +506 -0
- earthcatalog/tests/test_spatial_resolver.py +413 -0
- earthcatalog/tests/test_stac_hooks.py +776 -0
- earthcatalog/tests/test_statistics.py +477 -0
- earthcatalog/tests/test_storage_backends.py +236 -0
- earthcatalog/tests/test_validation.py +435 -0
- earthcatalog/tests/test_workers.py +653 -0
- earthcatalog/validation.py +921 -0
- earthcatalog/workers.py +682 -0
- earthcatalog-0.2.0.dist-info/METADATA +333 -0
- earthcatalog-0.2.0.dist-info/RECORD +50 -0
- earthcatalog-0.2.0.dist-info/WHEEL +5 -0
- earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
- earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,911 @@

```python
"""Comprehensive catalog schema generator for EarthCatalog metadata and documentation.

This module provides intelligent schema generation capabilities that create detailed
metadata describing catalog structure, partitioning strategies, performance characteristics,
and usage patterns. The generated schemas enable efficient catalog discovery, validation,
and integration with spatial query tools.

Schema Generation Features:
- Complete catalog metadata with processing configuration
- Spatial partitioning documentation with grid system details
- Temporal organization structure and binning strategies
- Statistical summaries and performance metrics
- Usage examples and query optimization guidance
- Version tracking and compatibility information

Generated Schema Components:
    Catalog Info: Basic metadata about source data and processing configuration
    Spatial Partitioning: Detailed grid system parameters and spatial organization
    Temporal Partitioning: Time-based binning configuration and structure
    Partition Structure: Directory layout and file organization patterns
    Global Partitioning: Large geometry handling and optimization strategies
    Statistics: Performance metrics, item counts, and processing summaries
    Usage Guidelines: Query examples and optimization recommendations

Integration Benefits:
- Enables spatial_resolver to automatically configure partition resolution
- Provides documentation for catalog users and developers
- Supports catalog validation and integrity checking
- Facilitates integration with external tools and frameworks
- Enables performance monitoring and optimization tracking

Performance Tracking:
    The schema includes detailed performance metrics that help users understand:
    - Processing throughput and timing information
    - Memory usage patterns and optimization opportunities
    - Error rates and reliability statistics
    - Partition distribution and load balancing effectiveness

Example Generated Schema Structure:
    {
        "earthcatalog_version": "1.0.0",
        "generated_at": "2024-12-04T10:30:00Z",
        "spatial_partitioning": {
            "grid_system": "h3",
            "resolution": 6,
            "cell_area_km2": 36.1,
            "description": "H3 hexagonal grid level 6..."
        },
        "temporal_partitioning": {
            "bin_size": "month",
            "pattern": "YYYY-MM",
            "description": "Monthly temporal bins..."
        },
        "statistics": {
            "total_items": 1000000,
            "processing_time_seconds": 3600,
            "partitions_created": 450,
            "average_items_per_partition": 2222
        }
    }

Usage Patterns:
    >>> # Generate schema during pipeline execution
    >>> generator = SchemaGenerator(config, grid, storage)
    >>> schema = generator.generate_catalog_schema(stats)
    >>>
    >>> # Load existing schema for analysis
    >>> with open('catalog_schema.json') as f:
    ...     schema = json.load(f)
    >>> print(f"Catalog uses {schema['spatial_partitioning']['grid_system']} grid")
"""

import json
import logging
from datetime import UTC, datetime
from pathlib import Path
from typing import Any

from . import grid_systems
from .statistics import IngestionStatistics

logger = logging.getLogger(__name__)


class SchemaGenerator:
    """Intelligent catalog schema generator producing comprehensive metadata for EarthCatalog outputs.

    This class generates rich metadata schemas that document every aspect of catalog
    creation, from spatial partitioning strategies to performance characteristics.
    The schemas serve as technical documentation and enable automated tooling
    for catalog discovery, validation, and query optimization.

    Schema Generation Philosophy:
        The generator follows a comprehensive documentation approach that captures
        not just configuration parameters, but also derived information, performance
        metrics, and usage guidance. This enables users to understand catalog
        characteristics without examining the underlying data structure.

    Key Features:
    - Automatic grid system parameter calculation and documentation
    - Comprehensive performance metrics and statistical summaries
    - Usage examples and query optimization recommendations
    - Version tracking for catalog format evolution
    - Integration metadata for external tool compatibility

    Generated Metadata Categories:
        Configuration: Complete processing configuration and parameters
        Spatial: Grid system details, resolution characteristics, and spatial organization
        Temporal: Time-based binning strategies and partition patterns
        Structure: Directory layout, file naming conventions, and organization
        Performance: Processing metrics, throughput statistics, and optimization data
        Usage: Query examples, best practices, and integration guidance

    Performance Documentation:
        The generator captures detailed performance characteristics including:
        - Processing throughput and timing breakdowns
        - Memory usage patterns and resource requirements
        - Error rates and reliability metrics
        - Partition distribution and load balancing statistics
        - Query performance optimization recommendations

    Integration Support:
        Generated schemas enable seamless integration with:
        - spatial_resolver for automatic partition resolution
        - DuckDB and other query engines via documented structure
        - Monitoring and observability tools via performance metrics
        - Catalog validation and integrity checking systems
        - External geospatial analysis frameworks

    Thread Safety:
        This class is thread-safe for read operations after initialization.
        Schema generation methods can be called concurrently, though each
        instance should be used for a single catalog generation workflow.

    Example:
        >>> # Initialize with pipeline components
        >>> generator = SchemaGenerator(config, grid_system, storage_backend)
        >>>
        >>> # Generate complete schema with per-partition processing statistics
        >>> schema = generator.generate_catalog_schema(
        ...     partition_stats={
        ...         'sentinel2/partition=h3/level=6/8a2a1072b59ffff/year=2024/month=01': {
        ...             'total_items': 1200, 'new_items': 1200, 'existing_items': 0,
        ...         }
        ...     }
        ... )
        >>>
        >>> # Schema automatically written to catalog directory
        >>> print(f"Generated schema covers {schema['statistics']['unique_granules']} granules")
    """

    def __init__(
        self,
        config: Any,
        grid: grid_systems.GridSystem,
        storage: Any,
        stats: IngestionStatistics | None = None,
    ):
        """Initialize schema generator with pipeline configuration.

        Args:
            config: ProcessingConfig with all pipeline settings
            grid: Grid system instance used for partitioning
            storage: Storage backend for the output catalog
            stats: Optional IngestionStatistics with comprehensive metrics
        """
        self.config = config
        self.grid = grid
        self.storage = storage
        self.stats = stats

    def generate_catalog_schema(
        self, partition_stats: dict[str, Any], output_filename: str = "catalog_schema.json"
    ) -> dict[str, Any]:
        """Generate complete catalog schema metadata.

        Args:
            partition_stats: Statistics from pipeline execution, keyed by partition key
            output_filename: Name of schema file to write

        Returns:
            Dictionary containing the complete schema
        """
        logger.info("Generating catalog schema metadata...")

        # Build the schema
        schema = {
            "earthcatalog_version": "1.0.0",  # TODO: Get from package version
            "schema_version": "1.0.0",
            "generated_at": datetime.now(UTC).isoformat().replace("+00:00", "Z"),
            "catalog_info": self._get_catalog_info(),
            "spatial_partitioning": self._get_spatial_partitioning_info(),
            "temporal_partitioning": self._get_temporal_partitioning_info(),
            "partition_structure": self._get_partition_structure(partition_stats),
            "global_partitioning": self._get_global_partitioning_info(),
            "statistics": self._get_catalog_statistics(partition_stats),
            "usage": self._get_usage_info(),
        }

        # Write schema to output location
        self._write_schema(schema, output_filename)

        return schema
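
    # Downstream-consumption sketch (hypothetical paths; it mirrors the usage
    # examples embedded in _get_usage_info() below — the schema file written here
    # is what earthcatalog.spatial_resolver loads):
    #
    #     from earthcatalog.spatial_resolver import spatial_resolver
    #     resolver = spatial_resolver('catalog/catalog_schema.json')
    #     partition_ids = resolver.resolve_partitions(aoi_geometry, overlap=True)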

    def _get_catalog_info(self) -> dict[str, Any]:
        """Get basic catalog information."""
        config_dict = {
            "sort_key": self.config.sort_key,
            "sort_ascending": self.config.sort_ascending,
            "items_per_shard": self.config.items_per_shard,
            "max_workers": self.config.max_workers,
        }

        # Add standard configuration
        config_dict["output_format"] = getattr(self.config, "output_format", "geoparquet")
        config_dict["mission_field"] = getattr(self.config, "mission_field", "dataset_id")

        return {
            "output_path": self.config.output_catalog,
            "input_source": self.config.input_file,
            "processing_config": config_dict,
            "directory_structure": "{mission}/partition={grid_type}/level={resolution}/{h3_cell}/{temporal_bin}.{ext}",
            "mission_extraction": f"Extracts mission from '{config_dict['mission_field']}' field, falls back to 'collection'",
            "output_format": config_dict["output_format"],
        }

    def _get_spatial_partitioning_info(self) -> dict[str, Any]:
        """Get spatial partitioning configuration."""
        grid_info = {
            "grid_system": self.config.grid_system,
            "coordinate_system": "EPSG:4326",  # All grids use WGS84
            "description": self._get_grid_description(),
        }

        # Add grid-specific parameters
        if self.config.grid_system == "h3":
            grid_info.update(
                {
                    "resolution": self.config.grid_resolution,
                    "resolution_description": self._get_h3_resolution_description(self.config.grid_resolution),
                    "cell_area_km2": self._get_h3_average_area(self.config.grid_resolution),
                    "cell_edge_length_km": self._get_h3_average_edge_length(self.config.grid_resolution),
                }
            )
        elif self.config.grid_system == "s2":
            grid_info.update(
                {
                    "level": self.config.grid_resolution,
                    "level_description": self._get_s2_level_description(self.config.grid_resolution),
                    "average_cell_area_km2": self._get_s2_average_area(self.config.grid_resolution),
                }
            )
        elif self.config.grid_system == "mgrs":
            grid_info.update(
                {
                    "precision": self.config.grid_resolution,
                    "precision_description": self._get_mgrs_precision_description(self.config.grid_resolution),
                }
            )
        elif self.config.grid_system == "utm":
            grid_info.update(
                {
                    "precision": self.config.grid_resolution,
                    "precision_description": self._get_utm_precision_description(self.config.grid_resolution),
                }
            )
        elif self.config.grid_system == "latlon":
            grid_info.update(
                {
                    "cell_size_degrees": self.config.grid_resolution,
                    "cell_size_description": f"Each cell is {self.config.grid_resolution}° x {self.config.grid_resolution}°",
                }
            )
        elif self.config.grid_system == "itslive":
            grid_info.update(
                {
                    "cell_size_degrees": 10,
                    "cell_size_description": "Fixed 10° x 10° cells with center-based naming",
                    "naming_convention": "{N|S}{lat:02d}{E|W}{lon:03d}",
                    "example_cell_name": "N60W040",
                }
            )
        elif self.config.grid_system == "geojson":
            grid_info.update(
                {
                    "custom_grid": True,
                    "geojson_source": getattr(self.config, "geojson_path", "unknown"),
                    "custom_tiles": self._get_custom_tiles_info(),
                }
            )

        return grid_info

    def _get_temporal_partitioning_info(self) -> dict[str, Any]:
        """Get temporal partitioning configuration."""
        return {
            "temporal_bin": self.config.temporal_bin,
            "temporal_bin_description": {
                "year": "Items partitioned by year using Hive-style directories (year=YYYY/)",
                "month": "Items partitioned by year-month using Hive-style directories (year=YYYY/month=MM/)",
                "day": "Items partitioned by year-month-day using Hive-style directories (year=YYYY/month=MM/day=DD/)",
            }[self.config.temporal_bin],
            "datetime_field": self.config.sort_key,
            "hive_path_examples": {
                "year": "year=2024/items.parquet",
                "month": "year=2024/month=01/items.parquet",
                "day": "year=2024/month=01/day=15/items.parquet",
            }[self.config.temporal_bin],
            "pruning_benefit": "Directory-level pruning in DuckDB, Athena, Spark, and Trino",
        }

    def _get_partition_structure(self, partition_stats: dict[str, Any]) -> dict[str, Any]:
        """Get information about the actual partitions created."""
        partitions = []
        spatial_partitions = set()
        temporal_partitions = set()
        missions = set()

        for partition_key, stats in partition_stats.items():
            # Parse partition key with Hive-style temporal parts:
            # "mission/partition=grid_type/level=resolution/spatial_id/year=YYYY/month=MM[/day=DD]"
            parts = partition_key.split("/")
            if len(parts) >= 5:
                mission = parts[0]
                # Skip partition=grid_type and level=resolution parts
                spatial_id = parts[3] if len(parts) > 3 else "unknown"

                # Extract Hive-style temporal parts (year=YYYY, month=MM, day=DD)
                temporal_parts = []
                for part in parts[4:]:
                    if part.startswith(("year=", "month=", "day=")):
                        temporal_parts.append(part)

                # Convert to readable format (e.g., "year=2024/month=01" -> "2024-01")
                temporal_bin = self._hive_parts_to_temporal_bin(temporal_parts)

                missions.add(mission)
                spatial_partitions.add(spatial_id)
                temporal_partitions.add(temporal_bin)

                partitions.append(
                    {
                        "partition_key": partition_key,
                        "mission": mission,
                        "spatial_id": spatial_id,
                        "temporal_bin": temporal_bin,
                        "total_items": stats.get("total_items", 0),
                        "new_items": stats.get("new_items", 0),
                        "existing_items": stats.get("existing_items", 0),
                    }
                )

        result = {
            "total_partitions": len(partitions),
            "spatial_partitions_count": len(spatial_partitions),
            "temporal_partitions_count": len(temporal_partitions),
            "missions_count": len(missions),
            "spatial_partitions": sorted(spatial_partitions),
            "temporal_partitions": sorted(temporal_partitions),
            "missions": sorted(missions),
            "partitions": partitions,
        }

        return result
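
    # Worked example of the parsing above (derived directly from the code):
    #   partition_key = "sentinel2/partition=h3/level=6/8a2a1072b59ffff/year=2024/month=01"
    #   -> mission      = "sentinel2"         (parts[0])
    #   -> spatial_id   = "8a2a1072b59ffff"   (parts[3])
    #   -> temporal_bin = "2024-01"           (from ["year=2024", "month=01"])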

    def _hive_parts_to_temporal_bin(self, temporal_parts: list[str]) -> str:
        """Convert Hive-style temporal parts to readable temporal bin format.

        Args:
            temporal_parts: List like ["year=2024", "month=01", "day=15"]

        Returns:
            Formatted string like "2024-01-15", "2024-01", or "2024"
        """
        if not temporal_parts:
            return "unknown"

        year = month = day = None
        for part in temporal_parts:
            if part.startswith("year="):
                year = part.split("=")[1]
            elif part.startswith("month="):
                month = part.split("=")[1]
            elif part.startswith("day="):
                day = part.split("=")[1]

        if year and month and day:
            return f"{year}-{month}-{day}"
        elif year and month:
            return f"{year}-{month}"
        elif year:
            return year
        else:
            return "unknown"

    def _get_global_partitioning_info(self) -> dict[str, Any]:
        """Get global partitioning configuration."""
        return {
            "enabled": self.config.enable_global_partitioning,
            "threshold": self.config.global_partition_threshold,
            "description": (
                (
                    "Items spanning more than the threshold number of spatial cells "
                    "are placed in the 'global' partition instead of individual cells"
                )
                if self.config.enable_global_partitioning
                else "Global partitioning disabled"
            ),
        }

    def _get_catalog_statistics(self, partition_stats: dict[str, Any]) -> dict[str, Any]:
        """Get overall catalog statistics.

        If IngestionStatistics is available, returns comprehensive metrics including:
        - Unique granule counts (via HyperLogLog)
        - Overhead/duplication metrics
        - Spatial and temporal distribution
        - Data quality metrics
        - Processing performance

        Falls back to basic partition-derived stats if IngestionStatistics is not provided.
        """
        # If we have comprehensive statistics from ingestion, use them
        if self.stats is not None:
            return self.stats.get_summary()

        # Fallback: derive basic stats from partition_stats
        total_items = sum(s.get("total_items", 0) for s in partition_stats.values())
        new_items = sum(s.get("new_items", 0) for s in partition_stats.values())
        existing_items = sum(s.get("existing_items", 0) for s in partition_stats.values())

        return {
            "unique_granules": total_items,  # Best approximation without HyperLogLog
            "stored_references": total_items,
            "total_partitions": len(partition_stats),
            "total_files": len(partition_stats),
            "overhead": {
                "spanning_items": 0,
                "spanning_percentage": 0.0,
                "duplication_ratio": 1.0,
                "overhead_percentage": 0.0,
                "avg_tiles_per_spanning_item": 0.0,
                "max_tiles_per_item": 0,
                "tiles_distribution": {},
            },
            "global_partition": {
                "items_routed_to_global": 0,
                "percentage_global": 0.0,
            },
            "spatial": {
                "bbox": None,
                "cells_with_data": len(partition_stats),
                "items_per_cell": None,
                "hotspot_cells": [],
            },
            "temporal": {
                "earliest": None,
                "latest": None,
                "years_with_data": [],
                "distribution": {},
            },
            "quality": {
                "null_geometries": 0,
                "missing_datetime": 0,
                "geometry_types": {},
            },
            "missions": {},
            "files": {
                "total_count": len(partition_stats),
                "total_size_bytes": None,
                "size_stats": None,
                "items_per_file": None,
            },
            "processing": {
                "run_timestamp": None,
                "duration_seconds": 0,
                "urls_processed": 0,
                "urls_failed": 0,
                "success_rate": 100.0,
                "items_per_second": 0.0,
                "new_items": new_items,
                "existing_items": existing_items,
                "duplicates_removed": 0,
            },
        }

    def _get_usage_info(self) -> dict[str, Any]:
        """Get usage examples and recommendations."""
        return {
            "file_structure": self._get_file_structure_info(),
            "spatial_partition_resolution": {
                "description": "Automatically resolve spatial partitions that intersect with your area of interest",
                "python_example": """
from earthcatalog.spatial_resolver import spatial_resolver
from shapely.geometry import box
import duckdb

# Load resolver from this schema
resolver = spatial_resolver('catalog_schema.json')

# For remote schemas (requires fsspec):
# resolver = spatial_resolver('s3://bucket/catalog_schema.json', 's3://bucket/catalog/')
# resolver = spatial_resolver('https://example.com/schema.json', './catalog/')

# Define area of interest (example: San Francisco Bay Area)
aoi = box(-122.5, 37.7, -122.0, 38.0)

# Resolve intersecting partitions dynamically
partition_ids = resolver.resolve_partitions(aoi, overlap=True, buffer_cells=1)

# Generate query paths with Hive-style temporal filtering
# '2024-01' becomes 'year=2024/month=01/items.parquet'
query_paths = resolver.generate_query_paths(partition_ids, '2024-01')

# Query only relevant data - DuckDB skips non-matching temporal directories
if query_paths:
    result = duckdb.sql(f"SELECT * FROM read_parquet({query_paths})").df()
    print(f"Queried {len(partition_ids)} partitions, found {len(result)} items")
""",
                "grid_specific_notes": self._get_grid_specific_notes(),
            },
            "partition_pruning": {
                "description": "Use Hive-style directory structure for automatic partition pruning",
                "recommended_approach": "Use SpatialPartitionResolver for automatic spatial and temporal filtering",
                "manual_spatial_filter_example": self._get_spatial_filter_example(),
                "temporal_filter_note": "Temporal filtering uses Hive-style directories (year=YYYY/month=MM/) for directory-level pruning",
                "duckdb_examples": self._get_duckdb_examples(),
            },
            "recommended_tools": [
                {
                    "tool": "EarthCatalog SpatialPartitionResolver",
                    "use_case": "Automatic spatial partition resolution from geometry",
                    "example": "resolver.resolve_partitions(your_geometry) -> ['partition1', 'partition2']",
                },
                {
                    "tool": "DuckDB",
                    "use_case": "Fast analytical queries with automatic partition pruning",
                    "example": "SELECT * FROM read_parquet(['catalog/**/year=2024/month=01/items.parquet'])",
                },
                {
                    "tool": "Apache Arrow/Parquet",
                    "use_case": "Column-oriented analysis and filtering",
                    "example": "Use spatial_id and temporal_bin columns for efficient filtering",
                },
            ],
        }

    def _get_grid_description(self) -> str:
        """Get human-readable description of the grid system."""
        descriptions = {
            "h3": "Uber H3 hexagonal hierarchical spatial index",
            "s2": "Google S2 spherical geometry library cells",
            "mgrs": "Military Grid Reference System (MGRS) grid",
            "utm": "Universal Transverse Mercator (UTM) grid zones",
            "latlon": "Simple latitude-longitude rectangular grid",
            "itslive": "ITS_LIVE center-based 10-degree grid for glacier/ice datasets",
            "geojson": "Custom grid defined by GeoJSON features",
        }
        return descriptions.get(self.config.grid_system, f"Unknown grid system: {self.config.grid_system}")

    def _get_h3_resolution_description(self, resolution: int) -> str:
        """Get description of H3 resolution level."""
        descriptions = {
            0: "Very coarse - continents/countries",
            1: "Large countries/regions",
            2: "Countries/large states",
            3: "States/provinces",
            4: "Large counties/regions",
            5: "Counties/metropolitan areas",
            6: "Cities/large municipalities",
            7: "City districts/neighborhoods",
            8: "Neighborhoods/census blocks",
            9: "City blocks/large buildings",
            10: "Buildings/small areas",
            11: "Building parts/rooms",
            12: "Very fine - room-level detail",
            13: "Sub-room level",
            14: "Very high precision",
            15: "Highest precision",
        }
        return descriptions.get(resolution, f"Resolution {resolution}")

    def _get_h3_average_area(self, resolution: int) -> float | None:
        """Get average H3 cell area in km²."""
        # Approximate areas from H3 documentation
        areas = {
            0: 4250546.848,
            1: 607220.9782,
            2: 86745.85403,
            3: 12392.26486,
            4: 1770.323552,
            5: 252.9033645,
            6: 36.1290521,
            7: 5.1612932,
            8: 0.7373276,
            9: 0.1053325,
            10: 0.0150475,
            11: 0.0021496,
            12: 0.0003071,
            13: 0.0000439,
            14: 0.0000063,
            15: 0.0000009,
        }
        return areas.get(resolution)
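
    # E.g. resolution 6 -> ~36.13 km² per cell, consistent with the
    # "cell_area_km2": 36.1 shown in the module docstring's example schema.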

    def _get_h3_average_edge_length(self, resolution: int) -> float | None:
        """Get average H3 cell edge length in km."""
        # Approximate edge lengths from H3 documentation
        edges = {
            0: 1107.712591,
            1: 418.6760055,
            2: 158.2446558,
            3: 59.81085794,
            4: 22.6063794,
            5: 8.544408276,
            6: 3.229953667,
            7: 1.220629759,
            8: 0.461354684,
            9: 0.174375668,
            10: 0.065907807,
            11: 0.024910561,
            12: 0.009415526,
            13: 0.003559893,
            14: 0.001348575,
            15: 0.000509713,
        }
        return edges.get(resolution)

    def _get_s2_level_description(self, level: int) -> str:
        """Get description of S2 level."""
        if level <= 3:
            return f"Very coarse - level {level}"
        elif level <= 6:
            return f"Coarse - level {level}"
        elif level <= 10:
            return f"Medium - level {level}"
        elif level <= 15:
            return f"Fine - level {level}"
        else:
            return f"Very fine - level {level}"

    def _get_s2_average_area(self, level: int) -> float | None:
        """Get approximate S2 cell area in km²."""
        # S2 cells get 4x smaller each level
        base_area = 85011012.19  # Area at level 0 in km²
        return float(base_area / (4**level))
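
    # Worked example of the formula above:
    #   level 10 -> 85011012.19 / 4**10 = 85011012.19 / 1048576 ≈ 81.07 km² per cell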

    def _get_mgrs_precision_description(self, precision: int) -> str:
        """Get MGRS precision description."""
        descriptions = {
            1: "100km x 100km grid squares",
            2: "10km x 10km grid squares",
            3: "1km x 1km grid squares",
            4: "100m x 100m grid squares",
            5: "10m x 10m grid squares",
            6: "1m x 1m grid squares",
        }
        return descriptions.get(precision, f"Precision {precision}")

    def _get_utm_precision_description(self, precision: int) -> str:
        """Get UTM precision description."""
        descriptions = {
            1: "UTM zones (6° wide)",
            2: "100km x 100km squares",
            3: "10km x 10km squares",
            4: "1km x 1km squares",
            5: "100m x 100m squares",
        }
        return descriptions.get(precision, f"Precision {precision}")

    def _get_custom_tiles_info(self) -> dict[str, Any] | None:
        """Get information about custom tiles if using GeoJSON grid."""
        if not hasattr(self.config, "geojson_path") or not self.config.geojson_path:
            return None

        try:
            # Try to read the GeoJSON file to get tile information
            if self.config.geojson_path.startswith("s3://"):
                # Would need to implement S3 reading for geojson
                return {"note": "Custom GeoJSON tiles (S3 path cannot be analyzed)"}
            else:
                with open(self.config.geojson_path) as f:
                    geojson_data = json.load(f)

                features = geojson_data.get("features", [])
                tile_ids = [f.get("properties", {}).get("id") for f in features]
                tile_ids = [tid for tid in tile_ids if tid]  # Filter None values

                return {"total_tiles": len(features), "tile_ids": tile_ids, "source_file": self.config.geojson_path}
        except (OSError, json.JSONDecodeError, ValueError, TypeError) as e:
            logger.warning(f"Could not read custom GeoJSON file: {e}")
            return {"note": "Custom GeoJSON tiles (file could not be read)"}

    def _get_spatial_filter_example(self) -> str:
        """Get example of spatial filtering based on grid system."""
        if self.config.grid_system == "h3":
            return "spatial_id IN ('8a2a1072b59ffff', '8a2a1072b5bffff', ...)"
        elif self.config.grid_system == "itslive":
            return "spatial_id IN ('N60W040', 'N60W030', 'N70W040', ...)"
        elif self.config.grid_system == "geojson":
            return "spatial_id IN ('region_a', 'region_b', ...)"
        else:
            return f"spatial_id IN ('tile_1', 'tile_2', ...) -- {self.config.grid_system} tile IDs"

    def _get_duckdb_examples(self) -> list[dict[str, str]]:
        """Get DuckDB query examples for partition pruning."""
        examples = []

        # Dynamic spatial resolution example (recommended approach)
        global_note = ""
        if self.config.enable_global_partitioning:
            global_note = f"""
# NOTE: Large queries automatically include 'global' partition
# Threshold: {self.config.global_partition_threshold} cells
# Items spanning > threshold cells are stored in global/"""

        examples.append(
            {
                "description": "Dynamic spatial partition resolution with global partition support (RECOMMENDED)",
                "query": f"""
# Python code using SpatialPartitionResolver
from earthcatalog.spatial_resolver import spatial_resolver
from shapely.geometry import box

# Load resolver from schema
resolver = spatial_resolver('catalog_schema.json')

# Small area (city-scale) - no global partition needed
small_aoi = box(-122.5, 37.7, -122.0, 38.0)  # San Francisco
small_partitions = resolver.resolve_partitions(small_aoi, overlap=True)
print(f"Small query: {{len(small_partitions)}} partitions, global: {{'global' in small_partitions}}")

# Large area (state-scale) - automatically includes global partition!
large_aoi = box(-124.0, 32.0, -114.0, 42.0)  # California
large_partitions = resolver.resolve_partitions(large_aoi, overlap=True)
print(f"Large query: {{len(large_partitions)}} partitions, global: {{'global' in large_partitions}}")

# Generate Hive-style query paths with temporal filter
# '2024-01' becomes 'year=2024/month=01/items.parquet'
query_patterns = resolver.generate_query_paths(large_partitions, '2024-01')

# Query captures both spatial cells AND global partition items
# DuckDB skips non-matching temporal directories (directory-level pruning)
import duckdb
result = duckdb.sql(f"SELECT * FROM read_parquet({{query_patterns}})").df(){global_note}
""".strip(),
            }
        )

        # Manual spatial filter (fallback)
        examples.append(
            {
                "description": "Manual spatial partition filter (if you know the partition IDs)",
                "query": """
SELECT * FROM read_parquet('catalog/**/items.parquet')
WHERE spatial_id IN ('8a2a1072b59ffff', '8a2a1072b5bffff')
""".strip(),
            }
        )

        # Temporal filter using directory paths
        examples.append(
            {
                "description": f"Filter by temporal range using Hive-style paths ({self.config.temporal_bin})",
                "query": """
-- Use glob patterns for directory-level temporal pruning (most efficient)
SELECT * FROM read_parquet('catalog/**/year=2024/month=*/items.parquet')

-- Or query all and filter by datetime column (less efficient, but more flexible)
SELECT * FROM read_parquet('catalog/**/items.parquet')
WHERE datetime >= '2024-01-01' AND datetime < '2025-01-01'
""".strip(),
            }
        )

        # Combined with spatial intersection
        examples.append(
            {
                "description": "Combine partition pruning with geometric intersection",
                "query": """
-- First use SpatialPartitionResolver to get relevant partitions, then:
-- The resolver returns Hive-style paths like 'catalog/.../year=2024/month=06/items.parquet'
SELECT * FROM read_parquet('catalog/[resolved_partitions]/**/year=2024/month=06/items.parquet')
WHERE ST_Intersects(geometry, ST_GeomFromText('POLYGON((-122.5 37.7, -122.0 37.7, -122.0 38.0, -122.5 38.0, -122.5 37.7))'))
""".strip(),
            }
        )

        # Global partition query
        if self.config.enable_global_partitioning:
            examples.append(
                {
                    "description": "Query items that span multiple spatial partitions",
                    "query": """
-- Query global partition with Hive-style temporal directory
SELECT * FROM read_parquet('catalog/**/global/year=2024/month=01/items.parquet')
""".strip(),
                }
            )

        return examples

    def _get_grid_specific_notes(self) -> dict[str, str]:
        """Get grid-specific notes for spatial resolution."""
        notes = {
            "h3": "H3 cells use hexagonal geometry. Use overlap=True to include boundary cells.",
            "s2": "S2 cells use spherical geometry. Higher levels provide finer resolution.",
            "mgrs": "MGRS uses military grid system. Precision affects cell size.",
            "utm": "UTM zones are 6° wide. Consider zone boundaries for large areas.",
            "latlon": "Simple rectangular grid. Cell size is in degrees.",
            "itslive": "ITS_LIVE uses fixed 10° cells with center-based naming. Optimized for glacier datasets.",
            "geojson": "Custom geometry tiles. Intersection depends on your tile definitions.",
        }
        return {self.config.grid_system: notes.get(self.config.grid_system, "Grid-specific resolution")}

    def _get_file_structure_info(self) -> dict[str, str | list[str]]:
        """Get file structure description and examples."""
        ext = getattr(self.config, "output_format", "geoparquet")
        ext_suffix = "parquet" if ext == "geoparquet" else ext

        # Describe Hive-style temporal partitioning
        temporal_desc = {
            "year": "year={YYYY}",
            "month": "year={YYYY}/month={MM}",
            "day": "year={YYYY}/month={MM}/day={DD}",
        }.get(self.config.temporal_bin, "year={YYYY}/month={MM}")

        description = (
            f"Catalog uses Hive-style partitioning: "
            f"{{mission}}/partition={{grid_type}}/level={{resolution}}/{{spatial_id}}/{temporal_desc}/items.{ext_suffix}"
        )

        grid_type = self.config.grid_system
        resolution = self.config.grid_resolution

        if self.config.grid_system == "h3":
            if self.config.temporal_bin == "day":
                example_paths = [
                    f"sentinel2/partition={grid_type}/level={resolution}/8a2a1072b59ffff/year=2024/month=01/day=15/items.{ext_suffix}",
                    f"landsat8/partition={grid_type}/level={resolution}/8a2a1072b5bffff/year=2024/month=01/day=15/items.{ext_suffix}",
                ]
            elif self.config.temporal_bin == "year":
                example_paths = [
                    f"sentinel2/partition={grid_type}/level={resolution}/8a2a1072b59ffff/year=2024/items.{ext_suffix}",
                    f"landsat8/partition={grid_type}/level={resolution}/8a2a1072b5bffff/year=2024/items.{ext_suffix}",
                ]
            else:  # month (default)
                example_paths = [
                    f"sentinel2/partition={grid_type}/level={resolution}/8a2a1072b59ffff/year=2024/month=01/items.{ext_suffix}",
                    f"landsat8/partition={grid_type}/level={resolution}/8a2a1072b5bffff/year=2024/month=01/items.{ext_suffix}",
                ]
        else:
            if self.config.temporal_bin == "day":
                example_paths = [
                    f"mission_a/partition={grid_type}/level={resolution}/tile_001/year=2024/month=01/day=15/items.{ext_suffix}",
                    f"mission_b/partition={grid_type}/level={resolution}/tile_002/year=2024/month=01/day=15/items.{ext_suffix}",
                ]
            elif self.config.temporal_bin == "year":
                example_paths = [
                    f"mission_a/partition={grid_type}/level={resolution}/tile_001/year=2024/items.{ext_suffix}",
                    f"mission_b/partition={grid_type}/level={resolution}/tile_002/year=2024/items.{ext_suffix}",
                ]
            else:  # month (default)
                example_paths = [
                    f"mission_a/partition={grid_type}/level={resolution}/tile_001/year=2024/month=01/items.{ext_suffix}",
                    f"mission_b/partition={grid_type}/level={resolution}/tile_002/year=2024/month=01/items.{ext_suffix}",
                ]

        if self.config.enable_global_partitioning:
            if self.config.temporal_bin == "day":
                example_paths.append(
                    f"sentinel2/partition={grid_type}/level={resolution}/global/year=2024/month=01/day=15/items.{ext_suffix}"
                )
            elif self.config.temporal_bin == "year":
                example_paths.append(
                    f"sentinel2/partition={grid_type}/level={resolution}/global/year=2024/items.{ext_suffix}"
                )
            else:
                example_paths.append(
                    f"sentinel2/partition={grid_type}/level={resolution}/global/year=2024/month=01/items.{ext_suffix}"
                )

        return {
            "description": description,
            "example_paths": example_paths,
            "temporal_partitioning": f"Hive-style ({self.config.temporal_bin} granularity)",
            "pruning_benefit": "DuckDB/Athena/Spark skip entire directories during temporal filtering",
        }

    def _write_schema(self, schema: dict[str, Any], filename: str) -> None:
        """Write schema to the output location."""
        schema_content = json.dumps(schema, indent=2, ensure_ascii=False)

        # Build full path to schema file
        full_path = f"{self.config.output_catalog}/{filename}"

        try:
            # Use the storage backend to write the schema
            self.storage.makedirs(Path(self.config.output_catalog))
            with self.storage.open(full_path, "w") as f:
                f.write(schema_content.encode("utf-8"))
        except (OSError, ValueError, TypeError, RuntimeError) as e:
            logger.error(f"Failed to write schema using storage backend: {e}")
            # Fallback to local write if storage backend fails
            try:
                output_path = Path(self.config.output_catalog) / filename
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(schema_content)
                logger.info(f"Schema written to local path: {output_path}")
            except (OSError, ValueError, TypeError) as fallback_error:
                logger.error(f"Failed to write schema to local path: {fallback_error}")
                raise

        logger.info(f"Schema written to: {full_path}")
```