earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,911 @@
+ """Comprehensive catalog schema generator for EarthCatalog metadata and documentation.
+
+ This module provides intelligent schema generation capabilities that create detailed
+ metadata describing catalog structure, partitioning strategies, performance characteristics,
+ and usage patterns. The generated schemas enable efficient catalog discovery, validation,
+ and integration with spatial query tools.
+
+ Schema Generation Features:
+     - Complete catalog metadata with processing configuration
+     - Spatial partitioning documentation with grid system details
+     - Temporal organization structure and binning strategies
+     - Statistical summaries and performance metrics
+     - Usage examples and query optimization guidance
+     - Version tracking and compatibility information
+
+ Generated Schema Components:
+     Catalog Info: Basic metadata about source data and processing configuration
+     Spatial Partitioning: Detailed grid system parameters and spatial organization
+     Temporal Partitioning: Time-based binning configuration and structure
+     Partition Structure: Directory layout and file organization patterns
+     Global Partitioning: Large geometry handling and optimization strategies
+     Statistics: Performance metrics, item counts, and processing summaries
+     Usage Guidelines: Query examples and optimization recommendations
+
+ Integration Benefits:
+     - Enables spatial_resolver to automatically configure partition resolution
+     - Provides documentation for catalog users and developers
+     - Supports catalog validation and integrity checking
+     - Facilitates integration with external tools and frameworks
+     - Enables performance monitoring and optimization tracking
+
+ Performance Tracking:
+     The schema includes detailed performance metrics that help users understand:
+     - Processing throughput and timing information
+     - Memory usage patterns and optimization opportunities
+     - Error rates and reliability statistics
+     - Partition distribution and load balancing effectiveness
+
+ Example Generated Schema Structure:
+     {
+         "earthcatalog_version": "1.0.0",
+         "generated_at": "2024-12-04T10:30:00Z",
+         "spatial_partitioning": {
+             "grid_system": "h3",
+             "resolution": 6,
+             "cell_area_km2": 36.1,
+             "description": "H3 hexagonal grid level 6..."
+         },
+         "temporal_partitioning": {
+             "bin_size": "month",
+             "pattern": "YYYY-MM",
+             "description": "Monthly temporal bins..."
+         },
+         "statistics": {
+             "total_items": 1000000,
+             "processing_time_seconds": 3600,
+             "partitions_created": 450,
+             "average_items_per_partition": 2222
+         }
+     }
+
+ Usage Patterns:
+     >>> # Generate schema during pipeline execution
+     >>> generator = SchemaGenerator(config, grid, storage)
+     >>> schema = generator.generate_catalog_schema(partition_stats)
+     >>>
+     >>> # Load existing schema for analysis
+     >>> with open('catalog_schema.json') as f:
+     ...     schema = json.load(f)
+     >>> print(f"Catalog uses {schema['spatial_partitioning']['grid_system']} grid")
+ """
+
+ import json
+ import logging
+ from datetime import UTC, datetime
+ from pathlib import Path
+ from typing import Any
+
+ from . import grid_systems
+ from .statistics import IngestionStatistics
+
+ logger = logging.getLogger(__name__)
+
+
+ class SchemaGenerator:
+     """Intelligent catalog schema generator producing comprehensive metadata for EarthCatalog outputs.
+
+     This class generates rich metadata schemas that document every aspect of catalog
+     creation, from spatial partitioning strategies to performance characteristics.
+     The schemas serve both as technical documentation and as input to automated
+     tooling for catalog discovery, validation, and query optimization.
+
+     Schema Generation Philosophy:
+         The generator follows a comprehensive documentation approach that captures
+         not just configuration parameters, but also derived information, performance
+         metrics, and usage guidance. This enables users to understand catalog
+         characteristics without examining the underlying data structure.
+
+     Key Features:
+         - Automatic grid system parameter calculation and documentation
+         - Comprehensive performance metrics and statistical summaries
+         - Usage examples and query optimization recommendations
+         - Version tracking for catalog format evolution
+         - Integration metadata for external tool compatibility
+
+     Generated Metadata Categories:
+         Configuration: Complete processing configuration and parameters
+         Spatial: Grid system details, resolution characteristics, and spatial organization
+         Temporal: Time-based binning strategies and partition patterns
+         Structure: Directory layout, file naming conventions, and organization
+         Performance: Processing metrics, throughput statistics, and optimization data
+         Usage: Query examples, best practices, and integration guidance
+
+     Performance Documentation:
+         The generator captures detailed performance characteristics including:
+         - Processing throughput and timing breakdowns
+         - Memory usage patterns and resource requirements
+         - Error rates and reliability metrics
+         - Partition distribution and load balancing statistics
+         - Query performance optimization recommendations
+
+     Integration Support:
+         Generated schemas enable seamless integration with:
+         - spatial_resolver for automatic partition resolution
+         - DuckDB and other query engines via documented structure
+         - Monitoring and observability tools via performance metrics
+         - Catalog validation and integrity checking systems
+         - External geospatial analysis frameworks
+
+     Thread Safety:
+         This class is thread-safe for read operations after initialization.
+         Schema generation methods can be called concurrently, though each
+         instance should be used for a single catalog generation workflow.
+
+     Example:
+         >>> # Initialize with pipeline components
+         >>> generator = SchemaGenerator(config, grid_system, storage_backend)
+         >>>
+         >>> # Generate complete schema with processing statistics
+         >>> schema = generator.generate_catalog_schema(
+         ...     partition_stats={'total_items': 1000000, 'processing_time': 3600}
+         ... )
+         >>>
+         >>> # Schema is automatically written to the catalog directory
+         >>> print(f"Generated schema for {schema['statistics']['total_items']} items")
+     """
+
+     def __init__(
+         self,
+         config: Any,
+         grid: grid_systems.GridSystem,
+         storage: Any,
+         stats: IngestionStatistics | None = None,
+     ):
+         """Initialize schema generator with pipeline configuration.
+
+         Args:
+             config: ProcessingConfig with all pipeline settings
+             grid: Grid system instance used for partitioning
+             storage: Storage backend for the output catalog
+             stats: Optional IngestionStatistics with comprehensive metrics
+         """
+         self.config = config
+         self.grid = grid
+         self.storage = storage
+         self.stats = stats
+
+     def generate_catalog_schema(
+         self, partition_stats: dict[str, Any], output_filename: str = "catalog_schema.json"
+     ) -> dict[str, Any]:
+         """Generate complete catalog schema metadata.
+
+         Args:
+             partition_stats: Statistics from pipeline execution
+             output_filename: Name of schema file to write
+
+         Returns:
+             Dictionary containing the complete schema
+         """
+         logger.info("Generating catalog schema metadata...")
+
+         # Build the schema
+         schema = {
+             "earthcatalog_version": "1.0.0",  # TODO: Get from package version
+             "schema_version": "1.0.0",
+             "generated_at": datetime.now(UTC).isoformat().replace("+00:00", "Z"),
+             "catalog_info": self._get_catalog_info(),
+             "spatial_partitioning": self._get_spatial_partitioning_info(),
+             "temporal_partitioning": self._get_temporal_partitioning_info(),
+             "partition_structure": self._get_partition_structure(partition_stats),
+             "global_partitioning": self._get_global_partitioning_info(),
+             "statistics": self._get_catalog_statistics(partition_stats),
+             "usage": self._get_usage_info(),
+         }
+
+         # Write schema to output location
+         self._write_schema(schema, output_filename)
+
+         return schema
+
+     def _get_catalog_info(self) -> dict[str, Any]:
+         """Get basic catalog information."""
+         config_dict = {
+             "sort_key": self.config.sort_key,
+             "sort_ascending": self.config.sort_ascending,
+             "items_per_shard": self.config.items_per_shard,
+             "max_workers": self.config.max_workers,
+         }
+
+         # Add standard configuration
+         config_dict["output_format"] = getattr(self.config, "output_format", "geoparquet")
+         config_dict["mission_field"] = getattr(self.config, "mission_field", "dataset_id")
+
+         return {
+             "output_path": self.config.output_catalog,
+             "input_source": self.config.input_file,
+             "processing_config": config_dict,
+             "directory_structure": "{mission}/partition={grid_type}/level={resolution}/{spatial_id}/year={YYYY}[/month={MM}[/day={DD}]]/items.{ext}",
+             "mission_extraction": f"Extracts mission from '{config_dict['mission_field']}' field, falls back to 'collection'",
+             "output_format": config_dict["output_format"],
+         }
+
+     def _get_spatial_partitioning_info(self) -> dict[str, Any]:
+         """Get spatial partitioning configuration."""
+         grid_info = {
+             "grid_system": self.config.grid_system,
+             "coordinate_system": "EPSG:4326",  # All grids use WGS84
+             "description": self._get_grid_description(),
+         }
+
+         # Add grid-specific parameters
+         if self.config.grid_system == "h3":
+             grid_info.update(
+                 {
+                     "resolution": self.config.grid_resolution,
+                     "resolution_description": self._get_h3_resolution_description(self.config.grid_resolution),
+                     "cell_area_km2": self._get_h3_average_area(self.config.grid_resolution),
+                     "cell_edge_length_km": self._get_h3_average_edge_length(self.config.grid_resolution),
+                 }
+             )
+         elif self.config.grid_system == "s2":
+             grid_info.update(
+                 {
+                     "level": self.config.grid_resolution,
+                     "level_description": self._get_s2_level_description(self.config.grid_resolution),
+                     "average_cell_area_km2": self._get_s2_average_area(self.config.grid_resolution),
+                 }
+             )
+         elif self.config.grid_system == "mgrs":
+             grid_info.update(
+                 {
+                     "precision": self.config.grid_resolution,
+                     "precision_description": self._get_mgrs_precision_description(self.config.grid_resolution),
+                 }
+             )
+         elif self.config.grid_system == "utm":
+             grid_info.update(
+                 {
+                     "precision": self.config.grid_resolution,
+                     "precision_description": self._get_utm_precision_description(self.config.grid_resolution),
+                 }
+             )
+         elif self.config.grid_system == "latlon":
+             grid_info.update(
+                 {
+                     "cell_size_degrees": self.config.grid_resolution,
+                     "cell_size_description": f"Each cell is {self.config.grid_resolution}° x {self.config.grid_resolution}°",
+                 }
+             )
+         elif self.config.grid_system == "itslive":
+             grid_info.update(
+                 {
+                     "cell_size_degrees": 10,
+                     "cell_size_description": "Fixed 10° x 10° cells with center-based naming",
+                     "naming_convention": "{N|S}{lat:02d}{E|W}{lon:03d}",
+                     "example_cell_name": "N60W040",
+                 }
+             )
+         elif self.config.grid_system == "geojson":
+             grid_info.update(
+                 {
+                     "custom_grid": True,
+                     "geojson_source": getattr(self.config, "geojson_path", "unknown"),
+                     "custom_tiles": self._get_custom_tiles_info(),
+                 }
+             )
+
+         return grid_info
+
+     def _get_temporal_partitioning_info(self) -> dict[str, Any]:
+         """Get temporal partitioning configuration."""
+         return {
+             "temporal_bin": self.config.temporal_bin,
+             "temporal_bin_description": {
+                 "year": "Items partitioned by year using Hive-style directories (year=YYYY/)",
+                 "month": "Items partitioned by year-month using Hive-style directories (year=YYYY/month=MM/)",
+                 "day": "Items partitioned by year-month-day using Hive-style directories (year=YYYY/month=MM/day=DD/)",
+             }[self.config.temporal_bin],
+             "datetime_field": self.config.sort_key,
+             "hive_path_examples": {
+                 "year": "year=2024/items.parquet",
+                 "month": "year=2024/month=01/items.parquet",
+                 "day": "year=2024/month=01/day=15/items.parquet",
+             }[self.config.temporal_bin],
+             "pruning_benefit": "Directory-level pruning in DuckDB, Athena, Spark, and Trino",
+         }
+
+     def _get_partition_structure(self, partition_stats: dict[str, Any]) -> dict[str, Any]:
+         """Get information about the actual partitions created."""
+         partitions = []
+         spatial_partitions = set()
+         temporal_partitions = set()
+         missions = set()
+
+         for partition_key, stats in partition_stats.items():
+             # Parse partition key with Hive-style temporal parts:
+             # "mission/partition=grid_type/level=resolution/spatial_id/year=YYYY/month=MM[/day=DD]"
+             parts = partition_key.split("/")
+             if len(parts) >= 5:
+                 mission = parts[0]
+                 # Skip the partition=grid_type and level=resolution parts; the
+                 # len(parts) >= 5 guard guarantees parts[3] exists
+                 spatial_id = parts[3]
+
+                 # Extract Hive-style temporal parts (year=YYYY, month=MM, day=DD)
+                 temporal_parts = []
+                 for part in parts[4:]:
+                     if part.startswith(("year=", "month=", "day=")):
+                         temporal_parts.append(part)
+
+                 # Convert to readable format (e.g., "year=2024/month=01" -> "2024-01")
+                 temporal_bin = self._hive_parts_to_temporal_bin(temporal_parts)
+
+                 missions.add(mission)
+                 spatial_partitions.add(spatial_id)
+                 temporal_partitions.add(temporal_bin)
+
+                 partitions.append(
+                     {
+                         "partition_key": partition_key,
+                         "mission": mission,
+                         "spatial_id": spatial_id,
+                         "temporal_bin": temporal_bin,
+                         "total_items": stats.get("total_items", 0),
+                         "new_items": stats.get("new_items", 0),
+                         "existing_items": stats.get("existing_items", 0),
+                     }
+                 )
+
+         result = {
+             "total_partitions": len(partitions),
+             "spatial_partitions_count": len(spatial_partitions),
+             "temporal_partitions_count": len(temporal_partitions),
+             "missions_count": len(missions),
+             "spatial_partitions": sorted(spatial_partitions),
+             "temporal_partitions": sorted(temporal_partitions),
+             "missions": sorted(missions),
+             "partitions": partitions,
+         }
+
+         return result
+
+     def _hive_parts_to_temporal_bin(self, temporal_parts: list[str]) -> str:
+         """Convert Hive-style temporal parts to readable temporal bin format.
+
+         Args:
+             temporal_parts: List like ["year=2024", "month=01", "day=15"]
+
+         Returns:
+             Formatted string like "2024-01-15", "2024-01", or "2024"
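+
+         Example (a sanity-check doctest mirroring the logic below; `gen` is
+         assumed to be a SchemaGenerator instance):
+             >>> gen._hive_parts_to_temporal_bin(["year=2024", "month=01"])
+             '2024-01'
+             >>> gen._hive_parts_to_temporal_bin([])
+             'unknown'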
+         """
+         if not temporal_parts:
+             return "unknown"
+
+         year = month = day = None
+         for part in temporal_parts:
+             if part.startswith("year="):
+                 year = part.split("=")[1]
+             elif part.startswith("month="):
+                 month = part.split("=")[1]
+             elif part.startswith("day="):
+                 day = part.split("=")[1]
+
+         if year and month and day:
+             return f"{year}-{month}-{day}"
+         elif year and month:
+             return f"{year}-{month}"
+         elif year:
+             return year
+         else:
+             return "unknown"
+
+     def _get_global_partitioning_info(self) -> dict[str, Any]:
+         """Get global partitioning configuration."""
+         return {
+             "enabled": self.config.enable_global_partitioning,
+             "threshold": self.config.global_partition_threshold,
+             "description": (
+                 (
+                     "Items spanning more than the threshold number of spatial cells "
+                     "are placed in the 'global' partition instead of individual cells"
+                 )
+                 if self.config.enable_global_partitioning
+                 else "Global partitioning disabled"
+             ),
+         }
+
+     def _get_catalog_statistics(self, partition_stats: dict[str, Any]) -> dict[str, Any]:
+         """Get overall catalog statistics.
+
+         If IngestionStatistics is available, returns comprehensive metrics including:
+         - Unique granule counts (via HyperLogLog)
+         - Overhead/duplication metrics
+         - Spatial and temporal distribution
+         - Data quality metrics
+         - Processing performance
+
+         Falls back to basic partition-derived stats if IngestionStatistics is not provided.
+         """
+         # If we have comprehensive statistics from ingestion, use them
+         if self.stats is not None:
+             return self.stats.get_summary()
+
+         # Fallback: derive basic stats from partition_stats
+         total_items = sum(s.get("total_items", 0) for s in partition_stats.values())
+         new_items = sum(s.get("new_items", 0) for s in partition_stats.values())
+         existing_items = sum(s.get("existing_items", 0) for s in partition_stats.values())
+
+         return {
+             "unique_granules": total_items,  # Best approximation without HyperLogLog
+             "stored_references": total_items,
+             "total_partitions": len(partition_stats),
+             "total_files": len(partition_stats),
+             "overhead": {
+                 "spanning_items": 0,
+                 "spanning_percentage": 0.0,
+                 "duplication_ratio": 1.0,
+                 "overhead_percentage": 0.0,
+                 "avg_tiles_per_spanning_item": 0.0,
+                 "max_tiles_per_item": 0,
+                 "tiles_distribution": {},
+             },
+             "global_partition": {
+                 "items_routed_to_global": 0,
+                 "percentage_global": 0.0,
+             },
+             "spatial": {
+                 "bbox": None,
+                 "cells_with_data": len(partition_stats),
+                 "items_per_cell": None,
+                 "hotspot_cells": [],
+             },
+             "temporal": {
+                 "earliest": None,
+                 "latest": None,
+                 "years_with_data": [],
+                 "distribution": {},
+             },
+             "quality": {
+                 "null_geometries": 0,
+                 "missing_datetime": 0,
+                 "geometry_types": {},
+             },
+             "missions": {},
+             "files": {
+                 "total_count": len(partition_stats),
+                 "total_size_bytes": None,
+                 "size_stats": None,
+                 "items_per_file": None,
+             },
+             "processing": {
+                 "run_timestamp": None,
+                 "duration_seconds": 0,
+                 "urls_processed": 0,
+                 "urls_failed": 0,
+                 "success_rate": 100.0,
+                 "items_per_second": 0.0,
+                 "new_items": new_items,
+                 "existing_items": existing_items,
+                 "duplicates_removed": 0,
+             },
+         }
+
+     def _get_usage_info(self) -> dict[str, Any]:
+         """Get usage examples and recommendations."""
+         return {
+             "file_structure": self._get_file_structure_info(),
+             "spatial_partition_resolution": {
+                 "description": "Automatically resolve spatial partitions that intersect with your area of interest",
+                 "python_example": """
+ from earthcatalog.spatial_resolver import spatial_resolver
+ from shapely.geometry import box
+ import duckdb
+
+ # Load resolver from this schema
+ resolver = spatial_resolver('catalog_schema.json')
+
+ # For remote schemas (requires fsspec):
+ # resolver = spatial_resolver('s3://bucket/catalog_schema.json', 's3://bucket/catalog/')
+ # resolver = spatial_resolver('https://example.com/schema.json', './catalog/')
+
+ # Define area of interest (example: San Francisco Bay Area)
+ aoi = box(-122.5, 37.7, -122.0, 38.0)
+
+ # Resolve intersecting partitions dynamically
+ partition_ids = resolver.resolve_partitions(aoi, overlap=True, buffer_cells=1)
+
+ # Generate query paths with Hive-style temporal filtering
+ # '2024-01' becomes 'year=2024/month=01/items.parquet'
+ query_paths = resolver.generate_query_paths(partition_ids, '2024-01')
+
+ # Query only relevant data - DuckDB skips non-matching temporal directories
+ if query_paths:
+     result = duckdb.sql(f"SELECT * FROM read_parquet({query_paths})").df()
+     print(f"Queried {len(partition_ids)} partitions, found {len(result)} items")
+ """,
+                 "grid_specific_notes": self._get_grid_specific_notes(),
+             },
+             "partition_pruning": {
+                 "description": "Use Hive-style directory structure for automatic partition pruning",
+                 "recommended_approach": "Use SpatialPartitionResolver for automatic spatial and temporal filtering",
+                 "manual_spatial_filter_example": self._get_spatial_filter_example(),
+                 "temporal_filter_note": "Temporal filtering uses Hive-style directories (year=YYYY/month=MM/) for directory-level pruning",
+                 "duckdb_examples": self._get_duckdb_examples(),
+             },
+             "recommended_tools": [
+                 {
+                     "tool": "EarthCatalog SpatialPartitionResolver",
+                     "use_case": "Automatic spatial partition resolution from geometry",
+                     "example": "resolver.resolve_partitions(your_geometry) -> ['partition1', 'partition2']",
+                 },
+                 {
+                     "tool": "DuckDB",
+                     "use_case": "Fast analytical queries with automatic partition pruning",
+                     "example": "SELECT * FROM read_parquet(['catalog/**/year=2024/month=01/items.parquet'])",
+                 },
+                 {
+                     "tool": "Apache Arrow/Parquet",
+                     "use_case": "Column-oriented analysis and filtering",
+                     "example": "Use spatial_id and temporal_bin columns for efficient filtering",
+                 },
+             ],
+         }
+
+     def _get_grid_description(self) -> str:
+         """Get human-readable description of the grid system."""
+         descriptions = {
+             "h3": "Uber H3 hexagonal hierarchical spatial index",
+             "s2": "Google S2 spherical geometry library cells",
+             "mgrs": "Military Grid Reference System (MGRS) grid",
+             "utm": "Universal Transverse Mercator (UTM) grid zones",
+             "latlon": "Simple latitude-longitude rectangular grid",
+             "itslive": "ITS_LIVE center-based 10-degree grid for glacier/ice datasets",
+             "geojson": "Custom grid defined by GeoJSON features",
+         }
+         return descriptions.get(self.config.grid_system, f"Unknown grid system: {self.config.grid_system}")
+
+     def _get_h3_resolution_description(self, resolution: int) -> str:
+         """Get description of H3 resolution level."""
+         descriptions = {
+             0: "Very coarse - continents/countries",
+             1: "Large countries/regions",
+             2: "Countries/large states",
+             3: "States/provinces",
+             4: "Large counties/regions",
+             5: "Counties/metropolitan areas",
+             6: "Cities/large municipalities",
+             7: "City districts/neighborhoods",
+             8: "Neighborhoods/census blocks",
+             9: "City blocks/large buildings",
+             10: "Buildings/small areas",
+             11: "Building parts/rooms",
+             12: "Very fine - room-level detail",
+             13: "Sub-room level",
+             14: "Very high precision",
+             15: "Highest precision",
+         }
+         return descriptions.get(resolution, f"Resolution {resolution}")
+
+     def _get_h3_average_area(self, resolution: int) -> float | None:
+         """Get average H3 cell area in km²."""
+         # Approximate areas from H3 documentation
+         areas = {
+             0: 4250546.848,
+             1: 607220.9782,
+             2: 86745.85403,
+             3: 12392.26486,
+             4: 1770.323552,
+             5: 252.9033645,
+             6: 36.1290521,
+             7: 5.1612932,
+             8: 0.7373276,
+             9: 0.1053325,
+             10: 0.0150475,
+             11: 0.0021496,
+             12: 0.0003071,
+             13: 0.0000439,
+             14: 0.0000063,
+             15: 0.0000009,
+         }
+         return areas.get(resolution)
+
+     def _get_h3_average_edge_length(self, resolution: int) -> float | None:
+         """Get average H3 cell edge length in km."""
+         # Approximate edge lengths from H3 documentation
+         edges = {
+             0: 1107.712591,
+             1: 418.6760055,
+             2: 158.2446558,
+             3: 59.81085794,
+             4: 22.6063794,
+             5: 8.544408276,
+             6: 3.229953667,
+             7: 1.220629759,
+             8: 0.461354684,
+             9: 0.174375668,
+             10: 0.065907807,
+             11: 0.024910561,
+             12: 0.009415526,
+             13: 0.003559893,
+             14: 0.001348575,
+             15: 0.000509713,
+         }
+         return edges.get(resolution)
+
+     def _get_s2_level_description(self, level: int) -> str:
+         """Get description of S2 level."""
+         if level <= 3:
+             return f"Very coarse - level {level}"
+         elif level <= 6:
+             return f"Coarse - level {level}"
+         elif level <= 10:
+             return f"Medium - level {level}"
+         elif level <= 15:
+             return f"Fine - level {level}"
+         else:
+             return f"Very fine - level {level}"
+
+     def _get_s2_average_area(self, level: int) -> float | None:
+         """Get approximate S2 cell area in km²."""
+         # S2 cells get 4x smaller each level
+         base_area = 85011012.19  # Area at level 0 in km²
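+         # Worked example (orientation only): level 10 -> 85011012.19 / 4**10 ≈ 81.07 km²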
+         return float(base_area / (4**level))
+
+     def _get_mgrs_precision_description(self, precision: int) -> str:
+         """Get MGRS precision description."""
+         descriptions = {
+             1: "100km x 100km grid squares",
+             2: "10km x 10km grid squares",
+             3: "1km x 1km grid squares",
+             4: "100m x 100m grid squares",
+             5: "10m x 10m grid squares",
+             6: "1m x 1m grid squares",
+         }
+         return descriptions.get(precision, f"Precision {precision}")
+
+     def _get_utm_precision_description(self, precision: int) -> str:
+         """Get UTM precision description."""
+         descriptions = {
+             1: "UTM zones (6° wide)",
+             2: "100km x 100km squares",
+             3: "10km x 10km squares",
+             4: "1km x 1km squares",
+             5: "100m x 100m squares",
+         }
+         return descriptions.get(precision, f"Precision {precision}")
+
+     def _get_custom_tiles_info(self) -> dict[str, Any] | None:
+         """Get information about custom tiles if using GeoJSON grid."""
+         if not hasattr(self.config, "geojson_path") or not self.config.geojson_path:
+             return None
+
+         try:
+             # Try to read the GeoJSON file to get tile information
+             if self.config.geojson_path.startswith("s3://"):
+                 # Would need to implement S3 reading for geojson
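+                 # (fsspec.open() could read s3:// paths here just like local ones;
+                 # left as a note since this module does not currently depend on fsspec)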
+                 return {"note": "Custom GeoJSON tiles (S3 path cannot be analyzed)"}
+             else:
+                 with open(self.config.geojson_path) as f:
+                     geojson_data = json.load(f)
+
+                 features = geojson_data.get("features", [])
+                 tile_ids = [f.get("properties", {}).get("id") for f in features]
+                 tile_ids = [tid for tid in tile_ids if tid]  # Filter None values
+
+                 return {"total_tiles": len(features), "tile_ids": tile_ids, "source_file": self.config.geojson_path}
+         except (OSError, json.JSONDecodeError, ValueError, TypeError) as e:
+             logger.warning(f"Could not read custom GeoJSON file: {e}")
+             return {"note": "Custom GeoJSON tiles (file could not be read)"}
+
+     def _get_spatial_filter_example(self) -> str:
+         """Get example of spatial filtering based on grid system."""
+         if self.config.grid_system == "h3":
+             return "spatial_id IN ('8a2a1072b59ffff', '8a2a1072b5bffff', ...)"
+         elif self.config.grid_system == "itslive":
+             return "spatial_id IN ('N60W040', 'N60W030', 'N70W040', ...)"
+         elif self.config.grid_system == "geojson":
+             return "spatial_id IN ('region_a', 'region_b', ...)"
+         else:
+             return f"spatial_id IN ('tile_1', 'tile_2', ...) -- {self.config.grid_system} tile IDs"
+
+     def _get_duckdb_examples(self) -> list[dict[str, str]]:
+         """Get DuckDB query examples for partition pruning."""
+         examples = []
+
+         # Dynamic spatial resolution example (recommended approach)
+         global_note = ""
+         if self.config.enable_global_partitioning:
+             global_note = f"""
+ # NOTE: Large queries automatically include 'global' partition
+ # Threshold: {self.config.global_partition_threshold} cells
+ # Items spanning > threshold cells are stored in global/"""
+
+         examples.append(
+             {
+                 "description": "Dynamic spatial partition resolution with global partition support (RECOMMENDED)",
+                 "query": f"""
+ # Python code using SpatialPartitionResolver
+ from earthcatalog.spatial_resolver import spatial_resolver
+ from shapely.geometry import box
+
+ # Load resolver from schema
+ resolver = spatial_resolver('catalog_schema.json')
+
+ # Small area (city-scale) - no global partition needed
+ small_aoi = box(-122.5, 37.7, -122.0, 38.0)  # San Francisco
+ small_partitions = resolver.resolve_partitions(small_aoi, overlap=True)
+ print(f"Small query: {{len(small_partitions)}} partitions, global: {{'global' in small_partitions}}")
+
+ # Large area (state-scale) - automatically includes global partition!
+ large_aoi = box(-124.0, 32.0, -114.0, 42.0)  # California
+ large_partitions = resolver.resolve_partitions(large_aoi, overlap=True)
+ print(f"Large query: {{len(large_partitions)}} partitions, global: {{'global' in large_partitions}}")
+
+ # Generate Hive-style query paths with temporal filter
+ # '2024-01' becomes 'year=2024/month=01/items.parquet'
+ query_patterns = resolver.generate_query_paths(large_partitions, '2024-01')
+
+ # Query captures both spatial cells AND global partition items
+ # DuckDB skips non-matching temporal directories (directory-level pruning)
+ import duckdb
+ result = duckdb.sql(f"SELECT * FROM read_parquet({{query_patterns}})").df(){global_note}
+ """.strip(),
+             }
+         )
+
+         # Manual spatial filter (fallback)
+         examples.append(
+             {
+                 "description": "Manual spatial partition filter (if you know the partition IDs)",
+                 "query": """
+ SELECT * FROM read_parquet('catalog/**/items.parquet')
+ WHERE spatial_id IN ('8a2a1072b59ffff', '8a2a1072b5bffff')
+ """.strip(),
+             }
+         )
+
+         # Temporal filter using directory paths
+         examples.append(
+             {
+                 "description": f"Filter by temporal range using Hive-style paths ({self.config.temporal_bin})",
+                 "query": """
+ -- Use glob patterns for directory-level temporal pruning (most efficient)
+ SELECT * FROM read_parquet('catalog/**/year=2024/month=*/items.parquet')
+
+ -- Or query all and filter by datetime column (less efficient, but more flexible)
+ SELECT * FROM read_parquet('catalog/**/items.parquet')
+ WHERE datetime >= '2024-01-01' AND datetime < '2025-01-01'
+ """.strip(),
+             }
+         )
+
+         # Combined with spatial intersection
+         examples.append(
+             {
+                 "description": "Combine partition pruning with geometric intersection",
+                 "query": """
+ -- First use SpatialPartitionResolver to get relevant partitions, then:
+ -- The resolver returns Hive-style paths like 'catalog/.../year=2024/month=06/items.parquet'
+ SELECT * FROM read_parquet('catalog/[resolved_partitions]/**/year=2024/month=06/items.parquet')
+ WHERE ST_Intersects(geometry, ST_GeomFromText('POLYGON((-122.5 37.7, -122.0 37.7, -122.0 38.0, -122.5 38.0, -122.5 37.7))'))
+ """.strip(),
+             }
+         )
+
+         # Global partition query
+         if self.config.enable_global_partitioning:
+             examples.append(
+                 {
+                     "description": "Query items that span multiple spatial partitions",
+                     "query": """
+ -- Query global partition with Hive-style temporal directory
+ SELECT * FROM read_parquet('catalog/**/global/year=2024/month=01/items.parquet')
+ """.strip(),
+                 }
+             )
+
+         return examples
+
+     def _get_grid_specific_notes(self) -> dict[str, str]:
+         """Get grid-specific notes for spatial resolution."""
+         notes = {
+             "h3": "H3 cells use hexagonal geometry. Use overlap=True to include boundary cells.",
+             "s2": "S2 cells use spherical geometry. Higher levels provide finer resolution.",
+             "mgrs": "MGRS uses the military grid system. Precision affects cell size.",
+             "utm": "UTM zones are 6° wide. Consider zone boundaries for large areas.",
+             "latlon": "Simple rectangular grid. Cell size is in degrees.",
+             "itslive": "ITS_LIVE uses fixed 10° cells with center-based naming. Optimized for glacier datasets.",
+             "geojson": "Custom geometry tiles. Intersection depends on your tile definitions.",
+         }
+         return {self.config.grid_system: notes.get(self.config.grid_system, "Grid-specific resolution")}
+
+     def _get_file_structure_info(self) -> dict[str, str | list[str]]:
+         """Get file structure description and examples."""
+         ext = getattr(self.config, "output_format", "geoparquet")
+         ext_suffix = "parquet" if ext == "geoparquet" else ext
+
+         # Describe Hive-style temporal partitioning
+         temporal_desc = {
+             "year": "year={YYYY}",
+             "month": "year={YYYY}/month={MM}",
+             "day": "year={YYYY}/month={MM}/day={DD}",
+         }.get(self.config.temporal_bin, "year={YYYY}/month={MM}")
+
+         description = (
+             f"Catalog uses Hive-style partitioning: "
+             f"{{mission}}/partition={{grid_type}}/level={{resolution}}/{{spatial_id}}/{temporal_desc}/items.{ext_suffix}"
+         )
+
+         grid_type = self.config.grid_system
+         resolution = self.config.grid_resolution
+
+         if self.config.grid_system == "h3":
+             if self.config.temporal_bin == "day":
+                 example_paths = [
+                     f"sentinel2/partition={grid_type}/level={resolution}/8a2a1072b59ffff/year=2024/month=01/day=15/items.{ext_suffix}",
+                     f"landsat8/partition={grid_type}/level={resolution}/8a2a1072b5bffff/year=2024/month=01/day=15/items.{ext_suffix}",
+                 ]
+             elif self.config.temporal_bin == "year":
+                 example_paths = [
+                     f"sentinel2/partition={grid_type}/level={resolution}/8a2a1072b59ffff/year=2024/items.{ext_suffix}",
+                     f"landsat8/partition={grid_type}/level={resolution}/8a2a1072b5bffff/year=2024/items.{ext_suffix}",
+                 ]
+             else:  # month (default)
+                 example_paths = [
+                     f"sentinel2/partition={grid_type}/level={resolution}/8a2a1072b59ffff/year=2024/month=01/items.{ext_suffix}",
+                     f"landsat8/partition={grid_type}/level={resolution}/8a2a1072b5bffff/year=2024/month=01/items.{ext_suffix}",
+                 ]
+         else:
+             if self.config.temporal_bin == "day":
+                 example_paths = [
+                     f"mission_a/partition={grid_type}/level={resolution}/tile_001/year=2024/month=01/day=15/items.{ext_suffix}",
+                     f"mission_b/partition={grid_type}/level={resolution}/tile_002/year=2024/month=01/day=15/items.{ext_suffix}",
+                 ]
+             elif self.config.temporal_bin == "year":
+                 example_paths = [
+                     f"mission_a/partition={grid_type}/level={resolution}/tile_001/year=2024/items.{ext_suffix}",
+                     f"mission_b/partition={grid_type}/level={resolution}/tile_002/year=2024/items.{ext_suffix}",
+                 ]
+             else:  # month (default)
+                 example_paths = [
+                     f"mission_a/partition={grid_type}/level={resolution}/tile_001/year=2024/month=01/items.{ext_suffix}",
+                     f"mission_b/partition={grid_type}/level={resolution}/tile_002/year=2024/month=01/items.{ext_suffix}",
+                 ]
+
+         if self.config.enable_global_partitioning:
+             if self.config.temporal_bin == "day":
+                 example_paths.append(
+                     f"sentinel2/partition={grid_type}/level={resolution}/global/year=2024/month=01/day=15/items.{ext_suffix}"
+                 )
+             elif self.config.temporal_bin == "year":
+                 example_paths.append(
+                     f"sentinel2/partition={grid_type}/level={resolution}/global/year=2024/items.{ext_suffix}"
+                 )
+             else:
+                 example_paths.append(
+                     f"sentinel2/partition={grid_type}/level={resolution}/global/year=2024/month=01/items.{ext_suffix}"
+                 )
+
+         return {
+             "description": description,
+             "example_paths": example_paths,
+             "temporal_partitioning": f"Hive-style ({self.config.temporal_bin} granularity)",
+             "pruning_benefit": "DuckDB/Athena/Spark skip entire directories during temporal filtering",
+         }
+
+     def _write_schema(self, schema: dict[str, Any], filename: str) -> None:
+         """Write schema to the output location."""
+         schema_content = json.dumps(schema, indent=2, ensure_ascii=False)
+
+         # Build full path to schema file
+         full_path = f"{self.config.output_catalog}/{filename}"
+
+         try:
+             # Use the storage backend to write the schema; open in binary mode
+             # since we hand it UTF-8 encoded bytes
+             self.storage.makedirs(Path(self.config.output_catalog))
+             with self.storage.open(full_path, "wb") as f:
+                 f.write(schema_content.encode("utf-8"))
+             logger.info(f"Schema written to: {full_path}")
+         except (OSError, ValueError, TypeError, RuntimeError) as e:
+             logger.error(f"Failed to write schema using storage backend: {e}")
+             # Fall back to a local write if the storage backend fails
+             try:
+                 output_path = Path(self.config.output_catalog) / filename
+                 output_path.parent.mkdir(parents=True, exist_ok=True)
+                 with open(output_path, "w", encoding="utf-8") as f:
+                     f.write(schema_content)
+                 logger.info(f"Schema written to local path: {output_path}")
+             except (OSError, ValueError, TypeError) as fallback_error:
+                 logger.error(f"Failed to write schema to local path: {fallback_error}")
+                 raise