earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/validation.py
@@ -0,0 +1,921 @@
"""STAC GeoParquet Validation Module.

This module provides validation functions for STAC items during ingestion
and for verifying GeoParquet files conform to the GeoParquet specification.

Validation Levels:
- **On-ingest**: Validates STAC items before writing (geometry validity, bbox consistency)
- **Post-hoc**: Validates existing GeoParquet files for spec compliance

Key Features:
- Geometry validity checking (self-intersection, ring orientation)
- Bbox-geometry consistency validation
- GeoParquet geo metadata verification
- CRS validation (EPSG:4326 expected for STAC)
- Covering/bbox metadata validation

Usage:
    >>> from earthcatalog.validation import (
    ...     validate_stac_item,
    ...     validate_geoparquet_file,
    ...     validate_catalog,
    ...     ValidationResult,
    ... )
    >>>
    >>> # Validate a single STAC item
    >>> result, corrected = validate_stac_item(item)
    >>> if not result.is_valid:
    ...     print(f"Errors: {result.errors}")
    >>>
    >>> # Validate a GeoParquet file
    >>> result = validate_geoparquet_file("path/to/file.parquet")
    >>> print(result.summary())
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


# =============================================================================
# Validation Result Data Classes
# =============================================================================


@dataclass
class ValidationIssue:
    """A single validation issue (warning or error)."""

    level: str  # 'warning' or 'error'
    code: str  # e.g., 'INVALID_GEOMETRY', 'BBOX_MISMATCH'
    message: str
    context: dict[str, Any] = field(default_factory=dict)

    def __str__(self) -> str:
        return f"[{self.level.upper()}] {self.code}: {self.message}"


@dataclass
class ValidationResult:
    """Result of a validation check."""

    is_valid: bool
    issues: list[ValidationIssue] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def warnings(self) -> list[ValidationIssue]:
        """Get all warning-level issues."""
        return [i for i in self.issues if i.level == "warning"]

    @property
    def errors(self) -> list[ValidationIssue]:
        """Get all error-level issues."""
        return [i for i in self.issues if i.level == "error"]

    def add_warning(self, code: str, message: str, **context: Any) -> None:
        """Add a warning to the result."""
        self.issues.append(ValidationIssue("warning", code, message, context))

    def add_error(self, code: str, message: str, **context: Any) -> None:
        """Add an error to the result."""
        self.issues.append(ValidationIssue("error", code, message, context))
        self.is_valid = False

    def merge(self, other: ValidationResult) -> ValidationResult:
        """Merge another validation result into this one."""
        self.issues.extend(other.issues)
        self.is_valid = self.is_valid and other.is_valid
        self.metadata.update(other.metadata)
        return self

    def summary(self) -> str:
        """Generate a summary of the validation result."""
        lines = [
            f"Valid: {self.is_valid}",
            f"Warnings: {len(self.warnings)}",
            f"Errors: {len(self.errors)}",
        ]
        if self.issues:
            lines.append("\nIssues:")
            for issue in self.issues:
                lines.append(f"  - {issue}")
        return "\n".join(lines)


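# A minimal sketch of the result API above: warnings accumulate without
# flipping is_valid, errors flip it, and merge() folds one result into
# another. The codes and messages here are illustrative only.
#
#   >>> r = ValidationResult(is_valid=True)
#   >>> r.add_warning("MISSING_VERSION", "no 'version' field")
#   >>> r.is_valid
#   True
#   >>> other = ValidationResult(is_valid=True)
#   >>> other.add_error("NO_GEO_METADATA", "missing 'geo' key")
#   >>> r.merge(other).is_valid
#   False
#   >>> len(r.issues)
#   2
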
@dataclass
class CatalogValidationResult:
    """Result of validating an entire catalog."""

    total_files: int = 0
    valid_files: int = 0
    invalid_files: int = 0
    file_results: dict[str, ValidationResult] = field(default_factory=dict)
    warnings_count: int = 0
    errors_count: int = 0

    def add_file_result(self, path: str, result: ValidationResult) -> None:
        """Add a file validation result."""
        self.total_files += 1
        self.file_results[path] = result
        if result.is_valid:
            self.valid_files += 1
        else:
            self.invalid_files += 1
        self.warnings_count += len(result.warnings)
        self.errors_count += len(result.errors)

    @property
    def is_valid(self) -> bool:
        """Check if entire catalog is valid."""
        return self.invalid_files == 0

    def summary(self) -> str:
        """Generate a summary of the catalog validation."""
        lines = [
            f"Total files: {self.total_files}",
            f"Valid files: {self.valid_files}",
            f"Invalid files: {self.invalid_files}",
            f"Total warnings: {self.warnings_count}",
            f"Total errors: {self.errors_count}",
        ]
        if self.invalid_files > 0:
            lines.append("\nInvalid files:")
            for path, result in self.file_results.items():
                if not result.is_valid:
                    lines.append(f"  - {path}")
                    for issue in result.errors:
                        lines.append(f"    {issue}")
        return "\n".join(lines)


# =============================================================================
# STAC Item Validation (On-Ingest)
# =============================================================================


def validate_stac_item(
    item: dict[str, Any],
    fix_geometry: bool = True,
    bbox_tolerance: float = 1e-6,
) -> tuple[ValidationResult, dict[str, Any] | None]:
    """Validate a STAC item before ingestion.

    Performs standard validation including:
    - Required fields check (id, type, geometry, properties)
    - Geometry validity (self-intersection, ring orientation)
    - Bbox-geometry consistency

    Args:
        item: STAC item dictionary
        fix_geometry: If True, attempt to fix invalid geometries
        bbox_tolerance: Tolerance for bbox comparison (in degrees)

    Returns:
        Tuple of (ValidationResult, corrected_item or None if unfixable)
    """
    from shapely import make_valid
    from shapely.geometry import shape
    from shapely.validation import explain_validity

    result = ValidationResult(is_valid=True)
    corrected_item = item.copy()

    # Check required fields
    required_fields = ["id", "type", "geometry", "properties"]
    for field_name in required_fields:
        if field_name not in item:
            result.add_warning(
                "MISSING_FIELD",
                f"Missing required STAC field: {field_name}",
                field=field_name,
            )

    # Check type is Feature
    if item.get("type") != "Feature":
        result.add_warning(
            "INVALID_TYPE",
            f"STAC item type should be 'Feature', got: {item.get('type')}",
            expected="Feature",
            actual=item.get("type"),
        )

    # Validate geometry
    geom_dict = item.get("geometry")
    if geom_dict is None:
        result.add_warning(
            "NULL_GEOMETRY",
            "STAC item has null geometry",
            item_id=item.get("id"),
        )
    else:
        try:
            geom = shape(geom_dict)

            # Check geometry validity
            if not geom.is_valid:
                reason = explain_validity(geom)
                result.add_warning(
                    "INVALID_GEOMETRY",
                    f"Geometry is invalid: {reason}",
                    item_id=item.get("id"),
                    reason=reason,
                )

                if fix_geometry:
                    # Attempt to fix the geometry
                    fixed_geom = make_valid(geom)
                    if fixed_geom.is_valid:
                        corrected_item["geometry"] = fixed_geom.__geo_interface__
                        result.metadata["geometry_fixed"] = True
                        logger.debug(f"Fixed invalid geometry for item {item.get('id')}")
                    else:
                        result.add_warning(
                            "UNFIXABLE_GEOMETRY",
                            "Could not fix invalid geometry",
                            item_id=item.get("id"),
                        )

            # Check bbox consistency
            bbox = item.get("bbox")
            if bbox is not None:
                geom_to_check = shape(corrected_item.get("geometry", geom_dict)) if fix_geometry else geom
                bbox_result = _validate_bbox_geometry_consistency(bbox, geom_to_check, bbox_tolerance)
                result.merge(bbox_result)

                # If bbox doesn't match, compute the correct one
                if not bbox_result.is_valid or bbox_result.warnings:
                    computed_bbox = list(geom_to_check.bounds)
                    corrected_item["bbox"] = computed_bbox
                    result.metadata["bbox_corrected"] = True
                    result.metadata["original_bbox"] = bbox
                    result.metadata["computed_bbox"] = computed_bbox

        except (ValueError, TypeError, AttributeError) as e:
            result.add_warning(
                "GEOMETRY_PARSE_ERROR",
                f"Failed to parse geometry: {e}",
                item_id=item.get("id"),
                error=str(e),
            )

    # Validate datetime
    props = item.get("properties", {})
    datetime_val = props.get("datetime")
    if datetime_val is None and not (props.get("start_datetime") and props.get("end_datetime")):
        result.add_warning(
            "MISSING_DATETIME",
            "STAC item missing datetime (and no start/end_datetime range)",
            item_id=item.get("id"),
        )

    return result, corrected_item


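# A minimal sketch of the on-ingest path, assuming shapely is installed. The
# item below is hypothetical: a self-intersecting "bowtie" ring paired with a
# deliberately undersized bbox, so both fix-ups should trigger.
#
#   >>> item = {
#   ...     "id": "demo-item",
#   ...     "type": "Feature",
#   ...     "geometry": {
#   ...         "type": "Polygon",
#   ...         "coordinates": [[[0, 0], [2, 2], [2, 0], [0, 2], [0, 0]]],
#   ...     },
#   ...     "bbox": [0, 0, 1, 1],
#   ...     "properties": {"datetime": "2024-01-01T00:00:00Z"},
#   ... }
#   >>> result, fixed = validate_stac_item(item)
#   >>> result.is_valid            # warnings only, so still True
#   True
#   >>> [i.code for i in result.warnings]
#   ['INVALID_GEOMETRY', 'BBOX_MISMATCH']
#   >>> fixed["bbox"]              # recomputed from the repaired geometry
#   [0.0, 0.0, 2.0, 2.0]
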
def _validate_bbox_geometry_consistency(
    bbox: list[float],
    geometry,
    tolerance: float = 1e-6,
) -> ValidationResult:
    """Validate that bbox matches geometry bounds.

    Args:
        bbox: [minx, miny, maxx, maxy] or [minx, miny, minz, maxx, maxy, maxz]
        geometry: Shapely geometry object
        tolerance: Tolerance for comparison in degrees

    Returns:
        ValidationResult with any bbox issues
    """
    result = ValidationResult(is_valid=True)

    if len(bbox) == 4:
        minx, miny, maxx, maxy = bbox
    elif len(bbox) == 6:
        minx, miny, _minz, maxx, maxy, _maxz = bbox
    else:
        result.add_warning(
            "INVALID_BBOX_LENGTH",
            f"Bbox should have 4 or 6 elements, got {len(bbox)}",
            bbox=bbox,
        )
        return result

    # Get geometry bounds
    geom_minx, geom_miny, geom_maxx, geom_maxy = geometry.bounds

    # Check if bbox contains the geometry (with tolerance)
    issues = []
    if minx - geom_minx > tolerance:
        issues.append(f"bbox minx ({minx}) > geometry minx ({geom_minx})")
    if miny - geom_miny > tolerance:
        issues.append(f"bbox miny ({miny}) > geometry miny ({geom_miny})")
    if geom_maxx - maxx > tolerance:
        issues.append(f"bbox maxx ({maxx}) < geometry maxx ({geom_maxx})")
    if geom_maxy - maxy > tolerance:
        issues.append(f"bbox maxy ({maxy}) < geometry maxy ({geom_maxy})")

    if issues:
        result.add_warning(
            "BBOX_MISMATCH",
            f"Bbox does not match geometry bounds: {'; '.join(issues)}",
            bbox=bbox,
            geometry_bounds=[geom_minx, geom_miny, geom_maxx, geom_maxy],
        )

    return result

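# A short sketch of the containment rule above: the bbox must contain the
# geometry bounds to within `tolerance`, so a bbox that is too tight on any
# side is flagged, while an oversized bbox passes.
#
#   >>> from shapely.geometry import box
#   >>> r = _validate_bbox_geometry_consistency([0, 0, 1, 1], box(0, 0, 2, 1))
#   >>> [i.code for i in r.warnings]
#   ['BBOX_MISMATCH']
#   >>> r = _validate_bbox_geometry_consistency([-1, -1, 3, 2], box(0, 0, 2, 1))
#   >>> r.warnings
#   []
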

def validate_stac_items_batch(
    items: list[dict[str, Any]],
    fix_geometry: bool = True,
    bbox_tolerance: float = 1e-6,
) -> tuple[list[ValidationResult], list[dict[str, Any]]]:
    """Validate a batch of STAC items.

    Args:
        items: List of STAC item dictionaries
        fix_geometry: If True, attempt to fix invalid geometries
        bbox_tolerance: Tolerance for bbox comparison

    Returns:
        Tuple of (list of ValidationResults, list of corrected items)
    """
    results = []
    corrected_items = []

    for item in items:
        result, corrected = validate_stac_item(item, fix_geometry, bbox_tolerance)
        results.append(result)
        if corrected is not None:
            corrected_items.append(corrected)

    return results, corrected_items


# =============================================================================
# GeoParquet File Validation (Post-hoc)
# =============================================================================


def validate_geoparquet_file(
    file_path: str | Path,
    expected_crs: str = "EPSG:4326",
) -> ValidationResult:
    """Validate a GeoParquet file for spec compliance.

    Checks:
    - File is readable as Parquet
    - Has valid 'geo' metadata in schema
    - Primary geometry column is properly defined
    - CRS is correctly specified
    - Covering/bbox metadata is present (if applicable)

    Args:
        file_path: Path to the GeoParquet file
        expected_crs: Expected CRS (default: EPSG:4326 for STAC)

    Returns:
        ValidationResult with any issues found
    """
    import pyarrow.parquet as pq

    result = ValidationResult(is_valid=True)
    file_path = Path(file_path)

    if not file_path.exists():
        result.add_error(
            "FILE_NOT_FOUND",
            f"File does not exist: {file_path}",
            path=str(file_path),
        )
        return result

    try:
        # Read parquet metadata
        parquet_file = pq.ParquetFile(file_path)
        schema = parquet_file.schema_arrow
        metadata = schema.metadata

        result.metadata["num_rows"] = parquet_file.metadata.num_rows
        result.metadata["num_columns"] = len(schema)

        # Check for geo metadata
        geo_result = _validate_geo_metadata(metadata, expected_crs)
        result.merge(geo_result)

        # Validate geometry column exists and has correct type
        if geo_result.metadata.get("primary_column"):
            geom_col = geo_result.metadata["primary_column"]
            geom_col_result = _validate_geometry_column(schema, geom_col)
            result.merge(geom_col_result)

        # Validate covering bbox if present
        if geo_result.metadata.get("has_covering"):
            covering_result = _validate_covering_metadata(geo_result.metadata.get("geo_metadata", {}), file_path)
            result.merge(covering_result)

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        result.add_error(
            "PARQUET_READ_ERROR",
            f"Failed to read parquet file: {e}",
            path=str(file_path),
            error=str(e),
        )

    return result


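# A minimal post-hoc sketch; "items.parquet" below is a hypothetical path to
# a file produced by the ingestion pipeline.
#
#   >>> result = validate_geoparquet_file("items.parquet", expected_crs="EPSG:4326")
#   >>> if not result.is_valid:
#   ...     print(result.summary())
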
def _validate_geo_metadata(
    metadata: dict[bytes, bytes] | None,
    expected_crs: str = "EPSG:4326",
) -> ValidationResult:
    """Validate the 'geo' metadata in a GeoParquet file.

    Args:
        metadata: Parquet schema metadata
        expected_crs: Expected CRS string

    Returns:
        ValidationResult with geo metadata validation results
    """
    result = ValidationResult(is_valid=True)

    if metadata is None:
        result.add_error(
            "NO_SCHEMA_METADATA",
            "Parquet file has no schema metadata",
        )
        return result

    # Check for 'geo' key
    geo_bytes = metadata.get(b"geo")
    if geo_bytes is None:
        result.add_error(
            "NO_GEO_METADATA",
            "GeoParquet file missing required 'geo' metadata key",
        )
        return result

    try:
        geo_metadata = json.loads(geo_bytes.decode("utf-8"))
        result.metadata["geo_metadata"] = geo_metadata
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        result.add_error(
            "INVALID_GEO_JSON",
            f"Failed to parse 'geo' metadata as JSON: {e}",
            error=str(e),
        )
        return result

    # Validate version (optional but recommended)
    version = geo_metadata.get("version")
    if version:
        result.metadata["geoparquet_version"] = version
    else:
        result.add_warning(
            "MISSING_VERSION",
            "GeoParquet 'geo' metadata missing 'version' field",
        )

    # Validate primary_column
    primary_column = geo_metadata.get("primary_column")
    if not primary_column:
        result.add_error(
            "MISSING_PRIMARY_COLUMN",
            "GeoParquet 'geo' metadata missing 'primary_column' field",
        )
    else:
        result.metadata["primary_column"] = primary_column

    # Validate columns
    columns = geo_metadata.get("columns", {})
    if not columns:
        result.add_error(
            "MISSING_COLUMNS",
            "GeoParquet 'geo' metadata missing 'columns' field",
        )
    elif primary_column and primary_column not in columns:
        result.add_error(
            "PRIMARY_COLUMN_NOT_IN_COLUMNS",
            f"Primary column '{primary_column}' not found in columns metadata",
            primary_column=primary_column,
            available_columns=list(columns.keys()),
        )

    # Validate CRS for primary column
    if primary_column and primary_column in columns:
        col_meta = columns[primary_column]

        # Check encoding (case-insensitive)
        encoding = col_meta.get("encoding")
        if encoding:
            result.metadata["geometry_encoding"] = encoding
            valid_encodings = {
                "wkb",
                "point",
                "linestring",
                "polygon",
                "multipoint",
                "multilinestring",
                "multipolygon",
            }
            if encoding.lower() not in valid_encodings:
                result.add_warning(
                    "UNKNOWN_ENCODING",
                    f"Unknown geometry encoding: {encoding}",
                    encoding=encoding,
                )
        else:
            result.add_warning(
                "MISSING_ENCODING",
                f"Column '{primary_column}' missing 'encoding' field",
            )

        # Check CRS
        crs = col_meta.get("crs")
        if crs is None:
            # CRS can be null for "OGC:CRS84" equivalent
            result.metadata["crs"] = None
            result.add_warning(
                "NULL_CRS",
                "CRS is null (interpreted as OGC:CRS84/WGS84)",
            )
        elif isinstance(crs, dict):
            # PROJJSON format
            result.metadata["crs"] = crs
            crs_id = crs.get("id", {})
            crs_code = f"{crs_id.get('authority', '')}:{crs_id.get('code', '')}"
            if crs_code and crs_code != expected_crs and crs_code != ":":
                # Also check for EPSG:4326 in various formats
                if not _crs_matches_expected(crs, expected_crs):
                    result.add_warning(
                        "UNEXPECTED_CRS",
                        f"CRS '{crs_code}' does not match expected '{expected_crs}'",
                        actual_crs=crs,
                        expected_crs=expected_crs,
                    )
        elif isinstance(crs, str):
            result.metadata["crs"] = crs
            if crs != expected_crs:
                result.add_warning(
                    "UNEXPECTED_CRS",
                    f"CRS '{crs}' does not match expected '{expected_crs}'",
                    actual_crs=crs,
                    expected_crs=expected_crs,
                )

        # Check for covering/bbox
        covering = col_meta.get("covering")
        if covering:
            result.metadata["has_covering"] = True
            result.metadata["covering"] = covering
        else:
            result.metadata["has_covering"] = False
            # Covering is optional but recommended for performance
            result.add_warning(
                "MISSING_COVERING",
                f"Column '{primary_column}' missing 'covering' (bbox) metadata - "
                "recommended for spatial query performance",
            )

        # Check geometry_types
        geometry_types = col_meta.get("geometry_types")
        if geometry_types:
            result.metadata["geometry_types"] = geometry_types
        else:
            result.add_warning(
                "MISSING_GEOMETRY_TYPES",
                f"Column '{primary_column}' missing 'geometry_types' field",
            )

    return result


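# A sketch of the smallest 'geo' payload this check accepts without errors;
# the schema-metadata dict below is constructed by hand rather than read from
# a real file.
#
#   >>> geo = {
#   ...     "version": "1.0.0",
#   ...     "primary_column": "geometry",
#   ...     "columns": {
#   ...         "geometry": {
#   ...             "encoding": "WKB",
#   ...             "geometry_types": ["Polygon"],
#   ...             "crs": "EPSG:4326",
#   ...         }
#   ...     },
#   ... }
#   >>> r = _validate_geo_metadata({b"geo": json.dumps(geo).encode()})
#   >>> r.is_valid
#   True
#   >>> [i.code for i in r.warnings]      # covering is optional
#   ['MISSING_COVERING']
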
def _crs_matches_expected(crs: dict | str, expected: str) -> bool:
    """Check if a CRS matches the expected value.

    Handles various CRS representations (PROJJSON, WKT, EPSG codes).
    """
    if isinstance(crs, str):
        return crs == expected

    if isinstance(crs, dict):
        # Check PROJJSON id
        crs_id = crs.get("id", {})
        authority = crs_id.get("authority", "")
        code = crs_id.get("code", "")
        if f"{authority}:{code}" == expected:
            return True

        # Check for WGS 84 / EPSG:4326 equivalents
        if expected == "EPSG:4326":
            name = crs.get("name", "").lower()
            if "wgs 84" in name or "wgs84" in name:
                return True
            if authority == "EPSG" and str(code) == "4326":
                return True
            if authority == "OGC" and code == "CRS84":
                return True

    return False

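# A couple of sketch inputs for the matcher above: a PROJJSON-style dict with
# an EPSG id, the name-only WGS 84 fallback, and a plain string mismatch.
#
#   >>> _crs_matches_expected({"id": {"authority": "EPSG", "code": 4326}}, "EPSG:4326")
#   True
#   >>> _crs_matches_expected({"name": "WGS 84 (CRS84)"}, "EPSG:4326")
#   True
#   >>> _crs_matches_expected("EPSG:3857", "EPSG:4326")
#   False
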

def _validate_geometry_column(schema, column_name: str) -> ValidationResult:
    """Validate that the geometry column exists and has correct type."""
    result = ValidationResult(is_valid=True)

    # Look up the column by name; pyarrow raises KeyError if it is absent
    try:
        field = schema.field(column_name)
        result.metadata["geometry_column_type"] = str(field.type)

        # GeoParquet uses binary (WKB) encoding
        if str(field.type) not in ("binary", "large_binary"):
            result.add_warning(
                "UNEXPECTED_GEOMETRY_TYPE",
                f"Geometry column has type '{field.type}', expected 'binary' (WKB)",
                column=column_name,
                actual_type=str(field.type),
            )

    except KeyError:
        result.add_error(
            "GEOMETRY_COLUMN_NOT_FOUND",
            f"Geometry column '{column_name}' not found in schema",
            column=column_name,
            available_columns=[f.name for f in schema],
        )

    return result


def _validate_covering_metadata(
    geo_metadata: dict[str, Any],
    file_path: Path,
) -> ValidationResult:
    """Validate the covering (bbox) metadata against actual data.

    Args:
        geo_metadata: The parsed 'geo' metadata
        file_path: Path to the parquet file

    Returns:
        ValidationResult for covering validation
    """
    import geopandas as gpd

    result = ValidationResult(is_valid=True)

    columns = geo_metadata.get("columns", {})
    primary_column = geo_metadata.get("primary_column", "geometry")
    col_meta = columns.get(primary_column, {})
    covering = col_meta.get("covering")

    if not covering:
        return result

    try:
        bbox_col = covering.get("bbox", {})
        xmin_col = bbox_col.get("xmin")
        ymin_col = bbox_col.get("ymin")
        xmax_col = bbox_col.get("xmax")
        ymax_col = bbox_col.get("ymax")

        if not all([xmin_col, ymin_col, xmax_col, ymax_col]):
            result.add_warning(
                "INCOMPLETE_COVERING",
                "Covering metadata missing some bbox column references",
                covering=covering,
            )
            return result

        # Read the file and verify the referenced bbox columns exist. The
        # references are treated as flat column names (e.g. "bbox.xmin"),
        # matching the columns fix_geoparquet_covering writes.
        gdf = gpd.read_parquet(file_path)

        for col in [xmin_col, ymin_col, xmax_col, ymax_col]:
            if col not in gdf.columns:
                result.add_error(
                    "COVERING_COLUMN_NOT_FOUND",
                    f"Covering references column '{col}' which doesn't exist",
                    missing_column=col,
                    available_columns=list(gdf.columns),
                )

        result.metadata["covering_validated"] = True

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        result.add_warning(
            "COVERING_VALIDATION_ERROR",
            f"Could not validate covering metadata: {e}",
            error=str(e),
        )

    return result


# =============================================================================
# Catalog-Level Validation
# =============================================================================


def validate_catalog(
    catalog_path: str | Path,
    expected_crs: str = "EPSG:4326",
    recursive: bool = True,
    pattern: str = "**/*.parquet",
) -> CatalogValidationResult:
    """Validate all GeoParquet files in a catalog.

    Args:
        catalog_path: Path to the catalog directory
        expected_crs: Expected CRS for all files
        recursive: Whether to search recursively
        pattern: Glob pattern for finding parquet files

    Returns:
        CatalogValidationResult with all file results
    """
    catalog_path = Path(catalog_path)
    result = CatalogValidationResult()

    if not catalog_path.exists():
        logger.error(f"Catalog path does not exist: {catalog_path}")
        return result

    if not catalog_path.is_dir():
        # Single file validation
        file_result = validate_geoparquet_file(catalog_path, expected_crs)
        result.add_file_result(str(catalog_path), file_result)
        return result

    # Find all parquet files
    if recursive:
        parquet_files = list(catalog_path.glob(pattern))
    else:
        parquet_files = list(catalog_path.glob("*.parquet"))

    logger.info(f"Found {len(parquet_files)} parquet files to validate")

    for pq_file in parquet_files:
        try:
            file_result = validate_geoparquet_file(pq_file, expected_crs)
            result.add_file_result(str(pq_file.relative_to(catalog_path)), file_result)

            if not file_result.is_valid:
                logger.warning(f"Invalid file: {pq_file}")
                for issue in file_result.errors:
                    logger.warning(f"  {issue}")
            elif file_result.warnings:
                logger.debug(f"Warnings for {pq_file}: {len(file_result.warnings)}")

        except (OSError, ValueError, TypeError, RuntimeError) as e:
            file_result = ValidationResult(is_valid=False)
            file_result.add_error(
                "VALIDATION_EXCEPTION",
                f"Exception during validation: {e}",
                error=str(e),
            )
            result.add_file_result(str(pq_file.relative_to(catalog_path)), file_result)

    return result


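# A catalog-level sketch; "catalog/" is a hypothetical output directory from
# the ingestion pipeline.
#
#   >>> report = validate_catalog("catalog/", expected_crs="EPSG:4326")
#   >>> if not report.is_valid:
#   ...     print(report.summary())
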
def validate_catalog_s3(
    s3_path: str,
    expected_crs: str = "EPSG:4326",
    pattern: str = "**/*.parquet",
) -> CatalogValidationResult:
    """Validate all GeoParquet files in an S3 catalog.

    Args:
        s3_path: S3 path (s3://bucket/prefix)
        expected_crs: Expected CRS for all files
        pattern: Glob pattern for finding parquet files

    Returns:
        CatalogValidationResult with all file results
    """
    import tempfile

    import fsspec

    result = CatalogValidationResult()

    try:
        fs = fsspec.filesystem("s3")

        # List all parquet files (strip the scheme for fsspec globbing)
        if s3_path.startswith("s3://"):
            bucket_path = s3_path[5:]
        else:
            bucket_path = s3_path

        # Use glob to find files
        files = fs.glob(f"{bucket_path}/{pattern}")
        logger.info(f"Found {len(files)} parquet files to validate in S3")

        for s3_file in files:
            try:
                # Download to a temp file for validation
                with tempfile.NamedTemporaryFile(suffix=".parquet", delete=True) as tmp:
                    fs.get(s3_file, tmp.name)
                    file_result = validate_geoparquet_file(tmp.name, expected_crs)
                    result.add_file_result(f"s3://{s3_file}", file_result)

            except (OSError, ValueError, TypeError, RuntimeError, ConnectionError) as e:
                file_result = ValidationResult(is_valid=False)
                file_result.add_error(
                    "S3_VALIDATION_ERROR",
                    f"Failed to validate S3 file: {e}",
                    path=f"s3://{s3_file}",
                    error=str(e),
                )
                result.add_file_result(f"s3://{s3_file}", file_result)

    except (OSError, ValueError, ConnectionError) as e:
        logger.error(f"Failed to access S3 catalog: {e}")

    return result


# =============================================================================
# Utility Functions
# =============================================================================


def get_geoparquet_metadata(file_path: str | Path) -> dict[str, Any]:
    """Extract GeoParquet metadata from a file.

    Args:
        file_path: Path to the GeoParquet file

    Returns:
        Dictionary with geo metadata, or empty dict if not found
    """
    import pyarrow.parquet as pq

    try:
        parquet_file = pq.ParquetFile(file_path)
        metadata = parquet_file.schema_arrow.metadata

        if metadata and b"geo" in metadata:
            return json.loads(metadata[b"geo"].decode("utf-8"))

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        logger.warning(f"Failed to read geo metadata from {file_path}: {e}")

    return {}


def fix_geoparquet_covering(
    input_path: str | Path,
    output_path: str | Path | None = None,
) -> ValidationResult:
    """Fix missing covering (bbox) metadata in a GeoParquet file.

    This computes bbox columns from geometry and adds covering metadata.

    Args:
        input_path: Path to input GeoParquet file
        output_path: Path to output file (defaults to overwriting input)

    Returns:
        ValidationResult indicating success or failure
    """
    import geopandas as gpd

    result = ValidationResult(is_valid=True)
    input_path = Path(input_path)
    output_path = Path(output_path) if output_path else input_path

    try:
        gdf = gpd.read_parquet(input_path)

        # Compute flat bbox columns if not present
        if "bbox.xmin" not in gdf.columns:
            bounds = gdf.geometry.bounds
            gdf["bbox.xmin"] = bounds["minx"]
            gdf["bbox.ymin"] = bounds["miny"]
            gdf["bbox.xmax"] = bounds["maxx"]
            gdf["bbox.ymax"] = bounds["maxy"]
            result.metadata["bbox_columns_added"] = True

        # Write with standard GeoParquet metadata. Note: the bbox values are
        # written as plain columns; whether 'covering' metadata is also
        # emitted in the 'geo' key depends on the installed geopandas version.
        gdf.to_parquet(output_path, index=False)
        result.metadata["output_path"] = str(output_path)

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        result.add_error(
            "FIX_COVERING_ERROR",
            f"Failed to fix covering metadata: {e}",
            error=str(e),
        )

    return result
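

# A repair-then-recheck sketch; "items.parquet" is a hypothetical path, and
# whether the rewritten file carries 'covering' metadata depends on the
# installed geopandas (see the note in fix_geoparquet_covering above).
#
#   >>> fix = fix_geoparquet_covering("items.parquet")
#   >>> if fix.is_valid:
#   ...     print(validate_geoparquet_file("items.parquet").summary())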