earthcatalog 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. earthcatalog/__init__.py +164 -0
  2. earthcatalog/async_http_client.py +1006 -0
  3. earthcatalog/config.py +97 -0
  4. earthcatalog/engines/__init__.py +308 -0
  5. earthcatalog/engines/rustac_engine.py +142 -0
  6. earthcatalog/engines/stac_geoparquet_engine.py +126 -0
  7. earthcatalog/exceptions.py +471 -0
  8. earthcatalog/grid_systems.py +1114 -0
  9. earthcatalog/ingestion_pipeline.py +2281 -0
  10. earthcatalog/input_readers.py +603 -0
  11. earthcatalog/job_tracking.py +485 -0
  12. earthcatalog/pipeline.py +606 -0
  13. earthcatalog/schema_generator.py +911 -0
  14. earthcatalog/spatial_resolver.py +1207 -0
  15. earthcatalog/stac_hooks.py +754 -0
  16. earthcatalog/statistics.py +677 -0
  17. earthcatalog/storage_backends.py +548 -0
  18. earthcatalog/tests/__init__.py +1 -0
  19. earthcatalog/tests/conftest.py +76 -0
  20. earthcatalog/tests/test_all_grids.py +793 -0
  21. earthcatalog/tests/test_async_http.py +700 -0
  22. earthcatalog/tests/test_cli_and_storage.py +230 -0
  23. earthcatalog/tests/test_config.py +245 -0
  24. earthcatalog/tests/test_dask_integration.py +580 -0
  25. earthcatalog/tests/test_e2e_synthetic.py +1624 -0
  26. earthcatalog/tests/test_engines.py +272 -0
  27. earthcatalog/tests/test_exceptions.py +346 -0
  28. earthcatalog/tests/test_file_structure.py +245 -0
  29. earthcatalog/tests/test_input_readers.py +666 -0
  30. earthcatalog/tests/test_integration.py +200 -0
  31. earthcatalog/tests/test_integration_async.py +283 -0
  32. earthcatalog/tests/test_job_tracking.py +603 -0
  33. earthcatalog/tests/test_multi_file_input.py +336 -0
  34. earthcatalog/tests/test_passthrough_hook.py +196 -0
  35. earthcatalog/tests/test_pipeline.py +684 -0
  36. earthcatalog/tests/test_pipeline_components.py +665 -0
  37. earthcatalog/tests/test_schema_generator.py +506 -0
  38. earthcatalog/tests/test_spatial_resolver.py +413 -0
  39. earthcatalog/tests/test_stac_hooks.py +776 -0
  40. earthcatalog/tests/test_statistics.py +477 -0
  41. earthcatalog/tests/test_storage_backends.py +236 -0
  42. earthcatalog/tests/test_validation.py +435 -0
  43. earthcatalog/tests/test_workers.py +653 -0
  44. earthcatalog/validation.py +921 -0
  45. earthcatalog/workers.py +682 -0
  46. earthcatalog-0.2.0.dist-info/METADATA +333 -0
  47. earthcatalog-0.2.0.dist-info/RECORD +50 -0
  48. earthcatalog-0.2.0.dist-info/WHEEL +5 -0
  49. earthcatalog-0.2.0.dist-info/entry_points.txt +3 -0
  50. earthcatalog-0.2.0.dist-info/top_level.txt +1 -0
earthcatalog/validation.py
@@ -0,0 +1,921 @@
"""STAC GeoParquet Validation Module.

This module provides validation functions for STAC items during ingestion
and for verifying GeoParquet files conform to the GeoParquet specification.

Validation Levels:
- **On-ingest**: Validates STAC items before writing (geometry validity, bbox consistency)
- **Post-hoc**: Validates existing GeoParquet files for spec compliance

Key Features:
- Geometry validity checking (self-intersection, ring orientation)
- Bbox-geometry consistency validation
- GeoParquet geo metadata verification
- CRS validation (EPSG:4326 expected for STAC)
- Covering/bbox metadata validation

Usage:
    >>> from earthcatalog.validation import (
    ...     validate_stac_item,
    ...     validate_geoparquet_file,
    ...     validate_catalog,
    ...     ValidationResult,
    ... )
    >>>
    >>> # Validate a single STAC item
    >>> result, corrected = validate_stac_item(item)
    >>> if not result.is_valid:
    ...     print(f"Errors: {result.errors}")
    >>>
    >>> # Validate a GeoParquet file
    >>> result = validate_geoparquet_file("path/to/file.parquet")
    >>> print(result.summary())
"""

from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)


# =============================================================================
# Validation Result Data Classes
# =============================================================================


@dataclass
class ValidationIssue:
    """A single validation issue (warning or error)."""

    level: str  # 'warning' or 'error'
    code: str  # e.g., 'INVALID_GEOMETRY', 'BBOX_MISMATCH'
    message: str
    context: dict[str, Any] = field(default_factory=dict)

    def __str__(self) -> str:
        return f"[{self.level.upper()}] {self.code}: {self.message}"


@dataclass
class ValidationResult:
    """Result of a validation check."""

    is_valid: bool
    issues: list[ValidationIssue] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def warnings(self) -> list[ValidationIssue]:
        """Get all warning-level issues."""
        return [i for i in self.issues if i.level == "warning"]

    @property
    def errors(self) -> list[ValidationIssue]:
        """Get all error-level issues."""
        return [i for i in self.issues if i.level == "error"]

    def add_warning(self, code: str, message: str, **context: Any) -> None:
        """Add a warning to the result."""
        self.issues.append(ValidationIssue("warning", code, message, context))

    def add_error(self, code: str, message: str, **context: Any) -> None:
        """Add an error to the result."""
        self.issues.append(ValidationIssue("error", code, message, context))
        self.is_valid = False

    def merge(self, other: ValidationResult) -> ValidationResult:
        """Merge another validation result into this one."""
        self.issues.extend(other.issues)
        self.is_valid = self.is_valid and other.is_valid
        self.metadata.update(other.metadata)
        return self

    def summary(self) -> str:
        """Generate a summary of the validation result."""
        lines = [
            f"Valid: {self.is_valid}",
            f"Warnings: {len(self.warnings)}",
            f"Errors: {len(self.errors)}",
        ]
        if self.issues:
            lines.append("\nIssues:")
            for issue in self.issues:
                lines.append(f"  - {issue}")
        return "\n".join(lines)


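# A minimal sketch of the result API above: warnings accumulate without
# flipping is_valid, errors flip it, and merge() folds one result into
# another. The codes and messages here are illustrative only.
#
#   >>> r = ValidationResult(is_valid=True)
#   >>> r.add_warning("MISSING_VERSION", "no 'version' field")
#   >>> r.is_valid
#   True
#   >>> other = ValidationResult(is_valid=True)
#   >>> other.add_error("NO_GEO_METADATA", "missing 'geo' key")
#   >>> r.merge(other).is_valid
#   False
#   >>> len(r.issues)
#   2
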
@dataclass
class CatalogValidationResult:
    """Result of validating an entire catalog."""

    total_files: int = 0
    valid_files: int = 0
    invalid_files: int = 0
    file_results: dict[str, ValidationResult] = field(default_factory=dict)
    warnings_count: int = 0
    errors_count: int = 0

    def add_file_result(self, path: str, result: ValidationResult) -> None:
        """Add a file validation result."""
        self.total_files += 1
        self.file_results[path] = result
        if result.is_valid:
            self.valid_files += 1
        else:
            self.invalid_files += 1
        self.warnings_count += len(result.warnings)
        self.errors_count += len(result.errors)

    @property
    def is_valid(self) -> bool:
        """Check if entire catalog is valid."""
        return self.invalid_files == 0

    def summary(self) -> str:
        """Generate a summary of the catalog validation."""
        lines = [
            f"Total files: {self.total_files}",
            f"Valid files: {self.valid_files}",
            f"Invalid files: {self.invalid_files}",
            f"Total warnings: {self.warnings_count}",
            f"Total errors: {self.errors_count}",
        ]
        if self.invalid_files > 0:
            lines.append("\nInvalid files:")
            for path, result in self.file_results.items():
                if not result.is_valid:
                    lines.append(f"  - {path}")
                    for issue in result.errors:
                        lines.append(f"    {issue}")
        return "\n".join(lines)


# =============================================================================
# STAC Item Validation (On-Ingest)
# =============================================================================


def validate_stac_item(
    item: dict[str, Any],
    fix_geometry: bool = True,
    bbox_tolerance: float = 1e-6,
) -> tuple[ValidationResult, dict[str, Any] | None]:
    """Validate a STAC item before ingestion.

    Performs standard validation including:
    - Required fields check (id, type, geometry, properties)
    - Geometry validity (self-intersection, ring orientation)
    - Bbox-geometry consistency

    Args:
        item: STAC item dictionary
        fix_geometry: If True, attempt to fix invalid geometries
        bbox_tolerance: Tolerance for bbox comparison (in degrees)

    Returns:
        Tuple of (ValidationResult, corrected_item or None if unfixable)
    """
    from shapely import make_valid
    from shapely.geometry import shape
    from shapely.validation import explain_validity

    result = ValidationResult(is_valid=True)
    corrected_item = item.copy()

    # Check required fields
    required_fields = ["id", "type", "geometry", "properties"]
    for field_name in required_fields:
        if field_name not in item:
            result.add_warning(
                "MISSING_FIELD",
                f"Missing required STAC field: {field_name}",
                field=field_name,
            )

    # Check type is Feature
    if item.get("type") != "Feature":
        result.add_warning(
            "INVALID_TYPE",
            f"STAC item type should be 'Feature', got: {item.get('type')}",
            expected="Feature",
            actual=item.get("type"),
        )

    # Validate geometry
    geom_dict = item.get("geometry")
    if geom_dict is None:
        result.add_warning(
            "NULL_GEOMETRY",
            "STAC item has null geometry",
            item_id=item.get("id"),
        )
    else:
        try:
            geom = shape(geom_dict)

            # Check geometry validity
            if not geom.is_valid:
                reason = explain_validity(geom)
                result.add_warning(
                    "INVALID_GEOMETRY",
                    f"Geometry is invalid: {reason}",
                    item_id=item.get("id"),
                    reason=reason,
                )

                if fix_geometry:
                    # Attempt to fix the geometry
                    fixed_geom = make_valid(geom)
                    if fixed_geom.is_valid:
                        corrected_item["geometry"] = fixed_geom.__geo_interface__
                        result.metadata["geometry_fixed"] = True
                        logger.debug(f"Fixed invalid geometry for item {item.get('id')}")
                    else:
                        result.add_warning(
                            "UNFIXABLE_GEOMETRY",
                            "Could not fix invalid geometry",
                            item_id=item.get("id"),
                        )

            # Check bbox consistency
            bbox = item.get("bbox")
            if bbox is not None:
                geom_to_check = shape(corrected_item.get("geometry", geom_dict)) if fix_geometry else geom
                bbox_result = _validate_bbox_geometry_consistency(bbox, geom_to_check, bbox_tolerance)
                result.merge(bbox_result)

                # If bbox doesn't match, compute the correct one
                if not bbox_result.is_valid or bbox_result.warnings:
                    computed_bbox = list(geom_to_check.bounds)
                    corrected_item["bbox"] = computed_bbox
                    result.metadata["bbox_corrected"] = True
                    result.metadata["original_bbox"] = bbox
                    result.metadata["computed_bbox"] = computed_bbox

        except (ValueError, TypeError, AttributeError) as e:
            result.add_warning(
                "GEOMETRY_PARSE_ERROR",
                f"Failed to parse geometry: {e}",
                item_id=item.get("id"),
                error=str(e),
            )

    # Validate datetime
    props = item.get("properties", {})
    datetime_val = props.get("datetime")
    if datetime_val is None and not (props.get("start_datetime") and props.get("end_datetime")):
        result.add_warning(
            "MISSING_DATETIME",
            "STAC item missing datetime (and no start/end_datetime range)",
            item_id=item.get("id"),
        )

    return result, corrected_item


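# A minimal sketch of the on-ingest path, assuming shapely is installed. The
# item below is hypothetical: a self-intersecting "bowtie" ring paired with a
# deliberately undersized bbox, so both fix-ups should trigger.
#
#   >>> item = {
#   ...     "id": "demo-item",
#   ...     "type": "Feature",
#   ...     "geometry": {
#   ...         "type": "Polygon",
#   ...         "coordinates": [[[0, 0], [2, 2], [2, 0], [0, 2], [0, 0]]],
#   ...     },
#   ...     "bbox": [0, 0, 1, 1],
#   ...     "properties": {"datetime": "2024-01-01T00:00:00Z"},
#   ... }
#   >>> result, fixed = validate_stac_item(item)
#   >>> result.is_valid            # warnings only, so still True
#   True
#   >>> [i.code for i in result.warnings]
#   ['INVALID_GEOMETRY', 'BBOX_MISMATCH']
#   >>> fixed["bbox"]              # recomputed from the repaired geometry
#   [0.0, 0.0, 2.0, 2.0]
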
def _validate_bbox_geometry_consistency(
    bbox: list[float],
    geometry,
    tolerance: float = 1e-6,
) -> ValidationResult:
    """Validate that bbox matches geometry bounds.

    Args:
        bbox: [minx, miny, maxx, maxy] or [minx, miny, minz, maxx, maxy, maxz]
        geometry: Shapely geometry object
        tolerance: Tolerance for comparison in degrees

    Returns:
        ValidationResult with any bbox issues
    """
    result = ValidationResult(is_valid=True)

    if len(bbox) == 4:
        minx, miny, maxx, maxy = bbox
    elif len(bbox) == 6:
        minx, miny, _minz, maxx, maxy, _maxz = bbox
    else:
        result.add_warning(
            "INVALID_BBOX_LENGTH",
            f"Bbox should have 4 or 6 elements, got {len(bbox)}",
            bbox=bbox,
        )
        return result

    # Get geometry bounds
    geom_minx, geom_miny, geom_maxx, geom_maxy = geometry.bounds

    # Check if bbox contains the geometry (with tolerance)
    issues = []
    if minx - geom_minx > tolerance:
        issues.append(f"bbox minx ({minx}) > geometry minx ({geom_minx})")
    if miny - geom_miny > tolerance:
        issues.append(f"bbox miny ({miny}) > geometry miny ({geom_miny})")
    if geom_maxx - maxx > tolerance:
        issues.append(f"bbox maxx ({maxx}) < geometry maxx ({geom_maxx})")
    if geom_maxy - maxy > tolerance:
        issues.append(f"bbox maxy ({maxy}) < geometry maxy ({geom_maxy})")

    if issues:
        result.add_warning(
            "BBOX_MISMATCH",
            f"Bbox does not match geometry bounds: {'; '.join(issues)}",
            bbox=bbox,
            geometry_bounds=[geom_minx, geom_miny, geom_maxx, geom_maxy],
        )

    return result

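# A short sketch of the containment rule above: the bbox must contain the
# geometry bounds to within `tolerance`, so a bbox that is too tight on any
# side is flagged, while an oversized bbox passes.
#
#   >>> from shapely.geometry import box
#   >>> r = _validate_bbox_geometry_consistency([0, 0, 1, 1], box(0, 0, 2, 1))
#   >>> [i.code for i in r.warnings]
#   ['BBOX_MISMATCH']
#   >>> r = _validate_bbox_geometry_consistency([-1, -1, 3, 2], box(0, 0, 2, 1))
#   >>> r.warnings
#   []
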

def validate_stac_items_batch(
    items: list[dict[str, Any]],
    fix_geometry: bool = True,
    bbox_tolerance: float = 1e-6,
) -> tuple[list[ValidationResult], list[dict[str, Any]]]:
    """Validate a batch of STAC items.

    Args:
        items: List of STAC item dictionaries
        fix_geometry: If True, attempt to fix invalid geometries
        bbox_tolerance: Tolerance for bbox comparison

    Returns:
        Tuple of (list of ValidationResults, list of corrected items)
    """
    results = []
    corrected_items = []

    for item in items:
        result, corrected = validate_stac_item(item, fix_geometry, bbox_tolerance)
        results.append(result)
        if corrected is not None:
            corrected_items.append(corrected)

    return results, corrected_items


# =============================================================================
# GeoParquet File Validation (Post-hoc)
# =============================================================================


def validate_geoparquet_file(
    file_path: str | Path,
    expected_crs: str = "EPSG:4326",
) -> ValidationResult:
    """Validate a GeoParquet file for spec compliance.

    Checks:
    - File is readable as Parquet
    - Has valid 'geo' metadata in schema
    - Primary geometry column is properly defined
    - CRS is correctly specified
    - Covering/bbox metadata is present (if applicable)

    Args:
        file_path: Path to the GeoParquet file
        expected_crs: Expected CRS (default: EPSG:4326 for STAC)

    Returns:
        ValidationResult with any issues found
    """
    import pyarrow.parquet as pq

    result = ValidationResult(is_valid=True)
    file_path = Path(file_path)

    if not file_path.exists():
        result.add_error(
            "FILE_NOT_FOUND",
            f"File does not exist: {file_path}",
            path=str(file_path),
        )
        return result

    try:
        # Read parquet metadata
        parquet_file = pq.ParquetFile(file_path)
        schema = parquet_file.schema_arrow
        metadata = schema.metadata

        result.metadata["num_rows"] = parquet_file.metadata.num_rows
        result.metadata["num_columns"] = len(schema)

        # Check for geo metadata
        geo_result = _validate_geo_metadata(metadata, expected_crs)
        result.merge(geo_result)

        # Validate geometry column exists and has correct type
        if geo_result.metadata.get("primary_column"):
            geom_col = geo_result.metadata["primary_column"]
            geom_col_result = _validate_geometry_column(schema, geom_col)
            result.merge(geom_col_result)

        # Validate covering bbox if present
        if geo_result.metadata.get("has_covering"):
            covering_result = _validate_covering_metadata(geo_result.metadata.get("geo_metadata", {}), file_path)
            result.merge(covering_result)

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        result.add_error(
            "PARQUET_READ_ERROR",
            f"Failed to read parquet file: {e}",
            path=str(file_path),
            error=str(e),
        )

    return result


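# A minimal post-hoc sketch; "items.parquet" below is a hypothetical path to
# a file produced by the ingestion pipeline.
#
#   >>> result = validate_geoparquet_file("items.parquet", expected_crs="EPSG:4326")
#   >>> if not result.is_valid:
#   ...     print(result.summary())
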
def _validate_geo_metadata(
    metadata: dict[bytes, bytes] | None,
    expected_crs: str = "EPSG:4326",
) -> ValidationResult:
    """Validate the 'geo' metadata in a GeoParquet file.

    Args:
        metadata: Parquet schema metadata
        expected_crs: Expected CRS string

    Returns:
        ValidationResult with geo metadata validation results
    """
    result = ValidationResult(is_valid=True)

    if metadata is None:
        result.add_error(
            "NO_SCHEMA_METADATA",
            "Parquet file has no schema metadata",
        )
        return result

    # Check for 'geo' key
    geo_bytes = metadata.get(b"geo")
    if geo_bytes is None:
        result.add_error(
            "NO_GEO_METADATA",
            "GeoParquet file missing required 'geo' metadata key",
        )
        return result

    try:
        geo_metadata = json.loads(geo_bytes.decode("utf-8"))
        result.metadata["geo_metadata"] = geo_metadata
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        result.add_error(
            "INVALID_GEO_JSON",
            f"Failed to parse 'geo' metadata as JSON: {e}",
            error=str(e),
        )
        return result

    # Validate version (optional but recommended)
    version = geo_metadata.get("version")
    if version:
        result.metadata["geoparquet_version"] = version
    else:
        result.add_warning(
            "MISSING_VERSION",
            "GeoParquet 'geo' metadata missing 'version' field",
        )

    # Validate primary_column
    primary_column = geo_metadata.get("primary_column")
    if not primary_column:
        result.add_error(
            "MISSING_PRIMARY_COLUMN",
            "GeoParquet 'geo' metadata missing 'primary_column' field",
        )
    else:
        result.metadata["primary_column"] = primary_column

    # Validate columns
    columns = geo_metadata.get("columns", {})
    if not columns:
        result.add_error(
            "MISSING_COLUMNS",
            "GeoParquet 'geo' metadata missing 'columns' field",
        )
    elif primary_column and primary_column not in columns:
        result.add_error(
            "PRIMARY_COLUMN_NOT_IN_COLUMNS",
            f"Primary column '{primary_column}' not found in columns metadata",
            primary_column=primary_column,
            available_columns=list(columns.keys()),
        )

    # Validate CRS for primary column
    if primary_column and primary_column in columns:
        col_meta = columns[primary_column]

        # Check encoding (case-insensitive)
        encoding = col_meta.get("encoding")
        if encoding:
            result.metadata["geometry_encoding"] = encoding
            valid_encodings = {
                "wkb",
                "point",
                "linestring",
                "polygon",
                "multipoint",
                "multilinestring",
                "multipolygon",
            }
            if encoding.lower() not in valid_encodings:
                result.add_warning(
                    "UNKNOWN_ENCODING",
                    f"Unknown geometry encoding: {encoding}",
                    encoding=encoding,
                )
        else:
            result.add_warning(
                "MISSING_ENCODING",
                f"Column '{primary_column}' missing 'encoding' field",
            )

        # Check CRS
        crs = col_meta.get("crs")
        if crs is None:
            # CRS can be null for "OGC:CRS84" equivalent
            result.metadata["crs"] = None
            result.add_warning(
                "NULL_CRS",
                "CRS is null (interpreted as OGC:CRS84/WGS84)",
            )
        elif isinstance(crs, dict):
            # PROJJSON format
            result.metadata["crs"] = crs
            crs_id = crs.get("id", {})
            crs_code = f"{crs_id.get('authority', '')}:{crs_id.get('code', '')}"
            if crs_code and crs_code != expected_crs and crs_code != ":":
                # Also check for EPSG:4326 in various formats
                if not _crs_matches_expected(crs, expected_crs):
                    result.add_warning(
                        "UNEXPECTED_CRS",
                        f"CRS '{crs_code}' does not match expected '{expected_crs}'",
                        actual_crs=crs,
                        expected_crs=expected_crs,
                    )
        elif isinstance(crs, str):
            result.metadata["crs"] = crs
            if crs != expected_crs:
                result.add_warning(
                    "UNEXPECTED_CRS",
                    f"CRS '{crs}' does not match expected '{expected_crs}'",
                    actual_crs=crs,
                    expected_crs=expected_crs,
                )

        # Check for covering/bbox
        covering = col_meta.get("covering")
        if covering:
            result.metadata["has_covering"] = True
            result.metadata["covering"] = covering
        else:
            result.metadata["has_covering"] = False
            # Covering is optional but recommended for performance
            result.add_warning(
                "MISSING_COVERING",
                f"Column '{primary_column}' missing 'covering' (bbox) metadata - "
                "recommended for spatial query performance",
            )

        # Check geometry_types
        geometry_types = col_meta.get("geometry_types")
        if geometry_types:
            result.metadata["geometry_types"] = geometry_types
        else:
            result.add_warning(
                "MISSING_GEOMETRY_TYPES",
                f"Column '{primary_column}' missing 'geometry_types' field",
            )

    return result


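# A sketch of the smallest 'geo' payload this check accepts without errors;
# the schema-metadata dict below is constructed by hand rather than read from
# a real file.
#
#   >>> geo = {
#   ...     "version": "1.0.0",
#   ...     "primary_column": "geometry",
#   ...     "columns": {
#   ...         "geometry": {
#   ...             "encoding": "WKB",
#   ...             "geometry_types": ["Polygon"],
#   ...             "crs": "EPSG:4326",
#   ...         }
#   ...     },
#   ... }
#   >>> r = _validate_geo_metadata({b"geo": json.dumps(geo).encode()})
#   >>> r.is_valid
#   True
#   >>> [i.code for i in r.warnings]      # covering is optional
#   ['MISSING_COVERING']
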
def _crs_matches_expected(crs: dict | str, expected: str) -> bool:
    """Check if a CRS matches the expected value.

    Handles various CRS representations (PROJJSON, WKT, EPSG codes).
    """
    if isinstance(crs, str):
        return crs == expected

    if isinstance(crs, dict):
        # Check PROJJSON id
        crs_id = crs.get("id", {})
        authority = crs_id.get("authority", "")
        code = crs_id.get("code", "")
        if f"{authority}:{code}" == expected:
            return True

        # Check for WGS 84 / EPSG:4326 equivalents
        if expected == "EPSG:4326":
            name = crs.get("name", "").lower()
            if "wgs 84" in name or "wgs84" in name:
                return True
            if authority == "EPSG" and str(code) == "4326":
                return True
            if authority == "OGC" and code == "CRS84":
                return True

    return False

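# A couple of sketch inputs for the matcher above: a PROJJSON-style dict with
# an EPSG id, the name-only WGS 84 fallback, and a plain string mismatch.
#
#   >>> _crs_matches_expected({"id": {"authority": "EPSG", "code": 4326}}, "EPSG:4326")
#   True
#   >>> _crs_matches_expected({"name": "WGS 84 (CRS84)"}, "EPSG:4326")
#   True
#   >>> _crs_matches_expected("EPSG:3857", "EPSG:4326")
#   False
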

def _validate_geometry_column(schema, column_name: str) -> ValidationResult:
    """Validate that the geometry column exists and has correct type."""
    result = ValidationResult(is_valid=True)

    # Look up the column by name; pyarrow raises KeyError if it is absent
    try:
        field = schema.field(column_name)
        result.metadata["geometry_column_type"] = str(field.type)

        # GeoParquet uses binary (WKB) encoding
        if str(field.type) not in ("binary", "large_binary"):
            result.add_warning(
                "UNEXPECTED_GEOMETRY_TYPE",
                f"Geometry column has type '{field.type}', expected 'binary' (WKB)",
                column=column_name,
                actual_type=str(field.type),
            )

    except KeyError:
        result.add_error(
            "GEOMETRY_COLUMN_NOT_FOUND",
            f"Geometry column '{column_name}' not found in schema",
            column=column_name,
            available_columns=[f.name for f in schema],
        )

    return result


def _validate_covering_metadata(
    geo_metadata: dict[str, Any],
    file_path: Path,
) -> ValidationResult:
    """Validate the covering (bbox) metadata against actual data.

    Args:
        geo_metadata: The parsed 'geo' metadata
        file_path: Path to the parquet file

    Returns:
        ValidationResult for covering validation
    """
    import geopandas as gpd

    result = ValidationResult(is_valid=True)

    columns = geo_metadata.get("columns", {})
    primary_column = geo_metadata.get("primary_column", "geometry")
    col_meta = columns.get(primary_column, {})
    covering = col_meta.get("covering")

    if not covering:
        return result

    try:
        bbox_col = covering.get("bbox", {})
        xmin_col = bbox_col.get("xmin")
        ymin_col = bbox_col.get("ymin")
        xmax_col = bbox_col.get("xmax")
        ymax_col = bbox_col.get("ymax")

        if not all([xmin_col, ymin_col, xmax_col, ymax_col]):
            result.add_warning(
                "INCOMPLETE_COVERING",
                "Covering metadata missing some bbox column references",
                covering=covering,
            )
            return result

        # Read the file and verify the referenced bbox columns exist. The
        # references are treated as flat column names (e.g. "bbox.xmin"),
        # matching the columns fix_geoparquet_covering writes.
        gdf = gpd.read_parquet(file_path)

        for col in [xmin_col, ymin_col, xmax_col, ymax_col]:
            if col not in gdf.columns:
                result.add_error(
                    "COVERING_COLUMN_NOT_FOUND",
                    f"Covering references column '{col}' which doesn't exist",
                    missing_column=col,
                    available_columns=list(gdf.columns),
                )

        result.metadata["covering_validated"] = True

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        result.add_warning(
            "COVERING_VALIDATION_ERROR",
            f"Could not validate covering metadata: {e}",
            error=str(e),
        )

    return result


# =============================================================================
# Catalog-Level Validation
# =============================================================================


def validate_catalog(
    catalog_path: str | Path,
    expected_crs: str = "EPSG:4326",
    recursive: bool = True,
    pattern: str = "**/*.parquet",
) -> CatalogValidationResult:
    """Validate all GeoParquet files in a catalog.

    Args:
        catalog_path: Path to the catalog directory
        expected_crs: Expected CRS for all files
        recursive: Whether to search recursively
        pattern: Glob pattern for finding parquet files

    Returns:
        CatalogValidationResult with all file results
    """
    catalog_path = Path(catalog_path)
    result = CatalogValidationResult()

    if not catalog_path.exists():
        logger.error(f"Catalog path does not exist: {catalog_path}")
        return result

    if not catalog_path.is_dir():
        # Single file validation
        file_result = validate_geoparquet_file(catalog_path, expected_crs)
        result.add_file_result(str(catalog_path), file_result)
        return result

    # Find all parquet files
    if recursive:
        parquet_files = list(catalog_path.glob(pattern))
    else:
        parquet_files = list(catalog_path.glob("*.parquet"))

    logger.info(f"Found {len(parquet_files)} parquet files to validate")

    for pq_file in parquet_files:
        try:
            file_result = validate_geoparquet_file(pq_file, expected_crs)
            result.add_file_result(str(pq_file.relative_to(catalog_path)), file_result)

            if not file_result.is_valid:
                logger.warning(f"Invalid file: {pq_file}")
                for issue in file_result.errors:
                    logger.warning(f"  {issue}")
            elif file_result.warnings:
                logger.debug(f"Warnings for {pq_file}: {len(file_result.warnings)}")

        except (OSError, ValueError, TypeError, RuntimeError) as e:
            file_result = ValidationResult(is_valid=False)
            file_result.add_error(
                "VALIDATION_EXCEPTION",
                f"Exception during validation: {e}",
                error=str(e),
            )
            result.add_file_result(str(pq_file.relative_to(catalog_path)), file_result)

    return result


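# A catalog-level sketch; "catalog/" is a hypothetical output directory from
# the ingestion pipeline.
#
#   >>> report = validate_catalog("catalog/", expected_crs="EPSG:4326")
#   >>> if not report.is_valid:
#   ...     print(report.summary())
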
def validate_catalog_s3(
    s3_path: str,
    expected_crs: str = "EPSG:4326",
    pattern: str = "**/*.parquet",
) -> CatalogValidationResult:
    """Validate all GeoParquet files in an S3 catalog.

    Args:
        s3_path: S3 path (s3://bucket/prefix)
        expected_crs: Expected CRS for all files
        pattern: Glob pattern for finding parquet files

    Returns:
        CatalogValidationResult with all file results
    """
    import tempfile

    import fsspec

    result = CatalogValidationResult()

    try:
        fs = fsspec.filesystem("s3")

        # List all parquet files (strip the scheme for fsspec globbing)
        if s3_path.startswith("s3://"):
            bucket_path = s3_path[5:]
        else:
            bucket_path = s3_path

        # Use glob to find files
        files = fs.glob(f"{bucket_path}/{pattern}")
        logger.info(f"Found {len(files)} parquet files to validate in S3")

        for s3_file in files:
            try:
                # Download to a temp file for validation
                with tempfile.NamedTemporaryFile(suffix=".parquet", delete=True) as tmp:
                    fs.get(s3_file, tmp.name)
                    file_result = validate_geoparquet_file(tmp.name, expected_crs)
                    result.add_file_result(f"s3://{s3_file}", file_result)

            except (OSError, ValueError, TypeError, RuntimeError, ConnectionError) as e:
                file_result = ValidationResult(is_valid=False)
                file_result.add_error(
                    "S3_VALIDATION_ERROR",
                    f"Failed to validate S3 file: {e}",
                    path=f"s3://{s3_file}",
                    error=str(e),
                )
                result.add_file_result(f"s3://{s3_file}", file_result)

    except (OSError, ValueError, ConnectionError) as e:
        logger.error(f"Failed to access S3 catalog: {e}")

    return result


# =============================================================================
# Utility Functions
# =============================================================================


def get_geoparquet_metadata(file_path: str | Path) -> dict[str, Any]:
    """Extract GeoParquet metadata from a file.

    Args:
        file_path: Path to the GeoParquet file

    Returns:
        Dictionary with geo metadata, or empty dict if not found
    """
    import pyarrow.parquet as pq

    try:
        parquet_file = pq.ParquetFile(file_path)
        metadata = parquet_file.schema_arrow.metadata

        if metadata and b"geo" in metadata:
            return json.loads(metadata[b"geo"].decode("utf-8"))

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        logger.warning(f"Failed to read geo metadata from {file_path}: {e}")

    return {}


def fix_geoparquet_covering(
    input_path: str | Path,
    output_path: str | Path | None = None,
) -> ValidationResult:
    """Fix missing covering (bbox) metadata in a GeoParquet file.

    This computes bbox columns from geometry and adds covering metadata.

    Args:
        input_path: Path to input GeoParquet file
        output_path: Path to output file (defaults to overwriting input)

    Returns:
        ValidationResult indicating success or failure
    """
    import geopandas as gpd

    result = ValidationResult(is_valid=True)
    input_path = Path(input_path)
    output_path = Path(output_path) if output_path else input_path

    try:
        gdf = gpd.read_parquet(input_path)

        # Compute flat bbox columns if not present
        if "bbox.xmin" not in gdf.columns:
            bounds = gdf.geometry.bounds
            gdf["bbox.xmin"] = bounds["minx"]
            gdf["bbox.ymin"] = bounds["miny"]
            gdf["bbox.xmax"] = bounds["maxx"]
            gdf["bbox.ymax"] = bounds["maxy"]
            result.metadata["bbox_columns_added"] = True

        # Write with standard GeoParquet metadata. Note: the bbox values are
        # written as plain columns; whether 'covering' metadata is also
        # emitted in the 'geo' key depends on the installed geopandas version.
        gdf.to_parquet(output_path, index=False)
        result.metadata["output_path"] = str(output_path)

    except (OSError, ValueError, TypeError, RuntimeError) as e:
        result.add_error(
            "FIX_COVERING_ERROR",
            f"Failed to fix covering metadata: {e}",
            error=str(e),
        )

    return result
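

# A repair-then-recheck sketch; "items.parquet" is a hypothetical path, and
# whether the rewritten file carries 'covering' metadata depends on the
# installed geopandas (see the note in fix_geoparquet_covering above).
#
#   >>> fix = fix_geoparquet_covering("items.parquet")
#   >>> if fix.is_valid:
#   ...     print(validate_geoparquet_file("items.parquet").summary())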