PyPI - openforis-whisp - Versions diffs - 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl - Mend

openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

openforis_whisp/__init__.py +7 -7
openforis_whisp/advanced_stats.py +400 -93
openforis_whisp/data_checks.py +178 -15
openforis_whisp/data_conversion.py +154 -59
openforis_whisp/reformat.py +2 -29
openforis_whisp/stats.py +15 -45
openforis_whisp/utils.py +449 -80
{openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/METADATA +1 -1
{openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/RECORD +11 -11
{openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/LICENSE +0 -0
{openforis_whisp-3.0.0a1.dist-info → openforis_whisp-3.0.0a3.dist-info}/WHEEL +0 -0

openforis_whisp/data_checks.py CHANGED Viewed

@@ -7,10 +7,69 @@ and thresholds, raising informative errors when constraints are violated.
 import json
 from pathlib import Path
-from shapely.geometry import Polygon as ShapelyPolygon
+from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
 # Note: area summary stats are estimations for use in deciding pathways for analysis
 # (estimation preferred here as allows efficient processing speed and limits overhead of checking file)
+def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
+    """
+    Convert area from projected CRS units to hectares.
+    Most projected CRS use meters as units, so:
+    - area_sq_units is in square meters
+    - 1 hectare = 10,000 m²
+    Args:
+        area_sq_units: Area in square units of the projection (typically square meters)
+        crs: CRS string for reference (e.g., 'EPSG:3857'). Currently unused; units are assumed to be meters.
+    Returns:
+        Area in hectares
+    """
+    # Standard conversion: 1 hectare = 10,000 m²
+    # Most projected CRS use meters, so this works universally
+    return area_sq_units / 10000
+def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
+    """
+    Estimate area from bounding box when actual area calculation fails.
+    Extracts bounding box and calculates its area as a fallback estimate.
+    Returns area in hectares.
+    """
+    try:
+        # Flatten all coordinates to find bounds
+        all_coords = []
+        def flatten_coords(c):
+            if isinstance(c[0], (list, tuple)) and isinstance(c[0][0], (list, tuple)):
+                for sub in c:
+                    flatten_coords(sub)
+            else:
+                all_coords.extend(c)
+        flatten_coords(coords)
+        if not all_coords:
+            return 0
+        # Extract lon/lat values
+        lons = [c[0] for c in all_coords]
+        lats = [c[1] for c in all_coords]
+        min_lon, max_lon = min(lons), max(lons)
+        min_lat, max_lat = min(lats), max(lats)
+        # Bounding box area
+        bbox_area = (max_lon - min_lon) * (max_lat - min_lat)
+        # Apply conversion factor
+        return abs(bbox_area) * area_conversion_factor
+    except:
+        return 0
 def analyze_geojson(
     geojson_data: Path | str | dict,
     metrics=[
@@ -76,6 +135,8 @@ def analyze_geojson(
         - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
     """
     results = {}
+    crs_warning = None
+    file_path = None
     try:
         # Load GeoJSON from file if path provided
@@ -83,11 +144,45 @@ def analyze_geojson(
             file_path = Path(geojson_data)
             if not file_path.exists():
                 raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
-            with open(file_path, "r") as f:
-                geojson_data = json.load(f)
+            # Try UTF-8 first (most common), then fall back to auto-detection
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:
+                    geojson_data = json.load(f)
+            except UnicodeDecodeError:
+                # Auto-detect encoding if UTF-8 fails
+                try:
+                    import chardet
+                    with open(file_path, "rb") as f:
+                        raw_data = f.read()
+                        detected = chardet.detect(raw_data)
+                        encoding = detected.get("encoding", "latin-1")
+                    with open(file_path, "r", encoding=encoding, errors="replace") as f:
+                        geojson_data = json.load(f)
+                except Exception:
+                    # Final fallback: use latin-1 which accepts all byte values
+                    with open(file_path, "r", encoding="latin-1") as f:
+                        geojson_data = json.load(f)
+            # Detect CRS from file if available
+            try:
+                import geopandas as gpd
+                gdf = gpd.read_file(file_path)
+                if gdf.crs and gdf.crs != "EPSG:4326":
+                    crs_warning = f"⚠️  CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
+            except Exception:
+                pass  # If we can't detect CRS, continue without warning
         features = geojson_data.get("features", [])
+        # Add CRS warning to results if detected
+        if crs_warning:
+            results["crs_warning"] = crs_warning
+            print(crs_warning)
         if "count" in metrics:
             results["count"] = len(features)
@@ -113,6 +208,29 @@ def analyze_geojson(
             geometry_type_counts = {}
             valid_polygons = 0
+            # Tracking for fallback geometries
+            bbox_fallback_count = 0  # Geometries that used bounding box estimate
+            geometry_skip_count = 0  # Geometries completely skipped
+            polygon_type_stats = {}  # Track stats by geometry type
+            # Detect CRS to determine area conversion factor
+            area_conversion_factor = 1232100  # Default: WGS84 (degrees to ha)
+            detected_crs = None
+            # Try to detect CRS from file if available
+            if file_path:
+                try:
+                    import geopandas as gpd
+                    gdf_temp = gpd.read_file(str(file_path))
+                    detected_crs = gdf_temp.crs
+                    if detected_crs and detected_crs != "EPSG:4326":
+                        # Projected CRS typically uses meters, so convert m² to ha
+                        # 1 ha = 10,000 m²
+                        area_conversion_factor = 1 / 10000
+                except Exception:
+                    pass  # Use default if CRS detection fails
             for feature in features:
                 try:
                     coords = feature["geometry"]["coordinates"]
@@ -133,13 +251,27 @@ def analyze_geojson(
                         # Calculate area from coordinates using shapely
                         try:
-                            poly = ShapelyPolygon(coords[0])
-                            # Convert square degrees to hectares (near equator)
-                            # 1 degree latitude ≈ 111 km, so 1 degree² ≈ 111² km² = 12,321 km² = 1,232,100 ha
-                            area_ha = abs(poly.area) * 1232100
+                            # Use shapely.geometry.shape to properly handle all geometry components
+                            geom = shapely_shape(feature["geometry"])
+                            # Convert using detected CRS
+                            area_ha = abs(geom.area) * area_conversion_factor
                             areas.append(area_ha)
-                        except:
-                            pass  # Skip if calculation fails
+                        except Exception as e:
+                            # Fallback: estimate from bounding box if geometry fails
+                            bbox_area = _estimate_area_from_bounds(
+                                coords, area_conversion_factor
+                            )
+                            if bbox_area > 0:
+                                areas.append(bbox_area)
+                                bbox_fallback_count += 1
+                                polygon_type_stats["Polygon_bbox"] = (
+                                    polygon_type_stats.get("Polygon_bbox", 0) + 1
+                                )
+                            else:
+                                geometry_skip_count += 1
+                                polygon_type_stats["Polygon_skipped"] = (
+                                    polygon_type_stats.get("Polygon_skipped", 0) + 1
+                                )
                         valid_polygons += 1
                     elif geom_type == "MultiPolygon":
@@ -152,12 +284,28 @@ def analyze_geojson(
                         # Calculate area from coordinates using shapely
                         try:
-                            for polygon in coords:
-                                poly = ShapelyPolygon(polygon[0])
-                                area_ha = abs(poly.area) * 1232100
-                                areas.append(area_ha)
-                        except:
-                            pass  # Skip if calculation fails
+                            # Use shapely.geometry.shape to properly handle MultiPolygon
+                            geom = shapely_shape(feature["geometry"])
+                            # Convert using detected CRS - use total area of all parts
+                            area_ha = abs(geom.area) * area_conversion_factor
+                            areas.append(area_ha)
+                        except Exception as e:
+                            # Fallback: estimate from bounding box if geometry fails
+                            bbox_area = _estimate_area_from_bounds(
+                                coords, area_conversion_factor
+                            )
+                            if bbox_area > 0:
+                                areas.append(bbox_area)
+                                bbox_fallback_count += 1
+                                polygon_type_stats["MultiPolygon_bbox"] = (
+                                    polygon_type_stats.get("MultiPolygon_bbox", 0) + 1
+                                )
+                            else:
+                                geometry_skip_count += 1
+                                polygon_type_stats["MultiPolygon_skipped"] = (
+                                    polygon_type_stats.get("MultiPolygon_skipped", 0)
+                                    + 1
+                                )
                         valid_polygons += 1
                 except:
@@ -312,6 +460,21 @@ def analyze_geojson(
                             else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
                         )
+        # Add geometry quality logging to results
+        if bbox_fallback_count > 0 or geometry_skip_count > 0:
+            geometry_quality_log = (
+                f"Geometry quality summary:\n"
+                f"  - Bounding box fallback used: {bbox_fallback_count} features\n"
+                f"  - Geometries skipped: {geometry_skip_count} features"
+            )
+            if polygon_type_stats:
+                geometry_quality_log += "\n  - Breakdown:"
+                for stat_type, count in sorted(polygon_type_stats.items()):
+                    geometry_quality_log += f"\n    - {stat_type}: {count}"
+            results["geometry_quality_note"] = geometry_quality_log
+            print(geometry_quality_log)
         return results
     except Exception as e:

openforis_whisp/data_conversion.py CHANGED Viewed

@@ -12,67 +12,81 @@ import geopandas as gpd
 import ee
-def convert_geojson_to_ee(
-    geojson_filepath: Union[str, Path, dict],
-    enforce_wgs84: bool = True,
-    strip_z_coords: bool = True,
-) -> ee.FeatureCollection:
+# ============================================================================
+# HELPER FUNCTIONS FOR UNIFIED PROCESSING PATHWAY
+# ============================================================================
+def _sanitize_geodataframe(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     """
-    Converts GeoJSON data to an Earth Engine FeatureCollection.
-    Accepts either a file path or a GeoJSON dictionary object.
-    Optionally checks and converts the CRS to WGS 84 (EPSG:4326) if needed.
-    Automatically handles 3D coordinates by stripping Z values when necessary.
+    Sanitize GeoDataFrame data types for JSON serialization.
+    Converts problematic data types that cannot be directly serialized:
+    - DateTime/Timestamp columns → ISO format strings
+    - Object columns → strings
+    - Skips geometry column
     Args:
-        geojson_filepath (Union[str, Path, dict]): The filepath to the GeoJSON file (str or Path)
-                                                    or a GeoJSON dictionary object.
-        enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326). Defaults to True.
-                              Only applies when input is a file path (dicts are assumed to be in WGS84).
-        strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries. Defaults to True.
+        gdf (gpd.GeoDataFrame): Input GeoDataFrame
     Returns:
-        ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
+        gpd.GeoDataFrame: GeoDataFrame with sanitized data types
+    """
+    gdf = gdf.copy()
+    for col in gdf.columns:
+        if col != gdf.geometry.name:  # Skip geometry column
+            # Handle datetime/timestamp columns
+            if pd.api.types.is_datetime64_any_dtype(gdf[col]):
+                gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
+            # Handle other problematic types
+            elif gdf[col].dtype == "object":
+                # Convert any remaining non-serializable objects to strings
+                gdf[col] = gdf[col].astype(str)
+    return gdf
+def _ensure_wgs84_crs(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
     """
-    if isinstance(geojson_filepath, dict):
-        # Input is already a GeoJSON dictionary - skip file reading
-        geojson_data = geojson_filepath
-    elif isinstance(geojson_filepath, (str, Path)):
-        file_path = os.path.abspath(geojson_filepath)
+    Ensure GeoDataFrame uses WGS 84 (EPSG:4326) coordinate reference system.
-        # Use GeoPandas to read the file and handle CRS
-        gdf = gpd.read_file(file_path)
+    - If CRS is None, assumes WGS 84
+    - If CRS is not WGS 84, converts to WGS 84
+    - If already WGS 84, returns unchanged
-        # NEW: Handle problematic data types before JSON conversion
-        for col in gdf.columns:
-            if col != gdf.geometry.name:  # Skip geometry column
-                # Handle datetime/timestamp columns
-                if pd.api.types.is_datetime64_any_dtype(gdf[col]):
-                    gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
-                # Handle other problematic types
-                elif gdf[col].dtype == "object":
-                    # Convert any remaining non-serializable objects to strings
-                    gdf[col] = gdf[col].astype(str)
-        # Check and convert CRS if needed
-        if enforce_wgs84:
-            if gdf.crs is None:
-                # Assuming WGS 84 if no CRS defined
-                pass
-            elif gdf.crs != "EPSG:4326":
-                gdf = gdf.to_crs("EPSG:4326")
-        # Convert to GeoJSON
-        geojson_data = json.loads(gdf.to_json())
-    else:
-        raise ValueError(
-            "Input must be a file path (str or Path) or a GeoJSON dictionary object (dict)"
-        )
+    Args:
+        gdf (gpd.GeoDataFrame): Input GeoDataFrame
-    validation_errors = validate_geojson(geojson_data)
-    if validation_errors:
-        raise ValueError(f"GeoJSON validation errors: {validation_errors}")
+    Returns:
+        gpd.GeoDataFrame: GeoDataFrame in WGS 84
+    """
+    if gdf.crs is None:
+        # Assuming WGS 84 if no CRS defined
+        return gdf
+    elif gdf.crs != "EPSG:4326":
+        return gdf.to_crs("EPSG:4326")
+    return gdf
+def _create_ee_feature_collection(
+    geojson_data: dict, strip_z_coords: bool = True, input_source: str = "input"
+) -> ee.FeatureCollection:
+    """
+    Create Earth Engine FeatureCollection from GeoJSON dict with error recovery.
+    Attempts to create EE FeatureCollection. If it fails due to 3D coordinates
+    and strip_z_coords is True, automatically strips Z values and retries.
+    Args:
+        geojson_data (dict): GeoJSON data dictionary
+        strip_z_coords (bool): Whether to retry with 2D geometries on failure
+        input_source (str): Description of input source for logging
+    Returns:
+        ee.FeatureCollection: Earth Engine FeatureCollection
-    # Try to create the feature collection, handle 3D coordinate issues automatically
+    Raises:
+        ee.EEException: If conversion fails even after retries
+    """
     try:
         feature_collection = ee.FeatureCollection(
             create_feature_collection(geojson_data)
@@ -81,16 +95,16 @@ def convert_geojson_to_ee(
     except ee.EEException as e:
         if "Invalid GeoJSON geometry" in str(e) and strip_z_coords:
             # Apply print_once deduplication for Z-coordinate stripping messages
-            if not hasattr(convert_geojson_to_ee, "_printed_z_messages"):
-                convert_geojson_to_ee._printed_z_messages = set()
+            if not hasattr(_create_ee_feature_collection, "_printed_z_messages"):
+                _create_ee_feature_collection._printed_z_messages = set()
-            z_message_key = f"z_coords_{file_path}"
-            if z_message_key not in convert_geojson_to_ee._printed_z_messages:
+            z_message_key = f"z_coords_{input_source}"
+            if z_message_key not in _create_ee_feature_collection._printed_z_messages:
                 print(
                     "Warning: Invalid GeoJSON geometry detected, likely due to 3D coordinates."
                 )
                 print("Attempting to fix by stripping Z coordinates...")
-                convert_geojson_to_ee._printed_z_messages.add(z_message_key)
+                _create_ee_feature_collection._printed_z_messages.add(z_message_key)
             # Apply Z-coordinate stripping
             geojson_data_fixed = _strip_z_coordinates_from_geojson(geojson_data)
@@ -101,10 +115,15 @@ def convert_geojson_to_ee(
                     create_feature_collection(geojson_data_fixed)
                 )
-                success_message_key = f"z_coords_success_{file_path}"
-                if success_message_key not in convert_geojson_to_ee._printed_z_messages:
+                success_message_key = f"z_coords_success_{input_source}"
+                if (
+                    success_message_key
+                    not in _create_ee_feature_collection._printed_z_messages
+                ):
                     print("Successfully converted after stripping Z coordinates")
-                    convert_geojson_to_ee._printed_z_messages.add(success_message_key)
+                    _create_ee_feature_collection._printed_z_messages.add(
+                        success_message_key
+                    )
                 return feature_collection
             except Exception as retry_error:
@@ -115,6 +134,82 @@ def convert_geojson_to_ee(
             raise e
+def convert_geojson_to_ee(
+    geojson_input: Union[str, Path, dict, gpd.GeoDataFrame],
+    enforce_wgs84: bool = True,
+    strip_z_coords: bool = True,
+) -> ee.FeatureCollection:
+    """
+    Converts GeoJSON data to an Earth Engine FeatureCollection.
+    Accepts flexible input types with a unified processing pathway:
+    - File path (str or Path) → loads with GeoPandas
+    - GeoJSON dict → uses directly
+    - GeoDataFrame → uses directly
+    Automatically handles:
+    - CRS conversion to WGS 84 (EPSG:4326) if needed
+    - DateTime/Timestamp columns → converts to ISO strings before JSON serialization
+    - Non-serializable objects → converts to strings
+    - 3D coordinates → strips Z values when necessary
+    - Z-coordinate errors → retries with 2D geometries if enabled
+    Args:
+        geojson_input (Union[str, Path, dict, gpd.GeoDataFrame]):
+            - File path (str or Path) to GeoJSON file
+            - GeoJSON dictionary object
+            - GeoPandas GeoDataFrame
+        enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326).
+            Defaults to True. Only applies to file path and GeoDataFrame inputs.
+        strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries.
+            Defaults to True.
+    Returns:
+        ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
+    Raises:
+        ValueError: If input type is unsupported or GeoJSON validation fails.
+        ee.EEException: If GeoJSON cannot be converted even after retries.
+    """
+    # UNIFIED INPUT NORMALIZATION: Convert all inputs to GeoDataFrame first
+    if isinstance(geojson_input, gpd.GeoDataFrame):
+        gdf = geojson_input.copy()
+        input_source = "GeoDataFrame"
+    elif isinstance(geojson_input, dict):
+        # Convert dict to GeoDataFrame for unified processing
+        gdf = gpd.GeoDataFrame.from_features(geojson_input.get("features", []))
+        input_source = "dict"
+    elif isinstance(geojson_input, (str, Path)):
+        # Load file and convert to GeoDataFrame
+        file_path = os.path.abspath(geojson_input)
+        gdf = gpd.read_file(file_path)
+        input_source = f"file ({file_path})"
+    else:
+        raise ValueError(
+            f"Input must be a file path (str or Path), GeoJSON dict, or GeoDataFrame. "
+            f"Got {type(geojson_input).__name__}"
+        )
+    # UNIFIED DATA SANITIZATION PATHWAY
+    # Handle problematic data types before JSON conversion
+    gdf = _sanitize_geodataframe(gdf)
+    # UNIFIED CRS HANDLING
+    if enforce_wgs84:
+        gdf = _ensure_wgs84_crs(gdf)
+    # UNIFIED GEOJSON CONVERSION
+    geojson_data = json.loads(gdf.to_json())
+    # UNIFIED VALIDATION
+    validation_errors = validate_geojson(geojson_data)
+    if validation_errors:
+        raise ValueError(f"GeoJSON validation errors: {validation_errors}")
+    # UNIFIED EE CONVERSION with error recovery
+    return _create_ee_feature_collection(geojson_data, strip_z_coords, input_source)
 def _strip_z_coordinates_from_geojson(geojson_data: dict) -> dict:
     """
     Helper function to strip Z coordinates from GeoJSON data.

openforis_whisp/reformat.py CHANGED Viewed

@@ -125,7 +125,7 @@ def validate_dataframe(
     Returns:
         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
     """
-    log_missing_columns(df_stats, schema)
+    _log_missing_columns(df_stats, schema)
     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
@@ -251,7 +251,7 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
 #     return logger
-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+def _log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
     # Initialize the logger
     logger = setup_logger(__name__)
@@ -675,33 +675,6 @@ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
 # Fix the duplicate logging issue
-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
-    # Remove the duplicate logger creation line
-    # logger = setup_logger(__name__)  # DELETE THIS LINE
-    # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
-    # Extract the expected columns from the DataFrameSchema
-    template_columns = list(template_schema.columns.keys())
-    df_stats_columns = df_stats.columns.tolist()
-    # Find missing and extra columns
-    missing_in_df = [col for col in template_columns if col not in df_stats_columns]
-    extra_in_df = [col for col in df_stats_columns if col not in template_columns]
-    # Log missing schema columns
-    if missing_in_df:
-        logger.warning(f"Missing expected schema columns: {missing_in_df}")
-    else:
-        logger.info("All expected schema columns found in DataFrame.")
-    # Log extra columns (will be preserved)
-    if extra_in_df:
-        logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
-    else:
-        logger.info("No extra columns found in DataFrame.")
 def format_stats_dataframe(
     df,
     area_col="Area_sum",

openforis_whisp/stats.py CHANGED Viewed

@@ -93,7 +93,6 @@ def whisp_formatted_stats_geojson_to_df_legacy(
     unit_type="ha",
     whisp_image=None,
     custom_bands=None,  # New parameter
-    validate_geometries: bool = False,
 ) -> pd.DataFrame:
     """
         Legacy function for basic Whisp stats extraction.
@@ -135,48 +134,15 @@ def whisp_formatted_stats_geojson_to_df_legacy(
             - List of band names: ['Aa_test', 'elevation']
             - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
             - None: preserves all extra columns automatically
-        validate_geometries : bool, optional
-            Whether to validate and fix invalid geometries, by default False.
-            Set to True to automatically fix invalid/self-intersecting polygons.
     Returns
         -------
         df_stats : pd.DataFrame
             The DataFrame containing the Whisp stats for the input ROI.
     """
-    # Load GeoJSON and validate geometries if requested
-    if validate_geometries:
-        import json
-        import geopandas as gpd
-        from shapely.validation import make_valid
-        import logging as py_logging
-        logger = py_logging.getLogger("whisp-legacy")
-        # Load GeoJSON file
-        with open(input_geojson_filepath, "r") as f:
-            geojson_data = json.load(f)
-        # Convert to GeoDataFrame
-        gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
-        # Validate and fix invalid geometries
-        valid_count = gdf.geometry.is_valid.sum()
-        invalid_count = len(gdf) - valid_count
-        if invalid_count > 0:
-            logger.warning(f"Fixing {invalid_count} invalid geometries")
-            gdf["geometry"] = gdf["geometry"].apply(
-                lambda g: make_valid(g) if g and not g.is_valid else g
-            )
-        # Convert back to GeoJSON dict (stays in memory - no temp files!)
-        geojson_cleaned = json.loads(gdf.to_json())
-        # OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
-        feature_collection = convert_geojson_to_ee(geojson_cleaned)
-    else:
-        # Original path - no validation
-        feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
+    # Convert GeoJSON to Earth Engine FeatureCollection
+    # Note: Geometry validation/cleaning should be done before calling this function
+    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
     return whisp_formatted_stats_ee_to_df(
         feature_collection,
@@ -200,7 +166,7 @@ def whisp_formatted_stats_geojson_to_df(
     mode: str = "sequential",
     batch_size: int = 10,
     max_concurrent: int = 20,
-    validate_geometries: bool = False,
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.
@@ -248,11 +214,16 @@ def whisp_formatted_stats_geojson_to_df(
     max_concurrent : int, optional
         Maximum concurrent EE calls for concurrent mode, by default 20.
         Only applicable for "concurrent" mode.
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.
-        For production workflows, it's recommended to use geometry validation and
-        cleaning tools BEFORE processing with this function.
+    geometry_audit_trail : bool, default False
+        If True, includes audit trail columns:
+        - geo_original: Original input geometry
+        - geometry_type_original: Original geometry type
+        - geometry_type: Processed geometry type (from EE)
+        - geometry_type_changed: Boolean flag if geometry changed
+        - geometry_degradation_type: Description of how it changed
+        Processing metadata stored in df.attrs['processing_metadata'].
+        These columns enable full transparency for geometry modifications during processing.
     Returns
     -------
@@ -317,7 +288,6 @@ def whisp_formatted_stats_geojson_to_df(
             unit_type=unit_type,
             whisp_image=whisp_image,
             custom_bands=custom_bands,
-            validate_geometries=validate_geometries,
         )
     elif mode in ("concurrent", "sequential"):
         # Log info if batch_size or max_concurrent are not used in sequential mode
@@ -344,7 +314,7 @@ def whisp_formatted_stats_geojson_to_df(
             mode=mode,  # Pass mode directly (concurrent or sequential)
             batch_size=batch_size,
             max_concurrent=max_concurrent,
-            validate_geometries=validate_geometries,
+            geometry_audit_trail=geometry_audit_trail,
         )
     else:
         raise ValueError(

openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl