openforis-whisp 3.0.0a1__tar.gz → 3.0.0a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/PKG-INFO +1 -1
  2. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/pyproject.toml +1 -1
  3. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/advanced_stats.py +387 -29
  4. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/data_checks.py +178 -15
  5. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/data_conversion.py +154 -59
  6. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/stats.py +21 -6
  7. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/LICENSE +0 -0
  8. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/README.md +0 -0
  9. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/__init__.py +0 -0
  10. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/datasets.py +0 -0
  11. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/logger.py +0 -0
  12. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/__init__.py +0 -0
  13. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/config_runtime.py +0 -0
  14. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/lookup_context_and_metadata.csv +0 -0
  15. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/lookup_gaul1_admin.py +0 -0
  16. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/lookup_gee_datasets.csv +0 -0
  17. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/pd_schemas.py +0 -0
  18. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/reformat.py +0 -0
  19. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/risk.py +0 -0
  20. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: openforis-whisp
- Version: 3.0.0a1
+ Version: 3.0.0a2
  Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
  License: MIT
  Keywords: whisp,geospatial,data-processing
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "openforis-whisp"
- version = "3.0.0a1"
+ version = "3.0.0a2"
  description = "Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations."
  repository = "https://github.com/forestdatapartnership/whisp"
  authors = ["Andy Arnell <andrew.arnell@fao.org>"]
@@ -32,7 +32,7 @@ import os
  import subprocess
  from contextlib import redirect_stdout, contextmanager
  from pathlib import Path
- from typing import Optional, List, Dict, Any, Tuple
+ from typing import Optional, List, Dict, Any, Tuple, Union
  from concurrent.futures import ThreadPoolExecutor, as_completed
  import tempfile
 
@@ -203,6 +203,57 @@ def _extract_decimal_places(format_string: str) -> int:
      return 2  # Default to 2 decimal places
 
 
+ def _normalize_keep_external_columns(
+     keep_external_columns: Union[bool, List[str]],
+     all_columns: List[str],
+     plot_id_column: str = "plotId",
+ ) -> List[str]:
+     """
+     Normalize the keep_external_columns parameter to a list of column names.
+
+     Converts flexible user input (bool or list) to a concrete list of columns to keep.
+
+     Parameters
+     ----------
+     keep_external_columns : bool or List[str]
+         - False: keep nothing (return an empty list)
+         - True: keep all columns except geometry and the plot ID
+         - List[str]: keep the named columns (returned as-is)
+     all_columns : List[str]
+         All available columns to choose from
+     plot_id_column : str
+         Name of the plot ID column to exclude
+
+     Returns
+     -------
+     List[str]
+         Columns to keep from the external (GeoJSON) data
+
+     Examples
+     --------
+     >>> _normalize_keep_external_columns(False, ["id", "Country", "geometry"], "id")
+     []
+     >>> _normalize_keep_external_columns(True, ["id", "Country", "geometry"], "id")
+     ['Country']
+     >>> _normalize_keep_external_columns(["Country"], ["id", "Country", "geometry"], "id")
+     ['Country']
+     """
+     if keep_external_columns is True:
+         # Keep all columns except geometry and the plot ID
+         return [c for c in all_columns if c not in [plot_id_column, "geometry"]]
+     elif keep_external_columns is False:
+         # Keep nothing
+         return []
+     else:
+         # Use the provided list (handle the None case)
+         return keep_external_columns or []
+
+
  def _add_admin_context(
      df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
  ) -> pd.DataFrame:
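The new helper is small enough to sanity-check directly. A minimal usage sketch (assuming openforis-whisp 3.0.0a2 is installed; `_normalize_keep_external_columns` is a private helper, so this import path is an assumption):

    from openforis_whisp.advanced_stats import _normalize_keep_external_columns

    cols = ["plotId", "Country", "farm_ref", "geometry"]

    print(_normalize_keep_external_columns(False, cols))        # [] - keep nothing
    print(_normalize_keep_external_columns(True, cols))         # ['Country', 'farm_ref']
    print(_normalize_keep_external_columns(["Country"], cols))  # ['Country'] - passed through
    print(_normalize_keep_external_columns(None, cols))         # [] - None falls back to empty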
@@ -226,7 +277,7 @@ def _add_admin_context(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if admin_code_col not in df.columns:
@@ -347,7 +398,7 @@ def join_admin_codes(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if id_col not in df.columns:
@@ -408,8 +459,9 @@ class ProgressTracker:
      """
      Track batch processing progress with time estimation.
 
-     Shows progress at key milestones (25%, 50%, 75%, 100%) with estimated
-     time remaining based on processing speed.
+     Shows progress at adaptive milestones (more frequent for small datasets,
+     less frequent for large datasets) with estimated time remaining based on
+     processing speed.
      """
 
      def __init__(self, total: int, logger: logging.Logger = None):
@@ -426,8 +478,19 @@ class ProgressTracker:
          self.total = total
          self.completed = 0
          self.lock = threading.Lock()
-         self.logger = logger or logging.getLogger("whisp-concurrent")
-         self.milestones = {25, 50, 75, 100}
+         self.logger = logger or logging.getLogger("whisp")
+
+         # Adaptive milestones based on dataset size
+         # Small datasets (< 50): show every 25% (not too spammy)
+         # Medium (50-499): show every 20%
+         # Large (500+): show every 10% (more frequent feedback on long runs)
+         if total < 50:
+             self.milestones = {25, 50, 75, 100}
+         elif total < 500:
+             self.milestones = {20, 40, 60, 80, 100}
+         else:
+             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+
          self.shown_milestones = set()
          self.start_time = time.time()
          self.last_update_time = self.start_time
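For reference, the milestone selection can be reproduced in isolation to see which progress lines a given run will emit. A small sketch with the thresholds copied from the diff above:

    def milestones_for(total: int) -> set:
        # Mirrors ProgressTracker.__init__: finer-grained reporting for larger runs
        if total < 50:
            return {25, 50, 75, 100}
        elif total < 500:
            return {20, 40, 60, 80, 100}
        return {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}

    for total in (10, 200, 5000):
        print(total, sorted(milestones_for(total)))
    # 10 -> [25, 50, 75, 100]; 200 -> [20, 40, 60, 80, 100]; 5000 -> every 10%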
@@ -544,9 +607,11 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
          )
          msg += "ee.Reset()\n"
          if endpoint_type == "high-volume":
-             msg += " ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
+             msg += (
+                 "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
+             )
          else:
-             msg += " ee.Initialize()  # Uses standard endpoint by default"
+             msg += "ee.Initialize()  # Uses standard endpoint by default"
 
      if raise_error:
          raise RuntimeError(msg)
@@ -713,8 +778,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      """
      Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
 
-     OPTIMIZATION: Uses GeoJSON dict input directly to avoid temp file I/O.
-     This provides ~67% performance improvement over writing to disk.
+     OPTIMIZATION: Passes the GeoDataFrame directly to convert_geojson_to_ee to preserve CRS.
+     This ensures proper coordinate system handling and reprojection to WGS84 if needed.
 
      Preserves the __row_id__ column if present so it can be retrieved after processing.
 
@@ -728,10 +793,13 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      ee.FeatureCollection
          EE FeatureCollection with __row_id__ as a feature property
      """
-     # OPTIMIZATION: Convert to GeoJSON dict and pass directly
-     # This eliminates the need to write to/read from temp files (~67% faster)
-     geojson_dict = json.loads(batch_gdf.to_json())
-     fc = convert_geojson_to_ee(geojson_dict)
+     # Pass the GeoDataFrame directly to preserve CRS metadata.
+     # convert_geojson_to_ee will handle:
+     # - CRS detection and conversion to WGS84 if needed
+     # - data type sanitization (datetime, object columns)
+     # - geometry validation and Z-coordinate stripping
+     fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
 
      # If __row_id__ is in the original GeoDataFrame, it will be preserved
      # as a feature property in the GeoJSON and thus in the EE FeatureCollection
@@ -763,7 +831,7 @@ def clean_geodataframe(
      gpd.GeoDataFrame
          Cleaned GeoDataFrame
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      if remove_nulls:
          null_count = gdf.geometry.isna().sum()
@@ -828,7 +896,7 @@ def process_ee_batch(
      RuntimeError
          If processing fails after all retries
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      for attempt in range(max_retries):
          try:
@@ -955,7 +1023,7 @@ def whisp_stats_geojson_to_df_concurrent(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
      _suppress_verbose_output(max_concurrent=max_concurrent)
@@ -978,6 +1046,16 @@ def whisp_stats_geojson_to_df_concurrent(
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)
 
+     # Strip unnecessary properties before sending to EE.
+     # Keep only: geometry, plot_id_column, and external_id_column.
+     # This prevents duplication of GeoJSON properties in EE results.
+     keep_cols = ["geometry", plot_id_column]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+
      # Create image if not provided
      if whisp_image is None:
          logger.debug("Creating Whisp image...")
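The stripping step is ordinary GeoDataFrame column selection. A standalone sketch of the same idea, with hypothetical column names ("farm_ref" standing in for an external ID):

    import geopandas as gpd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(
        {"plotId": [1, 2], "farm_ref": ["A", "B"], "notes": ["x", "y"]},
        geometry=[Point(0, 0), Point(1, 1)],
        crs="EPSG:4326",
    )

    keep_cols = ["geometry", "plotId", "farm_ref"]  # external ID kept, "notes" dropped
    gdf_for_ee = gdf[keep_cols].copy()
    print(list(gdf_for_ee.columns))  # ['geometry', 'plotId', 'farm_ref']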
@@ -1001,8 +1079,8 @@ def whisp_stats_geojson_to_df_concurrent(
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
      # Batch the data
-     batches = batch_geodataframe(gdf, batch_size)
-     logger.info(f"Processing {len(gdf):,} features in {len(batches)} batches")
+     batches = batch_geodataframe(gdf_for_ee, batch_size)
+     logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
 
      # Setup semaphore for EE concurrency control
      ee_semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -1064,8 +1142,35 @@ def whisp_stats_geojson_to_df_concurrent(
      if plot_id_column not in df_server.columns:
          df_server[plot_id_column] = range(len(df_server))
 
-     merged = df_server.merge(
-         df_client,
+     # Keep all EE statistics from the server (all columns with _sum and _median
+     # suffixes); these are the actual EE processing results.
+     df_server_clean = df_server.copy()
+
+     # Keep external metadata from the client: plot_id, external_id, geometry,
+     # geometry type, and centroids (the formatted wrapper handles the
+     # keep_external_columns parameter).
+     keep_external_columns = [plot_id_column]
+     if external_id_column and external_id_column in df_client.columns:
+         keep_external_columns.append(external_id_column)
+     if "geometry" in df_client.columns:
+         keep_external_columns.append("geometry")
+     # Keep the geometry type column (Geometry_type)
+     if geometry_type_column in df_client.columns:
+         keep_external_columns.append(geometry_type_column)
+     # Also keep the centroid columns (Centroid_lon, Centroid_lat)
+     centroid_cols = [c for c in df_client.columns if c.startswith("Centroid_")]
+     keep_external_columns.extend(centroid_cols)
+
+     df_client_clean = df_client[
+         [c for c in keep_external_columns if c in df_client.columns]
+     ].drop_duplicates()
+
+     merged = df_server_clean.merge(
+         df_client_clean,
          on=plot_id_column,
          how="left",
          suffixes=("_ee", "_client"),
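The reworked merge is a keyed left join: EE statistics from the server on one side, client-held metadata (IDs, geometry, centroids) on the other. A toy sketch of its shape, with hypothetical column names:

    import pandas as pd

    df_server = pd.DataFrame(
        {"plotId": [1, 2], "Forest_sum": [4.2, 0.0], "admin_code_median": [12, 34]}
    )
    df_client = pd.DataFrame(
        {
            "plotId": [1, 2],
            "Geometry_type": ["Polygon", "Point"],
            "Centroid_lon": [10.1, 10.9],
            "Centroid_lat": [-1.2, -1.5],
        }
    ).drop_duplicates()

    merged = df_server.merge(df_client, on="plotId", how="left", suffixes=("_ee", "_client"))
    print(merged.columns.tolist())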
@@ -1442,7 +1547,7 @@ def whisp_stats_geojson_to_df_sequential(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (sequential has lower concurrency, use default)
      _suppress_verbose_output(max_concurrent=1)
@@ -1469,6 +1574,16 @@ def whisp_stats_geojson_to_df_sequential(
      row_id_col = "__row_id__"
      gdf[row_id_col] = range(len(gdf))
 
+     # Strip unnecessary properties before sending to EE.
+     # Keep only: geometry, plot_id_column, row_id_col, and external_id_column.
+     # This prevents duplication of GeoJSON properties in EE results.
+     keep_cols = ["geometry", plot_id_column, row_id_col]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+
      # Create image if not provided
      if whisp_image is None:
          logger.debug("Creating Whisp image...")
@@ -1491,7 +1606,7 @@ def whisp_stats_geojson_to_df_sequential(
      # Convert to EE (suppress print statements from convert_geojson_to_ee)
      logger.debug("Converting to EE FeatureCollection...")
      with redirect_stdout(io.StringIO()):
-         fc = convert_geojson_to_ee(input_geojson_filepath)
+         fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
      # Create reducer
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1633,6 +1748,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON concurrently with automatic formatting and validation.
@@ -1683,15 +1799,26 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     include_geometry_audit_trail : bool, default False
+         If True, includes audit trail columns:
+         - geo_original: Original input geometry (before EE processing)
+         - geometry_type_original: Original geometry type
+         - geometry_type: Processed geometry type (from EE)
+         - geometry_type_changed: Boolean flag if the geometry type changed
+         - geometry_type_transition: Description of how it changed
+         These columns enable full transparency and auditability for compliance tracking.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
@@ -1699,6 +1826,9 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          decimal_places = _extract_decimal_places(stats_area_columns_formatting)
          logger.debug(f"Using decimal_places={decimal_places} from config")
 
+     # Normalize keep_external_columns parameter early (will be used in merge logic later)
+     # Load GeoJSON temporarily to get column names for normalization
+
      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (concurrent)...")
      df_raw = whisp_stats_geojson_to_df_concurrent(
@@ -1759,6 +1889,113 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation, to preserve the columns)
+     if include_geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Capture original geometries AFTER we have the raw stats
+             logger.debug("Capturing original geometries for audit trail...")
+             gdf_original = _load_geojson_silently(input_geojson_filepath)
+
+             # Use plotId from df_validated to maintain the mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original)],
+                     "geo_original": gdf_original["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                     "geometry_type_original": gdf_original["geometry"].geom_type.values,
+                 }
+             )
+
+             # Merge the original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+
+             # Extract the geometry type from the processed 'geo' column if it exists.
+             # Note: 'geo' may not exist after validation removes extra columns.
+             if "geo" in df_validated.columns:
+
+                 def extract_geom_type(x):
+                     try:
+                         if isinstance(x, dict):
+                             return x.get("type")
+                         elif isinstance(x, str):
+                             # Handle both JSON strings and Python dict string representations
+                             try:
+                                 parsed = json.loads(x)
+                             except Exception:
+                                 # Try ast.literal_eval for Python dict representations
+                                 import ast
+
+                                 parsed = ast.literal_eval(x)
+                             return parsed.get("type") if isinstance(parsed, dict) else None
+                     except Exception:
+                         pass
+                     return None
+
+                 df_validated["geometry_type"] = df_validated["geo"].apply(extract_geom_type)
+             else:
+                 # If 'geo' doesn't exist, fall back to the original type
+                 df_validated["geometry_type"] = df_validated["geometry_type_original"]
+
+             # Flag whether the geometry type changed
+             df_validated["geometry_type_changed"] = (
+                 df_validated["geometry_type_original"] != df_validated["geometry_type"]
+             )
+
+             # Classify the geometry type transition
+             def classify_transition(orig, proc):
+                 if orig == proc:
+                     return "no_change"
+                 elif proc == "LineString":
+                     return f"{orig}_simplified_to_linestring"
+                 elif proc == "Point":
+                     return f"{orig}_simplified_to_point"
+                 else:
+                     return f"{orig}_to_{proc}"
+
+             df_validated["geometry_type_transition"] = df_validated.apply(
+                 lambda row: classify_transition(
+                     row["geometry_type_original"], row["geometry_type"]
+                 ),
+                 axis=1,
+             )
+
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "2.0",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "concurrent",
+                 "ee_endpoint": "high_volume",
+                 "validate_geometries": validate_geometries,
+                 "datasets_used": national_codes or [],
+                 "include_geometry_audit_trail": True,
+             }
+
+             logger.info(
+                 f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
+             )
+
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without the audit trail if something fails
+
+     # Add a processing metadata column using pd.concat to avoid a fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+
      logger.info("Concurrent processing + formatting + validation complete")
      return df_validated
 
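A usage sketch for the new flag (the input path is hypothetical, and Earth Engine must already be initialized; note that the final pd.concat step may not propagate df.attrs on all pandas versions):

    from openforis_whisp.advanced_stats import whisp_formatted_stats_geojson_to_df_concurrent

    df = whisp_formatted_stats_geojson_to_df_concurrent(
        "plots.geojson",  # hypothetical input file
        include_geometry_audit_trail=True,
    )

    # Plots whose geometry type changed during EE processing
    changed = df[df["geometry_type_changed"]]
    print(changed[["plotId", "geometry_type_original", "geometry_type", "geometry_type_transition"]])
    print(df.attrs.get("processing_metadata", {}))  # may be empty if attrs were dropped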
@@ -1779,6 +2016,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON sequentially with automatic formatting and validation.
@@ -1821,15 +2059,26 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     include_geometry_audit_trail : bool, default False
+         If True, includes audit trail columns:
+         - geo_original: Original input geometry (before EE processing)
+         - geometry_type_original: Original geometry type
+         - geometry_type: Processed geometry type (from EE)
+         - geometry_type_changed: Boolean flag if the geometry type changed
+         - geometry_type_transition: Description of how it changed
+         These columns enable full transparency and auditability for EUDR compliance.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
@@ -1893,6 +2142,112 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation, to preserve the columns)
+     if include_geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Capture original geometries AFTER we have the raw stats
+             logger.debug("Capturing original geometries for audit trail...")
+             gdf_original = _load_geojson_silently(input_geojson_filepath)
+
+             # Use plotId from df_validated to maintain the mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original)],
+                     "geo_original": gdf_original["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                     "geometry_type_original": gdf_original["geometry"].geom_type.values,
+                 }
+             )
+
+             # Merge the original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+
+             # Extract the geometry type from the processed 'geo' column if it exists.
+             # Note: 'geo' may not exist after validation removes extra columns.
+             if "geo" in df_validated.columns:
+
+                 def extract_geom_type(x):
+                     try:
+                         if isinstance(x, dict):
+                             return x.get("type")
+                         elif isinstance(x, str):
+                             # Handle both JSON strings and Python dict string representations
+                             try:
+                                 parsed = json.loads(x)
+                             except Exception:
+                                 # Try ast.literal_eval for Python dict representations
+                                 import ast
+
+                                 parsed = ast.literal_eval(x)
+                             return parsed.get("type") if isinstance(parsed, dict) else None
+                     except Exception:
+                         pass
+                     return None
+
+                 df_validated["geometry_type"] = df_validated["geo"].apply(extract_geom_type)
+             else:
+                 # If 'geo' doesn't exist, fall back to the original type
+                 df_validated["geometry_type"] = df_validated["geometry_type_original"]
+
+             # Flag whether the geometry type changed
+             df_validated["geometry_type_changed"] = (
+                 df_validated["geometry_type_original"] != df_validated["geometry_type"]
+             )
+
+             # Classify the geometry type transition
+             def classify_transition(orig, proc):
+                 if orig == proc:
+                     return "no_change"
+                 elif proc == "LineString":
+                     return f"{orig}_simplified_to_linestring"
+                 elif proc == "Point":
+                     return f"{orig}_simplified_to_point"
+                 else:
+                     return f"{orig}_to_{proc}"
+
+             df_validated["geometry_type_transition"] = df_validated.apply(
+                 lambda row: classify_transition(
+                     row["geometry_type_original"], row["geometry_type"]
+                 ),
+                 axis=1,
+             )
+
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "2.0",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "sequential",
+                 "ee_endpoint": "standard",
+                 "datasets_used": national_codes or [],
+                 "include_geometry_audit_trail": True,
+             }
+
+             logger.info(
+                 f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
+             )
+
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without the audit trail if something fails
+
+     # Add a processing metadata column using pd.concat to avoid a fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+
      logger.info("Sequential processing + formatting + validation complete")
      return df_validated
 
@@ -1923,6 +2278,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -1999,7 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
      ...     mode="sequential"
      ... )
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Determine processing mode
      if mode == "auto":
@@ -2050,6 +2406,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             include_geometry_audit_trail=include_geometry_audit_trail,
          )
      else:  # sequential
          logger.debug("Routing to sequential processing...")
@@ -2067,4 +2424,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             include_geometry_audit_trail=include_geometry_audit_trail,
          )
@@ -7,10 +7,69 @@ and thresholds, raising informative errors when constraints are violated.
  import json
  from pathlib import Path
- from shapely.geometry import Polygon as ShapelyPolygon
+ from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
 
  # Note: area summary stats are estimations for use in deciding pathways for analysis
  # (estimation preferred here as allows efficient processing speed and limits overhead of checking file)
+
+
+ def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
+     """
+     Convert an area from projected CRS units to hectares.
+
+     Most projected CRS use meters as units, so:
+     - area_sq_units is in square meters
+     - 1 hectare = 10,000 m²
+
+     Args:
+         area_sq_units: Area in square units of the projection (typically square meters)
+         crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
+
+     Returns:
+         Area in hectares
+     """
+     # Standard conversion: 1 hectare = 10,000 m²
+     # Most projected CRS use meters, so this covers the common case
+     return area_sq_units / 10000
+
+
+ def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
+     """
+     Estimate the area from the bounding box when the actual area calculation fails.
+     Extracts the bounding box and calculates its area as a fallback estimate.
+     Returns the area in hectares.
+     """
+     try:
+         # Flatten all coordinates to find the bounds
+         all_coords = []
+
+         def flatten_coords(c):
+             if isinstance(c[0], (list, tuple)) and isinstance(c[0][0], (list, tuple)):
+                 for sub in c:
+                     flatten_coords(sub)
+             else:
+                 all_coords.extend(c)
+
+         flatten_coords(coords)
+         if not all_coords:
+             return 0
+
+         # Extract lon/lat values
+         lons = [c[0] for c in all_coords]
+         lats = [c[1] for c in all_coords]
+
+         min_lon, max_lon = min(lons), max(lons)
+         min_lat, max_lat = min(lats), max(lats)
+
+         # Bounding box area
+         bbox_area = (max_lon - min_lon) * (max_lat - min_lat)
+
+         # Apply the conversion factor
+         return abs(bbox_area) * area_conversion_factor
+     except Exception:
+         return 0
+
+
  def analyze_geojson(
      geojson_data: Path | str | dict,
      metrics=[
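Two conversion factors drive the area estimates above: WGS 84 areas in square degrees are scaled by roughly 1,232,100 ha per square degree (111 km per degree near the equator), while projected areas are assumed to be in m² and divided by 10,000. A quick numeric sketch:

    # Approximate conversions used by the checks above (estimates, not geodesy)
    DEG2_TO_HA = 1_232_100  # 1° ≈ 111 km near the equator, so 1 deg² ≈ 12,321 km² = 1,232,100 ha
    M2_PER_HA = 10_000      # exact: 1 ha = 10,000 m²

    print(25_000 / M2_PER_HA)   # 2.5 ha for 25,000 m² in a metric projected CRS
    print(0.0001 * DEG2_TO_HA)  # ~123 ha for a 0.01° x 0.01° bounding box in WGS 84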
@@ -76,6 +135,8 @@ def analyze_geojson(
      - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
      """
      results = {}
+     crs_warning = None
+     file_path = None
 
      try:
          # Load GeoJSON from file if path provided
@@ -83,11 +144,45 @@ def analyze_geojson(
          file_path = Path(geojson_data)
          if not file_path.exists():
              raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
-         with open(file_path, "r") as f:
-             geojson_data = json.load(f)
+
+         # Try UTF-8 first (most common), then fall back to auto-detection
+         try:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 geojson_data = json.load(f)
+         except UnicodeDecodeError:
+             # Auto-detect encoding if UTF-8 fails
+             try:
+                 import chardet
+
+                 with open(file_path, "rb") as f:
+                     raw_data = f.read()
+                 detected = chardet.detect(raw_data)
+                 encoding = detected.get("encoding", "latin-1")
+
+                 with open(file_path, "r", encoding=encoding, errors="replace") as f:
+                     geojson_data = json.load(f)
+             except Exception:
+                 # Final fallback: use latin-1, which accepts all byte values
+                 with open(file_path, "r", encoding="latin-1") as f:
+                     geojson_data = json.load(f)
+
+         # Detect CRS from the file if available
+         try:
+             import geopandas as gpd
+
+             gdf = gpd.read_file(file_path)
+             if gdf.crs and gdf.crs != "EPSG:4326":
+                 crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
+         except Exception:
+             pass  # If we can't detect CRS, continue without warning
 
      features = geojson_data.get("features", [])
 
+     # Add CRS warning to results if detected
+     if crs_warning:
+         results["crs_warning"] = crs_warning
+         print(crs_warning)
+
      if "count" in metrics:
          results["count"] = len(features)
 
@@ -113,6 +208,29 @@ def analyze_geojson(
          geometry_type_counts = {}
          valid_polygons = 0
 
+         # Tracking for fallback geometries
+         bbox_fallback_count = 0  # Geometries that used the bounding box estimate
+         geometry_skip_count = 0  # Geometries completely skipped
+         polygon_type_stats = {}  # Track stats by geometry type
+
+         # Detect CRS to determine the area conversion factor
+         area_conversion_factor = 1232100  # Default: WGS84 (square degrees to ha)
+         detected_crs = None
+
+         # Try to detect CRS from the file if available
+         if file_path:
+             try:
+                 import geopandas as gpd
+
+                 gdf_temp = gpd.read_file(str(file_path))
+                 detected_crs = gdf_temp.crs
+                 if detected_crs and detected_crs != "EPSG:4326":
+                     # Projected CRS typically use meters, so convert m² to ha
+                     # (1 ha = 10,000 m²)
+                     area_conversion_factor = 1 / 10000
+             except Exception:
+                 pass  # Use the default if CRS detection fails
+
          for feature in features:
              try:
                  coords = feature["geometry"]["coordinates"]
@@ -133,13 +251,27 @@ def analyze_geojson(
 
                      # Calculate area from coordinates using shapely
                      try:
-                         poly = ShapelyPolygon(coords[0])
-                         # Convert square degrees to hectares (near equator)
-                         # 1 degree latitude ≈ 111 km, so 1 degree² ≈ 111² km² = 12,321 km² = 1,232,100 ha
-                         area_ha = abs(poly.area) * 1232100
+                         # Use shapely.geometry.shape to properly handle all geometry components
+                         geom = shapely_shape(feature["geometry"])
+                         # Convert using the detected CRS
+                         area_ha = abs(geom.area) * area_conversion_factor
                          areas.append(area_ha)
-                     except:
-                         pass  # Skip if calculation fails
+                     except Exception:
+                         # Fallback: estimate from the bounding box if the geometry fails
+                         bbox_area = _estimate_area_from_bounds(
+                             coords, area_conversion_factor
+                         )
+                         if bbox_area > 0:
+                             areas.append(bbox_area)
+                             bbox_fallback_count += 1
+                             polygon_type_stats["Polygon_bbox"] = (
+                                 polygon_type_stats.get("Polygon_bbox", 0) + 1
+                             )
+                         else:
+                             geometry_skip_count += 1
+                             polygon_type_stats["Polygon_skipped"] = (
+                                 polygon_type_stats.get("Polygon_skipped", 0) + 1
+                             )
                      valid_polygons += 1
 
                  elif geom_type == "MultiPolygon":
@@ -152,12 +284,28 @@ def analyze_geojson(
 
                      # Calculate area from coordinates using shapely
                      try:
-                         for polygon in coords:
-                             poly = ShapelyPolygon(polygon[0])
-                             area_ha = abs(poly.area) * 1232100
-                             areas.append(area_ha)
-                     except:
-                         pass  # Skip if calculation fails
+                         # Use shapely.geometry.shape to properly handle MultiPolygon
+                         geom = shapely_shape(feature["geometry"])
+                         # Convert using the detected CRS; use the total area of all parts
+                         area_ha = abs(geom.area) * area_conversion_factor
+                         areas.append(area_ha)
+                     except Exception:
+                         # Fallback: estimate from the bounding box if the geometry fails
+                         bbox_area = _estimate_area_from_bounds(
+                             coords, area_conversion_factor
+                         )
+                         if bbox_area > 0:
+                             areas.append(bbox_area)
+                             bbox_fallback_count += 1
+                             polygon_type_stats["MultiPolygon_bbox"] = (
+                                 polygon_type_stats.get("MultiPolygon_bbox", 0) + 1
+                             )
+                         else:
+                             geometry_skip_count += 1
+                             polygon_type_stats["MultiPolygon_skipped"] = (
+                                 polygon_type_stats.get("MultiPolygon_skipped", 0) + 1
+                             )
                      valid_polygons += 1
 
              except:
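The switch from ShapelyPolygon(coords[0]) to shapely.geometry.shape matters because shape consumes the whole GeoJSON geometry, including interior rings and every MultiPolygon part, whereas the old code measured only the first exterior ring. A minimal comparison:

    from shapely.geometry import Polygon, shape

    geom = {
        "type": "Polygon",
        "coordinates": [
            [[0, 0], [4, 0], [4, 4], [0, 4], [0, 0]],  # exterior ring
            [[1, 1], [2, 1], [2, 2], [1, 2], [1, 1]],  # hole
        ],
    }

    print(Polygon(geom["coordinates"][0]).area)  # 16.0 - ignores the hole
    print(shape(geom).area)                      # 15.0 - hole subtracted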
@@ -312,6 +460,21 @@ def analyze_geojson(
          else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
      )
 
+     # Add geometry quality logging to results
+     if bbox_fallback_count > 0 or geometry_skip_count > 0:
+         geometry_quality_log = (
+             f"Geometry quality summary:\n"
+             f"  - Bounding box fallback used: {bbox_fallback_count} features\n"
+             f"  - Geometries skipped: {geometry_skip_count} features"
+         )
+         if polygon_type_stats:
+             geometry_quality_log += "\n  - Breakdown:"
+             for stat_type, count in sorted(polygon_type_stats.items()):
+                 geometry_quality_log += f"\n    - {stat_type}: {count}"
+
+         results["geometry_quality_note"] = geometry_quality_log
+         print(geometry_quality_log)
+
      return results
 
  except Exception as e:
@@ -12,67 +12,81 @@ import geopandas as gpd
  import ee
 
 
- def convert_geojson_to_ee(
-     geojson_filepath: Union[str, Path, dict],
-     enforce_wgs84: bool = True,
-     strip_z_coords: bool = True,
- ) -> ee.FeatureCollection:
+ # ============================================================================
+ # HELPER FUNCTIONS FOR UNIFIED PROCESSING PATHWAY
+ # ============================================================================
+
+
+ def _sanitize_geodataframe(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
      """
-     Converts GeoJSON data to an Earth Engine FeatureCollection.
-     Accepts either a file path or a GeoJSON dictionary object.
-     Optionally checks and converts the CRS to WGS 84 (EPSG:4326) if needed.
-     Automatically handles 3D coordinates by stripping Z values when necessary.
+     Sanitize GeoDataFrame data types for JSON serialization.
+
+     Converts problematic data types that cannot be directly serialized:
+     - DateTime/Timestamp columns → ISO format strings
+     - Object columns → strings
+     - Skips the geometry column
 
      Args:
-         geojson_filepath (Union[str, Path, dict]): The filepath to the GeoJSON file (str or Path)
-             or a GeoJSON dictionary object.
-         enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326). Defaults to True.
-             Only applies when input is a file path (dicts are assumed to be in WGS84).
-         strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries. Defaults to True.
+         gdf (gpd.GeoDataFrame): Input GeoDataFrame
 
      Returns:
-         ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
+         gpd.GeoDataFrame: GeoDataFrame with sanitized data types
+     """
+     gdf = gdf.copy()
+     for col in gdf.columns:
+         if col != gdf.geometry.name:  # Skip geometry column
+             # Handle datetime/timestamp columns
+             if pd.api.types.is_datetime64_any_dtype(gdf[col]):
+                 gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
+             # Handle other problematic types
+             elif gdf[col].dtype == "object":
+                 # Convert any remaining non-serializable objects to strings
+                 gdf[col] = gdf[col].astype(str)
+     return gdf
+
+
+ def _ensure_wgs84_crs(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
      """
-     if isinstance(geojson_filepath, dict):
-         # Input is already a GeoJSON dictionary - skip file reading
-         geojson_data = geojson_filepath
-     elif isinstance(geojson_filepath, (str, Path)):
-         file_path = os.path.abspath(geojson_filepath)
+     Ensure the GeoDataFrame uses the WGS 84 (EPSG:4326) coordinate reference system.
 
-         # Use GeoPandas to read the file and handle CRS
-         gdf = gpd.read_file(file_path)
+     - If CRS is None, assumes WGS 84
+     - If CRS is not WGS 84, converts to WGS 84
+     - If already WGS 84, returns the input unchanged
 
-         # NEW: Handle problematic data types before JSON conversion
-         for col in gdf.columns:
-             if col != gdf.geometry.name:  # Skip geometry column
-                 # Handle datetime/timestamp columns
-                 if pd.api.types.is_datetime64_any_dtype(gdf[col]):
-                     gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
-                 # Handle other problematic types
-                 elif gdf[col].dtype == "object":
-                     # Convert any remaining non-serializable objects to strings
-                     gdf[col] = gdf[col].astype(str)
-
-         # Check and convert CRS if needed
-         if enforce_wgs84:
-             if gdf.crs is None:
-                 # Assuming WGS 84 if no CRS defined
-                 pass
-             elif gdf.crs != "EPSG:4326":
-                 gdf = gdf.to_crs("EPSG:4326")
-
-         # Convert to GeoJSON
-         geojson_data = json.loads(gdf.to_json())
-     else:
-         raise ValueError(
-             "Input must be a file path (str or Path) or a GeoJSON dictionary object (dict)"
-         )
+     Args:
+         gdf (gpd.GeoDataFrame): Input GeoDataFrame
 
-     validation_errors = validate_geojson(geojson_data)
-     if validation_errors:
-         raise ValueError(f"GeoJSON validation errors: {validation_errors}")
+     Returns:
+         gpd.GeoDataFrame: GeoDataFrame in WGS 84
+     """
+     if gdf.crs is None:
+         # Assuming WGS 84 if no CRS defined
+         return gdf
+     elif gdf.crs != "EPSG:4326":
+         return gdf.to_crs("EPSG:4326")
+     return gdf
+
+
+ def _create_ee_feature_collection(
+     geojson_data: dict, strip_z_coords: bool = True, input_source: str = "input"
+ ) -> ee.FeatureCollection:
+     """
+     Create an Earth Engine FeatureCollection from a GeoJSON dict with error recovery.
+
+     Attempts to create the EE FeatureCollection. If it fails due to 3D coordinates
+     and strip_z_coords is True, automatically strips Z values and retries.
+
+     Args:
+         geojson_data (dict): GeoJSON data dictionary
+         strip_z_coords (bool): Whether to retry with 2D geometries on failure
+         input_source (str): Description of the input source for logging
+
+     Returns:
+         ee.FeatureCollection: Earth Engine FeatureCollection
 
-     # Try to create the feature collection, handle 3D coordinate issues automatically
+     Raises:
+         ee.EEException: If conversion fails even after retries
+     """
      try:
          feature_collection = ee.FeatureCollection(
              create_feature_collection(geojson_data)
@@ -81,16 +95,16 @@ def convert_geojson_to_ee(
      except ee.EEException as e:
          if "Invalid GeoJSON geometry" in str(e) and strip_z_coords:
              # Apply print_once deduplication for Z-coordinate stripping messages
-             if not hasattr(convert_geojson_to_ee, "_printed_z_messages"):
-                 convert_geojson_to_ee._printed_z_messages = set()
+             if not hasattr(_create_ee_feature_collection, "_printed_z_messages"):
+                 _create_ee_feature_collection._printed_z_messages = set()
 
-             z_message_key = f"z_coords_{file_path}"
-             if z_message_key not in convert_geojson_to_ee._printed_z_messages:
+             z_message_key = f"z_coords_{input_source}"
+             if z_message_key not in _create_ee_feature_collection._printed_z_messages:
                  print(
                      "Warning: Invalid GeoJSON geometry detected, likely due to 3D coordinates."
                  )
                  print("Attempting to fix by stripping Z coordinates...")
-                 convert_geojson_to_ee._printed_z_messages.add(z_message_key)
+                 _create_ee_feature_collection._printed_z_messages.add(z_message_key)
 
              # Apply Z-coordinate stripping
              geojson_data_fixed = _strip_z_coordinates_from_geojson(geojson_data)
@@ -101,10 +115,15 @@
                  feature_collection = ee.FeatureCollection(
                      create_feature_collection(geojson_data_fixed)
                  )
 
-                 success_message_key = f"z_coords_success_{file_path}"
-                 if success_message_key not in convert_geojson_to_ee._printed_z_messages:
+                 success_message_key = f"z_coords_success_{input_source}"
+                 if (
+                     success_message_key
+                     not in _create_ee_feature_collection._printed_z_messages
+                 ):
                      print("Successfully converted after stripping Z coordinates")
-                     convert_geojson_to_ee._printed_z_messages.add(success_message_key)
+                     _create_ee_feature_collection._printed_z_messages.add(
+                         success_message_key
+                     )
 
                  return feature_collection
              except Exception as retry_error:
@@ -115,6 +134,82 @@
                  raise e
 
 
+ def convert_geojson_to_ee(
+     geojson_input: Union[str, Path, dict, gpd.GeoDataFrame],
+     enforce_wgs84: bool = True,
+     strip_z_coords: bool = True,
+ ) -> ee.FeatureCollection:
+     """
+     Converts GeoJSON data to an Earth Engine FeatureCollection.
+
+     Accepts flexible input types with a unified processing pathway:
+     - File path (str or Path) → loaded with GeoPandas
+     - GeoJSON dict → used directly
+     - GeoDataFrame → used directly
+
+     Automatically handles:
+     - CRS conversion to WGS 84 (EPSG:4326) if needed
+     - DateTime/Timestamp columns → converted to ISO strings before JSON serialization
+     - Non-serializable objects → converted to strings
+     - 3D coordinates → Z values stripped when necessary
+     - Z-coordinate errors → retried with 2D geometries if enabled
+
+     Args:
+         geojson_input (Union[str, Path, dict, gpd.GeoDataFrame]):
+             - File path (str or Path) to a GeoJSON file
+             - GeoJSON dictionary object
+             - GeoPandas GeoDataFrame
+         enforce_wgs84 (bool): Whether to enforce the WGS 84 projection (EPSG:4326).
+             Defaults to True. Only applies to file path and GeoDataFrame inputs.
+         strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries.
+             Defaults to True.
+
+     Returns:
+         ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
+
+     Raises:
+         ValueError: If the input type is unsupported or GeoJSON validation fails.
+         ee.EEException: If the GeoJSON cannot be converted even after retries.
+     """
+     # UNIFIED INPUT NORMALIZATION: Convert all inputs to a GeoDataFrame first
+     if isinstance(geojson_input, gpd.GeoDataFrame):
+         gdf = geojson_input.copy()
+         input_source = "GeoDataFrame"
+     elif isinstance(geojson_input, dict):
+         # Convert the dict to a GeoDataFrame for unified processing
+         gdf = gpd.GeoDataFrame.from_features(geojson_input.get("features", []))
+         input_source = "dict"
+     elif isinstance(geojson_input, (str, Path)):
+         # Load the file and convert it to a GeoDataFrame
+         file_path = os.path.abspath(geojson_input)
+         gdf = gpd.read_file(file_path)
+         input_source = f"file ({file_path})"
+     else:
+         raise ValueError(
+             f"Input must be a file path (str or Path), GeoJSON dict, or GeoDataFrame. "
+             f"Got {type(geojson_input).__name__}"
+         )
+
+     # UNIFIED DATA SANITIZATION PATHWAY
+     # Handle problematic data types before JSON conversion
+     gdf = _sanitize_geodataframe(gdf)
+
+     # UNIFIED CRS HANDLING
+     if enforce_wgs84:
+         gdf = _ensure_wgs84_crs(gdf)
+
+     # UNIFIED GEOJSON CONVERSION
+     geojson_data = json.loads(gdf.to_json())
+
+     # UNIFIED VALIDATION
+     validation_errors = validate_geojson(geojson_data)
+     if validation_errors:
+         raise ValueError(f"GeoJSON validation errors: {validation_errors}")
+
+     # UNIFIED EE CONVERSION with error recovery
+     return _create_ee_feature_collection(geojson_data, strip_z_coords, input_source)
+
+
  def _strip_z_coordinates_from_geojson(geojson_data: dict) -> dict:
      """
      Helper function to strip Z coordinates from GeoJSON data.
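With the unified pathway, all three input types funnel through the same sanitize → reproject → validate → convert steps. A usage sketch (file name hypothetical; assumes Earth Engine is already initialized):

    import json
    import ee
    import geopandas as gpd
    from openforis_whisp.data_conversion import convert_geojson_to_ee

    ee.Initialize()  # or the high-volume endpoint, as shown earlier in the diff

    fc_from_path = convert_geojson_to_ee("plots.geojson")
    with open("plots.geojson") as f:
        fc_from_dict = convert_geojson_to_ee(json.load(f))
    gdf = gpd.read_file("plots.geojson").to_crs("EPSG:3857")
    fc_from_gdf = convert_geojson_to_ee(gdf)  # reprojected back to WGS 84 internally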
@@ -151,7 +151,7 @@ def whisp_formatted_stats_geojson_to_df_legacy(
      from shapely.validation import make_valid
      import logging as py_logging
 
-     logger = py_logging.getLogger("whisp-legacy")
+     logger = py_logging.getLogger("whisp")
 
      # Load GeoJSON file
      with open(input_geojson_filepath, "r") as f:
@@ -169,11 +169,14 @@ def whisp_formatted_stats_geojson_to_df_legacy(
              lambda g: make_valid(g) if g and not g.is_valid else g
          )
 
-         # Convert back to GeoJSON dict (stays in memory - no temp files!)
-         geojson_cleaned = json.loads(gdf.to_json())
-
-         # OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
-         feature_collection = convert_geojson_to_ee(geojson_cleaned)
+         # Pass the GeoDataFrame directly to preserve CRS metadata.
+         # convert_geojson_to_ee will handle:
+         # - CRS detection and conversion to WGS84 if needed
+         # - data type sanitization (datetime, object columns)
+         # - geometry validation and Z-coordinate stripping
+         feature_collection = convert_geojson_to_ee(
+             gdf, enforce_wgs84=True, strip_z_coords=True
+         )
      else:
          # Original path - no validation
          feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
@@ -201,6 +204,7 @@ def whisp_formatted_stats_geojson_to_df(
      batch_size: int = 10,
      max_concurrent: int = 20,
      validate_geometries: bool = False,
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Main entry point for converting GeoJSON to Whisp statistics.
@@ -253,6 +257,16 @@ def whisp_formatted_stats_geojson_to_df(
          Set to True to automatically fix invalid/self-intersecting polygons.
          For production workflows, it's recommended to use geometry validation and
          cleaning tools BEFORE processing with this function.
+     include_geometry_audit_trail : bool, default False
+         If True, includes audit trail columns:
+         - geo_original: Original input geometry
+         - geometry_type_original: Original geometry type
+         - geometry_type: Processed geometry type (from EE)
+         - geometry_type_changed: Boolean flag if the geometry type changed
+         - geometry_type_transition: Description of how it changed
+
+         Processing metadata is stored in df.attrs['processing_metadata'].
+         These columns enable full transparency for geometry modifications during processing.
 
      Returns
      -------
@@ -345,6 +359,7 @@ def whisp_formatted_stats_geojson_to_df(
              batch_size=batch_size,
              max_concurrent=max_concurrent,
              validate_geometries=validate_geometries,
+             include_geometry_audit_trail=include_geometry_audit_trail,
          )
      else:
          raise ValueError(