openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,7 +32,7 @@ import os
  import subprocess
  from contextlib import redirect_stdout, contextmanager
  from pathlib import Path
- from typing import Optional, List, Dict, Any, Tuple
+ from typing import Optional, List, Dict, Any, Tuple, Union
  from concurrent.futures import ThreadPoolExecutor, as_completed
  import tempfile
@@ -203,6 +203,57 @@ def _extract_decimal_places(format_string: str) -> int:
      return 2  # Default to 2 decimal places
 
 
+ def _normalize_keep_external_columns(
+     keep_external_columns: Union[bool, List[str]],
+     all_columns: List[str],
+     plot_id_column: str = "plotId",
+ ) -> List[str]:
+     """
+     Normalize the keep_external_columns parameter to a list of column names.
+ 
+     Converts flexible user input (bool or list) to a concrete list of columns to keep.
+ 
+     Parameters
+     ----------
+     keep_external_columns : bool or List[str]
+         - False: keep nothing (return empty list)
+         - True: keep all columns except geometry and plot_id
+         - List[str]: keep specific columns (return as-is)
+     all_columns : List[str]
+         All available columns to choose from
+     plot_id_column : str
+         Name of plot ID column to exclude
+ 
+     Returns
+     -------
+     List[str]
+         Columns to keep from external (GeoJSON) data
+ 
+     Examples
+     --------
+     >>> _normalize_keep_external_columns(False, ["id", "Country", "geometry"], "id")
+     []
+ 
+     >>> _normalize_keep_external_columns(True, ["id", "Country", "geometry"], "id")
+     ['Country']
+ 
+     >>> _normalize_keep_external_columns(["Country"], ["id", "Country", "geometry"], "id")
+     ['Country']
+     """
+     if keep_external_columns is True:
+         # Keep all columns except geometry and plot_id
+         return [c for c in all_columns if c not in [plot_id_column, "geometry"]]
+     elif keep_external_columns is False:
+         # Keep nothing
+         return []
+     else:
+         # Use provided list (handle None case)
+         return keep_external_columns or []
+ 
  def _add_admin_context(
      df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
  ) -> pd.DataFrame:
@@ -226,7 +277,7 @@ def _add_admin_context(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if admin_code_col not in df.columns:
@@ -347,7 +398,7 @@ def join_admin_codes(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if id_col not in df.columns:
@@ -408,8 +459,9 @@ class ProgressTracker:
      """
      Track batch processing progress with time estimation.
 
-     Shows progress at key milestones (25%, 50%, 75%, 100%) with estimated
-     time remaining based on processing speed.
+     Shows progress at adaptive milestones (more frequent for small datasets,
+     less frequent for large datasets) with estimated time remaining based on
+     processing speed.
      """
 
      def __init__(self, total: int, logger: logging.Logger = None):
@@ -426,8 +478,19 @@ class ProgressTracker:
          self.total = total
          self.completed = 0
          self.lock = threading.Lock()
-         self.logger = logger or logging.getLogger("whisp-concurrent")
-         self.milestones = {25, 50, 75, 100}
+         self.logger = logger or logging.getLogger("whisp")
+ 
+         # Adaptive milestones based on dataset size
+         # Small datasets (< 50): show every 25% (not too spammy)
+         # Medium (50-500): show every 20%
+         # Large (500+): show every 10% (more frequent feedback on long runs)
+         if total < 50:
+             self.milestones = {25, 50, 75, 100}
+         elif total < 500:
+             self.milestones = {20, 40, 60, 80, 100}
+         else:
+             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+ 
          self.shown_milestones = set()
          self.start_time = time.time()
          self.last_update_time = self.start_time
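For reference, the selection the new `__init__` makes at different dataset sizes, as a standalone sketch (`milestones_for` is an illustrative name, not part of the package API):

```python
def milestones_for(total: int) -> set:
    # Mirrors the adaptive thresholds in ProgressTracker.__init__ above.
    if total < 50:
        return {25, 50, 75, 100}
    elif total < 500:
        return {20, 40, 60, 80, 100}
    return {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}

print(sorted(milestones_for(30)))    # [25, 50, 75, 100]
print(sorted(milestones_for(200)))   # [20, 40, 60, 80, 100]
print(sorted(milestones_for(2000)))  # [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
```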
@@ -537,16 +600,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
          If incorrect endpoint and raise_error=True
      """
      if not check_ee_endpoint(endpoint_type):
-         msg = (
-             f"Not using {endpoint_type.upper()} endpoint.\n"
-             f"Current URL: {ee.data._cloud_api_base_url}\n"
-             f"\nTo use {endpoint_type} endpoint, run:\n"
-         )
-         msg += "ee.Reset()\n"
          if endpoint_type == "high-volume":
-             msg += " ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
-         else:
-             msg += " ee.Initialize() # Uses standard endpoint by default"
+             msg = (
+                 "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                 "ee.Reset()\n"
+                 "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                 "Or with project specified (e.g. when in Colab):\n"
+                 "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+             )
+         else:  # standard endpoint
+             msg = (
+                 "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
+                 "ee.Reset()\n"
+                 "ee.Initialize()\n"
+                 "Or with project specified (e.g. when in Colab):\n"
+                 "ee.Initialize(project='your_cloud_project_name')"
+             )
 
          if raise_error:
              raise RuntimeError(msg)
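The remediation the new messages prescribe, as a runnable snippet (both calls use the public earthengine-api; the high-volume URL is the one quoted in the message, and the project name is a placeholder):

```python
import ee

# Concurrent mode: re-initialize against the high-volume endpoint.
ee.Reset()
ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")

# Sequential mode: re-initialize against the standard endpoint
# (add project="your_cloud_project_name" in Colab or similar environments).
ee.Reset()
ee.Initialize()
```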
@@ -713,8 +782,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      """
      Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
 
-     OPTIMIZATION: Uses GeoJSON dict input directly to avoid temp file I/O.
-     This provides ~67% performance improvement over writing to disk.
+     OPTIMIZATION: Passes GeoDataFrame directly to convert_geojson_to_ee to preserve CRS.
+     This ensures proper coordinate system handling and reprojection to WGS84 if needed.
 
      Preserves the __row_id__ column if present so it can be retrieved after processing.
@@ -728,10 +797,13 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      ee.FeatureCollection
          EE FeatureCollection with __row_id__ as a feature property
      """
-     # OPTIMIZATION: Convert to GeoJSON dict and pass directly
-     # This eliminates the need to write to/read from temp files (~67% faster)
-     geojson_dict = json.loads(batch_gdf.to_json())
-     fc = convert_geojson_to_ee(geojson_dict)
+     # Pass GeoDataFrame directly to preserve CRS metadata
+     # convert_geojson_to_ee will handle:
+     # - CRS detection and conversion to WGS84 if needed
+     # - Data type sanitization (datetime, object columns)
+     # - Geometry validation and Z-coordinate stripping
+ 
+     fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
 
      # If __row_id__ is in the original GeoDataFrame, it will be preserved
      # as a feature property in the GeoJSON and thus in the EE FeatureCollection
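To illustrate the CRS pitfall this change avoids, a minimal GeoPandas sketch (the UTM point is a standard test coordinate; the reprojection shown is what the comments above say `convert_geojson_to_ee` now handles internally):

```python
import geopandas as gpd
from shapely.geometry import Point

# A GeoDataFrame in a projected CRS (UTM zone 33N). Serializing it via
# gdf.to_json() drops the CRS, so metre coordinates would be misread as
# lon/lat degrees.
gdf = gpd.GeoDataFrame(
    {"plotId": [1]}, geometry=[Point(500_000, 4_649_776)], crs="EPSG:32633"
)

# Passing the GeoDataFrame itself keeps gdf.crs available for reprojection:
print(gdf.to_crs("EPSG:4326").geometry.iloc[0])  # POINT (15 42), approximately
```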
@@ -740,8 +812,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
 
  def clean_geodataframe(
      gdf: gpd.GeoDataFrame,
-     remove_nulls: bool = True,
-     fix_invalid: bool = True,
+     remove_nulls: bool = False,
+     repair_geometries: bool = False,
      logger: logging.Logger = None,
  ) -> gpd.GeoDataFrame:
      """
@@ -752,9 +824,11 @@ def clean_geodataframe(
      gdf : gpd.GeoDataFrame
          Input GeoDataFrame
      remove_nulls : bool
-         Remove null geometries
-     fix_invalid : bool
-         Fix invalid geometries
+         Remove null geometries. Defaults to False to preserve data integrity.
+         Set to True only if you explicitly want to drop rows with null geometries.
+     repair_geometries : bool
+         Repair invalid geometries using Shapely's make_valid(). Defaults to False
+         to preserve original geometries. Set to True to repair invalid geometries
+         automatically.
      logger : logging.Logger, optional
          Logger for output
@@ -763,7 +837,7 @@ def clean_geodataframe(
      gpd.GeoDataFrame
          Cleaned GeoDataFrame
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      if remove_nulls:
          null_count = gdf.geometry.isna().sum()
@@ -771,11 +845,11 @@ def clean_geodataframe(
              logger.warning(f"Removing {null_count} null geometries")
              gdf = gdf[~gdf.geometry.isna()].copy()
 
-     if fix_invalid:
+     if repair_geometries:
          valid_count = gdf.geometry.is_valid.sum()
          invalid_count = len(gdf) - valid_count
          if invalid_count > 0:
-             logger.warning(f"Fixing {invalid_count} invalid geometries")
+             logger.warning(f"Repairing {invalid_count} invalid geometries")
              from shapely.validation import make_valid
 
              gdf = gdf.copy()
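A sketch of the new opt-in behaviour (assuming `clean_geodataframe` is imported from this module; the self-intersecting "bowtie" is a stock example of an invalid geometry):

```python
import geopandas as gpd
from shapely.geometry import Polygon

bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])  # invalid: self-intersecting
gdf = gpd.GeoDataFrame(geometry=[bowtie, None], crs="EPSG:4326")

untouched = clean_geodataframe(gdf)  # new defaults: data passes through as-is
print(len(untouched))  # 2 (nothing removed, nothing repaired)

repaired = clean_geodataframe(gdf, remove_nulls=True, repair_geometries=True)
print(len(repaired), repaired.geometry.is_valid.all())  # 1 True
```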
@@ -787,6 +861,19 @@ def clean_geodataframe(
      return gdf
 
 
+ # ============================================================================
+ # BATCH RETRY HELPER
+ # ============================================================================
+ 
+ 
+ # ============================================================================
+ # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
+ # ============================================================================
+ # Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
+ # approach: when a batch fails, reduce batch_size parameter and retry manually.
+ # This avoids semaphore deadlocks and provides clearer error messages.
+ 
+ 
  # ============================================================================
  # EE PROCESSING WITH RETRY LOGIC
  # ============================================================================
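The fail-fast approach moves retry logic to the caller. A hypothetical caller-side wrapper (the name `run_with_fallback` and the exact keyword arguments are assumptions based on the signatures visible elsewhere in this diff):

```python
def run_with_fallback(filepath, batch_size=10, min_batch_size=1):
    # Halve batch_size after each failed run instead of sub-batching in-library.
    while True:
        try:
            return whisp_stats_geojson_to_df_concurrent(
                input_geojson_filepath=filepath, batch_size=batch_size
            )
        except RuntimeError as exc:
            if batch_size <= min_batch_size:
                raise  # already at the smallest batch size; give up
            batch_size = max(min_batch_size, batch_size // 2)
            print(f"Batch failure; retrying with batch_size={batch_size} ({exc})")
```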
@@ -828,7 +915,7 @@ def process_ee_batch(
      RuntimeError
          If processing fails after all retries
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      for attempt in range(max_retries):
          try:
@@ -955,7 +1042,7 @@ def whisp_stats_geojson_to_df_concurrent(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
      _suppress_verbose_output(max_concurrent=max_concurrent)
@@ -973,11 +1060,23 @@ def whisp_stats_geojson_to_df_concurrent(
      logger.info(f"Loaded {len(gdf):,} features")
 
      if validate_geometries:
-         gdf = clean_geodataframe(gdf, logger=logger)
+         gdf = clean_geodataframe(
+             gdf, remove_nulls=False, repair_geometries=False, logger=logger
+         )
 
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)
 
+     # Strip unnecessary properties before sending to EE
+     # Keep only: geometry, plot_id_column, and external_id_column
+     # This prevents duplication of GeoJSON properties in EE results
+     keep_cols = ["geometry", plot_id_column]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+ 
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+ 
      # Create image if not provided
      if whisp_image is None:
          logger.debug("Creating Whisp image...")
@@ -1001,8 +1100,8 @@ def whisp_stats_geojson_to_df_concurrent(
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
      # Batch the data
-     batches = batch_geodataframe(gdf, batch_size)
-     logger.info(f"Processing {len(gdf):,} features in {len(batches)} batches")
+     batches = batch_geodataframe(gdf_for_ee, batch_size)
+     logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
 
      # Setup semaphore for EE concurrency control
      ee_semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -1056,7 +1155,12 @@ def whisp_stats_geojson_to_df_concurrent(
              for i, batch in enumerate(batches)
          }
 
+         # Track which batches failed for retry
+         batch_map = {i: batch for i, batch in enumerate(batches)}
+         batch_futures = {future: i for future, i in futures.items()}
+ 
          for future in as_completed(futures):
+             batch_idx = batch_futures[future]
              try:
                  batch_idx, df_server, df_client = future.result()
@@ -1064,8 +1168,35 @@ def whisp_stats_geojson_to_df_concurrent(
                  if plot_id_column not in df_server.columns:
                      df_server[plot_id_column] = range(len(df_server))
 
-                 merged = df_server.merge(
-                     df_client,
+                 # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                 # These are the actual EE processing results
+                 df_server_clean = df_server.copy()
+ 
+                 # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                 # (formatted wrapper handles keep_external_columns parameter)
+                 keep_external_columns = [plot_id_column]
+                 if (
+                     external_id_column
+                     and external_id_column in df_client.columns
+                 ):
+                     keep_external_columns.append(external_id_column)
+                 if "geometry" in df_client.columns:
+                     keep_external_columns.append("geometry")
+                 # Keep geometry type column (Geometry_type)
+                 if geometry_type_column in df_client.columns:
+                     keep_external_columns.append(geometry_type_column)
+                 # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                 centroid_cols = [
+                     c for c in df_client.columns if c.startswith("Centroid_")
+                 ]
+                 keep_external_columns.extend(centroid_cols)
+ 
+                 df_client_clean = df_client[
+                     [c for c in keep_external_columns if c in df_client.columns]
+                 ].drop_duplicates()
+ 
+                 merged = df_server_clean.merge(
+                     df_client_clean,
                      on=plot_id_column,
                      how="left",
                      suffixes=("_ee", "_client"),
@@ -1074,12 +1205,16 @@ def whisp_stats_geojson_to_df_concurrent(
                  progress.update()
 
              except Exception as e:
+                 # Batch failed - fail fast with clear guidance
                  error_msg = str(e)
-                 logger.error(f"Batch processing error: {error_msg[:100]}")
-                 import traceback
+                 logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                 logger.debug(f"Full error: {error_msg}")
+ 
+                 # Get original batch for error reporting
+                 original_batch = batch_map[batch_idx]
 
-                 logger.debug(traceback.format_exc())
-                 batch_errors.append(error_msg)
+                 # Add to batch errors for final reporting
+                 batch_errors.append((batch_idx, original_batch, error_msg))
              finally:
                  # Restore logger levels
                  fiona_logger.setLevel(old_fiona_level)
@@ -1087,8 +1222,60 @@ def whisp_stats_geojson_to_df_concurrent(
 
      progress.finish()
 
-     # Check if we should retry with validation due to band errors
-     if batch_errors and not results:
+     # If we have batch errors after retry attempts, fail the entire process
+     if batch_errors:
+         total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
+         failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
+ 
+         # Format detailed error information for debugging
+         error_details_list = []
+         for idx, batch, msg in batch_errors:
+             error_details_list.append(f" Batch {idx} ({len(batch)} features): {msg}")
+         error_details = "\n".join(error_details_list)
+ 
+         # Analyze error patterns for debugging hints
+         error_patterns = {
+             "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
+             "request_size": any(
+                 keyword in msg.lower()
+                 for _, _, msg in batch_errors
+                 for keyword in ["too large", "10mb", "payload", "size limit"]
+             ),
+             "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
+             "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
+         }
+ 
+         # Build helpful suggestions based on error patterns
+         suggestions = []
+         if error_patterns["memory"]:
+             suggestions.append(
+                 f" • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
+             )
+         if error_patterns["request_size"]:
+             suggestions.append(
+                 " • Request payload too large: reduce batch_size or simplify input geometries"
+             )
+         if error_patterns["quota"]:
+             suggestions.append(" • Earth Engine quota exceeded: wait and retry later")
+         if error_patterns["timeout"]:
+             suggestions.append(
+                 " • Processing timeout: reduce batch_size or simplify input geometries"
+             )
+ 
+         suggestions_text = (
+             "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
+         )
+ 
+         raise RuntimeError(
+             f"Failed to process {len(batch_errors)} batch(es):\n"
+             f"\n{error_details}\n"
+             f"\nTotal rows affected: {total_failed_rows}\n"
+             f"{suggestions_text}\n"
+             f"Please reduce batch_size and try again."
+         )
+ 
+     # Check if we should retry with validation due to band errors (legacy band error handling)
+     if not results:
          # All batches failed - likely a bad band issue
          is_band_error = any(
              keyword in str(batch_errors)
@@ -1442,7 +1629,7 @@ def whisp_stats_geojson_to_df_sequential(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (sequential has lower concurrency, use default)
      _suppress_verbose_output(max_concurrent=1)
@@ -1459,8 +1646,10 @@ def whisp_stats_geojson_to_df_sequential(
      gdf = _load_geojson_silently(input_geojson_filepath)
      logger.info(f"Loaded {len(gdf):,} features")
 
-     # Clean geometries
-     gdf = clean_geodataframe(gdf, logger=logger)
+     # Clean geometries (preserve both null and invalid geometries by default)
+     gdf = clean_geodataframe(
+         gdf, remove_nulls=False, repair_geometries=False, logger=logger
+     )
 
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1469,6 +1658,16 @@ def whisp_stats_geojson_to_df_sequential(
      row_id_col = "__row_id__"
      gdf[row_id_col] = range(len(gdf))
 
+     # Strip unnecessary properties before sending to EE
+     # Keep only: geometry, plot_id_column, and external_id_column
+     # This prevents duplication of GeoJSON properties in EE results
+     keep_cols = ["geometry", plot_id_column, row_id_col]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+ 
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+ 
 
      # Create image if not provided
      if whisp_image is None:
@@ -1491,7 +1690,7 @@ def whisp_stats_geojson_to_df_sequential(
      # Convert to EE (suppress print statements from convert_geojson_to_ee)
      logger.debug("Converting to EE FeatureCollection...")
      with redirect_stdout(io.StringIO()):
-         fc = convert_geojson_to_ee(input_geojson_filepath)
+         fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
      # Create reducer
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1633,6 +1832,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON concurrently with automatic formatting and validation.
@@ -1683,15 +1883,22 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     geometry_audit_trail : bool, default False
+         If True, includes original input geometry column:
+         - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+         Enables geometry traceability for compliance and audit purposes.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
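Hypothetical usage of the new flag (the file name is a placeholder; `geo_original` and the `attrs` metadata are the fields documented above):

```python
import json
from shapely.geometry import shape

df = whisp_formatted_stats_geojson_to_df_concurrent(
    "plots.geojson",
    geometry_audit_trail=True,
)

# geo_original stores each input geometry as a GeoJSON string.
original_geom = shape(json.loads(df["geo_original"].iloc[0]))
print(df.attrs["processing_metadata"]["processing_mode"])  # "concurrent"
```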
@@ -1699,6 +1906,12 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          decimal_places = _extract_decimal_places(stats_area_columns_formatting)
          logger.debug(f"Using decimal_places={decimal_places} from config")
 
+     # Load original geometries once here if needed for audit trail (avoid reloading later)
+     gdf_original_geoms = None
+     if geometry_audit_trail:
+         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+         gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (concurrent)...")
      df_raw = whisp_stats_geojson_to_df_concurrent(
@@ -1759,6 +1972,57 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+     if geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+             if gdf_original_geoms is None:
+                 logger.warning("Original geometries not pre-loaded, loading now...")
+                 gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
+             # Use plotId from df_validated to maintain mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                     "geo_original": gdf_original_geoms["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                 }
+             )
+ 
+             # Merge original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+ 
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "3.0.0a1",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "concurrent",
+                 "ee_endpoint": "high_volume",
+                 "validate_geometries": validate_geometries,
+                 "datasets_used": national_codes or [],
+                 "geometry_audit_trail": True,
+             }
+ 
+             logger.info(f"Audit trail added: geo_original column")
+ 
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without audit trail if something fails
+ 
+     # Add processing metadata column using pd.concat to avoid fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+ 
      logger.info("Concurrent processing + formatting + validation complete")
      return df_validated
@@ -1779,6 +2043,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON sequentially with automatic formatting and validation.
@@ -1821,15 +2086,22 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     geometry_audit_trail : bool, default False
+         If True, includes original input geometry column:
+         - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+         Enables geometry traceability for compliance and audit purposes.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
@@ -1837,6 +2109,12 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          decimal_places = _extract_decimal_places(stats_area_columns_formatting)
          logger.debug(f"Using decimal_places={decimal_places} from config")
 
+     # Load original geometries once here if needed for audit trail (avoid reloading later)
+     gdf_original_geoms = None
+     if geometry_audit_trail:
+         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+         gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (sequential)...")
      df_raw = whisp_stats_geojson_to_df_sequential(
@@ -1893,6 +2171,56 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+     if geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+             if gdf_original_geoms is None:
+                 logger.warning("Original geometries not pre-loaded, loading now...")
+                 gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
+             # Use plotId from df_validated to maintain mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                     "geo_original": gdf_original_geoms["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                 }
+             )
+ 
+             # Merge original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+ 
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "3.0.0a1",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "sequential",
+                 "ee_endpoint": "standard",
+                 "datasets_used": national_codes or [],
+                 "geometry_audit_trail": True,
+             }
+ 
+             logger.info(f"Audit trail added: geo_original column")
+ 
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without audit trail if something fails
+ 
+     # Add processing metadata column using pd.concat to avoid fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+ 
      logger.info("Sequential processing + formatting + validation complete")
      return df_validated
@@ -1910,7 +2238,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
      unit_type: str = "ha",
      whisp_image: ee.Image = None,
      custom_bands: Dict[str, Any] = None,
-     mode: str = "auto",
+     mode: str = "sequential",
      # Concurrent-specific parameters
      batch_size: int = 10,
      max_concurrent: int = 20,
@@ -1923,14 +2251,15 @@ def whisp_formatted_stats_geojson_to_df_fast(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON to Whisp statistics with optimized fast processing.
 
-     Automatically selects between concurrent (high-volume endpoint) and sequential
-     (standard endpoint) based on file size, or allows explicit mode selection.
+     Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
+     based on explicit mode selection.
 
-     This is the recommended entry point for most users who want automatic optimization.
+     This is the recommended entry point for most users.
 
      Parameters
      ----------
@@ -1950,12 +2279,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
          Custom band information
      mode : str
          Processing mode:
-         - "auto": Choose based on file size (default)
-             * <1MB: sequential
-             * 1-5MB: sequential
-             * >5MB: concurrent
-         - "concurrent": Force high-volume endpoint (batch processing)
-         - "sequential": Force standard endpoint (single-threaded)
+         - "concurrent": Uses high-volume endpoint with batch processing
+         - "sequential": Uses standard endpoint for sequential processing (default)
      batch_size : int
          Features per batch (only for concurrent mode)
      max_concurrent : int
@@ -1976,6 +2301,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
          Water flag ratio threshold
      sort_column : str
          Column to sort by
+     geometry_audit_trail : bool
+         Include geometry modification audit trail columns
 
      Returns
      -------
@@ -1984,52 +2311,30 @@ def whisp_formatted_stats_geojson_to_df_fast(
 
      Examples
-     >>> # Auto-detect best method based on file size
-     >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
-
-     >>> # Force concurrent processing for large datasets
+     >>> # Use concurrent processing (recommended for most datasets)
      >>> df = whisp_formatted_stats_geojson_to_df_fast(
-     ...     "large_data.geojson",
+     ...     "data.geojson",
      ...     mode="concurrent"
      ... )
 
-     >>> # Use sequential for guaranteed completion
+     >>> # Use sequential processing for more stable results
      >>> df = whisp_formatted_stats_geojson_to_df_fast(
      ...     "data.geojson",
      ...     mode="sequential"
      ... )
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
-     # Determine processing mode
-     if mode == "auto":
-         try:
-             file_size = Path(input_geojson_filepath).stat().st_size
-             if file_size > 5_000_000:  # >5MB
-                 chosen_mode = "concurrent"
-                 logger.info(
-                     f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
-                 )
-             else:  # <=5MB
-                 chosen_mode = "sequential"
-                 logger.info(
-                     f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
-                 )
-         except Exception as e:
-             logger.warning(
-                 f"Could not determine file size: {e}. Defaulting to sequential."
-             )
-             chosen_mode = "sequential"
-     elif mode in ("concurrent", "sequential"):
-         chosen_mode = mode
-         logger.info(f"Mode explicitly set to: {mode}")
-     else:
+     # Validate mode parameter
+     if mode not in ("concurrent", "sequential"):
          raise ValueError(
-             f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
+             f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
          )
 
+     logger.info(f"Mode: {mode}")
+ 
      # Route to appropriate function
-     if chosen_mode == "concurrent":
+     if mode == "concurrent":
          logger.debug("Routing to concurrent processing...")
          return whisp_formatted_stats_geojson_to_df_concurrent(
              input_geojson_filepath=input_geojson_filepath,
@@ -2050,6 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             geometry_audit_trail=geometry_audit_trail,
          )
      else:  # sequential
          logger.debug("Routing to sequential processing...")
@@ -2067,4 +2373,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             geometry_audit_trail=geometry_audit_trail,
          )
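Migration note for existing callers: mode="auto" was removed and the default changed to "sequential", so pick a mode explicitly; a minimal sketch (the file name is a placeholder):

```python
# mode="auto" now raises ValueError; choose the endpoint explicitly.
df = whisp_formatted_stats_geojson_to_df_fast(
    "data.geojson",
    mode="concurrent",  # or "sequential" (the new default)
    batch_size=10,
    max_concurrent=20,
)
```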