PyPI - openforis-whisp - Versions diffs - 3.0.0a4__tar.gz → 3.0.0a5__tar.gz - Mend

openforis-whisp 3.0.0a4 (tar.gz) → 3.0.0a5 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: openforis-whisp
-Version: 3.0.0a4
+Version: 3.0.0a5
 Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
 License: MIT
 Keywords: whisp,geospatial,data-processing

{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry]
 name = "openforis-whisp"
-version = "3.0.0a4"
+version = "3.0.0a5"
 description = "Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations."
 repository = "https://github.com/forestdatapartnership/whisp"
 authors = ["Andy Arnell <andrew.arnell@fao.org>"]

{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/advanced_stats.py RENAMED Viewed

@@ -181,8 +181,25 @@ def _suppress_verbose_output(max_concurrent: int = None):
     reformat_logger.setLevel(logging.ERROR)
-def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
-    """Load GeoJSON file with all output suppressed."""
+def _load_and_prepare_geojson(
+    filepath: str, external_id_column: Optional[str] = None
+) -> gpd.GeoDataFrame:
+    """Load GeoJSON file and prepare for processing.
+    Suppresses logging output and optionally renames external_id column.
+    Parameters
+    ----------
+    filepath : str
+        Path to GeoJSON file
+    external_id_column : str, optional
+        If provided, rename this column to 'external_id' immediately after loading
+    Returns
+    -------
+    gpd.GeoDataFrame
+        Loaded GeoDataFrame with external_id renamed if specified
+    """
     fiona_logger = logging.getLogger("fiona")
     pyogrio_logger = logging.getLogger("pyogrio._io")
     old_fiona_level = fiona_logger.level
@@ -193,6 +210,16 @@ def _load_geojson_silently(filepath: str) -> gpd.GeoDataFrame:
     try:
         with redirect_stdout(io.StringIO()):
             gdf = gpd.read_file(filepath)
+        # Rename external_id column early and convert to string
+        if external_id_column and external_id_column in gdf.columns:
+            if external_id_column != "external_id":
+                gdf = gdf.rename(
+                    columns={external_id_column: "external_id"}
+                )  # hard coding here to avoid confusion later
+            # Convert to string to ensure consistent type throughout pipeline
+            gdf["external_id"] = gdf["external_id"].astype(str)
         return gdf
     finally:
         fiona_logger.setLevel(old_fiona_level)
@@ -780,19 +807,17 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
     if not check_ee_endpoint(endpoint_type):
         if endpoint_type == "high-volume":
             msg = (
-                "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                "# Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
                 "ee.Reset()\n"
-                "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
-                "Or with project specified (e.g. when in Colab):\n"
-                "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+                "ee.Initialize(project=gee_project_name, opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                "# where gee_project_name is your GEE project (necessary in Colab)"
             )
         else:  # standard endpoint
             msg = (
                 "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
                 "ee.Reset()\n"
-                "ee.Initialize()\n"
-                "Or with project specified (e.g. when in Colab):\n"
-                "ee.Initialize(project='your_cloud_project_name')"
+                "ee.Initialize(project=gee_project_name)\n"
+                "# where gee_project_name is your GEE project (necessary in Colab)"
             )
         if raise_error:
@@ -865,13 +890,13 @@ def extract_centroid_and_geomtype_client(
         if plot_id_column in gdf.columns:
             cols.append(plot_id_column)
-        # Include external_id_column if provided and exists
+        # Include external_id if it exists (already renamed during load)
         if (
             external_id_column
-            and external_id_column in gdf.columns
-            and external_id_column not in cols
+            and "external_id" in gdf.columns
+            and "external_id" not in cols
         ):
-            cols.append(external_id_column)
+            cols.append("external_id")
         # Always include metadata columns (centroid, geometry type)
         cols.extend([x_col, y_col, type_col])
@@ -965,6 +990,10 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
     Preserves the __row_id__ column if present so it can be retrieved after processing.
+    IMPORTANT: Drops external_id column before sending to EE to enable query caching.
+    external_id is user metadata that's not needed for EE computation. Including it
+    breaks EE's caching mechanism since each unique external_id creates a different query.
     Parameters
     ----------
     batch_gdf : gpd.GeoDataFrame
@@ -973,15 +1002,21 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
     Returns
     -------
     ee.FeatureCollection
-        EE FeatureCollection with __row_id__ as a feature property
+        EE FeatureCollection with __row_id__ as a feature property (no external_id)
     """
+    # Drop external_id before sending to EE to enable caching
+    # (external_id is preserved separately on client side for merging)
+    batch_for_ee = batch_gdf.copy()
+    if "external_id" in batch_for_ee.columns:
+        batch_for_ee = batch_for_ee.drop(columns=["external_id"])
     # Pass GeoDataFrame directly to preserve CRS metadata
     # convert_geojson_to_ee will handle:
     # - CRS detection and conversion to WGS84 if needed
     # - Data type sanitization (datetime, object columns)
     # - Geometry validation and Z-coordinate stripping
-    fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
+    fc = convert_geojson_to_ee(batch_for_ee, enforce_wgs84=True, strip_z_coords=True)
     # If __row_id__ is in the original GeoDataFrame, it will be preserved
     # as a feature property in the GeoJSON and thus in the EE FeatureCollection
@@ -1107,7 +1142,19 @@ def process_ee_batch(
             # Ensure plot_id_column is present for merging
             # It should come from the feature properties (added before EE processing)
             if plot_id_column not in df.columns:
-                df[plot_id_column] = range(len(df))
+                logger.warning(
+                    f"Batch {batch_idx + 1}: plotId column DROPPED by EE. "
+                    f"Regenerating with 1-indexed range. "
+                    f"Columns from EE: {list(df.columns)}"
+                )
+                # Use 1-indexed range to match client-side assignment
+                df[plot_id_column] = range(1, len(df) + 1)
+            # Ensure plotId is integer type (EE may return as string)
+            if plot_id_column in df.columns:
+                df[plot_id_column] = pd.to_numeric(
+                    df[plot_id_column], errors="coerce"
+                ).astype("Int64")
             # Ensure all column names are strings (fixes pandas .str accessor issues)
             df.columns = df.columns.astype(str)
@@ -1231,12 +1278,15 @@ def whisp_stats_geojson_to_df_concurrent(
     # Validate endpoint
     validate_ee_endpoint("high-volume", raise_error=True)
-    # Load GeoJSON with output suppressed
-    gdf = _load_geojson_silently(input_geojson_filepath)
+    # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+    gdf = _load_and_prepare_geojson(
+        input_geojson_filepath, external_id_column=external_id_column
+    )
     logger.info(f"Loaded {len(gdf):,} features")
-    # Validate external_id_column if provided (lightweight client-side check)
-    if external_id_column and external_id_column not in gdf.columns:
+    # Validate external_id if provided (lightweight client-side check)
+    # Note: external_id_column already renamed to 'external_id' during load
+    if external_id_column and "external_id" not in gdf.columns:
         # Exclude geometry column from available columns list
         available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
         raise ValueError(
@@ -1244,13 +1294,13 @@ def whisp_stats_geojson_to_df_concurrent(
             f"Available columns: {available_cols}"
         )
-    # Check completeness of external_id_column (warn if nulls exist)
-    if external_id_column and external_id_column in gdf.columns:
-        null_count = gdf[external_id_column].isna().sum()
+    # Check completeness of external_id (warn if nulls exist)
+    if external_id_column and "external_id" in gdf.columns:
+        null_count = gdf["external_id"].isna().sum()
         if null_count > 0:
             null_pct = (null_count / len(gdf)) * 100
             logger.warning(
-                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
                 f"These features may have missing external IDs in output."
             )
@@ -1263,13 +1313,21 @@ def whisp_stats_geojson_to_df_concurrent(
     gdf[plot_id_column] = range(1, len(gdf) + 1)
     # Strip unnecessary properties before sending to EE
-    # Keep only: geometry, plot_id_column, and external_id_column
+    # Keep only: geometry, plot_id_column, and external_id
     # This prevents duplication of GeoJSON properties in EE results
     keep_cols = ["geometry", plot_id_column]
-    if external_id_column and external_id_column in gdf.columns:
-        keep_cols.append(external_id_column)
+    if (
+        external_id_column and "external_id" in gdf.columns
+    ):  # Already renamed during load
+        keep_cols.append("external_id")
     gdf_for_ee = gdf[keep_cols].copy()
+    # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
+    if external_id_column and "external_id" in gdf_for_ee.columns:
+        gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+        logger.debug(f"Converted external_id column to string type")
     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
     # Create image if not provided
@@ -1366,17 +1424,37 @@ def whisp_stats_geojson_to_df_concurrent(
                     # Merge server and client results
                     if plot_id_column not in df_server.columns:
-                        df_server[plot_id_column] = range(len(df_server))
+                        logger.warning(
+                            f"Batch {batch_idx + 1} (concurrent merge): plotId DROPPED by EE. "
+                            f"Regenerating. Columns from EE: {list(df_server.columns)}"
+                        )
+                        df_server[plot_id_column] = pd.array(
+                            range(1, len(df_server) + 1), dtype="Int64"
+                        )
+                    else:
+                        df_server[plot_id_column] = pd.to_numeric(
+                            df_server[plot_id_column], errors="coerce"
+                        ).astype("Int64")
+                    # Ensure plotId is Int64 in client data too
+                    if plot_id_column in df_client.columns:
+                        df_client[plot_id_column] = pd.to_numeric(
+                            df_client[plot_id_column], errors="coerce"
+                        ).astype("Int64")
                     # Keep all EE statistics from server (all columns with _sum and _median suffixes)
                     # These are the actual EE processing results
                     df_server_clean = df_server.copy()
+                    # Drop external_id from df_server if it exists (already in df_client)
+                    if "external_id" in df_server_clean.columns:
+                        df_server_clean = df_server_clean.drop(columns=["external_id"])
                     # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
                     # (formatted wrapper handles keep_external_columns parameter)
                     keep_external_columns = [plot_id_column]
-                    if external_id_column and external_id_column in df_client.columns:
-                        keep_external_columns.append(external_id_column)
+                    if external_id_column and "external_id" in df_client.columns:
+                        keep_external_columns.append("external_id")
                     if "geometry" in df_client.columns:
                         keep_external_columns.append("geometry")
                     # Keep geometry type column (Geometry_type)
@@ -1522,7 +1600,10 @@ def whisp_stats_geojson_to_df_concurrent(
                             try:
                                 batch_idx, df_server, df_client = future.result()
                                 if plot_id_column not in df_server.columns:
-                                    df_server[plot_id_column] = range(len(df_server))
+                                    # Use 1-indexed range to match client-side assignment
+                                    df_server[plot_id_column] = range(
+                                        1, len(df_server) + 1
+                                    )
                                 merged = df_server.merge(
                                     df_client,
                                     on=plot_id_column,
@@ -1566,31 +1647,21 @@ def whisp_stats_geojson_to_df_concurrent(
         else:
             return pd.DataFrame()
-        # Clean up duplicate external_id columns created by merges
-        # Rename external_id_column to standardized 'external_id' for schema validation
-        if external_id_column:
-            # Find all columns related to external_id
-            external_id_variants = [
+        # Clean up duplicate external_id columns created by merges (if any exist)
+        # external_id was already renamed during load, so we just need to handle duplicates
+        if external_id_column and "external_id" in combined.columns:
+            # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+            duplicate_variants = [
                 col
                 for col in combined.columns
-                if external_id_column.lower() in col.lower()
+                if col != "external_id" and col.startswith("external_id_")
             ]
-            if external_id_variants:
-                # Use the base column name if it exists, otherwise use first variant
-                base_col = (
-                    external_id_column
-                    if external_id_column in combined.columns
-                    else external_id_variants[0]
+            if duplicate_variants:
+                logger.debug(
+                    f"Dropping duplicate external_id columns: {duplicate_variants}"
                 )
-                # Rename to standardized 'external_id'
-                if base_col != "external_id":
-                    combined = combined.rename(columns={base_col: "external_id"})
-                # Drop all other variants
-                cols_to_drop = [c for c in external_id_variants if c != base_col]
-                combined = combined.drop(columns=cols_to_drop, errors="ignore")
+                combined = combined.drop(columns=duplicate_variants, errors="ignore")
         # plotId column is already present from batch processing
         # Just ensure it's at position 0
@@ -1673,14 +1744,26 @@ def whisp_stats_geojson_to_df_concurrent(
                         try:
                             batch_idx, df_server, df_client = future.result()
                             if plot_id_column not in df_server.columns:
-                                df_server[plot_id_column] = range(len(df_server))
-                            # Drop external_id_column from df_client if it exists (already in df_server)
-                            if (
-                                external_id_column
-                                and external_id_column in df_client.columns
-                            ):
-                                df_client = df_client.drop(columns=[external_id_column])
+                                logger.warning(
+                                    f"Batch {batch_idx + 1} (retry): plotId DROPPED by EE. "
+                                    f"Regenerating. Columns from EE: {list(df_server.columns)}"
+                                )
+                                # Use 1-indexed range to match client-side assignment
+                                df_server[plot_id_column] = range(1, len(df_server) + 1)
+                            # Ensure plotId is integer type (EE may return as string)
+                            if plot_id_column in df_server.columns:
+                                df_server[plot_id_column] = pd.to_numeric(
+                                    df_server[plot_id_column], errors="coerce"
+                                ).astype("Int64")
+                            if plot_id_column in df_client.columns:
+                                df_client[plot_id_column] = pd.to_numeric(
+                                    df_client[plot_id_column], errors="coerce"
+                                ).astype("Int64")
+                            # Drop external_id from df_server if it exists (already in df_client)
+                            if "external_id" in df_server.columns:
+                                df_server = df_server.drop(columns=["external_id"])
                             merged = df_server.merge(
                                 df_client,
@@ -1702,30 +1785,22 @@ def whisp_stats_geojson_to_df_concurrent(
                     # Ensure all column names are strings (fixes pandas .str accessor issues later)
                     combined.columns = combined.columns.astype(str)
-                    # Clean up duplicate external_id columns created by merges
-                    if external_id_column:
-                        external_id_variants = [
+                    # Clean up duplicate external_id columns created by merges (if any exist)
+                    # external_id was already renamed during load, so we just need to handle duplicates
+                    if external_id_column and "external_id" in combined.columns:
+                        # Find merge duplicates like external_id_x, external_id_y, external_id_ee, external_id_client
+                        duplicate_variants = [
                             col
                             for col in combined.columns
-                            if external_id_column.lower() in col.lower()
+                            if col != "external_id" and col.startswith("external_id_")
                         ]
-                        if external_id_variants:
-                            base_col = external_id_column
-                            if (
-                                base_col not in combined.columns
-                                and external_id_variants
-                            ):
-                                base_col = external_id_variants[0]
-                                combined = combined.rename(
-                                    columns={base_col: "external_id"}
-                                )
-                            cols_to_drop = [
-                                c for c in external_id_variants if c != base_col
-                            ]
+                        if duplicate_variants:
+                            logger.debug(
+                                f"Dropping duplicate external_id columns: {duplicate_variants}"
+                            )
                             combined = combined.drop(
-                                columns=cols_to_drop, errors="ignore"
+                                columns=duplicate_variants, errors="ignore"
                             )
                     # plotId column is already present, just ensure it's at position 0
@@ -1769,6 +1844,14 @@ def whisp_stats_geojson_to_df_concurrent(
                 )
                 raise retry_e
+        # Ensure plot_id is present (should already be there from batch processing)
+        if plot_id_column not in formatted.columns:
+            logger.warning(f"{plot_id_column} column missing, regenerating...")
+            formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+        # Sort by plot_id to ensure consistent output order
+        formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
         logger.info(f"Processing complete: {len(formatted):,} features")
         return formatted
     else:
@@ -1843,12 +1926,15 @@ def whisp_stats_geojson_to_df_sequential(
     # Validate endpoint
     validate_ee_endpoint("standard", raise_error=True)
-    # Load GeoJSON with output suppressed
-    gdf = _load_geojson_silently(input_geojson_filepath)
+    # Load GeoJSON with output suppressed (external_id_column renamed to 'external_id' if provided)
+    gdf = _load_and_prepare_geojson(
+        input_geojson_filepath, external_id_column=external_id_column
+    )
     logger.info(f"Loaded {len(gdf):,} features")
-    # Validate external_id_column if provided (lightweight client-side check)
-    if external_id_column and external_id_column not in gdf.columns:
+    # Validate external_id if provided (lightweight client-side check)
+    # Note: external_id_column already renamed to 'external_id' during load
+    if external_id_column and "external_id" not in gdf.columns:
         # Exclude geometry column from available columns list
         available_cols = [c for c in gdf.columns if c != gdf.geometry.name]
         raise ValueError(
@@ -1856,13 +1942,13 @@ def whisp_stats_geojson_to_df_sequential(
             f"Available columns: {available_cols}"
         )
-    # Check completeness of external_id_column (warn if nulls exist)
-    if external_id_column and external_id_column in gdf.columns:
-        null_count = gdf[external_id_column].isna().sum()
+    # Check completeness of external_id (warn if nulls exist)
+    if external_id_column and "external_id" in gdf.columns:
+        null_count = gdf["external_id"].isna().sum()
         if null_count > 0:
             null_pct = (null_count / len(gdf)) * 100
             logger.warning(
-                f"Column '{external_id_column}' has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
+                f"Column 'external_id' (from '{external_id_column}') has {null_count:,} null values ({null_pct:.1f}% of {len(gdf):,} features). "
                 f"These features may have missing external IDs in output."
             )
@@ -1874,18 +1960,22 @@ def whisp_stats_geojson_to_df_sequential(
     # Add stable plotIds for merging (starting from 1, not 0)
     gdf[plot_id_column] = range(1, len(gdf) + 1)
-    # Add stable row IDs
-    row_id_col = "__row_id__"
-    gdf[row_id_col] = range(len(gdf))
     # Strip unnecessary properties before sending to EE
-    # Keep only: geometry, plot_id_column, and external_id_column
+    # Keep only: geometry, plot_id_column, and external_id
     # This prevents duplication of GeoJSON properties in EE results
-    keep_cols = ["geometry", plot_id_column, row_id_col]
-    if external_id_column and external_id_column in gdf.columns:
-        keep_cols.append(external_id_column)
+    keep_cols = ["geometry", plot_id_column]
+    if (
+        external_id_column and "external_id" in gdf.columns
+    ):  # Already renamed during load
+        keep_cols.append("external_id")
     gdf_for_ee = gdf[keep_cols].copy()
+    # CRITICAL: Convert external_id to string to prevent EE from confusing it with integer plotId
+    if external_id_column and "external_id" in gdf_for_ee.columns:
+        gdf_for_ee["external_id"] = gdf_for_ee["external_id"].astype(str)
+        logger.debug(f"Converted external_id column to string type")
     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
     # Create image if not provided
@@ -1907,10 +1997,19 @@ def whisp_stats_geojson_to_df_sequential(
                     national_codes=national_codes, validate_bands=True
                 )
+    # Drop external_id before sending to EE to enable caching
+    # (external_id is preserved separately in gdf for client-side merging)
+    gdf_for_ee_clean = gdf_for_ee.copy()
+    if "external_id" in gdf_for_ee_clean.columns:
+        gdf_for_ee_clean = gdf_for_ee_clean.drop(columns=["external_id"])
+        logger.debug("Dropped external_id from data sent to EE (enables caching)")
     # Convert to EE (suppress print statements from convert_geojson_to_ee)
     logger.debug("Converting to EE FeatureCollection...")
     with redirect_stdout(io.StringIO()):
-        fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
+        fc = convert_geojson_to_ee(
+            gdf_for_ee_clean, enforce_wgs84=True, strip_z_coords=True
+        )
     # Create reducer
     reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1950,11 +2049,13 @@ def whisp_stats_geojson_to_df_sequential(
         else:
             raise
-    logger.debug("Server-side processing complete")
+    logger.info("Server-side processing complete")
-    # Add row_id if missing
-    if row_id_col not in df_server.columns:
-        df_server[row_id_col] = range(len(df_server))
+    # Ensure plotId is Int64 type for fast merges
+    if plot_id_column in df_server.columns:
+        df_server[plot_id_column] = pd.to_numeric(
+            df_server[plot_id_column], errors="coerce"
+        ).astype("Int64")
     # Add client-side metadata if requested
     if add_metadata_client_side:
@@ -1965,21 +2066,23 @@ def whisp_stats_geojson_to_df_sequential(
             return_attributes_only=True,
         )
-        # Drop external_id_column from df_client if it exists (already in df_server)
-        if external_id_column and external_id_column in df_client.columns:
-            df_client = df_client.drop(columns=[external_id_column])
+        # Ensure plotId is Int64 type for fast merges
+        if plot_id_column in df_client.columns:
+            df_client[plot_id_column] = pd.to_numeric(
+                df_client[plot_id_column], errors="coerce"
+            ).astype("Int64")
+        # Drop external_id from df_server if it exists (keep from df_client - more reliable)
+        if "external_id" in df_server.columns:
+            df_server = df_server.drop(columns=["external_id"])
-        # Merge
+        # Merge on plotId (same strategy as concurrent mode)
         result = df_server.merge(
-            df_client, on=row_id_col, how="left", suffixes=("", "_client")
+            df_client, on=plot_id_column, how="left", suffixes=("", "_client")
         )
     else:
         result = df_server
-    # Remove internal __row_id__ column if present
-    if row_id_col in result.columns:
-        result = result.drop(columns=[row_id_col])
     # Format the output
     # Add admin context (Country, ProducerCountry, Admin_Level_1) from admin_code
     # MUST be done BEFORE formatting (which removes _median columns)
@@ -2004,27 +2107,14 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
+    # Ensure plot_id exists and sort by it
+    if plot_id_column not in formatted.columns:
+        formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
+    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
     logger.info(f"Processing complete: {len(formatted):,} features")
-    # Consolidate external_id_column to standardized 'external_id'
-    if external_id_column:
-        variants = [
-            col
-            for col in formatted.columns
-            if external_id_column.lower() in col.lower()
-        ]
-        if variants:
-            base_col = (
-                external_id_column
-                if external_id_column in formatted.columns
-                else variants[0]
-            )
-            if base_col != "external_id":
-                formatted = formatted.rename(columns={base_col: "external_id"})
-            # Drop other variants
-            formatted = formatted.drop(
-                columns=[c for c in variants if c != base_col], errors="ignore"
-            )
+    # external_id_column already renamed to 'external_id' during load - no action needed here
     return formatted
@@ -2130,7 +2220,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
     gdf_original_geoms = None
     if geometry_audit_trail:
         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+        gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (concurrent)...")
@@ -2199,7 +2289,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
             # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
             if gdf_original_geoms is None:
                 logger.warning("Original geometries not pre-loaded, loading now...")
-                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+                gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(
@@ -2331,7 +2421,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
     gdf_original_geoms = None
     if geometry_audit_trail:
         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
-        gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+        gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
     # Step 1: Get raw stats
     logger.debug("Step 1/2: Extracting statistics (sequential)...")
@@ -2395,7 +2485,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
             # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
             if gdf_original_geoms is None:
                 logger.warning("Original geometries not pre-loaded, loading now...")
-                gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+                gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
             # Use plotId from df_validated to maintain mapping
             df_original_geom = pd.DataFrame(

{openforis_whisp-3.0.0a4 → openforis_whisp-3.0.0a5}/src/openforis_whisp/parameters/lookup_context_and_metadata.csv RENAMED Viewed

@@ -1,5 +1,5 @@
 name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude_from_output,col_type,is_nullable,is_required,corresponding_variable
-plotId,-10,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,0,plot_id_column
+plotId,-10,,context_and_metadata,context_and_metadata,NA,NA,0,int64,1,0,plot_id_column
 external_id,-9,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,0,external_id_column
 Area,-8,,context_and_metadata,context_and_metadata,NA,NA,0,float32,1,1,geometry_area_column
 Geometry_type,-7,,context_and_metadata,context_and_metadata,NA,NA,0,string,1,1,geometry_type_column