PyPI - openforis-whisp - Versions diffs - 2.0.0b2__py3-none-any.whl → 3.0.0a1__py3-none-any.whl - Mend

openforis-whisp 2.0.0b2py3-none-any.whl → 3.0.0a1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

openforis_whisp/__init__.py +35 -4
openforis_whisp/advanced_stats.py +2070 -0
openforis_whisp/data_checks.py +642 -0
openforis_whisp/data_conversion.py +86 -44
openforis_whisp/datasets.py +298 -225
openforis_whisp/logger.py +26 -0
openforis_whisp/parameters/__init__.py +0 -0
openforis_whisp/parameters/lookup_gaul1_admin.py +18663 -0
openforis_whisp/reformat.py +198 -2
openforis_whisp/stats.py +488 -68
{openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/METADATA +1 -1
openforis_whisp-3.0.0a1.dist-info/RECORD +20 -0
openforis_whisp-2.0.0b2.dist-info/RECORD +0 -16
{openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/LICENSE +0 -0
{openforis_whisp-2.0.0b2.dist-info → openforis_whisp-3.0.0a1.dist-info}/WHEEL +0 -0

openforis_whisp/stats.py CHANGED Viewed

@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from .datasets import combine_datasets
 import json
+import logging
 import country_converter as coco
 from openforis_whisp.parameters.config_runtime import (
     plot_id_column,
@@ -34,8 +35,57 @@ from .reformat import (
 # NB functions that include "formatted" in the name apply a schema for validation and reformatting of the output dataframe. The schema is created from lookup tables.
+# ============================================================================
+# PERFORMANCE OPTIMIZATION: Cache expensive Earth Engine datasets
+# ============================================================================
+# These images/collections are loaded once and reused across all features
+# to avoid repeated expensive operations. This saves 7-15 seconds per analysis.
-def whisp_formatted_stats_geojson_to_df(
+_WATER_FLAG_IMAGE = None
+_admin_boundaries_FC = None
+def get_water_flag_image():
+    """
+    Get cached water flag image.
+    OPTIMIZATION: Water flag image is created once and reused for all features.
+    This avoids recreating ocean/water datasets for every feature (previously
+    called in get_type_and_location for each feature).
+    Returns
+    -------
+    ee.Image
+        Cached water flag image
+    """
+    global _WATER_FLAG_IMAGE
+    if _WATER_FLAG_IMAGE is None:
+        _WATER_FLAG_IMAGE = water_flag_all_prep()
+    return _WATER_FLAG_IMAGE
+def get_admin_boundaries_fc():
+    """
+    Get cached GAUL 2024 L1 administrative boundary feature collection.
+    OPTIMIZATION: GAUL 2024 L1 collection is loaded once and reused for all features.
+    This avoids loading the large FeatureCollection for every feature (previously
+    called in get_admin_boundaries_info for each feature).
+    Returns
+    -------
+    ee.FeatureCollection
+        Cached GAUL 2024 L1 administrative boundary feature collection
+    """
+    global _admin_boundaries_FC
+    if _admin_boundaries_FC is None:
+        _admin_boundaries_FC = ee.FeatureCollection(
+            "projects/sat-io/open-datasets/FAO/GAUL/GAUL_2024_L1"
+        )
+    return _admin_boundaries_FC
+def whisp_formatted_stats_geojson_to_df_legacy(
     input_geojson_filepath: Path | str,
     external_id_column=None,
     remove_geom=False,
@@ -43,9 +93,15 @@ def whisp_formatted_stats_geojson_to_df(
     unit_type="ha",
     whisp_image=None,
     custom_bands=None,  # New parameter
+    validate_geometries: bool = False,
 ) -> pd.DataFrame:
     """
-        Main function for most users.
+        Legacy function for basic Whisp stats extraction.
+        DEPRECATED: This is the original implementation maintained for backward compatibility.
+        Use whisp_formatted_stats_geojson_to_df() for new code, which provides automatic
+        optimization, formatting, and schema validation.
         Converts a GeoJSON file to a pandas DataFrame containing Whisp stats for the input ROI.
         Output df is validated against a pandera schema (created on the fly from the two lookup CSVs).
@@ -79,13 +135,48 @@ def whisp_formatted_stats_geojson_to_df(
             - List of band names: ['Aa_test', 'elevation']
             - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
             - None: preserves all extra columns automatically
+        validate_geometries : bool, optional
+            Whether to validate and fix invalid geometries, by default False.
+            Set to True to automatically fix invalid/self-intersecting polygons.
     Returns
         -------
         df_stats : pd.DataFrame
             The DataFrame containing the Whisp stats for the input ROI.
     """
-    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
+    # Load GeoJSON and validate geometries if requested
+    if validate_geometries:
+        import json
+        import geopandas as gpd
+        from shapely.validation import make_valid
+        import logging as py_logging
+        logger = py_logging.getLogger("whisp-legacy")
+        # Load GeoJSON file
+        with open(input_geojson_filepath, "r") as f:
+            geojson_data = json.load(f)
+        # Convert to GeoDataFrame
+        gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
+        # Validate and fix invalid geometries
+        valid_count = gdf.geometry.is_valid.sum()
+        invalid_count = len(gdf) - valid_count
+        if invalid_count > 0:
+            logger.warning(f"Fixing {invalid_count} invalid geometries")
+            gdf["geometry"] = gdf["geometry"].apply(
+                lambda g: make_valid(g) if g and not g.is_valid else g
+            )
+        # Convert back to GeoJSON dict (stays in memory - no temp files!)
+        geojson_cleaned = json.loads(gdf.to_json())
+        # OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
+        feature_collection = convert_geojson_to_ee(geojson_cleaned)
+    else:
+        # Original path - no validation
+        feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
     return whisp_formatted_stats_ee_to_df(
         feature_collection,
@@ -98,6 +189,169 @@ def whisp_formatted_stats_geojson_to_df(
     )
+def whisp_formatted_stats_geojson_to_df(
+    input_geojson_filepath: Path | str,
+    external_id_column=None,
+    remove_geom=False,
+    national_codes=None,
+    unit_type="ha",
+    whisp_image=None,
+    custom_bands=None,
+    mode: str = "sequential",
+    batch_size: int = 10,
+    max_concurrent: int = 20,
+    validate_geometries: bool = False,
+) -> pd.DataFrame:
+    """
+    Main entry point for converting GeoJSON to Whisp statistics.
+    Routes to the appropriate processing mode with automatic formatting and validation.
+    Converts a GeoJSON file to a pandas DataFrame containing Whisp stats for the input ROI.
+    Output DataFrame is validated against a Pandera schema (created from lookup CSVs).
+    Results are automatically formatted and unit-converted (ha or percent).
+    If `external_id_column` is provided, it will be used to link external identifiers
+    from the input GeoJSON to the output DataFrame.
+    Parameters
+    ----------
+    input_geojson_filepath : Path | str
+        The filepath to the GeoJSON of the ROI to analyze.
+    external_id_column : str, optional
+        The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
+        This column must exist as a property in ALL features of the GeoJSON file.
+        Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
+    remove_geom : bool, default=False
+        If True, the geometry of the GeoJSON is removed from the output DataFrame.
+    national_codes : list, optional
+        List of ISO2 country codes to include national datasets.
+    unit_type: str, optional
+        Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
+    whisp_image : ee.Image, optional
+        Pre-combined multiband Earth Engine Image containing all Whisp datasets.
+        If provided, this image will be used instead of combining datasets based on national_codes.
+        If None, datasets will be combined automatically using national_codes parameter.
+    custom_bands : list or dict, optional
+        Custom band information for extra columns. Can be:
+        - List of band names: ['Aa_test', 'elevation']
+        - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
+        - None: preserves all extra columns automatically
+    mode : str, optional
+        Processing mode, by default "sequential":
+        - "concurrent": Uses high-volume endpoint with concurrent batching (recommended for large files)
+        - "sequential": Uses standard endpoint for sequential processing (more stable)
+        - "legacy": Uses original implementation (basic stats extraction only, no formatting)
+    batch_size : int, optional
+        Features per batch for concurrent/sequential modes, by default 10.
+        Only applicable for "concurrent" and "sequential" modes.
+    max_concurrent : int, optional
+        Maximum concurrent EE calls for concurrent mode, by default 20.
+        Only applicable for "concurrent" mode.
+    validate_geometries : bool, optional
+        Whether to validate and fix invalid geometries, by default False.
+        Set to True to automatically fix invalid/self-intersecting polygons.
+        For production workflows, it's recommended to use geometry validation and
+        cleaning tools BEFORE processing with this function.
+    Returns
+    -------
+    df_stats : pd.DataFrame
+        The DataFrame containing the Whisp stats for the input ROI,
+        automatically formatted and validated.
+    Examples
+    --------
+    >>> # Use the default processing mode ("sequential")
+    >>> df = whisp_formatted_stats_geojson_to_df("data.geojson")
+    >>> # Use sequential processing for more stable/predictable results
+    >>> df = whisp_formatted_stats_geojson_to_df(
+    ...     "data.geojson",
+    ...     mode="sequential"
+    ... )
+    >>> # Adjust concurrency parameters
+    >>> df = whisp_formatted_stats_geojson_to_df(
+    ...     "large_data.geojson",
+    ...     mode="concurrent",
+    ...     max_concurrent=30,
+    ...     batch_size=15
+    ... )
+    >>> # Use legacy mode for backward compatibility (basic extraction only)
+    >>> df = whisp_formatted_stats_geojson_to_df(
+    ...     "data.geojson",
+    ...     mode="legacy"
+    ... )
+    """
+    # Import here to avoid circular imports
+    try:
+        from openforis_whisp.advanced_stats import (
+            whisp_formatted_stats_geojson_to_df_fast,
+        )
+    except ImportError:
+        # Fallback to legacy if advanced_stats not available
+        mode = "legacy"
+    logger = logging.getLogger("whisp")
+    if mode == "legacy":
+        # Log info if batch_size or max_concurrent were passed but won't be used
+        if batch_size != 10 or max_concurrent != 20:
+            unused = []
+            if batch_size != 10:
+                unused.append(f"batch_size={batch_size}")
+            if max_concurrent != 20:
+                unused.append(f"max_concurrent={max_concurrent}")
+            logger.info(
+                f"Mode is 'legacy': {', '.join(unused)}\n"
+                "parameter(s) are not used in legacy mode."
+            )
+        # Use original implementation (basic stats extraction only)
+        return whisp_formatted_stats_geojson_to_df_legacy(
+            input_geojson_filepath=input_geojson_filepath,
+            external_id_column=external_id_column,
+            remove_geom=remove_geom,
+            national_codes=national_codes,
+            unit_type=unit_type,
+            whisp_image=whisp_image,
+            custom_bands=custom_bands,
+            validate_geometries=validate_geometries,
+        )
+    elif mode in ("concurrent", "sequential"):
+        # Log info if batch_size or max_concurrent are not used in sequential mode
+        if mode == "sequential":
+            unused = []
+            if batch_size != 10:
+                unused.append(f"batch_size={batch_size}")
+            if max_concurrent != 20:
+                unused.append(f"max_concurrent={max_concurrent}")
+            if unused:
+                logger.info(
+                    f"Mode is 'sequential': {', '.join(unused)}\n"
+                    "parameter(s) are not used in sequential (single-threaded) mode."
+                )
+        # Route to fast function with explicit mode (skip auto-detection)
+        return whisp_formatted_stats_geojson_to_df_fast(
+            input_geojson_filepath=input_geojson_filepath,
+            external_id_column=external_id_column,
+            remove_geom=remove_geom,
+            national_codes=national_codes,
+            unit_type=unit_type,
+            whisp_image=whisp_image,
+            custom_bands=custom_bands,
+            mode=mode,  # Pass mode directly (concurrent or sequential)
+            batch_size=batch_size,
+            max_concurrent=max_concurrent,
+            validate_geometries=validate_geometries,
+        )
+    else:
+        raise ValueError(
+            f"Invalid mode '{mode}'. Must be 'concurrent', 'sequential', or 'legacy'."
+        )
 def whisp_formatted_stats_geojson_to_geojson(
     input_geojson_filepath,
     output_geojson_filepath,
@@ -141,7 +395,8 @@ def whisp_formatted_stats_geojson_to_geojson(
     # Convert the df to GeoJSON
     convert_df_to_geojson(df, output_geojson_filepath, geo_column)
-    print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
+    # Suppress verbose output
+    # print(f"GeoJSON with Whisp stats saved to {output_geojson_filepath}")
 def whisp_formatted_stats_ee_to_geojson(
@@ -425,7 +680,9 @@ def whisp_stats_ee_to_ee(
     national_codes=None,
     unit_type="ha",
     keep_properties=None,
-    whisp_image=None,  # New parameter
+    whisp_image=None,
+    validate_external_id=True,
+    validate_bands=False,  # New parameter
 ):
     """
     Process a feature collection to get statistics for each feature.
@@ -442,19 +699,25 @@ def whisp_stats_ee_to_ee(
         whisp_image (ee.Image, optional): Pre-combined multiband Earth Engine Image containing
             all Whisp datasets. If provided, this image will be used instead of combining
             datasets based on national_codes.
+        validate_external_id (bool, optional): If True, validates that external_id_column exists
+            in all features (default: True). Set to False to skip validation and save 2-4 seconds.
+            Only disable if you're confident the column exists in all features.
     Returns:
         ee.FeatureCollection: The output feature collection with statistics.
     """
     if external_id_column is not None:
         try:
-            # Validate that the external_id_column exists in all features
-            validation_result = validate_external_id_column(
-                feature_collection, external_id_column
-            )
+            # OPTIMIZATION: Make validation optional to save 2-4 seconds
+            # Validation includes multiple .getInfo() calls which are slow
+            if validate_external_id:
+                # Validate that the external_id_column exists in all features
+                validation_result = validate_external_id_column(
+                    feature_collection, external_id_column
+                )
-            if not validation_result["is_valid"]:
-                raise ValueError(validation_result["error_message"])
+                if not validation_result["is_valid"]:
+                    raise ValueError(validation_result["error_message"])
             # First handle property selection, but preserve the external_id_column
             if keep_properties is not None:
@@ -506,19 +769,27 @@ def whisp_stats_ee_to_ee(
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,  # Pass through
+        validate_bands=validate_bands,
     )
     return add_id_to_feature_collection(dataset=fc, id_name=plot_id_column)
 def _keep_fc_properties(feature_collection, keep_properties):
+    """
+    Filter feature collection properties based on keep_properties parameter.
+    OPTIMIZATION: When keep_properties is True, we no longer call .getInfo()
+    to get property names. Instead, we simply return the collection as-is,
+    since True means "keep all properties". This saves 1-2 seconds.
+    """
     # If keep_properties is specified, select only those properties
     if keep_properties is None:
         feature_collection = feature_collection.select([])
     elif keep_properties == True:
-        # If keep_properties is true, select all properties
-        first_feature_props = feature_collection.first().propertyNames().getInfo()
-        feature_collection = feature_collection.select(first_feature_props)
+        # If keep_properties is true, keep all properties
+        # No need to call .select() or .getInfo() - just return as-is
+        pass
     elif isinstance(keep_properties, list):
         feature_collection = feature_collection.select(keep_properties)
     else:
@@ -534,7 +805,8 @@ def whisp_stats_ee_to_df(
     remove_geom=False,
     national_codes=None,
     unit_type="ha",
-    whisp_image=None,  # New parameter
+    whisp_image=None,
+    validate_bands=False,  # New parameter
 ) -> pd.DataFrame:
     """
     Convert a Google Earth Engine FeatureCollection to a pandas DataFrame and convert ISO3 to ISO2 country codes.
@@ -561,27 +833,52 @@ def whisp_stats_ee_to_df(
     """
     # First, do the whisp processing to get the EE feature collection with stats
     try:
-        stats_feature_collection = whisp_stats_ee_to_ee(
-            feature_collection,
-            external_id_column,
-            national_codes=national_codes,
-            unit_type=unit_type,
-            whisp_image=whisp_image,  # Pass through
-        )
-    except Exception as e:
-        print(f"An error occurred during Whisp stats processing: {e}")
-        raise e
+        try:
+            stats_feature_collection = whisp_stats_ee_to_ee(
+                feature_collection,
+                external_id_column,
+                national_codes=national_codes,
+                unit_type=unit_type,
+                whisp_image=whisp_image,  # Pass through
+                validate_bands=False,  # try without validation first
+            )
+        except Exception as e:
+            print(f"An error occurred during Whisp stats processing: {e}")
+            raise e
-    # Then, convert the EE feature collection to DataFrame
-    try:
-        df_stats = convert_ee_to_df(
-            ee_object=stats_feature_collection,
-            remove_geom=remove_geom,
-        )
-    except Exception as e:
-        print(f"An error occurred during the conversion from EE to DataFrame: {e}")
-        raise e
+        # Then, convert the EE feature collection to DataFrame
+        try:
+            df_stats = convert_ee_to_df(
+                ee_object=stats_feature_collection,
+                remove_geom=remove_geom,
+            )
+        except Exception as e:
+            print(f"An error occurred during the conversion from EE to DataFrame: {e}")
+            raise e
+    except:  # retry with validation of whisp input datasets
+        try:
+            stats_feature_collection = whisp_stats_ee_to_ee(
+                feature_collection,
+                external_id_column,
+                national_codes=national_codes,
+                unit_type=unit_type,
+                whisp_image=whisp_image,
+                validate_bands=True,  # If error, try with validation
+            )
+        except Exception as e:
+            print(f"An error occurred during Whisp stats processing: {e}")
+            raise e
+        # Then, convert the EE feature collection to DataFrame
+        try:
+            df_stats = convert_ee_to_df(
+                ee_object=stats_feature_collection,
+                remove_geom=remove_geom,
+            )
+        except Exception as e:
+            print(f"An error occurred during the conversion from EE to DataFrame: {e}")
+            raise e
     try:
         df_stats = convert_iso3_to_iso2(
             df=df_stats,
@@ -599,6 +896,13 @@ def whisp_stats_ee_to_df(
         print(f"An error occurred during point geometry area adjustment: {e}")
         # Continue without the adjustment rather than failing completely
+    # Reformat geometry types (MultiPolygon -> Polygon)
+    try:
+        df_stats = reformat_geometry_type(df_stats)
+    except Exception as e:
+        print(f"An error occurred during geometry type reformatting: {e}")
+        # Continue without the adjustment rather than failing completely
     return df_stats
@@ -623,12 +927,6 @@ def set_point_geometry_area_to_zero(df: pd.DataFrame) -> pd.DataFrame:
         )
         return df
-    if geometry_area_column not in df.columns:
-        print(
-            f"Warning: {geometry_area_column} column not found. Skipping area adjustment for points."
-        )
-        return df
     # Create a copy to avoid modifying the original
     df_modified = df.copy()
@@ -644,6 +942,43 @@ def set_point_geometry_area_to_zero(df: pd.DataFrame) -> pd.DataFrame:
     return df_modified
+def reformat_geometry_type(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Reformat geometry type classification in the DataFrame output.
+    Standardizes MultiPolygon geometry type to Polygon for consistent output.
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame containing geometry type column
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with standardized geometry types
+    """
+    # Check if required columns exist
+    if geometry_type_column not in df.columns:
+        print(
+            f"Warning: {geometry_type_column} column not found. Skipping geometry type reformatting."
+        )
+        return df
+    # Create a copy to avoid modifying the original
+    df_modified = df.copy()
+    # Reformat MultiPolygon to Polygon
+    multipolygon_mask = df_modified[geometry_type_column] == "MultiPolygon"
+    df_modified.loc[multipolygon_mask, geometry_type_column] = "Polygon"
+    # Log the changes
+    num_reformatted = multipolygon_mask.sum()
+    # if num_reformatted > 0:
+    #     print(f"Reformatted {num_reformatted} MultiPolygon geometries to Polygon")
+    return df_modified
 def whisp_stats_ee_to_drive(
     feature_collection: ee.FeatureCollection,
     external_id_column=None,
@@ -696,7 +1031,11 @@ def whisp_stats_ee_to_drive(
 # Get stats for a feature or feature collection
 def get_stats(
-    feature_or_feature_col, national_codes=None, unit_type="ha", whisp_image=None
+    feature_or_feature_col,
+    national_codes=None,
+    unit_type="ha",
+    whisp_image=None,
+    validate_bands=False,
 ):
     """
     Get stats for a feature or feature collection with optional pre-combined image.
@@ -725,16 +1064,27 @@ def get_stats(
         img_combined = whisp_image
         print("Using provided whisp_image")
     else:
-        img_combined = combine_datasets(national_codes=national_codes)
+        img_combined = combine_datasets(
+            national_codes=national_codes,
+            validate_bands=validate_bands,
+            include_context_bands=False,
+        )
         print(f"Combining datasets with national_codes: {national_codes}")
     # Check if the input is a Feature or a FeatureCollection
     if isinstance(feature_or_feature_col, ee.Feature):
         print("Processing single feature")
+        # OPTIMIZATION: Create cached images for single feature processing
+        water_all = get_water_flag_image()
+        bounds_ADM1 = get_admin_boundaries_fc()
         output = ee.FeatureCollection(
             [
                 get_stats_feature(
-                    feature_or_feature_col, img_combined, unit_type=unit_type
+                    feature_or_feature_col,
+                    img_combined,
+                    unit_type=unit_type,
+                    water_all=water_all,
+                    bounds_ADM1=bounds_ADM1,
                 )
             ]
         )
@@ -756,6 +1106,10 @@ def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=
     """
     Calculate statistics for a feature collection using Whisp datasets.
+    OPTIMIZATION: Fetches the cached water flag image and admin boundaries
+    feature collection once and reuses them for all features instead of
+    recreating them for each feature. This saves 7-15 seconds per analysis.
     Parameters
     ----------
     feature_col : ee.FeatureCollection
@@ -775,15 +1129,19 @@ def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=
     ee.FeatureCollection
         Feature collection with calculated statistics
     """
-    # # Use provided image or combine datasets
-    # if img_combined is None:
-    #     img_combined = combine_datasets(national_codes=national_codes)
+    # OPTIMIZATION: Create cached images once before processing features
+    # These will be reused for all features instead of being recreated each time
+    water_all = get_water_flag_image()
+    bounds_ADM1 = get_admin_boundaries_fc()
     out_feature_col = ee.FeatureCollection(
         feature_col.map(
             lambda feature: get_stats_feature(
-                feature, img_combined, unit_type=unit_type
+                feature,
+                img_combined,
+                unit_type=unit_type,
+                water_all=water_all,
+                bounds_ADM1=bounds_ADM1,
             )
         )
     )
@@ -796,10 +1154,15 @@ def get_stats_fc(feature_col, national_codes=None, unit_type="ha", img_combined=
 # Note: This function doesn't need whisp_image parameter since it already accepts img_combined directly
-def get_stats_feature(feature, img_combined, unit_type="ha"):
+def get_stats_feature(
+    feature, img_combined, unit_type="ha", water_all=None, bounds_ADM1=None
+):
     """
     Get statistics for a single feature using a pre-combined image.
+    OPTIMIZATION: Accepts the cached water flag image and admin boundaries
+    feature collection to avoid recreating them for every feature.
     Parameters
     ----------
     feature : ee.Feature
@@ -808,6 +1171,10 @@ def get_stats_feature(feature, img_combined, unit_type="ha"):
         Pre-combined image with all the datasets
     unit_type : str, optional
         Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
+    water_all : ee.Image, optional
+        Cached water flag image
+    bounds_ADM1 : ee.FeatureCollection, optional
+        Cached admin_boundaries feature collection
     Returns
     -------
@@ -822,8 +1189,8 @@ def get_stats_feature(feature, img_combined, unit_type="ha"):
         tileScale=8,
     )
-    # Get basic feature information
-    feature_info = get_type_and_location(feature)
+    # Get basic feature information with cached images
+    feature_info = get_type_and_location(feature, water_all, bounds_ADM1)
     # add statistics unit type (e.g., percentage or hectares) to dictionary
     stats_unit_type = ee.Dictionary({stats_unit_type_column: unit_type})
@@ -872,22 +1239,51 @@ def get_stats_feature(feature, img_combined, unit_type="ha"):
 # Get basic feature information - uses admin and water datasets in gee.
-def get_type_and_location(feature):
-    """Extracts basic feature information including country, admin area, geometry type, coordinates, and water flags."""
+def get_type_and_location(feature, water_all=None, bounds_ADM1=None):
+    """
+    Extracts basic feature information including country, admin area, geometry type, coordinates, and water flags.
+    OPTIMIZATION: Accepts cached water flag image and admin_boundaries collection
+    to avoid recreating them for every feature (saves 7-15 seconds per analysis).
+    Parameters
+    ----------
+    feature : ee.Feature
+        The feature to extract information from
+    water_all : ee.Image, optional
+        Cached water flag image. If None, creates it.
+    bounds_ADM1 : ee.FeatureCollection, optional
+        Cached admin_boundaries feature collection. If None, loads it.
+    Returns
+    -------
+    ee.Dictionary
+        Dictionary with feature information
+    """
     # Get centroid of the feature's geometry
-    centroid = feature.geometry().centroid(1)
+    centroid = feature.geometry().centroid(0.1)
+    # OPTIMIZATION: Use cached admin_boundaries
+    if bounds_ADM1 is None:
+        bounds_ADM1 = get_admin_boundaries_fc()
-    # Fetch location info from geoboundaries (country, admin)
-    location = ee.Dictionary(get_geoboundaries_info(centroid))
-    country = ee.Dictionary({iso3_country_column: location.get("shapeGroup")})
+    # Fetch location info from GAUL 2024 L1 (country, admin)
+    location = ee.Dictionary(get_admin_boundaries_info(centroid, bounds_ADM1))
+    country = ee.Dictionary({iso3_country_column: location.get("iso3_code")})
     admin_1 = ee.Dictionary(
-        {admin_1_column: location.get("shapeName")}
-    )  # Administrative level 1 (if available)
+        {admin_1_column: location.get("gaul1_name")}
+    )  # Administrative level 1 (from GAUL 2024 L1)
+    # OPTIMIZATION: Use cached water flag image
+    if water_all is None:
+        water_all = get_water_flag_image()
+    # OPTIMIZATION: Use cached water flag image
+    if water_all is None:
+        water_all = get_water_flag_image()
     # Prepare the water flag information
-    water_all = water_flag_all_prep()
     water_flag_dict = value_at_point_flag(
         point=centroid, image=water_all, band_name=water_flag, output_name=water_flag
     )
@@ -899,8 +1295,12 @@ def get_type_and_location(feature):
     coords_list = centroid.coordinates()
     coords_dict = ee.Dictionary(
         {
-            centroid_x_coord_column: coords_list.get(0),  # Longitude
-            centroid_y_coord_column: coords_list.get(1),  # Latitude
+            centroid_x_coord_column: ee.Number(coords_list.get(0)).format(
+                "%.6f"
+            ),  # Longitude (6 dp)
+            centroid_y_coord_column: ee.Number(coords_list.get(1)).format(
+                "%.6f"
+            ),  # Latitude (6 dp)
         }
     )
@@ -938,16 +1338,36 @@ def percent_and_format(val, area_ha):
     return ee.Number(formatted_value)
-# geoboundaries - admin units from a freqently updated database, allows commercial use (CC BY 4.0 DEED) (disputed territories may need checking)
-def get_geoboundaries_info(geometry):
-    gbounds_ADM0 = ee.FeatureCollection("WM/geoLab/geoBoundaries/600/ADM1")
-    polygonsIntersectPoint = gbounds_ADM0.filterBounds(geometry)
-    backup_dict = ee.Dictionary({"shapeGroup": "Unknown", "shapeName": "Unknown"})
+# GAUL 2024 L1 - admin units from FAO, allows commercial use
+def get_admin_boundaries_info(geometry, bounds_ADM1=None):
+    """
+    Get GAUL 2024 L1 info for a geometry (country ISO3 code and admin boundary name).
+    OPTIMIZATION: Accepts cached GAUL 2024 L1 FeatureCollection to avoid
+    reloading it for every feature (saves 2-5 seconds per analysis).
+    Parameters
+    ----------
+    geometry : ee.Geometry
+        The geometry to query
+    bounds_ADM1 : ee.FeatureCollection, optional
+        Cached GAUL 2024 L1 feature collection. If None, loads it.
+    Returns
+    -------
+    ee.Dictionary
+        Dictionary with iso3_code (country) and gaul1_name (admin boundary name)
+    """
+    if bounds_ADM1 is None:
+        bounds_ADM1 = get_admin_boundaries_fc()
+    polygonsIntersectPoint = bounds_ADM1.filterBounds(geometry)
+    backup_dict = ee.Dictionary({"iso3_code": "Unknown", "gaul1_name": "Unknown"})
     return ee.Algorithms.If(
         polygonsIntersectPoint.size().gt(0),
         polygonsIntersectPoint.first()
         .toDictionary()
-        .select(["shapeGroup", "shapeName"]),
+        .select(["iso3_code", "gaul1_name"]),
         backup_dict,
     )

openforis-whisp 2.0.0b2__py3-none-any.whl → 3.0.0a1__py3-none-any.whl

openforis-whisp 2.0.0b2py3-none-any.whl → 3.0.0a1py3-none-any.whl