openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -750,23 +750,43 @@ def validate_geojson_constraints(
     return results


-def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True):
+def suggest_processing_mode(
+    feature_count,
+    mean_area_ha=None,
+    mean_vertices=None,
+    feature_type="polygon",
+    verbose=True,
+):
     """
-    Suggest processing method based on polygon characteristics.
+    Suggest processing mode based on feature characteristics.
+
+    Decision thresholds from comprehensive benchmark data (Nov 2025):

-    Decision thresholds from benchmark data (area per polygon × polygon count):
-    - Small polygons (10 ha): need 250+ polygons for concurrent
-    - Medium polygons (100 ha): breakeven at ~100 polygons
-    - Large polygons (500 ha): concurrent wins at 50+ polygons
+    POINTS:
+    - Break-even: 750-1000 features
+    - Sequential faster: < 750 features
+    - Concurrent faster: >= 750 features
+
+    POLYGONS (area-based thresholds):
+    - Tiny (< 1 ha): break-even ~500 features
+    - Small (1-5 ha, simple): break-even ~500 features
+    - Small (1-5 ha, complex 20-50v): break-even ~500 features
+    - Medium (5-20 ha): break-even ~250 features
+    - Large (20-100 ha): break-even ~250 features
+    - Very large (50-200 ha): break-even ~250 features
+
+    Vertex complexity adjustment: High vertex counts (>50) favor concurrent at lower thresholds

     Parameters:
     -----------
-    polygon_count : int
-        Number of polygons
-    mean_area_ha : float
-        Mean area per polygon in hectares
+    feature_count : int
+        Number of features (polygons or points)
+    mean_area_ha : float, optional
+        Mean area per polygon in hectares (required for polygons, ignored for points)
     mean_vertices : float, optional
-        Mean number of vertices per polygon (can influence decision for complex geometries)
+        Mean number of vertices per polygon (influences decision for complex geometries)
+    feature_type : str
+        'polygon', 'multipolygon', or 'point' (default: 'polygon')
     verbose : bool
         Print recommendation explanation

@@ -775,31 +795,63 @@ def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True
         str: 'concurrent' or 'sequential'
     """

-    # Primary decision based on area
-    if mean_area_ha >= 300:  # Large polygons
-        breakeven = 50
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
-    elif mean_area_ha >= 50:  # Medium polygons
-        breakeven = 100
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
-    else:  # Small polygons
+    # Points: simple threshold-based decision
+    if feature_type == "point":
+        breakeven = 750
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Points)")
+            print(f" Features: {feature_count} points")
+            print(f" Break-even: {breakeven} features | Method: {method.upper()}")
+
+        return method
+
+    # Polygons and MultiPolygons: area and complexity-based decision
+    # MultiPolygons use same breakpoints as Polygons
+    if mean_area_ha is None:
+        # Default to conservative threshold if area unknown
+        breakeven = 500
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Polygons - area unknown)")
+            print(f" Features: {feature_count} polygons")
+            print(
+                f" Break-even: {breakeven} (conservative) | Method: {method.upper()}"
+            )
+
+        return method
+
+    # Area-based thresholds from benchmark data
+    if mean_area_ha >= 20:  # Large to very large polygons
+        breakeven = 250
+    elif mean_area_ha >= 5:  # Medium polygons
         breakeven = 250
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
+    elif mean_area_ha >= 1:  # Small polygons
+        # Vertex complexity matters more for small polygons
+        if mean_vertices is not None and mean_vertices >= 30:
+            breakeven = 500  # Complex small polygons
+        else:
+            breakeven = 500  # Simple small polygons
+    else:  # Tiny polygons (< 1 ha)
+        breakeven = 500
+
+    # Vertex complexity adjustment for high-complexity geometries
+    if mean_vertices is not None and mean_vertices >= 50:
+        # High complexity: reduce breakeven by 20% (concurrent beneficial sooner)
+        breakeven = int(breakeven * 0.8)

-    # Optional adjustment based on vertex complexity (very high complexity favors concurrent)
-    if mean_vertices is not None and mean_vertices > 500:
-        # Reduce breakeven by 25% for very complex geometries
-        adjusted_breakeven = int(breakeven * 0.75)
-        method = "concurrent" if polygon_count >= adjusted_breakeven else "sequential"
+    method = "concurrent" if feature_count >= breakeven else "sequential"

     if verbose:
-        print(f"\nMETHOD RECOMMENDATION")
+        print(f"\nMETHOD RECOMMENDATION (Polygons)")
         print(
-            f" Polygons: {polygon_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
+            f" Features: {feature_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
         )
         if mean_vertices is not None:
             print(f" | Mean Vertices: {mean_vertices:.1f}", end="")
         print()
-        print(f" Breakeven: {breakeven} polygons | Method: {method.upper()}")
+        print(f" Break-even: {breakeven} features | Method: {method.upper()}")

     return method
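
As a quick orientation to the new thresholds, a minimal usage sketch (illustrative: the import path is an assumption, and the function may only be exposed internally):

    # Hypothetical import path; not confirmed by this diff
    from openforis_whisp import suggest_processing_mode

    # 2,000 points sit above the 750-feature break-even, so concurrent is suggested
    suggest_processing_mode(feature_count=2000, feature_type="point", verbose=False)
    # -> 'concurrent'

    # 100 medium polygons (~10 ha) sit below the 250-feature break-even, so sequential
    suggest_processing_mode(feature_count=100, mean_area_ha=10.0, verbose=False)
    # -> 'sequential'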
@@ -1160,6 +1160,20 @@ def nci_ocs2020_prep():
     ).selfMask()  # cocoa from national land cover map for Côte d'Ivoire


+# nCM - Cameroon
+# data from Aurelie Shapiro (FAO) working directly with country experts - info on methods and accuracy assessment to follow
+
+
+def ncm_treecover_2020_prep():
+    return (
+        ee.Image("projects/ee-cocoacmr/assets/land_cover/CMR_TNTMMU_2020")
+        .select("FNF_2020")
+        .eq(1)
+        .rename("nCM_Treecover_2020")
+        .selfMask()
+    )
+
+
 # ============================================================================
 # CONTEXT BANDS (Administrative boundaries and water mask)
 # ============================================================================
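
For readers unfamiliar with the Earth Engine idiom: the new prep function selects the forest/non-forest band FNF_2020, .eq(1) turns it into a 0/1 forest indicator, .rename(...) sets the output band name, and .selfMask() drops the zero-valued pixels, leaving a binary tree-cover layer consistent with the other national prep functions.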
openforis_whisp/logger.py CHANGED
@@ -8,9 +8,21 @@ BASE_MSG_FORMAT = (

 class StdoutLogger:
     def __init__(self, name: str, msg_format: str = BASE_MSG_FORMAT) -> None:
-        self.handler = logging.StreamHandler(sys.stdout)
-        self.handler.setFormatter(logging.Formatter(msg_format))
-        self.handler.setLevel(logging.DEBUG)
+        # Create handler that auto-flushes for Colab/notebook visibility
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter(msg_format))
+        handler.setLevel(logging.DEBUG)
+
+        # Override emit to force flush after each message
+        original_emit = handler.emit
+
+        def emit_with_flush(record):
+            original_emit(record)
+            sys.stdout.flush()
+
+        handler.emit = emit_with_flush
+
+        self.handler = handler
         self.logger = logging.getLogger(name)
         self.logger.addHandler(self.handler)
         self.logger.propagate = False
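
Usage is unchanged; a minimal sketch using the names as they appear in this module:

    logger = StdoutLogger(__name__).logger
    logger.info("flushed immediately, so Colab cell output stays in sync")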
@@ -2,9 +2,9 @@ name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude
 EUFO_2020,10,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_jrc_gfc_2020_prep
 GLAD_Primary,20,,treecover,primary,1,1,0,float32,1,0,g_glad_pht_prep
 TMF_undist,30,,treecover,primary,1,1,0,float32,1,0,g_jrc_tmf_undisturbed_prep
-GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
+GFC_TC_2020,50,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_glad_gfc_10pc_prep
 Forest_FDaP,60,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
-ESA_TC_2020,70,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_esa_worldcover_trees_prep
+ESA_TC_2020,70,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_esa_worldcover_trees_prep
 TMF_plant,80,,commodities,NA,1,1,0,float32,1,0,g_jrc_tmf_plantation_prep
 Oil_palm_Descals,90,,commodities,NA,1,1,0,float32,1,0,g_creaf_descals_palm_prep
 Oil_palm_FDaP,100,,commodities,NA,1,1,0,float32,1,0,g_fdap_palm_prep
@@ -197,3 +197,4 @@ nBR_INPE_TCamz_pasture_2020,2422,BR,commodities,NA,1,1,0,float32,1,0,nbr_terracl
 nBR_INPE_TCcer_pasture_2020,2423,BR,commodities,NA,1,1,0,float32,1,0,nbr_terraclass_cer20_ac_prep
 nBR_MapBiomas_col9_pasture_2020,2424,BR,commodities,NA,1,1,0,float32,1,0,nbr_mapbiomasc9_pasture_prep
 nCI_Cocoa_bnetd,3000,CI,commodities,NA,1,1,0,float32,1,0,nci_ocs2020_prep
+nCM_Treecover_2020,3100,CM,treecover,NA,1,0,0,float32,1,0,ncm_treecover_2020_prep
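
The final CSV column names the prep function, so adding a national layer amounts to one function plus one lookup row. A hypothetical sketch of resolving a row to its Earth Engine band (the file path and module are assumptions for illustration, not the package's actual loader):

    import pandas as pd
    import openforis_whisp.datasets as datasets  # assumed module holding the *_prep functions

    lookup = pd.read_csv("lookup_gee_datasets.csv")  # assumed filename
    row = lookup.loc[lookup["name"] == "nCM_Treecover_2020"].iloc[0]
    prep_fn = getattr(datasets, row.iloc[-1])  # last column holds the prep function name
    image = prep_fn()  # ee.Image with band 'nCM_Treecover_2020'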
@@ -1,5 +1,10 @@
-import pandera as pa
-from pandera.typing import DataFrame, Series
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+    from pandera.typing.pandas import DataFrame, Series
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+    from pandera.typing import DataFrame, Series

 # Define a schema for validating a DataFrame related to GEE (Google Earth Engine) datasets.
 class DataLookupSchema(pa.DataFrameModel):
@@ -1,5 +1,10 @@
 # !pip install pandera[io] # special version used
-import pandera as pa
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+
 import pandas as pd
 import os
 import logging
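
Context for the try/except: newer pandera releases expose the pandas-specific API under pandera.pandas (and pandera.typing.pandas) and are deprecating the top-level import for DataFrame schemas, so this fallback keeps the package working across both old and new pandera versions.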
@@ -125,7 +130,7 @@ def validate_dataframe(
     Returns:
         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
     """
-    log_missing_columns(df_stats, schema)
+    _log_missing_columns(df_stats, schema)

     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)

@@ -251,7 +256,7 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
     # return logger


-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+def _log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
     # Initialize the logger
     logger = setup_logger(__name__)
@@ -675,33 +680,6 @@ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:


 # Fix the duplicate logging issue
-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
-    # Remove the duplicate logger creation line
-    # logger = setup_logger(__name__)  # DELETE THIS LINE
-
-    # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
-
-    # Extract the expected columns from the DataFrameSchema
-    template_columns = list(template_schema.columns.keys())
-    df_stats_columns = df_stats.columns.tolist()
-
-    # Find missing and extra columns
-    missing_in_df = [col for col in template_columns if col not in df_stats_columns]
-    extra_in_df = [col for col in df_stats_columns if col not in template_columns]
-
-    # Log missing schema columns
-    if missing_in_df:
-        logger.warning(f"Missing expected schema columns: {missing_in_df}")
-    else:
-        logger.info("All expected schema columns found in DataFrame.")
-
-    # Log extra columns (will be preserved)
-    if extra_in_df:
-        logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
-    else:
-        logger.info("No extra columns found in DataFrame.")
-
-
 def format_stats_dataframe(
     df,
     area_col="Area_sum",
openforis_whisp/stats.py CHANGED
@@ -88,12 +88,10 @@ def get_admin_boundaries_fc():
 def whisp_formatted_stats_geojson_to_df_legacy(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,
     custom_bands=None,  # New parameter
-    validate_geometries: bool = False,
 ) -> pd.DataFrame:
     """
     Legacy function for basic Whisp stats extraction.
@@ -135,56 +133,19 @@ def whisp_formatted_stats_geojson_to_df_legacy(
         - List of band names: ['Aa_test', 'elevation']
         - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
         - None: preserves all extra columns automatically
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.

     Returns
     -------
     df_stats : pd.DataFrame
         The DataFrame containing the Whisp stats for the input ROI.
     """
-    # Load GeoJSON and validate geometries if requested
-    if validate_geometries:
-        import json
-        import geopandas as gpd
-        from shapely.validation import make_valid
-        import logging as py_logging
-
-        logger = py_logging.getLogger("whisp")
-
-        # Load GeoJSON file
-        with open(input_geojson_filepath, "r") as f:
-            geojson_data = json.load(f)
-
-        # Convert to GeoDataFrame
-        gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
-
-        # Validate and fix invalid geometries
-        valid_count = gdf.geometry.is_valid.sum()
-        invalid_count = len(gdf) - valid_count
-        if invalid_count > 0:
-            logger.warning(f"Fixing {invalid_count} invalid geometries")
-            gdf["geometry"] = gdf["geometry"].apply(
-                lambda g: make_valid(g) if g and not g.is_valid else g
-            )
-
-        # Pass GeoDataFrame directly to preserve CRS metadata
-        # convert_geojson_to_ee will handle:
-        # - CRS detection and conversion to WGS84 if needed
-        # - Data type sanitization (datetime, object columns)
-        # - Geometry validation and Z-coordinate stripping
-        feature_collection = convert_geojson_to_ee(
-            gdf, enforce_wgs84=True, strip_z_coords=True
-        )
-    else:
-        # Original path - no validation
-        feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
+    # Convert GeoJSON to Earth Engine FeatureCollection
+    # Note: Geometry validation/cleaning should be done before calling this function
+    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

     return whisp_formatted_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
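
Since the legacy path no longer repairs geometries, cleaning is expected up front. A minimal pre-cleaning sketch reusing the same geopandas/shapely approach the removed block used (file names illustrative):

    import geopandas as gpd
    from shapely.validation import make_valid

    gdf = gpd.read_file("plots.geojson")  # illustrative input
    invalid = ~gdf.geometry.is_valid
    if invalid.any():
        gdf.loc[invalid, "geometry"] = gdf.loc[invalid, "geometry"].apply(make_valid)
    gdf.to_file("plots_clean.geojson", driver="GeoJSON")  # then pass this cleaned file to Whisp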
@@ -203,8 +164,8 @@ def whisp_formatted_stats_geojson_to_df(
     mode: str = "sequential",
     batch_size: int = 10,
     max_concurrent: int = 20,
-    validate_geometries: bool = False,
-    include_geometry_audit_trail: bool = False,
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.
@@ -226,11 +187,7 @@ def whisp_formatted_stats_geojson_to_df(
         The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
         This column must exist as a property in ALL features of the GeoJSON file.
         Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
-    remove_geom : bool, default=False
-        If True, the geometry of the GeoJSON is removed from the output DataFrame.
     national_codes : list, optional
-        List of ISO2 country codes to include national datasets.
-    unit_type: str, optional
         Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
     whisp_image : ee.Image, optional
         Pre-combined multiband Earth Engine Image containing all Whisp datasets.
@@ -252,12 +209,7 @@ def whisp_formatted_stats_geojson_to_df(
     max_concurrent : int, optional
         Maximum concurrent EE calls for concurrent mode, by default 20.
         Only applicable for "concurrent" mode.
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.
-        For production workflows, it's recommended to use geometry validation and
-        cleaning tools BEFORE processing with this function.
-    include_geometry_audit_trail : bool, default True
+    geometry_audit_trail : bool, default True
         If True (default), includes audit trail columns:
         - geo_original: Original input geometry
         - geometry_type_original: Original geometry type
@@ -267,6 +219,13 @@ def whisp_formatted_stats_geojson_to_df(

         Processing metadata stored in df.attrs['processing_metadata'].
         These columns enable full transparency for geometry modifications during processing.
+    status_file : str, optional
+        Path to JSON status file or directory for real-time progress tracking.
+        If a directory is provided, creates 'whisp_processing_status.json' in that directory.
+        Updates every 3 minutes and at progress milestones (5%, 10%, etc.).
+        Format: {"status": "processing", "progress": "450/1000", "percent": 45.0,
+                 "elapsed_sec": 120, "eta_sec": 145, "updated_at": "2025-11-13T14:23:45"}
+        Most useful for large concurrent jobs. Works in both concurrent and sequential modes.

     Returns
     -------
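
A minimal sketch of consuming the status file from another process (key names taken from the documented format above; the path is illustrative):

    import json
    import time

    while True:
        with open("whisp_processing_status.json") as f:  # path passed as status_file
            status = json.load(f)
        print(f"{status['progress']} ({status['percent']}%), ETA {status.get('eta_sec')}s")
        if status["status"] != "processing":
            break
        time.sleep(60)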
@@ -326,12 +285,10 @@ def whisp_formatted_stats_geojson_to_df(
         return whisp_formatted_stats_geojson_to_df_legacy(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
             custom_bands=custom_bands,
-            validate_geometries=validate_geometries,
         )
     elif mode in ("concurrent", "sequential"):
         # Log info if batch_size or max_concurrent are not used in sequential mode
337
294
  # Log info if batch_size or max_concurrent are not used in sequential mode
@@ -350,7 +307,6 @@ def whisp_formatted_stats_geojson_to_df(
350
307
  return whisp_formatted_stats_geojson_to_df_fast(
351
308
  input_geojson_filepath=input_geojson_filepath,
352
309
  external_id_column=external_id_column,
353
- remove_geom=remove_geom,
354
310
  national_codes=national_codes,
355
311
  unit_type=unit_type,
356
312
  whisp_image=whisp_image,
@@ -358,8 +314,8 @@ def whisp_formatted_stats_geojson_to_df(
             mode=mode,  # Pass mode directly (concurrent or sequential)
             batch_size=batch_size,
             max_concurrent=max_concurrent,
-            validate_geometries=validate_geometries,
-            include_geometry_audit_trail=include_geometry_audit_trail,
+            geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:
         raise ValueError(
@@ -518,7 +474,6 @@ def whisp_formatted_stats_ee_to_df(
 def whisp_stats_geojson_to_df(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,  # New parameter
@@ -551,7 +506,6 @@ def whisp_stats_geojson_to_df(
     return whisp_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,  # Pass through
@@ -1035,7 +989,7 @@ def whisp_stats_ee_to_drive(
         )
         task.start()
         print(
-            "Exporting to Google Drive: 'whisp_results/whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
+            "Exporting to Google Drive: 'whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
         )
     except Exception as e:
         print(f"An error occurred during the export: {e}")