openforis-whisp 2.0.0b3__py3-none-any.whl → 3.0.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ from shapely.geometry import shape
  from pathlib import Path

  # Existing imports
- from typing import List, Any
+ from typing import List, Any, Union
  from geojson import Feature, FeatureCollection, Polygon, Point
  import json
  import os
@@ -13,32 +13,32 @@ import ee


  def convert_geojson_to_ee(
-     geojson_filepath: Any, enforce_wgs84: bool = True, strip_z_coords: bool = True
+     geojson_filepath: Union[str, Path, dict],
+     enforce_wgs84: bool = True,
+     strip_z_coords: bool = True,
  ) -> ee.FeatureCollection:
      """
-     Reads a GeoJSON file from the given path and converts it to an Earth Engine FeatureCollection.
+     Converts GeoJSON data to an Earth Engine FeatureCollection.
+     Accepts either a file path or a GeoJSON dictionary object.
      Optionally checks and converts the CRS to WGS 84 (EPSG:4326) if needed.
      Automatically handles 3D coordinates by stripping Z values when necessary.

      Args:
-         geojson_filepath (Any): The filepath to the GeoJSON file.
+         geojson_filepath (Union[str, Path, dict]): The filepath to the GeoJSON file (str or Path)
+             or a GeoJSON dictionary object.
          enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326). Defaults to True.
+             Only applies when input is a file path (dicts are assumed to be in WGS84).
          strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries. Defaults to True.

      Returns:
          ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
      """
-     if isinstance(geojson_filepath, (str, Path)):
+     if isinstance(geojson_filepath, dict):
+         # Input is already a GeoJSON dictionary - skip file reading
+         geojson_data = geojson_filepath
+     elif isinstance(geojson_filepath, (str, Path)):
          file_path = os.path.abspath(geojson_filepath)

-         # Apply print_once deduplication for file reading message
-         if not hasattr(convert_geojson_to_ee, "_printed_file_messages"):
-             convert_geojson_to_ee._printed_file_messages = set()
-
-         if file_path not in convert_geojson_to_ee._printed_file_messages:
-             print(f"Reading GeoJSON file from: {file_path}")
-             convert_geojson_to_ee._printed_file_messages.add(file_path)
-
          # Use GeoPandas to read the file and handle CRS
          gdf = gpd.read_file(file_path)

@@ -56,15 +56,17 @@ def convert_geojson_to_ee(
          # Check and convert CRS if needed
          if enforce_wgs84:
              if gdf.crs is None:
-                 print("Warning: Input GeoJSON has no CRS defined, assuming WGS 84")
+                 # Assuming WGS 84 if no CRS defined
+                 pass
              elif gdf.crs != "EPSG:4326":
-                 print(f"Converting CRS from {gdf.crs} to WGS 84 (EPSG:4326)")
                  gdf = gdf.to_crs("EPSG:4326")

          # Convert to GeoJSON
          geojson_data = json.loads(gdf.to_json())
      else:
-         raise ValueError("Input must be a file path (str or Path)")
+         raise ValueError(
+             "Input must be a file path (str or Path) or a GeoJSON dictionary object (dict)"
+         )

      validation_errors = validate_geojson(geojson_data)
      if validation_errors:
@@ -101,7 +103,7 @@ def convert_geojson_to_ee(

              success_message_key = f"z_coords_success_{file_path}"
              if success_message_key not in convert_geojson_to_ee._printed_z_messages:
-                 print("Successfully converted after stripping Z coordinates")
+                 print("Successfully converted after stripping Z coordinates")
                  convert_geojson_to_ee._printed_z_messages.add(success_message_key)

      return feature_collection
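
Since convert_geojson_to_ee now also accepts an in-memory GeoJSON dictionary, callers no longer need to write a temporary file first. A minimal usage sketch, assuming an authenticated and initialized Earth Engine session; the import path is assumed, not shown in this diff:

import ee
from openforis_whisp import convert_geojson_to_ee  # import path assumed

ee.Initialize()

# From a file path: the CRS is checked and reprojected to EPSG:4326 if needed
fc_from_file = convert_geojson_to_ee("plots.geojson")

# From an in-memory dict: assumed to already be in WGS 84, no file I/O
geojson_dict = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {"id": 1},
            "geometry": {"type": "Point", "coordinates": [15.3, -4.4]},
        }
    ],
}
fc_from_dict = convert_geojson_to_ee(geojson_dict)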
@@ -250,6 +252,58 @@ def convert_shapefile_to_ee(shapefile_path):
      return roi


+ # def convert_ee_to_df(
+ #     ee_object,
+ #     columns=None,
+ #     remove_geom=False,
+ #     **kwargs,
+ # ):
+ #     """Converts an ee.FeatureCollection to pandas dataframe.
+
+ #     Args:
+ #         ee_object (ee.FeatureCollection): ee.FeatureCollection.
+ #         columns (list): List of column names. Defaults to None.
+ #         remove_geom (bool): Whether to remove the geometry column. Defaults to True.
+ #         kwargs: Additional arguments passed to ee.data.computeFeature.
+
+ #     Raises:
+ #         TypeError: ee_object must be an ee.FeatureCollection
+
+ #     Returns:
+ #         pd.DataFrame: pandas DataFrame
+ #     """
+ #     if isinstance(ee_object, ee.Feature):
+ #         ee_object = ee.FeatureCollection([ee_object])
+
+ #     if not isinstance(ee_object, ee.FeatureCollection):
+ #         raise TypeError("ee_object must be an ee.FeatureCollection")
+
+ #     try:
+ #         if remove_geom:
+ #             data = ee_object.map(
+ #                 lambda f: ee.Feature(None, f.toDictionary(f.propertyNames().sort()))
+ #             )
+ #         else:
+ #             data = ee_object
+
+ #         kwargs["expression"] = data
+ #         kwargs["fileFormat"] = "PANDAS_DATAFRAME"
+
+ #         df = ee.data.computeFeatures(kwargs)
+
+ #         if isinstance(columns, list):
+ #             df = df[columns]
+
+ #         if remove_geom and ("geometry" in df.columns):
+ #             df = df.drop(columns=["geometry"], axis=1)
+
+ #         # Sorting columns is not supported server-side and is removed from this function.
+
+ #         return df
+ #     except Exception as e:
+ #         raise Exception(e)
+
+
  def convert_ee_to_df(
      ee_object,
      columns=None,
@@ -257,49 +311,37 @@ def convert_ee_to_df(
      sort_columns=False,
      **kwargs,
  ):
-     """Converts an ee.FeatureCollection to pandas dataframe.
+     """
+     Converts an ee.FeatureCollection to pandas DataFrame, maximizing server-side operations.

      Args:
          ee_object (ee.FeatureCollection): ee.FeatureCollection.
-         columns (list): List of column names. Defaults to None.
-         remove_geom (bool): Whether to remove the geometry column. Defaults to True.
-         sort_columns (bool): Whether to sort the column names. Defaults to False.
-         kwargs: Additional arguments passed to ee.data.computeFeature.
-
-     Raises:
-         TypeError: ee_object must be an ee.FeatureCollection
+         columns (list): List of column names to select (server-side if possible).
+         remove_geom (bool): Remove geometry column server-side.
+         kwargs: Additional arguments for ee.data.computeFeatures.

      Returns:
          pd.DataFrame: pandas DataFrame
      """
+     import ee
+
      if isinstance(ee_object, ee.Feature):
          ee_object = ee.FeatureCollection([ee_object])

      if not isinstance(ee_object, ee.FeatureCollection):
          raise TypeError("ee_object must be an ee.FeatureCollection")

-     try:
-         if remove_geom:
-             data = ee_object.map(
-                 lambda f: ee.Feature(None, f.toDictionary(f.propertyNames().sort()))
-             )
-         else:
-             data = ee_object
+     # Server-side: select columns and remove geometry
+     if columns is not None:
+         ee_object = ee_object.select(columns)
+     if remove_geom:
+         ee_object = ee_object.map(lambda f: ee.Feature(None, f.toDictionary()))

-         kwargs["expression"] = data
+     try:
+         kwargs["expression"] = ee_object
          kwargs["fileFormat"] = "PANDAS_DATAFRAME"
-
          df = ee.data.computeFeatures(kwargs)

-         if isinstance(columns, list):
-             df = df[columns]
-
-         if remove_geom and ("geometry" in df.columns):
-             df = df.drop(columns=["geometry"], axis=1)
-
-         if sort_columns:
-             df = df.reindex(sorted(df.columns), axis=1)
-
          return df
      except Exception as e:
          raise Exception(e)
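
The rewritten convert_ee_to_df pushes column selection and geometry removal to the server before a single computeFeatures call. A hedged usage sketch, assuming an initialized Earth Engine session and that the function is importable from the package:

import ee

ee.Initialize()

fc = ee.FeatureCollection("FAO/GAUL/2015/level0").limit(5)

# Column selection and geometry removal now happen server-side,
# so only the requested properties are transferred by computeFeatures.
df = convert_ee_to_df(fc, columns=["ADM0_NAME"], remove_geom=True)
print(df.head())

Note that sort_columns remains in the signature in this version but is no longer applied to the returned DataFrame.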
@@ -443,7 +485,7 @@ def convert_csv_to_geojson(
      try:
          df = pd.read_csv(csv_filepath)

-         df_to_geojson(df, geojson_filepath, geo_column)
+         convert_df_to_geojson(df, geojson_filepath, geo_column)

      except Exception as e:
          print(f"An error occurred while converting CSV to GeoJSON: {e}")
@@ -177,7 +177,7 @@ def g_jrc_tmf_plantation_prep():
      plantation_2020 = plantation.where(
          deforestation_year.gte(2021), 0
      )  # update from https://github.com/forestdatapartnership/whisp/issues/42
-     return plantation_2020.rename("TMF_plant")
+     return plantation_2020.rename("TMF_plant").selfMask()


  # # Oil_palm_Descals
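
This is the first of many hunks in this module that append .selfMask() to a binary layer; the call masks out zero-valued pixels so only "presence" pixels remain unmasked for the reducers run later. A small illustrative sketch of the pattern, using a public asset for illustration rather than one of the Whisp layers (asset ID assumed current):

import ee

ee.Initialize()

# Hypothetical binary layer: 1 where any forest loss was mapped, 0 elsewhere
gfc = ee.Image("UMD/hansen/global_forest_change_2023_v1_11")
loss_flag = gfc.select("lossyear").gt(0).rename("loss_flag")

# selfMask() turns the zeros into masked pixels, so statistics and the later
# pixelArea() multiplication only operate on presence pixels.
loss_presence_only = loss_flag.selfMask()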
@@ -390,6 +390,7 @@ def g_radd_year_prep():
              .updateMask(radd_date.lte(end))
              .gt(0)
              .rename("RADD_year_" + "20" + str(year))
+             .selfMask()
          )
          return ee.Image(img_stack).addBands(radd_year)

@@ -403,6 +404,7 @@ def g_radd_year_prep():
              .updateMask(radd_date.lte(end))
              .gt(0)
              .rename(band_name)
+             .selfMask()
          )

      def make_band(year, img_stack):
@@ -415,6 +417,7 @@ def g_radd_year_prep():
              .updateMask(radd_date.lte(end))
              .gt(0)
              .rename(band_name)
+             .selfMask()
          )
          return ee.Image(img_stack).addBands(radd_year)

@@ -431,7 +434,7 @@ def g_tmf_def_per_year_prep():
      for i in range(0, 24 + 1):
          year_num = ee.Number(2000 + i)
          band_name = ee.String("TMF_def_").cat(year_num.format("%d"))
-         tmf_def_year = tmf_def.eq(year_num).rename(band_name)
+         tmf_def_year = tmf_def.eq(year_num).rename(band_name).selfMask()
          if img_stack is None:
              img_stack = tmf_def_year
          else:
@@ -448,7 +451,7 @@ def g_tmf_deg_per_year_prep():
      for i in range(0, 24 + 1):
          year_num = ee.Number(2000 + i)
          band_name = ee.String("TMF_deg_").cat(year_num.format("%d"))
-         tmf_def_year = tmf_def.eq(year_num).rename(band_name)
+         tmf_def_year = tmf_def.eq(year_num).rename(band_name).selfMask()
          if img_stack is None:
              img_stack = tmf_def_year
          else:
@@ -468,7 +471,7 @@ def g_glad_gfc_loss_per_year_prep():
          gfc_loss_year = (
              gfc.select(["lossyear"]).eq(i).And(gfc.select(["treecover2000"]).gt(10))
          )
-         gfc_loss_year = gfc_loss_year.rename(band_name)
+         gfc_loss_year = gfc_loss_year.rename(band_name).selfMask()
          if img_stack is None:
              img_stack = gfc_loss_year
          else:
@@ -499,6 +502,7 @@ def g_modis_fire_prep():
              .select(["BurnDate"])
              .gte(0)
              .rename(band_name)
+             .selfMask()
          )
          img_stack = modis_year if img_stack is None else img_stack.addBands(modis_year)

@@ -528,6 +532,7 @@ def g_esa_fire_prep():
              .select(["BurnDate"])
              .gte(0)
              .rename(band_name)
+             .selfMask()
          )
          img_stack = esa_year if img_stack is None else img_stack.addBands(esa_year)

@@ -1155,10 +1160,55 @@ def nci_ocs2020_prep():
      ).selfMask()  # cocoa from national land cover map for Côte d'Ivoire


+ # ============================================================================
+ # CONTEXT BANDS (Administrative boundaries and water mask)
+ # ============================================================================
+
+
+ def g_gaul_admin_code():
+     """
+     GAUL 2024 Level 1 administrative boundary codes (500m resolution).
+     Used for spatial context and administrative aggregation.
+
+     Returns
+     -------
+     ee.Image
+         Image with admin codes renamed to 'admin_code' (as int32)
+     """
+     admin_image = ee.Image(
+         "projects/ee-andyarnellgee/assets/gaul_2024_level_1_code_500m"
+     )
+     # Cast to int32 to ensure integer GAUL codes, then rename
+     return admin_image.rename("admin_code")
+
+
+ def g_water_mask_prep():
+     """
+     Water mask from JRC/USGS combined dataset.
+     Used to identify water bodies for downstream filtering and context.
+
+     Multiplied by pixel area to get water area in hectares.
+
+     Returns
+     -------
+     ee.Image
+         Binary water mask image renamed to In_waterbody (will be multiplied by pixel area)
+     """
+     from openforis_whisp.parameters.config_runtime import water_flag
+
+     water_mask_image = ee.Image("projects/ee-andyarnellgee/assets/water_mask_jrc_usgs")
+     return water_mask_image.selfMask().rename(water_flag)
+
+
  ###Combining datasets


- def combine_datasets(national_codes=None, validate_bands=False):
+ def combine_datasets(
+     national_codes=None,
+     validate_bands=False,
+     include_context_bands=True,
+     auto_recovery=False,
+ ):
      """
      Combines datasets into a single multiband image, with fallback if assets are missing.

@@ -1169,48 +1219,76 @@ def combine_datasets(national_codes=None, validate_bands=False):
      validate_bands : bool, optional
          If True, validates band names with a slow .getInfo() call (default: False)
          Only enable for debugging. Normal operation relies on exception handling.
+     include_context_bands : bool, optional
+         If True (default), includes context bands (admin_code, water_flag) in the output.
+         Set to False when using stats.py implementations that compile datasets differently.
+     auto_recovery : bool, optional
+         If True (default), automatically enables validate_bands when an error is detected
+         during initial assembly. This allows graceful recovery from missing/broken datasets.

      Returns
      -------
      ee.Image
-         Combined multiband image with all datasets
+         Combined multiband image with all datasets (and optionally context bands)
      """
-     img_combined = ee.Image(1).rename(geometry_area_column)
-
-     # Combine images directly
-     for img in [func() for func in list_functions(national_codes=national_codes)]:
+     # Step 1: Combine all main dataset images
+     all_images = [ee.Image(1).rename(geometry_area_column)]
+     for func in list_functions(national_codes=national_codes):
          try:
-             img_combined = img_combined.addBands(img)
-             # img_combined = img_combined.addBands(img)
+             all_images.append(func())
          except ee.EEException as e:
-             # logger.error(f"Error adding image: {e}")
-             print(f"Error adding image: {e}")
+             print(f"Error loading image: {e}")
+
+     img_combined = ee.Image.cat(all_images)

-     # OPTIMIZATION: Removed slow .getInfo() call for band validation
-     # The validation is now optional and disabled by default
-     # Image processing will fail downstream if there's an issue, which is handled by exception blocks
-     if validate_bands:
+     # Step 2: Determine if validation needed
+     should_validate = validate_bands
+     if auto_recovery and not validate_bands:
          try:
-             # This is SLOW - only use for debugging
-             img_combined.bandNames().getInfo()
+             # Fast error detection: batch check main + context bands in one call
+             bands_to_check = [img_combined.bandNames().get(0)]
+             if include_context_bands:
+                 admin_image = g_gaul_admin_code()
+                 water_mask = g_water_mask_prep()
+                 bands_to_check.extend(
+                     [admin_image.bandNames().get(0), water_mask.bandNames().get(0)]
+                 )
+             ee.List(bands_to_check).getInfo()  # trigger error if any band is invalid
          except ee.EEException as e:
-             # logger.error(f"Error validating band names: {e}")
-             # logger.info("Running code for filtering to only valid datasets due to error in input")
-             print("using valid datasets filter due to error in validation")
-             # Validate images
-             images_to_test = [
-                 func() for func in list_functions(national_codes=national_codes)
-             ]
-             valid_imgs = keep_valid_images(images_to_test)  # Validate images
-
-             # Retry combining images after validation
-             img_combined = ee.Image(1).rename(geometry_area_column)
-             for img in valid_imgs:
-                 img_combined = img_combined.addBands(img)
+             print(f"Error detected, enabling recovery mode: {str(e)[:80]}...")
+             should_validate = True

+     # Step 3: Validate and recover if needed
+     if should_validate:
+         try:
+             img_combined.bandNames().getInfo()  # check all bands
+         except ee.EEException as e:
+             print("Using valid datasets filter due to error in validation")
+             valid_imgs = keep_valid_images(
+                 [func() for func in list_functions(national_codes=national_codes)]
+             )
+             all_images_retry = [ee.Image(1).rename(geometry_area_column)]
+             all_images_retry.extend(valid_imgs)
+             img_combined = ee.Image.cat(all_images_retry)
+
+     # Step 4: Multiply main datasets by pixel area
      img_combined = img_combined.multiply(ee.Image.pixelArea())
-     print("Whisp multiband image compiled")

+     # Step 5: Add context bands (admin_code only - water mask is now in prep functions)
+     if include_context_bands:
+         for band_func, band_name in [
+             (g_gaul_admin_code, "admin_code"),
+             (g_water_mask_prep, "In_waterbody"),
+         ]:
+             try:
+                 band_img = band_func()
+                 if should_validate:
+                     band_img.bandNames().getInfo()
+                 img_combined = img_combined.addBands(band_img)
+             except ee.EEException as e:
+                 print(f"Warning: Could not add {band_name} band: {e}")
+
+     print("Whisp multiband image compiled")
      return img_combined


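
combine_datasets now assembles the stack with a single ee.Image.cat() call, optionally appends the two context bands after the pixel-area scaling, and can switch itself into the validation/recovery path when auto_recovery is set. A hedged usage sketch; the import path and the national code shown are assumptions, and the context-band assets must be readable from your Earth Engine account:

import ee

ee.Initialize()

# Default behaviour: global datasets plus admin_code and In_waterbody context bands
img = combine_datasets(national_codes=["ci"])  # "ci" assumed as an available national code
print(img.bandNames().getInfo())

# Leaner stack for stats implementations that compile datasets themselves,
# with recovery enabled in case an asset is missing or broken
img_plain = combine_datasets(include_context_bands=False, auto_recovery=True)

# Context bands are added after the pixelArea() multiplication,
# so admin_code keeps its original (non area-scaled) values
admin = img.select("admin_code")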
@@ -1230,9 +1308,12 @@ def combine_datasets(national_codes=None, validate_bands=False):
  def list_functions(national_codes=None):
      """
      Returns a list of functions that end with "_prep" and either:
-     - Start with "g_" (global/regional products)
+     - Start with "g_" (global/regional products, excluding context bands)
      - Start with any provided national code prefix (nXX_)

+     Context band functions (g_gaul_admin_code, g_water_mask_prep) are handled
+     separately and excluded from this list to avoid duplication.
+
      Args:
          national_codes: List of ISO2 country codes (without the 'n' prefix)
      """
@@ -1243,15 +1324,19 @@ def list_functions(national_codes=None):
      if national_codes is None:
          national_codes = []

+     # Context band functions that are handled separately
+     context_functions = {"g_gaul_admin_code", "g_water_mask_prep"}
+
      # Create prefixes list with proper formatting ('n' + code + '_')
      allowed_prefixes = ["g_"] + [f"n{code.lower()}_" for code in national_codes]

-     # Filter functions in a single pass
+     # Filter functions in a single pass, excluding context band functions
      functions = [
          func
          for name, func in inspect.getmembers(current_module, inspect.isfunction)
          if name.endswith("_prep")
          and any(name.startswith(prefix) for prefix in allowed_prefixes)
+         and name not in context_functions
      ]

      return functions
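
With the exclusion set in place, list_functions returns only the dataset prep functions; the context-band helpers are appended later by combine_datasets. A brief sketch of the expected behaviour, assuming the module is importable and "ci" is an available national code:

# Hypothetical check: context-band prep functions are no longer in the list
funcs = list_functions(national_codes=["ci"])
names = [f.__name__ for f in funcs]
assert "g_water_mask_prep" not in names and "g_gaul_admin_code" not in names
assert all(n.endswith("_prep") and (n.startswith("g_") or n.startswith("nci_")) for n in names)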
@@ -1335,3 +1420,6 @@ def combine_custom_bands(custom_images, custom_bands_info):
      custom_ee_image = custom_ee_image.multiply(ee.Image.pixelArea())

      return custom_ee_image  # Only return the image
+
+
+ # %%
openforis_whisp/logger.py CHANGED
@@ -34,6 +34,19 @@ class StdoutLogger:
      def setLevel(self, level):
          self.logger.setLevel(level)

+     @property
+     def level(self):
+         """Return the logger's effective level."""
+         return self.logger.level
+
+     def hasHandlers(self):
+         """Check if the logger has any handlers."""
+         return self.logger.hasHandlers()
+
+     def addHandler(self, handler):
+         """Add a handler to the logger."""
+         self.logger.addHandler(handler)
+

  class FileLogger:
      def __init__(
@@ -73,3 +86,16 @@ class FileLogger:

      def setLevel(self, level):
          self.logger.setLevel(level)
+
+     @property
+     def level(self):
+         """Return the logger's effective level."""
+         return self.logger.level
+
+     def hasHandlers(self):
+         """Check if the logger has any handlers."""
+         return self.logger.hasHandlers()
+
+     def addHandler(self, handler):
+         """Add a handler to the logger."""
+         self.logger.addHandler(handler)
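
The added level property and the hasHandlers/addHandler pass-throughs let StdoutLogger and FileLogger stand in for a plain logging.Logger wherever only that subset of the API is used. A minimal sketch; the wrapper constructors are not shown in this diff, so the call below is hypothetical:

import logging
from openforis_whisp.logger import StdoutLogger

log = StdoutLogger()  # hypothetical: pass whatever arguments the constructor actually requires

# Code written against logging.Logger-style objects now works unchanged
if not log.hasHandlers():
    handler = logging.StreamHandler()
    handler.setLevel(log.level)
    log.addHandler(handler)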