openforis-whisp 3.0.0a7__py3-none-any.whl → 3.0.0a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -33,6 +33,7 @@ import subprocess
 from contextlib import redirect_stdout, contextmanager
 from pathlib import Path
 from typing import Optional, List, Dict, Any, Tuple, Union
+from importlib.metadata import version as get_version
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import tempfile
 
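The new `get_version` import lets the metadata hunks further down stamp outputs with the installed package version instead of the stale hard-coded `"3.0.0a1"` string. A minimal sketch of the stdlib behaviour, assuming the package is installed:

```python
from importlib.metadata import version as get_version

# Reads the version string from the installed distribution's metadata,
# e.g. "3.0.0a8", so it can never drift from the released package.
print(get_version("openforis-whisp"))
```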
@@ -924,10 +925,67 @@ def clean_geodataframe(
 
 
 # ============================================================================
-# BATCH RETRY HELPER
+# AUDIT TRAIL HELPER
 # ============================================================================
 
 
+def _add_geometry_audit_trail(
+    df_validated: pd.DataFrame,
+    input_geojson_filepath: str,
+    gdf_original_geoms: gpd.GeoDataFrame = None,
+    logger: logging.Logger = None,
+) -> pd.DataFrame:
+    """
+    Add original input geometries as geo_original column for audit trail.
+
+    Parameters
+    ----------
+    df_validated : pd.DataFrame
+        Validated DataFrame to add audit trail to
+    input_geojson_filepath : str
+        Path to original GeoJSON file
+    gdf_original_geoms : gpd.GeoDataFrame, optional
+        Pre-loaded original geometries (to avoid reloading)
+    logger : logging.Logger, optional
+        Logger for output
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with geo_original column added
+    """
+    import json
+    from shapely.geometry import mapping
+
+    logger = logger or logging.getLogger("whisp")
+
+    try:
+        # Load original geometries if not provided
+        if gdf_original_geoms is None:
+            logger.warning("Original geometries not pre-loaded, loading now...")
+            gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
+
+        # Create DataFrame with plotId and geo_original
+        df_original_geom = pd.DataFrame(
+            {
+                "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                "geo_original": gdf_original_geoms["geometry"].apply(
+                    lambda g: json.dumps(mapping(g)) if g is not None else None
+                ),
+            }
+        )
+
+        # Merge original geometries back
+        df_result = df_validated.merge(df_original_geom, on="plotId", how="left")
+        logger.info("Audit trail added: geo_original column")
+        return df_result
+
+    except Exception as e:
+        logger.warning(f"Error adding audit trail: {e}")
+        # Return original DataFrame if audit trail fails
+        return df_validated
+
+
 # ============================================================================
 # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
 # ============================================================================
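Each `geo_original` value produced by this helper is the input geometry serialised to a GeoJSON string with `shapely.geometry.mapping`. A small round-trip illustration (the polygon here is invented for the example):

```python
import json
from shapely.geometry import Polygon, mapping, shape

poly = Polygon([(0, 0), (1, 0), (1, 1), (0, 0)])

# What ends up in the geo_original column: a GeoJSON string
geo_original = json.dumps(mapping(poly))

# An auditor can recover the exact input geometry later
restored = shape(json.loads(geo_original))
assert restored.equals(poly)
```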
@@ -1727,8 +1785,7 @@ def whisp_stats_geojson_to_df_concurrent(
         logger.warning(f"{plot_id_column} column missing, regenerating...")
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
 
-    # Sort by plot_id to ensure consistent output order
-    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
     return formatted
@@ -1981,10 +2038,11 @@ def whisp_stats_geojson_to_df_sequential(
         convert_water_flag=True,
     )
 
-    # Ensure plot_id exists and sort by it
+    # Ensure plot_id exists
     if plot_id_column not in formatted.columns:
         formatted.insert(0, plot_id_column, range(1, len(formatted) + 1))
-    formatted = formatted.sort_values(by=plot_id_column).reset_index(drop=True)
+
+    # Note: Sorting is handled by format_stats_dataframe in the formatted wrapper functions
 
     logger.info(f"Processing complete: {len(formatted):,} features")
 
@@ -2154,50 +2212,21 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
            custom_bands=custom_bands,
        )
 
-        # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+        # Step 2c: Add audit trail column (AFTER validation to preserve columns)
         if geometry_audit_trail:
-            logger.debug("Adding audit trail columns...")
-            try:
-                # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
-                if gdf_original_geoms is None:
-                    logger.warning("Original geometries not pre-loaded, loading now...")
-                    gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
-
-                # Use plotId from df_validated to maintain mapping
-                df_original_geom = pd.DataFrame(
-                    {
-                        "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                        "geo_original": gdf_original_geoms["geometry"].apply(
-                            lambda g: json.dumps(mapping(g)) if g is not None else None
-                        ),
-                    }
-                )
-
-                # Merge original geometries back
-                df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-                # Store processing metadata
-                df_validated.attrs["processing_metadata"] = {
-                    "whisp_version": "3.0.0a1",
-                    "processing_date": datetime.now().isoformat(),
-                    "processing_mode": "concurrent",
-                    "ee_endpoint": "high_volume",
-                    "validate_geometries": validate_geometries,
-                    "datasets_used": national_codes or [],
-                    "geometry_audit_trail": True,
-                }
-
-                logger.info(f"Audit trail added: geo_original column")
-
-            except Exception as e:
-                logger.warning(f"Error adding audit trail: {e}")
-                # Continue without audit trail if something fails
+            logger.debug("Adding geo_original column for audit trail...")
+            df_validated = _add_geometry_audit_trail(
+                df_validated=df_validated,
+                input_geojson_filepath=input_geojson_filepath,
+                gdf_original_geoms=gdf_original_geoms,
+                logger=logger,
+            )
 
         # Add processing metadata column using pd.concat to avoid fragmentation warning
         metadata_dict = {
-            "whisp_version": "3.0.0a1",
+            "whisp_version": get_version("openforis-whisp"),
             "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-                "%Y-%m-%d %H:%M:%S UTC"
+                "%Y-%m-%d %H:%M:%S%z"
             ),
         }
         metadata_series = pd.Series(
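Two things change in the metadata dict: the version is now read from the installed distribution (see the import hunk above), and the timestamp swaps the hard-coded ` UTC` suffix for the `%z` offset directive. A quick stdlib comparison of the two formats:

```python
from datetime import datetime, timezone

now = datetime.now(timezone.utc)
print(now.strftime("%Y-%m-%d %H:%M:%S UTC"))  # old: e.g. "2025-06-01 12:00:00 UTC"
print(now.strftime("%Y-%m-%d %H:%M:%S%z"))    # new: e.g. "2025-06-01 12:00:00+0000"
```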
@@ -2349,49 +2378,21 @@ def whisp_formatted_stats_geojson_to_df_sequential(
            custom_bands=custom_bands,
        )
 
-        # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+        # Step 2c: Add audit trail column (AFTER validation to preserve columns)
         if geometry_audit_trail:
-            logger.debug("Adding audit trail columns...")
-            try:
-                # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
-                if gdf_original_geoms is None:
-                    logger.warning("Original geometries not pre-loaded, loading now...")
-                    gdf_original_geoms = _load_and_prepare_geojson(input_geojson_filepath)
-
-                # Use plotId from df_validated to maintain mapping
-                df_original_geom = pd.DataFrame(
-                    {
-                        "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
-                        "geo_original": gdf_original_geoms["geometry"].apply(
-                            lambda g: json.dumps(mapping(g)) if g is not None else None
-                        ),
-                    }
-                )
-
-                # Merge original geometries back
-                df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
-
-                # Store processing metadata
-                df_validated.attrs["processing_metadata"] = {
-                    "whisp_version": "3.0.0a1",
-                    "processing_date": datetime.now().isoformat(),
-                    "processing_mode": "sequential",
-                    "ee_endpoint": "standard",
-                    "datasets_used": national_codes or [],
-                    "geometry_audit_trail": True,
-                }
-
-                logger.info(f"Audit trail added: geo_original column")
-
-            except Exception as e:
-                logger.warning(f"Error adding audit trail: {e}")
-                # Continue without audit trail if something fails
+            logger.debug("Adding geo_original column for audit trail...")
+            df_validated = _add_geometry_audit_trail(
+                df_validated=df_validated,
+                input_geojson_filepath=input_geojson_filepath,
+                gdf_original_geoms=gdf_original_geoms,
+                logger=logger,
+            )
 
         # Add processing metadata column using pd.concat to avoid fragmentation warning
         metadata_dict = {
-            "whisp_version": "3.0.0a1",
+            "whisp_version": get_version("openforis-whisp"),
             "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
-                "%Y-%m-%d %H:%M:%S UTC"
+                "%Y-%m-%d %H:%M:%S%z"
             ),
         }
         metadata_series = pd.Series(
@@ -374,14 +374,12 @@ def g_esri_2020_2023_crop_prep():
 
 # RADD_year_2019 to RADD_year_< current year >
 def g_radd_year_prep():
-    from datetime import datetime
-
     radd = ee.ImageCollection("projects/radar-wur/raddalert/v1")
     radd_date = (
         radd.filterMetadata("layer", "contains", "alert").select("Date").mosaic()
     )
     start_year = 19
-    current_year = datetime.now().year % 100
+    current_year = CURRENT_YEAR_2DIGIT
 
     def make_band(year, img_stack):
         start = year * 1000
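`g_radd_year_prep` now reuses a shared `CURRENT_YEAR_2DIGIT` constant instead of importing `datetime` inside the function. The constant's definition is not shown in this diff; judging from the expression it replaces, it is presumably a module-level equivalent of:

```python
from datetime import datetime

# Hypothetical reconstruction: the real definition of CURRENT_YEAR_2DIGIT
# lives elsewhere in the package and is not part of this diff.
CURRENT_YEAR_2DIGIT = datetime.now().year % 100  # e.g. 25 in 2025
```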
@@ -859,12 +859,14 @@ def format_stats_dataframe(
     )
     df.rename(columns={area_col: area_col_stripped}, inplace=True)
 
-    # 10) reorder by plotId column if present
-    df = (
-        df.sort_values(sort_column).reset_index(drop=True)
-        if sort_column in df.columns
-        else df
-    )
+    # 10) reorder by plotId column numerically if present (column is string but contains int values)
+    if sort_column in df.columns:
+        df["_sort_key"] = pd.to_numeric(df[sort_column], errors="coerce")
+        df = (
+            df.sort_values(by="_sort_key")
+            .drop(columns=["_sort_key"])
+            .reset_index(drop=True)
+        )
 
     # 11) Defragment final DataFrame and return
     return df.copy()
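The rewritten step 10 fixes a lexicographic-sort pitfall: the plotId column holds integers stored as strings, so sorting the raw strings orders "10" before "2". A minimal pandas illustration of the old versus new behaviour:

```python
import pandas as pd

df = pd.DataFrame({"plotId": ["10", "2", "1"]})

# Old behaviour: string sort -> ['1', '10', '2']
print(df.sort_values("plotId")["plotId"].tolist())

# New behaviour: numeric sort via a temporary key -> ['1', '2', '10']
df["_sort_key"] = pd.to_numeric(df["plotId"], errors="coerce")
print(df.sort_values("_sort_key")["plotId"].tolist())
```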
openforis_whisp/risk.py CHANGED
@@ -1,12 +1,14 @@
 import pandas as pd
 
 from .pd_schemas import data_lookup_type
+from .logger import StdoutLogger
 
 
 from openforis_whisp.parameters.config_runtime import (
     geometry_area_column,
     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH,
-    stats_unit_type_column,  # Add this import
+    DEFAULT_CONTEXT_LOOKUP_TABLE_PATH,
+    stats_unit_type_column,
 )
 
 from openforis_whisp.reformat import filter_lookup_by_country_codes
@@ -16,6 +18,8 @@ lookup_gee_datasets_df: data_lookup_type = pd.read_csv(
     DEFAULT_GEE_DATASETS_LOOKUP_TABLE_PATH
 )
 
+logger = StdoutLogger(__name__)
+
 
 # requires lookup_gee_datasets_df
@@ -113,9 +117,10 @@ def whisp_risk(
     explicit_unit_type: str = None,
     national_codes: list[str] = None,  # List of ISO2 country codes to filter by
     custom_bands_info: dict = None,  # New parameter for custom band risk info
+    drop_unused_columns: bool = False,  # Remove columns not used in risk calculations
 ) -> data_lookup_type:
     """
-    Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
+    Adds the risk column to the DataFrame based on indicator values.
 
     Args:
         df (DataFrame): Input DataFrame.
@@ -145,6 +150,9 @@ def whisp_risk(
            }
        }
        If None, custom bands won't be included in risk calculations.
+        drop_unused_columns (bool, optional): If True, removes dataset columns not used in risk calculations,
+            keeping only context/metadata columns, datasets used in indicators, indicator columns,
+            and final risk columns. Defaults to False (backward compatible).
 
     Returns:
         data_lookup_type: DataFrame with added risk columns.
@@ -278,7 +286,8 @@ def whisp_risk(
         unit_type,  # Pass the unit type
     )
 
-    df_w_indicators_and_risk_pcrop = add_eudr_risk_pcrop_col(
+    # these "add_" functions modify the 'df_w_indicators' dataframe in place
+    add_risk_pcrop_col(
         df=df_w_indicators,
         ind_1_name=ind_1_name,
         ind_2_name=ind_2_name,
@@ -286,14 +295,14 @@ def whisp_risk(
         ind_4_name=ind_4_name,
     )
 
-    df_w_indicators_and_risk_acrop = add_eudr_risk_acrop_col(
+    add_risk_acrop_col(
         df=df_w_indicators,
         ind_1_name=ind_1_name,
         ind_2_name=ind_2_name,
         ind_4_name=ind_4_name,
     )
 
-    df_w_indicators_and_risk_timber = add_eudr_risk_timber_col(
+    add_risk_timber_col(
         df=df_w_indicators,
         ind_2_name=ind_2_name,
         ind_5_name=ind_5_name,
@@ -305,10 +314,14 @@ def whisp_risk(
         ind_11_name=ind_11_name,
     )
 
-    return df_w_indicators_and_risk_timber
+    # Filter to risk-relevant columns if requested (after all columns added)
+    if drop_unused_columns:
+        df_w_indicators = filter_to_risk_columns(df_w_indicators, input_cols, names)
+
+    return df_w_indicators
 
 
-def add_eudr_risk_pcrop_col(
+def add_risk_pcrop_col(
     df: data_lookup_type,
     ind_1_name: str,
     ind_2_name: str,
@@ -316,7 +329,7 @@ def add_eudr_risk_pcrop_col(
     ind_4_name: str,
 ) -> data_lookup_type:
     """
-    Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
+    Adds the risk column to the DataFrame based on indicator values.
 
     Args:
         df (DataFrame): Input DataFrame.
@@ -326,35 +339,35 @@ def add_eudr_risk_pcrop_col(
         ind_4_name (str, optional): Name of fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
 
     Returns:
-        DataFrame: DataFrame with added 'EUDR_risk' column.
+        DataFrame: DataFrame with added 'risk' column.
     """
 
     for index, row in df.iterrows():
-        # If any of the first three indicators suggest low risk, set EUDR_risk to "low"
+        # If any of the first three indicators suggest low risk, set risk to "low"
         if (
             row[ind_1_name] == "no"
             or row[ind_2_name] == "yes"
             or row[ind_3_name] == "yes"
         ):
             df.at[index, "risk_pcrop"] = "low"
-        # If none of the first three indicators suggest low risk and Indicator 4 suggests no risk, set EUDR_risk to "more_info_needed"
+        # If none of the first three indicators suggest low risk and Indicator 4 suggests no risk, set risk to "more_info_needed"
         elif row[ind_4_name] == "no":
             df.at[index, "risk_pcrop"] = "more_info_needed"
-        # If none of the above conditions are met, set EUDR_risk to "high"
+        # If none of the above conditions are met, set risk to "high"
         else:
             df.at[index, "risk_pcrop"] = "high"
 
     return df
 
 
-def add_eudr_risk_acrop_col(
+def add_risk_acrop_col(
     df: data_lookup_type,
     ind_1_name: str,
     ind_2_name: str,
     ind_4_name: str,
 ) -> data_lookup_type:
     """
-    Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
+    Adds the risk column to the DataFrame based on indicator values.
 
     Args:
         df (DataFrame): Input DataFrame.
@@ -363,25 +376,25 @@ def add_eudr_risk_acrop_col(
         ind_4_name (str, optional): Name of fourth indicator column. Defaults to "Ind_04_disturbance_after_2020".
 
     Returns:
-        DataFrame: DataFrame with added 'EUDR_risk' column.
+        DataFrame: DataFrame with added 'risk' column.
     """
 
     # soy risk
     for index, row in df.iterrows():
-        # If there is no tree cover in 2020, set EUDR_risk_soy to "low"
+        # If there is no tree cover in 2020, set risk_soy to "low"
         if row[ind_1_name] == "no" or row[ind_2_name] == "yes":
             df.at[index, "risk_acrop"] = "low"
-        # If there is tree cover in 2020 and disturbances post 2020, set EUDR_risk_soy to "high"
+        # If there is tree cover in 2020 and disturbances post 2020, set risk_soy to "high"
         elif row[ind_1_name] == "yes" and row[ind_4_name] == "yes":
             df.at[index, "risk_acrop"] = "high"
-        # If tree cover and no disturbances post 2020, set EUDR_risk to "more_info_needed"
+        # If tree cover and no disturbances post 2020, set risk to "more_info_needed"
         else:
             df.at[index, "risk_acrop"] = "more_info_needed"
 
     return df
 
 
-def add_eudr_risk_timber_col(
+def add_risk_timber_col(
     df: data_lookup_type,
     ind_2_name: str,
     ind_5_name: str,
@@ -393,7 +406,7 @@ def add_eudr_risk_timber_col(
     ind_11_name: str,
 ) -> data_lookup_type:
     """
-    Adds the EUDR (European Union Deforestation Risk) column to the DataFrame based on indicator values.
+    Adds the risk column to the DataFrame based on indicator values.
 
     Args:
         df (DataFrame): Input DataFrame.
@@ -407,42 +420,42 @@ def add_eudr_risk_timber_col(
         ind_11_name (str, optional): Name of eleventh indicator column. Defaults to "Ind_11_logging_concession_before_2020".
 
     Returns:
-        DataFrame: DataFrame with added 'EUDR_risk' column.
+        DataFrame: DataFrame with added risk column.
     """
 
     for index, row in df.iterrows():
         # If there is a commodity in 2020 (ind_2_name)
-        # OR if there is planted-plantation in 2020 (ind_7_name) AND no agriculture in 2023 (ind_10_name), set EUDR_risk_timber to "low"
+        # OR if there is planted-plantation in 2020 (ind_7_name) AND no agriculture in 2023 (ind_10_name), set risk_timber to "low"
         if row[ind_2_name] == "yes" or (
             row[ind_7_name] == "yes" and row[ind_10_name] == "no"
         ):
             df.at[index, "risk_timber"] = "low"
-        # If there is a natural forest primary (ind_5_name) or naturally regenerating (ind_6_name) or planted forest (ind_7_name) in 2020 AND agricultural after 2020 (ind_10_name), set EUDR_timber to high
+        # If there is a natural forest primary (ind_5_name) or naturally regenerating (ind_6_name) or planted forest (ind_7_name) in 2020 AND agricultural after 2020 (ind_10_name), set risk_timber to high
         elif (
             row[ind_5_name] == "yes"
             or row[ind_6_name] == "yes"
             or row[ind_7_name] == "yes"
         ) and row[ind_10_name] == "yes":
             df.at[index, "risk_timber"] = "high"
-        # If there is a natural forest primary (ind_5_name) or naturally regenerating (ind_6_name) AND planted after 2020 (ind_8_name), set EUDR_risk to "high"
+        # If there is a natural forest primary (ind_5_name) or naturally regenerating (ind_6_name) AND planted after 2020 (ind_8_name), set risk to "high"
         elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes") and row[
             ind_8_name
         ] == "yes":
             df.at[index, "risk_timber"] = "high"
         # No data yet on OWL conversion
-        # If primary or naturally regenerating or planted forest in 2020 and OWL in 2023, set EUDR_risk to high
+        # If primary or naturally regenerating or planted forest in 2020 and OWL in 2023, set risk to high
         # elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes" or row[ind_7_name] == "yes") and row[ind_10_name] == "yes":
-        #     df.at[index, 'EUDR_risk_timber'] = "high"
+        #     df.at[index, 'risk_timber'] = "high"
 
-        # If there is a natural primary forest (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) AND information on management practice any time (ind_11_name) OR tree cover or regrowth post 2020 (ind_9_name), set EUDR_risk_timber to "low"
+        # If there is a natural primary forest (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) AND information on management practice any time (ind_11_name) OR tree cover or regrowth post 2020 (ind_9_name), set risk_timber to "low"
         elif (row[ind_5_name] == "yes" or row[ind_6_name] == "yes") and (
             row[ind_9_name] == "yes" or row[ind_11_name] == "yes"
         ):
             df.at[index, "risk_timber"] = "low"
-        # If primary (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) and no other info, set EUDR_risk to "more_info_needed"
+        # If primary (ind_5_name) OR naturally regenerating in 2020 (ind_6_name) and no other info, set risk to "more_info_needed"
         elif row[ind_5_name] == "yes" or row[ind_6_name] == "yes":
             df.at[index, "risk_timber"] = "more_info_needed"
-        # If none of the above conditions are met, set EUDR_risk to "low"
+        # If none of the above conditions are met, set risk to "low"
         else:
             df.at[index, "risk_timber"] = "low"
 
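The three renamed `add_risk_*_col` functions each encode a small decision tree over "yes"/"no" indicator columns. A toy run of the simplest one, `add_risk_pcrop_col`, with indicator column names assumed for illustration (only `Ind_04_disturbance_after_2020` and `Ind_11_logging_concession_before_2020` are confirmed by the docstrings):

```python
import pandas as pd
from openforis_whisp.risk import add_risk_pcrop_col

toy = pd.DataFrame(
    {
        # Row 1: no tree cover in 2020                 -> "low"
        # Row 2: tree cover, no disturbance after 2020 -> "more_info_needed"
        # Row 3: tree cover, disturbance after 2020    -> "high"
        "Ind_01_treecover": ["no", "yes", "yes"],
        "Ind_02_commodities": ["no", "no", "no"],
        "Ind_03_disturbance_before_2020": ["no", "no", "no"],
        "Ind_04_disturbance_after_2020": ["no", "no", "yes"],
    }
)

# Modifies `toy` in place, adding the risk_pcrop column
add_risk_pcrop_col(
    df=toy,
    ind_1_name="Ind_01_treecover",
    ind_2_name="Ind_02_commodities",
    ind_3_name="Ind_03_disturbance_before_2020",
    ind_4_name="Ind_04_disturbance_after_2020",
)
print(toy["risk_pcrop"].tolist())  # ['low', 'more_info_needed', 'high']
```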
@@ -790,6 +803,77 @@ def check_range(value: float) -> None:
         raise ValueError("Value must be between 0 and 100.")
 
 
+def get_context_metadata_columns() -> list[str]:
+    """
+    Get list of context/metadata column names from lookup CSV.
+
+    Returns
+    -------
+    list[str]
+        List of column names marked as context_and_metadata
+    """
+    lookup_df = pd.read_csv(DEFAULT_CONTEXT_LOOKUP_TABLE_PATH)
+    return list(lookup_df["name"])
+
+
+def filter_to_risk_columns(
+    df: pd.DataFrame, input_cols: list[list[str]], names: list[str]
+) -> pd.DataFrame:
+    """
+    Filter DataFrame to only columns relevant for risk calculations.
+
+    Keeps:
+    - Context/metadata columns (plotId, Area, Country, etc.)
+    - Dataset columns used in risk indicators
+    - Indicator columns (Ind_01_treecover, etc.)
+    - Risk columns (risk_pcrop, risk_acrop, risk_timber, risk_livestock)
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with all columns
+    input_cols : list[list[str]]
+        List of lists containing dataset column names used in each indicator
+    names : list[str]
+        Names of indicator columns
+
+    Returns
+    -------
+    pd.DataFrame
+        Filtered DataFrame with only risk-relevant columns
+    """
+    # Get context/metadata columns
+    context_cols = get_context_metadata_columns()
+
+    # Flatten input_cols to get dataset columns used in risk
+    dataset_cols = []
+    for col_list in input_cols:
+        dataset_cols.extend(col_list)
+
+    # Risk output columns (present in df if function called at end)
+    risk_cols = ["risk_pcrop", "risk_acrop", "risk_timber", "risk_livestock"]
+
+    # Post-processing metadata columns (added after validation, not in schema CSV)
+    metadata_cols = ["whisp_processing_metadata", "geo_original"]
+
+    # Build set of all columns to keep (for fast lookup)
+    cols_to_keep_set = set(
+        context_cols + dataset_cols + names + risk_cols + metadata_cols
+    )
+
+    # Preserve original DataFrame column order, filter to only columns we want to keep
+    cols_to_keep = [col for col in df.columns if col in cols_to_keep_set]
+
+    # Log dropped columns at debug level
+    dropped_cols = [col for col in df.columns if col not in cols_to_keep_set]
+    if dropped_cols:
+        logger.debug(
+            f"Dropped {len(dropped_cols)} columns: {', '.join(sorted(dropped_cols))}"
+        )
+
+    return df[cols_to_keep]
+
+
 def add_custom_bands_info_to_lookup(
     lookup_df: pd.DataFrame, custom_bands_info: dict, df_columns: list
 ) -> pd.DataFrame:
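The new `drop_unused_columns` flag on `whisp_risk` is the public entry point to `filter_to_risk_columns`. A hedged usage sketch, assuming `stats_df` is a DataFrame produced by one of the Whisp stats functions and that the remaining parameters keep their defaults:

```python
from openforis_whisp.risk import whisp_risk

# stats_df: a formatted Whisp stats DataFrame (assumed to exist)
# Default behaviour is unchanged: every input column is kept.
full = whisp_risk(df=stats_df)

# Opt in to the slimmer output: only context/metadata columns, the
# dataset columns feeding the indicators, the indicator columns, and
# the final risk_* columns survive.
slim = whisp_risk(df=stats_df, drop_unused_columns=True)
assert len(slim.columns) <= len(full.columns)
```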
@@ -1,9 +1,8 @@
-Metadata-Version: 2.4
+Metadata-Version: 2.3
 Name: openforis-whisp
-Version: 3.0.0a7
+Version: 3.0.0a8
 Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
 License: MIT
-License-File: LICENSE
 Keywords: whisp,geospatial,data-processing
 Author: Andy Arnell
 Author-email: andrew.arnell@fao.org
@@ -17,7 +16,6 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
-Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Dist: country_converter (>=0.7,<2.0.0)
 Requires-Dist: earthengine-api
@@ -31,6 +29,7 @@ Requires-Dist: pydantic-core (>=2.14.0,<3.0.0)
 Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: rsa (>=4.2,<5.0.0)
 Requires-Dist: shapely (>=2.0.2,<3.0.0)
+Project-URL: Changelog, https://github.com/forestdatapartnership/whisp/releases
 Project-URL: Documentation, https://github.com/forestdatapartnership/whisp#readme
 Project-URL: Development Branch, https://github.com/forestdatapartnership/whisp/tree/main
 Project-URL: Issues, https://github.com/forestdatapartnership/whisp/issues
@@ -64,6 +63,7 @@ Description-Content-Type: text/markdown
 - [Add data layers](#whisp_add_data)
 - [Contribute to the code](#whisp_contribute)
 - [Code of conduct](#whisp_conduct)
+- [Feedback](#whisp_feedback)
 
 <br>
 
@@ -71,11 +71,11 @@ Description-Content-Type: text/markdown
 ***Whisp*** can currently be used directly or implemented in your own code through three different pathways:
 
 
-1. The Whisp App with its simple interface can be used [right here](https://whisp.openforis.org/) or called from other software by [API](https://whisp.openforis.org/documentation/api-guide). The Whisp App currently supports the processing of up to 3,000 geometries per job. The original JS & Python code behind the Whisp App and API can be found [here](https://github.com/forestdatapartnership/whisp-app).
+1. The Whisp App with its simple interface can be accessed [here](https://whisp.openforis.org/) or called from other software by [API](https://whisp.openforis.org/documentation/api-guide). The Whisp App currently supports the processing of up to 3,000 geometries per job. The original JS & Python code behind the Whisp App and API can be found [here](https://github.com/forestdatapartnership/whisp-app).
 
 2. [Whisp in Earthmap](https://whisp.earthmap.org/?aoi=WHISP&boundary=plot1&layers=%7B%22CocoaETH%22%3A%7B%22opacity%22%3A1%7D%2C%22JRCForestMask%22%3A%7B%22opacity%22%3A1%7D%2C%22planet_rgb%22%3A%7B%22opacity%22%3A1%2C%22date%22%3A%222020-12%22%7D%7D&map=%7B%22center%22%3A%7B%22lat%22%3A7%2C%22lng%22%3A4%7D%2C%22zoom%22%3A3%2C%22mapType%22%3A%22satellite%22%7D&statisticsOpen=true) supports the visualization of geometries on actual maps with the possibility to toggle different relevant map products around tree cover, commodities and deforestation. It is practical for demonstration purposes and spot checks of single geometries but not recommended for larger datasets.
 
-3. Datasets of any size, especially when holding more than 3,000 geometries, can be analyzed with Whisp through the [python package on pip](https://pypi.org/project/openforis-whisp/). See example [Colab Notebook](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/Colab_whisp_geojson_to_csv.ipynb) for implementation with a geojson input. For the detailed procedure please go to the section [Whisp notebooks](#whisp_notebooks).
+3. Datasets of any size, especially when holding more than 3,000 geometries, can be analyzed with Whisp through the [python package on pip](https://pypi.org/project/openforis-whisp/). See the example [Colab Notebook](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/Colab_whisp_geojson_to_csv.ipynb) for implementation with a geojson input. For further notebook processing options, see [Whisp notebooks](#whisp_notebooks).
 
 
 ## Whisp datasets <a name="whisp_datasets"></a>
@@ -98,7 +98,7 @@ Additional categories are specific for the timber commodity, considering a harve
 
 There are multiple datasets for each category. Find the full current [list of datasets used in Whisp here](https://github.com/forestdatapartnership/whisp/blob/main/layers_description.md).
 
-### Whisp risk assessment <a name="whisp_risk"></a>
+### Whisp risk assessment <a name="whisp_risk"></a>
 
 Whisp checks the plots provided by the user by running zonal statistics on them to answer the following questions:
 
 
@@ -155,24 +155,24 @@ The **relevant risk assessment column depends on the commodity** in question:
 
 *The Whisp algorithm for **Perennial Crops** visualized:*
 ![CoE_Graphic 5](https://github.com/user-attachments/assets/007b5f50-3939-4707-95fa-98be4d56745f)
-
+
 If no treecover dataset indicates any tree cover for a plot by the end of 2020, **Whisp will categorize the deforestation risk as low.**
 
 If one or more treecover datasets indicate tree cover on a plot by the end of 2020, but a commodity dataset indicates agricultural use by the end of 2020, **Whisp will categorize the deforestation risk as low.**
 
-If treecover datasets indicate tree cover on a plot by late 2020, no commodity datasets indicate agricultural use, but a disturbance dataset indicates disturbances before the end of 2020, **Whisp will categorize the deforestation risk as <u>low</u>.** Such deforestation has happened before 2020, which aligns with the cutoff date for legislation such as EUDR, and is therefore not considered high risk.
+If treecover datasets indicate tree cover on a plot by late 2020, no commodity datasets indicate agricultural use, but a disturbance dataset indicates disturbances before the end of 2020, **Whisp will categorize the deforestation risk as <u>low</u>.** Such deforestation has happened before 2020, which aligns with the cutoff date for legislation such as the EUDR (EU Deforestation Regulation), and is therefore not considered high risk.
 
 Now, if the datasets under 1., 2. & 3. indicate that there was tree cover, but no agriculture and no disturbances before or by the end of 2020, the Whisp algorithm checks whether degradation or deforestation have been reported in a disturbance dataset after 2020-12-31. If they have, **Whisp will categorize the deforestation risk as <u>high</u>.** <br>
 However, under the same circumstances but with <u>no</u> disturbances reported after 2020-12-31 there is insufficient evidence and the **Whisp output will be "More info needed".** Such can be the case for, e.g., cocoa or coffee grown under the shade of treecover or agroforestry.
 
 
 ## Run Whisp python package from a notebook <a name="whisp_notebooks"></a>
-
+
 For most users we suggest using the Whisp App to process their plot data. But for some, using the python package directly will fit their workflow.
 
-A simple example of the package functionality can be seen in this [Colab Notebook](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/Colab_whisp_geojson_to_csv.ipynb)
+An example of the package functionality can be seen in this [Colab Notebook](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/Colab_whisp_geojson_to_csv.ipynb)
 
-For an example notebook adapted for running locally (or in Sepal), see: [whisp_geojson_to_csv.ipynb](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/whisp_geojson_to_csv.ipynb) or if datasets are very large, see [whisp_geojson_to_drive.ipynb](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/whisp_geojson_to_drive.ipynb)
+For running locally (or in Sepal), see: [whisp_geojson_to_csv.ipynb](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/whisp_geojson_to_csv.ipynb) or, if datasets are very large (e.g., >100,000 features), see [whisp_ee_asset_to_drive.ipynb](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/whisp_ee_asset_to_drive.ipynb)
 
 ### Requirements for running the package
 
@@ -180,8 +180,6 @@ The **relevant risk assessment column depends on the commodity** in question:
 - A registered cloud GEE project.
 - Some experience in Python or a similar language.
 
-More info on Whisp can be found in [here](https://openknowledge.fao.org/items/e9284dc7-4b19-4f9c-b3e1-e6c142585865)
-
 
 ### Python package installation
 
@@ -195,7 +193,6 @@ The **relevant risk assessment column depends on the commodity** in question:
 pip install --pre openforis-whisp
 ```
 
-If running the package locally we recommend a [virtual environment](https://docs.python.org/3/library/venv.html) to keep your main python installation clean. For users running the package in Sepal see [here](https://docs.sepal.io/en/latest/cli/python.html#virtual-environment).
 
 The package relies upon the google earth engine api being setup correctly using a registered cloud project.
 
@@ -242,129 +239,46 @@ Before submitting a request, consider the following:
 ---
 
 
-
 ### Adding your own data directly
 
+The python notebooks allow the user to add custom data layers. You can edit the Prepare layers section to do this in the [Colab Notebook](https://github.com/forestdatapartnership/whisp/blob/main/notebooks/Colab_whisp_geojson_to_csv.ipynb)
+To add your own data directly you will need some coding experience as well as familiarity with Google Earth Engine.
 
-To add your own data you will need some coding experience as well as familiarity with GitHub and Google Earth Engine.
-
-This approach is for those who want to run a bespoke analysis combining their own data with those already in Whisp.
-
-Firstly follow the steps below to install the package in editable mode.
-
-As with the regular pip installation, we recommend a separate [virtual environment](https://docs.python.org/3/library/venv.html) for running in editable mode. For Sepal users see [here](https://docs.sepal.io/en/latest/cli/python.html#virtual-environment).
-
-```bash
-
-git clone https://github.com/forestdatapartnership/whisp.git
-
-cd whisp/
-
-pip install -e .[dev]
-
-```
-Once in editable mode you are running the Whisp package locally based on a cloned version of the code.
-
-
-
-There are two files to edit to add your own data:
-
-- `src/openforis_whisp/datasets.py`
-
-- `src/openforis_whisp/parameters/lookup_gee_datasets.csv`
-
-
-
-The `datasets.py` file is a Python script that defines functions which return GEE images composed of one or more bands.
-
-
-
-#### To add your own dataset:
-
-1. Add code to `datasets.py` in the form of a function that returns a **single-band binary image** for your dataset. See notes at the top of the file and example functions for formatting.
-
-2. Edit the `lookup_gee_datasets.csv` and add a row for your dataset.
-
-
-
-**NB:** You need to know what the dataset represents and define how it will be used in the different risk decision trees (if at all).
-
-For example, if it is a dataset for tree cover in 2000, then add `'treecover'` under the `Theme` column.
-
-
-
-#### Example function in `datasets.py`:
-
-
-
-```python
-
-def my_custom_dataset_prep():
-
-    image = ee.Image("MY/GEE/DATASET")
-
-    binary = image.gt(10)  # Example threshold
-
-    return binary.rename("My_custom_dataset")
-
-```
-
-
-
----
-
-
-We are working on ways to make this process smoother. However, in the meantime do contact us through the [issues page on GitHub](https://github.com/forestdatapartnership/whisp/issues), or via the Open Foris email, if this functionality is useful to you or you need help.
-
-
-
----
-
-
-
-## Contributing to the Whisp code base <a name="whisp_contribute"></a>
-
-Contributions to the Whisp code in GitHub are welcome. These could be additional functionality, datasets or just cleaner code! Contributions can be made by forking the repository, making and pushing the required changes, then making a pull request to the Whisp repository. After briefly reviewing the request, we can make a branch for which to make a new pull request to. After final checks, we can then incorporate the code into the main branch. If in doubt, get in contact first or log as an issue [here](https://github.com/forestdatapartnership/whisp/issues/).
-
-
-Install the package in editable mode (see Adding your own data directly above):
-
-Then add additional dependencies required for testing and running pre-commit hooks:
-
-
-```bash
-
-pre-commit install
 
-```
+## Contributing <a name="whisp_contribute"></a>
 
+Contributions are welcome!
+- Fork the repo, make changes, and open a pull request.
+- For adding new datasets to the codebase and for project-specific coding standards see [.github/copilot-instructions.md](.github/copilot-instructions.md)
 
-You should be able to run the Pytest suite by simply running the `pytest` command from the repo's root folder.
+## Code of Conduct <a name="whisp_conduct"></a>
 
+**Purpose**
+We are dedicated to maintaining a safe and respectful environment for all users. Harassment or abusive behavior will not be tolerated. <br>
 
-Please read the [contributing guidelines](contributing_guidelines.md) for good practice recommendations
+**Scope**
+This Code applies to all interactions on the repository and on the app.
 
+**Expectations** <br>
+*- Respect others:* Treat all contributors and users with courtesy and kindness. <br>
+*- Constructive communication:* Engage respectfully, even in disagreements. <br>
+*- Protect privacy:* Do not share personal information without consent.
 
-## Code of Conduct <a name="whisp_conduct"></a>
+**Prohibited Conduct** <br>
+*- Harassment:* Unwanted or abusive communication, stalking, threats, or bullying.<br>
+*- Discrimination:* Any form of hate speech or exclusion based on race, gender, orientation, or other identities.<br>
+*- Inappropriate Content:* Posting offensive, harmful, or explicit material.
 
-**Purpose**
-We are dedicated to maintaining a safe and respectful environment for all users. Harassment or abusive behavior will not be tolerated. <br>
+**Reporting** <br>
+Users can report violations of this Code of Conduct confidentially by contacting the Open Foris team at
+[open-foris@fao.org](mailto:open-foris@fao.org).
 
-**Scope**
-This Code applies to all interactions on the repository and on the app.
 
-**Expectations** <br>
-*- Respect others:* Treat all contributors and users with courtesy and kindness. <br>
-*- Constructive communication:* Engage respectfully, even in disagreements. <br>
-*- Protect privacy:* Do not share personal information without consent.
+## Feedback <a name="whisp_feedback"></a>
+- For issues or feature requests [open a GitHub issue](https://github.com/forestdatapartnership/whisp/issues).
+- For general questions, feedback or support, email [open-foris@fao.org](mailto:open-foris@fao.org).
 
-**Prohibited Conduct** <br>
-*- Harassment:* Unwanted or abusive communication, stalking, threats, or bullying.<br>
-*- Discrimination:* Any form of hate speech or exclusion based on race, gender, orientation, or other identities.<br>
-*- Inappropriate Content:* Posting offensive, harmful, or explicit material.
+We welcome all feedback and contributions!
 
-**Reporting**
-Users can report violations directly to us by emailing the address listed in the "Contact Us" section of the website:
-https://openforis.org/solutions/whisp/
 
 
@@ -1,8 +1,8 @@
 openforis_whisp/__init__.py,sha256=YihdNrybfFygwcwa2Bis59V7sYpNR9aAxL-VNO4dqEI,3659
-openforis_whisp/advanced_stats.py,sha256=1ZhIwdlZjephXvXVChVrNmouPgN_urXvYXYGeCs0Ay0,99731
+openforis_whisp/advanced_stats.py,sha256=yXwPIimbHZV3jxRL-mLMQoWZk9_UEec30I-0flNsOx8,99055
 openforis_whisp/data_checks.py,sha256=jxShBiihtX0rel__Vkzu1bZfqgVQIx_l-uPP1OeCaKY,37015
 openforis_whisp/data_conversion.py,sha256=L2IsiUyQUt3aHgSYGbIhgPGwM7eyS3nLVEoNO9YqQeM,21888
-openforis_whisp/datasets.py,sha256=05m-8dj1r11CWTQd5xAStV3JEStmfiNuBm2zjyiTr0Y,53898
+openforis_whisp/datasets.py,sha256=fAGj1jaeoPszWm60p8N00x2qrw398-iDklX-4nkC6mI,53855
 openforis_whisp/logger.py,sha256=gFkRTwJDJKIBWcHDOK74Uln3JM7fAybURo7pQpGL790,3395
 openforis_whisp/parameters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 openforis_whisp/parameters/config_runtime.py,sha256=NOo39MAi60XCwEx5pwkS0EHKJBh0XY1q06y4j0HAABg,1421
@@ -10,11 +10,11 @@ openforis_whisp/parameters/lookup_context_and_metadata.csv,sha256=KgK0ik_Gd4t_Nq
 openforis_whisp/parameters/lookup_gaul1_admin.py,sha256=cQr5liRdXi85QieTxrz4VAkn0COvRCp82ZV0dYFWOio,474980
 openforis_whisp/parameters/lookup_gee_datasets.csv,sha256=7KdnFocEgbZO5m8JmWQchzZTurg9rJ96y17z8UyLtI0,17537
 openforis_whisp/pd_schemas.py,sha256=0z-oPmYIDUIn7mNY41W_uUpmTwjoR7e254mOCoHVsOg,2878
-openforis_whisp/reformat.py,sha256=gvhIa-_kTT5BSO8LuVmJ1TQcf_NwheskXboFM9e0KJY,32758
-openforis_whisp/risk.py,sha256=d_Di5XB8BnHdVXG56xdHTcpB4-CIF5vo2ZRMQRG7Pek,34420
+openforis_whisp/reformat.py,sha256=i_ckmxuOirrfRHbeY05_5JajrJ00T5MoZ_jgzj_h0wA,32939
+openforis_whisp/risk.py,sha256=tVkgVdRpdxaCBtyCjw8Z8MQt7EV9lGy34Bz8r_1Qb8Y,37135
 openforis_whisp/stats.py,sha256=RJ_PJSXyvz9FnoHeQ3tqrfhhWibXjz9AlX27suSKiO4,63319
 openforis_whisp/utils.py,sha256=AISWF-MpfFdYkhd6bei4BViw2Iag20mmq61ykrF9YTk,31287
-openforis_whisp-3.0.0a7.dist-info/licenses/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
-openforis_whisp-3.0.0a7.dist-info/METADATA,sha256=U-VC2XOZJ1DIz_Ar8ZIuXqJFhasA7NkzufKP_ykl2NY,16760
-openforis_whisp-3.0.0a7.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
-openforis_whisp-3.0.0a7.dist-info/RECORD,,
+openforis_whisp-3.0.0a8.dist-info/LICENSE,sha256=nqyqICO95iw_iwzP1t_IIAf7ZX3DPbL_M9WyQfh2q1k,1085
+openforis_whisp-3.0.0a8.dist-info/METADATA,sha256=2kDHgW5mjXMry11nvYsX7auboQMf4Mzj6BVgVa8TIsI,14173
+openforis_whisp-3.0.0a8.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+openforis_whisp-3.0.0a8.dist-info/RECORD,,
@@ -1,4 +1,4 @@
 Wheel-Version: 1.0
-Generator: poetry-core 2.2.1
+Generator: poetry-core 2.1.1
 Root-Is-Purelib: true
 Tag: py3-none-any