openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -750,23 +750,43 @@ def validate_geojson_constraints(
     return results


-def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True):
+def suggest_processing_mode(
+    feature_count,
+    mean_area_ha=None,
+    mean_vertices=None,
+    feature_type="polygon",
+    verbose=True,
+):
     """
-    Suggest processing method based on polygon characteristics.
+    Suggest processing mode based on feature characteristics.
+
+    Decision thresholds from comprehensive benchmark data (Nov 2025):

-    Decision thresholds from benchmark data (area per polygon × polygon count):
-    - Small polygons (10 ha): need 250+ polygons for concurrent
-    - Medium polygons (100 ha): breakeven at ~100 polygons
-    - Large polygons (500 ha): concurrent wins at 50+ polygons
+    POINTS:
+    - Break-even: 750-1000 features
+    - Sequential faster: < 750 features
+    - Concurrent faster: >= 750 features
+
+    POLYGONS (area-based thresholds):
+    - Tiny (< 1 ha): break-even ~500 features
+    - Small (1-5 ha, simple): break-even ~500 features
+    - Small (1-5 ha, complex 20-50v): break-even ~500 features
+    - Medium (5-20 ha): break-even ~250 features
+    - Large (20-100 ha): break-even ~250 features
+    - Very large (50-200 ha): break-even ~250 features
+
+    Vertex complexity adjustment: High vertex counts (>50) favor concurrent at lower thresholds

     Parameters:
     -----------
-    polygon_count : int
-        Number of polygons
-    mean_area_ha : float
-        Mean area per polygon in hectares
+    feature_count : int
+        Number of features (polygons or points)
+    mean_area_ha : float, optional
+        Mean area per polygon in hectares (required for polygons, ignored for points)
     mean_vertices : float, optional
-        Mean number of vertices per polygon (can influence decision for complex geometries)
+        Mean number of vertices per polygon (influences decision for complex geometries)
+    feature_type : str
+        'polygon', 'multipolygon', or 'point' (default: 'polygon')
     verbose : bool
         Print recommendation explanation

@@ -775,31 +795,63 @@ def suggest_method(polygon_count, mean_area_ha, mean_vertices=None, verbose=True
         str: 'concurrent' or 'sequential'
     """

-    # Primary decision based on area
-    if mean_area_ha >= 300:  # Large polygons
-        breakeven = 50
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
-    elif mean_area_ha >= 50:  # Medium polygons
-        breakeven = 100
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
-    else:  # Small polygons
+    # Points: simple threshold-based decision
+    if feature_type == "point":
+        breakeven = 750
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Points)")
+            print(f" Features: {feature_count} points")
+            print(f" Break-even: {breakeven} features | Method: {method.upper()}")
+
+        return method
+
+    # Polygons and MultiPolygons: area and complexity-based decision
+    # MultiPolygons use same breakpoints as Polygons
+    if mean_area_ha is None:
+        # Default to conservative threshold if area unknown
+        breakeven = 500
+        method = "concurrent" if feature_count >= breakeven else "sequential"
+
+        if verbose:
+            print(f"\nMETHOD RECOMMENDATION (Polygons - area unknown)")
+            print(f" Features: {feature_count} polygons")
+            print(
+                f" Break-even: {breakeven} (conservative) | Method: {method.upper()}"
+            )
+
+        return method
+
+    # Area-based thresholds from benchmark data
+    if mean_area_ha >= 20:  # Large to very large polygons
+        breakeven = 250
+    elif mean_area_ha >= 5:  # Medium polygons
         breakeven = 250
-        method = "concurrent" if polygon_count >= breakeven else "sequential"
+    elif mean_area_ha >= 1:  # Small polygons
+        # Vertex complexity matters more for small polygons
+        if mean_vertices is not None and mean_vertices >= 30:
+            breakeven = 500  # Complex small polygons
+        else:
+            breakeven = 500  # Simple small polygons
+    else:  # Tiny polygons (< 1 ha)
+        breakeven = 500
+
+    # Vertex complexity adjustment for high-complexity geometries
+    if mean_vertices is not None and mean_vertices >= 50:
+        # High complexity: reduce breakeven by 20% (concurrent beneficial sooner)
+        breakeven = int(breakeven * 0.8)

-    # Optional adjustment based on vertex complexity (very high complexity favors concurrent)
-    if mean_vertices is not None and mean_vertices > 500:
-        # Reduce breakeven by 25% for very complex geometries
-        adjusted_breakeven = int(breakeven * 0.75)
-        method = "concurrent" if polygon_count >= adjusted_breakeven else "sequential"
+    method = "concurrent" if feature_count >= breakeven else "sequential"

     if verbose:
-        print(f"\nMETHOD RECOMMENDATION")
+        print(f"\nMETHOD RECOMMENDATION (Polygons)")
         print(
-            f" Polygons: {polygon_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
+            f" Features: {feature_count} | Mean Area: {mean_area_ha:.1f} ha", end=""
         )
         if mean_vertices is not None:
             print(f" | Mean Vertices: {mean_vertices:.1f}", end="")
         print()
-        print(f" Breakeven: {breakeven} polygons | Method: {method.upper()}")
+        print(f" Break-even: {breakeven} features | Method: {method.upper()}")

     return method
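
As a quick orientation to the new thresholds, a minimal usage sketch (illustrative: the import path is an assumption, and the function may only be exposed internally):

    # Hypothetical import path; not confirmed by this diff
    from openforis_whisp import suggest_processing_mode

    # 2,000 points sit above the 750-feature break-even, so concurrent is suggested
    suggest_processing_mode(feature_count=2000, feature_type="point", verbose=False)
    # -> 'concurrent'

    # 100 medium polygons (~10 ha) sit below the 250-feature break-even, so sequential
    suggest_processing_mode(feature_count=100, mean_area_ha=10.0, verbose=False)
    # -> 'sequential'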
@@ -1160,6 +1160,20 @@ def nci_ocs2020_prep():
     ).selfMask()  # cocoa from national land cover map for Côte d'Ivoire


+# nCM - Cameroon
+# data from Aurelie Shapiro (FAO) working directly with country experts - info on methods and accuracy assessment to follow
+
+
+def ncm_treecover_2020_prep():
+    return (
+        ee.Image("projects/ee-cocoacmr/assets/land_cover/CMR_TNTMMU_2020")
+        .select("FNF_2020")
+        .eq(1)
+        .rename("nCM_Treecover_2020")
+        .selfMask()
+    )
+
+
 # ============================================================================
 # CONTEXT BANDS (Administrative boundaries and water mask)
 # ============================================================================
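
For readers unfamiliar with the Earth Engine idiom: the new prep function selects the forest/non-forest band FNF_2020, .eq(1) turns it into a 0/1 forest indicator, .rename(...) sets the output band name, and .selfMask() drops the zero-valued pixels, leaving a binary tree-cover layer consistent with the other national prep functions.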
openforis_whisp/logger.py CHANGED
@@ -8,9 +8,21 @@ BASE_MSG_FORMAT = (

 class StdoutLogger:
     def __init__(self, name: str, msg_format: str = BASE_MSG_FORMAT) -> None:
-        self.handler = logging.StreamHandler(sys.stdout)
-        self.handler.setFormatter(logging.Formatter(msg_format))
-        self.handler.setLevel(logging.DEBUG)
+        # Create handler that auto-flushes for Colab/notebook visibility
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setFormatter(logging.Formatter(msg_format))
+        handler.setLevel(logging.DEBUG)
+
+        # Override emit to force flush after each message
+        original_emit = handler.emit
+
+        def emit_with_flush(record):
+            original_emit(record)
+            sys.stdout.flush()
+
+        handler.emit = emit_with_flush
+
+        self.handler = handler
         self.logger = logging.getLogger(name)
         self.logger.addHandler(self.handler)
         self.logger.propagate = False
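
Usage is unchanged; a minimal sketch using the names as they appear in this module:

    logger = StdoutLogger(__name__).logger
    logger.info("flushed immediately, so Colab cell output stays in sync")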
@@ -2,9 +2,9 @@ name,order,ISO2_code,theme,theme_timber,use_for_risk,use_for_risk_timber,exclude
 EUFO_2020,10,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_jrc_gfc_2020_prep
 GLAD_Primary,20,,treecover,primary,1,1,0,float32,1,0,g_glad_pht_prep
 TMF_undist,30,,treecover,primary,1,1,0,float32,1,0,g_jrc_tmf_undisturbed_prep
-GFC_TC_2020,50,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
+GFC_TC_2020,50,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_glad_gfc_10pc_prep
 Forest_FDaP,60,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_glad_gfc_10pc_prep
-ESA_TC_2020,70,,treecover,naturally_reg_2020,1,1,0,float32,1,0,g_esa_worldcover_trees_prep
+ESA_TC_2020,70,,treecover,naturally_reg_2020,0,0,0,float32,1,0,g_esa_worldcover_trees_prep
 TMF_plant,80,,commodities,NA,1,1,0,float32,1,0,g_jrc_tmf_plantation_prep
 Oil_palm_Descals,90,,commodities,NA,1,1,0,float32,1,0,g_creaf_descals_palm_prep
 Oil_palm_FDaP,100,,commodities,NA,1,1,0,float32,1,0,g_fdap_palm_prep
@@ -197,3 +197,4 @@ nBR_INPE_TCamz_pasture_2020,2422,BR,commodities,NA,1,1,0,float32,1,0,nbr_terracl
 nBR_INPE_TCcer_pasture_2020,2423,BR,commodities,NA,1,1,0,float32,1,0,nbr_terraclass_cer20_ac_prep
 nBR_MapBiomas_col9_pasture_2020,2424,BR,commodities,NA,1,1,0,float32,1,0,nbr_mapbiomasc9_pasture_prep
 nCI_Cocoa_bnetd,3000,CI,commodities,NA,1,1,0,float32,1,0,nci_ocs2020_prep
+nCM_Treecover_2020,3100,CM,treecover,NA,1,0,0,float32,1,0,ncm_treecover_2020_prep
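
The final CSV column names the prep function, so adding a national layer amounts to one function plus one lookup row. A hypothetical sketch of resolving a row to its Earth Engine band (the file path and module are assumptions for illustration, not the package's actual loader):

    import pandas as pd
    import openforis_whisp.datasets as datasets  # assumed module holding the *_prep functions

    lookup = pd.read_csv("lookup_gee_datasets.csv")  # assumed filename
    row = lookup.loc[lookup["name"] == "nCM_Treecover_2020"].iloc[0]
    prep_fn = getattr(datasets, row.iloc[-1])  # last column holds the prep function name
    image = prep_fn()  # ee.Image with band 'nCM_Treecover_2020'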
@@ -1,5 +1,10 @@
-import pandera as pa
-from pandera.typing import DataFrame, Series
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+    from pandera.typing.pandas import DataFrame, Series
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+    from pandera.typing import DataFrame, Series

 # Define a schema for validating a DataFrame related to GEE (Google Earth Engine) datasets.
 class DataLookupSchema(pa.DataFrameModel):
@@ -1,5 +1,10 @@
 # !pip install pandera[io] # special version used
-import pandera as pa
+# Support both old and new pandera import paths
+try:
+    import pandera.pandas as pa
+except (ImportError, ModuleNotFoundError):
+    import pandera as pa
+
 import pandas as pd
 import os
 import logging
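
Context for the try/except: newer pandera releases expose the pandas-specific API under pandera.pandas (and pandera.typing.pandas) and are deprecating the top-level import for DataFrame schemas, so this fallback keeps the package working across both old and new pandera versions.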
@@ -125,7 +130,7 @@ def validate_dataframe(
     Returns:
         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
     """
-    log_missing_columns(df_stats, schema)
+    _log_missing_columns(df_stats, schema)

     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)

@@ -251,7 +256,7 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
     # return logger


-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+def _log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
     # Initialize the logger
     logger = setup_logger(__name__)
@@ -675,33 +680,6 @@ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:


 # Fix the duplicate logging issue
-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
-    # Remove the duplicate logger creation line
-    # logger = setup_logger(__name__)  # DELETE THIS LINE
-
-    # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
-
-    # Extract the expected columns from the DataFrameSchema
-    template_columns = list(template_schema.columns.keys())
-    df_stats_columns = df_stats.columns.tolist()
-
-    # Find missing and extra columns
-    missing_in_df = [col for col in template_columns if col not in df_stats_columns]
-    extra_in_df = [col for col in df_stats_columns if col not in template_columns]
-
-    # Log missing schema columns
-    if missing_in_df:
-        logger.warning(f"Missing expected schema columns: {missing_in_df}")
-    else:
-        logger.info("All expected schema columns found in DataFrame.")
-
-    # Log extra columns (will be preserved)
-    if extra_in_df:
-        logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
-    else:
-        logger.info("No extra columns found in DataFrame.")
-
-
 def format_stats_dataframe(
     df,
     area_col="Area_sum",
openforis_whisp/stats.py CHANGED
@@ -88,12 +88,10 @@ def get_admin_boundaries_fc():
 def whisp_formatted_stats_geojson_to_df_legacy(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,
     custom_bands=None,  # New parameter
-    validate_geometries: bool = False,
 ) -> pd.DataFrame:
     """
     Legacy function for basic Whisp stats extraction.
@@ -135,56 +133,19 @@ def whisp_formatted_stats_geojson_to_df_legacy(
         - List of band names: ['Aa_test', 'elevation']
         - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
         - None: preserves all extra columns automatically
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.

     Returns
     -------
     df_stats : pd.DataFrame
         The DataFrame containing the Whisp stats for the input ROI.
     """
-    # Load GeoJSON and validate geometries if requested
-    if validate_geometries:
-        import json
-        import geopandas as gpd
-        from shapely.validation import make_valid
-        import logging as py_logging
-
-        logger = py_logging.getLogger("whisp")
-
-        # Load GeoJSON file
-        with open(input_geojson_filepath, "r") as f:
-            geojson_data = json.load(f)
-
-        # Convert to GeoDataFrame
-        gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
-
-        # Validate and fix invalid geometries
-        valid_count = gdf.geometry.is_valid.sum()
-        invalid_count = len(gdf) - valid_count
-        if invalid_count > 0:
-            logger.warning(f"Fixing {invalid_count} invalid geometries")
-            gdf["geometry"] = gdf["geometry"].apply(
-                lambda g: make_valid(g) if g and not g.is_valid else g
-            )
-
-        # Pass GeoDataFrame directly to preserve CRS metadata
-        # convert_geojson_to_ee will handle:
-        # - CRS detection and conversion to WGS84 if needed
-        # - Data type sanitization (datetime, object columns)
-        # - Geometry validation and Z-coordinate stripping
-        feature_collection = convert_geojson_to_ee(
-            gdf, enforce_wgs84=True, strip_z_coords=True
-        )
-    else:
-        # Original path - no validation
-        feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
+    # Convert GeoJSON to Earth Engine FeatureCollection
+    # Note: Geometry validation/cleaning should be done before calling this function
+    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))

     return whisp_formatted_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,
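
Since the legacy path no longer repairs geometries, cleaning is expected up front. A minimal pre-cleaning sketch reusing the same geopandas/shapely approach the removed block used (file names illustrative):

    import geopandas as gpd
    from shapely.validation import make_valid

    gdf = gpd.read_file("plots.geojson")  # illustrative input
    invalid = ~gdf.geometry.is_valid
    if invalid.any():
        gdf.loc[invalid, "geometry"] = gdf.loc[invalid, "geometry"].apply(make_valid)
    gdf.to_file("plots_clean.geojson", driver="GeoJSON")  # then pass this cleaned file to Whisp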
@@ -203,8 +164,8 @@ def whisp_formatted_stats_geojson_to_df(
     mode: str = "sequential",
     batch_size: int = 10,
     max_concurrent: int = 20,
-    validate_geometries: bool = False,
-    include_geometry_audit_trail: bool = False,
+    geometry_audit_trail: bool = False,
+    status_file: str = None,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.
@@ -226,11 +187,7 @@ def whisp_formatted_stats_geojson_to_df(
         The column in the GeoJSON containing external IDs to be preserved in the output DataFrame.
         This column must exist as a property in ALL features of the GeoJSON file.
         Use debug_feature_collection_properties() to inspect available properties if you encounter errors.
-    remove_geom : bool, default=False
-        If True, the geometry of the GeoJSON is removed from the output DataFrame.
     national_codes : list, optional
-        List of ISO2 country codes to include national datasets.
-    unit_type: str, optional
         Whether to use hectares ("ha") or percentage ("percent"), by default "ha".
     whisp_image : ee.Image, optional
         Pre-combined multiband Earth Engine Image containing all Whisp datasets.
@@ -252,12 +209,7 @@ def whisp_formatted_stats_geojson_to_df(
     max_concurrent : int, optional
         Maximum concurrent EE calls for concurrent mode, by default 20.
         Only applicable for "concurrent" mode.
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.
-        For production workflows, it's recommended to use geometry validation and
-        cleaning tools BEFORE processing with this function.
-    include_geometry_audit_trail : bool, default True
+    geometry_audit_trail : bool, default True
         If True (default), includes audit trail columns:
         - geo_original: Original input geometry
         - geometry_type_original: Original geometry type
@@ -267,6 +219,13 @@ def whisp_formatted_stats_geojson_to_df(

         Processing metadata stored in df.attrs['processing_metadata'].
         These columns enable full transparency for geometry modifications during processing.
+    status_file : str, optional
+        Path to JSON status file or directory for real-time progress tracking.
+        If a directory is provided, creates 'whisp_processing_status.json' in that directory.
+        Updates every 3 minutes and at progress milestones (5%, 10%, etc.).
+        Format: {"status": "processing", "progress": "450/1000", "percent": 45.0,
+                 "elapsed_sec": 120, "eta_sec": 145, "updated_at": "2025-11-13T14:23:45"}
+        Most useful for large concurrent jobs. Works in both concurrent and sequential modes.

     Returns
     -------
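
A minimal sketch of consuming the status file from another process (key names taken from the documented format above; the path is illustrative):

    import json
    import time

    while True:
        with open("whisp_processing_status.json") as f:  # path passed as status_file
            status = json.load(f)
        print(f"{status['progress']} ({status['percent']}%), ETA {status.get('eta_sec')}s")
        if status["status"] != "processing":
            break
        time.sleep(60)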
@@ -326,12 +285,10 @@ def whisp_formatted_stats_geojson_to_df(
         return whisp_formatted_stats_geojson_to_df_legacy(
             input_geojson_filepath=input_geojson_filepath,
             external_id_column=external_id_column,
-            remove_geom=remove_geom,
             national_codes=national_codes,
             unit_type=unit_type,
             whisp_image=whisp_image,
             custom_bands=custom_bands,
-            validate_geometries=validate_geometries,
         )
     elif mode in ("concurrent", "sequential"):
         # Log info if batch_size or max_concurrent are not used in sequential mode
337
294
  # Log info if batch_size or max_concurrent are not used in sequential mode
@@ -350,7 +307,6 @@ def whisp_formatted_stats_geojson_to_df(
350
307
  return whisp_formatted_stats_geojson_to_df_fast(
351
308
  input_geojson_filepath=input_geojson_filepath,
352
309
  external_id_column=external_id_column,
353
- remove_geom=remove_geom,
354
310
  national_codes=national_codes,
355
311
  unit_type=unit_type,
356
312
  whisp_image=whisp_image,
@@ -358,8 +314,8 @@ def whisp_formatted_stats_geojson_to_df(
             mode=mode,  # Pass mode directly (concurrent or sequential)
             batch_size=batch_size,
             max_concurrent=max_concurrent,
-            validate_geometries=validate_geometries,
-            include_geometry_audit_trail=include_geometry_audit_trail,
+            geometry_audit_trail=geometry_audit_trail,
+            status_file=status_file,
         )
     else:
         raise ValueError(
@@ -518,7 +474,6 @@ def whisp_formatted_stats_ee_to_df(
 def whisp_stats_geojson_to_df(
     input_geojson_filepath: Path | str,
     external_id_column=None,
-    remove_geom=False,
     national_codes=None,
     unit_type="ha",
     whisp_image=None,  # New parameter
@@ -551,7 +506,6 @@ def whisp_stats_geojson_to_df(
     return whisp_stats_ee_to_df(
         feature_collection,
         external_id_column,
-        remove_geom,
         national_codes=national_codes,
         unit_type=unit_type,
         whisp_image=whisp_image,  # Pass through
@@ -1035,7 +989,7 @@ def whisp_stats_ee_to_drive(
         )
         task.start()
         print(
-            "Exporting to Google Drive: 'whisp_results/whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
+            "Exporting to Google Drive: 'whisp_output_table.csv'. To track progress: https://code.earthengine.google.com/tasks"
         )
     except Exception as e:
         print(f"An error occurred during the export: {e}")