openforis-whisp 3.0.0a2__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -125,7 +125,7 @@ def validate_dataframe(
     Returns:
         pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
     """
-    log_missing_columns(df_stats, schema)
+    _log_missing_columns(df_stats, schema)
 
     # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
 
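The `validate_dataframe` path above orders and checks columns against a pandera `DataFrameSchema`. As a rough sketch of that pattern (the schema and column names below are illustrative, not whisp's actual schema, which is built by `create_schema_from_dataframe`):

```python
import pandas as pd
import pandera as pa

# Illustrative schema only; whisp builds its own via create_schema_from_dataframe()
schema = pa.DataFrameSchema(
    {
        "plotId": pa.Column(str),
        "Area_sum": pa.Column(float, nullable=True),
    }
)

df = pd.DataFrame({"Area_sum": [1.5, 2.0], "plotId": ["a", "b"]})

# Reorder columns to match the schema (mirrors the commented-out reindex above), then validate
df = df.reindex(list(schema.columns.keys()), axis=1)
validated = schema.validate(df)
```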
@@ -251,7 +251,7 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
     # return logger
 
 
-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
+def _log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
     # Initialize the logger
     logger = setup_logger(__name__)
 
@@ -675,33 +675,6 @@ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
 
 
 # Fix the duplicate logging issue
-def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
-    # Remove the duplicate logger creation line
-    # logger = setup_logger(__name__)  # DELETE THIS LINE
-
-    # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
-
-    # Extract the expected columns from the DataFrameSchema
-    template_columns = list(template_schema.columns.keys())
-    df_stats_columns = df_stats.columns.tolist()
-
-    # Find missing and extra columns
-    missing_in_df = [col for col in template_columns if col not in df_stats_columns]
-    extra_in_df = [col for col in df_stats_columns if col not in template_columns]
-
-    # Log missing schema columns
-    if missing_in_df:
-        logger.warning(f"Missing expected schema columns: {missing_in_df}")
-    else:
-        logger.info("All expected schema columns found in DataFrame.")
-
-    # Log extra columns (will be preserved)
-    if extra_in_df:
-        logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
-    else:
-        logger.info("No extra columns found in DataFrame.")
-
-
 def format_stats_dataframe(
     df,
     area_col="Area_sum",
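With `log_missing_columns` now private (`_log_missing_columns`) and the duplicate public copy removed above, downstream code that relied on the old public name needs its own check. A minimal sketch of the same column comparison, assuming a pandera schema and a standard-library logger (the helper name is hypothetical):

```python
import logging

import pandas as pd
import pandera as pa

logger = logging.getLogger(__name__)


def report_missing_columns(df: pd.DataFrame, schema: pa.DataFrameSchema) -> None:
    """Log schema columns absent from df and extra columns it carries (hypothetical helper)."""
    expected = list(schema.columns.keys())
    present = df.columns.tolist()

    missing = [col for col in expected if col not in present]
    extra = [col for col in present if col not in expected]

    if missing:
        logger.warning(f"Missing expected schema columns: {missing}")
    else:
        logger.info("All expected schema columns found in DataFrame.")
    if extra:
        logger.info(f"Extra columns found (will be preserved): {extra}")
```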
openforis_whisp/stats.py CHANGED
@@ -93,7 +93,6 @@ def whisp_formatted_stats_geojson_to_df_legacy(
     unit_type="ha",
     whisp_image=None,
     custom_bands=None,  # New parameter
-    validate_geometries: bool = False,
 ) -> pd.DataFrame:
     """
     Legacy function for basic Whisp stats extraction.
@@ -135,51 +134,15 @@ def whisp_formatted_stats_geojson_to_df_legacy(
         - List of band names: ['Aa_test', 'elevation']
         - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
         - None: preserves all extra columns automatically
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.
 
     Returns
     -------
     df_stats : pd.DataFrame
         The DataFrame containing the Whisp stats for the input ROI.
     """
-    # Load GeoJSON and validate geometries if requested
-    if validate_geometries:
-        import json
-        import geopandas as gpd
-        from shapely.validation import make_valid
-        import logging as py_logging
-
-        logger = py_logging.getLogger("whisp")
-
-        # Load GeoJSON file
-        with open(input_geojson_filepath, "r") as f:
-            geojson_data = json.load(f)
-
-        # Convert to GeoDataFrame
-        gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
-
-        # Validate and fix invalid geometries
-        valid_count = gdf.geometry.is_valid.sum()
-        invalid_count = len(gdf) - valid_count
-        if invalid_count > 0:
-            logger.warning(f"Fixing {invalid_count} invalid geometries")
-            gdf["geometry"] = gdf["geometry"].apply(
-                lambda g: make_valid(g) if g and not g.is_valid else g
-            )
-
-        # Pass GeoDataFrame directly to preserve CRS metadata
-        # convert_geojson_to_ee will handle:
-        # - CRS detection and conversion to WGS84 if needed
-        # - Data type sanitization (datetime, object columns)
-        # - Geometry validation and Z-coordinate stripping
-        feature_collection = convert_geojson_to_ee(
-            gdf, enforce_wgs84=True, strip_z_coords=True
-        )
-    else:
-        # Original path - no validation
-        feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
+    # Convert GeoJSON to Earth Engine FeatureCollection
+    # Note: Geometry validation/cleaning should be done before calling this function
+    feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
 
     return whisp_formatted_stats_ee_to_df(
         feature_collection,
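Because the in-function cleanup removed above is gone in 3.0.0a3, callers who still need it can repair geometries before handing the file to whisp. A minimal sketch with geopandas and shapely, mirroring the removed code path (file paths are placeholders):

```python
import geopandas as gpd
from shapely.validation import make_valid

src = "plots.geojson"          # placeholder input path
dst = "plots_cleaned.geojson"  # placeholder output path

gdf = gpd.read_file(src)

# Repair invalid / self-intersecting polygons, as the removed validate_geometries path did
invalid = ~gdf.geometry.is_valid
if invalid.any():
    print(f"Fixing {invalid.sum()} invalid geometries")
    gdf.loc[invalid, "geometry"] = gdf.loc[invalid, "geometry"].apply(make_valid)

gdf.to_file(dst, driver="GeoJSON")

# Then pass the cleaned file to the whisp function, e.g.
# df = whisp_formatted_stats_geojson_to_df_legacy(dst, ...)
```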
@@ -203,8 +166,7 @@ def whisp_formatted_stats_geojson_to_df(
     mode: str = "sequential",
     batch_size: int = 10,
     max_concurrent: int = 20,
-    validate_geometries: bool = False,
-    include_geometry_audit_trail: bool = False,
+    geometry_audit_trail: bool = False,
 ) -> pd.DataFrame:
     """
     Main entry point for converting GeoJSON to Whisp statistics.
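For callers updating from 3.0.0a2, the keyword rename and the dropped flag translate roughly as below; the import path and positional GeoJSON argument are assumed from the package layout, so treat the exact call shape as illustrative:

```python
# Import path assumed from the package layout (openforis_whisp/stats.py)
from openforis_whisp.stats import whisp_formatted_stats_geojson_to_df

# 3.0.0a2 (old): validation happened inside the call
# df = whisp_formatted_stats_geojson_to_df(
#     "plots.geojson",
#     validate_geometries=True,
#     include_geometry_audit_trail=True,
# )

# 3.0.0a3 (new): clean geometries beforehand, then request the audit trail
df = whisp_formatted_stats_geojson_to_df(
    "plots_cleaned.geojson",
    geometry_audit_trail=True,
)
```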
@@ -252,12 +214,7 @@ def whisp_formatted_stats_geojson_to_df(
     max_concurrent : int, optional
         Maximum concurrent EE calls for concurrent mode, by default 20.
         Only applicable for "concurrent" mode.
-    validate_geometries : bool, optional
-        Whether to validate and fix invalid geometries, by default False.
-        Set to True to automatically fix invalid/self-intersecting polygons.
-        For production workflows, it's recommended to use geometry validation and
-        cleaning tools BEFORE processing with this function.
-    include_geometry_audit_trail : bool, default True
+    geometry_audit_trail : bool, default True
         If True (default), includes audit trail columns:
         - geo_original: Original input geometry
         - geometry_type_original: Original geometry type
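When the audit trail is enabled, the returned DataFrame carries the columns documented above, so the original inputs can be reviewed after processing. A small sketch of inspecting them (only the two documented column names are taken from the docstring; everything else is illustrative):

```python
import pandas as pd


def audit_summary(df: pd.DataFrame) -> pd.Series:
    """Count features by their original geometry type, using the documented audit column."""
    return df["geometry_type_original"].value_counts()


# Usage, assuming `df` came from whisp_formatted_stats_geojson_to_df(..., geometry_audit_trail=True):
# print(audit_summary(df))
# non_polygons = df[df["geometry_type_original"] != "Polygon"]
```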
@@ -331,7 +288,6 @@ def whisp_formatted_stats_geojson_to_df(
             unit_type=unit_type,
             whisp_image=whisp_image,
             custom_bands=custom_bands,
-            validate_geometries=validate_geometries,
         )
     elif mode in ("concurrent", "sequential"):
         # Log info if batch_size or max_concurrent are not used in sequential mode
@@ -358,8 +314,7 @@ def whisp_formatted_stats_geojson_to_df(
             mode=mode,  # Pass mode directly (concurrent or sequential)
             batch_size=batch_size,
             max_concurrent=max_concurrent,
-            validate_geometries=validate_geometries,
-            include_geometry_audit_trail=include_geometry_audit_trail,
+            geometry_audit_trail=geometry_audit_trail,
         )
     else:
         raise ValueError(