openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,10 +7,69 @@ and thresholds, raising informative errors when constraints are violated.
7
7
 
8
8
  import json
9
9
  from pathlib import Path
10
- from shapely.geometry import Polygon as ShapelyPolygon
10
+ from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
11
11
 
12
12
  # Note: area summary stats are estimations for use in deciding pathways for analysis
13
13
  # (estimation preferred here as allows efficient processing speed and limits overhead of checking file)
14
+
15
+
16
+ def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
17
+ """
18
+ Convert area from projected CRS units to hectares.
19
+
20
+ Most projected CRS use meters as units, so:
21
+ - area_sq_units is in square meters
22
+ - 1 hectare = 10,000 m²
23
+
24
+ Args:
25
+ area_sq_units: Area in square units of the projection (typically square meters)
26
+ crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
27
+
28
+ Returns:
29
+ Area in hectares
30
+ """
31
+ # Standard conversion: 1 hectare = 10,000 m²
32
+ # Most projected CRS use meters, so this works universally
33
+ return area_sq_units / 10000
34
+
35
+
36
+ def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
37
+ """
38
+ Estimate area from bounding box when actual area calculation fails.
39
+ Extracts bounding box and calculates its area as a fallback estimate.
40
+ Returns area in hectares.
41
+ """
42
+ try:
43
+ # Flatten all coordinates to find bounds
44
+ all_coords = []
45
+
46
+ def flatten_coords(c):
47
+ if isinstance(c[0], (list, tuple)) and isinstance(c[0][0], (list, tuple)):
48
+ for sub in c:
49
+ flatten_coords(sub)
50
+ else:
51
+ all_coords.extend(c)
52
+
53
+ flatten_coords(coords)
54
+ if not all_coords:
55
+ return 0
56
+
57
+ # Extract lon/lat values
58
+ lons = [c[0] for c in all_coords]
59
+ lats = [c[1] for c in all_coords]
60
+
61
+ min_lon, max_lon = min(lons), max(lons)
62
+ min_lat, max_lat = min(lats), max(lats)
63
+
64
+ # Bounding box area
65
+ bbox_area = (max_lon - min_lon) * (max_lat - min_lat)
66
+
67
+ # Apply conversion factor
68
+ return abs(bbox_area) * area_conversion_factor
69
+ except:
70
+ return 0
71
+
72
+
14
73
  def analyze_geojson(
15
74
  geojson_data: Path | str | dict,
16
75
  metrics=[
@@ -76,6 +135,8 @@ def analyze_geojson(
76
135
  - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
77
136
  """
78
137
  results = {}
138
+ crs_warning = None
139
+ file_path = None
79
140
 
80
141
  try:
81
142
  # Load GeoJSON from file if path provided
@@ -83,11 +144,45 @@ def analyze_geojson(
83
144
  file_path = Path(geojson_data)
84
145
  if not file_path.exists():
85
146
  raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
86
- with open(file_path, "r") as f:
87
- geojson_data = json.load(f)
147
+
148
+ # Try UTF-8 first (most common), then fall back to auto-detection
149
+ try:
150
+ with open(file_path, "r", encoding="utf-8") as f:
151
+ geojson_data = json.load(f)
152
+ except UnicodeDecodeError:
153
+ # Auto-detect encoding if UTF-8 fails
154
+ try:
155
+ import chardet
156
+
157
+ with open(file_path, "rb") as f:
158
+ raw_data = f.read()
159
+ detected = chardet.detect(raw_data)
160
+ encoding = detected.get("encoding", "latin-1")
161
+
162
+ with open(file_path, "r", encoding=encoding, errors="replace") as f:
163
+ geojson_data = json.load(f)
164
+ except Exception:
165
+ # Final fallback: use latin-1 which accepts all byte values
166
+ with open(file_path, "r", encoding="latin-1") as f:
167
+ geojson_data = json.load(f)
168
+
169
+ # Detect CRS from file if available
170
+ try:
171
+ import geopandas as gpd
172
+
173
+ gdf = gpd.read_file(file_path)
174
+ if gdf.crs and gdf.crs != "EPSG:4326":
175
+ crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
176
+ except Exception:
177
+ pass # If we can't detect CRS, continue without warning
88
178
 
89
179
  features = geojson_data.get("features", [])
90
180
 
181
+ # Add CRS warning to results if detected
182
+ if crs_warning:
183
+ results["crs_warning"] = crs_warning
184
+ print(crs_warning)
185
+
91
186
  if "count" in metrics:
92
187
  results["count"] = len(features)
93
188
 
@@ -113,6 +208,29 @@ def analyze_geojson(
113
208
  geometry_type_counts = {}
114
209
  valid_polygons = 0
115
210
 
211
+ # Tracking for fallback geometries
212
+ bbox_fallback_count = 0 # Geometries that used bounding box estimate
213
+ geometry_skip_count = 0 # Geometries completely skipped
214
+ polygon_type_stats = {} # Track stats by geometry type
215
+
216
+ # Detect CRS to determine area conversion factor
217
+ area_conversion_factor = 1232100 # Default: WGS84 (degrees to ha)
218
+ detected_crs = None
219
+
220
+ # Try to detect CRS from file if available
221
+ if file_path:
222
+ try:
223
+ import geopandas as gpd
224
+
225
+ gdf_temp = gpd.read_file(str(file_path))
226
+ detected_crs = gdf_temp.crs
227
+ if detected_crs and detected_crs != "EPSG:4326":
228
+ # Projected CRS typically uses meters, so convert m² to ha
229
+ # 1 ha = 10,000 m²
230
+ area_conversion_factor = 1 / 10000
231
+ except Exception:
232
+ pass # Use default if CRS detection fails
233
+
116
234
  for feature in features:
117
235
  try:
118
236
  coords = feature["geometry"]["coordinates"]
@@ -133,13 +251,27 @@ def analyze_geojson(
133
251
 
134
252
  # Calculate area from coordinates using shapely
135
253
  try:
136
- poly = ShapelyPolygon(coords[0])
137
- # Convert square degrees to hectares (near equator)
138
- # 1 degree latitude ≈ 111 km, so 1 degree² ≈ 111² km² = 12,321 km² = 1,232,100 ha
139
- area_ha = abs(poly.area) * 1232100
254
+ # Use shapely.geometry.shape to properly handle all geometry components
255
+ geom = shapely_shape(feature["geometry"])
256
+ # Convert using detected CRS
257
+ area_ha = abs(geom.area) * area_conversion_factor
140
258
  areas.append(area_ha)
141
- except:
142
- pass # Skip if calculation fails
259
+ except Exception as e:
260
+ # Fallback: estimate from bounding box if geometry fails
261
+ bbox_area = _estimate_area_from_bounds(
262
+ coords, area_conversion_factor
263
+ )
264
+ if bbox_area > 0:
265
+ areas.append(bbox_area)
266
+ bbox_fallback_count += 1
267
+ polygon_type_stats["Polygon_bbox"] = (
268
+ polygon_type_stats.get("Polygon_bbox", 0) + 1
269
+ )
270
+ else:
271
+ geometry_skip_count += 1
272
+ polygon_type_stats["Polygon_skipped"] = (
273
+ polygon_type_stats.get("Polygon_skipped", 0) + 1
274
+ )
143
275
  valid_polygons += 1
144
276
 
145
277
  elif geom_type == "MultiPolygon":
@@ -152,12 +284,28 @@ def analyze_geojson(
152
284
 
153
285
  # Calculate area from coordinates using shapely
154
286
  try:
155
- for polygon in coords:
156
- poly = ShapelyPolygon(polygon[0])
157
- area_ha = abs(poly.area) * 1232100
158
- areas.append(area_ha)
159
- except:
160
- pass # Skip if calculation fails
287
+ # Use shapely.geometry.shape to properly handle MultiPolygon
288
+ geom = shapely_shape(feature["geometry"])
289
+ # Convert using detected CRS - use total area of all parts
290
+ area_ha = abs(geom.area) * area_conversion_factor
291
+ areas.append(area_ha)
292
+ except Exception as e:
293
+ # Fallback: estimate from bounding box if geometry fails
294
+ bbox_area = _estimate_area_from_bounds(
295
+ coords, area_conversion_factor
296
+ )
297
+ if bbox_area > 0:
298
+ areas.append(bbox_area)
299
+ bbox_fallback_count += 1
300
+ polygon_type_stats["MultiPolygon_bbox"] = (
301
+ polygon_type_stats.get("MultiPolygon_bbox", 0) + 1
302
+ )
303
+ else:
304
+ geometry_skip_count += 1
305
+ polygon_type_stats["MultiPolygon_skipped"] = (
306
+ polygon_type_stats.get("MultiPolygon_skipped", 0)
307
+ + 1
308
+ )
161
309
  valid_polygons += 1
162
310
 
163
311
  except:
@@ -312,6 +460,21 @@ def analyze_geojson(
312
460
  else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
313
461
  )
314
462
 
463
+ # Add geometry quality logging to results
464
+ if bbox_fallback_count > 0 or geometry_skip_count > 0:
465
+ geometry_quality_log = (
466
+ f"Geometry quality summary:\n"
467
+ f" - Bounding box fallback used: {bbox_fallback_count} features\n"
468
+ f" - Geometries skipped: {geometry_skip_count} features"
469
+ )
470
+ if polygon_type_stats:
471
+ geometry_quality_log += "\n - Breakdown:"
472
+ for stat_type, count in sorted(polygon_type_stats.items()):
473
+ geometry_quality_log += f"\n - {stat_type}: {count}"
474
+
475
+ results["geometry_quality_note"] = geometry_quality_log
476
+ print(geometry_quality_log)
477
+
315
478
  return results
316
479
 
317
480
  except Exception as e:
@@ -12,67 +12,81 @@ import geopandas as gpd
12
12
  import ee
13
13
 
14
14
 
15
- def convert_geojson_to_ee(
16
- geojson_filepath: Union[str, Path, dict],
17
- enforce_wgs84: bool = True,
18
- strip_z_coords: bool = True,
19
- ) -> ee.FeatureCollection:
15
+ # ============================================================================
16
+ # HELPER FUNCTIONS FOR UNIFIED PROCESSING PATHWAY
17
+ # ============================================================================
18
+
19
+
20
+ def _sanitize_geodataframe(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
20
21
  """
21
- Converts GeoJSON data to an Earth Engine FeatureCollection.
22
- Accepts either a file path or a GeoJSON dictionary object.
23
- Optionally checks and converts the CRS to WGS 84 (EPSG:4326) if needed.
24
- Automatically handles 3D coordinates by stripping Z values when necessary.
22
+ Sanitize GeoDataFrame data types for JSON serialization.
23
+
24
+ Converts problematic data types that cannot be directly serialized:
25
+ - DateTime/Timestamp columns → "YYYY-MM-DD HH:MM:SS" strings
26
+ - Object columns → strings
27
+ - Skips geometry column
25
28
 
26
29
  Args:
27
- geojson_filepath (Union[str, Path, dict]): The filepath to the GeoJSON file (str or Path)
28
- or a GeoJSON dictionary object.
29
- enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326). Defaults to True.
30
- Only applies when input is a file path (dicts are assumed to be in WGS84).
31
- strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries. Defaults to True.
30
+ gdf (gpd.GeoDataFrame): Input GeoDataFrame
32
31
 
33
32
  Returns:
34
- ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
33
+ gpd.GeoDataFrame: GeoDataFrame with sanitized data types
34
+ """
35
+ gdf = gdf.copy()
36
+ for col in gdf.columns:
37
+ if col != gdf.geometry.name: # Skip geometry column
38
+ # Handle datetime/timestamp columns
39
+ if pd.api.types.is_datetime64_any_dtype(gdf[col]):
40
+ gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
41
+ # Handle other problematic types
42
+ elif gdf[col].dtype == "object":
43
+ # Convert any remaining non-serializable objects to strings
44
+ gdf[col] = gdf[col].astype(str)
45
+ return gdf
46
+
47
+
48
+ def _ensure_wgs84_crs(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
35
49
  """
36
- if isinstance(geojson_filepath, dict):
37
- # Input is already a GeoJSON dictionary - skip file reading
38
- geojson_data = geojson_filepath
39
- elif isinstance(geojson_filepath, (str, Path)):
40
- file_path = os.path.abspath(geojson_filepath)
50
+ Ensure GeoDataFrame uses WGS 84 (EPSG:4326) coordinate reference system.
41
51
 
42
- # Use GeoPandas to read the file and handle CRS
43
- gdf = gpd.read_file(file_path)
52
+ - If CRS is None, assumes WGS 84
53
+ - If CRS is not WGS 84, converts to WGS 84
54
+ - If already WGS 84, returns unchanged
44
55
 
45
- # NEW: Handle problematic data types before JSON conversion
46
- for col in gdf.columns:
47
- if col != gdf.geometry.name: # Skip geometry column
48
- # Handle datetime/timestamp columns
49
- if pd.api.types.is_datetime64_any_dtype(gdf[col]):
50
- gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
51
- # Handle other problematic types
52
- elif gdf[col].dtype == "object":
53
- # Convert any remaining non-serializable objects to strings
54
- gdf[col] = gdf[col].astype(str)
55
-
56
- # Check and convert CRS if needed
57
- if enforce_wgs84:
58
- if gdf.crs is None:
59
- # Assuming WGS 84 if no CRS defined
60
- pass
61
- elif gdf.crs != "EPSG:4326":
62
- gdf = gdf.to_crs("EPSG:4326")
63
-
64
- # Convert to GeoJSON
65
- geojson_data = json.loads(gdf.to_json())
66
- else:
67
- raise ValueError(
68
- "Input must be a file path (str or Path) or a GeoJSON dictionary object (dict)"
69
- )
56
+ Args:
57
+ gdf (gpd.GeoDataFrame): Input GeoDataFrame
70
58
 
71
- validation_errors = validate_geojson(geojson_data)
72
- if validation_errors:
73
- raise ValueError(f"GeoJSON validation errors: {validation_errors}")
59
+ Returns:
60
+ gpd.GeoDataFrame: GeoDataFrame in WGS 84
61
+ """
62
+ if gdf.crs is None:
63
+ # Assuming WGS 84 if no CRS defined
64
+ return gdf
65
+ elif gdf.crs != "EPSG:4326":
66
+ return gdf.to_crs("EPSG:4326")
67
+ return gdf
68
+
69
+
70
+ def _create_ee_feature_collection(
71
+ geojson_data: dict, strip_z_coords: bool = True, input_source: str = "input"
72
+ ) -> ee.FeatureCollection:
73
+ """
74
+ Create Earth Engine FeatureCollection from GeoJSON dict with error recovery.
75
+
76
+ Attempts to create EE FeatureCollection. If it fails due to 3D coordinates
77
+ and strip_z_coords is True, automatically strips Z values and retries.
78
+
79
+ Args:
80
+ geojson_data (dict): GeoJSON data dictionary
81
+ strip_z_coords (bool): Whether to retry with 2D geometries on failure
82
+ input_source (str): Description of input source for logging
83
+
84
+ Returns:
85
+ ee.FeatureCollection: Earth Engine FeatureCollection
74
86
 
75
- # Try to create the feature collection, handle 3D coordinate issues automatically
87
+ Raises:
88
+ ee.EEException: If conversion fails even after retries
89
+ """
76
90
  try:
77
91
  feature_collection = ee.FeatureCollection(
78
92
  create_feature_collection(geojson_data)
@@ -81,16 +95,16 @@ def convert_geojson_to_ee(
81
95
  except ee.EEException as e:
82
96
  if "Invalid GeoJSON geometry" in str(e) and strip_z_coords:
83
97
  # Apply print_once deduplication for Z-coordinate stripping messages
84
- if not hasattr(convert_geojson_to_ee, "_printed_z_messages"):
85
- convert_geojson_to_ee._printed_z_messages = set()
98
+ if not hasattr(_create_ee_feature_collection, "_printed_z_messages"):
99
+ _create_ee_feature_collection._printed_z_messages = set()
86
100
 
87
- z_message_key = f"z_coords_{file_path}"
88
- if z_message_key not in convert_geojson_to_ee._printed_z_messages:
101
+ z_message_key = f"z_coords_{input_source}"
102
+ if z_message_key not in _create_ee_feature_collection._printed_z_messages:
89
103
  print(
90
104
  "Warning: Invalid GeoJSON geometry detected, likely due to 3D coordinates."
91
105
  )
92
106
  print("Attempting to fix by stripping Z coordinates...")
93
- convert_geojson_to_ee._printed_z_messages.add(z_message_key)
107
+ _create_ee_feature_collection._printed_z_messages.add(z_message_key)
94
108
 
95
109
  # Apply Z-coordinate stripping
96
110
  geojson_data_fixed = _strip_z_coordinates_from_geojson(geojson_data)
@@ -101,10 +115,15 @@ def convert_geojson_to_ee(
101
115
  create_feature_collection(geojson_data_fixed)
102
116
  )
103
117
 
104
- success_message_key = f"z_coords_success_{file_path}"
105
- if success_message_key not in convert_geojson_to_ee._printed_z_messages:
118
+ success_message_key = f"z_coords_success_{input_source}"
119
+ if (
120
+ success_message_key
121
+ not in _create_ee_feature_collection._printed_z_messages
122
+ ):
106
123
  print("Successfully converted after stripping Z coordinates")
107
- convert_geojson_to_ee._printed_z_messages.add(success_message_key)
124
+ _create_ee_feature_collection._printed_z_messages.add(
125
+ success_message_key
126
+ )
108
127
 
109
128
  return feature_collection
110
129
  except Exception as retry_error:
@@ -115,6 +134,82 @@ def convert_geojson_to_ee(
115
134
  raise e
116
135
 
117
136
 
137
def convert_geojson_to_ee(
    geojson_input: Union[str, Path, dict, gpd.GeoDataFrame],
    enforce_wgs84: bool = True,
    strip_z_coords: bool = True,
) -> ee.FeatureCollection:
    """
    Build an Earth Engine FeatureCollection from GeoJSON-like input.

    Every supported input is first turned into a GeoDataFrame so that one
    pipeline handles all of them:

    1. normalise the input to a GeoDataFrame
       (GeoDataFrame → copied, dict → built from its "features" list,
       str/Path → read from disk with GeoPandas);
    2. sanitise attribute columns via ``_sanitize_geodataframe``;
    3. optionally pass the frame through ``_ensure_wgs84_crs``;
    4. serialise to a GeoJSON dict and validate it with ``validate_geojson``;
    5. hand the dict to ``_create_ee_feature_collection``.

    Args:
        geojson_input (Union[str, Path, dict, gpd.GeoDataFrame]): Path to a
            GeoJSON file, a GeoJSON dictionary, or a GeoDataFrame.
        enforce_wgs84 (bool): When True (default), run the CRS step (3).
        strip_z_coords (bool): Forwarded to ``_create_ee_feature_collection``.
            Defaults to True.

    Returns:
        ee.FeatureCollection: The collection created from the input features.

    Raises:
        ValueError: If the input type is unsupported, or if
            ``validate_geojson`` reports errors for the serialised data.
    """
    # --- 1. Normalise input; `input_source` labels it for downstream messages
    if isinstance(geojson_input, gpd.GeoDataFrame):
        input_source = "GeoDataFrame"
        gdf = geojson_input.copy()
    elif isinstance(geojson_input, dict):
        input_source = "dict"
        feature_list = geojson_input.get("features", [])
        gdf = gpd.GeoDataFrame.from_features(feature_list)
    elif isinstance(geojson_input, (str, Path)):
        file_path = os.path.abspath(geojson_input)
        input_source = f"file ({file_path})"
        gdf = gpd.read_file(file_path)
    else:
        raise ValueError(
            f"Input must be a file path (str or Path), GeoJSON dict, or GeoDataFrame. "
            f"Got {type(geojson_input).__name__}"
        )

    # --- 2. Make attribute columns JSON-serialisable
    gdf = _sanitize_geodataframe(gdf)

    # --- 3. CRS handling (skipped entirely when the caller opts out)
    if enforce_wgs84:
        gdf = _ensure_wgs84_crs(gdf)

    # --- 4. Serialise and validate before touching Earth Engine
    geojson_data = json.loads(gdf.to_json())
    validation_errors = validate_geojson(geojson_data)
    if validation_errors:
        raise ValueError(f"GeoJSON validation errors: {validation_errors}")

    # --- 5. Create the collection (helper handles any retry logic)
    return _create_ee_feature_collection(
        geojson_data,
        strip_z_coords,
        input_source,
    )
211
+
212
+
118
213
  def _strip_z_coordinates_from_geojson(geojson_data: dict) -> dict:
119
214
  """
120
215
  Helper function to strip Z coordinates from GeoJSON data.
@@ -125,7 +125,7 @@ def validate_dataframe(
125
125
  Returns:
126
126
  pd.DataFrame: The validated DataFrame with columns ordered according to the schema, or None if validation fails.
127
127
  """
128
- log_missing_columns(df_stats, schema)
128
+ _log_missing_columns(df_stats, schema)
129
129
 
130
130
  # df_stats = df_stats.reindex(schema.columns.keys(), axis=1)
131
131
 
@@ -251,7 +251,7 @@ def create_schema_from_dataframe(schema_df: pd.DataFrame) -> pa.DataFrameSchema:
251
251
  # return logger
252
252
 
253
253
 
254
- def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
254
+ def _log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
255
255
  # Initialize the logger
256
256
  logger = setup_logger(__name__)
257
257
 
@@ -675,33 +675,6 @@ def _process_custom_bands(df_extra: pd.DataFrame, custom_bands) -> pd.DataFrame:
675
675
 
676
676
 
677
677
  # Fix the duplicate logging issue
678
- def log_missing_columns(df_stats: pd.DataFrame, template_schema: pa.DataFrameSchema):
679
- # Remove the duplicate logger creation line
680
- # logger = setup_logger(__name__) # DELETE THIS LINE
681
-
682
- # Use the existing module-level logger (line 18: logger = StdoutLogger(__name__))
683
-
684
- # Extract the expected columns from the DataFrameSchema
685
- template_columns = list(template_schema.columns.keys())
686
- df_stats_columns = df_stats.columns.tolist()
687
-
688
- # Find missing and extra columns
689
- missing_in_df = [col for col in template_columns if col not in df_stats_columns]
690
- extra_in_df = [col for col in df_stats_columns if col not in template_columns]
691
-
692
- # Log missing schema columns
693
- if missing_in_df:
694
- logger.warning(f"Missing expected schema columns: {missing_in_df}")
695
- else:
696
- logger.info("All expected schema columns found in DataFrame.")
697
-
698
- # Log extra columns (will be preserved)
699
- if extra_in_df:
700
- logger.info(f"Extra columns found (will be preserved): {extra_in_df}")
701
- else:
702
- logger.info("No extra columns found in DataFrame.")
703
-
704
-
705
678
  def format_stats_dataframe(
706
679
  df,
707
680
  area_col="Area_sum",
openforis_whisp/stats.py CHANGED
@@ -93,7 +93,6 @@ def whisp_formatted_stats_geojson_to_df_legacy(
93
93
  unit_type="ha",
94
94
  whisp_image=None,
95
95
  custom_bands=None, # New parameter
96
- validate_geometries: bool = False,
97
96
  ) -> pd.DataFrame:
98
97
  """
99
98
  Legacy function for basic Whisp stats extraction.
@@ -135,48 +134,15 @@ def whisp_formatted_stats_geojson_to_df_legacy(
135
134
  - List of band names: ['Aa_test', 'elevation']
136
135
  - Dict with types: {'Aa_test': 'float64', 'elevation': 'float32'}
137
136
  - None: preserves all extra columns automatically
138
- validate_geometries : bool, optional
139
- Whether to validate and fix invalid geometries, by default False.
140
- Set to True to automatically fix invalid/self-intersecting polygons.
141
137
 
142
138
  Returns
143
139
  -------
144
140
  df_stats : pd.DataFrame
145
141
  The DataFrame containing the Whisp stats for the input ROI.
146
142
  """
147
- # Load GeoJSON and validate geometries if requested
148
- if validate_geometries:
149
- import json
150
- import geopandas as gpd
151
- from shapely.validation import make_valid
152
- import logging as py_logging
153
-
154
- logger = py_logging.getLogger("whisp-legacy")
155
-
156
- # Load GeoJSON file
157
- with open(input_geojson_filepath, "r") as f:
158
- geojson_data = json.load(f)
159
-
160
- # Convert to GeoDataFrame
161
- gdf = gpd.GeoDataFrame.from_features(geojson_data["features"])
162
-
163
- # Validate and fix invalid geometries
164
- valid_count = gdf.geometry.is_valid.sum()
165
- invalid_count = len(gdf) - valid_count
166
- if invalid_count > 0:
167
- logger.warning(f"Fixing {invalid_count} invalid geometries")
168
- gdf["geometry"] = gdf["geometry"].apply(
169
- lambda g: make_valid(g) if g and not g.is_valid else g
170
- )
171
-
172
- # Convert back to GeoJSON dict (stays in memory - no temp files!)
173
- geojson_cleaned = json.loads(gdf.to_json())
174
-
175
- # OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
176
- feature_collection = convert_geojson_to_ee(geojson_cleaned)
177
- else:
178
- # Original path - no validation
179
- feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
143
+ # Convert GeoJSON to Earth Engine FeatureCollection
144
+ # Note: Geometry validation/cleaning should be done before calling this function
145
+ feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
180
146
 
181
147
  return whisp_formatted_stats_ee_to_df(
182
148
  feature_collection,
@@ -200,7 +166,7 @@ def whisp_formatted_stats_geojson_to_df(
200
166
  mode: str = "sequential",
201
167
  batch_size: int = 10,
202
168
  max_concurrent: int = 20,
203
- validate_geometries: bool = False,
169
+ geometry_audit_trail: bool = False,
204
170
  ) -> pd.DataFrame:
205
171
  """
206
172
  Main entry point for converting GeoJSON to Whisp statistics.
@@ -248,11 +214,16 @@ def whisp_formatted_stats_geojson_to_df(
248
214
  max_concurrent : int, optional
249
215
  Maximum concurrent EE calls for concurrent mode, by default 20.
250
216
  Only applicable for "concurrent" mode.
251
- validate_geometries : bool, optional
252
- Whether to validate and fix invalid geometries, by default False.
253
- Set to True to automatically fix invalid/self-intersecting polygons.
254
- For production workflows, it's recommended to use geometry validation and
255
- cleaning tools BEFORE processing with this function.
217
+ geometry_audit_trail : bool, default False
218
+ If True, includes audit trail columns:
219
+ - geo_original: Original input geometry
220
+ - geometry_type_original: Original geometry type
221
+ - geometry_type: Processed geometry type (from EE)
222
+ - geometry_type_changed: Boolean flag if geometry changed
223
+ - geometry_degradation_type: Description of how it changed
224
+
225
+ Processing metadata stored in df.attrs['processing_metadata'].
226
+ These columns enable full transparency for geometry modifications during processing.
256
227
 
257
228
  Returns
258
229
  -------
@@ -317,7 +288,6 @@ def whisp_formatted_stats_geojson_to_df(
317
288
  unit_type=unit_type,
318
289
  whisp_image=whisp_image,
319
290
  custom_bands=custom_bands,
320
- validate_geometries=validate_geometries,
321
291
  )
322
292
  elif mode in ("concurrent", "sequential"):
323
293
  # Log info if batch_size or max_concurrent are not used in sequential mode
@@ -344,7 +314,7 @@ def whisp_formatted_stats_geojson_to_df(
344
314
  mode=mode, # Pass mode directly (concurrent or sequential)
345
315
  batch_size=batch_size,
346
316
  max_concurrent=max_concurrent,
347
- validate_geometries=validate_geometries,
317
+ geometry_audit_trail=geometry_audit_trail,
348
318
  )
349
319
  else:
350
320
  raise ValueError(