openforis-whisp 3.0.0a1__py3-none-any.whl → 3.0.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -32,7 +32,7 @@ import os
  import subprocess
  from contextlib import redirect_stdout, contextmanager
  from pathlib import Path
- from typing import Optional, List, Dict, Any, Tuple
+ from typing import Optional, List, Dict, Any, Tuple, Union
  from concurrent.futures import ThreadPoolExecutor, as_completed
  import tempfile
@@ -203,6 +203,57 @@ def _extract_decimal_places(format_string: str) -> int:
      return 2  # Default to 2 decimal places
 
 
+ def _normalize_keep_external_columns(
+     keep_external_columns: Union[bool, List[str]],
+     all_columns: List[str],
+     plot_id_column: str = "plotId",
+ ) -> List[str]:
+     """
+     Normalize the keep_external_columns parameter to a list of column names.
+ 
+     Converts flexible user input (bool or list) to a concrete list of columns to keep.
+ 
+     Parameters
+     ----------
+     keep_external_columns : bool or List[str]
+         - False: keep nothing (return empty list)
+         - True: keep all columns except geometry and plot_id
+         - List[str]: keep specific columns (return as-is)
+     all_columns : List[str]
+         All available columns to choose from
+     plot_id_column : str
+         Name of plot ID column to exclude
+ 
+     Returns
+     -------
+     List[str]
+         Columns to keep from external (GeoJSON) data
+ 
+     Examples
+     --------
+     >>> _normalize_keep_external_columns(False, ["id", "Country", "geometry"], "id")
+     []
+ 
+     >>> _normalize_keep_external_columns(True, ["id", "Country", "geometry"], "id")
+     ['Country']
+ 
+     >>> _normalize_keep_external_columns(["Country"], ["id", "Country", "geometry"], "id")
+     ['Country']
+     """
+     if keep_external_columns is True:
+         # Keep all columns except geometry and plot_id
+         return [c for c in all_columns if c not in [plot_id_column, "geometry"]]
+     elif keep_external_columns is False:
+         # Keep nothing
+         return []
+     else:
+         # Use provided list (handle None case)
+         return keep_external_columns or []
+ 
  def _add_admin_context(
      df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
  ) -> pd.DataFrame:
@@ -226,7 +277,7 @@ def _add_admin_context(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if admin_code_col not in df.columns:
@@ -347,7 +398,7 @@ def join_admin_codes(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if id_col not in df.columns:
@@ -408,8 +459,9 @@ class ProgressTracker:
      """
      Track batch processing progress with time estimation.
 
-     Shows progress at key milestones (25%, 50%, 75%, 100%) with estimated
-     time remaining based on processing speed.
+     Shows progress at adaptive milestones (more frequent for small datasets,
+     less frequent for large datasets) with estimated time remaining based on
+     processing speed.
      """
 
      def __init__(self, total: int, logger: logging.Logger = None):
@@ -426,8 +478,19 @@ class ProgressTracker:
          self.total = total
          self.completed = 0
          self.lock = threading.Lock()
-         self.logger = logger or logging.getLogger("whisp-concurrent")
-         self.milestones = {25, 50, 75, 100}
+         self.logger = logger or logging.getLogger("whisp")
+ 
+         # Adaptive milestones based on dataset size
+         # Small datasets (< 50): show every 25% (not too spammy)
+         # Medium (50-500): show every 20%
+         # Large (500+): show every 10% (more frequent feedback on long runs)
+         if total < 50:
+             self.milestones = {25, 50, 75, 100}
+         elif total < 500:
+             self.milestones = {20, 40, 60, 80, 100}
+         else:
+             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+ 
          self.shown_milestones = set()
          self.start_time = time.time()
          self.last_update_time = self.start_time
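For reference, the selection the new `__init__` makes at different dataset sizes, as a standalone sketch (`milestones_for` is an illustrative name, not part of the package API):

```python
def milestones_for(total: int) -> set:
    # Mirrors the adaptive thresholds in ProgressTracker.__init__ above.
    if total < 50:
        return {25, 50, 75, 100}
    elif total < 500:
        return {20, 40, 60, 80, 100}
    return {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}

print(sorted(milestones_for(30)))    # [25, 50, 75, 100]
print(sorted(milestones_for(200)))   # [20, 40, 60, 80, 100]
print(sorted(milestones_for(2000)))  # [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
```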
@@ -537,16 +600,22 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
          If incorrect endpoint and raise_error=True
      """
      if not check_ee_endpoint(endpoint_type):
-         msg = (
-             f"Not using {endpoint_type.upper()} endpoint.\n"
-             f"Current URL: {ee.data._cloud_api_base_url}\n"
-             f"\nTo use {endpoint_type} endpoint, run:\n"
-         )
-         msg += "ee.Reset()\n"
          if endpoint_type == "high-volume":
-             msg += " ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
-         else:
-             msg += " ee.Initialize() # Uses standard endpoint by default"
+             msg = (
+                 "Concurrent mode requires the HIGH-VOLUME endpoint. To change endpoint run:\n"
+                 "ee.Reset()\n"
+                 "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')\n"
+                 "Or with project specified (e.g. when in Colab):\n"
+                 "ee.Initialize(project='your_cloud_project_name', opt_url='https://earthengine-highvolume.googleapis.com')"
+             )
+         else:  # standard endpoint
+             msg = (
+                 "Sequential mode requires the STANDARD endpoint. To change endpoint run:\n"
+                 "ee.Reset()\n"
+                 "ee.Initialize()\n"
+                 "Or with project specified (e.g. when in Colab):\n"
+                 "ee.Initialize(project='your_cloud_project_name')"
+             )
 
          if raise_error:
              raise RuntimeError(msg)
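The remediation the new messages prescribe, as a runnable snippet (both calls use the public earthengine-api; the high-volume URL is the one quoted in the message, and the project name is a placeholder):

```python
import ee

# Concurrent mode: re-initialize against the high-volume endpoint.
ee.Reset()
ee.Initialize(opt_url="https://earthengine-highvolume.googleapis.com")

# Sequential mode: re-initialize against the standard endpoint
# (add project="your_cloud_project_name" in Colab or similar environments).
ee.Reset()
ee.Initialize()
```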
@@ -713,8 +782,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      """
      Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
 
-     OPTIMIZATION: Uses GeoJSON dict input directly to avoid temp file I/O.
-     This provides ~67% performance improvement over writing to disk.
+     OPTIMIZATION: Passes GeoDataFrame directly to convert_geojson_to_ee to preserve CRS.
+     This ensures proper coordinate system handling and reprojection to WGS84 if needed.
 
      Preserves the __row_id__ column if present so it can be retrieved after processing.
@@ -728,10 +797,13 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      ee.FeatureCollection
          EE FeatureCollection with __row_id__ as a feature property
      """
-     # OPTIMIZATION: Convert to GeoJSON dict and pass directly
-     # This eliminates the need to write to/read from temp files (~67% faster)
-     geojson_dict = json.loads(batch_gdf.to_json())
-     fc = convert_geojson_to_ee(geojson_dict)
+     # Pass GeoDataFrame directly to preserve CRS metadata
+     # convert_geojson_to_ee will handle:
+     # - CRS detection and conversion to WGS84 if needed
+     # - Data type sanitization (datetime, object columns)
+     # - Geometry validation and Z-coordinate stripping
+ 
+     fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
 
      # If __row_id__ is in the original GeoDataFrame, it will be preserved
      # as a feature property in the GeoJSON and thus in the EE FeatureCollection
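To illustrate the CRS pitfall this change avoids, a minimal GeoPandas sketch (the UTM point is a standard test coordinate; the reprojection shown is what the comments above say `convert_geojson_to_ee` now handles internally):

```python
import geopandas as gpd
from shapely.geometry import Point

# A GeoDataFrame in a projected CRS (UTM zone 33N). Serializing it via
# gdf.to_json() drops the CRS, so metre coordinates would be misread as
# lon/lat degrees.
gdf = gpd.GeoDataFrame(
    {"plotId": [1]}, geometry=[Point(500_000, 4_649_776)], crs="EPSG:32633"
)

# Passing the GeoDataFrame itself keeps gdf.crs available for reprojection:
print(gdf.to_crs("EPSG:4326").geometry.iloc[0])  # POINT (15 42), approximately
```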
@@ -740,8 +812,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
 
  def clean_geodataframe(
      gdf: gpd.GeoDataFrame,
-     remove_nulls: bool = True,
-     fix_invalid: bool = True,
+     remove_nulls: bool = False,
+     repair_geometries: bool = False,
      logger: logging.Logger = None,
  ) -> gpd.GeoDataFrame:
      """
@@ -752,9 +824,11 @@ def clean_geodataframe(
      gdf : gpd.GeoDataFrame
          Input GeoDataFrame
      remove_nulls : bool
-         Remove null geometries
-     fix_invalid : bool
-         Fix invalid geometries
+         Remove null geometries. Defaults to False to preserve data integrity.
+         Set to True only if you explicitly want to drop rows with null geometries.
+     repair_geometries : bool
+         Repair invalid geometries using Shapely's make_valid(). Defaults to False
+         to preserve original geometries. Set to True to repair invalid geometries
+         automatically.
      logger : logging.Logger, optional
          Logger for output
@@ -763,7 +837,7 @@ def clean_geodataframe(
      gpd.GeoDataFrame
          Cleaned GeoDataFrame
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      if remove_nulls:
          null_count = gdf.geometry.isna().sum()
@@ -771,11 +845,11 @@ def clean_geodataframe(
              logger.warning(f"Removing {null_count} null geometries")
              gdf = gdf[~gdf.geometry.isna()].copy()
 
-     if fix_invalid:
+     if repair_geometries:
          valid_count = gdf.geometry.is_valid.sum()
          invalid_count = len(gdf) - valid_count
          if invalid_count > 0:
-             logger.warning(f"Fixing {invalid_count} invalid geometries")
+             logger.warning(f"Repairing {invalid_count} invalid geometries")
              from shapely.validation import make_valid
 
              gdf = gdf.copy()
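A sketch of the new opt-in behaviour (assuming `clean_geodataframe` is imported from this module; the self-intersecting "bowtie" is a stock example of an invalid geometry):

```python
import geopandas as gpd
from shapely.geometry import Polygon

bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])  # invalid: self-intersecting
gdf = gpd.GeoDataFrame(geometry=[bowtie, None], crs="EPSG:4326")

untouched = clean_geodataframe(gdf)  # new defaults: data passes through as-is
print(len(untouched))  # 2 (nothing removed, nothing repaired)

repaired = clean_geodataframe(gdf, remove_nulls=True, repair_geometries=True)
print(len(repaired), repaired.geometry.is_valid.all())  # 1 True
```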
@@ -787,6 +861,19 @@ def clean_geodataframe(
      return gdf
 
 
+ # ============================================================================
+ # BATCH RETRY HELPER
+ # ============================================================================
+ 
+ 
+ # ============================================================================
+ # BATCH RETRY HELPER - DEPRECATED (removed due to semaphore deadlock issues)
+ # ============================================================================
+ # Note: Retry logic via sub-batching has been removed. Instead, use fail-fast
+ # approach: when a batch fails, reduce batch_size parameter and retry manually.
+ # This avoids semaphore deadlocks and provides clearer error messages.
+ 
+ 
  # ============================================================================
  # EE PROCESSING WITH RETRY LOGIC
  # ============================================================================
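The fail-fast approach moves retry logic to the caller. A hypothetical caller-side wrapper (the name `run_with_fallback` and the exact keyword arguments are assumptions based on the signatures visible elsewhere in this diff):

```python
def run_with_fallback(filepath, batch_size=10, min_batch_size=1):
    # Halve batch_size after each failed run instead of sub-batching in-library.
    while True:
        try:
            return whisp_stats_geojson_to_df_concurrent(
                input_geojson_filepath=filepath, batch_size=batch_size
            )
        except RuntimeError as exc:
            if batch_size <= min_batch_size:
                raise  # already at the smallest batch size; give up
            batch_size = max(min_batch_size, batch_size // 2)
            print(f"Batch failure; retrying with batch_size={batch_size} ({exc})")
```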
@@ -828,7 +915,7 @@ def process_ee_batch(
      RuntimeError
          If processing fails after all retries
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      for attempt in range(max_retries):
          try:
@@ -955,7 +1042,7 @@ def whisp_stats_geojson_to_df_concurrent(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
      _suppress_verbose_output(max_concurrent=max_concurrent)
@@ -973,11 +1060,23 @@ def whisp_stats_geojson_to_df_concurrent(
      logger.info(f"Loaded {len(gdf):,} features")
 
      if validate_geometries:
-         gdf = clean_geodataframe(gdf, logger=logger)
+         gdf = clean_geodataframe(
+             gdf, remove_nulls=False, repair_geometries=False, logger=logger
+         )
 
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)
 
+     # Strip unnecessary properties before sending to EE
+     # Keep only: geometry, plot_id_column, and external_id_column
+     # This prevents duplication of GeoJSON properties in EE results
+     keep_cols = ["geometry", plot_id_column]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+ 
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+ 
      # Create image if not provided
      if whisp_image is None:
          logger.debug("Creating Whisp image...")
@@ -1001,8 +1100,8 @@ def whisp_stats_geojson_to_df_concurrent(
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
      # Batch the data
-     batches = batch_geodataframe(gdf, batch_size)
-     logger.info(f"Processing {len(gdf):,} features in {len(batches)} batches")
+     batches = batch_geodataframe(gdf_for_ee, batch_size)
+     logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
 
      # Setup semaphore for EE concurrency control
      ee_semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -1056,7 +1155,12 @@ def whisp_stats_geojson_to_df_concurrent(
              for i, batch in enumerate(batches)
          }
 
+         # Track which batches failed for retry
+         batch_map = {i: batch for i, batch in enumerate(batches)}
+         batch_futures = {future: i for future, i in futures.items()}
+ 
          for future in as_completed(futures):
+             batch_idx = batch_futures[future]
              try:
                  batch_idx, df_server, df_client = future.result()
@@ -1064,8 +1168,35 @@ def whisp_stats_geojson_to_df_concurrent(
                  if plot_id_column not in df_server.columns:
                      df_server[plot_id_column] = range(len(df_server))
 
-                 merged = df_server.merge(
-                     df_client,
+                 # Keep all EE statistics from server (all columns with _sum and _median suffixes)
+                 # These are the actual EE processing results
+                 df_server_clean = df_server.copy()
+ 
+                 # Keep external metadata: plot_id, external_id, geometry, geometry type, and centroids from client
+                 # (formatted wrapper handles keep_external_columns parameter)
+                 keep_external_columns = [plot_id_column]
+                 if (
+                     external_id_column
+                     and external_id_column in df_client.columns
+                 ):
+                     keep_external_columns.append(external_id_column)
+                 if "geometry" in df_client.columns:
+                     keep_external_columns.append("geometry")
+                 # Keep geometry type column (Geometry_type)
+                 if geometry_type_column in df_client.columns:
+                     keep_external_columns.append(geometry_type_column)
+                 # Also keep centroid columns (Centroid_lon, Centroid_lat)
+                 centroid_cols = [
+                     c for c in df_client.columns if c.startswith("Centroid_")
+                 ]
+                 keep_external_columns.extend(centroid_cols)
+ 
+                 df_client_clean = df_client[
+                     [c for c in keep_external_columns if c in df_client.columns]
+                 ].drop_duplicates()
+ 
+                 merged = df_server_clean.merge(
+                     df_client_clean,
                      on=plot_id_column,
                      how="left",
                      suffixes=("_ee", "_client"),
@@ -1074,12 +1205,16 @@ def whisp_stats_geojson_to_df_concurrent(
                  progress.update()
 
              except Exception as e:
+                 # Batch failed - fail fast with clear guidance
                  error_msg = str(e)
-                 logger.error(f"Batch processing error: {error_msg[:100]}")
-                 import traceback
+                 logger.error(f"Batch {batch_idx} failed: {error_msg[:100]}")
+                 logger.debug(f"Full error: {error_msg}")
+ 
+                 # Get original batch for error reporting
+                 original_batch = batch_map[batch_idx]
 
-                 logger.debug(traceback.format_exc())
-                 batch_errors.append(error_msg)
+                 # Add to batch errors for final reporting
+                 batch_errors.append((batch_idx, original_batch, error_msg))
              finally:
                  # Restore logger levels
                  fiona_logger.setLevel(old_fiona_level)
@@ -1087,8 +1222,60 @@ def whisp_stats_geojson_to_df_concurrent(
 
      progress.finish()
 
-     # Check if we should retry with validation due to band errors
-     if batch_errors and not results:
+     # If we have batch errors after retry attempts, fail the entire process
+     if batch_errors:
+         total_failed_rows = sum(len(batch) for _, batch, _ in batch_errors)
+         failed_batch_indices = [str(idx) for idx, _, _ in batch_errors]
+ 
+         # Format detailed error information for debugging
+         error_details_list = []
+         for idx, batch, msg in batch_errors:
+             error_details_list.append(f" Batch {idx} ({len(batch)} features): {msg}")
+         error_details = "\n".join(error_details_list)
+ 
+         # Analyze error patterns for debugging hints
+         error_patterns = {
+             "memory": any("memory" in msg.lower() for _, _, msg in batch_errors),
+             "request_size": any(
+                 keyword in msg.lower()
+                 for _, _, msg in batch_errors
+                 for keyword in ["too large", "10mb", "payload", "size limit"]
+             ),
+             "quota": any("quota" in msg.lower() for _, _, msg in batch_errors),
+             "timeout": any("timeout" in msg.lower() for _, _, msg in batch_errors),
+         }
+ 
+         # Build helpful suggestions based on error patterns
+         suggestions = []
+         if error_patterns["memory"]:
+             suggestions.append(
+                 f" • Reduce batch_size parameter (currently: {batch_size}). Try: batch_size=5 or lower"
+             )
+         if error_patterns["request_size"]:
+             suggestions.append(
+                 " • Request payload too large: reduce batch_size or simplify input geometries"
+             )
+         if error_patterns["quota"]:
+             suggestions.append(" • Earth Engine quota exceeded: wait and retry later")
+         if error_patterns["timeout"]:
+             suggestions.append(
+                 " • Processing timeout: reduce batch_size or simplify input geometries"
+             )
+ 
+         suggestions_text = (
+             "\nDebugging hints:\n" + "\n".join(suggestions) if suggestions else ""
+         )
+ 
+         raise RuntimeError(
+             f"Failed to process {len(batch_errors)} batch(es):\n"
+             f"\n{error_details}\n"
+             f"\nTotal rows affected: {total_failed_rows}\n"
+             f"{suggestions_text}\n"
+             f"Please reduce batch_size and try again."
+         )
+ 
+     # Check if we should retry with validation due to band errors (legacy band error handling)
+     if not results:
          # All batches failed - likely a bad band issue
          is_band_error = any(
              keyword in str(batch_errors)
@@ -1442,7 +1629,7 @@ def whisp_stats_geojson_to_df_sequential(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (sequential has lower concurrency, use default)
      _suppress_verbose_output(max_concurrent=1)
@@ -1459,8 +1646,10 @@ def whisp_stats_geojson_to_df_sequential(
      gdf = _load_geojson_silently(input_geojson_filepath)
      logger.info(f"Loaded {len(gdf):,} features")
 
-     # Clean geometries
-     gdf = clean_geodataframe(gdf, logger=logger)
+     # Clean geometries (preserve both null and invalid geometries by default)
+     gdf = clean_geodataframe(
+         gdf, remove_nulls=False, repair_geometries=False, logger=logger
+     )
 
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)
@@ -1469,6 +1658,16 @@ def whisp_stats_geojson_to_df_sequential(
      row_id_col = "__row_id__"
      gdf[row_id_col] = range(len(gdf))
 
+     # Strip unnecessary properties before sending to EE
+     # Keep only: geometry, plot_id_column, and external_id_column
+     # This prevents duplication of GeoJSON properties in EE results
+     keep_cols = ["geometry", plot_id_column, row_id_col]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+ 
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+ 
 
      # Create image if not provided
      if whisp_image is None:
@@ -1491,7 +1690,7 @@ def whisp_stats_geojson_to_df_sequential(
      # Convert to EE (suppress print statements from convert_geojson_to_ee)
      logger.debug("Converting to EE FeatureCollection...")
      with redirect_stdout(io.StringIO()):
-         fc = convert_geojson_to_ee(input_geojson_filepath)
+         fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
      # Create reducer
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1633,6 +1832,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON concurrently with automatic formatting and validation.
@@ -1683,15 +1883,22 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     geometry_audit_trail : bool, default False
+         If True, includes original input geometry column:
+         - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+         Enables geometry traceability for compliance and audit purposes.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
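Hypothetical usage of the new flag (the file name is a placeholder; `geo_original` and the `attrs` metadata are the fields documented above):

```python
import json
from shapely.geometry import shape

df = whisp_formatted_stats_geojson_to_df_concurrent(
    "plots.geojson",
    geometry_audit_trail=True,
)

# geo_original stores each input geometry as a GeoJSON string.
original_geom = shape(json.loads(df["geo_original"].iloc[0]))
print(df.attrs["processing_metadata"]["processing_mode"])  # "concurrent"
```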
@@ -1699,6 +1906,12 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          decimal_places = _extract_decimal_places(stats_area_columns_formatting)
          logger.debug(f"Using decimal_places={decimal_places} from config")
 
+     # Load original geometries once here if needed for audit trail (avoid reloading later)
+     gdf_original_geoms = None
+     if geometry_audit_trail:
+         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+         gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (concurrent)...")
      df_raw = whisp_stats_geojson_to_df_concurrent(
@@ -1759,6 +1972,57 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+     if geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+             if gdf_original_geoms is None:
+                 logger.warning("Original geometries not pre-loaded, loading now...")
+                 gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
+             # Use plotId from df_validated to maintain mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                     "geo_original": gdf_original_geoms["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                 }
+             )
+ 
+             # Merge original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+ 
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "3.0.0a1",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "concurrent",
+                 "ee_endpoint": "high_volume",
+                 "validate_geometries": validate_geometries,
+                 "datasets_used": national_codes or [],
+                 "geometry_audit_trail": True,
+             }
+ 
+             logger.info(f"Audit trail added: geo_original column")
+ 
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without audit trail if something fails
+ 
+     # Add processing metadata column using pd.concat to avoid fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+ 
      logger.info("Concurrent processing + formatting + validation complete")
      return df_validated
@@ -1779,6 +2043,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON sequentially with automatic formatting and validation.
@@ -1821,15 +2086,22 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     geometry_audit_trail : bool, default False
+         If True, includes original input geometry column:
+         - geo_original: Original input geometry (before EE processing), stored as GeoJSON
+         Enables geometry traceability for compliance and audit purposes.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
@@ -1837,6 +2109,12 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          decimal_places = _extract_decimal_places(stats_area_columns_formatting)
          logger.debug(f"Using decimal_places={decimal_places} from config")
 
+     # Load original geometries once here if needed for audit trail (avoid reloading later)
+     gdf_original_geoms = None
+     if geometry_audit_trail:
+         logger.debug("Pre-loading GeoJSON for geometry audit trail...")
+         gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (sequential)...")
      df_raw = whisp_stats_geojson_to_df_sequential(
@@ -1893,6 +2171,56 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation to preserve columns)
+     if geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Use pre-loaded original geometries (loaded at wrapper start to avoid reloading)
+             if gdf_original_geoms is None:
+                 logger.warning("Original geometries not pre-loaded, loading now...")
+                 gdf_original_geoms = _load_geojson_silently(input_geojson_filepath)
+ 
+             # Use plotId from df_validated to maintain mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original_geoms)],
+                     "geo_original": gdf_original_geoms["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                 }
+             )
+ 
+             # Merge original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+ 
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "3.0.0a1",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "sequential",
+                 "ee_endpoint": "standard",
+                 "datasets_used": national_codes or [],
+                 "geometry_audit_trail": True,
+             }
+ 
+             logger.info(f"Audit trail added: geo_original column")
+ 
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without audit trail if something fails
+ 
+     # Add processing metadata column using pd.concat to avoid fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+ 
      logger.info("Sequential processing + formatting + validation complete")
      return df_validated
@@ -1910,7 +2238,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
      unit_type: str = "ha",
      whisp_image: ee.Image = None,
      custom_bands: Dict[str, Any] = None,
-     mode: str = "auto",
+     mode: str = "sequential",
      # Concurrent-specific parameters
      batch_size: int = 10,
      max_concurrent: int = 20,
@@ -1923,14 +2251,15 @@ def whisp_formatted_stats_geojson_to_df_fast(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON to Whisp statistics with optimized fast processing.
 
-     Automatically selects between concurrent (high-volume endpoint) and sequential
-     (standard endpoint) based on file size, or allows explicit mode selection.
+     Routes to concurrent (high-volume endpoint) or sequential (standard endpoint)
+     based on explicit mode selection.
 
-     This is the recommended entry point for most users who want automatic optimization.
+     This is the recommended entry point for most users.
 
      Parameters
      ----------
@@ -1950,12 +2279,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
          Custom band information
      mode : str
          Processing mode:
-         - "auto": Choose based on file size (default)
-             * <1MB: sequential
-             * 1-5MB: sequential
-             * >5MB: concurrent
-         - "concurrent": Force high-volume endpoint (batch processing)
-         - "sequential": Force standard endpoint (single-threaded)
+         - "concurrent": Uses high-volume endpoint with batch processing
+         - "sequential": Uses standard endpoint for sequential processing (default)
      batch_size : int
          Features per batch (only for concurrent mode)
      max_concurrent : int
@@ -1976,6 +2301,8 @@ def whisp_formatted_stats_geojson_to_df_fast(
          Water flag ratio threshold
      sort_column : str
          Column to sort by
+     geometry_audit_trail : bool
+         Include geometry modification audit trail columns
 
      Returns
      -------
@@ -1984,52 +2311,30 @@ def whisp_formatted_stats_geojson_to_df_fast(
 
      Examples
-     >>> # Auto-detect best method based on file size
-     >>> df = whisp_formatted_stats_geojson_to_df_fast("data.geojson")
-
-     >>> # Force concurrent processing for large datasets
+     >>> # Use concurrent processing (recommended for most datasets)
      >>> df = whisp_formatted_stats_geojson_to_df_fast(
-     ...     "large_data.geojson",
+     ...     "data.geojson",
      ...     mode="concurrent"
      ... )
 
-     >>> # Use sequential for guaranteed completion
+     >>> # Use sequential processing for more stable results
      >>> df = whisp_formatted_stats_geojson_to_df_fast(
      ...     "data.geojson",
      ...     mode="sequential"
      ... )
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
-     # Determine processing mode
-     if mode == "auto":
-         try:
-             file_size = Path(input_geojson_filepath).stat().st_size
-             if file_size > 5_000_000:  # >5MB
-                 chosen_mode = "concurrent"
-                 logger.info(
-                     f"File size {file_size/1e6:.1f}MB → Using concurrent (high-volume endpoint)"
-                 )
-             else:  # <=5MB
-                 chosen_mode = "sequential"
-                 logger.info(
-                     f"File size {file_size/1e6:.1f}MB → Using sequential (standard endpoint)"
-                 )
-         except Exception as e:
-             logger.warning(
-                 f"Could not determine file size: {e}. Defaulting to sequential."
-             )
-             chosen_mode = "sequential"
-     elif mode in ("concurrent", "sequential"):
-         chosen_mode = mode
-         logger.info(f"Mode explicitly set to: {mode}")
-     else:
+     # Validate mode parameter
+     if mode not in ("concurrent", "sequential"):
          raise ValueError(
-             f"Invalid mode '{mode}'. Must be 'auto', 'concurrent', or 'sequential'."
+             f"Invalid mode '{mode}'. Must be 'concurrent' or 'sequential'."
          )
 
+     logger.info(f"Mode: {mode}")
+ 
      # Route to appropriate function
-     if chosen_mode == "concurrent":
+     if mode == "concurrent":
          logger.debug("Routing to concurrent processing...")
          return whisp_formatted_stats_geojson_to_df_concurrent(
              input_geojson_filepath=input_geojson_filepath,
@@ -2050,6 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             geometry_audit_trail=geometry_audit_trail,
          )
      else:  # sequential
          logger.debug("Routing to sequential processing...")
@@ -2067,4 +2373,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             geometry_audit_trail=geometry_audit_trail,
          )
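Migration note for existing callers: mode="auto" was removed and the default changed to "sequential", so pick a mode explicitly; a minimal sketch (the file name is a placeholder):

```python
# mode="auto" now raises ValueError; choose the endpoint explicitly.
df = whisp_formatted_stats_geojson_to_df_fast(
    "data.geojson",
    mode="concurrent",  # or "sequential" (the new default)
    batch_size=10,
    max_concurrent=20,
)
```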