openforis-whisp 3.0.0a1__tar.gz → 3.0.0a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/PKG-INFO +1 -1
  2. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/pyproject.toml +1 -1
  3. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/advanced_stats.py +387 -29
  4. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/data_checks.py +178 -15
  5. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/data_conversion.py +154 -59
  6. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/stats.py +21 -6
  7. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/LICENSE +0 -0
  8. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/README.md +0 -0
  9. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/__init__.py +0 -0
  10. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/datasets.py +0 -0
  11. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/logger.py +0 -0
  12. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/__init__.py +0 -0
  13. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/config_runtime.py +0 -0
  14. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/lookup_context_and_metadata.csv +0 -0
  15. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/lookup_gaul1_admin.py +0 -0
  16. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/parameters/lookup_gee_datasets.csv +0 -0
  17. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/pd_schemas.py +0 -0
  18. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/reformat.py +0 -0
  19. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/risk.py +0 -0
  20. {openforis_whisp-3.0.0a1 → openforis_whisp-3.0.0a2}/src/openforis_whisp/utils.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: openforis-whisp
- Version: 3.0.0a1
+ Version: 3.0.0a2
  Summary: Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations.
  License: MIT
  Keywords: whisp,geospatial,data-processing
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
  [tool.poetry]
  name = "openforis-whisp"
- version = "3.0.0a1"
+ version = "3.0.0a2"
  description = "Whisp (What is in that plot) is an open-source solution which helps to produce relevant forest monitoring information and support compliance with deforestation-related regulations."
  repository = "https://github.com/forestdatapartnership/whisp"
  authors = ["Andy Arnell <andrew.arnell@fao.org>"]
@@ -32,7 +32,7 @@ import os
  import subprocess
  from contextlib import redirect_stdout, contextmanager
  from pathlib import Path
- from typing import Optional, List, Dict, Any, Tuple
+ from typing import Optional, List, Dict, Any, Tuple, Union
  from concurrent.futures import ThreadPoolExecutor, as_completed
  import tempfile
 
@@ -203,6 +203,57 @@ def _extract_decimal_places(format_string: str) -> int:
      return 2  # Default to 2 decimal places
 
 
+ def _normalize_keep_external_columns(
+     keep_external_columns: Union[bool, List[str]],
+     all_columns: List[str],
+     plot_id_column: str = "plotId",
+ ) -> List[str]:
+     """
+     Normalize the keep_external_columns parameter to a list of column names.
+
+     Converts flexible user input (bool or list) to a concrete list of columns to keep.
+
+     Parameters
+     ----------
+     keep_external_columns : bool or List[str]
+         - False: keep nothing (return an empty list)
+         - True: keep all columns except geometry and the plot ID
+         - List[str]: keep the named columns (returned as-is)
+     all_columns : List[str]
+         All available columns to choose from
+     plot_id_column : str
+         Name of the plot ID column to exclude
+
+     Returns
+     -------
+     List[str]
+         Columns to keep from the external (GeoJSON) data
+
+     Examples
+     --------
+     >>> _normalize_keep_external_columns(False, ["id", "Country", "geometry"], "id")
+     []
+     >>> _normalize_keep_external_columns(True, ["id", "Country", "geometry"], "id")
+     ['Country']
+     >>> _normalize_keep_external_columns(["Country"], ["id", "Country", "geometry"], "id")
+     ['Country']
+     """
+     if keep_external_columns is True:
+         # Keep all columns except geometry and the plot ID
+         return [c for c in all_columns if c not in [plot_id_column, "geometry"]]
+     elif keep_external_columns is False:
+         # Keep nothing
+         return []
+     else:
+         # Use the provided list (handle the None case)
+         return keep_external_columns or []
+
+
  def _add_admin_context(
      df: pd.DataFrame, admin_code_col: str = "admin_code_median", debug: bool = False
  ) -> pd.DataFrame:
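The new helper is small enough to sanity-check directly. A minimal usage sketch (assuming openforis-whisp 3.0.0a2 is installed; `_normalize_keep_external_columns` is a private helper, so this import path is an assumption):

    from openforis_whisp.advanced_stats import _normalize_keep_external_columns

    cols = ["plotId", "Country", "farm_ref", "geometry"]

    print(_normalize_keep_external_columns(False, cols))        # [] - keep nothing
    print(_normalize_keep_external_columns(True, cols))         # ['Country', 'farm_ref']
    print(_normalize_keep_external_columns(["Country"], cols))  # ['Country'] - passed through
    print(_normalize_keep_external_columns(None, cols))         # [] - None falls back to empty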
@@ -226,7 +277,7 @@ def _add_admin_context(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if admin_code_col not in df.columns:
@@ -347,7 +398,7 @@ def join_admin_codes(
      pd.DataFrame
          DataFrame with added Country, ProducerCountry, Admin_Level_1 columns
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Return early if admin code column doesn't exist
      if id_col not in df.columns:
@@ -408,8 +459,9 @@ class ProgressTracker:
      """
      Track batch processing progress with time estimation.
 
-     Shows progress at key milestones (25%, 50%, 75%, 100%) with estimated
-     time remaining based on processing speed.
+     Shows progress at adaptive milestones (more frequent for small datasets,
+     less frequent for large datasets) with estimated time remaining based on
+     processing speed.
      """
 
      def __init__(self, total: int, logger: logging.Logger = None):
@@ -426,8 +478,19 @@ class ProgressTracker:
          self.total = total
          self.completed = 0
          self.lock = threading.Lock()
-         self.logger = logger or logging.getLogger("whisp-concurrent")
-         self.milestones = {25, 50, 75, 100}
+         self.logger = logger or logging.getLogger("whisp")
+
+         # Adaptive milestones based on dataset size
+         # Small datasets (< 50): show every 25% (not too spammy)
+         # Medium (50-499): show every 20%
+         # Large (500+): show every 10% (more frequent feedback on long runs)
+         if total < 50:
+             self.milestones = {25, 50, 75, 100}
+         elif total < 500:
+             self.milestones = {20, 40, 60, 80, 100}
+         else:
+             self.milestones = {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}
+
          self.shown_milestones = set()
          self.start_time = time.time()
          self.last_update_time = self.start_time
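For reference, the milestone selection can be reproduced in isolation to see which progress lines a given run will emit. A small sketch with the thresholds copied from the diff above:

    def milestones_for(total: int) -> set:
        # Mirrors ProgressTracker.__init__: finer-grained reporting for larger runs
        if total < 50:
            return {25, 50, 75, 100}
        elif total < 500:
            return {20, 40, 60, 80, 100}
        return {10, 20, 30, 40, 50, 60, 70, 80, 90, 100}

    for total in (10, 200, 5000):
        print(total, sorted(milestones_for(total)))
    # 10 -> [25, 50, 75, 100]; 200 -> [20, 40, 60, 80, 100]; 5000 -> every 10%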
@@ -544,9 +607,11 @@ def validate_ee_endpoint(endpoint_type: str = "high-volume", raise_error: bool =
          )
          msg += "ee.Reset()\n"
          if endpoint_type == "high-volume":
-             msg += " ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
+             msg += (
+                 "ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')"
+             )
          else:
-             msg += " ee.Initialize()  # Uses standard endpoint by default"
+             msg += "ee.Initialize()  # Uses standard endpoint by default"
 
      if raise_error:
          raise RuntimeError(msg)
@@ -713,8 +778,8 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      """
      Convert a batch GeoDataFrame to EE FeatureCollection efficiently.
 
-     OPTIMIZATION: Uses GeoJSON dict input directly to avoid temp file I/O.
-     This provides ~67% performance improvement over writing to disk.
+     OPTIMIZATION: Passes the GeoDataFrame directly to convert_geojson_to_ee to preserve CRS.
+     This ensures proper coordinate system handling and reprojection to WGS84 if needed.
 
      Preserves the __row_id__ column if present so it can be retrieved after processing.
 
@@ -728,10 +793,13 @@ def convert_batch_to_ee(batch_gdf: gpd.GeoDataFrame) -> ee.FeatureCollection:
      ee.FeatureCollection
          EE FeatureCollection with __row_id__ as a feature property
      """
-     # OPTIMIZATION: Convert to GeoJSON dict and pass directly
-     # This eliminates the need to write to/read from temp files (~67% faster)
-     geojson_dict = json.loads(batch_gdf.to_json())
-     fc = convert_geojson_to_ee(geojson_dict)
+     # Pass the GeoDataFrame directly to preserve CRS metadata.
+     # convert_geojson_to_ee will handle:
+     # - CRS detection and conversion to WGS84 if needed
+     # - data type sanitization (datetime, object columns)
+     # - geometry validation and Z-coordinate stripping
+     fc = convert_geojson_to_ee(batch_gdf, enforce_wgs84=True, strip_z_coords=True)
 
      # If __row_id__ is in the original GeoDataFrame, it will be preserved
      # as a feature property in the GeoJSON and thus in the EE FeatureCollection
@@ -763,7 +831,7 @@ def clean_geodataframe(
      gpd.GeoDataFrame
          Cleaned GeoDataFrame
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      if remove_nulls:
          null_count = gdf.geometry.isna().sum()
@@ -828,7 +896,7 @@ def process_ee_batch(
      RuntimeError
          If processing fails after all retries
      """
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      for attempt in range(max_retries):
          try:
@@ -955,7 +1023,7 @@ def whisp_stats_geojson_to_df_concurrent(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (dynamically adjust based on max_concurrent)
      _suppress_verbose_output(max_concurrent=max_concurrent)
@@ -978,6 +1046,16 @@ def whisp_stats_geojson_to_df_concurrent(
      # Add stable plotIds for merging (starting from 1, not 0)
      gdf[plot_id_column] = range(1, len(gdf) + 1)
 
+     # Strip unnecessary properties before sending to EE.
+     # Keep only: geometry, plot_id_column, and external_id_column.
+     # This prevents duplication of GeoJSON properties in EE results.
+     keep_cols = ["geometry", plot_id_column]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+
      # Create image if not provided
      if whisp_image is None:
          logger.debug("Creating Whisp image...")
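The stripping step is ordinary GeoDataFrame column selection. A standalone sketch of the same idea, with hypothetical column names ("farm_ref" standing in for an external ID):

    import geopandas as gpd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(
        {"plotId": [1, 2], "farm_ref": ["A", "B"], "notes": ["x", "y"]},
        geometry=[Point(0, 0), Point(1, 1)],
        crs="EPSG:4326",
    )

    keep_cols = ["geometry", "plotId", "farm_ref"]  # external ID kept, "notes" dropped
    gdf_for_ee = gdf[keep_cols].copy()
    print(list(gdf_for_ee.columns))  # ['geometry', 'plotId', 'farm_ref']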
@@ -1001,8 +1079,8 @@ def whisp_stats_geojson_to_df_concurrent(
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
 
      # Batch the data
-     batches = batch_geodataframe(gdf, batch_size)
-     logger.info(f"Processing {len(gdf):,} features in {len(batches)} batches")
+     batches = batch_geodataframe(gdf_for_ee, batch_size)
+     logger.info(f"Processing {len(gdf_for_ee):,} features in {len(batches)} batches")
 
      # Setup semaphore for EE concurrency control
      ee_semaphore = threading.BoundedSemaphore(max_concurrent)
@@ -1064,8 +1142,35 @@ def whisp_stats_geojson_to_df_concurrent(
      if plot_id_column not in df_server.columns:
          df_server[plot_id_column] = range(len(df_server))
 
-     merged = df_server.merge(
-         df_client,
+     # Keep all EE statistics from the server (all columns with _sum and _median
+     # suffixes); these are the actual EE processing results.
+     df_server_clean = df_server.copy()
+
+     # Keep external metadata from the client: plot_id, external_id, geometry,
+     # geometry type, and centroids (the formatted wrapper handles the
+     # keep_external_columns parameter).
+     keep_external_columns = [plot_id_column]
+     if external_id_column and external_id_column in df_client.columns:
+         keep_external_columns.append(external_id_column)
+     if "geometry" in df_client.columns:
+         keep_external_columns.append("geometry")
+     # Keep the geometry type column (Geometry_type)
+     if geometry_type_column in df_client.columns:
+         keep_external_columns.append(geometry_type_column)
+     # Also keep the centroid columns (Centroid_lon, Centroid_lat)
+     centroid_cols = [c for c in df_client.columns if c.startswith("Centroid_")]
+     keep_external_columns.extend(centroid_cols)
+
+     df_client_clean = df_client[
+         [c for c in keep_external_columns if c in df_client.columns]
+     ].drop_duplicates()
+
+     merged = df_server_clean.merge(
+         df_client_clean,
          on=plot_id_column,
          how="left",
          suffixes=("_ee", "_client"),
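The reworked merge is a keyed left join: EE statistics from the server on one side, client-held metadata (IDs, geometry, centroids) on the other. A toy sketch of its shape, with hypothetical column names:

    import pandas as pd

    df_server = pd.DataFrame(
        {"plotId": [1, 2], "Forest_sum": [4.2, 0.0], "admin_code_median": [12, 34]}
    )
    df_client = pd.DataFrame(
        {
            "plotId": [1, 2],
            "Geometry_type": ["Polygon", "Point"],
            "Centroid_lon": [10.1, 10.9],
            "Centroid_lat": [-1.2, -1.5],
        }
    ).drop_duplicates()

    merged = df_server.merge(df_client, on="plotId", how="left", suffixes=("_ee", "_client"))
    print(merged.columns.tolist())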
@@ -1442,7 +1547,7 @@ def whisp_stats_geojson_to_df_sequential(
      """
      from openforis_whisp.reformat import format_stats_dataframe
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Suppress verbose output from dependencies (sequential has lower concurrency, use default)
      _suppress_verbose_output(max_concurrent=1)
@@ -1469,6 +1574,16 @@ def whisp_stats_geojson_to_df_sequential(
      row_id_col = "__row_id__"
      gdf[row_id_col] = range(len(gdf))
 
+     # Strip unnecessary properties before sending to EE.
+     # Keep only: geometry, plot_id_column, row_id_col, and external_id_column.
+     # This prevents duplication of GeoJSON properties in EE results.
+     keep_cols = ["geometry", plot_id_column, row_id_col]
+     if external_id_column and external_id_column in gdf.columns:
+         keep_cols.append(external_id_column)
+
+     gdf_for_ee = gdf[keep_cols].copy()
+     logger.debug(f"Stripped GeoJSON to essential columns: {keep_cols}")
+
      # Create image if not provided
      if whisp_image is None:
          logger.debug("Creating Whisp image...")
@@ -1491,7 +1606,7 @@ def whisp_stats_geojson_to_df_sequential(
      # Convert to EE (suppress print statements from convert_geojson_to_ee)
      logger.debug("Converting to EE FeatureCollection...")
      with redirect_stdout(io.StringIO()):
-         fc = convert_geojson_to_ee(input_geojson_filepath)
+         fc = convert_geojson_to_ee(gdf_for_ee, enforce_wgs84=True, strip_z_coords=True)
 
      # Create reducer
      reducer = ee.Reducer.sum().combine(ee.Reducer.median(), sharedInputs=True)
@@ -1633,6 +1748,7 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON concurrently with automatic formatting and validation.
@@ -1683,15 +1799,26 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     include_geometry_audit_trail : bool, default False
+         If True, includes audit trail columns:
+         - geo_original: Original input geometry (before EE processing)
+         - geometry_type_original: Original geometry type
+         - geometry_type: Processed geometry type (from EE)
+         - geometry_type_changed: Boolean flag if the geometry type changed
+         - geometry_type_transition: Description of how it changed
+         These columns enable full transparency and auditability for compliance tracking.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
@@ -1699,6 +1826,9 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          decimal_places = _extract_decimal_places(stats_area_columns_formatting)
          logger.debug(f"Using decimal_places={decimal_places} from config")
 
+     # Normalize keep_external_columns parameter early (will be used in merge logic later)
+     # Load GeoJSON temporarily to get column names for normalization
+
      # Step 1: Get raw stats
      logger.debug("Step 1/2: Extracting statistics (concurrent)...")
      df_raw = whisp_stats_geojson_to_df_concurrent(
@@ -1759,6 +1889,113 @@ def whisp_formatted_stats_geojson_to_df_concurrent(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation, to preserve the columns)
+     if include_geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Capture original geometries AFTER we have the raw stats
+             logger.debug("Capturing original geometries for audit trail...")
+             gdf_original = _load_geojson_silently(input_geojson_filepath)
+
+             # Use plotId from df_validated to maintain the mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original)],
+                     "geo_original": gdf_original["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                     "geometry_type_original": gdf_original["geometry"].geom_type.values,
+                 }
+             )
+
+             # Merge the original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+
+             # Extract the geometry type from the processed 'geo' column if it exists.
+             # Note: 'geo' may not exist after validation removes extra columns.
+             if "geo" in df_validated.columns:
+
+                 def extract_geom_type(x):
+                     try:
+                         if isinstance(x, dict):
+                             return x.get("type")
+                         elif isinstance(x, str):
+                             # Handle both JSON strings and Python dict string representations
+                             try:
+                                 parsed = json.loads(x)
+                             except Exception:
+                                 # Try ast.literal_eval for Python dict representations
+                                 import ast
+
+                                 parsed = ast.literal_eval(x)
+                             return parsed.get("type") if isinstance(parsed, dict) else None
+                     except Exception:
+                         pass
+                     return None
+
+                 df_validated["geometry_type"] = df_validated["geo"].apply(extract_geom_type)
+             else:
+                 # If 'geo' doesn't exist, fall back to the original type
+                 df_validated["geometry_type"] = df_validated["geometry_type_original"]
+
+             # Flag whether the geometry type changed
+             df_validated["geometry_type_changed"] = (
+                 df_validated["geometry_type_original"] != df_validated["geometry_type"]
+             )
+
+             # Classify the geometry type transition
+             def classify_transition(orig, proc):
+                 if orig == proc:
+                     return "no_change"
+                 elif proc == "LineString":
+                     return f"{orig}_simplified_to_linestring"
+                 elif proc == "Point":
+                     return f"{orig}_simplified_to_point"
+                 else:
+                     return f"{orig}_to_{proc}"
+
+             df_validated["geometry_type_transition"] = df_validated.apply(
+                 lambda row: classify_transition(
+                     row["geometry_type_original"], row["geometry_type"]
+                 ),
+                 axis=1,
+             )
+
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "2.0",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "concurrent",
+                 "ee_endpoint": "high_volume",
+                 "validate_geometries": validate_geometries,
+                 "datasets_used": national_codes or [],
+                 "include_geometry_audit_trail": True,
+             }
+
+             logger.info(
+                 f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
+             )
+
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without the audit trail if something fails
+
+     # Add a processing metadata column using pd.concat to avoid a fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+
      logger.info("Concurrent processing + formatting + validation complete")
      return df_validated
 
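A usage sketch for the new flag (the input path is hypothetical, and Earth Engine must already be initialized; note that the final pd.concat step may not propagate df.attrs on all pandas versions):

    from openforis_whisp.advanced_stats import whisp_formatted_stats_geojson_to_df_concurrent

    df = whisp_formatted_stats_geojson_to_df_concurrent(
        "plots.geojson",  # hypothetical input file
        include_geometry_audit_trail=True,
    )

    # Plots whose geometry type changed during EE processing
    changed = df[df["geometry_type_changed"]]
    print(changed[["plotId", "geometry_type_original", "geometry_type", "geometry_type_transition"]])
    print(df.attrs.get("processing_metadata", {}))  # may be empty if attrs were dropped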
@@ -1779,6 +2016,7 @@ def whisp_formatted_stats_geojson_to_df_sequential(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON sequentially with automatic formatting and validation.
@@ -1821,15 +2059,26 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          Water flag ratio threshold (default 0.5)
      sort_column : str
          Column to sort by (default "plotId", None to skip)
+     include_geometry_audit_trail : bool, default False
+         If True, includes audit trail columns:
+         - geo_original: Original input geometry (before EE processing)
+         - geometry_type_original: Original geometry type
+         - geometry_type: Processed geometry type (from EE)
+         - geometry_type_changed: Boolean flag if the geometry type changed
+         - geometry_type_transition: Description of how it changed
+         These columns enable full transparency and auditability for EUDR compliance.
 
      Returns
      -------
      pd.DataFrame
-         Validated, formatted results DataFrame
+         Validated, formatted results DataFrame with optional audit trail
      """
      from openforis_whisp.reformat import format_stats_dataframe
+     from datetime import datetime, timezone
+     import json
+     from shapely.geometry import mapping
 
-     logger = logger or logging.getLogger("whisp-concurrent")
+     logger = logger or logging.getLogger("whisp")
 
      # Auto-detect decimal places from config if not provided
      if decimal_places is None:
@@ -1893,6 +2142,112 @@ def whisp_formatted_stats_geojson_to_df_sequential(
          custom_bands=custom_bands,
      )
 
+     # Step 2c: Add audit trail columns (AFTER validation, to preserve the columns)
+     if include_geometry_audit_trail:
+         logger.debug("Adding audit trail columns...")
+         try:
+             # Capture original geometries AFTER we have the raw stats
+             logger.debug("Capturing original geometries for audit trail...")
+             gdf_original = _load_geojson_silently(input_geojson_filepath)
+
+             # Use plotId from df_validated to maintain the mapping
+             df_original_geom = pd.DataFrame(
+                 {
+                     "plotId": df_validated["plotId"].values[: len(gdf_original)],
+                     "geo_original": gdf_original["geometry"].apply(
+                         lambda g: json.dumps(mapping(g)) if g is not None else None
+                     ),
+                     "geometry_type_original": gdf_original["geometry"].geom_type.values,
+                 }
+             )
+
+             # Merge the original geometries back
+             df_validated = df_validated.merge(df_original_geom, on="plotId", how="left")
+
+             # Extract the geometry type from the processed 'geo' column if it exists.
+             # Note: 'geo' may not exist after validation removes extra columns.
+             if "geo" in df_validated.columns:
+
+                 def extract_geom_type(x):
+                     try:
+                         if isinstance(x, dict):
+                             return x.get("type")
+                         elif isinstance(x, str):
+                             # Handle both JSON strings and Python dict string representations
+                             try:
+                                 parsed = json.loads(x)
+                             except Exception:
+                                 # Try ast.literal_eval for Python dict representations
+                                 import ast
+
+                                 parsed = ast.literal_eval(x)
+                             return parsed.get("type") if isinstance(parsed, dict) else None
+                     except Exception:
+                         pass
+                     return None
+
+                 df_validated["geometry_type"] = df_validated["geo"].apply(extract_geom_type)
+             else:
+                 # If 'geo' doesn't exist, fall back to the original type
+                 df_validated["geometry_type"] = df_validated["geometry_type_original"]
+
+             # Flag whether the geometry type changed
+             df_validated["geometry_type_changed"] = (
+                 df_validated["geometry_type_original"] != df_validated["geometry_type"]
+             )
+
+             # Classify the geometry type transition
+             def classify_transition(orig, proc):
+                 if orig == proc:
+                     return "no_change"
+                 elif proc == "LineString":
+                     return f"{orig}_simplified_to_linestring"
+                 elif proc == "Point":
+                     return f"{orig}_simplified_to_point"
+                 else:
+                     return f"{orig}_to_{proc}"
+
+             df_validated["geometry_type_transition"] = df_validated.apply(
+                 lambda row: classify_transition(
+                     row["geometry_type_original"], row["geometry_type"]
+                 ),
+                 axis=1,
+             )
+
+             # Store processing metadata
+             df_validated.attrs["processing_metadata"] = {
+                 "whisp_version": "2.0",
+                 "processing_date": datetime.now().isoformat(),
+                 "processing_mode": "sequential",
+                 "ee_endpoint": "standard",
+                 "datasets_used": national_codes or [],
+                 "include_geometry_audit_trail": True,
+             }
+
+             logger.info(
+                 f"Audit trail added: {df_validated['geometry_type_changed'].sum()} geometries with type changes"
+             )
+
+         except Exception as e:
+             logger.warning(f"Error adding audit trail: {e}")
+             # Continue without the audit trail if something fails
+
+     # Add a processing metadata column using pd.concat to avoid a fragmentation warning
+     metadata_dict = {
+         "whisp_version": "3.0.0a1",
+         "processing_timestamp_utc": datetime.now(timezone.utc).strftime(
+             "%Y-%m-%d %H:%M:%S UTC"
+         ),
+     }
+     metadata_series = pd.Series(
+         [metadata_dict] * len(df_validated), name="whisp_processing_metadata"
+     )
+     df_validated = pd.concat([df_validated, metadata_series], axis=1)
+
      logger.info("Sequential processing + formatting + validation complete")
      return df_validated
 
@@ -1923,6 +2278,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
      convert_water_flag: bool = True,
      water_flag_threshold: float = 0.5,
      sort_column: str = "plotId",
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Process GeoJSON to Whisp statistics with optimized fast processing.
@@ -1999,7 +2355,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
      ...     mode="sequential"
      ... )
      """
-     logger = logging.getLogger("whisp-concurrent")
+     logger = logging.getLogger("whisp")
 
      # Determine processing mode
      if mode == "auto":
@@ -2050,6 +2406,7 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             include_geometry_audit_trail=include_geometry_audit_trail,
          )
      else:  # sequential
          logger.debug("Routing to sequential processing...")
@@ -2067,4 +2424,5 @@ def whisp_formatted_stats_geojson_to_df_fast(
              convert_water_flag=convert_water_flag,
              water_flag_threshold=water_flag_threshold,
              sort_column=sort_column,
+             include_geometry_audit_trail=include_geometry_audit_trail,
          )
@@ -7,10 +7,69 @@ and thresholds, raising informative errors when constraints are violated.
  import json
  from pathlib import Path
- from shapely.geometry import Polygon as ShapelyPolygon
+ from shapely.geometry import Polygon as ShapelyPolygon, shape as shapely_shape
 
  # Note: area summary stats are estimations for use in deciding pathways for analysis
  # (estimation preferred here as allows efficient processing speed and limits overhead of checking file)
+
+
+ def _convert_projected_area_to_ha(area_sq_units: float, crs: str = None) -> float:
+     """
+     Convert an area from projected CRS units to hectares.
+
+     Most projected CRS use meters as units, so:
+     - area_sq_units is in square meters
+     - 1 hectare = 10,000 m²
+
+     Args:
+         area_sq_units: Area in square units of the projection (typically square meters)
+         crs: CRS string for reference (e.g., 'EPSG:3857'). Used for validation.
+
+     Returns:
+         Area in hectares
+     """
+     # Standard conversion: 1 hectare = 10,000 m²
+     # Most projected CRS use meters, so this covers the common case
+     return area_sq_units / 10000
+
+
+ def _estimate_area_from_bounds(coords, area_conversion_factor: float) -> float:
+     """
+     Estimate the area from the bounding box when the actual area calculation fails.
+     Extracts the bounding box and calculates its area as a fallback estimate.
+     Returns the area in hectares.
+     """
+     try:
+         # Flatten all coordinates to find the bounds
+         all_coords = []
+
+         def flatten_coords(c):
+             if isinstance(c[0], (list, tuple)) and isinstance(c[0][0], (list, tuple)):
+                 for sub in c:
+                     flatten_coords(sub)
+             else:
+                 all_coords.extend(c)
+
+         flatten_coords(coords)
+         if not all_coords:
+             return 0
+
+         # Extract lon/lat values
+         lons = [c[0] for c in all_coords]
+         lats = [c[1] for c in all_coords]
+
+         min_lon, max_lon = min(lons), max(lons)
+         min_lat, max_lat = min(lats), max(lats)
+
+         # Bounding box area
+         bbox_area = (max_lon - min_lon) * (max_lat - min_lat)
+
+         # Apply the conversion factor
+         return abs(bbox_area) * area_conversion_factor
+     except Exception:
+         return 0
+
+
  def analyze_geojson(
      geojson_data: Path | str | dict,
      metrics=[
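Two conversion factors drive the area estimates above: WGS 84 areas in square degrees are scaled by roughly 1,232,100 ha per square degree (111 km per degree near the equator), while projected areas are assumed to be in m² and divided by 10,000. A quick numeric sketch:

    # Approximate conversions used by the checks above (estimates, not geodesy)
    DEG2_TO_HA = 1_232_100  # 1° ≈ 111 km near the equator, so 1 deg² ≈ 12,321 km² = 1,232,100 ha
    M2_PER_HA = 10_000      # exact: 1 ha = 10,000 m²

    print(25_000 / M2_PER_HA)   # 2.5 ha for 25,000 m² in a metric projected CRS
    print(0.0001 * DEG2_TO_HA)  # ~123 ha for a 0.01° x 0.01° bounding box in WGS 84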
@@ -76,6 +135,8 @@ def analyze_geojson(
      - 'vertex_percentiles': {'p25': int, 'p50': int, 'p75': int, 'p90': int}
      """
      results = {}
+     crs_warning = None
+     file_path = None
 
      try:
          # Load GeoJSON from file if path provided
@@ -83,11 +144,45 @@ def analyze_geojson(
          file_path = Path(geojson_data)
          if not file_path.exists():
              raise FileNotFoundError(f"GeoJSON file not found: {file_path}")
-         with open(file_path, "r") as f:
-             geojson_data = json.load(f)
+
+         # Try UTF-8 first (most common), then fall back to auto-detection
+         try:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 geojson_data = json.load(f)
+         except UnicodeDecodeError:
+             # Auto-detect encoding if UTF-8 fails
+             try:
+                 import chardet
+
+                 with open(file_path, "rb") as f:
+                     raw_data = f.read()
+                 detected = chardet.detect(raw_data)
+                 encoding = detected.get("encoding", "latin-1")
+
+                 with open(file_path, "r", encoding=encoding, errors="replace") as f:
+                     geojson_data = json.load(f)
+             except Exception:
+                 # Final fallback: use latin-1, which accepts all byte values
+                 with open(file_path, "r", encoding="latin-1") as f:
+                     geojson_data = json.load(f)
+
+         # Detect CRS from the file if available
+         try:
+             import geopandas as gpd
+
+             gdf = gpd.read_file(file_path)
+             if gdf.crs and gdf.crs != "EPSG:4326":
+                 crs_warning = f"⚠️ CRS is {gdf.crs}, not EPSG:4326. Area metrics will be inaccurate. Data will be auto-reprojected during processing."
+         except Exception:
+             pass  # If we can't detect CRS, continue without warning
 
      features = geojson_data.get("features", [])
 
+     # Add CRS warning to results if detected
+     if crs_warning:
+         results["crs_warning"] = crs_warning
+         print(crs_warning)
+
      if "count" in metrics:
          results["count"] = len(features)
 
@@ -113,6 +208,29 @@ def analyze_geojson(
          geometry_type_counts = {}
          valid_polygons = 0
 
+         # Tracking for fallback geometries
+         bbox_fallback_count = 0  # Geometries that used the bounding box estimate
+         geometry_skip_count = 0  # Geometries completely skipped
+         polygon_type_stats = {}  # Track stats by geometry type
+
+         # Detect CRS to determine the area conversion factor
+         area_conversion_factor = 1232100  # Default: WGS84 (square degrees to ha)
+         detected_crs = None
+
+         # Try to detect CRS from the file if available
+         if file_path:
+             try:
+                 import geopandas as gpd
+
+                 gdf_temp = gpd.read_file(str(file_path))
+                 detected_crs = gdf_temp.crs
+                 if detected_crs and detected_crs != "EPSG:4326":
+                     # Projected CRS typically use meters, so convert m² to ha
+                     # (1 ha = 10,000 m²)
+                     area_conversion_factor = 1 / 10000
+             except Exception:
+                 pass  # Use the default if CRS detection fails
+
          for feature in features:
              try:
                  coords = feature["geometry"]["coordinates"]
@@ -133,13 +251,27 @@ def analyze_geojson(
 
                      # Calculate area from coordinates using shapely
                      try:
-                         poly = ShapelyPolygon(coords[0])
-                         # Convert square degrees to hectares (near equator)
-                         # 1 degree latitude ≈ 111 km, so 1 degree² ≈ 111² km² = 12,321 km² = 1,232,100 ha
-                         area_ha = abs(poly.area) * 1232100
+                         # Use shapely.geometry.shape to properly handle all geometry components
+                         geom = shapely_shape(feature["geometry"])
+                         # Convert using the detected CRS
+                         area_ha = abs(geom.area) * area_conversion_factor
                          areas.append(area_ha)
-                     except:
-                         pass  # Skip if calculation fails
+                     except Exception:
+                         # Fallback: estimate from the bounding box if the geometry fails
+                         bbox_area = _estimate_area_from_bounds(
+                             coords, area_conversion_factor
+                         )
+                         if bbox_area > 0:
+                             areas.append(bbox_area)
+                             bbox_fallback_count += 1
+                             polygon_type_stats["Polygon_bbox"] = (
+                                 polygon_type_stats.get("Polygon_bbox", 0) + 1
+                             )
+                         else:
+                             geometry_skip_count += 1
+                             polygon_type_stats["Polygon_skipped"] = (
+                                 polygon_type_stats.get("Polygon_skipped", 0) + 1
+                             )
                      valid_polygons += 1
 
                  elif geom_type == "MultiPolygon":
@@ -152,12 +284,28 @@ def analyze_geojson(
 
                      # Calculate area from coordinates using shapely
                      try:
-                         for polygon in coords:
-                             poly = ShapelyPolygon(polygon[0])
-                             area_ha = abs(poly.area) * 1232100
-                             areas.append(area_ha)
-                     except:
-                         pass  # Skip if calculation fails
+                         # Use shapely.geometry.shape to properly handle MultiPolygon
+                         geom = shapely_shape(feature["geometry"])
+                         # Convert using the detected CRS; use the total area of all parts
+                         area_ha = abs(geom.area) * area_conversion_factor
+                         areas.append(area_ha)
+                     except Exception:
+                         # Fallback: estimate from the bounding box if the geometry fails
+                         bbox_area = _estimate_area_from_bounds(
+                             coords, area_conversion_factor
+                         )
+                         if bbox_area > 0:
+                             areas.append(bbox_area)
+                             bbox_fallback_count += 1
+                             polygon_type_stats["MultiPolygon_bbox"] = (
+                                 polygon_type_stats.get("MultiPolygon_bbox", 0) + 1
+                             )
+                         else:
+                             geometry_skip_count += 1
+                             polygon_type_stats["MultiPolygon_skipped"] = (
+                                 polygon_type_stats.get("MultiPolygon_skipped", 0) + 1
+                             )
                      valid_polygons += 1
 
              except:
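The switch from ShapelyPolygon(coords[0]) to shapely.geometry.shape matters because shape consumes the whole GeoJSON geometry, including interior rings and every MultiPolygon part, whereas the old code measured only the first exterior ring. A minimal comparison:

    from shapely.geometry import Polygon, shape

    geom = {
        "type": "Polygon",
        "coordinates": [
            [[0, 0], [4, 0], [4, 4], [0, 4], [0, 0]],  # exterior ring
            [[1, 1], [2, 1], [2, 2], [1, 2], [1, 1]],  # hole
        ],
    }

    print(Polygon(geom["coordinates"][0]).area)  # 16.0 - ignores the hole
    print(shape(geom).area)                      # 15.0 - hole subtracted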
@@ -312,6 +460,21 @@ def analyze_geojson(
          else {"p25": 0, "p50": 0, "p75": 0, "p90": 0}
      )
 
+     # Add geometry quality logging to results
+     if bbox_fallback_count > 0 or geometry_skip_count > 0:
+         geometry_quality_log = (
+             f"Geometry quality summary:\n"
+             f"  - Bounding box fallback used: {bbox_fallback_count} features\n"
+             f"  - Geometries skipped: {geometry_skip_count} features"
+         )
+         if polygon_type_stats:
+             geometry_quality_log += "\n  - Breakdown:"
+             for stat_type, count in sorted(polygon_type_stats.items()):
+                 geometry_quality_log += f"\n    - {stat_type}: {count}"
+
+         results["geometry_quality_note"] = geometry_quality_log
+         print(geometry_quality_log)
+
      return results
 
  except Exception as e:
@@ -12,67 +12,81 @@ import geopandas as gpd
  import ee
 
 
- def convert_geojson_to_ee(
-     geojson_filepath: Union[str, Path, dict],
-     enforce_wgs84: bool = True,
-     strip_z_coords: bool = True,
- ) -> ee.FeatureCollection:
+ # ============================================================================
+ # HELPER FUNCTIONS FOR UNIFIED PROCESSING PATHWAY
+ # ============================================================================
+
+
+ def _sanitize_geodataframe(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
      """
-     Converts GeoJSON data to an Earth Engine FeatureCollection.
-     Accepts either a file path or a GeoJSON dictionary object.
-     Optionally checks and converts the CRS to WGS 84 (EPSG:4326) if needed.
-     Automatically handles 3D coordinates by stripping Z values when necessary.
+     Sanitize GeoDataFrame data types for JSON serialization.
+
+     Converts problematic data types that cannot be directly serialized:
+     - DateTime/Timestamp columns → ISO format strings
+     - Object columns → strings
+     - Skips the geometry column
 
      Args:
-         geojson_filepath (Union[str, Path, dict]): The filepath to the GeoJSON file (str or Path)
-             or a GeoJSON dictionary object.
-         enforce_wgs84 (bool): Whether to enforce WGS 84 projection (EPSG:4326). Defaults to True.
-             Only applies when input is a file path (dicts are assumed to be in WGS84).
-         strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries. Defaults to True.
+         gdf (gpd.GeoDataFrame): Input GeoDataFrame
 
      Returns:
-         ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
+         gpd.GeoDataFrame: GeoDataFrame with sanitized data types
+     """
+     gdf = gdf.copy()
+     for col in gdf.columns:
+         if col != gdf.geometry.name:  # Skip geometry column
+             # Handle datetime/timestamp columns
+             if pd.api.types.is_datetime64_any_dtype(gdf[col]):
+                 gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
+             # Handle other problematic types
+             elif gdf[col].dtype == "object":
+                 # Convert any remaining non-serializable objects to strings
+                 gdf[col] = gdf[col].astype(str)
+     return gdf
+
+
+ def _ensure_wgs84_crs(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
      """
-     if isinstance(geojson_filepath, dict):
-         # Input is already a GeoJSON dictionary - skip file reading
-         geojson_data = geojson_filepath
-     elif isinstance(geojson_filepath, (str, Path)):
-         file_path = os.path.abspath(geojson_filepath)
+     Ensure the GeoDataFrame uses the WGS 84 (EPSG:4326) coordinate reference system.
 
-         # Use GeoPandas to read the file and handle CRS
-         gdf = gpd.read_file(file_path)
+     - If CRS is None, assumes WGS 84
+     - If CRS is not WGS 84, converts to WGS 84
+     - If already WGS 84, returns the input unchanged
 
-         # NEW: Handle problematic data types before JSON conversion
-         for col in gdf.columns:
-             if col != gdf.geometry.name:  # Skip geometry column
-                 # Handle datetime/timestamp columns
-                 if pd.api.types.is_datetime64_any_dtype(gdf[col]):
-                     gdf[col] = gdf[col].dt.strftime("%Y-%m-%d %H:%M:%S").fillna("")
-                 # Handle other problematic types
-                 elif gdf[col].dtype == "object":
-                     # Convert any remaining non-serializable objects to strings
-                     gdf[col] = gdf[col].astype(str)
-
-         # Check and convert CRS if needed
-         if enforce_wgs84:
-             if gdf.crs is None:
-                 # Assuming WGS 84 if no CRS defined
-                 pass
-             elif gdf.crs != "EPSG:4326":
-                 gdf = gdf.to_crs("EPSG:4326")
-
-         # Convert to GeoJSON
-         geojson_data = json.loads(gdf.to_json())
-     else:
-         raise ValueError(
-             "Input must be a file path (str or Path) or a GeoJSON dictionary object (dict)"
-         )
+     Args:
+         gdf (gpd.GeoDataFrame): Input GeoDataFrame
 
-     validation_errors = validate_geojson(geojson_data)
-     if validation_errors:
-         raise ValueError(f"GeoJSON validation errors: {validation_errors}")
+     Returns:
+         gpd.GeoDataFrame: GeoDataFrame in WGS 84
+     """
+     if gdf.crs is None:
+         # Assuming WGS 84 if no CRS defined
+         return gdf
+     elif gdf.crs != "EPSG:4326":
+         return gdf.to_crs("EPSG:4326")
+     return gdf
+
+
+ def _create_ee_feature_collection(
+     geojson_data: dict, strip_z_coords: bool = True, input_source: str = "input"
+ ) -> ee.FeatureCollection:
+     """
+     Create an Earth Engine FeatureCollection from a GeoJSON dict with error recovery.
+
+     Attempts to create the EE FeatureCollection. If it fails due to 3D coordinates
+     and strip_z_coords is True, automatically strips Z values and retries.
+
+     Args:
+         geojson_data (dict): GeoJSON data dictionary
+         strip_z_coords (bool): Whether to retry with 2D geometries on failure
+         input_source (str): Description of the input source for logging
+
+     Returns:
+         ee.FeatureCollection: Earth Engine FeatureCollection
 
-     # Try to create the feature collection, handle 3D coordinate issues automatically
+     Raises:
+         ee.EEException: If conversion fails even after retries
+     """
      try:
          feature_collection = ee.FeatureCollection(
              create_feature_collection(geojson_data)
@@ -81,16 +95,16 @@ def convert_geojson_to_ee(
      except ee.EEException as e:
          if "Invalid GeoJSON geometry" in str(e) and strip_z_coords:
              # Apply print_once deduplication for Z-coordinate stripping messages
-             if not hasattr(convert_geojson_to_ee, "_printed_z_messages"):
-                 convert_geojson_to_ee._printed_z_messages = set()
+             if not hasattr(_create_ee_feature_collection, "_printed_z_messages"):
+                 _create_ee_feature_collection._printed_z_messages = set()
 
-             z_message_key = f"z_coords_{file_path}"
-             if z_message_key not in convert_geojson_to_ee._printed_z_messages:
+             z_message_key = f"z_coords_{input_source}"
+             if z_message_key not in _create_ee_feature_collection._printed_z_messages:
                  print(
                      "Warning: Invalid GeoJSON geometry detected, likely due to 3D coordinates."
                  )
                  print("Attempting to fix by stripping Z coordinates...")
-                 convert_geojson_to_ee._printed_z_messages.add(z_message_key)
+                 _create_ee_feature_collection._printed_z_messages.add(z_message_key)
 
              # Apply Z-coordinate stripping
              geojson_data_fixed = _strip_z_coordinates_from_geojson(geojson_data)
@@ -101,10 +115,15 @@
                  feature_collection = ee.FeatureCollection(
                      create_feature_collection(geojson_data_fixed)
                  )
 
-                 success_message_key = f"z_coords_success_{file_path}"
-                 if success_message_key not in convert_geojson_to_ee._printed_z_messages:
+                 success_message_key = f"z_coords_success_{input_source}"
+                 if (
+                     success_message_key
+                     not in _create_ee_feature_collection._printed_z_messages
+                 ):
                      print("Successfully converted after stripping Z coordinates")
-                     convert_geojson_to_ee._printed_z_messages.add(success_message_key)
+                     _create_ee_feature_collection._printed_z_messages.add(
+                         success_message_key
+                     )
 
                  return feature_collection
              except Exception as retry_error:
@@ -115,6 +134,82 @@
                  raise e
 
 
+ def convert_geojson_to_ee(
+     geojson_input: Union[str, Path, dict, gpd.GeoDataFrame],
+     enforce_wgs84: bool = True,
+     strip_z_coords: bool = True,
+ ) -> ee.FeatureCollection:
+     """
+     Converts GeoJSON data to an Earth Engine FeatureCollection.
+
+     Accepts flexible input types with a unified processing pathway:
+     - File path (str or Path) → loaded with GeoPandas
+     - GeoJSON dict → used directly
+     - GeoDataFrame → used directly
+
+     Automatically handles:
+     - CRS conversion to WGS 84 (EPSG:4326) if needed
+     - DateTime/Timestamp columns → converted to ISO strings before JSON serialization
+     - Non-serializable objects → converted to strings
+     - 3D coordinates → Z values stripped when necessary
+     - Z-coordinate errors → retried with 2D geometries if enabled
+
+     Args:
+         geojson_input (Union[str, Path, dict, gpd.GeoDataFrame]):
+             - File path (str or Path) to a GeoJSON file
+             - GeoJSON dictionary object
+             - GeoPandas GeoDataFrame
+         enforce_wgs84 (bool): Whether to enforce the WGS 84 projection (EPSG:4326).
+             Defaults to True. Only applies to file path and GeoDataFrame inputs.
+         strip_z_coords (bool): Whether to automatically strip Z coordinates from 3D geometries.
+             Defaults to True.
+
+     Returns:
+         ee.FeatureCollection: Earth Engine FeatureCollection created from the GeoJSON.
+
+     Raises:
+         ValueError: If the input type is unsupported or GeoJSON validation fails.
+         ee.EEException: If the GeoJSON cannot be converted even after retries.
+     """
+     # UNIFIED INPUT NORMALIZATION: Convert all inputs to a GeoDataFrame first
+     if isinstance(geojson_input, gpd.GeoDataFrame):
+         gdf = geojson_input.copy()
+         input_source = "GeoDataFrame"
+     elif isinstance(geojson_input, dict):
+         # Convert the dict to a GeoDataFrame for unified processing
+         gdf = gpd.GeoDataFrame.from_features(geojson_input.get("features", []))
+         input_source = "dict"
+     elif isinstance(geojson_input, (str, Path)):
+         # Load the file and convert it to a GeoDataFrame
+         file_path = os.path.abspath(geojson_input)
+         gdf = gpd.read_file(file_path)
+         input_source = f"file ({file_path})"
+     else:
+         raise ValueError(
+             f"Input must be a file path (str or Path), GeoJSON dict, or GeoDataFrame. "
+             f"Got {type(geojson_input).__name__}"
+         )
+
+     # UNIFIED DATA SANITIZATION PATHWAY
+     # Handle problematic data types before JSON conversion
+     gdf = _sanitize_geodataframe(gdf)
+
+     # UNIFIED CRS HANDLING
+     if enforce_wgs84:
+         gdf = _ensure_wgs84_crs(gdf)
+
+     # UNIFIED GEOJSON CONVERSION
+     geojson_data = json.loads(gdf.to_json())
+
+     # UNIFIED VALIDATION
+     validation_errors = validate_geojson(geojson_data)
+     if validation_errors:
+         raise ValueError(f"GeoJSON validation errors: {validation_errors}")
+
+     # UNIFIED EE CONVERSION with error recovery
+     return _create_ee_feature_collection(geojson_data, strip_z_coords, input_source)
+
+
  def _strip_z_coordinates_from_geojson(geojson_data: dict) -> dict:
      """
      Helper function to strip Z coordinates from GeoJSON data.
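With the unified pathway, all three input types funnel through the same sanitize → reproject → validate → convert steps. A usage sketch (file name hypothetical; assumes Earth Engine is already initialized):

    import json
    import ee
    import geopandas as gpd
    from openforis_whisp.data_conversion import convert_geojson_to_ee

    ee.Initialize()  # or the high-volume endpoint, as shown earlier in the diff

    fc_from_path = convert_geojson_to_ee("plots.geojson")
    with open("plots.geojson") as f:
        fc_from_dict = convert_geojson_to_ee(json.load(f))
    gdf = gpd.read_file("plots.geojson").to_crs("EPSG:3857")
    fc_from_gdf = convert_geojson_to_ee(gdf)  # reprojected back to WGS 84 internally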
@@ -151,7 +151,7 @@ def whisp_formatted_stats_geojson_to_df_legacy(
      from shapely.validation import make_valid
      import logging as py_logging
 
-     logger = py_logging.getLogger("whisp-legacy")
+     logger = py_logging.getLogger("whisp")
 
      # Load GeoJSON file
      with open(input_geojson_filepath, "r") as f:
@@ -169,11 +169,14 @@ def whisp_formatted_stats_geojson_to_df_legacy(
              lambda g: make_valid(g) if g and not g.is_valid else g
          )
 
-         # Convert back to GeoJSON dict (stays in memory - no temp files!)
-         geojson_cleaned = json.loads(gdf.to_json())
-
-         # OPTIMIZATION: Pass GeoJSON dict directly - eliminates file I/O overhead
-         feature_collection = convert_geojson_to_ee(geojson_cleaned)
+         # Pass the GeoDataFrame directly to preserve CRS metadata.
+         # convert_geojson_to_ee will handle:
+         # - CRS detection and conversion to WGS84 if needed
+         # - data type sanitization (datetime, object columns)
+         # - geometry validation and Z-coordinate stripping
+         feature_collection = convert_geojson_to_ee(
+             gdf, enforce_wgs84=True, strip_z_coords=True
+         )
      else:
          # Original path - no validation
          feature_collection = convert_geojson_to_ee(str(input_geojson_filepath))
@@ -201,6 +204,7 @@ def whisp_formatted_stats_geojson_to_df(
      batch_size: int = 10,
      max_concurrent: int = 20,
      validate_geometries: bool = False,
+     include_geometry_audit_trail: bool = False,
  ) -> pd.DataFrame:
      """
      Main entry point for converting GeoJSON to Whisp statistics.
@@ -253,6 +257,16 @@ def whisp_formatted_stats_geojson_to_df(
          Set to True to automatically fix invalid/self-intersecting polygons.
          For production workflows, it's recommended to use geometry validation and
          cleaning tools BEFORE processing with this function.
+     include_geometry_audit_trail : bool, default False
+         If True, includes audit trail columns:
+         - geo_original: Original input geometry
+         - geometry_type_original: Original geometry type
+         - geometry_type: Processed geometry type (from EE)
+         - geometry_type_changed: Boolean flag if the geometry type changed
+         - geometry_type_transition: Description of how it changed
+
+         Processing metadata is stored in df.attrs['processing_metadata'].
+         These columns enable full transparency for geometry modifications during processing.
 
      Returns
      -------
@@ -345,6 +359,7 @@ def whisp_formatted_stats_geojson_to_df(
              batch_size=batch_size,
              max_concurrent=max_concurrent,
              validate_geometries=validate_geometries,
+             include_geometry_audit_trail=include_geometry_audit_trail,
          )
      else:
          raise ValueError(